]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[utils] Add update_Request
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
1e399778 6import base64
5bc880b9 7import binascii
912b38b4 8import calendar
676eb3f2 9import codecs
62e609ab 10import contextlib
e3946f98 11import ctypes
c496ca96
PH
12import datetime
13import email.utils
f45c185f 14import errno
be4a824d 15import functools
d77c3dfd 16import gzip
b7ab0590 17import itertools
03f9daab 18import io
f4bfd65f 19import json
d77c3dfd 20import locale
02dbf93f 21import math
347de493 22import operator
d77c3dfd 23import os
4eb7f1d1 24import pipes
c496ca96 25import platform
d77c3dfd 26import re
13ebea79 27import ssl
c496ca96 28import socket
b53466e1 29import struct
1c088fa8 30import subprocess
d77c3dfd 31import sys
181c8655 32import tempfile
01951dda 33import traceback
bcf89ce6 34import xml.etree.ElementTree
d77c3dfd 35import zlib
d77c3dfd 36
8c25f81b 37from .compat import (
8bb56eee 38 compat_HTMLParser,
8f9312c3 39 compat_basestring,
8c25f81b 40 compat_chr,
36e6f62c 41 compat_etree_fromstring,
8c25f81b 42 compat_html_entities,
be4a824d 43 compat_http_client,
c86b6142 44 compat_kwargs,
8c25f81b 45 compat_parse_qs,
be4a824d 46 compat_socket_create_connection,
8c25f81b
PH
47 compat_str,
48 compat_urllib_error,
49 compat_urllib_parse,
15707c7e 50 compat_urllib_parse_urlencode,
8c25f81b
PH
51 compat_urllib_parse_urlparse,
52 compat_urllib_request,
53 compat_urlparse,
810c10ba 54 compat_xpath,
7d4111ed 55 shlex_quote,
8c25f81b 56)
4644ac55
S
57
58
468e2e92
FV
59# This is not clearly defined otherwise
60compiled_regex_type = type(re.compile(''))
61
3e669f36 62std_headers = {
9c7b3898 63 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
59ae15a5
PH
64 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
65 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
66 'Accept-Encoding': 'gzip, deflate',
67 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 68}
f427df17 69
5f6a1245 70
bf42a990
S
71NO_DEFAULT = object()
72
7105440c
YCH
73ENGLISH_MONTH_NAMES = [
74 'January', 'February', 'March', 'April', 'May', 'June',
75 'July', 'August', 'September', 'October', 'November', 'December']
76
a7aaa398
S
77KNOWN_EXTENSIONS = (
78 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
79 'flv', 'f4v', 'f4a', 'f4b',
80 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
81 'mkv', 'mka', 'mk3d',
82 'avi', 'divx',
83 'mov',
84 'asf', 'wmv', 'wma',
85 '3gp', '3g2',
86 'mp3',
87 'flac',
88 'ape',
89 'wav',
90 'f4f', 'f4m', 'm3u8', 'smil')
91
7105440c 92
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.

    Falls back to 'UTF-8' when the locale lookup fails or reports an
    encoding that cannot actually encode text.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown encoding name raises here
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref
d77c3dfd 106
f4bfd65f 107
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created next to fn, which
    is then renamed over fn. If anything fails, the temporary file is removed
    (best effort) and the exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Do not leave the temporary file behind; the original error wins
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
160
161
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val]

        Returns the first element below node matching xpath that carries the
        attribute key (with value val, when val is given), or None.
        """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] (manual filtering for Python 2.6) """
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
176
d7e66d39
JMF
177# On python2.6 the xml.etree.ElementTree.Element methods don't support
178# the namespace parameter
5f6a1245
JW
179
180
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand every 'prefix:tag' step of path to '{namespace}tag'.

    ns_map maps prefixes to namespace URIs; steps without a prefix are kept
    unchanged. Raises KeyError for a prefix that is missing from ns_map.
    """
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
191
d77c3dfd 192
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element below node matching xpath.

    xpath is either a single expression or an iterable of expressions that
    are tried in order. When nothing matches: default is returned if one was
    given, otherwise ExtractorError is raised if fatal is set, otherwise None
    is returned. name is used in the error message instead of xpath.
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    # Start from "not found" so that an empty iterable of expressions does
    # not leave n unbound
    n = None
    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
214
215
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the first element matching xpath.

    A missing element and an element without text are handled the same way:
    default is returned if one was given, otherwise ExtractorError is raised
    if fatal is set, otherwise None is returned.
    """
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    # xpath_element already resolved the "not found" case
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text
a41fb80c
S
229
230
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath that has it.

    When no such element exists: default is returned if one was given,
    otherwise ExtractorError is raised if fatal is set, otherwise None.
    """
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
bf0ff932
PH
242
243
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)
43e8fafd 247
12ea2f30 248
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document

    The first tag carrying attribute=value (quoted or unquoted) is located
    with a regular expression; its HTML-unescaped inner content is returned,
    or None when no such tag is found.
    """

    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if not m:
        return None
    res = m.group('content')

    # Drop one leading and one trailing character when the content starts
    # with a quote
    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    return unescapeHTML(res)
a921f407 270
c5229f39 271
8bb56eee
BF
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        # Attributes of the most recently seen start tag
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Each start tag overwrites the previous result
        self.attrs = dict(attrs)
280
c5229f39 281
8bb56eee
BF
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': "'"
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    parser.feed(html_element)
    parser.close()
    return parser.attrs
9e6dd238 302
c5229f39 303
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Literal newlines become spaces, <br> tags and </p><p> boundaries become
    newlines, all remaining tags are dropped and HTML entities are decoded.
    None is passed through unchanged.
    """

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
9e6dd238
FV
319
320
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be fixed by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
351
352
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
1c469a94 360
5f6a1245 361
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Map a single character to its filename-safe replacement
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        # Cosmetic clean-up, skipped for IDs so that they stay recognizable
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
d77c3dfd 398
5f6a1245 399
a2aaf4db
S
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On any other platform the path is returned unchanged. On Windows every
    path component except '.' and '..' has forbidden characters, and a
    trailing whitespace character or dot, replaced by '#'; a drive or UNC
    prefix is preserved.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Raw string: the pattern contains \s, which is an invalid escape sequence
    # in an ordinary string literal
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
416
417
67dda517
S
def sanitize_url(url):
    """Prepend protocol-less URLs ('//host/path') with the `http:` scheme.

    This mitigates the number of unwanted failures due to a missing
    protocol. Any other URL is returned unchanged.
    """
    return 'http:%s' % url if url.startswith('//') else url
422
423
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request for url after passing it through sanitize_url.

    All further arguments are forwarded to the Request constructor.
    """
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
67dda517
S
426
427
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of every element, in input
    order. Elements are compared with ==, so unhashable items are accepted.
    """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
d77c3dfd 435
912b38b4 436
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Entities that cannot be resolved
    are returned in their literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        # Out-of-range code points raise ValueError; absurdly large numbers
        # raise OverflowError instead. Both mean "not a valid character".
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
4e408e47
PH
459
460
def unescapeHTML(s):
    """Replace every '&...;' entity in s by the character it stands for.

    None is passed through; any other input must be a text string.
    Entities that cannot be resolved are left as they are.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 468
8bf48f23 469
aa49acd1
S
def get_subprocess_encoding():
    """Return the name of the encoding used for subprocess arguments.

    Falls back to 'utf-8' when the platform does not report an encoding.
    """
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding
480
481
def encodeFilename(s, for_subprocess=False):
    """
    Return s in the form expected by file system (or subprocess) APIs.

    @param s The name of the file
    @param for_subprocess Whether the result is meant for a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    # Characters the target encoding cannot represent are dropped
    return s.encode(get_subprocess_encoding(), 'ignore')
504
505
def decodeFilename(b, for_subprocess=False):
    """Counterpart of encodeFilename: turn a byte string into text.

    On Python 3, and for anything that is not a byte string, b is returned
    unchanged. Undecodable bytes are dropped.
    """

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 515
f07b74fc
PH
516
def encodeArgument(s):
    """Encode a command line argument via encodeFilename(s, True)."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
524
525
aa49acd1
S
def decodeArgument(b):
    """Decode a command line argument via decodeFilename(b, True)."""
    return decodeFilename(b, True)
528
529
8271226a
PH
def decodeOption(optval):
    """Return an option value as text.

    None is passed through; byte strings are decoded with the preferred
    encoding. Any other non-text value trips the assertion.
    """
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
1c256f70 538
5f6a1245 539
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that fits is used; exactly one minute is rendered as
    '1:00' and exactly one hour as '1:00:00'.
    """
    # >= (not >) so that the boundaries 60 and 3600 roll over to the next unit
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
547
a0ddb8a2 548
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler suited to the running Python version.

    params is the options dictionary; its 'nocheckcertificate' entry turns
    off certificate verification. kwargs are forwarded to the handler.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 572
732ea2f0 573
08f2a92c
JMF
def bug_reports_message():
    """Return the standard suffix asking the user to report an issue.

    The update hint depends on whether ytdl_is_updateable() is true.
    """
    if ytdl_is_updateable():
        update_cmd = 'type  youtube-dl -U  to update'
    else:
        update_cmd = 'see  https://yt-dl.org/update  on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg
583
584
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prepended.
        """

        # Network and availability problems are never reported as bugs
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None."""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
01951dda 612
1c256f70 613
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected extraction error raised for a URL that is not supported."""

    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url
619
620
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
624
625
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
d77c3dfd
FV
638
639
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
d77c3dfd
FV
647
648
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Initialise the base class explicitly so the message is carried by
        # the exception itself as well as by the msg attribute
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd 658
5f6a1245 659
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
d77c3dfd
FV
663
664
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
d77c3dfd
FV
672
673
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Give the exception a readable message instead of a bare tuple
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 686
5f6a1245 687
c5a59d93 688def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
e5e78797
S
689 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
690 # expected HTTP responses to meet HTTP/1.0 or later (see also
691 # https://github.com/rg3/youtube-dl/issues/6727)
692 if sys.version_info < (3, 0):
5a1a2e94 693 kwargs[b'strict'] = True
be4a824d
PH
694 hc = http_class(*args, **kwargs)
695 source_address = ydl_handler._params.get('source_address')
696 if source_address is not None:
697 sa = (source_address, 0)
698 if hasattr(hc, 'source_address'): # Python 2.7+
699 hc.source_address = sa
700 else: # Python 2.6
701 def _hc_connect(self, *args, **kwargs):
702 sock = compat_socket_create_connection(
703 (self.host, self.port), self.timeout, sa)
704 if is_https:
d7932313
PH
705 self.sock = ssl.wrap_socket(
706 sock, self.key_file, self.cert_file,
707 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
708 else:
709 self.sock = sock
710 hc.connect = functools.partial(_hc_connect, hc)
711
712 return hc
713
714
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker to headers.

    When the marker is present, a new dict without the marker and without any
    Accept-Encoding header (matched case-insensitively) is returned.
    Otherwise the very same headers object is returned.
    """
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
87f0e62d
YCH
723
724
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # Options dictionary, read by _create_http_connection
        self._params = params

    def http_open(self, req):
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream before a zlib-wrapped one."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response whether or not its constructor accepts code."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
            new_req = req_type(
                url_escaped, data=req.data, headers=req.headers,
                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
            new_req.timeout = req.timeout
            req = new_req

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same request/response processing
    https_request = http_request
    https_response = http_response
bf50b038 848
5de90176 849
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that opens its connections through _create_http_connection."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        # Connection class to instantiate; defaults to the standard HTTPSConnection
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        # Options dictionary, read by _create_http_connection
        self._params = params

    def https_open(self, req):
        # Forward only those SSL settings the base handler actually stored
        kwargs = {}
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
            req, **kwargs)
be4a824d
PH
865
866
a6420bf5
S
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also handles HTTPS requests and responses."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # NOTE: the workaround below is currently disabled
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
889
890
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    date_str is expected as 'YYYY-MM-DD<delimiter>HH:MM:SS', optionally with
    fractional seconds and a trailing 'Z' or '+HH:MM' / '-HHMM' style offset.
    timezone, if given, is a datetime.timedelta used as the UTC offset; in
    that case no offset is parsed from the string.
    Returns None for None input or when the string cannot be parsed.
    """

    if date_str is None:
        return None

    # Fractional seconds are not supported by the format below; drop them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        m = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if not m:
            timezone = datetime.timedelta()
        else:
            date_str = date_str[:-len(m.group(0))]
            if not m.group('sign'):
                # Plain 'Z': UTC
                timezone = datetime.timedelta()
            else:
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        return None
912b38b4
PH
920
921
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    day_first selects how purely numeric dates are read (day before month
    when true, month before day otherwise).  Returns None when date_str is
    None or matches none of the supported formats.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', cleaned) is None:
        cleaned = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', cleaned)
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)

    common_formats = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M',
        '%b %dnd %Y %I:%M',
        '%b %dth %Y %I:%M',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        numeric_formats = [
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ]
    else:
        numeric_formats = [
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ]

    result = None
    # Every format is tried; a later match overrides an earlier one
    for fmt in common_formats + numeric_formats:
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue
    if result is None:
        # Last resort: RFC 2822 style dates
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
    if result is None:
        return None
    return compat_str(result)
bf50b038 986
5f6a1245 987
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, ignoring its query string.

    Returns default_ext when url is None or no usable extension is found.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
73e79f2a 999
5f6a1245 1000
def subtitles_filename(filename, sub_lang, sub_format):
    """Swap the last extension of filename for '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
d4051a8e 1003
5f6a1245 1004
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    'now', 'today', 'yesterday' or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Raises ValueError if the string is none of the above."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation? timedelta has no month/year, so months count
        # as 30 days and years as 365 days
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
5f6a1245
JW
1032
1033
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    year, month, day = parts.groups()
    return '-'.join((year, month, day))
1042
5f6a1245 1043
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A missing bound defaults to the earliest / latest representable date.
        Raises ValueError when start lies after end.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return ' - '.join(bound.isoformat() for bound in (self.start, self.end))
c496ca96
PH
1073
1074
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte strings are decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
1083
1084
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s   -- the text to write
    out -- the target stream; only streams whose fileno() is 1 or 2 and that
           are attached to a console are handled here

    Raises OSError when WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (presumably STD_OUTPUT_HANDLE / STD_ERROR_HANDLE - confirm against WinAPI docs)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the count reported back by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for a missing/invalid handle, a handle that is not a character
        # device (the remote flag is masked out), or when GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call and stop before any non-BMP
        # character; count == 0 means s starts with a non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            # Two units were written for it, but it is one element of s
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1158
1159
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    s        -- must be exactly a compat_str
    out      -- target stream
    encoding -- forces the encoding used for byte-oriented streams; when
                None the stream's own or the preferred encoding is used

    Characters the chosen encoding cannot represent are dropped.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    wants_console_api = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if wants_console_api and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    byte_stream = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if byte_stream:
        payload = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(payload)
    elif hasattr(out, 'buffer'):
        # Text stream over a binary buffer: encode ourselves and bypass it
        charset = encoding or getattr(out, 'encoding', None) or preferredencoding()
        payload = s.encode(charset, 'ignore')
        out.buffer.write(payload)
    else:
        out.write(s)
    out.flush()
1180
1181
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the items of a byte string as a list of integers.

    Works whether indexing bs yields ints (Python 3) or one-character
    strings; an empty input gives [].
    """
    if not bs:
        return []
    yields_ints = isinstance(bs[0], int)  # Python 3
    return list(bs) if yields_ints else [ord(item) for item in bs]
1189
c257baff 1190
def intlist_to_bytes(xs):
    """Pack a sequence of integers into a byte string, one 'B' item each."""
    if xs:
        return struct_pack('%dB' % len(xs), *xs)
    return b''
c38b1e77
PH
1195
1196
c1c9a79c
PH
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current platform
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high parts of the byte count passed to both calls
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        # Lock f starting at offset 0; raises OSError if LockFileEx fails
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can pass the same
        # structure again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flag 0x2 requests an exclusive lock
        # (presumably LOCKFILE_EXCLUSIVE_LOCK - confirm against WinAPI docs)
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Release the lock taken by _lock_file; raises OSError on failure
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            # Exclusive lock when requested, shared lock otherwise
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        # Without fcntl both operations fail with IOError when called
        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
1270
1271
class locked_file(object):
    """File opened with io.open that is locked while used in a with-block.

    mode must be 'r', 'a' or 'w'.  Mode 'r' asks for a non-exclusive lock,
    the other modes for an exclusive one.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        want_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, want_exclusive)
        except IOError:
            # Do not leak the open file when the lock cannot be taken
            self.f.close()
            raise
        else:
            return self

    def __exit__(self, etype, value, traceback):
        # The file is closed even when unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1301
1302
4644ac55
S
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1306
1307
def shell_quote(args):
    """Return args as one shell-escaped, space-separated command string.

    Byte-string arguments are decoded with the filesystem encoding first.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # shlex_quote is the quoting helper args_to_str already uses; it
        # replaces pipes.quote, whose module is deprecated (removed in 3.13)
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1317
1318
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-serialized and appended to url as a URL-encoded fragment
    under the key '__youtubedl_smuggle'.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse_urlencode(payload)
    return url + '#' + fragment
9d4660ca
PH
1325
1326
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    URLs without smuggled data are returned as (smug_url, default).
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    plain_url, _, fragment = smug_url.rpartition('#')
    serialized = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return plain_url, json.loads(serialized)
02dbf93f
PH
1334
1335
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. '1.50KiB'.

    bytes -- a number, a str holding a number, or None

    Returns 'N/A' for None.  Counts below 1 are shown in 'B' and counts
    beyond the largest known unit are shown in 'YiB'.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB')
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the table: a negative exponent (tiny fractions) would
        # index from the end, a too large one would raise IndexError
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
f53c966a 1348
1c088fa8 1349
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' at the start of s using unit_table.

    unit_table maps unit strings to multipliers.  The number may use ',' or
    '.' as decimal separator.  Returns the product truncated to int, or None
    when s does not start with a number followed by a known unit.
    """
    alternatives = '|'.join(re.escape(unit) for unit in unit_table)
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives
    found = re.match(pattern, s)
    if not found:
        return None
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1359
1360
be64b5b0
PH
def parse_filesize(s):
    """Parse a size such as '1.5 MiB' or '10KB' into a number of bytes.

    Returns None for None or when no known unit is found.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
    }
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary, decimal = 1024 ** power, 1000 ** power
        _UNIT_TABLE[prefix + 'iB'] = binary
        _UNIT_TABLE[prefix + 'B'] = decimal
        _UNIT_TABLE[prefix.lower() + 'B'] = binary
        _UNIT_TABLE[prefix + 'b'] = decimal

    return lookup_unit_table(_UNIT_TABLE, s)
1405
1406
def parse_count(s):
    """Parse a count such as '1,234' or '1.2M' into an int.

    Returns None for None or when the text is neither a plain number nor a
    number followed by a known k/m/kk suffix.
    """
    if s is None:
        return None

    text = s.strip()

    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    _UNIT_TABLE = {}
    for suffix, multiplier in (('k', 1000), ('m', 1000 ** 2), ('kk', 1000 ** 2)):
        _UNIT_TABLE[suffix] = multiplier
        _UNIT_TABLE[suffix.upper()] = multiplier

    return lookup_unit_table(_UNIT_TABLE, text)
be64b5b0 1426
2f7ae819 1427
caefb1de
PH
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name

    Returns None when name is not in ENGLISH_MONTH_NAMES.
    """
    for number, month in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month == name:
            return number
    return None
1435
1436
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations

    The abbreviation is the first three letters of the month name; None is
    returned when nothing matches.
    """
    for number, month in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month[:3] == abbrev:
            return number
    return None
18258362
JMF
1445
1446
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' by '&amp;' in XML

    Ampersands that already start one of the five named XML entities or a
    numeric character reference are left alone.
    """
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
e3946f98
PH
1453
1454
def setproctitle(title):
    """Best-effort attempt to set the process name via libc's prctl().

    title must be a compat_str.  The call silently does nothing on Jython,
    when 'libc.so.6' cannot be loaded, or when libc lacks prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one item larger than
    # the data, so the name handed to prctl is always NUL-terminated (a
    # buffer of exactly len(title_bytes) would have no terminator)
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is presumably PR_SET_NAME - confirm against linux/prctl.h
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1474
1475
def remove_start(s, start):
    """Return s without the prefix start; s unchanged if it lacks the prefix."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1480
1481
2b9faf55
PH
def remove_end(s, end):
    """Return s without the suffix end; s unchanged if it lacks the suffix.

    An empty end leaves s untouched.
    """
    if s.endswith(end):
        # Slice by absolute position: s[:-len(end)] would be s[:0] == ''
        # for an empty suffix
        return s[:len(s) - len(end)]
    return s
1486
1487
31b2051e
S
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s.

    None and strings shorter than two characters are returned unchanged.
    """
    if s is None or len(s) < 2:
        return s
    opening, closing = s[0], s[-1]
    if opening == closing and opening in ('"', "'", ):
        return s[1:-1]
    return s
1495
1496
def url_basename(url):
    """Return the last segment of the URL's path, ignoring outer slashes."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
aa94a6d3
PH
1500
1501
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose HTTP method is always 'HEAD'."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
1505
1506
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, returning default when that is not possible.

    v        -- value to convert; None and '' count as missing
    scale    -- the result is floor-divided by this
    default  -- returned for missing or unconvertible values
    get_attr -- if set, this attribute of v is converted instead of v
    invscale -- the result is multiplied by this before scaling
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        # TypeError: objects int() cannot handle (lists, dicts, ...)
        # OverflowError: infinite floats
        return default
9732d77e 1519
9572013d 1520
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1523
9732d77e
PH
1524
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are removed before conversion.  None is
    passed through; other unparsable text raises ValueError.
    """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1531
1532
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, returning default when that is not possible.

    The converted value is multiplied by invscale and divided by scale.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError: objects float() cannot handle (lists, dicts, ...)
        return default
43f775e4
PH
1540
1541
def parse_duration(s):
    """Parse a duration such as '1:02:03', '3 min' or 'PT1H2M3S' into seconds.

    Returns None when s is not a string or has no recognised form.  The
    result is a float when the input has fractional seconds or consists only
    of minutes or hours, otherwise an int.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    # A lone "N minutes" / "N hours" value may be fractional
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    seconds_per_group = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    total = 0
    for group, factor in seconds_per_group:
        if m.group(group):
            total += int(m.group(group)) * factor
    if m.group('ms'):
        total += float(m.group('ms'))
    return total
91d7d0b3
JMF
1586
1587
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the real extension of filename.

    When expected_real_ext is given and differs from the actual extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
d70ad093
PH
1594
1595
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename with ext.

    When expected_real_ext is given and differs from the actual extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        base = filename
    else:
        base = name
    return '{0}.{1}'.format(base, ext)
1601
1602
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False when the binary cannot be started. """
    try:
        process = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1611
1612
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    The combined stdout/stderr of running exe with args is passed to
    detect_exe_version together with version_re and unrecognized. """
    try:
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = process.communicate()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1626
1627
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    output       -- the text to search (must be a compat_str)
    version_re   -- regex whose first group is the version; defaults to
                    matching 'version <something>'
    unrecognized -- returned when the regex does not match
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1637
1638
class PagedList(object):
    """Base class for paged lists; getslice() is provided by subclasses."""

    def __len__(self):
        # This is only useful for tests
        # (it materializes the whole list through getslice())
        return len(self.getslice())
1643
9c44d242
PH
1644
class OnDemandPagedList(PagedList):
    """Paged list that calls pagefunc(pagenum) only for the pages it needs.

    pagefunc  -- callable returning the items of one page (0-based number)
    pagesize  -- number of items on a full page
    use_cache -- remember fetched pages for later getslice() calls
    """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def _load_page(self, pagenum):
        # Return the items of one page, via the cache when it is enabled
        if not self._use_cache:
            return list(self._pagefunc(pagenum))
        page = self._cache.get(pagenum)
        if page is None:
            page = list(self._pagefunc(pagenum))
            self._cache[pagenum] = page
        return page

    def getslice(self, start=0, end=None):
        """Return the items with indices in [start, end) as a list."""
        collected = []
        size = self._pagesize
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            page_stop = page_first + size
            if start >= page_stop:
                continue

            page = self._load_page(pagenum)

            # Offsets inside this page at which the requested slice begins/ends
            lo = start % size if page_first <= start < page_stop else 0
            hi = None
            if end is not None and page_first <= end <= page_stop:
                hi = ((end - 1) % size) + 1

            if lo != 0 or hi is not None:
                page = page[lo:hi]
            collected.extend(page)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page) + lo < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == page_stop:
                break
        return collected
81c2f20b
PH
1695
1696
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list whose total number of pages is known up front.

    pagefunc  -- callable returning the items of one page (0-based number)
    pagecount -- total number of pages
    pagesize  -- number of items per page
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the items with indices in [start, end) as a list."""
        items = []
        first_page, to_skip = divmod(start, self._pagesize)
        if end is None:
            stop_page, remaining = self._pagecount, None
        else:
            stop_page, remaining = end // self._pagesize + 1, end - start
        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                # Only the first fetched page needs its head dropped
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested slice
                    items.extend(page[:remaining])
                    break
                remaining -= len(page)
            items.extend(page)
        return items
1724
1725
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escape sequences found in s."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
0fe2ff78
YCH
1732
1733
def lowercase_escape(s):
    """Decode literal \\uXXXX escape sequences found in s."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
b53466e1 1740
d05cfe06
S
1741
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, compat_str):
        # Text is encoded to UTF-8 bytes first on Python 2
        s = s.encode('utf-8')
    # Characters listed here are left unescaped
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1747
1748
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    The host part is IDNA-encoded; path, params, query and fragment are
    escaped with escape_rfc3986.
    """
    parts = compat_urllib_parse_urlparse(url)
    replacements = {
        'netloc': parts.netloc.encode('idna').decode('ascii'),
    }
    for field in ('path', 'params', 'query', 'fragment'):
        replacements[field] = escape_rfc3986(getattr(parts, field))
    return parts._replace(**replacements).geturl()
1759
# struct_pack / struct_unpack: struct.pack / struct.unpack that accept a text
# format spec on every supported Python version
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
    # See https://bugs.python.org/issue19099
    def _with_bytes_spec(struct_func):
        def wrapper(spec, *args):
            if isinstance(spec, compat_str):
                spec = spec.encode('ascii')
            return struct_func(spec, *args)
        return wrapper

    struct_pack = _with_bytes_spec(struct.pack)
    struct_unpack = _with_bytes_spec(struct.unpack)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
62e609ab
PH
1777
1778
def read_batch_urls(batch_fd):
    """Read the URLs listed in a batch file object, one per line.

    batch_fd is iterated line by line and closed afterwards.  Byte lines are
    decoded as UTF-8, a leading byte order mark and surrounding whitespace
    are removed, and empty lines as well as lines starting with '#', ';' or
    ']' are skipped.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # '\ufeff' is the BOM after correct UTF-8 decoding, '\xef\xbb\xbf'
        # is how its three bytes look when read with a single-byte encoding
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1793
1794
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1797
1798
def update_url_query(url, query):
    """Return url with its query string updated from the mapping query.

    Keys present in query replace same-named parameters of the URL.
    """
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
16392824 1805
8e60dc75 1806
ed0291d1
S
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from req with selected parts replaced.

    url     -- replacement URL (falsy keeps the one of req)
    data    -- replacement body (falsy keeps the one of req)
    headers -- headers merged over those of req
    query   -- query parameters merged into the URL

    A HEAD request stays a HEADRequest; the timeout attribute is copied when
    req has one.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    body = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)
    if req.get_method() == 'HEAD':
        request_class = HEADRequest
    else:
        request_class = compat_urllib_request.Request
    clone = request_class(
        target_url, data=body, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        clone.timeout = req.timeout
    return clone
1819
1820
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key of a list/tuple, in d.

    With a list or tuple the first key whose value is present and not None
    (and, if skip_false_values is set, truthy) wins; default is returned
    when no key qualifies.  A single key behaves like d.get(key, default).
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
1829
1830
8e60dc75
S
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as a compat_str, decoding it with encoding if needed."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1833
16392824 1834
a1a530b0
PH
# Age limit for each rating label (presumably US film ratings - confirm);
# parse_age_limit() falls back to this table for non-numeric input
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
1842
1843
146c80e2
S
def parse_age_limit(s):
    """Turn an age limit such as '18', '16+' or a US rating into an int.

    Returns None for None and for strings that are neither a one or two
    digit number (optionally followed by '+') nor a key of US_RATINGS.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s)
146c80e2
S
1849
1850
def strip_jsonp(code):
    """Remove a JSONP wrapper 'callback(...);' and return the payload.

    Trailing '//' comments after the call are dropped as well; text that is
    not wrapped this way is returned unchanged.
    """
    wrapper = re.compile(
        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return wrapper.sub(r'\1', code)
478c2c61
PH
1854
1855
e05f6939
PH
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers become double-quoted strings,
    true/false/null are kept as they are, and commas directly before a
    closing bracket or brace are removed.
    """
    # Rewrites needed when a single-quoted string becomes double-quoted
    single_quote_fixes = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null'):
            return token
        opening = token[:1]
        if opening == '"':
            # \' needs no escaping inside a double-quoted string
            token = re.sub(r"\\'", "'", token[1:-1])
        elif opening == "'":
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: single_quote_fixes[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    converted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop commas that directly precede a closing bracket/brace
    return re.sub(r',(\s*[\]}])', r'\1', converted)
1879
1880
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its position in quality_ids,
    or to -1 when the id is not listed. """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
1889
acd69589
PH
1890
# Default output template; a %-style format string with the mapping keys
# title, id and ext.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1892
a020a0dc
PH
1893
def limit_length(s, length):
    """Shorten s to at most length characters, ending it with '...'.

    Strings that already fit are returned unchanged; None is passed through.
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    keep = length - len(ELLIPSES)
    return s[:keep] + ELLIPSES
48844745
PH
1902
1903
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints.

    Raises ValueError if any component is not an integer.
    """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
48844745
PH
1906
1907
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    An empty version, or one that version_tuple() cannot parse, is
    considered up to date when assume_new is true and outdated otherwise.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum
732ea2f0
PH
1915
1916
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updateable when this module was loaded through zipimport, or when
    # sys carries a "frozen" attribute.
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
7d4111ed
PH
1922
1923
def args_to_str(args):
    """Get a short string representation for a subprocess command."""
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
2ccd1b10
PH
1927
1928
def error_to_compat_str(err):
    """Return str(err) as a text string on both Python 2 and Python 3."""
    err_str = str(err)
    if sys.version_info[0] >= 3:
        return err_str
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return err_str.decode(preferredencoding())
1936
1937
def mimetype2ext(mt):
    """Guess a file extension from a MIME type.

    mt may carry parameters (e.g. 'text/vtt; charset=utf-8'); they are
    ignored.  Full types with a dedicated mapping are resolved first, then
    the subtype is translated through a table of known subtypes and returned
    as-is if it has no entry.  Returns None when mt is None, which happens
    when the header the value came from is missing.
    """
    if mt is None:
        return None

    # Only the "type/subtype" part matters; drop "; key=value" parameters
    mt = mt.partition(';')[0].strip()

    ext = {
        'audio/mp4': 'm4a',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }.get(res, res)
1958
1959
2ccd1b10
PH
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of an opened URL handle.

    The filename of an attachment Content-Disposition header is preferred;
    if that yields nothing, the Content-Type header is mapped through
    mimetype2ext().
    """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        getheader = lambda h: url_handle.headers[h]

    disposition = getheader('Content-Disposition')
    if disposition:
        filename_match = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if filename_match:
            ext = determine_ext(filename_match.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
05900629
PH
1976
1977
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Build a base64 data: URI for the bytes in data with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
1980
1981
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked when no limit is configured, or when the content
    # itself carries no age limit (available for everyone).
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
1990
1991
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # Order matters: the UTF-32 LE mark starts with the UTF-16 LE one.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]

    # Without a byte order mark the data is decoded as UTF-8.
    encoding = 'utf-8'
    payload = first_bytes
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding = enc
            payload = first_bytes[len(bom):]
            break
    text = payload.decode(encoding, 'replace')

    # The result is a match object (truthy) or None, not a bool.
    return re.match(r'^\s*<', text)
a055469f
PH
2010
2011
def determine_protocol(info_dict):
    """Work out the download protocol for an info dict.

    An explicit 'protocol' entry wins.  Otherwise the URL is inspected:
    rtmp/mms/rtsp prefixes, then the m3u8/f4m extensions, and finally the
    URL scheme itself.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2032
2033
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data

    # Widest cell of every column, measured on its text form
    widths = []
    for column in zip(*rows):
        widths.append(max(len(compat_str(cell)) for cell in column))

    # All but the last column are left-aligned and padded to width + 1
    specs = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(specs) + '%s'

    lines = [row_format % tuple(row) for row in rows]
    return '\n'.join(lines)
347de493
PH
2040
2041
def _match_one(filter_part, dct):
    """Evaluate a single filter expression against dct.

    Two forms are recognised:
      * a comparison "key OP value", where OP is one of the keys of
        COMPARISON_OPERATORS and may be followed by "?";
      * a unary test "key" (value is not None) or "!key" (value is None).

    For comparisons, a missing (None) value in dct yields the text matched
    by the optional "?" marker, i.e. a truthy string when it was given and
    None otherwise.

    Raises ValueError for an unparsable filter part, for a string value used
    with an operator other than = / !=, and for an unparsable numeric value.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # Strings can only be tested for (in)equality
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            # Plain integer first; otherwise let parse_filesize (defined
            # elsewhere) interpret it, retrying with a 'B' suffix appended.
            # parse_filesize is expected to return None on failure.
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # Missing value: passes only if the "?" marker was present
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2099
2100
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The '&'-separated parts must all match; stop at the first failure.
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2106
2107
def match_filter_func(filter_str):
    """Wrap filter_str into a callable that checks an info dict.

    The callable returns None when the info dict passes the filter and a
    human-readable skip message otherwise.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
91410c9b
PH
2116
2117
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP/TTML time expression to seconds as a float.

    Supported forms are an offset in seconds ("12", "12.5", "12.5s") and a
    clock value "H:MM:SS", optionally followed by a fraction separated by
    "." or ":".  Returns None for empty or unrecognised input.
    """
    if not time_expr:
        return

    offset_match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, seconds = clock_match.groups()
        # A ':' before the fraction is treated like a decimal point
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
bf6427d2
YCH
2129
2130
c1c924ab
YCH
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode, HH:MM:SS,mmm."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d drops the fractional part of each component
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
bf6427d2
YCH
2133
2134
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document (text) into SRT text.

    Every <p> element that has a begin time and either an end time or a
    duration becomes one SRT cue; <br> elements inside it become newlines.
    Raises ValueError when the document contains no <p> elements.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })

    class TTMLPElementParser(object):
        # XMLParser target that gathers the text of one paragraph
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Serialise the element and re-parse it through the text collector
        collector = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=collector)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    paras = (
        dfxp.findall(_x('.//ttml:p')) or
        dfxp.findall(_x('.//ttaf1:p')) or
        dfxp.findall('.//p'))

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    out = []
    # Cue numbers follow the paragraph position, so skipped paragraphs
    # leave gaps in the numbering.
    for index, para in enumerate(paras, 1):
        attrs = para.attrib
        begin_time = parse_dfxp_time_expr(attrs.get('begin'))
        end_time = parse_dfxp_time_expr(attrs.get('end'))
        dur = parse_dfxp_time_expr(attrs.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        cue = '%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para))
        out.append(cue)

    return ''.join(out)
2187
2188
66e289ba
S
def cli_option(params, command_option, param):
    """Return [command_option, value] for params[param], or [] if it is unset."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2192
2193
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render the boolean params[param] as a command line option.

    Without a separator the result is [command_option, value]; with one it
    is the single argument command_option + separator + value.  The value
    must be a bool (checked with an assertion).
    """
    flag = params.get(param)
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
2200
2201
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2205
2206
def cli_configuration_args(params, param, default=[]):
    """Return the list stored under params[param], or default if it is unset.

    The stored value must be a list (checked with an assertion).
    """
    # NOTE: the default list object is shared between calls and returned
    # as-is, so callers must not modify the result in place.
    ex_args = params.get(param)
    if ex_args is not None:
        assert isinstance(ex_args, list)
        return ex_args
    return default
2213
2214
39672624
YCH
class ISO639Utils(object):
    """Conversions between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked up; unknown codes give None
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup by value; None when no entry matches
        matches = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matches, None)
2415
2416
4eb10f66
YCH
class ISO3166Utils(object):
    # Two-letter (ISO 3166-1 alpha-2) country code -> English country name
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name

        The lookup is case-insensitive (the code is upper-cased first);
        None is returned for codes that are not in the table.
        """
        return cls._country_map.get(code.upper())
2675
2676
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets a single request pick its own proxy.

    A request selects a proxy through the 'Ytdl-request-proxy' header, which
    is removed from the request before it is passed on.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        # http_open/https_open are installed unconditionally; they call
        # proxy_open with the '__noproxy__' marker as the default proxy.
        # type and meth are bound as lambda defaults so that each handler
        # keeps its own scheme instead of the loop's last value.
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request header overrides whatever proxy was passed in
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5bc880b9
YCH
2696
2697
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt data with OHDave's RSA scheme. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The byte string is reversed before being read as one big hex number
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    return '%x' % pow(payload, exponent, modulus)
81bdc8fd
YCH
2713
2714
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer num in base n.

    table supplies the digit characters (table[0] stands for zero); when it
    is omitted, the first n characters of 0-9a-zA-Z are used, which allows
    bases up to 62.

    Raises ValueError if n exceeds the table length, if num is negative, or
    if a non-zero num is to be encoded in a base smaller than 2.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Repeated floor division never reaches zero for these inputs, so
    # reject them instead of looping forever.
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError('base %d is too small to encode %d' % (n, num))

    # Collect the digits least significant first, then reverse once
    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
f52354a8
YCH
2731
2732
def decode_packed_codes(code):
    """Expand JavaScript that was packed as }('payload',base,count,'a|b|c'.split('|').

    Every word of the payload is the base-N number of an entry in the
    symbol list; it is replaced by that entry, or left as it is when the
    entry is empty.
    """
    mobj = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    packed_code, base, count, symbols = mobj.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    # Map each index, written in the given base, to its replacement
    symbol_table = {}
    for index in reversed(range(count)):
        encoded_index = encode_base_n(index, base)
        symbol_table[encoded_index] = symbols[index] or encoded_index

    return re.sub(
        r'\b(\w+)\b', lambda word: symbol_table[word.group(0)],
        packed_code)