]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
[postprocessor/execafterdownload] Encode command line (closes #13407)
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import email.header
15 import errno
16 import functools
17 import gzip
18 import io
19 import itertools
20 import json
21 import locale
22 import math
23 import operator
24 import os
25 import pipes
26 import platform
27 import random
28 import re
29 import socket
30 import ssl
31 import subprocess
32 import sys
33 import tempfile
34 import traceback
35 import xml.etree.ElementTree
36 import zlib
37
38 from .compat import (
39 compat_HTMLParseError,
40 compat_HTMLParser,
41 compat_basestring,
42 compat_chr,
43 compat_etree_fromstring,
44 compat_expanduser,
45 compat_html_entities,
46 compat_html_entities_html5,
47 compat_http_client,
48 compat_kwargs,
49 compat_os_name,
50 compat_parse_qs,
51 compat_shlex_quote,
52 compat_socket_create_connection,
53 compat_str,
54 compat_struct_pack,
55 compat_struct_unpack,
56 compat_urllib_error,
57 compat_urllib_parse,
58 compat_urllib_parse_urlencode,
59 compat_urllib_parse_urlparse,
60 compat_urllib_parse_unquote_plus,
61 compat_urllib_request,
62 compat_urlparse,
63 compat_xpath,
64 )
65
66 from .socks import (
67 ProxyType,
68 sockssocket,
69 )
70
71
def register_socks_protocols():
    """Teach urlparse that SOCKS schemes carry a netloc component.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs with protocols missing from
    urlparse.uses_netloc are not handled correctly, so each SOCKS scheme
    is appended once to that registry.
    """
    for socks_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if socks_scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(socks_scheme)
79
80
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default headers sent with every HTTP request (see YoutubeDLHandler.http_request).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Alternative User-Agent strings that callers can opt into by name.
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Sentinel distinguishing "no default supplied" from "default is None"
# (used by xpath_element() and friends).
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names keyed by language code, for locale-aware date parsing.
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# Media file extensions recognized elsewhere in the project.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII transliteration; the zip pairs the
# string of accented characters with a chain of replacement strings in order.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

# strptime() patterns tried in order when parsing dates; the two lists below
# extend this base set with day-first and month-first numeric variants.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches "P.A.C.K.E.R."-style packed JavaScript; groups capture the payload,
# radix, symbol count and the '|'-separated symbol table.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
185
186
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Probe the codec: a broken or unknown locale raises here.
        'TEST'.encode(encoding)
        return encoding
    except Exception:
        # Unusable locale -> fall back to a sane default.
        return 'UTF-8'
200
201
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    # The temp file is created in the target's directory so that os.rename
    # stays on the same filesystem (atomic replace on POSIX).
    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best effort cleanup of the temp file, then re-raise the original error.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
254
255
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # Attribute name is interpolated into the expression, so restrict it
        # to a safe character set.
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        # Fallback: ElementTree before 2.7 cannot evaluate [@key=val]
        # predicates, so filter on the attribute dict manually.
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
273
274
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into ElementTree's
    '{namespace-uri}tag' form using the prefix->uri mapping ns_map."""
    expanded = []
    for component in path.split('/'):
        parts = component.split(':')
        if len(parts) == 1:
            # No namespace prefix on this step.
            expanded.append(parts[0])
        else:
            prefix, tag = parts
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
285
286
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath (a single xpath string, or an
    iterable of candidates tried in order).

    On no match: return `default` if one was supplied, raise ExtractorError
    if `fatal`, otherwise return None.
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        # NOTE(review): an empty iterable leaves `n` unbound (NameError
        # below) — callers are expected to pass at least one xpath.
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
308
309
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's text content.

    An element without text is treated the same as a missing element:
    `default` / ExtractorError (when fatal) / None.
    """
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is not None:
        return n.text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML element\'s text %s'
            % (xpath if name is None else name))
    return None
323
324
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute `key` of the first element matching xpath that has it.

    On no match: `default` if supplied, ExtractorError if fatal, else None.
    """
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        if name is None:
            name = '%s[@%s]' % (xpath, key)
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
336
337
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # NOTE: parameter name `id` shadows the builtin but is kept for API stability.
    return get_element_by_attribute('id', id, html)
341
342
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
347
348
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag carrying attribute=value, or None."""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if not matches:
        return None
    return matches[0]
352
353
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # class attributes are space-separated lists, so match class_name as a
    # whole word anywhere inside the attribute (escape_value=False because
    # the pattern itself contains regex syntax).
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
359
360
def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # When escape_value is False the caller supplies a ready-made regex.
    value = re.escape(value) if escape_value else value

    retlist = []
    # Tag name is captured so the closing tag can be matched via \1; other
    # attributes before/after the wanted one are skipped non-greedily.
    for m in re.finditer(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html):
        res = m.group('content')

        # Strip a stray quote pair left around the content by sloppy markup.
        if res.startswith('"') or res.startswith("'"):
            res = res[1:-1]

        retlist.append(unescapeHTML(res))

    return retlist
384
385
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        # Attributes of the most recent start tag; HTMLParser has already
        # entity-decoded the values by the time handle_starttag runs.
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs; value is None for
        # valueless attributes.
        self.attrs = dict(attrs)
394
395
def extract_attributes(html_element):
    """Decode the attributes of a single HTML element string into a dict.

    For example, given
        <el a="foo" B="bar" c="&98;az" d=boz empty= noval entity="&amp;" sq='"' dq="'">
    this returns
        {'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
         'empty': '', 'noval': None, 'entity': '&', 'sq': '"', 'dq': "'"}.

    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attr_parser = HTMLAttributeParser()
    try:
        attr_parser.feed(html_element)
        attr_parser.close()
    except compat_HTMLParseError:
        # Older Python raises HTMLParseError on malformed markup; keep
        # whatever attributes were collected before the failure.
        pass
    return attr_parser.attrs
420
421
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    # Convenience for sanitizing descriptions etc.
    if html is None:
        return None

    # Literal newlines are presentation noise; <br> and paragraph breaks
    # are the real line boundaries.
    cleaned = html.replace('\n', ' ')
    cleaned = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', cleaned)
    cleaned = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', cleaned)
    # Strip remaining tags, then decode entities.
    cleaned = re.sub('<.*?>', '', cleaned)
    return unescapeHTML(cleaned).strip()
437
438
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            # '-' means stdout; prefer the binary buffer on Python 3.
            if sys.platform == 'win32':
                import msvcrt
                # Binary mode prevents '\n' -> '\r\n' translation on Windows.
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            # Permission errors will not be fixed by renaming; give up.
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
469
470
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not a parseable RFC 2822 date.
        return None
    return email.utils.mktime_tz(parsed)
478
479
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        # Transliterate accents to ASCII in restricted mode.
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        # Drop '?' and control characters entirely.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            # Characters forbidden in Windows filenames.
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            # Restricted mode keeps the result ASCII-only.
            return '_'
        return char

    # Handle timestamps
    # (turn 12:34:56 into 12_34_56 before ':' gets the generic replacement)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        # Collapse runs of '_' and trim filler characters from the edges.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
519
520
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # Other platforms accept nearly any character, so leave paths alone.
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        # splitunc() is the pre-2.7 way to detect \\server\share prefixes.
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters invalid in a Windows path component, plus trailing
    # whitespace/dots (which Windows strips), with '#'; keep '.'/'..' steps.
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
537
538
def sanitize_url(url):
    """Prepend protocol-less URLs (//host/path) with the `http:` scheme in
    order to mitigate the number of unwanted failures due to missing
    protocol; other URLs pass through untouched."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
543
544
def sanitized_Request(url, *args, **kwargs):
    # Drop-in replacement for compat_urllib_request.Request that tolerates
    # protocol-relative URLs (see sanitize_url()).
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
547
548
def expand_path(s):
    """Expand shell variables and ~"""
    # compat_expanduser is the project's portable expanduser wrapper (see .compat).
    return os.path.expandvars(compat_expanduser(s))
552
553
def orderedSet(iterable):
    """Return a list of the iterable's elements with duplicates removed,
    keeping first-seen order.

    Membership is tested against a list (not a set) so that unhashable
    elements (e.g. dicts) are supported.
    """
    unique = []
    for item in iterable:
        if item not in unique:
            unique.append(item)
    return unique
561
562
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric character reference: '#123' (decimal) or '#x7B' (hex).
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            # '0x...' form accepted by int(..., 16).
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
592
593
def unescapeHTML(s):
    """Replace all HTML entities (named, decimal, hex) in s with their characters."""
    if s is None:
        return None
    assert type(s) == compat_str

    # The semicolon is captured with the entity name; _htmlentity_transform
    # strips it again.
    return re.sub(
        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
601
602
def get_subprocess_encoding():
    """Return the encoding used to pass data to/from subprocesses."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'
613
614
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess True when the result is used on a subprocess
                          command line rather than with filesystem APIs
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    # Python 2 on other platforms: encode to the subprocess/filesystem
    # encoding, silently dropping unencodable characters.
    return s.encode(get_subprocess_encoding(), 'ignore')
637
638
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename(): turn a byte filename back into text.

    On Python 3, and for values that are already text, this is a no-op.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
648
649
def encodeArgument(s):
    # Encode a subprocess command-line argument (see encodeFilename with
    # for_subprocess=True).
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
657
658
def decodeArgument(b):
    # Inverse of encodeArgument(): decode using the subprocess encoding.
    return decodeFilename(b, True)
661
662
def decodeOption(optval):
    # Decode a command-line option value to text using the locale's
    # preferred encoding; None passes through.
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
671
672
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    Fix: the comparisons are now inclusive — previously exactly 3600
    seconds rendered as '60:00' and exactly 60 seconds as '60', because
    the boundary values fell into the smaller-unit branch.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
680
681
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler, configuring certificate verification
    from params['nocheckcertificate'] across the supported Python versions."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # Hostname checking must be disabled before verify_mode can be
            # set to CERT_NONE.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # Old Pythons: HTTPSHandler accepts no SSLContext at all.
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
705
706
def bug_reports_message():
    """Build the standard footer appended to unexpected-error messages,
    asking the user to update and file a bug report."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return (
        '; please report this issue on https://yt-dl.org/bug .'
        ' Make sure you are using the latest version; %s.'
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        % update_cmd)
716
717
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # All project-specific exceptions below derive from this class so that
    # callers can catch the whole family with one clause.
    pass
721
722
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network-level failures are always treated as expected (they are
        # environmental, not youtube-dl bugs).
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors get the standard bug-report footer appended.
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        # Return the stored traceback formatted as a string, or None.
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
750
751
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        # Keep the offending URL for programmatic inspection.
        self.url = url
757
758
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
762
763
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """
    def __init__(self, msg, countries=None):
        # Geo errors are always "expected" — not youtube-dl bugs.
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        self.msg = msg
        # Countries from which the video is available — presumably
        # two-letter country codes; confirm against callers.
        self.countries = countries
774
775
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
788
789
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
797
798
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
        # Message also kept as an attribute for programmatic access.
        self.msg = msg
809
810
class MaxDownloadsReached(YoutubeDLError):
    """ --max-downloads limit has been reached. """
    pass
814
815
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
823
824
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
840
841
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing extended file attributes (xattrs) fails.

    The failure is classified into self.reason:
      * 'NO_SPACE'       -- disk full or quota exceeded
      * 'VALUE_TOO_LONG' -- attribute value exceeds the filesystem limit
      * 'NOT_SUPPORTED'  -- anything else
    """

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        # Fix: the OS error text is 'Disk quota exceeded' (strerror(EDQUOT));
        # the previous misspelling 'excedded' could never match. The errno
        # check is kept as the primary signal.
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
856
857
class XAttrUnavailableError(YoutubeDLError):
    # Raised when no mechanism for writing xattrs is available at all.
    pass
860
861
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class, honouring the handler's 'source_address'
    option and working around old-Python limitations."""
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        # Port 0 lets the OS pick an ephemeral source port.
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address support: replace connect() with a version
            # that binds the socket manually (and wraps it in TLS for HTTPS).
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
887
888
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker from a header dict.

    When the marker is present, any Accept-Encoding header is dropped too,
    so the server does not compress the response. Without the marker the
    original dict is returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict(
        (k, v) for k, v in headers.items() if k.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
897
898
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # YoutubeDL options dict; read by _create_http_connection().
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        # Route through a SOCKS proxy when the internal Ytdl-socks-proxy
        # header is present; the header itself never reaches the wire.
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        # Try raw deflate first, then fall back to zlib-wrapped deflate —
        # servers send either form.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry decompression, trimming up to 1023 trailing bytes.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
1020
1021
def make_socks_conn_class(base_class, socks_proxy):
    """Derive from base_class an HTTP(S) connection class that tunnels
    through the SOCKS proxy described by the socks_proxy URL."""
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    # NOTE(review): an unrecognized scheme leaves socks_type unbound and
    # triggers a NameError below — callers appear to pass validated schemes.

    def unquote_if_non_empty(s):
        # Credentials may be percent-encoded in the proxy URL.
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, wrap the tunneled socket in TLS after connecting.
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1063
1064
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler honouring youtube-dl parameters and optional
    per-request SOCKS proxying via the Ytdl-socks-proxy header."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        conn_class = self._https_conn_class
        kwargs = {}
        # Forward SSL context / hostname checking when this Python supports it
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            # Swap in a SOCKS-aware connection class and drop the marker header
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(
            functools.partial(_create_http_connection, self, conn_class, True),
            req, **kwargs)
1089
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also handles cookies for HTTPS traffic."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on the next HTTP request in a row if there are
        # non-ASCII characters in the Set-Cookie HTTP header of the last
        # response (see https://github.com/rg3/youtube-dl/issues/6769).
        # A percent-encoding workaround for Set-Cookie/Set-Cookie2 used to
        # live here but is currently disabled; processing is delegated
        # unchanged to the standard HTTPCookieProcessor.
        return compat_urllib_request.HTTPCookieProcessor.http_response(
            self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1112
1113
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns (timedelta, remaining_date_str). The delta is zero for 'Z',
    for a missing designator, or when fewer than 8 chars precede it.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if m is None:
        return datetime.timedelta(), date_str
    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # 'Z' designator: UTC, nothing to offset
        return datetime.timedelta(), date_str
    direction = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(m.group('hours')),
        minutes=direction * int(m.group('minutes')))
    return offset, date_str
1130
1131
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    if date_str is None:
        return None

    # strptime's %S cannot handle fractional seconds; drop them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        fmt = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, fmt) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        # Unparsable date; fall through and return None implicitly
        pass
1149
1150
def date_formats(day_first=True):
    """Return the date-format strings to try, ordered for day-first
    (default) or month-first conventions."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1153
1154
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None

    upload_date = None
    # Commas, AM/PM markers and timezone designators confuse strptime
    date_str = date_str.replace(',', ' ')
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    # Try every known format; a later matching format overrides earlier ones
    for expression in date_formats(day_first):
        try:
            parsed = datetime.datetime.strptime(date_str, expression)
        except ValueError:
            continue
        upload_date = parsed.strftime('%Y%m%d')
    if upload_date is None:
        # Fall back to RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
1181
1182
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string."""
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    # A "PM" marker means a 12-hour clock; compensate by 12 hours below
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Strip AM/PM markers and any trailing timezone abbreviation
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    for expression in date_formats(day_first):
        try:
            parsed = datetime.datetime.strptime(date_str, expression)
        except ValueError:
            continue
        dt = parsed - timezone + datetime.timedelta(hours=pm_delta)
        return calendar.timegm(dt.timetuple())
    # Fall back to RFC 2822 style dates
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
1209
1210
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from *url*, returning *default_ext* when
    no plausible extension is found."""
    if url is None:
        return default_ext
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1222
1223
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name of the form <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
1226
1227
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Plain absolute date
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # timedelta has no month/year units; approximate them in days
    if unit == 'month':
        unit, amount = 'day', amount * 30
    elif unit == 'year':
        unit, amount = 'day', amount * 365
    return today + datetime.timedelta(**{unit + 's': amount})
1255
1256
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    m = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(m.groups()) if m is not None else date_str
1265
1266
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1296
1297
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Python 2 may return bytes; normalize to text
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1306
1307
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Map C file descriptors (1=stdout, 2=stderr) to the Win32
    # GetStdHandle identifiers (STD_OUTPUT_HANDLE / STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    # WriteConsoleW writes UTF-16 text directly to the console, bypassing
    # the (often lossy) ANSI codepage conversion of normal stream writes
    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A handle is only a real console if it is a local character device
        # and GetConsoleMode succeeds on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        # (needs a surrogate pair in UTF-16), or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 BMP characters at a time; a non-BMP character
        # is written on its own as a 2-unit surrogate pair (count == 2)
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1381
1382
def write_string(s, out=None, encoding=None):
    """Write text *s* to stream *out* (default: stderr), coping with byte
    streams, Python 2's byte-oriented stderr and Windows consoles."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # On Windows, prefer writing straight to the console when possible
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream wrapping a binary buffer: encode ourselves
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1403
1404
def bytes_to_intlist(bs):
    """Convert a bytes/str value into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    # Python 2 str: elements are 1-char strings
    return [ord(c) for c in bs]
1412
1413
def intlist_to_bytes(xs):
    """Pack a list of integer byte values back into a bytes object."""
    if not xs:
        return b''
    return compat_struct_pack('{0}B'.format(len(xs)), *xs)
1418
1419
# Cross-platform file locking: defines _lock_file(f, exclusive) and
# _unlock_file(f) with a Win32 (LockFileEx), POSIX (fcntl.flock) or
# unsupported-stub implementation depending on the platform.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure required by
        # LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE, # hFile
        ctypes.wintypes.DWORD, # dwFlags
        ctypes.wintypes.DWORD, # dwReserved
        ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED) # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE, # hFile
        ctypes.wintypes.DWORD, # dwReserved
        ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED) # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the largest possible byte range so the whole file is covered
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED pointer alive on the file object for unlock
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1493
1494
class locked_file(object):
    """File wrapper holding an advisory lock for the duration of a `with`
    block: exclusive for writing/appending, shared for reading."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Could not lock; do not leak the open file
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1524
1525
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to UTF-8 when unknown."""
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
1529
1530
def shell_quote(args):
    """Return *args* joined into a single shell-safe command string."""
    encoding = get_filesystem_encoding()

    def _to_text(a):
        # We may get a filename encoded with 'encodeFilename'
        return a.decode(encoding) if isinstance(a, bytes) else a

    return ' '.join(pipes.quote(_to_text(a)) for a in args)
1540
1541
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with any data already smuggled into the URL
    url, existing = unsmuggle_url(url, {})
    data.update(existing)
    smuggled = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, smuggled)
1550
1551
def unsmuggle_url(smug_url, default=None):
    """Extract (url, data) previously packed by smuggle_url; return
    (smug_url, default) when no data was smuggled."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    payload = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1559
1560
def format_bytes(bytes):
    """Format a byte count as a human-readable string (e.g. '1.00MiB').

    Accepts ints, floats and numeric strings; returns 'N/A' for None.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # Clamp so absurdly large values cannot index past the suffix table
        exponent = min(int(math.log(bytes, 1024.0)), len(suffixes) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1573
1574
def lookup_unit_table(unit_table, s):
    """Parse a '<number> <unit>' string using *unit_table* ({unit: multiplier});
    return the integer value, or None when *s* does not match."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if m is None:
        return None
    # Accept ',' as a decimal separator too
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
1584
1585
def parse_filesize(s):
    """Parse a human-readable file size like '1.5 GB' or '300KiB' into an
    integer byte count, or None when *s* is None or unparsable.

    Note the deliberate quirks in the table below: decimal (1000-based)
    and binary (1024-based) prefixes are both accepted, and some
    miscapitalized forms (e.g. 'kB', 'mB') are mapped to the binary
    multipliers to match common real-world misuse.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1655
1656
def parse_count(s):
    """Parse view/like counters like '1.2M' or '15k' into an int."""
    if s is None:
        return None

    s = s.strip()

    # Plain numbers, possibly with thousands separators
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    # 'k'/'K' = thousands; 'm'/'M'/'kk'/'KK' = millions
    multipliers = {}
    for units, mult in (('kK', 1000), ('mM', 1000 ** 2)):
        for unit in units:
            multipliers[unit] = mult
    multipliers.update({'kk': 1000 ** 2, 'KK': 1000 ** 2})

    return lookup_unit_table(multipliers, s)
1676
1677
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name not in month_names:
        return None
    return month_names.index(name) + 1
1687
1688
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [name[:3] for name in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
1697
1698
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Leave already-escaped entities (&amp;, &lt;, numeric refs, ...) alone
    pattern = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(pattern, '&amp;', xml_str)
1705
1706
def setproctitle(title):
    """Best-effort process title change via libc prctl(PR_SET_NAME)."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    encoded = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(encoded))
    buf.value = encoded
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        pass  # Strange libc, just skip this
1731
1732
def remove_start(s, start):
    """Strip *start* from the beginning of *s* when present (None-safe)."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1735
1736
def remove_end(s, end):
    """Strip *end* from the end of *s* when present (None-safe)."""
    if s is None or not s.endswith(end):
        return s
    return s[:-len(end)]
1739
1740
def remove_quotes(s):
    """Drop one pair of matching single or double quotes wrapping *s*."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1748
1749
def url_basename(url):
    """Return the last path component of *url* (query/fragment excluded)."""
    return compat_urlparse.urlparse(url).path.strip('/').split('/')[-1]
1753
1754
def base_url(url):
    """Return *url* up to and including the last '/' before any query,
    fragment or extra parameter characters."""
    m = re.match(r'https?://[^?#&]+/', url)
    return m.group()
1757
1758
def urljoin(base, path):
    """Join *base* and *path* into an absolute URL; return None unless both
    pieces look usable. Bytes inputs are decoded as UTF-8."""
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not isinstance(path, compat_str) or not path:
        return None
    if re.match(r'^(?:https?:)?//', path):
        # Already absolute (or protocol-relative)
        return path
    if isinstance(base, bytes):
        base = base.decode('utf-8')
    if not isinstance(base, compat_str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
1772
1773
class HEADRequest(compat_urllib_request.Request):
    """A Request whose HTTP verb is always HEAD."""

    def get_method(self):
        return 'HEAD'
1777
1778
class PUTRequest(compat_urllib_request.Request):
    """A Request whose HTTP verb is always PUT."""

    def get_method(self):
        return 'PUT'
1782
1783
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce *v* to int (scaled by invscale/scale) or return *default*.

    When *get_attr* is set, the attribute of that name is read from *v*
    first. None and '' map to *default*; unconvertible values do too.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # TypeError covers non-numeric objects (lists, dicts, ...) that
        # previously escaped the ValueError-only handler
        return default
1796
1797
def str_or_none(v, default=None):
    """Coerce *v* to text, passing None through as *default*."""
    if v is None:
        return default
    return compat_str(v)
1800
1801
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators ('.', ',') and stray '+' signs
    return int(re.sub(r'[,\.\+]', '', int_str))
1808
1809
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to float (scaled by invscale/scale) or return *default*."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError covers non-numeric objects (lists, dicts, ...) that
        # previously escaped the ValueError-only handler
        return default
1817
1818
def strip_or_none(v):
    """Return v.strip(), passing None through unchanged."""
    if v is None:
        return None
    return v.strip()
1821
1822
def parse_duration(s):
    """Parse a duration string ('1:23:45', '1h 2m 3s', '90 min',
    ISO 8601 'PT1H2M3S', ...) into seconds, or None when unparsable."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days = hours = mins = secs = ms = None
    # Colon-separated form: [[[DD:]HH:]MM:]SS[.ms]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        # Unit-suffixed form, covering ISO 8601 durations and plain English
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # Fractional hours/minutes only, e.g. '2.5 hours'
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not m:
                return None
            hours, mins = m.groups()

    # Sum whichever components were captured (same order as before so the
    # floating-point result is bit-identical)
    total = 0
    for value, multiplier in (
            (secs, 1), (mins, 60), (hours, 3600), (days, 86400), (ms, 1)):
        if value:
            total += float(value) * multiplier
    return total
1869
1870
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension ('a.mp4' -> 'a.<ext>.mp4').

    If *expected_real_ext* is given and does not match the actual
    extension, *ext* is appended to the whole name instead."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1877
1878
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the file extension for *ext*.

    If *expected_real_ext* is given and does not match the actual
    extension, *ext* is appended to the whole name instead."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return '{0}.{1}'.format(name, ext)
1884
1885
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # args defaults to None instead of a mutable [] (shared-default pitfall)
    try:
        subprocess.Popen(
            [exe] + (args or []),
            stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found or not executable
        return False
    return exe
1894
1895
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        proc = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out = proc.communicate()[0]
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1913
1914
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version token from program *output* using *version_re*
    (default: 'version <token>'); return *unrecognized* when absent."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1924
1925
class PagedList(object):
    """Base class for lazily-fetched paginated sequences; subclasses must
    provide getslice(start, end)."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1930
1931
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily via pagefunc(pagenum) and can
    optionally cache fetched pages."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        # pagefunc: callable(pagenum) -> iterable of that page's items
        # pagesize: number of items per full page
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a list, fetching only the
        pages that overlap the requested range."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # Absolute item indices covered by this page: [firstid, nextfirstid)
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset of the slice start within this page (0 except possibly
            # on the first overlapping page)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Exclusive offset of the slice end within this page, or None
            # to take the page to its end
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1982
1983
class InAdvancePagedList(PagedList):
    """Paged list where the total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        # pagefunc: callable(pagenum) -> iterable of that page's items
        # pagecount: total number of pages available
        # pagesize: number of items per full page
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a list, fetching only the
        pages that overlap the requested range."""
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Items to drop from the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Remaining number of items still wanted (None = all)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page satisfies the request; truncate and stop
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
2011
2012
def uppercase_escape(s):
    """Expand literal \\UXXXXXXXX escape sequences in *s* into characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
2019
2020
def lowercase_escape(s):
    """Expand literal \\uXXXX escape sequences in *s* into characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
2027
2028
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # Keep RFC 3986 reserved/unreserved punctuation intact
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2034
2035
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # IDNA-encode the host; percent-escape every other component
    return parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment),
    ).geturl()
2046
2047
def read_batch_urls(batch_fd):
    """Read URLs from an open batch file, skipping BOMs, blank lines and
    comment lines starting with '#', ';' or ']'."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # Lines starting with '#', ';' or ']' are treated as comments
        return False if url.startswith(('#', ';', ']')) else url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2062
2063
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2066
2067
def update_url_query(url, query):
    """Merge the params in *query* into *url*'s existing query string."""
    if not query:
        return url
    parsed = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parsed.query)
    params.update(query)
    return compat_urlparse.urlunparse(parsed._replace(
        query=compat_urllib_parse_urlencode(params, True)))
2076
2077
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Clone *req*, optionally overriding URL, body, headers or query,
    while keeping the HTTP method and request metadata."""
    req_headers = req.headers.copy()
    req_headers.update(headers)
    # Preserve the verb by picking the matching Request subclass
    method = req.get_method()
    if method == 'HEAD':
        req_type = HEADRequest
    elif method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        update_url_query(url or req.get_full_url(), query),
        data=data or req.data,
        headers=req_headers,
        origin_req_host=req.origin_req_host,
        unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2096
2097
def _multipart_encode_impl(data, boundary):
    """Serialize data as multipart/form-data using the given boundary.

    Returns (payload_bytes, content_type); raises ValueError when the
    boundary occurs inside an encoded field.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')

    parts = []
    for name, value in data.items():
        if isinstance(name, compat_str):
            name = name.encode('utf-8')
        if isinstance(value, compat_str):
            value = value.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        body = (b'Content-Disposition: form-data; name="' + name
                + b'"\r\n\r\n' + value + b'\r\n')
        if boundary_bytes in body:
            raise ValueError('Boundary overlaps with data')
        parts.append(b'--' + boundary_bytes + b'\r\n')
        parts.append(body)

    parts.append(b'--' + boundary_bytes + b'--\r\n')
    return b''.join(parts), content_type
2118
2119
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            # A generated boundary collided with the payload: retry with a
            # fresh one. A caller-supplied boundary is never replaced.
            if has_specified_boundary:
                raise
            boundary = None
2148
2149
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Return the first usable value in d for key_or_keys.

    key_or_keys may be one key or a list/tuple tried in order. None values
    are always skipped; other falsy values are skipped too unless
    skip_false_values is False. Falls back to default.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        value = d.get(key)
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
2158
2159
def try_get(src, getter, expected_type=None):
    """Apply each getter callable to src and return the first result that
    neither raises a common lookup error nor (when expected_type is given)
    has the wrong type. Returns None when nothing matches."""
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for fn in getters:
        try:
            result = fn(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
2171
2172
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string unchanged when it is already text; otherwise decode it
    to text with the given encoding and error handler."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2175
2176
# MPAA film ratings mapped to a minimum viewer age
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# US TV Parental Guidelines ratings mapped to a minimum viewer age
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2194
2195
def parse_age_limit(s):
    """Normalize an age limit (int in 0..21, 'NN', 'NN+', a US film rating
    or a TV Parental Guidelines rating) to an integer age, or None."""
    # NOTE: exact type() comparison (not isinstance) — bool is rejected
    if type(s) == int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    return TV_PARENTAL_GUIDELINES.get(s)
2207
2208
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback name, parentheses, trailing
    semicolon and trailing line comments), returning the bare payload."""
    JSONP_RE = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(JSONP_RE, r'\g<callback_data>', code)
2217
2218
def js_to_json(code):
    """Convert a JavaScript object literal into valid JSON text.

    Handles single-quoted strings, unquoted identifier keys, hex and octal
    integers, /* */ and // comments, and trailing commas.
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    # (pattern, base) pairs for hex and octal integer literals
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        # Rewrite one matched token into its JSON equivalent
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v.startswith('//') or v == ',':
            # Comments and trailing commas are simply dropped
            return ""

        if v[0] in ("'", '"'):
            # Normalize escapes inside the (possibly single-quoted) string;
            # \x.. escapes become \u00..
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Integers used as object keys must be quoted in JSON
                return '"%d":' % i if v.endswith(':') else '%d' % i

        # Anything else (an unquoted identifier) becomes a quoted string
        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
2258
2259
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def lookup(quality_id):
        # Position in the list is the rank; unknown ids rank lowest
        try:
            return quality_ids.index(quality_id)
        except ValueError:
            return -1
    return lookup
2268
2269
# Default output filename template: "Title-id.ext"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2271
2272
def limit_length(s, length):
    """Truncate s to at most length characters, ending the truncated form
    with '...'; None passes through unchanged."""
    if s is None:
        return None
    suffix = '...'
    if len(s) <= length:
        return s
    return s[:length - len(suffix)] + suffix
2281
2282
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
2285
2286
def is_outdated_version(version, limit, assume_new=True):
    """Return True when version is strictly older than limit; missing or
    unparseable versions count as new or old according to assume_new."""
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, threshold = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < threshold
2294
2295
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updatable when running from a zip bundle or a frozen executable
    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
2301
2302
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2306
2307
def error_to_compat_str(err):
    """Stringify an exception; on Python 2 byte messages are decoded with
    the preferred locale encoding instead of ASCII."""
    message = str(err)
    if sys.version_info[0] < 3:
        # On python 2 error byte string must be decoded with proper
        # encoding rather than ascii
        message = message.decode(preferredencoding())
    return message
2315
2316
def mimetype2ext(mt):
    """Map a MIME type (optionally with parameters) to a file extension.

    None maps to None; unknown subtypes are returned as-is (lowercased).
    """
    if mt is None:
        return None

    FULL_TYPE_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    ext = FULL_TYPE_MAP.get(mt)
    if ext is not None:
        return ext

    # Fall back to the subtype with any ';...' parameters stripped
    subtype = mt.rpartition('/')[2].split(';')[0].strip().lower()

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
    }
    return SUBTYPE_MAP.get(subtype, subtype)
2351
2352
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs string into {'vcodec': ..., 'acodec': ...}.

    Empty input yields {}; a recognized codec marks the other channel
    'none' when absent.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    codec_list = [c.strip() for c in codecs_str.strip().strip(',').split(',')]
    codec_list = [c for c in codec_list if c]
    vcodec = acodec = None
    for full_codec in codec_list:
        prefix = full_codec.split('.')[0]
        if prefix in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            vcodec = vcodec or full_codec
        elif prefix in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            acodec = acodec or full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    # Nothing recognized: two entries are assumed to be video+audio, one
    # entry audio-only. NOTE(review): both dicts carry None values here,
    # mirroring the original behaviour.
    if len(codec_list) == 2:
        return {
            'vcodec': vcodec,
            'acodec': acodec,
        }
    elif len(codec_list) == 1:
        return {
            'vcodec': 'none',
            'acodec': vcodec,
        }
    return {}
2387
2388
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a response, preferring the filename in
    Content-Disposition and falling back to the Content-Type header."""
    getheader = url_handle.headers.get

    content_disposition = getheader('Content-Disposition')
    if content_disposition:
        mobj = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"',
            content_disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
2401
2402
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI embedding data as base64."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2405
2406
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # No viewer limit configured, or content available for everyone
        return False
    return age_limit < content_limit
2415
2416
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Decode according to a leading BOM if present, defaulting to UTF-8
    decoded = None
    for bom, encoding in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(encoding, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', decoded)
2435
2436
def determine_protocol(info_dict):
    """Work out the download protocol of a format dict, either from its
    explicit 'protocol' field or by inspecting the URL."""
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2457
2458
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*table)]
    # Left-pad every column but the last to its widest cell plus one space
    fmt = ' '.join('%-' + compat_str(width + 1) + 's' for width in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in table)
2465
2466
def _match_one(filter_part, dct):
    """Evaluate one filter expression against dct.

    Supports comparisons like 'duration > 600' or 'uploader = "x"' and
    unary presence tests like 'is_live' / '!is_live'. Raises ValueError
    for unparseable parts or string comparisons with non-equality
    operators.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if (m.group('quotedstrval') is not None or
            m.group('strval') is not None or
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/rg3/youtube-dl/issues/11082).
            actual_value is not None and m.group('intval') is not None and
                isinstance(actual_value, compat_str)):
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
            quote = m.group('quote')
            if quote is not None:
                # Unescape quote characters escaped inside the quoted value
                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain integer: try human-readable sizes (e.g. 500k)
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        if actual_value is None:
            # Missing fields only match when the '?' suffix was given
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2535
2536
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    parts = filter_str.split('&')
    return all(_match_one(part, dct) for part in parts)
2542
2543
def match_filter_func(filter_str):
    """Wrap match_str into a match_filter callable: the callable returns
    None when the video passes the filter, or a human-readable skip
    message otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2552
2553
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression ('12.3', '12.3s' or
    'HH:MM:SS(.|:)frac') into seconds; returns None when unparseable."""
    if not time_expr:
        return None

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        hours, minutes = int(mobj.group(1)), int(mobj.group(2))
        # A ':' before the fraction is treated like a decimal point
        seconds = float(mobj.group(3).replace(':', '.'))
        return 3600 * hours + 60 * minutes + seconds
2565
2566
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d truncates the float components toward zero
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2569
2570
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup to SRT text.

    Styling (color, font family/size, bold/italic/underline) is carried
    over as <font>/<b>/<i>/<u> tags. Raises ValueError when the document
    contains no <p> paragraphs.
    """
    # Legacy TTML namespace URIs are rewritten to the current ones below
    LEGACY_NAMESPACES = (
        ('http://www.w3.org/ns/ttml', [
            'http://www.w3.org/2004/11/ttaf1',
            'http://www.w3.org/2006/04/ttaf1',
            'http://www.w3.org/2006/10/ttaf1',
        ]),
        ('http://www.w3.org/ns/ttml#styling', [
            'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    # style id -> resolved style properties; filled in below
    styles = {}
    # default style inherited from the body/div element, if any
    default_style = {}

    class TTMLPElementParser(object):
        # Streaming XMLParser target that renders one <p> subtree to text
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip styling already applied by an ancestor
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close the tags opened for this element, innermost first
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Render a single paragraph element through TTMLPElementParser
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance; repeat until every parent style is known
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id')
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style referenced from <body> or <div> becomes the default style
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2726
2727
def cli_option(params, command_option, param):
    """Return [command_option, value] for the given params key, or [] when
    the option is unset (None)."""
    param = params.get(param)
    if param is None:
        return []
    # Always stringify: previously falsy non-None values (0, False) skipped
    # the conversion yet still passed the `is not None` check, leaking a
    # non-string element into the argument list.
    return [command_option, compat_str(param)]
2733
2734
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean params entry as CLI arguments: either two items
    [option, value] or a single 'option<separator>value' item."""
    param = params.get(param)
    assert isinstance(param, bool)
    value = true_value if param else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
2741
2742
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when the params entry equals expected_value,
    otherwise an empty list."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2746
2747
def cli_configuration_args(params, param, default=[]):
    """Return the extra-arguments list stored under param, or default when
    the key is absent. The stored value must be a list."""
    extra_args = params.get(param)
    if extra_args is None:
        return default
    assert isinstance(extra_args, list)
    return extra_args
2754
2755
class ISO639Utils(object):
    """Conversions between ISO 639-1 (two-letter) and ISO 639-2/T
    (three-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked up, so codes with a
        # region suffix (e.g. 'en-US') resolve via their base language
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup over the table; returns None when not found
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
2956
2957
class ISO3166Utils(object):
    """Lookup of full country names from ISO 3166-1 alpha-2 codes."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive: the code is upper-cased before the lookup
        return cls._country_map.get(code.upper())
3216
3217
class GeoUtils(object):
    """Helpers for faking a source IP address inside a given country
    (used for geo-restriction bypass)."""

    # Major IPv4 address blocks per country
    # (ISO 3166-1 alpha-2 country code -> one representative CIDR block
    # allocated in that country)
    _country_ip_map = {
        'AD': '85.94.160.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '159.117.192.0/21',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '84.112.0.0/13',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AZ': '5.191.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '129.45.128.0/17',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '192.131.134.0/24',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '152.240.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '196.32.200.0/21',
        'CG': '197.214.128.0/17',
        'CH': '85.0.0.0/13',
        'CI': '154.232.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '165.210.0.0/15',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '46.198.0.0/15',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '45.208.0.0/14',
        'GI': '85.115.128.0/19',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '126.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.32.32.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '192.147.231.0/24',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '41.86.0.0/19',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '105.234.0.0/16',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '139.26.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '178.220.0.0/14',
        'RU': '5.136.0.0/13',
        'RW': '105.178.0.0/15',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '154.96.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '152.56.0.0/14',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '197.215.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '197.220.64.0/19',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '93.72.0.0/13',
        'UG': '154.224.0.0/13',
        'US': '3.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '82.215.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '24.92.144.0/20',
        'VE': '186.88.0.0/13',
        'VG': '172.103.64.0/18',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '165.56.0.0/13',
        'ZW': '41.85.192.0/19',
    }

    @classmethod
    def random_ipv4(cls, code):
        """Return a random IPv4 address (as a text string) drawn from the
        address block registered for the given country code, or None if
        the code is unknown."""
        block = cls._country_ip_map.get(code.upper())
        if not block:
            return None
        addr, preflen = block.split('/')
        # Lowest address of the block as a 32-bit integer
        # (assumes addr is the network base address, i.e. host bits are
        # zero -- true for all entries in the table above)
        addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
        # Highest address: set all host bits
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return compat_str(socket.inet_ntoa(
            compat_struct_pack('!L', random.randint(addr_min, addr_max))))
3470
3471
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler variant that honours a per-request
    'Ytdl-request-proxy' header in addition to the global proxy map."""

    def __init__(self, proxies=None):
        # Install default http/https handlers that funnel every request
        # through proxy_open() with no proxy selected yet; default
        # arguments bind the current values of scheme/meth per handler
        for scheme in ('http', 'https'):
            setattr(
                self, '%s_open' % scheme,
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy header overrides the default proxy
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = req_proxy

        if proxy == '__noproxy__':
            # Direct connection explicitly requested
            return None
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            # SOCKS proxying is handled by youtube-dl's own http/https
            # handlers, which wrap the socket themselves
            req.add_header('Ytdl-socks-proxy', proxy)
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3495
3496
3497 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3498 # released into Public Domain
3499 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3500
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    # Emit the value 32 bits at a time, most significant word first
    words = []
    while n > 0:
        words.append(compat_struct_pack('>I', n & 0xffffffff))
        n >>= 32
    words.reverse()
    # Drop leading zero bytes from the first word
    s = b''.join(words).lstrip(b'\000')
    if not s:
        # Only happens when n == 0
        s = b'\000'
    # Pad the front back up to a multiple of blocksize
    if blocksize > 0 and len(s) % blocksize:
        s = b'\000' * (blocksize - len(s) % blocksize) + s
    return s
3529
3530
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # Left-pad with zero bytes to a multiple of 4 so the input can be
    # consumed as big-endian 32-bit words
    remainder = len(s) % 4
    if remainder:
        s = b'\000' * (4 - remainder) + s
    acc = 0
    for offset in range(0, len(s), 4):
        word = compat_struct_unpack('>I', s[offset:offset + 4])[0]
        acc = (acc << 32) | word
    return acc
3546
3547
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The data is interpreted as a little-endian integer: reverse the
    # bytes, then read the hex representation as a number
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
3563
3564
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    @raises ValueError if data does not fit in length with 11 bytes overhead
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # PKCS#1 v1.5 (RFC 8017, EME-PKCS1-v1_5): the padding string must
    # consist of *nonzero* octets, since the zero byte appended below is
    # the separator between padding and data -- so draw from 1..254,
    # never 0
    pseudo_random = [random.randint(1, 254) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
3578
3579
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer num in base n, using the given
    digit table (defaults to 0-9a-zA-Z truncated to n symbols)."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    table = table or FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Collect digits least-significant first, then reverse
    digits = []
    while num:
        num, rem = divmod(num, n)
        digits.append(table[rem])
    return ''.join(reversed(digits))
3596
3597
def decode_packed_codes(code):
    """Decode JavaScript 'p.a.c.k.e.r.'-style packed code into plain
    source, substituting each base-n token with its symbol."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Token -> replacement; an empty symbol means the token stands for
    # itself
    symbol_table = {}
    for idx in range(count - 1, -1, -1):
        token = encode_base_n(idx, base)
        symbol_table[token] = symbols[idx] or token

    return re.sub(
        r'\b(\w+)\b', lambda m: symbol_table[m.group(0)],
        obfuscated_code)
3614
3615
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list (KEY=value pairs, values optionally
    double-quoted) into a dict of unquoted string values."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        # Strip surrounding double quotes from quoted values
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
3623
3624
def urshift(val, n):
    """Unsigned (logical) right shift of a 32-bit value by n bits."""
    # Map a negative value to its unsigned 32-bit representation first
    if val < 0:
        val += 0x100000000
    return val >> n
3627
3628
3629 # Based on png2str() written by @gdkchan and improved by @yokrysty
3630 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a PNG image into (width, height, pixels).

    pixels is a list of rows; each row is a flat list of reconstructed
    byte values (3 bytes per pixel), so pixels[y][x] addresses a single
    color byte, not a whole pixel.

    NOTE(review): only the byte layout of non-interlaced, 8-bit-per-
    channel RGB images is handled -- bit depth, color type and interlace
    fields of the IHDR chunk are not inspected. Confirm inputs match.

    Raises IOError for a missing PNG signature/IHDR or if no IDAT data
    is found.

    Reference: https://www.w3.org/TR/PNG/
    """
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Big-endian unsigned integer of 1, 2 or 4 bytes
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Split the rest of the file into chunks:
    # length (4) + type (4) + data (length) + CRC (4, unchecked)
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # IHDR is guaranteed to be the first chunk (verified above)
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # All IDAT chunks concatenated form a single zlib stream
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Each scanline is one filter-type byte followed by stride bytes
    # (3 bytes per pixel)
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        # Fetch an already-reconstructed byte by its flat index
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # 'left' is the matching byte of the previous pixel (3 bytes
            # back); 'up' is the matching byte one scanline above; both
            # default to 0 at the image edges
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            # Undo the per-scanline filter (PNG spec, section 9)
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                # Paeth predictor: pick whichever of a/b/c is closest to
                # p = a + b - c
                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
3734
3735
def write_xattr(path, key, value):
    """Set extended attribute `key` to `value` (bytes) on the file at `path`.

    Tries, in order: the pyxattr/xattr Python modules; NTFS Alternate
    Data Streams on Windows; the setfattr or xattr command line tools.

    Raises XAttrMetadataError when setting the attribute fails, and
    XAttrUnavailableError when no usable implementation is available.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            # ':' separates the stream name, so it must not occur in key
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # CLI tools take the value as a text argument
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
3818
3819
def random_birthday(year_field, month_field, day_field):
    """Map the given form field names to a random plausible birthday
    (year 1950-1995), with every value rendered as a string."""
    birthday = {
        year_field: random.randint(1950, 1995),
        month_field: random.randint(1, 12),
        day_field: random.randint(1, 31),
    }
    return dict((field, str(value)) for field, value in birthday.items())