]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
[dailymotion:playlist] fix extraction (closes #16894)
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import email.header
15 import errno
16 import functools
17 import gzip
18 import io
19 import itertools
20 import json
21 import locale
22 import math
23 import operator
24 import os
25 import platform
26 import random
27 import re
28 import socket
29 import ssl
30 import subprocess
31 import sys
32 import tempfile
33 import traceback
34 import xml.etree.ElementTree
35 import zlib
36
37 from .compat import (
38 compat_HTMLParseError,
39 compat_HTMLParser,
40 compat_basestring,
41 compat_chr,
42 compat_ctypes_WINFUNCTYPE,
43 compat_etree_fromstring,
44 compat_expanduser,
45 compat_html_entities,
46 compat_html_entities_html5,
47 compat_http_client,
48 compat_kwargs,
49 compat_os_name,
50 compat_parse_qs,
51 compat_shlex_quote,
52 compat_socket_create_connection,
53 compat_str,
54 compat_struct_pack,
55 compat_struct_unpack,
56 compat_urllib_error,
57 compat_urllib_parse,
58 compat_urllib_parse_urlencode,
59 compat_urllib_parse_urlparse,
60 compat_urllib_parse_unquote_plus,
61 compat_urllib_request,
62 compat_urlparse,
63 compat_xpath,
64 )
65
66 from .socks import (
67 ProxyType,
68 sockssocket,
69 )
70
71
def register_socks_protocols():
    """Teach urlparse that SOCKS URL schemes carry a network location.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs with protocols not listed in
    urlparse.uses_netloc are not handled correctly.
    """
    known = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in known:
            known.append(scheme)
79
80
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default headers added to every HTTP request (see YoutubeDLHandler.http_request).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Alternative User-Agent strings extractors can opt into.
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Unique sentinel meaning "no default supplied"; lets None be a valid default.
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names per language, used when parsing localized dates.
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# Media file extensions recognized by format-guessing helpers.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

# strptime() formats tried in order when parsing free-form date strings.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
)

# Variants for dates whose day/month order is ambiguous: prefer day-first.
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# ... and the month-first counterparts.
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches "}('...',NN,NN,'...'.split('|')" payloads of packed JavaScript.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches <script type="application/ld+json">...</script> blocks.
JSON_LD_RE = r'(?is)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
188
189
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        enc = locale.getpreferredencoding()
        # Probe that the reported codec actually works.
        'TEST'.encode(enc)
    except Exception:
        return 'UTF-8'
    return enc
203
204
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
        # NOTE(review): both lambdas ignore their argument and close over fn;
        # harmless here since they are only ever called with fn below.
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    # Create the temporary file next to the target so the final os.rename()
    # stays on one filesystem.
    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temporary file before re-raising.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
257
258
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # The attribute name is interpolated into the XPath expression, so
        # restrict it to characters that cannot break out of the predicate.
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    # Python 2.6 ElementTree does not support attribute predicates in
    # find(); emulate them by scanning every match manually.
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
273
274 # On python2.6 the xml.etree.ElementTree.Element methods don't support
275 # the namespace parameter
276
277
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into ElementTree's '{uri}tag'
    form, resolving each prefix through ns_map."""
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
288
289
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath (or any of a list of xpaths).

    Returns `default` when given and nothing matches; raises ExtractorError
    when nothing matches and fatal is set; otherwise returns None.
    """
    def search(xp):
        return node.find(compat_xpath(xp))

    if isinstance(xpath, (str, compat_str)):
        found = search(xpath)
    else:
        for candidate in xpath:
            found = search(candidate)
            if found is not None:
                break

    if found is not None:
        return found
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element %s' % name)
    return None
311
312
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's text."""
    elem = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if elem is None or elem == default:
        return elem
    if elem.text is not None:
        return elem.text
    # Element exists but carries no text.
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
326
327
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute `key` of the element matching xpath[@key],
    honouring the same default/fatal contract as xpath_element()."""
    elem = find_xpath_attr(node, xpath, key)
    if elem is not None:
        return elem.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        label = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % label)
    return None
339
340
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed
    HTML document, or None when absent."""
    return get_element_by_attribute('id', id, html)
344
345
def get_element_by_class(class_name, html):
    """Return the content of the first tag carrying the given class in the
    passed HTML document, or None when there is none."""
    matches = get_elements_by_class(class_name, html)
    return matches[0] if matches else None
350
351
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag whose `attribute` equals `value`,
    or None when there is no such tag."""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    return matches[0] if matches else None
355
356
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed
    HTML document as a list."""
    # A class attribute may hold several space-separated names, hence the
    # \b-delimited pattern instead of an exact match.
    value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', value_re, html, escape_value=False)
362
363
def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # `value` may already be a regex fragment (escape_value=False), e.g. from
    # get_elements_by_class().
    value = re.escape(value) if escape_value else value

    retlist = []
    # Verbose regex: opening tag, any attributes, the attribute we want
    # (optionally quoted), more attributes, then the content up to the
    # matching close tag (assumes no nested tag of the same name).
    for m in re.finditer(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html):
        res = m.group('content')

        # Strip a stray surrounding quote pair left by sloppy markup.
        if res.startswith('"') or res.startswith("'"):
            res = res[1:-1]

        retlist.append(unescapeHTML(res))

    return retlist
387
388
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        # Attribute mapping of the last start tag seen; callers feed a single
        # element, so this ends up being the element of interest.
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
397
398
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attr_parser = HTMLAttributeParser()
    try:
        attr_parser.feed(html_element)
        attr_parser.close()
    except compat_HTMLParseError:
        # Older Pythons raise on malformed HTML; return whatever attributes
        # were collected before the failure.
        pass
    return attr_parser.attrs
423
424
def clean_html(html):
    """Clean an HTML snippet into a readable string (None passes through)."""
    if html is None:  # Convenience for sanitizing descriptions etc.
        return None

    # Literal newlines become spaces; <br> and </p><p> become real newlines.
    text = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html.replace('\n', ' '))
    text = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Drop the remaining tags, then decode HTML entities.
    text = re.sub('<.*?>', '', text)
    return unescapeHTML(text).strip()
440
441
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                # Switch stdout to binary mode so media data is not mangled
                # by CRLF translation.
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors cannot be fixed by renaming; propagate as-is.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
472
473
def timeconvert(timestr):
    """Convert an RFC 2822 date string into a Unix timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
481
482
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        # Transliterate accented letters to plain ASCII in restricted mode.
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        # Drop '?', control characters and DEL outright.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            # Characters forbidden in Windows filenames.
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            # Restricted mode allows ASCII only.
            return '_'
        return char

    # Handle timestamps: rewrite 12:34:56 as 12_34_56 before ':' replacement
    # would turn it into ' -' / '_-'.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        # Collapse underscore runs introduced by the substitutions above.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        # A leading dot would hide the file on POSIX systems.
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
522
523
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # Only Windows forbids characters such as ':' or '?' in path components.
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        # splitdrive() only learned about UNC paths in Python 2.7.
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace forbidden characters and trailing dots/whitespace with '#',
    # leaving '.'/'..' components untouched.
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
540
541
def sanitize_url(url):
    """Normalize a URL before it is requested.

    Prepend protocol-less URLs with the `http:` scheme in order to mitigate
    the number of unwanted failures due to missing protocol, and repair a
    couple of scheme typos observed in the wild.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    COMMON_TYPOS = (
        # https://github.com/rg3/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        fixed = re.sub(mistake, fixup, url)
        if fixed != url:
            return fixed
    return url
558
559
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request, passing the URL through sanitize_url() first."""
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
562
563
def expand_path(s):
    """Expand shell variables and '~' in the given path."""
    return os.path.expandvars(compat_expanduser(s))
567
568
def orderedSet(iterable):
    """Return a list of the unique elements of iterable, keeping the order
    in which each element was first seen."""
    # Membership is tested with == on a list (not a set) so that unhashable
    # elements remain supported.
    unique = []
    for item in iterable:
        if item not in unique:
            unique.append(item)
    return unique
576
577
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    # Strip the trailing ';'.
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric character reference: decimal '#160' or hexadecimal '#x202e'.
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
607
608
def unescapeHTML(s):
    """Replace all HTML entities in s with their characters.

    None passes through unchanged; anything else must be a text string.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def _replace(mobj):
        return _htmlentity_transform(mobj.group(1))

    return re.sub(r'&([^&;]+;)', _replace, s)
616
617
def get_subprocess_encoding():
    """Return the encoding to use when exchanging data with subprocesses."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    return sys.getfilesystemencoding() or 'utf-8'
628
629
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """
    # Returns s unchanged wherever Unicode filesystem APIs are available,
    # otherwise s encoded to bytes with the subprocess encoding.

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
652
653
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename(): turn an encoded filename back into text.

    Non-bytes input (including everything on Python 3) passes through as-is.
    """
    if sys.version_info >= (3, 0):
        return b
    if not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
663
664
def encodeArgument(s):
    """Encode a command-line argument for use with subprocesses."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
672
673
def decodeArgument(b):
    """Decode a value received from a subprocess (inverse of encodeArgument)."""
    return decodeFilename(b, True)
676
677
def decodeOption(optval):
    """Decode a command-line option value to text using the locale encoding."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
686
687
def formatSeconds(secs):
    """Format a duration in seconds as '[H:]M:SS'-style text.

    Fixes an off-by-one at the exact boundaries of the original
    implementation: strict '>' comparisons made 60 render as '60' and 3600
    as '60:00'; they now render as '1:00' and '1:00:00'.
    """
    mins, secs = divmod(int(secs), 60)
    hours, mins = divmod(mins, 60)
    if hours:
        return '%d:%02d:%02d' % (hours, mins, secs)
    if mins:
        return '%d:%02d' % (mins, secs)
    return '%d' % secs
695
696
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honouring the 'nocheckcertificate'
    option, using the best SSL context the running Python provides."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # Disable both hostname and certificate verification.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        # SSLContext exists but create_default_context does not; build a
        # context manually with the system CA paths.
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
720
721
def bug_reports_message():
    """Return the standard bug-report plea appended to unexpected errors."""
    update_cmd = (
        'type youtube-dl -U to update' if ytdl_is_updateable()
        else 'see https://yt-dl.org/update on how to update')
    msg = ('; please report this issue on https://yt-dl.org/bug .'
           ' Make sure you are using the latest version; %s.'
           ' Be sure to call youtube-dl with the --verbose flag and include its complete output.')
    return msg % update_cmd
731
732
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # All youtube-dl specific exceptions derive from this class so callers
    # can catch them uniformly.
    pass
736
737
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network errors and timeouts are always "expected" (not bugs), so
        # no bug-report plea is appended for them.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        # Return the stored traceback formatted as a string, or None when no
        # traceback was supplied.
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
765
766
class UnsupportedError(ExtractorError):
    """Raised for URLs no extractor supports; always an "expected" error."""
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        # Keep the offending URL available for programmatic inspection.
        self.url = url
772
773
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
777
778
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """
    def __init__(self, msg, countries=None):
        # Geo restrictions are never youtube-dl bugs, hence expected=True.
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        self.msg = msg
        # Optional list of countries supplied by the extractor.
        self.countries = countries
789
790
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
803
804
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
812
813
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
        # Keep the message accessible as an attribute as well.
        self.msg = msg
824
825
class MaxDownloadsReached(YoutubeDLError):
    """ --max-downloads limit has been reached. """
    pass
829
830
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
838
839
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
855
856
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing an extended file attribute fails.

    The failure is classified into self.reason:
      'NO_SPACE'       -- disk full or quota exceeded
      'VALUE_TOO_LONG' -- attribute value exceeds the filesystem limit
      'NOT_SUPPORTED'  -- anything else
    """
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Classify by errno when available, falling back to substring
        # matching on the OS error message. Fixed typo: the actual EDQUOT
        # message is 'Disk quota exceeded' (not 'excedded'), so the old
        # message-based check could never match.
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
871
872
class XAttrUnavailableError(YoutubeDLError):
    # NOTE(review): raised by the xattr helpers elsewhere in the project when
    # no extended-attribute mechanism can be used -- confirm at call sites.
    pass
875
876
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class, optionally binding it to the configured
    source address; used (via functools.partial) as the connection factory
    handed to urllib's do_open()."""
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address support; monkey-patch connect() to create a
            # socket bound to the requested address ourselves.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
902
903
def handle_youtubedl_headers(headers):
    """Strip youtube-dl's internal header switches from a header dict.

    The pseudo-header 'Youtubedl-no-compression' is never sent over the
    wire; when present, it removes any Accept-Encoding header and is then
    dropped itself. Other header dicts are returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict(
        (key, value) for key, value in headers.items()
        if key.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
912
913
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # YoutubeDL options dict; consulted e.g. for 'source_address'.
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        # Internal pseudo-header used to route this request through a SOCKS
        # proxy; it must not leak onto the wire.
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        # Try raw deflate first (some servers omit the zlib header), then
        # fall back to standard zlib decompression.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        # Drop internal switches such as 'Youtubedl-no-compression'.
        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic goes through the same request/response processing.
    https_request = http_request
    https_response = http_response
1035
1036
def make_socks_conn_class(base_class, socks_proxy):
    """Derive from *base_class* an HTTP(S) connection class that tunnels
    through the SOCKS proxy described by the *socks_proxy* URL.

    Supported URL schemes: socks5, socks4a, and socks/socks4 (treated alike).
    Credentials, if present, are percent-decoded from the URL.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    # NOTE(review): an unrecognized scheme leaves socks_type unbound and
    # surfaces as a NameError below; callers are expected to validate schemes.
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        # Username/password may be percent-encoded in the proxy URL
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            # Replace the plain socket with a SOCKS-aware one
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, wrap the established SOCKS socket in TLS
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1078
1079
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPSHandler supporting a custom connection class and per-request
    SOCKS proxying signalled via the internal Ytdl-socks-proxy header."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        # Connection class used to open sockets (may carry cert options)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        # Stored for parity with other handlers; not read in this method
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        # Forward the SSL options the parent handler was constructed with
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # SOCKS proxy is smuggled in a private header; strip it so it is
        # never sent over the wire
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)
1103
1104
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor that applies the same cookie handling to HTTPS
    requests/responses as to HTTP ones."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # This workaround is currently disabled (kept for reference):
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1127
1128
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns (offset, remainder) where offset is a datetime.timedelta
    (zero for 'Z', no designator, or a space-separated zone name) and
    remainder is the string with the designator removed.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]
        sign = m.group('sign')
        if sign:
            direction = 1 if sign == '+' else -1
            offset = datetime.timedelta(
                hours=direction * int(m.group('hours')),
                minutes=direction * int(m.group('minutes')))
        else:
            offset = datetime.timedelta()
    else:
        offset = datetime.timedelta()
    return offset, date_str
1145
1146
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """Return a UNIX timestamp for an ISO 8601 date string, or None.

    Fractional seconds are discarded. When *timezone* is None the UTC
    offset is extracted from the string itself.
    """
    if date_str is None:
        return None

    # Drop fractional seconds; strptime's %S does not accept them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        dt = datetime.datetime.strptime(
            date_str, '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)) - timezone
    except ValueError:
        return None
    return calendar.timegm(dt.timetuple())
1164
1165
def date_formats(day_first=True):
    """Select the tuple of strptime formats for day-first or month-first
    numeric dates."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1168
1169
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD, or None."""
    if date_str is None:
        return None

    # Commas never carry meaning here
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    upload_date = None
    # Try every known format; the last one that parses wins
    for fmt in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Fall back to the RFC 2822 parser
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
1196
1197
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp for a free-form date string, or None.

    day_first controls how ambiguous numeric dates are read (DD/MM vs MM/DD).
    """
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    # A "PM" marker means a 12-hour clock: shift by 12 hours after parsing
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    # Fall back to the RFC 2822 parser
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
1229
1230
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from *url*, falling back to *default_ext*."""
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Handle URLs like http://example.com/foo/bar.mp4/?download
    stripped = guess.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
1242
1243
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
1246
1247
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Plain absolute date
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # Approximate months/years as fixed numbers of days
    if unit == 'month':
        unit, amount = 'day', amount * 30
    elif unit == 'year':
        unit, amount = 'day', amount * 365
    return today + datetime.timedelta(**{unit + 's': amount})
1275
1276
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(match.groups()) if match else date_str
1285
1286
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1316
1317
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    # On Python 2 platform.platform() may return bytes; decode to text
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res
1326
1327
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070
    # Uses WriteConsoleW so Unicode text renders correctly on the Windows
    # console, which the default byte-oriented write would mangle.

    import ctypes
    import ctypes.wintypes

    # Map stdout/stderr file descriptors to GetStdHandle IDs
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # WriteConsoleW only works on a real console handle
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write BMP runs in chunks; non-BMP chars go one surrogate pair at a time
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1401
1402
def write_string(s, out=None, encoding=None):
    """Write text *s* to *out* (default sys.stderr), handling byte streams,
    Windows consoles and explicit encodings; flushes the stream."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # Windows consoles need WriteConsoleW for correct Unicode output
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Text stream over a byte buffer: encode ourselves so we control
        # the encoding and error handling
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
1423
1424
def bytes_to_intlist(bs):
    """Turn a byte string into the list of its integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3: indexing bytes yields ints
        return list(bs)
    return [ord(ch) for ch in bs]  # Python 2: indexing yields 1-char strs
1432
1433
def intlist_to_bytes(xs):
    """Pack a sequence of byte values (0-255) back into a byte string."""
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)
1438
1439
# Cross-platform file locking: defines _lock_file(f, exclusive) and
# _unlock_file(f) using LockFileEx/UnlockFileEx on Windows, fcntl.flock
# elsewhere, and always-raising stubs where neither is available (Jython).
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED struct passed to Lock/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: low/high 32-bit halves of the byte count
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED pointer alive for the matching unlock call
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1513
1514
class locked_file(object):
    """Context manager wrapping a file with an advisory inter-process lock.

    Read mode takes a shared lock; write/append modes take an exclusive one.
    The lock is acquired in __enter__ and released (and the file closed)
    in __exit__.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            # Never leak the open file if locking failed
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1544
1545
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to UTF-8 when unknown."""
    return sys.getfilesystemencoding() or 'utf-8'
1549
1550
def shell_quote(args):
    """Quote each argument for safe shell display and join with spaces."""
    encoding = get_filesystem_encoding()

    def _as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join(compat_shlex_quote(_as_text(arg)) for arg in args)
1560
1561
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with any data already smuggled into the URL
    url, existing = unsmuggle_url(url, {})
    data.update(existing)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata
1570
1571
def unsmuggle_url(smug_url, default=None):
    """Extract data embedded by smuggle_url; (url, default) when none."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    data = json.loads(compat_parse_qs(sdata)['__youtubedl_smuggle'][0])
    return url, data
1579
1580
def format_bytes(bytes):
    """Render a byte count as a human-readable string (e.g. '1.50KiB');
    None becomes 'N/A'."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    return '%.2f%s' % (float(bytes) / float(1024 ** exponent), suffix)
1593
1594
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number><unit>' in *s* using *unit_table*
    (unit -> multiplier); return the integer value or None."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if m is None:
        return None
    # Comma is accepted as a decimal separator
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
1604
1605
def parse_filesize(s):
    """Parse a human-readable file size ('5 MiB', '1.2GB', ...) into an
    integer number of bytes, or None when *s* is None or unparseable."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1675
1676
def parse_count(s):
    """Parse a view/like count like '1,000' or '1.2M' into an int, or None."""
    if s is None:
        return None

    s = s.strip()

    # Purely numeric (possibly with separators): delegate to str_to_int
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1696
1697
def parse_resolution(s):
    """Extract width/height from strings like '1920x1080', '720p' or '4k';
    returns a (possibly empty) dict."""
    if s is None:
        return {}

    for pattern, build in (
            # WxH form carries both dimensions
            (r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b',
             lambda m: {'width': int(m.group('w')), 'height': int(m.group('h'))}),
            # 720p / 1080i style: height only
            (r'\b(\d+)[pPiI]\b', lambda m: {'height': int(m.group(1))}),
            # 4k / 8k: height is k * 540
            (r'\b([48])[kK]\b', lambda m: {'height': int(m.group(1)) * 540})):
        m = re.search(pattern, s)
        if m:
            return build(m)

    return {}
1718
1719
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name in month_names:
        return month_names.index(name) + 1
    return None
1729
1730
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbrevs = [name[:3] for name in ENGLISH_MONTH_NAMES]
    try:
        return abbrevs.index(abbrev) + 1
    except ValueError:
        return None
1739
1740
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Negative lookahead keeps existing entity references intact
    bare_amp = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_amp, '&amp;', xml_str)
1747
1748
def setproctitle(title):
    """Best-effort: set the process title via libc prctl(PR_SET_NAME);
    silently does nothing where that is unavailable."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1773
1774
def remove_start(s, start):
    """Strip *start* from the beginning of *s* if present (None passes
    through unchanged)."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1777
1778
def remove_end(s, end):
    """Strip *end* from the end of *s* if present (None passes through).

    Guards against an empty *end*: the original `s[:-len(end)]` slice with
    len(end) == 0 evaluated to s[:0] and wrongly returned '' for any input.
    """
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
1781
1782
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1790
1791
def url_basename(url):
    """Return the last path segment of *url* (query/fragment excluded by
    urlparse)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
1795
1796
def base_url(url):
    """Return the URL up to and including the last '/' before any query,
    fragment or '&'."""
    m = re.match(r'https?://[^?#&]+/', url)
    return m.group()
1799
1800
def urljoin(base, path):
    """Join *base* and *path* into an absolute URL; None when the inputs
    cannot form one. Absolute (or protocol-relative) paths win outright."""
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not isinstance(path, compat_str) or not path:
        return None
    if re.match(r'^(?:https?:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode('utf-8')
    base_is_usable = isinstance(base, compat_str) and re.match(
        r'^(?:https?:)?//', base)
    if not base_is_usable:
        return None
    return compat_urlparse.urljoin(base, path)
1814
1815
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is forced to HEAD (urllib would otherwise
    pick GET/POST based on the presence of data)."""
    def get_method(self):
        return 'HEAD'
1819
1820
class PUTRequest(compat_urllib_request.Request):
    """Request whose HTTP method is forced to PUT."""
    def get_method(self):
        return 'PUT'
1824
1825
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert *v* to int scaled by invscale/scale; *default* on failure.

    If *get_attr* is given, v is first replaced by getattr(v, get_attr, None).
    None and '' map to *default*.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # TypeError: non-numeric, non-string values (e.g. lists, dicts)
        # previously escaped and crashed callers; treat them as unparseable
        return default
1838
1839
def str_or_none(v, default=None):
    """compat_str(v), or *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
1842
1843
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and sign padding before conversion
    return int(re.sub(r'[,\.\+]', '', int_str))
1850
1851
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float scaled by invscale/scale; *default* on failure."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError: non-numeric, non-string values should fall back to the
        # default rather than crash, mirroring int_or_none
        return default
1859
1860
def bool_or_none(v, default=None):
    """Pass booleans through unchanged; everything else becomes *default*."""
    if isinstance(v, bool):
        return v
    return default
1863
1864
def strip_or_none(v):
    """v.strip(), propagating None."""
    return v.strip() if v is not None else None
1867
1868
def url_or_none(url):
    """Return a stripped URL if it looks like an http(s) or
    protocol-relative URL, else None."""
    if not url or not isinstance(url, compat_str):
        return None
    url = url.strip()
    if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url):
        return url
    return None
1874
1875
def parse_duration(s):
    """Parse a duration string into a float number of seconds, or None.

    Accepted shapes: '[[[DD:]HH:]MM:]SS[.ms]', ISO 8601-like 'PT1H2M3S' /
    verbose '2h 3min 4.5s' (years/months/weeks are matched but ignored),
    and 'X hours' / 'Y minutes' with fractional values.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    # Colon-separated clock form
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        # ISO 8601-ish / verbose unit form (verbose regex: whitespace ignored)
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # Fractional 'X hours' / 'Y minutes' form
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        # ms still carries its leading '.', so this adds the fraction
        duration += float(ms)
    return duration
1932
1933
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the file's extension (a.mp4 -> a.ext.mp4).

    If *expected_real_ext* is given and the actual extension differs,
    *ext* is appended after the whole name instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1940
1941
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the file's extension for *ext*.

    If *expected_real_ext* is given and the actual extension differs,
    *ext* is appended to the full name instead of replacing.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        base = filename
    else:
        base = name
    return '{0}.{1}'.format(base, ext)
1947
1948
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # args defaults to None instead of a mutable [] (shared-default pitfall);
    # behavior for all existing callers is unchanged.
    try:
        subprocess.Popen(
            [exe] + (args or []),
            stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
1957
1958
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # NOTE(review): args defaults to a shared list but is never mutated here.
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1976
1977
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from program *output* using *version_re*;
    return *unrecognized* when nothing matches."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1987
1988
class PagedList(object):
    """Base class for lazily paged result lists; subclasses implement
    getslice(start, end)."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1993
1994
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages on demand via pagefunc(pagenum) and can
    optionally cache fetched pages."""

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list, fetching only the
        pages intersecting that range (end=None means "until exhausted")."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
                if self._use_cache:
                    self._cache[pagenum] = page_results

            # Offset of the slice start within this page (0 for later pages)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Exclusive cut-off within this page, when end falls inside it
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
2045
2046
class InAdvancePagedList(PagedList):
    """PagedList for sources where the total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list."""
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Entries to drop from the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Remaining number of entries wanted (None = all)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
2074
2075
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(m):
        return decode(m.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
2082
2083
def lowercase_escape(s):
    """Decode literal \\uXXXX escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(m):
        return decode(m.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _expand, s)
2090
2091
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2 quote() needs bytes input for non-ASCII text
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # The safe-set keeps all RFC 3986 reserved/unreserved delimiters intact
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2097
2098
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        # Non-ASCII host names become punycode
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
2109
2110
def read_batch_urls(batch_fd):
    """Read a batch file object and return the list of URLs in it.

    Lines are stripped; empty lines and lines starting with '#', ';' or ']'
    (comments / section markers) are ignored. The file object is closed.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM may appear either as the raw byte triplet (decoded
        # above from a bytes line) or, when the file was opened in text
        # mode with an explicit utf-8 encoding on Python 3, as the single
        # character '\ufeff'; the original code only stripped the former.
        BOMS = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2125
2126
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2129
2130
def update_url_query(url, query):
    """Return *url* with the key/value pairs of *query* merged into its query string."""
    if not query:
        return url
    parsed = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parsed.query)
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parsed._replace(query=new_query))
2139
2140
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Clone a urllib request, optionally overriding URL, data, headers or query.

    The HTTP method (GET/HEAD/PUT) and any ad-hoc timeout attribute of the
    original request are preserved on the clone.
    """
    # NB: the headers/query defaults were mutable dicts ({}); replaced with
    # None sentinels so no default object can be shared across calls.
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query or {})
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # urllib requests have no timeout attribute of their own; copy the
    # one youtube-dl attaches when present
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2159
2160
def _multipart_encode_impl(data, boundary):
    """Build a multipart/form-data body for dict *data* with *boundary*.

    Returns (body_bytes, content_type). Raises ValueError when the boundary
    occurs inside any encoded field, so callers can retry with another one.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary

    boundary_bytes = boundary.encode('ascii')
    out = b''
    for name, value in data.items():
        out += b'--' + boundary_bytes + b'\r\n'
        if isinstance(name, compat_str):
            name = name.encode('utf-8')
        if isinstance(value, compat_str):
            value = value.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        part = b'Content-Disposition: form-data; name="' + name + b'"\r\n\r\n' + value + b'\r\n'
        if boundary_bytes in part:
            raise ValueError('Boundary overlaps with data')
        out += part

    out += b'--' + boundary_bytes + b'--\r\n'

    return out, content_type
2181
2182
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            # The boundary collided with the payload. A randomly generated
            # boundary can simply be regenerated; a caller-supplied one must
            # not be replaced behind the caller's back, so re-raise.
            if has_specified_boundary:
                raise
            boundary = None
2211
2212
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up a key - or the first usable one of several - in dict *d*.

    A candidate value is skipped when it is missing, None, or (with
    skip_false_values) falsy; *default* is returned when nothing matches.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        value = d.get(key)
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
2221
2222
def try_get(src, getter, expected_type=None):
    """Apply getter callable(s) to *src*, returning the first usable result.

    Lookup failures (AttributeError/KeyError/TypeError/IndexError) are
    swallowed; when *expected_type* is given, results of other types are
    rejected too. Returns None if no getter succeeds.
    """
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for fn in getters:
        try:
            value = fn(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
2234
2235
def merge_dicts(*dicts):
    """Merge dicts left to right without clobbering meaningful values.

    None values are ignored entirely; an already-stored value is replaced
    only when it is an empty string and the incoming value is a non-empty
    string.
    """
    merged = {}
    for source in dicts:
        for key, value in source.items():
            if value is None:
                continue
            if key not in merged:
                merged[key] = value
                continue
            # Allow a non-empty string to replace a stored empty string
            if (isinstance(value, compat_str) and value
                    and isinstance(merged[key], compat_str)
                    and not merged[key]):
                merged[key] = value
    return merged
2248
2249
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Coerce *string* to compat_str, decoding byte strings with *encoding*."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2252
2253
# MPAA movie rating -> minimum viewer age (consumed by parse_age_limit)
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# US TV Parental Guidelines label -> minimum viewer age
# (consumed by parse_age_limit)
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2271
2272
def parse_age_limit(s):
    """Parse an age limit from an int or rating string.

    Accepts plain ints in [0, 21], strings like '18' or '18+', US MPAA
    ratings and US TV parental guideline labels. Returns None otherwise.
    """
    if type(s) == int:
        # type() rather than isinstance() - bools fall through to the
        # string checks below instead of being treated as 0/1
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    mobj = re.match(
        r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if mobj:
        return TV_PARENTAL_GUIDELINES['TV-' + mobj.group(1)]
    return None
2287
2288
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, keeping only the JSON payload."""
    callback_re = re.compile(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''')
    return callback_re.sub(r'\g<callback_data>', code)
2297
2298
def js_to_json(code):
    """Convert a JavaScript object/value literal into valid JSON text.

    Handles single-quoted strings, comments, trailing commas, unquoted
    identifier keys, and hex/octal integer literals (which JSON forbids).
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    # (pattern, numeric base) pairs for hex and octal integers,
    # optionally followed by a ':' when used as an object key
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        # Rewrite one matched token into its JSON equivalent
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v.startswith('//') or v == ',':
            # Comments and trailing commas are simply dropped
            return ""

        if v[0] in ("'", '"'):
            # Normalize the string body to double-quoted JSON escaping
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Integer object keys must be rendered as quoted strings
                return '"%d":' % i if v.endswith(':') else '%d' % i

        # Bare identifier (object key or stray word): quote it
        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
2338
2339
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def rank(qid):
        # Position in the preference list; unknown ids rank below everything
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return rank
2348
2349
# Default output filename template: "<title>-<id>.<ext>"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2351
2352
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    # The result, ellipses included, fits within *length* characters
    return s[:length - len(ELLIPSES)] + ELLIPSES
2361
2362
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
2365
2366
def is_outdated_version(version, limit, assume_new=True):
    """Return True when *version* compares older than *limit*.

    Missing or unparsable versions yield ``not assume_new``.
    """
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
    return outdated
2374
2375
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Only a zipped (single-file) or frozen build can self-update
    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return running_from_zip or hasattr(sys, 'frozen')
2381
2382
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = (compat_shlex_quote(a) for a in args)
    return ' '.join(quoted)
2386
2387
def error_to_compat_str(err):
    """Return the message of *err* as a text (unicode) string."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On Python 2, str(err) is a byte string and must be decoded with the
    # locale's preferred encoding rather than ascii
    return message.decode(preferredencoding())
2395
2396
def mimetype2ext(mt):
    """Map a MIME type string to a file extension; None input yields None."""
    if mt is None:
        return None

    # Full-type exceptions the generic subtype mapping below would get wrong
    FULL_TYPE_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    if mt in FULL_TYPE_MAP:
        return FULL_TYPE_MAP[mt]

    # Otherwise map on the normalized subtype (parameters stripped)
    subtype = mt.rpartition('/')[2].split(';')[0].strip().lower()

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
    }
    return SUBTYPE_MAP.get(subtype, subtype)
2432
2433
def parse_codecs(codecs_str):
    """Parse an RFC 6381 codecs string into {'vcodec': ..., 'acodec': ...}.

    Recognized video/audio codecs are assigned directly. When no codec is
    recognized, fall back on position ("video, audio" for a pair, audio-only
    for a single entry). Returns {} when nothing usable is found.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    splited_codecs = list(filter(None, map(
        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # Nothing was recognized; previously these branches returned the
        # (still-None) vcodec/acodec locals, silently dropping the codec
        # strings. Preserve them positionally instead.
        if len(splited_codecs) == 2:
            return {
                'vcodec': splited_codecs[0],
                'acodec': splited_codecs[1],
            }
        elif len(splited_codecs) == 1:
            return {
                'vcodec': 'none',
                'acodec': splited_codecs[0],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
2468
2469
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a response's headers.

    Prefers the Content-Disposition attachment filename; falls back to
    mapping the Content-Type.
    """
    headers = url_handle.headers

    cd = headers.get('Content-Disposition')
    if cd:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
2482
2483
def encode_data_uri(data, mime_type):
    """Pack *data* (bytes) into a base64 data: URI with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2486
2487
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # No viewer limit configured, or content available for everyone
        return False
    return age_limit < content_limit
2496
2497
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Order matters: 4-byte UTF-32 BOMs must be tested before their
    # 2-byte UTF-16 prefixes
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    decoded = None
    for bom, encoding in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(encoding, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', decoded)
2516
2517
def determine_protocol(info_dict):
    """Work out the download protocol for a format info dict."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    # Streaming URL prefixes map directly to their protocol name
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    # Manifest extensions imply their own protocols
    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2538
2539
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    # Left-justify every column but the last to its widest entry plus one space
    fmt = ' '.join('%-' + compat_str(width + 1) + 's' for width in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
2546
2547
def _match_one(filter_part, dct):
    """Evaluate one --match-filter clause against dict *dct*.

    Supports binary comparisons ('key <op> value', with an optional '?'
    after the operator to accept missing keys) and unary presence tests
    ('key' / '!key'). Raises ValueError for unparsable clauses.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        # String comparison: an explicitly quoted or bare-word value, or
        # a number-looking value compared against a string-typed field
        if (m.group('quotedstrval') is not None or
            m.group('strval') is not None or
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/rg3/youtube-dl/issues/11082).
            actual_value is not None and m.group('intval') is not None and
                isinstance(actual_value, compat_str)):
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
            quote = m.group('quote')
            if quote is not None:
                # Unescape quote characters escaped inside the quoted value
                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain integer: try it as a filesize expression,
                # first as given (e.g. '500KiB'), then with an implied 'B'
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        if actual_value is None:
            # Missing field: pass only when the '?' suffix was used
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2616
2617
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    clauses = filter_str.split('&')
    return all(_match_one(clause, dct) for clause in clauses)
2623
2624
def match_filter_func(filter_str):
    """Build a --match-filter callback: returns None to accept a video,
    or a human-readable skip message otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2633
2634
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression into seconds (float); None if unrecognized."""
    if not time_expr:
        return None

    # Plain offset, e.g. "12.345" or "12.345s"
    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    # Clock time "HH:MM:SS.fff" (a ':' before the fraction is tolerated)
    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        hours, minutes, seconds = mobj.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
2646
2647
def srt_subtitles_timecode(seconds):
    """Format a float second count as an SRT timecode 'HH:MM:SS,mmm'."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2650
2651
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    # Older TTML namespaces are rewritten to the current ones below so that
    # a single set of namespaced xpath lookups handles all variants
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # Only these TTML style properties are translated into SRT markup
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> dict of resolved style properties
    default_style = {}  # style inherited from the body/div elements

    class TTMLPElementParser(object):
        # Incremental parser target that renders one <p> element to text
        # with <font>/<b>/<i>/<u> markup
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect from the parent
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close whatever markup this element opened, innermost first
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Render a single paragraph node through the parser target above
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    # Normalize legacy namespaces before parsing
    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance; a style referencing a not-yet-resolved
    # parent forces another pass until everything is resolved
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style attached to body or div becomes the default for every paragraph
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Derive the end from the duration when no end is given
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2814
2815
def cli_option(params, command_option, param):
    """Render an option with a value: [command_option, value], or [] when unset."""
    value = params.get(param)
    if value:
        value = compat_str(value)
    return [command_option, value] if value is not None else []
2821
2822
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean parameter as CLI argument(s); [] when unset."""
    value = params.get(param)
    if value is None:
        return []
    assert isinstance(value, bool)
    rendered = true_value if value else false_value
    if separator:
        # Single "--option<sep>value" token
        return [command_option + separator + rendered]
    return [command_option, rendered]
2831
2832
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2836
2837
def cli_configuration_args(params, param, default=[]):
    """Return the configured extra-args list, or *default* when unset.

    NOTE(review): the shared mutable [] default (or the caller's default)
    is returned as-is, so callers must not mutate the result in place.
    """
    ex_args = params.get(param)
    if ex_args is not None:
        assert isinstance(ex_args, list)
        return ex_args
    return default
2844
2845
class ISO639Utils(object):
    """Conversions between ISO 639-1 (two-letter) and ISO 639-2/T
    (three-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the primary subtag is considered (e.g. 'en-US' -> 'en');
        # returns None for unknown codes
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup over the map; returns None (implicitly) when
        # the three-letter code is not present
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
3046
3047
class ISO3166Utils(object):
    """Lookup of full country names from ISO 3166-1 alpha-2 codes."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive lookup; returns None for unknown codes
        return cls._country_map.get(code.upper())
3306
3307
class GeoUtils(object):
    # Major IPv4 address blocks per country
    # (ISO 3166-1 alpha-2 country code -> a representative CIDR block)
    _country_ip_map = {
        'AD': '85.94.160.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '159.117.192.0/21',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '84.112.0.0/13',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AZ': '5.191.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '129.45.128.0/17',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '192.131.134.0/24',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '152.240.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '196.32.200.0/21',
        'CG': '197.214.128.0/17',
        'CH': '85.0.0.0/13',
        'CI': '154.232.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '165.210.0.0/15',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '46.198.0.0/15',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '45.208.0.0/14',
        'GI': '85.115.128.0/19',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '126.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.32.32.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '192.147.231.0/24',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '41.86.0.0/19',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '105.234.0.0/16',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '139.26.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '178.220.0.0/14',
        'RU': '5.136.0.0/13',
        'RW': '105.178.0.0/15',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '154.96.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '152.56.0.0/14',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '197.215.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '197.220.64.0/19',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '93.72.0.0/13',
        'UG': '154.224.0.0/13',
        'US': '3.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '82.215.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '24.92.144.0/20',
        'VE': '186.88.0.0/13',
        'VG': '172.103.64.0/18',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '165.56.0.0/13',
        'ZW': '41.85.192.0/19',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """
        Return a random IPv4 address (as a str) from the given block.

        code_or_block: either a two-letter country code (looked up in
            _country_ip_map; returns None for unknown codes) or a CIDR
            block such as '1.2.3.0/24'.

        NOTE(review): the highest address is derived by OR-ing host bits
        into the base address, so the block is assumed to be given with
        its host bits zeroed (i.e. a proper network address).
        """
        # A two-character argument is taken to be a country code,
        # anything else a literal CIDR block.
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        addr, preflen = block.split('/')
        # Lowest address of the block as a 32-bit big-endian integer...
        addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
        # ...and the highest: set every bit outside the prefix.
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return compat_str(socket.inet_ntoa(
            compat_struct_pack('!L', random.randint(addr_min, addr_max))))
3563
3564
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    # Proxy handler that honours a per-request 'Ytdl-request-proxy' header,
    # falling back to this handler's regular proxy map when it is absent.
    def __init__(self, proxies=None):
        # Set default handlers
        # Install http_open/https_open methods that route through proxy_open
        # with the '__noproxy__' sentinel.  The lambda's default arguments
        # bind the *current* values of type/meth at definition time
        # (avoiding the late-binding closure pitfall).
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A 'Ytdl-request-proxy' header on the request overrides the proxy
        # chosen by the handler; it is removed before the request goes out.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do the wrapping of the socket with SOCKS
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3588
3589
3590 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3591 # released into Public Domain
3592 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3593
3594 def long_to_bytes(n, blocksize=0):
3595 """long_to_bytes(n:long, blocksize:int) : string
3596 Convert a long integer to a byte string.
3597
3598 If optional blocksize is given and greater than zero, pad the front of the
3599 byte string with binary zeros so that the length is a multiple of
3600 blocksize.
3601 """
3602 # after much testing, this algorithm was deemed to be the fastest
3603 s = b''
3604 n = int(n)
3605 while n > 0:
3606 s = compat_struct_pack('>I', n & 0xffffffff) + s
3607 n = n >> 32
3608 # strip off leading zeros
3609 for i in range(len(s)):
3610 if s[i] != b'\000'[0]:
3611 break
3612 else:
3613 # only happens when n == 0
3614 s = b'\000'
3615 i = 0
3616 s = s[i:]
3617 # add back some pad bytes. this could be done more efficiently w.r.t. the
3618 # de-padding being done above, but sigh...
3619 if blocksize > 0 and len(s) % blocksize:
3620 s = (blocksize - len(s) % blocksize) * b'\000' + s
3621 return s
3622
3623
3624 def bytes_to_long(s):
3625 """bytes_to_long(string) : long
3626 Convert a byte string to a long integer.
3627
3628 This is (essentially) the inverse of long_to_bytes().
3629 """
3630 acc = 0
3631 length = len(s)
3632 if length % 4:
3633 extra = (4 - length % 4)
3634 s = b'\000' * extra + s
3635 length = length + extra
3636 for i in range(0, length, 4):
3637 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
3638 return acc
3639
3640
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The payload is interpreted little-endian: reverse the bytes before
    # reading their hex representation as a big-endian number.
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
3656
3657
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    @raises ValueError if data does not fit in length with 11 padding bytes
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # RFC 8017 (PKCS #1 v1.5, EME-PKCS1-v1_5): the padding string must
    # consist of *nonzero* pseudo-random octets, because the first zero
    # octet after the block-type byte marks the end of the padding.
    # random.randint(0, 254) could emit a zero and corrupt decryption.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
3671
3672
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer num in base n.

    table supplies the digit characters; when omitted, the first n
    characters of 0-9a-zA-Z are used.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
3689
3690
def decode_packed_codes(code):
    """Expand JavaScript obfuscated with the 'packed' encoder
    (PACKED_CODES_RE captures the payload, base, count and symbol list)."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Map each base-N token back to its original symbol; empty symbol
    # entries fall back to the token itself.
    symbol_table = {}
    for index in reversed(range(count)):
        token = encode_base_n(index, base)
        symbol_table[token] = symbols[index] or token

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
3707
3708
def parse_m3u8_attributes(attrib):
    """
    Parse an M3U8 attribute list (e.g. 'BANDWIDTH=1000,CODECS="a,b"')
    into a dict; quoted values have their quotes stripped.

    Use '*' (not '+') inside the alternatives so that empty values -
    AUDIO="" or KEY=, - are still captured instead of being dropped.
    """
    info = {}
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]*"|[^",]*)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]
        info[key] = val
    return info
3716
3717
def urshift(val, n):
    """Unsigned right shift of a 32-bit value (JavaScript's >>> operator):
    negative inputs are first mapped to their 32-bit two's-complement form."""
    if val < 0:
        val += 0x100000000
    return val >> n
3720
3721
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """
    Decode PNG image data into (width, height, pixels).

    pixels is a list of rows; each row is a flat list of 8-bit channel
    values.  A 3-bytes-per-pixel layout is assumed throughout (stride is
    width * 3); the IHDR bit depth / colour type fields are not inspected,
    so only non-interlaced 8-bit RGB images decode correctly.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    # Validate the 8-byte PNG signature and that the first chunk is IHDR.
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Big-endian unsigned integer of 1, 2 or 4 bytes, selected by length.
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Walk the chunk stream: 4-byte length, 4-byte type, data, 4-byte CRC.
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # IHDR is the first chunk (checked above); width/height are its
    # first two 32-bit fields.
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # The compressed image data may be split over several IDAT chunks;
    # concatenate them all before decompressing.
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Bytes per scanline (3 bytes per pixel assumed).
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        # Look up an already-reconstructed byte by its flat index.
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Each scanline is prefixed by a one-byte filter type (0-4).
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            # Filter neighbours; bytes outside the image are treated as 0,
            # per the PNG specification.
            left = 0
            up = 0

            if x > 2:  # a full pixel (3 bytes) to the left exists
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                # Predict from left (a), up (b) and up-left (c) bytes.
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
3827
3828
def write_xattr(path, key, value):
    """
    Set the extended attribute key (str) to value (bytes) on file path.

    Tries the best available backend in order: the 'pyxattr' or 'xattr'
    Python modules; NTFS Alternate Data Streams on Windows; finally the
    setfattr/xattr command-line tools.

    Raises XAttrMetadataError when setting the attribute fails and
    XAttrUnavailableError when no usable implementation can be found.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            # ':' is the ADS separator, so it must not appear in the key.
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The CLI tools take the value as a (UTF-8) string argument.
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
3911
3912
def random_birthday(year_field, month_field, day_field):
    """
    Return a dict mapping the given field names to the year, month and day
    (all as strings, not zero-padded) of a random calendar date between
    1950-01-01 and 1995-12-31, e.g. for filling in age-gate forms.

    Picking a random offset inside a real date range guarantees a *valid*
    date; independently choosing day in 1..31 could produce nonexistent
    dates such as February 30th.
    """
    start_date = datetime.date(1950, 1, 1)
    end_date = datetime.date(1995, 12, 31)
    offset = random.randint(0, (end_date - start_date).days)
    random_date = start_date + datetime.timedelta(offset)
    return {
        year_field: str(random_date.year),
        month_field: str(random_date.month),
        day_field: str(random_date.day),
    }