#!/usr/bin/env python
# coding: utf-8

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import contextlib
import ctypes
import datetime
import email.utils
import email.header
import errno
import functools
import gzip
import io
import itertools
import json
import locale
import math
import operator
import os
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import traceback
import xml.etree.ElementTree
import zlib

from .compat import (
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_basestring,
    compat_chr,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_quote,
    compat_socket_create_connection,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
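
# Illustrative usage (editor's sketch, not part of the original module;
# the namespace URL is made up):
#   xpath_with_ns('media:thumbnail', {'media': 'http://search.yahoo.com/mrss/'})
#   returns '{http://search.yahoo.com/mrss/}thumbnail'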


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    value = re.escape(value) if escape_value else value

    retlist = []
    for m in re.finditer(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html):
        res = m.group('content')

        if res.startswith('"') or res.startswith("'"):
            res = res[1:-1]

        retlist.append(unescapeHTML(res))

    return retlist
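
# Illustrative usage of the element lookup helpers above (editor's sketch;
# the sample HTML snippet is made up):
#   get_element_by_class('title', '<span class="title main">Foo</span>')
#   returns 'Foo'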


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
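
# Illustrative usage (editor's sketch): clean_html('<p>Foo<br/>Bar</p>')
# returns 'Foo\nBar' -- tags are stripped and <br/> becomes a newline.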


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
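
# Illustrative usage (editor's sketch):
#   sanitize_filename('Foo: Bar?')                  -> 'Foo - Bar'
#   sanitize_filename('Foo: Bar?', restricted=True) -> 'Foo_-_Bar'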


def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/rg3/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url


def sanitized_Request(url, *args, **kwargs):
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
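
# Illustrative usage (editor's sketch): named and numeric entities are decoded:
#   unescapeHTML('&amp;&#x41;&#97;') -> '&Aa'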


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


def formatSeconds(secs):
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs


def make_HTTPS_handler(params, **kwargs):
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message():
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    pass


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """
    def __init__(self, msg, countries=None):
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        self.msg = msg
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg


class MaxDownloadsReached(YoutubeDLError):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616 the default charset is iso-8859-1, which Python 3 respects
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on the next HTTP request in a row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


def extract_timezone(date_str):
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
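
# Illustrative usage (editor's sketch): the offset is split off the date string:
#   extract_timezone('2018-04-25T12:00:00+02:00')
#   returns (datetime.timedelta(hours=2), '2018-04-25T12:00:00')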


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass
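
# Illustrative usage (editor's sketch):
#   parse_iso8601('1970-01-02T00:00:00Z') -> 86400
# A timezone offset, if present, is subtracted before the conversion.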


def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
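
# Illustrative usage (editor's sketch):
#   unified_strdate('25/04/2018')                  -> '20180425'
#   unified_strdate('04/25/2018', day_first=False) -> '20180425'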


def unified_timestamp(date_str, day_first=True):
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600


def determine_ext(url, default_ext='unknown_video'):
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
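
# Illustrative usage (editor's sketch; the URLs are made up):
#   determine_ext('http://example.com/video.mp4')              -> 'mp4'
#   determine_ext('http://example.com/foo/bar.mp4/?download')  -> 'mp4'
#   determine_ext('http://example.com/download')               -> 'unknown_video'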


def subtitles_filename(filename, sub_lang, sub_format):
    return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format


def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
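
# Illustrative usage (editor's sketch):
#   date_from_str('20180425')  -> datetime.date(2018, 4, 25)
#   date_from_str('now-1week') -> today's date minus seven days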


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str


class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())


def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res


def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True


def write_string(s, out=None, encoding=None):
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()


def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)


# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)


class locked_file(object):
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)


def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'


def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)


def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
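
# Illustrative round trip (editor's sketch; the URL is made up):
#   smuggled = smuggle_url('http://example.com/video', {'referer': 'http://example.com'})
#   unsmuggle_url(smuggled)
#   returns ('http://example.com/video', {'referer': 'http://example.com'})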


def format_bytes(bytes):
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
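
# Illustrative usage (editor's sketch):
#   format_bytes(1024)    -> '1.00KiB'
#   format_bytes(1536000) -> '1.46MiB'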


def lookup_unit_table(unit_table, s):
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)


def parse_filesize(s):
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
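
# Illustrative usage (editor's sketch): binary and decimal units are distinguished:
#   parse_filesize('1.5 MiB') -> 1572864
#   parse_filesize('500 KB')  -> 500000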


def parse_count(s):
    if s is None:
        return None

    s = s.strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
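
# Illustrative usage (editor's sketch):
#   parse_count('1,480') -> 1480
#   parse_count('45.5k') -> 45500
#   parse_count('1.2M')  -> 1200000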


def parse_resolution(s):
    if s is None:
        return {}

    mobj = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
    if mobj:
        return {
            'width': int(mobj.group('w')),
            'height': int(mobj.group('h')),
        }

    mobj = re.search(r'\b(\d+)[pPiI]\b', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        return {'height': int(mobj.group(1)) * 540}

    return {}
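
# Illustrative usage (editor's sketch):
#   parse_resolution('1920x1080') -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')      -> {'height': 720}
#   parse_resolution('4K')        -> {'height': 2160}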


def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        return month_names.index(name) + 1
    except ValueError:
        return None


def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None
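
# Illustrative usage (editor's sketch):
#   month_by_name('May')            -> 5
#   month_by_name('décembre', 'fr') -> 12
#   month_by_abbreviation('Sep')    -> 9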


def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)


def setproctitle(title):
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this


def remove_start(s, start):
    return s[len(start):] if s is not None and s.startswith(start) else s


def remove_end(s, end):
    return s[:-len(end)] if s is not None and s.endswith(end) else s


def remove_quotes(s):
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s


def url_basename(url):
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').split('/')[-1]


def base_url(url):
    return re.match(r'https?://[^?#&]+/', url).group()


def urljoin(base, path):
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not isinstance(path, compat_str) or not path:
        return None
    if re.match(r'^(?:https?:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode('utf-8')
    if not isinstance(base, compat_str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
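
# Illustrative usage (editor's sketch; the URLs are made up):
#   urljoin('http://example.com/a/', 'b/c.mp4')  -> 'http://example.com/a/b/c.mp4'
#   urljoin('http://example.com/a/', '/b/c.mp4') -> 'http://example.com/b/c.mp4'
#   urljoin(None, 'b/c.mp4')                     -> None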
1813
1814
1815class HEADRequest(compat_urllib_request.Request):
1816 def get_method(self):
1817 return 'HEAD'
1818
1819
1820class PUTRequest(compat_urllib_request.Request):
1821 def get_method(self):
1822 return 'PUT'
1823
1824
1825def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1826 if get_attr:
1827 if v is not None:
1828 v = getattr(v, get_attr, None)
1829 if v == '':
1830 v = None
1831 if v is None:
1832 return default
1833 try:
1834 return int(v) * invscale // scale
1835 except ValueError:
1836 return default
1837
1838
1839def str_or_none(v, default=None):
1840 return default if v is None else compat_str(v)
1841
1842
1843def str_to_int(int_str):
1844 """ A more relaxed version of int_or_none """
1845 if int_str is None:
1846 return None
1847 int_str = re.sub(r'[,\.\+]', '', int_str)
1848 return int(int_str)
1849
1850
1851def float_or_none(v, scale=1, invscale=1, default=None):
1852 if v is None:
1853 return default
1854 try:
1855 return float(v) * invscale / scale
1856 except ValueError:
1857 return default
1858
1859
1860def bool_or_none(v, default=None):
1861 return v if isinstance(v, bool) else default
1862
1863
1864def strip_or_none(v):
1865 return None if v is None else v.strip()
1866
1867
1868def parse_duration(s):
1869 if not isinstance(s, compat_basestring):
1870 return None
1871
1872 s = s.strip()
1873
1874 days, hours, mins, secs, ms = [None] * 5
1875 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
1876 if m:
1877 days, hours, mins, secs, ms = m.groups()
1878 else:
1879 m = re.match(
1880 r'''(?ix)(?:P?
1881 (?:
1882 [0-9]+\s*y(?:ears?)?\s*
1883 )?
1884 (?:
1885 [0-9]+\s*m(?:onths?)?\s*
1886 )?
1887 (?:
1888 [0-9]+\s*w(?:eeks?)?\s*
1889 )?
1890 (?:
1891 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
1892 )?
1893 T)?
1894 (?:
1895 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1896 )?
1897 (?:
1898 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1899 )?
1900 (?:
1901 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1902 )?Z?$''', s)
1903 if m:
1904 days, hours, mins, secs, ms = m.groups()
1905 else:
1906 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
1907 if m:
1908 hours, mins = m.groups()
1909 else:
1910 return None
1911
1912 duration = 0
1913 if secs:
1914 duration += float(secs)
1915 if mins:
1916 duration += float(mins) * 60
1917 if hours:
1918 duration += float(hours) * 60 * 60
1919 if days:
1920 duration += float(days) * 24 * 60 * 60
1921 if ms:
1922 duration += float(ms)
1923 return duration
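# Accepted shapes, as an illustrative sketch (result in seconds):
#   parse_duration('1:02:03.5')     ->  3723.5   # [[[DD:]HH:]MM:]SS[.ms]
#   parse_duration('3 min 10 sec')  ->  190.0    # free-form unit suffixes
#   parse_duration('PT1H30M')       ->  5400.0   # ISO 8601 durations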
1924
1925
1926def prepend_extension(filename, ext, expected_real_ext=None):
1927 name, real_ext = os.path.splitext(filename)
1928 return (
1929 '{0}.{1}{2}'.format(name, ext, real_ext)
1930 if not expected_real_ext or real_ext[1:] == expected_real_ext
1931 else '{0}.{1}'.format(filename, ext))
1932
1933
1934def replace_extension(filename, ext, expected_real_ext=None):
1935 name, real_ext = os.path.splitext(filename)
1936 return '{0}.{1}'.format(
1937 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1938 ext)
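# Sketch: prepend_extension('video.mp4', 'temp') -> 'video.temp.mp4' and
# replace_extension('video.mp4', 'mkv') -> 'video.mkv'; when expected_real_ext
# is given and does not match, the new extension is simply appended instead.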
1939
1940
1941def check_executable(exe, args=[]):
1942 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1943 args can be a list of arguments for a short output (like -version) """
1944 try:
1945 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1946 except OSError:
1947 return False
1948 return exe
1949
1950
1951def get_exe_version(exe, args=['--version'],
1952 version_re=None, unrecognized='present'):
1953 """ Returns the version of the specified executable,
1954 or False if the executable is not present """
1955 try:
1956 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1957 # SIGTTOU if youtube-dl is run in the background.
1958 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
1959 out, _ = subprocess.Popen(
1960 [encodeArgument(exe)] + args,
1961 stdin=subprocess.PIPE,
1962 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1963 except OSError:
1964 return False
1965 if isinstance(out, bytes): # Python 2.x
1966 out = out.decode('ascii', 'ignore')
1967 return detect_exe_version(out, version_re, unrecognized)
1968
1969
1970def detect_exe_version(output, version_re=None, unrecognized='present'):
1971 assert isinstance(output, compat_str)
1972 if version_re is None:
1973 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1974 m = re.search(version_re, output)
1975 if m:
1976 return m.group(1)
1977 else:
1978 return unrecognized
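# Sketch: detect_exe_version('ffmpeg version 3.4.2 Copyright ...') -> '3.4.2';
# output with no recognizable version string yields 'present'.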
1979
1980
1981class PagedList(object):
1982 def __len__(self):
1983 # This is only useful for tests
1984 return len(self.getslice())
1985
1986
1987class OnDemandPagedList(PagedList):
1988 def __init__(self, pagefunc, pagesize, use_cache=True):
1989 self._pagefunc = pagefunc
1990 self._pagesize = pagesize
1991 self._use_cache = use_cache
1992 if use_cache:
1993 self._cache = {}
1994
1995 def getslice(self, start=0, end=None):
1996 res = []
1997 for pagenum in itertools.count(start // self._pagesize):
1998 firstid = pagenum * self._pagesize
1999 nextfirstid = pagenum * self._pagesize + self._pagesize
2000 if start >= nextfirstid:
2001 continue
2002
2003 page_results = None
2004 if self._use_cache:
2005 page_results = self._cache.get(pagenum)
2006 if page_results is None:
2007 page_results = list(self._pagefunc(pagenum))
2008 if self._use_cache:
2009 self._cache[pagenum] = page_results
2010
2011 startv = (
2012 start % self._pagesize
2013 if firstid <= start < nextfirstid
2014 else 0)
2015
2016 endv = (
2017 ((end - 1) % self._pagesize) + 1
2018 if (end is not None and firstid <= end <= nextfirstid)
2019 else None)
2020
2021 if startv != 0 or endv is not None:
2022 page_results = page_results[startv:endv]
2023 res.extend(page_results)
2024
            # A little optimization: if the current page is not "full", i.e. does
            # not contain page_size videos, we can assume it is the last one, so
            # there are no more ids on further pages and no need to query again.
2029 if len(page_results) + startv < self._pagesize:
2030 break
2031
2032 # If we got the whole page, but the next page is not interesting,
2033 # break out early as well
2034 if end == nextfirstid:
2035 break
2036 return res
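# Usage sketch (with a hypothetical page function): only the pages that
# overlap the requested slice are ever fetched, e.g. with ten items per page
#   pl = OnDemandPagedList(lambda n: ['item%d' % (n * 10 + i) for i in range(10)], 10)
#   pl.getslice(25, 28)  ->  ['item25', 'item26', 'item27']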
2037
2038
2039class InAdvancePagedList(PagedList):
2040 def __init__(self, pagefunc, pagecount, pagesize):
2041 self._pagefunc = pagefunc
2042 self._pagecount = pagecount
2043 self._pagesize = pagesize
2044
2045 def getslice(self, start=0, end=None):
2046 res = []
2047 start_page = start // self._pagesize
2048 end_page = (
2049 self._pagecount if end is None else (end // self._pagesize + 1))
2050 skip_elems = start - start_page * self._pagesize
2051 only_more = None if end is None else end - start
2052 for pagenum in range(start_page, end_page):
2053 page = list(self._pagefunc(pagenum))
2054 if skip_elems:
2055 page = page[skip_elems:]
2056 skip_elems = None
2057 if only_more is not None:
2058 if len(page) < only_more:
2059 only_more -= len(page)
2060 else:
2061 page = page[:only_more]
2062 res.extend(page)
2063 break
2064 res.extend(page)
2065 return res
2066
2067
2068def uppercase_escape(s):
2069 unicode_escape = codecs.getdecoder('unicode_escape')
2070 return re.sub(
2071 r'\\U[0-9a-fA-F]{8}',
2072 lambda m: unicode_escape(m.group(0))[0],
2073 s)
2074
2075
2076def lowercase_escape(s):
2077 unicode_escape = codecs.getdecoder('unicode_escape')
2078 return re.sub(
2079 r'\\u[0-9a-fA-F]{4}',
2080 lambda m: unicode_escape(m.group(0))[0],
2081 s)
2082
2083
2084def escape_rfc3986(s):
2085 """Escape non-ASCII characters as suggested by RFC 3986"""
2086 if sys.version_info < (3, 0) and isinstance(s, compat_str):
2087 s = s.encode('utf-8')
2088 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2089
2090
2091def escape_url(url):
2092 """Escape URL as suggested by RFC 3986"""
2093 url_parsed = compat_urllib_parse_urlparse(url)
2094 return url_parsed._replace(
2095 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2096 path=escape_rfc3986(url_parsed.path),
2097 params=escape_rfc3986(url_parsed.params),
2098 query=escape_rfc3986(url_parsed.query),
2099 fragment=escape_rfc3986(url_parsed.fragment)
2100 ).geturl()
2101
2102
2103def read_batch_urls(batch_fd):
2104 def fixup(url):
2105 if not isinstance(url, compat_str):
2106 url = url.decode('utf-8', 'replace')
2107 BOM_UTF8 = '\xef\xbb\xbf'
2108 if url.startswith(BOM_UTF8):
2109 url = url[len(BOM_UTF8):]
2110 url = url.strip()
2111 if url.startswith(('#', ';', ']')):
2112 return False
2113 return url
2114
2115 with contextlib.closing(batch_fd) as fd:
2116 return [url for url in map(fixup, fd) if url]
2117
2118
2119def urlencode_postdata(*args, **kargs):
2120 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2121
2122
2123def update_url_query(url, query):
2124 if not query:
2125 return url
2126 parsed_url = compat_urlparse.urlparse(url)
2127 qs = compat_parse_qs(parsed_url.query)
2128 qs.update(query)
2129 return compat_urlparse.urlunparse(parsed_url._replace(
2130 query=compat_urllib_parse_urlencode(qs, True)))
2131
2132
2133def update_Request(req, url=None, data=None, headers={}, query={}):
2134 req_headers = req.headers.copy()
2135 req_headers.update(headers)
2136 req_data = data or req.data
2137 req_url = update_url_query(url or req.get_full_url(), query)
2138 req_get_method = req.get_method()
2139 if req_get_method == 'HEAD':
2140 req_type = HEADRequest
2141 elif req_get_method == 'PUT':
2142 req_type = PUTRequest
2143 else:
2144 req_type = compat_urllib_request.Request
2145 new_req = req_type(
2146 req_url, data=req_data, headers=req_headers,
2147 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2148 if hasattr(req, 'timeout'):
2149 new_req.timeout = req.timeout
2150 return new_req
2151
2152
2153def _multipart_encode_impl(data, boundary):
2154 content_type = 'multipart/form-data; boundary=%s' % boundary
2155
2156 out = b''
2157 for k, v in data.items():
2158 out += b'--' + boundary.encode('ascii') + b'\r\n'
2159 if isinstance(k, compat_str):
2160 k = k.encode('utf-8')
2161 if isinstance(v, compat_str):
2162 v = v.encode('utf-8')
2163 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2164 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2165 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2166 if boundary.encode('ascii') in content:
2167 raise ValueError('Boundary overlaps with data')
2168 out += content
2169
2170 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2171
2172 return out, content_type
2173
2174
2175def multipart_encode(data, boundary=None):
2176 '''
2177 Encode a dict to RFC 7578-compliant form-data
2178
2179 data:
2180 A dict where keys and values can be either Unicode or bytes-like
2181 objects.
    boundary:
        If specified, a Unicode object to be used as the boundary. Otherwise
        a random boundary is generated.
2185
2186 Reference: https://tools.ietf.org/html/rfc7578
2187 '''
2188 has_specified_boundary = boundary is not None
2189
2190 while True:
2191 if boundary is None:
2192 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2193
2194 try:
2195 out, content_type = _multipart_encode_impl(data, boundary)
2196 break
2197 except ValueError:
2198 if has_specified_boundary:
2199 raise
2200 boundary = None
2201
2202 return out, content_type
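# Sketch, assuming a fixed boundary for readability:
#   out, ctype = multipart_encode({'field': 'value'}, boundary='xxx')
# gives ctype == 'multipart/form-data; boundary=xxx' and out containing one
# 'Content-Disposition: form-data; name="field"' part followed by the closing
# '--xxx--' delimiter.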
2203
2204
2205def dict_get(d, key_or_keys, default=None, skip_false_values=True):
2206 if isinstance(key_or_keys, (list, tuple)):
2207 for key in key_or_keys:
2208 if key not in d or d[key] is None or skip_false_values and not d[key]:
2209 continue
2210 return d[key]
2211 return default
2212 return d.get(key_or_keys, default)
2213
2214
2215def try_get(src, getter, expected_type=None):
2216 if not isinstance(getter, (list, tuple)):
2217 getter = [getter]
2218 for get in getter:
2219 try:
2220 v = get(src)
2221 except (AttributeError, KeyError, TypeError, IndexError):
2222 pass
2223 else:
2224 if expected_type is None or isinstance(v, expected_type):
2225 return v
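# Sketch: both helpers make optional metadata extraction fail-safe, e.g.
#   dict_get({'a': '', 'b': 5}, ('a', 'b'))         ->  5     ('' is skipped)
#   try_get({'a': [1]}, lambda x: x['a'][0], int)   ->  1
#   try_get({'a': [1]}, lambda x: x['b'][0], int)   ->  None  (KeyError swallowed)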
2226
2227
2228def merge_dicts(*dicts):
2229 merged = {}
2230 for a_dict in dicts:
2231 for k, v in a_dict.items():
2232 if v is None:
2233 continue
2234 if (k not in merged or
2235 (isinstance(v, compat_str) and v and
2236 isinstance(merged[k], compat_str) and
2237 not merged[k])):
2238 merged[k] = v
2239 return merged
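# Sketch: earlier dicts win, except that empty strings are filled in from
# later ones and None values are ignored, e.g.
#   merge_dicts({'title': ''}, {'title': 'Demo', 'id': '42'})
#       ->  {'title': 'Demo', 'id': '42'}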
2240
2241
2242def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2243 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2244
2245
2246US_RATINGS = {
2247 'G': 0,
2248 'PG': 10,
2249 'PG-13': 13,
2250 'R': 16,
2251 'NC': 18,
2252}
2253
2254
2255TV_PARENTAL_GUIDELINES = {
2256 'TV-Y': 0,
2257 'TV-Y7': 7,
2258 'TV-G': 0,
2259 'TV-PG': 0,
2260 'TV-14': 14,
2261 'TV-MA': 17,
2262}
2263
2264
2265def parse_age_limit(s):
2266 if type(s) == int:
2267 return s if 0 <= s <= 21 else None
2268 if not isinstance(s, compat_basestring):
2269 return None
2270 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2271 if m:
2272 return int(m.group('age'))
2273 if s in US_RATINGS:
2274 return US_RATINGS[s]
2275 return TV_PARENTAL_GUIDELINES.get(s)
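# e.g. parse_age_limit(18) -> 18, parse_age_limit('16+') -> 16,
# parse_age_limit('TV-MA') -> 17, and unknown ratings yield None (sketch).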
2276
2277
2278def strip_jsonp(code):
2279 return re.sub(
2280 r'''(?sx)^
2281 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
2282 (?:\s*&&\s*(?P=func_name))?
2283 \s*\(\s*(?P<callback_data>.*)\);?
2284 \s*?(?://[^\n]*)*$''',
2285 r'\g<callback_data>', code)
2286
2287
2288def js_to_json(code):
2289 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
2290 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2291 INTEGER_TABLE = (
2292 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2293 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2294 )
2295
2296 def fix_kv(m):
2297 v = m.group(0)
2298 if v in ('true', 'false', 'null'):
2299 return v
2300 elif v.startswith('/*') or v.startswith('//') or v == ',':
2301 return ""
2302
2303 if v[0] in ("'", '"'):
2304 v = re.sub(r'(?s)\\.|"', lambda m: {
2305 '"': '\\"',
2306 "\\'": "'",
2307 '\\\n': '',
2308 '\\x': '\\u00',
2309 }.get(m.group(0), m.group(0)), v[1:-1])
2310
2311 for regex, base in INTEGER_TABLE:
2312 im = re.match(regex, v)
2313 if im:
2314 i = int(im.group(1), base)
2315 return '"%d":' % i if v.endswith(':') else '%d' % i
2316
2317 return '"%s"' % v
2318
2319 return re.sub(r'''(?sx)
2320 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2321 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2322 {comment}|,(?={skip}[\]}}])|
2323 (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
2324 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
2325 [0-9]+(?={skip}:)
2326 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
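# Sketch of the JavaScript-flavoured input this tolerates (single quotes,
# unquoted keys, hex literals, trailing commas):
#   js_to_json("{foo: 'bar', baz: 0x10, }")  ->  '{"foo": "bar", "baz": 16 }'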
2327
2328
2329def qualities(quality_ids):
2330 """ Get a numeric quality value out of a list of possible values """
2331 def q(qid):
2332 try:
2333 return quality_ids.index(qid)
2334 except ValueError:
2335 return -1
2336 return q
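# Sketch: q = qualities(['240p', '360p', '720p']); q('720p') -> 2 and
# q('4K') -> -1, so formats can be ranked by a site-specific ordering.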
2337
2338
2339DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2340
2341
2342def limit_length(s, length):
2343 """ Add ellipses to overly long strings """
2344 if s is None:
2345 return None
2346 ELLIPSES = '...'
2347 if len(s) > length:
2348 return s[:length - len(ELLIPSES)] + ELLIPSES
2349 return s
2350
2351
2352def version_tuple(v):
2353 return tuple(int(e) for e in re.split(r'[-.]', v))
2354
2355
2356def is_outdated_version(version, limit, assume_new=True):
2357 if not version:
2358 return not assume_new
2359 try:
2360 return version_tuple(version) < version_tuple(limit)
2361 except ValueError:
2362 return not assume_new
2363
2364
2365def ytdl_is_updateable():
2366 """ Returns if youtube-dl can be updated with -U """
2367 from zipimport import zipimporter
2368
2369 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
2370
2371
2372def args_to_str(args):
2373 # Get a short string representation for a subprocess command
2374 return ' '.join(compat_shlex_quote(a) for a in args)
2375
2376
2377def error_to_compat_str(err):
2378 err_str = str(err)
2379 # On python 2 error byte string must be decoded with proper
2380 # encoding rather than ascii
2381 if sys.version_info[0] < 3:
2382 err_str = err_str.decode(preferredencoding())
2383 return err_str
2384
2385
2386def mimetype2ext(mt):
2387 if mt is None:
2388 return None
2389
2390 ext = {
2391 'audio/mp4': 'm4a',
2392 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2393 # it's the most popular one
2394 'audio/mpeg': 'mp3',
2395 }.get(mt)
2396 if ext is not None:
2397 return ext
2398
2399 _, _, res = mt.rpartition('/')
2400 res = res.split(';')[0].strip().lower()
2401
2402 return {
2403 '3gpp': '3gp',
2404 'smptett+xml': 'tt',
2405 'ttaf+xml': 'dfxp',
2406 'ttml+xml': 'ttml',
2407 'x-flv': 'flv',
2408 'x-mp4-fragmented': 'mp4',
2409 'x-ms-sami': 'sami',
2410 'x-ms-wmv': 'wmv',
2411 'mpegurl': 'm3u8',
2412 'x-mpegurl': 'm3u8',
2413 'vnd.apple.mpegurl': 'm3u8',
2414 'dash+xml': 'mpd',
2415 'f4m+xml': 'f4m',
2416 'hds+xml': 'f4m',
2417 'vnd.ms-sstr+xml': 'ism',
2418 'quicktime': 'mov',
2419 'mp2t': 'ts',
2420 }.get(res, res)
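# Sketch: mimetype2ext('application/x-mpegURL') -> 'm3u8' and
# mimetype2ext('video/MP2T; charset=UTF-8') -> 'ts'; subtype case and MIME
# parameters are ignored, unknown subtypes fall back to the bare subtype.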
2421
2422
2423def parse_codecs(codecs_str):
2424 # http://tools.ietf.org/html/rfc6381
2425 if not codecs_str:
2426 return {}
    split_codecs = list(filter(None, map(
        lambda s: s.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in split_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        if len(split_codecs) == 2:
            # Neither codec was recognized: assume the common "video, audio" order
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
        elif len(split_codecs) == 1:
            # A single unrecognized codec: assume an audio-only stream
            return {
                'vcodec': 'none',
                'acodec': split_codecs[0],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
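# Sketch: parse_codecs('avc1.64001f, mp4a.40.2')
#   ->  {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2'}
# while a lone audio codec gives {'vcodec': 'none', 'acodec': 'mp4a.40.2'}.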
2457
2458
2459def urlhandle_detect_ext(url_handle):
2460 getheader = url_handle.headers.get
2461
2462 cd = getheader('Content-Disposition')
2463 if cd:
2464 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2465 if m:
2466 e = determine_ext(m.group('filename'), default_ext=None)
2467 if e:
2468 return e
2469
2470 return mimetype2ext(getheader('Content-Type'))
2471
2472
2473def encode_data_uri(data, mime_type):
2474 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
2475
2476
2477def age_restricted(content_limit, age_limit):
2478 """ Returns True iff the content should be blocked """
2479
2480 if age_limit is None: # No limit set
2481 return False
2482 if content_limit is None:
2483 return False # Content available for everyone
2484 return age_limit < content_limit
2485
2486
2487def is_html(first_bytes):
2488 """ Detect whether a file contains HTML by examining its first bytes. """
2489
2490 BOMS = [
2491 (b'\xef\xbb\xbf', 'utf-8'),
2492 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2493 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2494 (b'\xff\xfe', 'utf-16-le'),
2495 (b'\xfe\xff', 'utf-16-be'),
2496 ]
2497 for bom, enc in BOMS:
2498 if first_bytes.startswith(bom):
2499 s = first_bytes[len(bom):].decode(enc, 'replace')
2500 break
2501 else:
2502 s = first_bytes.decode('utf-8', 'replace')
2503
2504 return re.match(r'^\s*<', s)
2505
2506
2507def determine_protocol(info_dict):
2508 protocol = info_dict.get('protocol')
2509 if protocol is not None:
2510 return protocol
2511
2512 url = info_dict['url']
2513 if url.startswith('rtmp'):
2514 return 'rtmp'
2515 elif url.startswith('mms'):
2516 return 'mms'
2517 elif url.startswith('rtsp'):
2518 return 'rtsp'
2519
2520 ext = determine_ext(url)
2521 if ext == 'm3u8':
2522 return 'm3u8'
2523 elif ext == 'f4m':
2524 return 'f4m'
2525
2526 return compat_urllib_parse_urlparse(url).scheme
2527
2528
2529def render_table(header_row, data):
2530 """ Render a list of rows, each as a list of values """
2531 table = [header_row] + data
2532 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2533 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2534 return '\n'.join(format_str % tuple(row) for row in table)
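# Sketch: render_table(['id', 'ext'], [['22', 'mp4'], ['251', 'webm']]) yields
# left-aligned columns, one row per line:
#   id  ext
#   22  mp4
#   251 webm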
2535
2536
2537def _match_one(filter_part, dct):
2538 COMPARISON_OPERATORS = {
2539 '<': operator.lt,
2540 '<=': operator.le,
2541 '>': operator.gt,
2542 '>=': operator.ge,
2543 '=': operator.eq,
2544 '!=': operator.ne,
2545 }
2546 operator_rex = re.compile(r'''(?x)\s*
2547 (?P<key>[a-z_]+)
2548 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2549 (?:
2550 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2551 (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
2552 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2553 )
2554 \s*$
2555 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2556 m = operator_rex.search(filter_part)
2557 if m:
2558 op = COMPARISON_OPERATORS[m.group('op')]
2559 actual_value = dct.get(m.group('key'))
2560 if (m.group('quotedstrval') is not None or
2561 m.group('strval') is not None or
2562 # If the original field is a string and matching comparisonvalue is
2563 # a number we should respect the origin of the original field
2564 # and process comparison value as a string (see
2565 # https://github.com/rg3/youtube-dl/issues/11082).
2566 actual_value is not None and m.group('intval') is not None and
2567 isinstance(actual_value, compat_str)):
2568 if m.group('op') not in ('=', '!='):
2569 raise ValueError(
2570 'Operator %s does not support string values!' % m.group('op'))
2571 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
2572 quote = m.group('quote')
2573 if quote is not None:
2574 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
2575 else:
2576 try:
2577 comparison_value = int(m.group('intval'))
2578 except ValueError:
2579 comparison_value = parse_filesize(m.group('intval'))
2580 if comparison_value is None:
2581 comparison_value = parse_filesize(m.group('intval') + 'B')
2582 if comparison_value is None:
2583 raise ValueError(
2584 'Invalid integer value %r in filter part %r' % (
2585 m.group('intval'), filter_part))
2586 if actual_value is None:
2587 return m.group('none_inclusive')
2588 return op(actual_value, comparison_value)
2589
2590 UNARY_OPERATORS = {
2591 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
2592 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
2593 }
2594 operator_rex = re.compile(r'''(?x)\s*
2595 (?P<op>%s)\s*(?P<key>[a-z_]+)
2596 \s*$
2597 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2598 m = operator_rex.search(filter_part)
2599 if m:
2600 op = UNARY_OPERATORS[m.group('op')]
2601 actual_value = dct.get(m.group('key'))
2602 return op(actual_value)
2603
2604 raise ValueError('Invalid filter part %r' % filter_part)
2605
2606
2607def match_str(filter_str, dct):
2608 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2609
2610 return all(
2611 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
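# Filter syntax sketch: clauses joined with '&'; comparisons support numbers,
# file sizes and strings, while bare 'key' / '!key' test presence/absence:
#   match_str('duration > 60 & uploader', {'duration': 90, 'uploader': 'x'})  ->  True
#   match_str('!is_live', {'is_live': True})                                  ->  False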
2612
2613
2614def match_filter_func(filter_str):
2615 def _match_func(info_dict):
2616 if match_str(filter_str, info_dict):
2617 return None
2618 else:
2619 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2620 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2621 return _match_func
2622
2623
2624def parse_dfxp_time_expr(time_expr):
2625 if not time_expr:
2626 return
2627
2628 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2629 if mobj:
2630 return float(mobj.group('time_offset'))
2631
2632 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
2633 if mobj:
2634 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
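# e.g. parse_dfxp_time_expr('5.2s') -> 5.2 and
# parse_dfxp_time_expr('00:01:02.5') -> 62.5 (illustrative; unparseable
# expressions yield None).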
2635
2636
2637def srt_subtitles_timecode(seconds):
2638 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
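# e.g. srt_subtitles_timecode(3723.5) -> '01:02:03,500' (sketch).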
2639
2640
2641def dfxp2srt(dfxp_data):
2642 '''
2643 @param dfxp_data A bytes-like object containing DFXP data
2644 @returns A unicode object containing converted SRT data
2645 '''
2646 LEGACY_NAMESPACES = (
2647 (b'http://www.w3.org/ns/ttml', [
2648 b'http://www.w3.org/2004/11/ttaf1',
2649 b'http://www.w3.org/2006/04/ttaf1',
2650 b'http://www.w3.org/2006/10/ttaf1',
2651 ]),
2652 (b'http://www.w3.org/ns/ttml#styling', [
2653 b'http://www.w3.org/ns/ttml#style',
2654 ]),
2655 )
2656
2657 SUPPORTED_STYLING = [
2658 'color',
2659 'fontFamily',
2660 'fontSize',
2661 'fontStyle',
2662 'fontWeight',
2663 'textDecoration'
2664 ]
2665
2666 _x = functools.partial(xpath_with_ns, ns_map={
2667 'ttml': 'http://www.w3.org/ns/ttml',
2668 'tts': 'http://www.w3.org/ns/ttml#styling',
2669 })
2670
2671 styles = {}
2672 default_style = {}
2673
2674 class TTMLPElementParser(object):
2675 _out = ''
2676 _unclosed_elements = []
2677 _applied_styles = []
2678
2679 def start(self, tag, attrib):
2680 if tag in (_x('ttml:br'), 'br'):
2681 self._out += '\n'
2682 else:
2683 unclosed_elements = []
2684 style = {}
2685 element_style_id = attrib.get('style')
2686 if default_style:
2687 style.update(default_style)
2688 if element_style_id:
2689 style.update(styles.get(element_style_id, {}))
2690 for prop in SUPPORTED_STYLING:
2691 prop_val = attrib.get(_x('tts:' + prop))
2692 if prop_val:
2693 style[prop] = prop_val
2694 if style:
2695 font = ''
2696 for k, v in sorted(style.items()):
2697 if self._applied_styles and self._applied_styles[-1].get(k) == v:
2698 continue
2699 if k == 'color':
2700 font += ' color="%s"' % v
2701 elif k == 'fontSize':
2702 font += ' size="%s"' % v
2703 elif k == 'fontFamily':
2704 font += ' face="%s"' % v
2705 elif k == 'fontWeight' and v == 'bold':
2706 self._out += '<b>'
2707 unclosed_elements.append('b')
2708 elif k == 'fontStyle' and v == 'italic':
2709 self._out += '<i>'
2710 unclosed_elements.append('i')
2711 elif k == 'textDecoration' and v == 'underline':
2712 self._out += '<u>'
2713 unclosed_elements.append('u')
2714 if font:
2715 self._out += '<font' + font + '>'
2716 unclosed_elements.append('font')
2717 applied_style = {}
2718 if self._applied_styles:
2719 applied_style.update(self._applied_styles[-1])
2720 applied_style.update(style)
2721 self._applied_styles.append(applied_style)
2722 self._unclosed_elements.append(unclosed_elements)
2723
2724 def end(self, tag):
2725 if tag not in (_x('ttml:br'), 'br'):
2726 unclosed_elements = self._unclosed_elements.pop()
2727 for element in reversed(unclosed_elements):
2728 self._out += '</%s>' % element
2729 if unclosed_elements and self._applied_styles:
2730 self._applied_styles.pop()
2731
2732 def data(self, data):
2733 self._out += data
2734
2735 def close(self):
2736 return self._out.strip()
2737
2738 def parse_node(node):
2739 target = TTMLPElementParser()
2740 parser = xml.etree.ElementTree.XMLParser(target=target)
2741 parser.feed(xml.etree.ElementTree.tostring(node))
2742 return parser.close()
2743
2744 for k, v in LEGACY_NAMESPACES:
2745 for ns in v:
2746 dfxp_data = dfxp_data.replace(ns, k)
2747
2748 dfxp = compat_etree_fromstring(dfxp_data)
2749 out = []
2750 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
2751
2752 if not paras:
2753 raise ValueError('Invalid dfxp/TTML subtitle')
2754
2755 repeat = False
2756 while True:
2757 for style in dfxp.findall(_x('.//ttml:style')):
2758 style_id = style.get('id')
2759 parent_style_id = style.get('style')
2760 if parent_style_id:
2761 if parent_style_id not in styles:
2762 repeat = True
2763 continue
2764 styles[style_id] = styles[parent_style_id].copy()
2765 for prop in SUPPORTED_STYLING:
2766 prop_val = style.get(_x('tts:' + prop))
2767 if prop_val:
2768 styles.setdefault(style_id, {})[prop] = prop_val
2769 if repeat:
2770 repeat = False
2771 else:
2772 break
2773
2774 for p in ('body', 'div'):
2775 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
2776 if ele is None:
2777 continue
2778 style = styles.get(ele.get('style'))
2779 if not style:
2780 continue
2781 default_style.update(style)
2782
2783 for para, index in zip(paras, itertools.count(1)):
2784 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
2785 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2786 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2787 if begin_time is None:
2788 continue
2789 if not end_time:
2790 if not dur:
2791 continue
2792 end_time = begin_time + dur
2793 out.append('%d\n%s --> %s\n%s\n\n' % (
2794 index,
2795 srt_subtitles_timecode(begin_time),
2796 srt_subtitles_timecode(end_time),
2797 parse_node(para)))
2798
2799 return ''.join(out)
2800
2801
2802def cli_option(params, command_option, param):
2803 param = params.get(param)
2804 if param:
2805 param = compat_str(param)
2806 return [command_option, param] if param is not None else []
2807
2808
2809def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2810 param = params.get(param)
2811 if param is None:
2812 return []
2813 assert isinstance(param, bool)
2814 if separator:
2815 return [command_option + separator + (true_value if param else false_value)]
2816 return [command_option, true_value if param else false_value]
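# Sketch (hypothetical option names):
#   cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate',
#                   'nocheckcertificate')  ->  ['--no-check-certificate', 'true']
#   cli_bool_option({'nocheckcertificate': False}, '--check-certificate',
#                   'nocheckcertificate', separator='=')  ->  ['--check-certificate=false']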
2817
2818
2819def cli_valueless_option(params, command_option, param, expected_value=True):
2820 param = params.get(param)
2821 return [command_option] if param == expected_value else []
2822
2823
2824def cli_configuration_args(params, param, default=[]):
2825 ex_args = params.get(param)
2826 if ex_args is None:
2827 return default
2828 assert isinstance(ex_args, list)
2829 return ex_args
2830
2831
2832class ISO639Utils(object):
2833 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2834 _lang_map = {
2835 'aa': 'aar',
2836 'ab': 'abk',
2837 'ae': 'ave',
2838 'af': 'afr',
2839 'ak': 'aka',
2840 'am': 'amh',
2841 'an': 'arg',
2842 'ar': 'ara',
2843 'as': 'asm',
2844 'av': 'ava',
2845 'ay': 'aym',
2846 'az': 'aze',
2847 'ba': 'bak',
2848 'be': 'bel',
2849 'bg': 'bul',
2850 'bh': 'bih',
2851 'bi': 'bis',
2852 'bm': 'bam',
2853 'bn': 'ben',
2854 'bo': 'bod',
2855 'br': 'bre',
2856 'bs': 'bos',
2857 'ca': 'cat',
2858 'ce': 'che',
2859 'ch': 'cha',
2860 'co': 'cos',
2861 'cr': 'cre',
2862 'cs': 'ces',
2863 'cu': 'chu',
2864 'cv': 'chv',
2865 'cy': 'cym',
2866 'da': 'dan',
2867 'de': 'deu',
2868 'dv': 'div',
2869 'dz': 'dzo',
2870 'ee': 'ewe',
2871 'el': 'ell',
2872 'en': 'eng',
2873 'eo': 'epo',
2874 'es': 'spa',
2875 'et': 'est',
2876 'eu': 'eus',
2877 'fa': 'fas',
2878 'ff': 'ful',
2879 'fi': 'fin',
2880 'fj': 'fij',
2881 'fo': 'fao',
2882 'fr': 'fra',
2883 'fy': 'fry',
2884 'ga': 'gle',
2885 'gd': 'gla',
2886 'gl': 'glg',
2887 'gn': 'grn',
2888 'gu': 'guj',
2889 'gv': 'glv',
2890 'ha': 'hau',
2891 'he': 'heb',
2892 'hi': 'hin',
2893 'ho': 'hmo',
2894 'hr': 'hrv',
2895 'ht': 'hat',
2896 'hu': 'hun',
2897 'hy': 'hye',
2898 'hz': 'her',
2899 'ia': 'ina',
2900 'id': 'ind',
2901 'ie': 'ile',
2902 'ig': 'ibo',
2903 'ii': 'iii',
2904 'ik': 'ipk',
2905 'io': 'ido',
2906 'is': 'isl',
2907 'it': 'ita',
2908 'iu': 'iku',
2909 'ja': 'jpn',
2910 'jv': 'jav',
2911 'ka': 'kat',
2912 'kg': 'kon',
2913 'ki': 'kik',
2914 'kj': 'kua',
2915 'kk': 'kaz',
2916 'kl': 'kal',
2917 'km': 'khm',
2918 'kn': 'kan',
2919 'ko': 'kor',
2920 'kr': 'kau',
2921 'ks': 'kas',
2922 'ku': 'kur',
2923 'kv': 'kom',
2924 'kw': 'cor',
2925 'ky': 'kir',
2926 'la': 'lat',
2927 'lb': 'ltz',
2928 'lg': 'lug',
2929 'li': 'lim',
2930 'ln': 'lin',
2931 'lo': 'lao',
2932 'lt': 'lit',
2933 'lu': 'lub',
2934 'lv': 'lav',
2935 'mg': 'mlg',
2936 'mh': 'mah',
2937 'mi': 'mri',
2938 'mk': 'mkd',
2939 'ml': 'mal',
2940 'mn': 'mon',
2941 'mr': 'mar',
2942 'ms': 'msa',
2943 'mt': 'mlt',
2944 'my': 'mya',
2945 'na': 'nau',
2946 'nb': 'nob',
2947 'nd': 'nde',
2948 'ne': 'nep',
2949 'ng': 'ndo',
2950 'nl': 'nld',
2951 'nn': 'nno',
2952 'no': 'nor',
2953 'nr': 'nbl',
2954 'nv': 'nav',
2955 'ny': 'nya',
2956 'oc': 'oci',
2957 'oj': 'oji',
2958 'om': 'orm',
2959 'or': 'ori',
2960 'os': 'oss',
2961 'pa': 'pan',
2962 'pi': 'pli',
2963 'pl': 'pol',
2964 'ps': 'pus',
2965 'pt': 'por',
2966 'qu': 'que',
2967 'rm': 'roh',
2968 'rn': 'run',
2969 'ro': 'ron',
2970 'ru': 'rus',
2971 'rw': 'kin',
2972 'sa': 'san',
2973 'sc': 'srd',
2974 'sd': 'snd',
2975 'se': 'sme',
2976 'sg': 'sag',
2977 'si': 'sin',
2978 'sk': 'slk',
2979 'sl': 'slv',
2980 'sm': 'smo',
2981 'sn': 'sna',
2982 'so': 'som',
2983 'sq': 'sqi',
2984 'sr': 'srp',
2985 'ss': 'ssw',
2986 'st': 'sot',
2987 'su': 'sun',
2988 'sv': 'swe',
2989 'sw': 'swa',
2990 'ta': 'tam',
2991 'te': 'tel',
2992 'tg': 'tgk',
2993 'th': 'tha',
2994 'ti': 'tir',
2995 'tk': 'tuk',
2996 'tl': 'tgl',
2997 'tn': 'tsn',
2998 'to': 'ton',
2999 'tr': 'tur',
3000 'ts': 'tso',
3001 'tt': 'tat',
3002 'tw': 'twi',
3003 'ty': 'tah',
3004 'ug': 'uig',
3005 'uk': 'ukr',
3006 'ur': 'urd',
3007 'uz': 'uzb',
3008 've': 'ven',
3009 'vi': 'vie',
3010 'vo': 'vol',
3011 'wa': 'wln',
3012 'wo': 'wol',
3013 'xh': 'xho',
3014 'yi': 'yid',
3015 'yo': 'yor',
3016 'za': 'zha',
3017 'zh': 'zho',
3018 'zu': 'zul',
3019 }
3020
3021 @classmethod
3022 def short2long(cls, code):
3023 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3024 return cls._lang_map.get(code[:2])
3025
3026 @classmethod
3027 def long2short(cls, code):
3028 """Convert language code from ISO 639-2/T to ISO 639-1"""
3029 for short_name, long_name in cls._lang_map.items():
3030 if long_name == code:
3031 return short_name
3032
3033
3034class ISO3166Utils(object):
3035 # From http://data.okfn.org/data/core/country-list
3036 _country_map = {
3037 'AF': 'Afghanistan',
3038 'AX': 'Åland Islands',
3039 'AL': 'Albania',
3040 'DZ': 'Algeria',
3041 'AS': 'American Samoa',
3042 'AD': 'Andorra',
3043 'AO': 'Angola',
3044 'AI': 'Anguilla',
3045 'AQ': 'Antarctica',
3046 'AG': 'Antigua and Barbuda',
3047 'AR': 'Argentina',
3048 'AM': 'Armenia',
3049 'AW': 'Aruba',
3050 'AU': 'Australia',
3051 'AT': 'Austria',
3052 'AZ': 'Azerbaijan',
3053 'BS': 'Bahamas',
3054 'BH': 'Bahrain',
3055 'BD': 'Bangladesh',
3056 'BB': 'Barbados',
3057 'BY': 'Belarus',
3058 'BE': 'Belgium',
3059 'BZ': 'Belize',
3060 'BJ': 'Benin',
3061 'BM': 'Bermuda',
3062 'BT': 'Bhutan',
3063 'BO': 'Bolivia, Plurinational State of',
3064 'BQ': 'Bonaire, Sint Eustatius and Saba',
3065 'BA': 'Bosnia and Herzegovina',
3066 'BW': 'Botswana',
3067 'BV': 'Bouvet Island',
3068 'BR': 'Brazil',
3069 'IO': 'British Indian Ocean Territory',
3070 'BN': 'Brunei Darussalam',
3071 'BG': 'Bulgaria',
3072 'BF': 'Burkina Faso',
3073 'BI': 'Burundi',
3074 'KH': 'Cambodia',
3075 'CM': 'Cameroon',
3076 'CA': 'Canada',
3077 'CV': 'Cape Verde',
3078 'KY': 'Cayman Islands',
3079 'CF': 'Central African Republic',
3080 'TD': 'Chad',
3081 'CL': 'Chile',
3082 'CN': 'China',
3083 'CX': 'Christmas Island',
3084 'CC': 'Cocos (Keeling) Islands',
3085 'CO': 'Colombia',
3086 'KM': 'Comoros',
3087 'CG': 'Congo',
3088 'CD': 'Congo, the Democratic Republic of the',
3089 'CK': 'Cook Islands',
3090 'CR': 'Costa Rica',
3091 'CI': 'Côte d\'Ivoire',
3092 'HR': 'Croatia',
3093 'CU': 'Cuba',
3094 'CW': 'Curaçao',
3095 'CY': 'Cyprus',
3096 'CZ': 'Czech Republic',
3097 'DK': 'Denmark',
3098 'DJ': 'Djibouti',
3099 'DM': 'Dominica',
3100 'DO': 'Dominican Republic',
3101 'EC': 'Ecuador',
3102 'EG': 'Egypt',
3103 'SV': 'El Salvador',
3104 'GQ': 'Equatorial Guinea',
3105 'ER': 'Eritrea',
3106 'EE': 'Estonia',
3107 'ET': 'Ethiopia',
3108 'FK': 'Falkland Islands (Malvinas)',
3109 'FO': 'Faroe Islands',
3110 'FJ': 'Fiji',
3111 'FI': 'Finland',
3112 'FR': 'France',
3113 'GF': 'French Guiana',
3114 'PF': 'French Polynesia',
3115 'TF': 'French Southern Territories',
3116 'GA': 'Gabon',
3117 'GM': 'Gambia',
3118 'GE': 'Georgia',
3119 'DE': 'Germany',
3120 'GH': 'Ghana',
3121 'GI': 'Gibraltar',
3122 'GR': 'Greece',
3123 'GL': 'Greenland',
3124 'GD': 'Grenada',
3125 'GP': 'Guadeloupe',
3126 'GU': 'Guam',
3127 'GT': 'Guatemala',
3128 'GG': 'Guernsey',
3129 'GN': 'Guinea',
3130 'GW': 'Guinea-Bissau',
3131 'GY': 'Guyana',
3132 'HT': 'Haiti',
3133 'HM': 'Heard Island and McDonald Islands',
3134 'VA': 'Holy See (Vatican City State)',
3135 'HN': 'Honduras',
3136 'HK': 'Hong Kong',
3137 'HU': 'Hungary',
3138 'IS': 'Iceland',
3139 'IN': 'India',
3140 'ID': 'Indonesia',
3141 'IR': 'Iran, Islamic Republic of',
3142 'IQ': 'Iraq',
3143 'IE': 'Ireland',
3144 'IM': 'Isle of Man',
3145 'IL': 'Israel',
3146 'IT': 'Italy',
3147 'JM': 'Jamaica',
3148 'JP': 'Japan',
3149 'JE': 'Jersey',
3150 'JO': 'Jordan',
3151 'KZ': 'Kazakhstan',
3152 'KE': 'Kenya',
3153 'KI': 'Kiribati',
3154 'KP': 'Korea, Democratic People\'s Republic of',
3155 'KR': 'Korea, Republic of',
3156 'KW': 'Kuwait',
3157 'KG': 'Kyrgyzstan',
3158 'LA': 'Lao People\'s Democratic Republic',
3159 'LV': 'Latvia',
3160 'LB': 'Lebanon',
3161 'LS': 'Lesotho',
3162 'LR': 'Liberia',
3163 'LY': 'Libya',
3164 'LI': 'Liechtenstein',
3165 'LT': 'Lithuania',
3166 'LU': 'Luxembourg',
3167 'MO': 'Macao',
3168 'MK': 'Macedonia, the Former Yugoslav Republic of',
3169 'MG': 'Madagascar',
3170 'MW': 'Malawi',
3171 'MY': 'Malaysia',
3172 'MV': 'Maldives',
3173 'ML': 'Mali',
3174 'MT': 'Malta',
3175 'MH': 'Marshall Islands',
3176 'MQ': 'Martinique',
3177 'MR': 'Mauritania',
3178 'MU': 'Mauritius',
3179 'YT': 'Mayotte',
3180 'MX': 'Mexico',
3181 'FM': 'Micronesia, Federated States of',
3182 'MD': 'Moldova, Republic of',
3183 'MC': 'Monaco',
3184 'MN': 'Mongolia',
3185 'ME': 'Montenegro',
3186 'MS': 'Montserrat',
3187 'MA': 'Morocco',
3188 'MZ': 'Mozambique',
3189 'MM': 'Myanmar',
3190 'NA': 'Namibia',
3191 'NR': 'Nauru',
3192 'NP': 'Nepal',
3193 'NL': 'Netherlands',
3194 'NC': 'New Caledonia',
3195 'NZ': 'New Zealand',
3196 'NI': 'Nicaragua',
3197 'NE': 'Niger',
3198 'NG': 'Nigeria',
3199 'NU': 'Niue',
3200 'NF': 'Norfolk Island',
3201 'MP': 'Northern Mariana Islands',
3202 'NO': 'Norway',
3203 'OM': 'Oman',
3204 'PK': 'Pakistan',
3205 'PW': 'Palau',
3206 'PS': 'Palestine, State of',
3207 'PA': 'Panama',
3208 'PG': 'Papua New Guinea',
3209 'PY': 'Paraguay',
3210 'PE': 'Peru',
3211 'PH': 'Philippines',
3212 'PN': 'Pitcairn',
3213 'PL': 'Poland',
3214 'PT': 'Portugal',
3215 'PR': 'Puerto Rico',
3216 'QA': 'Qatar',
3217 'RE': 'Réunion',
3218 'RO': 'Romania',
3219 'RU': 'Russian Federation',
3220 'RW': 'Rwanda',
3221 'BL': 'Saint Barthélemy',
3222 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3223 'KN': 'Saint Kitts and Nevis',
3224 'LC': 'Saint Lucia',
3225 'MF': 'Saint Martin (French part)',
3226 'PM': 'Saint Pierre and Miquelon',
3227 'VC': 'Saint Vincent and the Grenadines',
3228 'WS': 'Samoa',
3229 'SM': 'San Marino',
3230 'ST': 'Sao Tome and Principe',
3231 'SA': 'Saudi Arabia',
3232 'SN': 'Senegal',
3233 'RS': 'Serbia',
3234 'SC': 'Seychelles',
3235 'SL': 'Sierra Leone',
3236 'SG': 'Singapore',
3237 'SX': 'Sint Maarten (Dutch part)',
3238 'SK': 'Slovakia',
3239 'SI': 'Slovenia',
3240 'SB': 'Solomon Islands',
3241 'SO': 'Somalia',
3242 'ZA': 'South Africa',
3243 'GS': 'South Georgia and the South Sandwich Islands',
3244 'SS': 'South Sudan',
3245 'ES': 'Spain',
3246 'LK': 'Sri Lanka',
3247 'SD': 'Sudan',
3248 'SR': 'Suriname',
3249 'SJ': 'Svalbard and Jan Mayen',
3250 'SZ': 'Swaziland',
3251 'SE': 'Sweden',
3252 'CH': 'Switzerland',
3253 'SY': 'Syrian Arab Republic',
3254 'TW': 'Taiwan, Province of China',
3255 'TJ': 'Tajikistan',
3256 'TZ': 'Tanzania, United Republic of',
3257 'TH': 'Thailand',
3258 'TL': 'Timor-Leste',
3259 'TG': 'Togo',
3260 'TK': 'Tokelau',
3261 'TO': 'Tonga',
3262 'TT': 'Trinidad and Tobago',
3263 'TN': 'Tunisia',
3264 'TR': 'Turkey',
3265 'TM': 'Turkmenistan',
3266 'TC': 'Turks and Caicos Islands',
3267 'TV': 'Tuvalu',
3268 'UG': 'Uganda',
3269 'UA': 'Ukraine',
3270 'AE': 'United Arab Emirates',
3271 'GB': 'United Kingdom',
3272 'US': 'United States',
3273 'UM': 'United States Minor Outlying Islands',
3274 'UY': 'Uruguay',
3275 'UZ': 'Uzbekistan',
3276 'VU': 'Vanuatu',
3277 'VE': 'Venezuela, Bolivarian Republic of',
3278 'VN': 'Viet Nam',
3279 'VG': 'Virgin Islands, British',
3280 'VI': 'Virgin Islands, U.S.',
3281 'WF': 'Wallis and Futuna',
3282 'EH': 'Western Sahara',
3283 'YE': 'Yemen',
3284 'ZM': 'Zambia',
3285 'ZW': 'Zimbabwe',
3286 }
3287
3288 @classmethod
3289 def short2full(cls, code):
3290 """Convert an ISO 3166-2 country code to the corresponding full name"""
3291 return cls._country_map.get(code.upper())
3292
3293
3294class GeoUtils(object):
3295 # Major IPv4 address blocks per country
3296 _country_ip_map = {
3297 'AD': '85.94.160.0/19',
3298 'AE': '94.200.0.0/13',
3299 'AF': '149.54.0.0/17',
3300 'AG': '209.59.64.0/18',
3301 'AI': '204.14.248.0/21',
3302 'AL': '46.99.0.0/16',
3303 'AM': '46.70.0.0/15',
3304 'AO': '105.168.0.0/13',
3305 'AP': '159.117.192.0/21',
3306 'AR': '181.0.0.0/12',
3307 'AS': '202.70.112.0/20',
3308 'AT': '84.112.0.0/13',
3309 'AU': '1.128.0.0/11',
3310 'AW': '181.41.0.0/18',
3311 'AZ': '5.191.0.0/16',
3312 'BA': '31.176.128.0/17',
3313 'BB': '65.48.128.0/17',
3314 'BD': '114.130.0.0/16',
3315 'BE': '57.0.0.0/8',
3316 'BF': '129.45.128.0/17',
3317 'BG': '95.42.0.0/15',
3318 'BH': '37.131.0.0/17',
3319 'BI': '154.117.192.0/18',
3320 'BJ': '137.255.0.0/16',
3321 'BL': '192.131.134.0/24',
3322 'BM': '196.12.64.0/18',
3323 'BN': '156.31.0.0/16',
3324 'BO': '161.56.0.0/16',
3325 'BQ': '161.0.80.0/20',
3326 'BR': '152.240.0.0/12',
3327 'BS': '24.51.64.0/18',
3328 'BT': '119.2.96.0/19',
3329 'BW': '168.167.0.0/16',
3330 'BY': '178.120.0.0/13',
3331 'BZ': '179.42.192.0/18',
3332 'CA': '99.224.0.0/11',
3333 'CD': '41.243.0.0/16',
3334 'CF': '196.32.200.0/21',
3335 'CG': '197.214.128.0/17',
3336 'CH': '85.0.0.0/13',
3337 'CI': '154.232.0.0/14',
3338 'CK': '202.65.32.0/19',
3339 'CL': '152.172.0.0/14',
3340 'CM': '165.210.0.0/15',
3341 'CN': '36.128.0.0/10',
3342 'CO': '181.240.0.0/12',
3343 'CR': '201.192.0.0/12',
3344 'CU': '152.206.0.0/15',
3345 'CV': '165.90.96.0/19',
3346 'CW': '190.88.128.0/17',
3347 'CY': '46.198.0.0/15',
3348 'CZ': '88.100.0.0/14',
3349 'DE': '53.0.0.0/8',
3350 'DJ': '197.241.0.0/17',
3351 'DK': '87.48.0.0/12',
3352 'DM': '192.243.48.0/20',
3353 'DO': '152.166.0.0/15',
3354 'DZ': '41.96.0.0/12',
3355 'EC': '186.68.0.0/15',
3356 'EE': '90.190.0.0/15',
3357 'EG': '156.160.0.0/11',
3358 'ER': '196.200.96.0/20',
3359 'ES': '88.0.0.0/11',
3360 'ET': '196.188.0.0/14',
3361 'EU': '2.16.0.0/13',
3362 'FI': '91.152.0.0/13',
3363 'FJ': '144.120.0.0/16',
3364 'FM': '119.252.112.0/20',
3365 'FO': '88.85.32.0/19',
3366 'FR': '90.0.0.0/9',
3367 'GA': '41.158.0.0/15',
3368 'GB': '25.0.0.0/8',
3369 'GD': '74.122.88.0/21',
3370 'GE': '31.146.0.0/16',
3371 'GF': '161.22.64.0/18',
3372 'GG': '62.68.160.0/19',
3373 'GH': '45.208.0.0/14',
3374 'GI': '85.115.128.0/19',
3375 'GL': '88.83.0.0/19',
3376 'GM': '160.182.0.0/15',
3377 'GN': '197.149.192.0/18',
3378 'GP': '104.250.0.0/19',
3379 'GQ': '105.235.224.0/20',
3380 'GR': '94.64.0.0/13',
3381 'GT': '168.234.0.0/16',
3382 'GU': '168.123.0.0/16',
3383 'GW': '197.214.80.0/20',
3384 'GY': '181.41.64.0/18',
3385 'HK': '113.252.0.0/14',
3386 'HN': '181.210.0.0/16',
3387 'HR': '93.136.0.0/13',
3388 'HT': '148.102.128.0/17',
3389 'HU': '84.0.0.0/14',
3390 'ID': '39.192.0.0/10',
3391 'IE': '87.32.0.0/12',
3392 'IL': '79.176.0.0/13',
3393 'IM': '5.62.80.0/20',
3394 'IN': '117.192.0.0/10',
3395 'IO': '203.83.48.0/21',
3396 'IQ': '37.236.0.0/14',
3397 'IR': '2.176.0.0/12',
3398 'IS': '82.221.0.0/16',
3399 'IT': '79.0.0.0/10',
3400 'JE': '87.244.64.0/18',
3401 'JM': '72.27.0.0/17',
3402 'JO': '176.29.0.0/16',
3403 'JP': '126.0.0.0/8',
3404 'KE': '105.48.0.0/12',
3405 'KG': '158.181.128.0/17',
3406 'KH': '36.37.128.0/17',
3407 'KI': '103.25.140.0/22',
3408 'KM': '197.255.224.0/20',
3409 'KN': '198.32.32.0/19',
3410 'KP': '175.45.176.0/22',
3411 'KR': '175.192.0.0/10',
3412 'KW': '37.36.0.0/14',
3413 'KY': '64.96.0.0/15',
3414 'KZ': '2.72.0.0/13',
3415 'LA': '115.84.64.0/18',
3416 'LB': '178.135.0.0/16',
3417 'LC': '192.147.231.0/24',
3418 'LI': '82.117.0.0/19',
3419 'LK': '112.134.0.0/15',
3420 'LR': '41.86.0.0/19',
3421 'LS': '129.232.0.0/17',
3422 'LT': '78.56.0.0/13',
3423 'LU': '188.42.0.0/16',
3424 'LV': '46.109.0.0/16',
3425 'LY': '41.252.0.0/14',
3426 'MA': '105.128.0.0/11',
3427 'MC': '88.209.64.0/18',
3428 'MD': '37.246.0.0/16',
3429 'ME': '178.175.0.0/17',
3430 'MF': '74.112.232.0/21',
3431 'MG': '154.126.0.0/17',
3432 'MH': '117.103.88.0/21',
3433 'MK': '77.28.0.0/15',
3434 'ML': '154.118.128.0/18',
3435 'MM': '37.111.0.0/17',
3436 'MN': '49.0.128.0/17',
3437 'MO': '60.246.0.0/16',
3438 'MP': '202.88.64.0/20',
3439 'MQ': '109.203.224.0/19',
3440 'MR': '41.188.64.0/18',
3441 'MS': '208.90.112.0/22',
3442 'MT': '46.11.0.0/16',
3443 'MU': '105.16.0.0/12',
3444 'MV': '27.114.128.0/18',
3445 'MW': '105.234.0.0/16',
3446 'MX': '187.192.0.0/11',
3447 'MY': '175.136.0.0/13',
3448 'MZ': '197.218.0.0/15',
3449 'NA': '41.182.0.0/16',
3450 'NC': '101.101.0.0/18',
3451 'NE': '197.214.0.0/18',
3452 'NF': '203.17.240.0/22',
3453 'NG': '105.112.0.0/12',
3454 'NI': '186.76.0.0/15',
3455 'NL': '145.96.0.0/11',
3456 'NO': '84.208.0.0/13',
3457 'NP': '36.252.0.0/15',
3458 'NR': '203.98.224.0/19',
3459 'NU': '49.156.48.0/22',
3460 'NZ': '49.224.0.0/14',
3461 'OM': '5.36.0.0/15',
3462 'PA': '186.72.0.0/15',
3463 'PE': '186.160.0.0/14',
3464 'PF': '123.50.64.0/18',
3465 'PG': '124.240.192.0/19',
3466 'PH': '49.144.0.0/13',
3467 'PK': '39.32.0.0/11',
3468 'PL': '83.0.0.0/11',
3469 'PM': '70.36.0.0/20',
3470 'PR': '66.50.0.0/16',
3471 'PS': '188.161.0.0/16',
3472 'PT': '85.240.0.0/13',
3473 'PW': '202.124.224.0/20',
3474 'PY': '181.120.0.0/14',
3475 'QA': '37.210.0.0/15',
3476 'RE': '139.26.0.0/16',
3477 'RO': '79.112.0.0/13',
3478 'RS': '178.220.0.0/14',
3479 'RU': '5.136.0.0/13',
3480 'RW': '105.178.0.0/15',
3481 'SA': '188.48.0.0/13',
3482 'SB': '202.1.160.0/19',
3483 'SC': '154.192.0.0/11',
3484 'SD': '154.96.0.0/13',
3485 'SE': '78.64.0.0/12',
3486 'SG': '152.56.0.0/14',
3487 'SI': '188.196.0.0/14',
3488 'SK': '78.98.0.0/15',
3489 'SL': '197.215.0.0/17',
3490 'SM': '89.186.32.0/19',
3491 'SN': '41.82.0.0/15',
3492 'SO': '197.220.64.0/19',
3493 'SR': '186.179.128.0/17',
3494 'SS': '105.235.208.0/21',
3495 'ST': '197.159.160.0/19',
3496 'SV': '168.243.0.0/16',
3497 'SX': '190.102.0.0/20',
3498 'SY': '5.0.0.0/16',
3499 'SZ': '41.84.224.0/19',
3500 'TC': '65.255.48.0/20',
3501 'TD': '154.68.128.0/19',
3502 'TG': '196.168.0.0/14',
3503 'TH': '171.96.0.0/13',
3504 'TJ': '85.9.128.0/18',
3505 'TK': '27.96.24.0/21',
3506 'TL': '180.189.160.0/20',
3507 'TM': '95.85.96.0/19',
3508 'TN': '197.0.0.0/11',
3509 'TO': '175.176.144.0/21',
3510 'TR': '78.160.0.0/11',
3511 'TT': '186.44.0.0/15',
3512 'TV': '202.2.96.0/19',
3513 'TW': '120.96.0.0/11',
3514 'TZ': '156.156.0.0/14',
3515 'UA': '93.72.0.0/13',
3516 'UG': '154.224.0.0/13',
3517 'US': '3.0.0.0/8',
3518 'UY': '167.56.0.0/13',
3519 'UZ': '82.215.64.0/18',
3520 'VA': '212.77.0.0/19',
3521 'VC': '24.92.144.0/20',
3522 'VE': '186.88.0.0/13',
3523 'VG': '172.103.64.0/18',
3524 'VI': '146.226.0.0/16',
3525 'VN': '14.160.0.0/11',
3526 'VU': '202.80.32.0/20',
3527 'WF': '117.20.32.0/21',
3528 'WS': '202.4.32.0/19',
3529 'YE': '134.35.0.0/16',
3530 'YT': '41.242.116.0/22',
3531 'ZA': '41.0.0.0/11',
3532 'ZM': '165.56.0.0/13',
3533 'ZW': '41.85.192.0/19',
3534 }
3535
3536 @classmethod
3537 def random_ipv4(cls, code):
3538 block = cls._country_ip_map.get(code.upper())
3539 if not block:
3540 return None
3541 addr, preflen = block.split('/')
3542 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
3543 addr_max = addr_min | (0xffffffff >> int(preflen))
3544 return compat_str(socket.inet_ntoa(
3545 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
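# Sketch: GeoUtils.random_ipv4('DE') picks a random address inside the
# 53.0.0.0/8 block listed above (e.g. '53.17.84.9'); unknown country codes
# yield None.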
3546
3547
3548class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
3549 def __init__(self, proxies=None):
3550 # Set default handlers
3551 for type in ('http', 'https'):
3552 setattr(self, '%s_open' % type,
3553 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
3554 meth(r, proxy, type))
3555 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
3556
3557 def proxy_open(self, req, proxy, type):
3558 req_proxy = req.headers.get('Ytdl-request-proxy')
3559 if req_proxy is not None:
3560 proxy = req_proxy
3561 del req.headers['Ytdl-request-proxy']
3562
3563 if proxy == '__noproxy__':
3564 return None # No Proxy
3565 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
3566 req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers take care of wrapping the socket with SOCKS
3568 return None
3569 return compat_urllib_request.ProxyHandler.proxy_open(
3570 self, req, proxy, type)
3571
3572
3573# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3574# released into Public Domain
3575# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3576
3577def long_to_bytes(n, blocksize=0):
3578 """long_to_bytes(n:long, blocksize:int) : string
3579 Convert a long integer to a byte string.
3580
3581 If optional blocksize is given and greater than zero, pad the front of the
3582 byte string with binary zeros so that the length is a multiple of
3583 blocksize.
3584 """
3585 # after much testing, this algorithm was deemed to be the fastest
3586 s = b''
3587 n = int(n)
3588 while n > 0:
3589 s = compat_struct_pack('>I', n & 0xffffffff) + s
3590 n = n >> 32
3591 # strip off leading zeros
3592 for i in range(len(s)):
3593 if s[i] != b'\000'[0]:
3594 break
3595 else:
3596 # only happens when n == 0
3597 s = b'\000'
3598 i = 0
3599 s = s[i:]
3600 # add back some pad bytes. this could be done more efficiently w.r.t. the
3601 # de-padding being done above, but sigh...
3602 if blocksize > 0 and len(s) % blocksize:
3603 s = (blocksize - len(s) % blocksize) * b'\000' + s
3604 return s
3605
3606
3607def bytes_to_long(s):
3608 """bytes_to_long(string) : long
3609 Convert a byte string to a long integer.
3610
3611 This is (essentially) the inverse of long_to_bytes().
3612 """
3613 acc = 0
3614 length = len(s)
3615 if length % 4:
3616 extra = (4 - length % 4)
3617 s = b'\000' * extra + s
3618 length = length + extra
3619 for i in range(0, length, 4):
3620 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
3621 return acc
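# Round-trip sketch: bytes_to_long(b'\x01\x00') -> 256 and
# long_to_bytes(256, blocksize=4) -> b'\x00\x00\x01\x00'.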
3622
3623
3624def ohdave_rsa_encrypt(data, exponent, modulus):
3625 '''
3626 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
3627
3628 Input:
3629 data: data to encrypt, bytes-like object
3630 exponent, modulus: parameter e and N of RSA algorithm, both integer
3631 Output: hex string of encrypted data
3632
3633 Limitation: supports one block encryption only
3634 '''
3635
3636 payload = int(binascii.hexlify(data[::-1]), 16)
3637 encrypted = pow(payload, exponent, modulus)
3638 return '%x' % encrypted
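# Sketch: the input bytes are interpreted little-endian (note the data[::-1]),
# so ohdave_rsa_encrypt(b'\x02', 3, 101) == '%x' % pow(2, 3, 101) == '8'.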
3639
3640
3641def pkcs1pad(data, length):
3642 """
3643 Padding input data with PKCS#1 scheme
3644
3645 @param {int[]} data input data
3646 @param {int} length target length
3647 @returns {int[]} padded data
3648 """
3649 if len(data) > length - 11:
3650 raise ValueError('Input data too long for PKCS#1 padding')
3651
3652 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
3653 return [0, 2] + pseudo_random + [0] + data
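# Layout sketch: pkcs1pad([1, 2, 3], 16) produces the classic PKCS#1 v1.5
# block [0, 2, r, r, r, r, r, r, r, r, r, r, 0, 1, 2, 3] with ten random
# padding bytes r.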
3654
3655
3656def encode_base_n(num, n, table=None):
3657 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
3658 if not table:
3659 table = FULL_TABLE[:n]
3660
3661 if n > len(table):
3662 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
3663
3664 if num == 0:
3665 return table[0]
3666
3667 ret = ''
3668 while num:
3669 ret = table[num % n] + ret
3670 num = num // n
3671 return ret
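# e.g. encode_base_n(255, 16) -> 'ff' and encode_base_n(5, 2) -> '101'; a
# custom symbol table can be supplied for packer-style alphabets (sketch).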
3672
3673
3674def decode_packed_codes(code):
3675 mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
3677 base = int(base)
3678 count = int(count)
3679 symbols = symbols.split('|')
3680 symbol_table = {}
3681
3682 while count:
3683 count -= 1
3684 base_n_count = encode_base_n(count, base)
3685 symbol_table[base_n_count] = symbols[count] or base_n_count
3686
    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
3690
3691
3692def parse_m3u8_attributes(attrib):
3693 info = {}
3694 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
3695 if val.startswith('"'):
3696 val = val[1:-1]
3697 info[key] = val
3698 return info
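# Sketch: parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401f"')
#   ->  {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401f'}
# (quoted values may contain commas; all values stay strings).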
3699
3700
3701def urshift(val, n):
3702 return val >> n if val >= 0 else (val + 0x100000000) >> n
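# Sketch: urshift(-1, 28) -> 15, mimicking JavaScript's unsigned '>>>' on
# 32-bit values, whereas Python's plain '>>' would preserve the sign.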
3703
3704
3705# Based on png2str() written by @gdkchan and improved by @yokrysty
3706# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
3707def decode_png(png_data):
3708 # Reference: https://www.w3.org/TR/PNG/
3709 header = png_data[8:]
3710
3711 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
3712 raise IOError('Not a valid PNG file.')
3713
3714 int_map = {1: '>B', 2: '>H', 4: '>I'}
3715 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
3716
3717 chunks = []
3718
3719 while header:
3720 length = unpack_integer(header[:4])
3721 header = header[4:]
3722
3723 chunk_type = header[:4]
3724 header = header[4:]
3725
3726 chunk_data = header[:length]
3727 header = header[length:]
3728
3729 header = header[4:] # Skip CRC
3730
3731 chunks.append({
3732 'type': chunk_type,
3733 'length': length,
3734 'data': chunk_data
3735 })
3736
3737 ihdr = chunks[0]['data']
3738
3739 width = unpack_integer(ihdr[:4])
3740 height = unpack_integer(ihdr[4:8])
3741
3742 idat = b''
3743
3744 for chunk in chunks:
3745 if chunk['type'] == b'IDAT':
3746 idat += chunk['data']
3747
3748 if not idat:
3749 raise IOError('Unable to read PNG data.')
3750
3751 decompressed_data = bytearray(zlib.decompress(idat))
3752
3753 stride = width * 3
3754 pixels = []
3755
3756 def _get_pixel(idx):
3757 x = idx % stride
3758 y = idx // stride
3759 return pixels[y][x]
3760
3761 for y in range(height):
3762 basePos = y * (1 + stride)
3763 filter_type = decompressed_data[basePos]
3764
3765 current_row = []
3766
3767 pixels.append(current_row)
3768
3769 for x in range(stride):
3770 color = decompressed_data[1 + basePos + x]
3771 basex = y * stride + x
3772 left = 0
3773 up = 0
3774
3775 if x > 2:
3776 left = _get_pixel(basex - 3)
3777 if y > 0:
3778 up = _get_pixel(basex - stride)
3779
3780 if filter_type == 1: # Sub
3781 color = (color + left) & 0xff
3782 elif filter_type == 2: # Up
3783 color = (color + up) & 0xff
3784 elif filter_type == 3: # Average
3785 color = (color + ((left + up) >> 1)) & 0xff
3786 elif filter_type == 4: # Paeth
3787 a = left
3788 b = up
3789 c = 0
3790
3791 if x > 2 and y > 0:
3792 c = _get_pixel(basex - stride - 3)
3793
3794 p = a + b - c
3795
3796 pa = abs(p - a)
3797 pb = abs(p - b)
3798 pc = abs(p - c)
3799
3800 if pa <= pb and pa <= pc:
3801 color = (color + a) & 0xff
3802 elif pb <= pc:
3803 color = (color + b) & 0xff
3804 else:
3805 color = (color + c) & 0xff
3806
3807 current_row.append(color)
3808
3809 return width, height, pixels
3810
3811
3812def write_xattr(path, key, value):
3813 # This mess below finds the best xattr tool for the job
3814 try:
3815 # try the pyxattr module...
3816 import xattr
3817
3818 if hasattr(xattr, 'set'): # pyxattr
3819 # Unicode arguments are not supported in python-pyxattr until
3820 # version 0.5.0
3821 # See https://github.com/rg3/youtube-dl/issues/5498
3822 pyxattr_required_version = '0.5.0'
3823 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
3824 # TODO: fallback to CLI tools
3825 raise XAttrUnavailableError(
3826 'python-pyxattr is detected but is too old. '
3827 'youtube-dl requires %s or above while your version is %s. '
3828 'Falling back to other xattr implementations' % (
3829 pyxattr_required_version, xattr.__version__))
3830
3831 setxattr = xattr.set
3832 else: # xattr
3833 setxattr = xattr.setxattr
3834
3835 try:
3836 setxattr(path, key, value)
3837 except EnvironmentError as e:
3838 raise XAttrMetadataError(e.errno, e.strerror)
3839
3840 except ImportError:
3841 if compat_os_name == 'nt':
3842 # Write xattrs to NTFS Alternate Data Streams:
3843 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
3844 assert ':' not in key
3845 assert os.path.exists(path)
3846
3847 ads_fn = path + ':' + key
3848 try:
3849 with open(ads_fn, 'wb') as f:
3850 f.write(value)
3851 except EnvironmentError as e:
3852 raise XAttrMetadataError(e.errno, e.strerror)
3853 else:
3854 user_has_setfattr = check_executable('setfattr', ['--version'])
3855 user_has_xattr = check_executable('xattr', ['-h'])
3856
3857 if user_has_setfattr or user_has_xattr:
3858
3859 value = value.decode('utf-8')
3860 if user_has_setfattr:
3861 executable = 'setfattr'
3862 opts = ['-n', key, '-v', value]
3863 elif user_has_xattr:
3864 executable = 'xattr'
3865 opts = ['-w', key, value]
3866
3867 cmd = ([encodeFilename(executable, True)] +
3868 [encodeArgument(o) for o in opts] +
3869 [encodeFilename(path, True)])
3870
3871 try:
3872 p = subprocess.Popen(
3873 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
3874 except EnvironmentError as e:
3875 raise XAttrMetadataError(e.errno, e.strerror)
3876 stdout, stderr = p.communicate()
3877 stderr = stderr.decode('utf-8', 'replace')
3878 if p.returncode != 0:
3879 raise XAttrMetadataError(p.returncode, stderr)
3880
3881 else:
                # On Unix, but neither pyxattr nor the setfattr/xattr tools could be found.
3883 if sys.platform.startswith('linux'):
3884 raise XAttrUnavailableError(
3885 "Couldn't find a tool to set the xattrs. "
3886 "Install either the python 'pyxattr' or 'xattr' "
3887 "modules, or the GNU 'attr' package "
3888 "(which contains the 'setfattr' tool).")
3889 else:
3890 raise XAttrUnavailableError(
3891 "Couldn't find a tool to set the xattrs. "
3892 "Install either the python 'xattr' module, "
3893 "or the 'xattr' binary.")
3894
3895
3896def random_birthday(year_field, month_field, day_field):
3897 return {
3898 year_field: str(random.randint(1950, 1995)),
3899 month_field: str(random.randint(1, 12)),
3900 day_field: str(random.randint(1, 31)),
3901 }