]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
[utils] add mimetypes to determine manifest ext(m3u8, f4m, mpd)
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import errno
15 import functools
16 import gzip
17 import io
18 import itertools
19 import json
20 import locale
21 import math
22 import operator
23 import os
24 import pipes
25 import platform
26 import re
27 import socket
28 import ssl
29 import subprocess
30 import sys
31 import tempfile
32 import traceback
33 import xml.etree.ElementTree
34 import zlib
35
36 from .compat import (
37 compat_HTMLParser,
38 compat_basestring,
39 compat_chr,
40 compat_etree_fromstring,
41 compat_html_entities,
42 compat_html_entities_html5,
43 compat_http_client,
44 compat_kwargs,
45 compat_parse_qs,
46 compat_shlex_quote,
47 compat_socket_create_connection,
48 compat_str,
49 compat_struct_pack,
50 compat_urllib_error,
51 compat_urllib_parse,
52 compat_urllib_parse_urlencode,
53 compat_urllib_parse_urlparse,
54 compat_urllib_parse_unquote_plus,
55 compat_urllib_request,
56 compat_urlparse,
57 compat_xpath,
58 )
59
60 from .socks import (
61 ProxyType,
62 sockssocket,
63 )
64
65
def register_socks_protocols():
    """Make urlparse treat SOCKS schemes as carrying a netloc.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose protocol is not listed in
    urlparse.uses_netloc are not handled correctly, so the SOCKS schemes are
    "registered" here.
    """
    missing = [scheme for scheme in ('socks', 'socks4', 'socks4a', 'socks5')
               if scheme not in compat_urlparse.uses_netloc]
    compat_urlparse.uses_netloc.extend(missing)
73
74
# Type of a compiled regular expression object; this is not clearly
# defined (exposed) otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers added to every request (see YoutubeDLHandler.http_request)
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Sentinel meaning "no default was supplied", so that None remains a usable
# default value (see xpath_element() and friends)
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# File extensions recognized as media/manifest files
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# Accented char -> ASCII replacement table; needed for sanitizing filenames
# in restricted mode (see sanitize_filename)
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

# strptime() patterns tried, in order, when parsing free-form dates
# (see date_formats()/unified_strdate())
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%b %d %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
)

# Extra patterns for locales that write the day first (e.g. 01/02/2003 = 1 Feb)
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# Extra patterns for locales that write the month first (e.g. 01/02/2003 = 2 Jan)
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
155
156
def preferredencoding():
    """Return the best guess at the system's preferred text encoding.

    Based on locale.getpreferredencoding(); falls back to 'UTF-8' when
    the lookup fails or the reported codec cannot actually encode text.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Sanity check: make sure the codec is real and usable
        'TEST'.encode(encoding)
    except Exception:
        return 'UTF-8'
    return encoding
170
171
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible.

    The data is first written to a temp file in the same directory, which
    is then renamed over fn so readers never observe a half-written file.
    On Windows the existing file is unlinked first, so the replacement is
    not fully atomic there.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        # Keep the temp file next to the target so os.rename below stays
        # on the same filesystem
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best effort cleanup of the temp file; re-raise the original error
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
224
225
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] (or xpath[@key] when val is None) """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            expr = '%s[@%s]' % (xpath, key)
        else:
            expr = "%s[@%s='%s']" % (xpath, key, val)
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] (manual scan; 2.6 lacks predicates) """
        for candidate in node.findall(compat_xpath(xpath)):
            if key not in candidate.attrib:
                continue
            if val is None or candidate.attrib.get(key) == val:
                return candidate
        return None
240
241 # On python2.6 the xml.etree.ElementTree.Element methods don't support
242 # the namespace parameter
243
244
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of *path* into '{uri}tag' using *ns_map*.

    Steps without a prefix are kept as-is.
    """
    def expand(component):
        parts = component.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(c) for c in path.split('/'))
255
256
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching *xpath* (a string or list of strings).

    Returns *default* when supplied and nothing matched; raises
    ExtractorError when fatal and nothing matched; otherwise returns None.
    """
    def _find(xp):
        return node.find(compat_xpath(xp))

    if isinstance(xpath, (str, compat_str)):
        n = _find(xpath)
    else:
        # Try each alternative xpath until one matches
        for xp in xpath:
            n = _find(xp)
            if n is not None:
                break

    if n is not None:
        return n
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML element %s' % (xpath if name is None else name))
    return None
278
279
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's text content."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is not None:
        return n.text
    # Element found but it has no text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML element\'s text %s' % (xpath if name is None else name))
    return None
293
294
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute *key* of the first element matching xpath[@key]."""
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
306
307
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals *id* in *html*."""
    return get_element_by_attribute('id', id, html)
311
312
def get_element_by_attribute(attribute, value, html):
    """Return the (unescaped) content of the first tag in *html* carrying
    the given attribute/value pair, or None when nothing matches."""
    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if m is None:
        return None
    content = m.group('content')

    # Strip a surrounding quote pair, if present
    if content[:1] in ('"', "'"):
        content = content[1:-1]

    return unescapeHTML(content)
334
335
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser that records the attributes of a single element.

    After feeding it one element, the attribute dict is available as .attrs.
    """

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.attrs = {}

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
344
345
def extract_attributes(html_element):
    """Parse a single HTML element string into a dict of its attributes.

    For example
        <el a="foo" B="bar" c="&98;az" d=boz
            empty= noval entity="&amp;" sq='"' dq="'">
    yields
        {'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
         'empty': '', 'noval': None, 'entity': '&',
         'sq': '"', 'dq': '\''}

    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    parser.feed(html_element)
    parser.close()
    return parser.attrs
366
367
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    if html is None:  # Convenience for sanitizing descriptions etc.
        return None

    # Turn <br> / </p><p> boundaries into newlines, dropping literal newlines
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Drop any remaining tags, then decode entities
    html = re.sub('<.*?>', '', html)
    html = unescapeHTML(html)
    return html.strip()
383
384
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                # Switch stdout to binary mode so media bytes are not
                # mangled by CRLF translation
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission error will not be fixed by renaming - give up
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
415
416
def timeconvert(timestr):
    """Convert an RFC 2822 date string into a UNIX timestamp (None on failure)."""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
424
425
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Keep timestamps readable: 12:34:56 -> 12_34_56, before per-char mapping
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(c) for c in s)
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[1:]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
464
465
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows (no-op on other platforms)."""
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    parts = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        parts.pop(0)

    def _clean(part):
        # '.' and '..' are structural and must survive untouched
        if part in ('.', '..'):
            return part
        return re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', part)

    sanitized = [_clean(part) for part in parts]
    if drive_or_unc:
        sanitized.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized)
482
483
def sanitize_url(url):
    """Prepend protocol-less URLs ('//host/...') with an `http:` scheme to
    mitigate the number of unwanted failures due to a missing protocol."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
488
489
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request with the URL passed through sanitize_url()."""
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
492
493
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first-seen order.

    Membership is tested against the result list itself, so elements do
    not need to be hashable.
    """
    res = []
    for el in iterable:
        if el in res:
            continue
        res.append(el)
    return res
501
502
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity (without the leading '&') to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base, numstr = 16, '0%s' % numstr
        else:
            base = 10
        # Out-of-range code points raise ValueError; fall through to the
        # literal representation (https://github.com/rg3/youtube-dl/issues/7518)
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
532
533
def unescapeHTML(s):
    """Replace HTML entities in *s* with the characters they represent."""
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
541
542
def get_subprocess_encoding():
    """Return the encoding used for subprocess arguments and output."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    return sys.getfilesystemencoding() or 'utf-8'
553
554
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file (must be a text string)
    """
    assert type(s) == compat_str

    if sys.version_info >= (3, 0):
        # Python 3 has a Unicode API
        return s

    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s

    if sys.platform.startswith('java'):
        # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
577
578
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename(): decode a byte filename back to text.

    Returned untouched on Python 3 or when already a text string.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
588
589
def encodeArgument(s):
    """Encode a command-line argument for passing to a subprocess."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
597
598
def decodeArgument(b):
    """Decode a subprocess argument back to text (see decodeFilename)."""
    return decodeFilename(b, True)
601
602
def decodeOption(optval):
    """Decode a command-line option value to text; None passes through."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
611
612
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    Boundary fix: exact multiples now roll over consistently — 3600 renders
    as '1:00:00' (previously '60:00') and 60 as '1:00' (previously '60'),
    matching how 3601 / 61 were already formatted.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
620
621
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honoring the 'nocheckcertificate' option.

    Probes the ssl module for the best API the running interpreter offers
    and configures certificate verification accordingly.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # No SSLContext support at all: plain handler
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        # Hand-built context with optional certificate verification
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
645
646
def bug_reports_message():
    """Build the standard "please report this issue" suffix appended to
    unexpected error messages."""
    if ytdl_is_updateable():
        update_cmd = 'type  youtube-dl -U  to update'
    else:
        update_cmd = 'see  https://yt-dl.org/update  on how to update'
    return (
        '; please report this issue on https://yt-dl.org/bug .'
        ' Make sure you are using the latest version; %s.'
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        % update_cmd)
656
657
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network/unavailability errors are always "expected" — they are not
        # bugs, so the bug-report boilerplate below is skipped for them
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        # Render the stored traceback as a string, or None when none was given
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
685
686
class UnsupportedError(ExtractorError):
    """Raised when no extractor is able to handle the given URL."""

    def __init__(self, url):
        self.url = url
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
692
693
class RegexNotFoundError(ExtractorError):
    """Raised when a regex used for extraction fails to match."""
    pass
697
698
class DownloadError(Exception):
    """Download Error exception.

    Thrown by FileDownloader objects when they are not configured to
    continue on errors; carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Original sys.exc_info() triple, when the caller supplied one
        self.exc_info = exc_info
711
712
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
    pass
720
721
class PostProcessingError(Exception):
    """Post Processing exception.

    Raised by a PostProcessor's .run() method to signal an error in the
    postprocessing task; the message is stored on the .msg attribute.
    """

    def __init__(self, msg):
        self.msg = msg
731
732
class MaxDownloadsReached(Exception):
    """ Raised once the --max-downloads limit has been reached. """
    pass
736
737
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
    pass
745
746
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller
    than the size the server announced, indicating the connection was
    probably interrupted. Both sizes are stored in bytes.
    """

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
759
760
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Create an HTTP(S) connection, applying the handler's source_address
    option and the Python 2 workarounds below."""
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address attribute: replace connect() with a version
            # that binds the outgoing socket to the requested address itself
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
786
787
def handle_youtubedl_headers(headers):
    """Strip youtube-dl's internal marker headers before a real request.

    A 'Youtubedl-no-compression' marker removes any Accept-Encoding header
    (case-insensitively) along with the marker itself; otherwise *headers*
    is returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict(
        (k, v) for k, v in headers.items() if k.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
796
797
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open a plain HTTP connection, honoring a per-request SOCKS proxy
        passed via the private 'Ytdl-socks-proxy' marker header."""
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        # Try raw deflate first, then zlib-wrapped deflate — servers send both
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older addinfourl has no code argument/getcode(); set .code manually
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Transparently decompress gzip/deflate bodies and re-escape
        redirect targets."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
927
928
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class from *base_class* whose connect() tunnels
    through the SOCKS proxy described by the *socks_proxy* URL
    (socks/socks4/socks4a/socks5 schemes)."""
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    # NOTE(review): any other scheme leaves socks_type unbound and raises
    # NameError when proxy_args is built below — presumably callers only pass
    # validated schemes; confirm

    def unquote_if_non_empty(s):
        # Percent-decode proxy credentials; leave None/'' untouched
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            # Replace the plain socket with a SOCKS-aware one
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, additionally wrap the tunnelled socket in TLS
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
970
971
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler supporting a custom connection class and per-request
    SOCKS proxies.

    https_conn_class, when given, replaces the default HTTPSConnection.
    """

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        # Forward the handler's SSL context / hostname checking when the
        # running Python exposes them
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # Per-request SOCKS proxy, passed via a private marker header
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)
995
996
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that applies the cookie handlers to HTTPS traffic too."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # NOTE(review): the workaround below is deliberately disabled; kept
        # commented out for reference.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1019
1020
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns (utc_offset, remaining_date_str); the offset is a
    datetime.timedelta, zero when no explicit numeric offset is present.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if m is None:
        return datetime.timedelta(), date_str
    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # A bare 'Z' designator: UTC, i.e. zero offset.
        return datetime.timedelta(), date_str
    direction = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(m.group('hours')),
        minutes=direction * int(m.group('minutes')))
    return offset, date_str
1037
1038
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date, or None on failure. """
    if date_str is None:
        return None

    # Fractional seconds cannot be expressed in the strptime format below.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    fmt = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    try:
        parsed = datetime.datetime.strptime(date_str, fmt) - timezone
    except ValueError:
        return None
    return calendar.timegm(parsed.timetuple())
1056
1057
def date_formats(day_first=True):
    """Return the strptime patterns to try, day-first or month-first."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1060
1061
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None

    # Commas, AM/PM markers and timezones only get in strptime's way.
    date_str = date_str.replace(',', ' ')
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    upload_date = None
    # Deliberately no break: a later, more specific format wins.
    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Fall back on the RFC 2822 parser from the email package.
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
1088
1089
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp for the given date string, or None.

    Tries the known explicit formats first (honouring any trailing
    timezone and a 12-hour "PM" marker), then falls back on the RFC 2822
    parser from the email package.
    """
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    # An explicit "PM" shifts the parsed hour by 12 (formats use %H).
    pm_delta = datetime.timedelta(hours=12 if re.search(r'(?i)PM', date_str) else 0)
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + pm_delta
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # BUG FIX: parsedate_tz() returns a plain 10-tuple, which has no
        # timetuple() method (the old code raised AttributeError here);
        # calendar.timegm() accepts the tuple directly.
        return calendar.timegm(timetuple)
1111
1112
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL's path, or return default_ext."""
    if url is None:
        return default_ext
    # Drop the query string, then take whatever follows the last dot.
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = guess.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1124
1125
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
1128
1129
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # Months and years are approximated as 30 and 365 days respectively.
    if unit == 'month':
        unit, amount = 'day', amount * 30
    elif unit == 'year':
        unit, amount = 'day', amount * 365
    return today + datetime.timedelta(**{unit + 's': amount})
1157
1158
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    m = re.match(r'^(\d{4})(\d\d)(\d\d)$', date_str)
    # Anything that is not exactly eight digits is passed through untouched.
    return '-'.join(m.groups()) if m else date_str
1167
1168
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing boundaries default to the extreme representable dates.
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1198
1199
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Python 2 may hand back a byte string; normalize to text.
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())
    assert isinstance(name, compat_str)
    return name
1208
1209
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070
    # Writing to a real Windows console goes through WriteConsoleW so that
    # characters outside the console codepage come out correctly.

    import ctypes
    import ctypes.wintypes

    # C-level file descriptor -> GetStdHandle id
    # (STD_OUTPUT_HANDLE = -11, STD_ERROR_HANDLE = -12).
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A handle counts as a console only if it is a local character
        # device and GetConsoleMode() succeeds on it.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        # (i.e. needing a surrogate pair), or len(s) if there is none.
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write runs of up to 1024 BMP characters; a non-BMP character is
        # written on its own as a two-unit surrogate pair (count == 0 case).
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1283
1284
def write_string(s, out=None, encoding=None):
    """Write the text *s* to *out* (default: sys.stderr), coping with
    consoles and streams that cannot take unicode text directly."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # On Windows a real console needs WriteConsoleW for correct output.
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode ourselves so
        # we control the encoding and the error handling ('ignore').
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
1305
1306
def bytes_to_intlist(bs):
    """Turn a byte string into a list of integer byte values."""
    if not bs:
        return []
    # Python 3 yields ints when indexing bytes; Python 2 yields
    # one-character strings that need ord().
    return list(bs) if isinstance(bs[0], int) else [ord(c) for c in bs]
1314
1315
def intlist_to_bytes(xs):
    """Pack a list of integer byte values back into a byte string."""
    if not xs:
        return b''
    # struct with a '<n>B' format packs each int as one unsigned byte.
    return compat_struct_pack('%dB' % len(xs), *xs)
1320
1321
# Cross-platform file locking: define _lock_file()/_unlock_file() using
# whatever primitive the current platform provides.
if sys.platform == 'win32':
    # Windows: advisory locks via the Win32 LockFileEx/UnlockFileEx API,
    # called through ctypes.
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Win32 OVERLAPPED structure; only the offset fields matter here.
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range covering the entire file (low/high halves of the length).
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        # Take an exclusive (0x2) or shared (0x0) lock on the whole file.
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # The OVERLAPPED pointer must stay alive until unlock; stash it
        # on the file object.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Release the lock taken by _lock_file (same OVERLAPPED pointer).
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1395
1396
class locked_file(object):
    """File wrapper that holds an advisory lock for the life of a
    with-block (exclusive for writers, shared for readers)."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Never leak the file handle when locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1426
1427
def get_filesystem_encoding():
    """Return the file system encoding, defaulting to UTF-8 when unknown."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1431
1432
def shell_quote(args):
    """Quote a list of arguments for display as a shell command line."""
    encoding = get_filesystem_encoding()

    def as_text(a):
        # We may get a filename encoded with 'encodeFilename'
        return a.decode(encoding) if isinstance(a, bytes) else a

    return ' '.join(pipes.quote(as_text(a)) for a in args)
1442
1443
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with any data already smuggled into the URL.
    url, existing = unsmuggle_url(url, {})
    data.update(existing)
    fragment = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, fragment)
1452
1453
def unsmuggle_url(smug_url, default=None):
    """Extract data previously embedded by smuggle_url()."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    payload = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1461
1462
def format_bytes(bytes):
    """Format a byte count as a human-readable string (e.g. '1.00MiB')."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1475
1476
def lookup_unit_table(unit_table, s):
    """Parse a '<number><unit>' string against *unit_table*, or None."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if m is None:
        return None
    # Accept ',' as a decimal separator too.
    amount = float(m.group('num').replace(',', '.'))
    return int(amount * unit_table[m.group('unit')])
1486
1487
def parse_filesize(s):
    """Parse a human file size like '5.6 MiB' into a byte count, or None."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too.  For each SI prefix the table contains:
    #   'KiB' -> 1024**n (binary),   'KB' -> 1000**n (decimal),
    #   'kB'  -> 1024**n (common misuse), 'Kb' -> 1000**n.
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
    }
    for exponent, prefix in enumerate('KMGTPEZY', start=1):
        _UNIT_TABLE[prefix + 'iB'] = 1024 ** exponent
        _UNIT_TABLE[prefix + 'B'] = 1000 ** exponent
        _UNIT_TABLE[prefix.lower() + 'B'] = 1024 ** exponent
        _UNIT_TABLE[prefix + 'b'] = 1000 ** exponent

    return lookup_unit_table(_UNIT_TABLE, s)
1532
1533
def parse_count(s):
    """Parse a view/like count such as '1.2M' into an int, or None."""
    if s is None:
        return None

    s = s.strip()

    # Plain numbers (possibly with separators) need no unit table.
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {}
    for units, multiplier in ((('k', 'K'), 1000),
                              (('m', 'M', 'kk', 'KK'), 1000 ** 2)):
        for unit in units:
            _UNIT_TABLE[unit] = multiplier

    return lookup_unit_table(_UNIT_TABLE, s)
1553
1554
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    if name not in ENGLISH_MONTH_NAMES:
        return None
    return ENGLISH_MONTH_NAMES.index(name) + 1
1562
1563
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [name[:3] for name in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
1572
1573
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # A '&' that already begins a recognised entity reference is left alone.
    bare_amp = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_amp, '&amp;', xml_str)
1580
1581
def setproctitle(title):
    """Set the process name shown by tools like ps (glibc/Linux only)."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    encoded = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(encoded))
    buf.value = encoded
    try:
        # 15 is PR_SET_NAME.
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1601
1602
def remove_start(s, start):
    """Strip *start* from the beginning of s when present (None-safe)."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1605
1606
def remove_end(s, end):
    """Strip *end* from the end of s when present (None-safe)."""
    if s is None or not s.endswith(end):
        return s
    return s[:-len(end)]
1609
1610
def remove_quotes(s):
    """Strip one matching pair of surrounding quotes, if any."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1618
1619
def url_basename(url):
    """Return the last path component of a URL (query/fragment excluded)."""
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').rpartition('/')[2]
1623
1624
class HEADRequest(compat_urllib_request.Request):
    """A Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
1628
1629
class PUTRequest(compat_urllib_request.Request):
    """A Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
1633
1634
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert *v* to int, returning *default* when conversion fails.

    Optionally reads attribute *get_attr* from v first, and rescales the
    result by invscale/scale (integer division).
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    # BUG FIX: int() raises TypeError (not ValueError) for non-numeric
    # types such as lists/dicts; those must also yield the default.
    except (ValueError, TypeError):
        return default
1647
1648
def str_or_none(v, default=None):
    """Stringify *v*, returning *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
1651
1652
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators ('.' and ',') and a leading '+'.
    for ch in (',', '.', '+'):
        int_str = int_str.replace(ch, '')
    return int(int_str)
1659
1660
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float, returning *default* when conversion fails.

    The result is rescaled by invscale/scale (true division).
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    # BUG FIX: float() raises TypeError (not ValueError) for non-numeric
    # types such as lists/dicts; those must also yield the default.
    except (ValueError, TypeError):
        return default
1668
1669
def strip_or_none(v):
    """Strip surrounding whitespace from *v* (None passes through)."""
    if v is None:
        return None
    return v.strip()
1672
1673
def parse_duration(s):
    """Parse a duration expression into a number of seconds, or None."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days = hours = mins = secs = ms = None
    # 1) Clock style: [[[dd:]hh:]mm:]ss[.ms]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        # 2) ISO-8601-like / verbose: 'PT1H2M', '1d 2hours 3min 4.5s', ...
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # 3) Phrases like '2.5 hours' or '90 min'.
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            if not m:
                return None
            hours, mins = m.groups()

    duration = 0
    # Keep this accumulation order (secs, mins, hours, days, ms) so the
    # floating point result matches exactly.
    for value, scale in ((secs, 1), (mins, 60), (hours, 60 * 60),
                         (days, 24 * 60 * 60), (ms, 1)):
        if value:
            duration += float(value) * scale
    return duration
1720
1721
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the file's real extension.

    When expected_real_ext is given and the actual extension differs,
    ext is appended after the whole name instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1728
1729
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the file's extension with *ext*.

    When expected_real_ext is given and the actual extension differs,
    ext is appended to the full name instead of replacing.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        base = filename
    else:
        base = name
    return '{0}.{1}'.format(base, ext)
1735
1736
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: the default used to be the mutable literal [], a classic
    # shared-default pitfall; use None and substitute here instead.
    if args is None:
        args = []
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Covers both "not found" and "not executable".
        return False
    return exe
1745
1746
def get_exe_version(exe, args=None,
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # NOTE: the default used to be the mutable list ['--version'], a
    # shared-default pitfall; use None and substitute here instead.
    if args is None:
        args = ['--version']
    try:
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1760
1761
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from program output, or *unrecognized*."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1771
1772
class PagedList(object):
    """Base class for lazily paged result lists; subclasses implement
    getslice()."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1777
1778
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages on demand, optionally caching them."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def _fetch_page(self, pagenum):
        # Serve from the cache when enabled, fetching at most once per page.
        if self._use_cache:
            cached = self._cache.get(pagenum)
            if cached is not None:
                return cached
        results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = results
        return results

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = self._fetch_page(pagenum)

            startv = start % self._pagesize if firstid <= start < nextfirstid else 0
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A page shorter than page_size must be the last one; no need
            # to query further pages.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not
            # interesting, break out early as well.
            if end == nextfirstid:
                break
        return res
1829
1830
class InAdvancePagedList(PagedList):
    """Paged list whose total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = self._pagecount if end is None else end // self._pagesize + 1
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page needs its head trimmed.
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page satisfies the remaining demand.
                    res.extend(page[:only_more])
                    return res
            res.extend(page)
        return res
1858
1859
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
1866
1867
def lowercase_escape(s):
    """Decode literal \\uXXXX escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
1874
1875
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() needs a UTF-8 byte string.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1881
1882
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host goes through IDNA; every other component is percent-escaped.
    return parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment),
    ).geturl()
1893
1894
def read_batch_urls(batch_fd):
    """Read URLs from a batch file, skipping comment and blank lines."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # Strip a UTF-8 BOM that survived decoding as individual chars.
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # Lines starting with '#', ';' or ']' are comments.
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1909
1910
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1913
1914
def update_url_query(url, query):
    """Return *url* with the items of *query* merged into its query string."""
    if not query:
        return url
    parsed = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parsed.query)
    merged.update(query)
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parsed._replace(query=new_query))
1923
1924
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Clone *req*, optionally overriding its URL, body, headers or query.

    The request class (plain/HEAD/PUT) and any timeout are preserved.
    NOTE: headers/query used to default to mutable {} literals; None is
    used instead to avoid the shared-mutable-default pitfall.
    """
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query or {})
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # Not every Request object carries a timeout attribute.
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
1943
1944
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key from a list/tuple of keys.

    None values are always skipped; with skip_false_values (the default)
    any falsy value is skipped as well.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key in d and d[key] is not None and (d[key] or not skip_false_values):
            return d[key]
    return default
1953
1954
def try_get(src, getter, expected_type=None):
    """Apply getter(src); return None on common lookup errors or when the
    result is not an instance of expected_type."""
    try:
        v = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is None or isinstance(v, expected_type):
        return v
1963
1964
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Coerce *string* to compat_str, decoding byte strings with *encoding*."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1967
1968
# MPAA-style US content ratings mapped to the minimum viewer age used by
# parse_age_limit() below.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1976
1977
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+', or a US rating ('PG-13')."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s)
1983
1984
def strip_jsonp(code):
    """Remove a JSONP callback wrapper, leaving the bare JSON payload."""
    wrapper_re = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(wrapper_re, r'\1', code)
1988
1989
def js_to_json(code):
    """Convert a JavaScript object/array literal into valid JSON text."""
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('/*') or v == ',':
            # Comments and trailing commas are simply dropped.
            return ''

        if v[0] in ("'", '"'):
            # Re-escape the string body so it is valid JSON.
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])

        # Hex/octal integers become decimal; as an object key they must
        # also be quoted.
        for regex, base in ((r'^0[xX][0-9a-fA-F]+', 16), (r'^0+[0-7]+', 8)):
            im = re.match(regex, v)
            if im:
                i = int(im.group(0), base)
                return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
2027
2028
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality rank; unknown ids rank -1.
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
2037
2038
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2040
2041
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    # Truncate so that the ellipses fit within the length budget.
    return s[:length - len(ELLIPSES)] + ELLIPSES
2050
2051
def version_tuple(v):
    """Split a version string on '.'/'-' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
2054
2055
def is_outdated_version(version, limit, assume_new=True):
    """Return True when *version* is older than *limit*."""
    # Missing or malformed versions fall back on the assume_new policy.
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
2063
2064
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Only zip-bundled or frozen (exe) builds support self-updating.
    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
2070
2071
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = (compat_shlex_quote(a) for a in args)
    return ' '.join(quoted)
2075
2076
def error_to_compat_str(err):
    """Return an exception's message as text, decoding on Python 2."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
2084
2085
def mimetype2ext(mt):
    """Guess a file extension from a MIME type, or return None.

    Falls back on the (lower-cased) subtype when no explicit mapping
    exists.
    """
    if mt is None:
        return None

    # Content-Type values routinely carry parameters (e.g.
    # "; charset=utf-8"); only the type/subtype part matters here.
    mt = mt.partition(';')[0].strip()

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')
    res = res.lower()

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
    }.get(res, res)
2119
2120
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a response, preferring the
    Content-Disposition filename over the Content-Type."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            ext = determine_ext(m.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
2133
2134
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI from raw bytes."""
    b64 = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, b64)
2137
2138
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # Either no viewer limit was set, or the content is open to everyone
        return False
    return age_limit < content_limit
2147
2148
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # (BOM, encoding) pairs; the 4-byte UTF-32 BOMs are listed before the
    # 2-byte UTF-16 ones they share a prefix with
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    decoded = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', decoded)
2167
2168
def determine_protocol(info_dict):
    # An explicitly declared protocol always wins
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    # Streaming schemes are recognized straight from the URL prefix
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    # Manifest formats are identified by their file extension
    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2189
2190
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Column width = widest cell in that column
    col_widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    # Left-align every column but the last, with one space of padding
    format_str = ' '.join('%-' + compat_str(width + 1) + 's' for width in col_widths[:-1]) + '%s'
    return '\n'.join(format_str % tuple(row) for row in rows)
2197
2198
def _match_one(filter_part, dct):
    # Evaluate a single filter expression against dct.  Two forms are
    # supported: a comparison ('key OP value', e.g. 'duration > 60') and a
    # unary existence test ('key' / '!key').  Raises ValueError for any
    # expression that matches neither form.
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # String operands only make sense for (in)equality
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain integer: try it as a file-size expression
                # ('500k'), first as written, then with an implied 'B' suffix
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # A trailing '?' after the operator makes the comparison pass
            # when the key is absent (returns the truthy matched text)
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2256
2257
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&' joins sub-expressions; every one of them must hold
    parts = filter_str.split('&')
    return all(_match_one(part, dct) for part in parts)
2263
2264
def match_filter_func(filter_str):
    # Build a match-filter callback: it returns None to accept the video,
    # or a human-readable message explaining why it is skipped.
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
2273
2274
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float), or None."""
    if not time_expr:
        return None

    # Plain offset in seconds, with an optional trailing 's' unit
    offset_match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    # Clock time HH:MM:SS with an optional fraction; a ':' before the
    # fraction is treated the same as a decimal point here
    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours = int(clock_match.group(1))
        minutes = int(clock_match.group(2))
        seconds = float(clock_match.group(3).replace(':', '.'))
        return 3600 * hours + 60 * minutes + seconds
    return None
2286
2287
def srt_subtitles_timecode(seconds):
    """Format a position in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d truncates the float components, so no explicit int() is needed
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2290
2291
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup (a text string) to SRT.

    Raises ValueError when the document contains no <p> paragraphs.
    """
    # Qualify a tag name with each of the known TTML namespace URIs
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Accumulates the plain text of one <p> element; <br> (in any of
        # the namespaces) becomes a newline
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Serialize the node and re-feed it through the text-extracting parser
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    # Paragraphs may live in any of the namespaces above, or in none
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        # Paragraphs without a start time are dropped; a missing end time
        # is derived from 'dur' when available, otherwise the paragraph is
        # dropped as well
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2345
2346
def cli_option(params, command_option, param):
    # Emit [option, value] when the setting is present, nothing otherwise
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2350
2351
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    # Render a boolean setting either as two argv items, or as a single
    # 'option<separator>value' item when a separator is given
    flag = params.get(param)
    assert isinstance(flag, bool)
    value = true_value if flag else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
2358
2359
def cli_valueless_option(params, command_option, param, expected_value=True):
    # Emit the bare option only when the setting equals the expected value
    return [command_option] if params.get(param) == expected_value else []
2363
2364
def cli_configuration_args(params, param, default=[]):
    # NOTE(review): the mutable default is only ever returned, never
    # mutated here; callers should not modify the returned list in place
    extra_args = params.get(param)
    if extra_args is None:
        return default
    assert isinstance(extra_args, list)
    return extra_args
2371
2372
class ISO639Utils(object):
    """Static conversions between ISO 639-1 (two-letter) and ISO 639-2/T
    (three-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are significant (so e.g. 'en-US'
        # still resolves); returns None for unknown codes
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Linear scan over the table; implicitly returns None when no
        # entry maps to the given three-letter code
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
2573
2574
class ISO3166Utils(object):
    """Lookup of full country names from ISO 3166 two-letter codes."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive on input; returns None for unknown codes
        return cls._country_map.get(code.upper())
2833
2834
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler variant that lets an individual request override the
    proxy via an internal 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    # Default arguments bind the loop's current 'type' and
                    # the bound method at definition time, avoiding the
                    # late-binding closure pitfall
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # Consume the per-request override header so it is never sent out
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            # Tag the request instead of proxying here: youtube-dl's
            # http/https handlers take care of wrapping the socket with SOCKS
            req.add_header('Ytdl-socks-proxy', proxy)
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
2858
2859
def ohdave_rsa_encrypt(data, exponent, modulus):
    """Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # The bytes are reversed first, i.e. the payload integer is read
    # little-endian
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
2875
2876
2877 def encode_base_n(num, n, table=None):
2878 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
2879 if not table:
2880 table = FULL_TABLE[:n]
2881
2882 if n > len(table):
2883 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
2884
2885 if num == 0:
2886 return table[0]
2887
2888 ret = ''
2889 while num:
2890 ret = table[num % n] + ret
2891 num = num // n
2892 return ret
2893
2894
def decode_packed_codes(code):
    # Reverse a 'p,a,c,k,e,d'-style JS packer wrapper (presumably Dean
    # Edwards' packer — the wrapper shape matches): extract the packed
    # source, base, symbol count and symbol list, then substitute every
    # identifier from the rebuilt symbol table.
    mobj = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    packed_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for idx in range(count - 1, -1, -1):
        key = encode_base_n(idx, base)
        # An empty symbol means the identifier stands for itself
        symbol_table[key] = symbols[idx] or key

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        packed_code)
2913
2914
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list into a dict, stripping double quotes
    from quoted values."""
    info = {}
    for match in re.finditer(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        value = match.group('val')
        if value.startswith('"'):
            value = value[1:-1]
        info[match.group('key')] = value
    return info
2922
2923
def urshift(val, n):
    # Emulate an unsigned 32-bit right shift (JavaScript's '>>>'):
    # negative inputs are first wrapped to their unsigned representation
    if val < 0:
        val += 0x100000000
    return val >> n