]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
[utils] Add decode_png for openload (#9706)
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import errno
15 import functools
16 import gzip
17 import io
18 import itertools
19 import json
20 import locale
21 import math
22 import operator
23 import os
24 import pipes
25 import platform
26 import re
27 import socket
28 import ssl
29 import subprocess
30 import sys
31 import tempfile
32 import traceback
33 import xml.etree.ElementTree
34 import zlib
35
36 from .compat import (
37 compat_HTMLParser,
38 compat_basestring,
39 compat_chr,
40 compat_etree_fromstring,
41 compat_html_entities,
42 compat_html_entities_html5,
43 compat_http_client,
44 compat_kwargs,
45 compat_parse_qs,
46 compat_shlex_quote,
47 compat_socket_create_connection,
48 compat_str,
49 compat_struct_pack,
50 compat_struct_unpack,
51 compat_urllib_error,
52 compat_urllib_parse,
53 compat_urllib_parse_urlencode,
54 compat_urllib_parse_urlparse,
55 compat_urllib_parse_unquote_plus,
56 compat_urllib_request,
57 compat_urlparse,
58 compat_xpath,
59 )
60
61 from .socks import (
62 ProxyType,
63 sockssocket,
64 )
65
66
def register_socks_protocols():
    """Teach urlparse that SOCKS schemes carry a netloc component.

    In Python < 2.6.5, urlsplit() suffers from https://bugs.python.org/issue7904:
    URLs whose scheme is not listed in urlparse.uses_netloc are parsed
    incorrectly, so each SOCKS variant is registered exactly once.
    """
    known_schemes = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in known_schemes:
            known_schemes.append(scheme)
74
75
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers added to every request (see YoutubeDLHandler.http_request).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Sentinel distinguishing "no default supplied" from an explicit None default.
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# File extensions recognized as media containers/streams.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps each accented character to an ASCII transliteration; multi-letter
# replacements ('AE', 'ss', ...) are wrapped in one-element lists so that
# zip() pairs each with a single source character.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
113
# strptime() patterns tried, in order, when parsing free-form dates.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%b %d %Y',
    # English ordinal-day timestamps ("Jan 1st 2016 10:30"); the suffix is
    # matched literally after %d.  Bug fix: '%drd' (3rd, 23rd) was missing,
    # so such dates silently failed to parse.
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
)

# Extra patterns for ambiguous numeric dates: day-first (most locales) ...
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# ... and month-first (US style).
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
156
157
def preferredencoding():
    """Return the best guess at the system's preferred text encoding.

    Based on locale.getpreferredencoding(), falling back to UTF-8 whenever
    the reported codec is unusable or the lookup itself fails.
    """
    try:
        candidate = locale.getpreferredencoding()
        # Probe that the reported codec actually exists and can encode.
        'TEST'.encode(candidate)
    except Exception:
        candidate = 'UTF-8'
    return candidate
171
172
def write_json_file(obj, fn):
    """Encode obj as JSON and write it to fn, atomically if possible.

    The data is first serialized into a temporary file created next to fn
    (same directory, so os.rename stays on one filesystem) and then renamed
    over the target, so readers never observe a half-written file.  On
    failure the temporary file is removed best-effort and the error
    re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object.
        # Bug fix: operate on the lambda's own argument instead of always
        # closing over fn.
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        # Keep the target name visible in the temp file for debuggability.
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the orphaned temp file before re-raising.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
225
226
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # The key is spliced into the path expression, so restrict it to
        # characters that cannot change the query's meaning.
        assert re.match(r'^[a-zA-Z_-]+$', key)
        # ElementTree supports [@attr] / [@attr='value'] predicates from 2.7 on.
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        # Python 2.6 fallback: no predicate support in ElementTree, so scan
        # every match and filter on the attribute by hand.
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
241
242 # On python2.6 the xml.etree.ElementTree.Element methods don't support
243 # the namespace parameter
244
245
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into ElementTree '{uri}tag' form.

    ns_map maps each prefix to its namespace URI; steps without a prefix are
    passed through untouched.
    """
    expanded = []
    for step in path.split('/'):
        pieces = step.split(':')
        if len(pieces) == 1:
            expanded.append(pieces[0])
        else:
            prefix, tag = pieces
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
256
257
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath (a single path, or an iterable
    of candidate paths tried in order).

    On a miss: return `default` if one was supplied, raise ExtractorError
    if `fatal` is set, otherwise return None.
    """
    def _search(xp):
        return node.find(compat_xpath(xp))

    if isinstance(xpath, (str, compat_str)):
        n = _search(xpath)
    else:
        for xp in xpath:
            n = _search(xp)
            if n is not None:
                break

    if n is not None:
        return n
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML element %s' % (xpath if name is None else name))
    return None
279
280
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's text content."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    # A miss already resolved to None or the caller's default.
    if n is None or n == default:
        return n
    if n.text is not None:
        return n.text
    # Element exists but carries no text.
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML element\'s text %s' % (xpath if name is None else name))
    return None
294
295
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return the value of attribute `key` on the element matched by xpath."""
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML attribute %s' % (
            '%s[@%s]' % (xpath, key) if name is None else name))
    return None
307
308
def get_element_by_id(id, html):
    """Return the inner content of the first tag in `html` whose id attribute equals `id`."""
    return get_element_by_attribute('id', id, html)
312
313
def get_element_by_class(class_name, html):
    """Return the inner content of the first tag carrying the given CSS class."""
    # The class attribute is a whitespace-separated list, so match the name
    # on word boundaries anywhere inside the quoted value; the pattern is
    # pre-built, hence escape_value=False.
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_element_by_attribute(
        'class', class_pattern, html, escape_value=False)
318
319
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    if escape_value:
        value = re.escape(value)

    match = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s+%s=['"]?%s['"]?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html)

    if match is None:
        return None

    content = match.group('content')
    # Strip one level of surrounding quotes if the capture begins with one.
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
343
344
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        # Mapping of attribute name -> value (None for valueless attributes);
        # overwritten by every start tag seen, so feed() should only ever be
        # given a single element.
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs arrives as a list of (name, value) pairs, already
        # entity-decoded and lowercased by HTMLParser.
        self.attrs = dict(attrs)
353
354
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attr_parser = HTMLAttributeParser()
    attr_parser.feed(html_element)
    attr_parser.close()
    return attr_parser.attrs
375
376
def clean_html(html):
    """Clean an HTML snippet into a readable plain-text string."""

    # Convenience for sanitizing descriptions etc.
    if html is None:
        return html

    # Literal newlines carry no meaning in HTML; real line breaks come from
    # <br> tags and paragraph boundaries, which are turned back into '\n'.
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Strip the remaining markup, then decode entities.
    text = re.sub('<.*?>', '', text)
    return unescapeHTML(text).strip()
392
393
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                # Switch stdout to binary mode so media bytes are not mangled
                # by CRLF translation.
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            # '-' means stdout; prefer the raw byte buffer on Python 3.
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors cannot be fixed by renaming; propagate directly.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
424
425
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    # parsedate_tz yields None for unparsable input; mirror that as None.
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
433
434
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Keep timestamps readable: "12:34:56" becomes "12_34_56" rather than
    # going through the generic ':' replacement below.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    cleaned = ''.join(replace_insane(c) for c in s)
    if not is_id:
        while '__' in cleaned:
            cleaned = cleaned.replace('__', '_')
        cleaned = cleaned.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and cleaned.startswith('-_'):
            cleaned = cleaned[2:]
        if cleaned.startswith('-'):
            cleaned = '_' + cleaned[len('-'):]
        cleaned = cleaned.lstrip('.')
        if not cleaned:
            cleaned = '_'
    return cleaned
473
474
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # Other platforms allow these characters; leave the path untouched.
    if sys.platform != 'win32':
        return s
    drive, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive:
        drive, _ = os.path.splitunc(s)
    parts = os.path.normpath(remove_start(s, drive)).split(os.path.sep)
    if drive:
        # Drop the leading root component; the drive/UNC prefix is
        # re-attached below.
        parts.pop(0)
    cleaned = [
        part if part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', part)
        for part in parts]
    if drive:
        cleaned.insert(0, drive + os.path.sep)
    return os.path.join(*cleaned)
491
492
493 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
494 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Prepend the http: scheme to protocol-relative (//host/...) URLs.

    Mitigates failures on URLs extracted without an explicit protocol.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    return url
497
498
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request with the URL passed through sanitize_url() first."""
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
501
502
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first-seen order """
    # Membership is tested against the result list (not a set) on purpose:
    # this keeps unhashable elements working, at O(n^2) cost.
    deduped = []
    for item in iterable:
        if item not in deduped:
            deduped.append(item)
    return deduped
510
511
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    # Strip the trailing ';' for the legacy (HTML 4) name table lookup below.
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric character reference: decimal ('#38') or hexadecimal ('#x26').
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            # Prefix with '0' so int() accepts the '0x...' form.
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
541
542
def unescapeHTML(s):
    """Replace HTML entities (&amp;, &#38;, ...) in s with their characters.

    Unknown entities are left as-is; s must already be a text (unicode) string.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    # The handler receives the entity body including its terminating ';'.
    return re.sub(
        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
550
551
def get_subprocess_encoding():
    """Return the character encoding used for subprocess arguments/output."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    # Elsewhere the filesystem encoding is the best proxy; it can be None
    # in exotic locales, in which case fall back to UTF-8.
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
562
563
def encodeFilename(s, for_subprocess=False):
    """Encode a text filename to bytes where the platform needs it.

    @param s The name of the file (must be a unicode/str object)
    @param for_subprocess True when the name is destined for a subprocess
           command line rather than for filesystem APIs
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
586
587
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename(): turn a byte filename back into text.

    On Python 3, and for values that are not bytes at all, the input is
    returned untouched.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
597
598
def encodeArgument(s):
    """Encode a command-line argument for subprocess use (see encodeFilename)."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
606
607
def decodeArgument(b):
    """Decode a subprocess command-line argument; see decodeFilename()."""
    return decodeFilename(b, True)
610
611
def decodeOption(optval):
    """Decode a command-line option value to text, if it arrived as bytes."""
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
620
621
def formatSeconds(secs):
    """Format a duration in seconds as [H:]M:SS-style text.

    Examples: 3661 -> '1:01:01', 61 -> '1:01', 59 -> '59'.
    """
    # Use >= so exact boundaries render in the larger unit: previously
    # 3600 was shown as '60:00' and 60 as '60'.
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
629
630
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honouring the 'nocheckcertificate'
    option across the SSL APIs of all supported Python versions."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname must be disabled before verify_mode can be
            # set to CERT_NONE.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
654
655
def bug_reports_message():
    """Return the boilerplate appended to unexpected-error messages,
    pointing users at the bug tracker and update instructions."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return (
        '; please report this issue on https://yt-dl.org/bug .'
        + ' Make sure you are using the latest version; %s.' % update_cmd
        + ' Be sure to call youtube-dl with the --verbose flag and include its complete output.')
665
666
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network failures and timeouts are never a youtube-dl bug.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors get the "please report this issue" boilerplate.
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        # Render the stored traceback as text, or None if none was supplied.
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
694
695
class UnsupportedError(ExtractorError):
    """Raised when no extractor matches the given URL."""
    def __init__(self, url):
        # Keep the offending URL available to callers.
        self.url = url
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
701
702
class RegexNotFoundError(ExtractorError):
    """Raised when a mandatory regular-expression search did not match."""
    pass
706
707
class DownloadError(Exception):
    """Raised by FileDownloader objects when downloading fails and the
    downloader is not configured to continue on errors; carries the
    user-facing error message.
    """

    def __init__(self, msg, exc_info=None):
        """exc_info, if given, is the original sys.exc_info() triple that caused the trouble."""
        super(DownloadError, self).__init__(msg)
        # Preserved so callers can inspect or re-raise the original error.
        self.exc_info = exc_info
720
721
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
729
730
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Forward the message to Exception so str(e) and generic logging
        # show it (previously only the .msg attribute carried it and
        # str(e) was empty).
        super(PostProcessingError, self).__init__(msg)
        # Kept for existing callers that read .msg directly.
        self.msg = msg
740
741
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
745
746
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
754
755
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Give the base class a readable message (previously str(e) was empty).
        super(ContentTooShortError, self).__init__(
            'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
768
769
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate an HTTP(S) connection, honouring the 'source_address'
    option from the handler's params."""
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        # Port 0 lets the OS pick an ephemeral local port.
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address support: monkey-patch connect() to create and
            # bind the socket ourselves.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
795
796
def handle_youtubedl_headers(headers):
    """Strip internal youtube-dl pseudo-headers before the real request.

    'Youtubedl-no-compression' removes any Accept-Encoding header and is
    itself dropped; all other headers pass through unchanged (the original
    mapping is returned as-is when no pseudo-header is present).
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict(
        (name, value) for name, value in headers.items()
        if name.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
805
806
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # YoutubeDL params; read by _create_http_connection (source_address).
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        # Internal pseudo-header selecting a SOCKS proxy for this request.
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        # Handle both raw deflate streams and zlib-wrapped ones.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Newer addinfourl accepts the status code directly; older versions
        # need it assigned after construction.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        # Fill in the default headers without clobbering caller-set ones.
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with progressively more trailing bytes stripped off.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
936
937
def make_socks_conn_class(base_class, socks_proxy):
    """Derive an HTTP(S) connection class that tunnels through socks_proxy.

    socks_proxy is a URL such as socks5://user:pass@host:port.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    # NOTE(review): an unrecognized scheme leaves socks_type unbound and
    # surfaces later as a NameError; callers appear to pass pre-validated
    # schemes only — confirm before tightening this into an explicit error.

    def unquote_if_non_empty(s):
        # Credentials may be percent-encoded in the proxy URL.
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, wrap the tunneled socket in TLS as the base class
            # would have done.
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
979
980
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS counterpart of YoutubeDLHandler: opens connections through an
    optional custom connection class, SSL context and SOCKS proxy."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        # YoutubeDL params; read by _create_http_connection (source_address).
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        # Forward the SSL context / hostname checking set up by the base
        # handler, where the running Python version provides them.
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # Internal pseudo-header selecting a SOCKS proxy for this request.
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)
1004
1005
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that routes https traffic through the same
    request/response hooks as plain http."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # (Workaround currently disabled — kept for reference.)
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1028
1029
def extract_timezone(date_str):
    """Split a trailing UTC offset (or 'Z') off *date_str*.

    Returns a (timedelta, remaining_string) pair; the delta is zero when no
    explicit signed offset is present.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if m is None:
        return datetime.timedelta(), date_str
    # Drop the timezone part from the string in all matched cases.
    date_str = date_str[:-len(m.group('tz'))]
    sign = m.group('sign')
    if not sign:
        # Plain 'Z' suffix: UTC, zero offset.
        return datetime.timedelta(), date_str
    direction = 1 if sign == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(m.group('hours')),
        minutes=direction * int(m.group('minutes')))
    return offset, date_str
1046
1047
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # strptime cannot digest fractional seconds here; drop them.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        dt = datetime.datetime.strptime(
            date_str, '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)) - timezone
    except ValueError:
        return None
    return calendar.timegm(dt.timetuple())
1065
1066
def date_formats(day_first=True):
    """Pick the strptime format list for day-first vs month-first ordering."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1069
1070
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None

    # Commas, AM/PM markers and timezones only confuse the parsers below.
    date_str = date_str.replace(',', ' ')
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    result = None
    # Deliberately no early exit: a later pattern overriding an earlier hit
    # mirrors the established format precedence.
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
        except ValueError:
            pass

    if result is None:
        # Fall back to the RFC 2822 parser from the email package.
        parsed = email.utils.parsedate_tz(date_str)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is not None:
        return compat_str(result)
1097
1098
def unified_timestamp(date_str, day_first=True):
    """Convert a free-form date string to a UNIX timestamp (or None)."""
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    # A 'PM' marker means the parsed hour field is 12 hours behind.
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for fmt in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, fmt) - timezone + datetime.timedelta(hours=pm_delta)
        except ValueError:
            continue
        return calendar.timegm(dt.timetuple())

    # RFC 2822 fallback via the email package.
    parsed = email.utils.parsedate_tz(date_str)
    if parsed:
        return calendar.timegm(parsed) + pm_delta * 3600
1120
1121
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from the last path component of *url*."""
    if url is None:
        return default_ext
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1133
1134
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
1137
1138
def date_from_str(date_str):
    """
    Return a datetime.date from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Raises ValueError when the string matches none of the accepted forms.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string literal: '\d' inside a plain string is an invalid escape
    # sequence and triggers warnings on modern Python versions.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # Approximate months/years as 30/365 days, since timedelta has no
        # calendar awareness.
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        return today + datetime.timedelta(**{unit: time})
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1166
1167
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    m = re.match(r'^(\d{4})(\d{2})(\d{2})$', date_str)
    # Anything that is not exactly eight digits passes through untouched.
    return date_str if m is None else '-'.join(m.groups())
1176
1177
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Unbounded ends default to the extreme representable dates.
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1207
1208
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Some Python 2 builds return a byte string here; normalize to text.
        name = name.decode(preferredencoding())
    assert isinstance(name, compat_str)
    return name
1217
1218
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Map C file descriptors to GetStdHandle ids
    # (-11 = STD_OUTPUT_HANDLE, -12 = STD_ERROR_HANDLE).
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # WriteConsoleW only works on a real local console; for files and
        # pipes the caller must fall back to regular writes.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        # (len(s) when there is none).
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of up to 1024 BMP characters; a non-BMP character is
    # written on its own as a UTF-16 surrogate pair (2 code units).
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1292
1293
def write_string(s, out=None, encoding=None):
    """Write text *s* to *out* (default: stderr), coping with byte streams
    and Windows consoles."""
    out = sys.stderr if out is None else out
    assert type(s) == compat_str

    # On Windows, try the native wide-character console API first.
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream wrapping a binary buffer: encode ourselves so that
        # unencodable characters are dropped instead of raising.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1314
1315
def bytes_to_intlist(bs):
    """Turn a byte string into a list of integer byte values."""
    if not bs:
        return []
    # Python 3 indexing of bytes yields ints already; Python 2 yields str.
    return list(bs) if isinstance(bs[0], int) else [ord(c) for c in bs]
1323
1324
def intlist_to_bytes(xs):
    """Inverse of bytes_to_intlist: pack integer byte values into bytes."""
    if not xs:
        return b''
    # One unsigned byte ('B') per list element.
    return compat_struct_pack('%dB' % len(xs), *xs)
1329
1330
# Cross-platform file locking: defines _lock_file(f, exclusive) and
# _unlock_file(f) using Win32 LockFileEx on Windows, fcntl.flock elsewhere,
# and raising stubs where neither is available (e.g. Jython).
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure passed to
        # LockFileEx/UnlockFileEx.
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Maximum lockable byte range (low/high halves of the length).
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        # Lock the whole file; flag 0x2 requests an exclusive lock,
        # 0x0 a shared one.
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Requires a preceding _lock_file on the same file object.
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1404
1405
class locked_file(object):
    """Context manager around io.open that holds an OS-level file lock
    (exclusive for writing/appending, shared for reading)."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            # Readers share the lock; writers need exclusivity.
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1435
1436
def get_filesystem_encoding():
    """sys.getfilesystemencoding() with a utf-8 fallback for odd platforms."""
    enc = sys.getfilesystemencoding()
    return 'utf-8' if enc is None else enc
1440
1441
def shell_quote(args):
    """Quote a list of arguments for display as a shell command line."""
    encoding = get_filesystem_encoding()

    def _to_text(a):
        # We may get a filename encoded with 'encodeFilename'
        return a.decode(encoding) if isinstance(a, bytes) else a

    return ' '.join(pipes.quote(_to_text(a)) for a in args)
1451
1452
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Merge with any data already smuggled into the URL fragment.
    url, existing = unsmuggle_url(url, {})
    data.update(existing)
    payload = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + payload
1461
1462
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url: returns (bare_url, data_or_default)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    payload = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1470
1471
def format_bytes(bytes):
    """Human-readable size string for a byte count ('N/A' for None).

    Accepts ints, floats and numeric strings; e.g. 1024 -> '1.00KiB'.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    SUFFIXES = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # Clamp so absurdly large values do not index past the suffix
        # table (previously an IndexError for values >= 1024**9).
        exponent = min(int(math.log(bytes, 1024.0)), len(SUFFIXES) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, SUFFIXES[exponent])
1484
1485
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' with *unit_table* as multipliers; None if no match."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if m is None:
        return None
    # A comma is accepted as decimal separator.
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
1495
1496
def parse_filesize(s):
    """Parse a human-readable file size ('5.5MiB', '100 KB', ...) into bytes."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
    }
    return lookup_unit_table(_UNIT_TABLE, s)
1541
1542
def parse_count(s):
    """Parse a view/like count such as '1.2K' or '3,456' into an int."""
    if s is None:
        return None

    s = s.strip()

    # Plain digit groups (possibly with separators) go straight to str_to_int.
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }
    return lookup_unit_table(_UNIT_TABLE, s)
1562
1563
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1571
1572
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbrevs = [s[:3] for s in ENGLISH_MONTH_NAMES]
    if abbrev in abbrevs:
        return abbrevs.index(abbrev) + 1
    return None
1581
1582
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Existing entities and character references are left untouched.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;', xml_str)
1589
1590
def setproctitle(title):
    """Best-effort rename of the current process (Linux, via libc prctl);
    silently does nothing on other platforms."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 = PR_SET_NAME (see prctl(2)).
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1610
1611
def remove_start(s, start):
    """Strip *start* from the beginning of *s* if present (None passes through)."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1614
1615
def remove_end(s, end):
    """Strip *end* from the end of *s* if present (None passes through)."""
    if s is None or not s.endswith(end):
        return s
    return s[:-len(end)]
1618
1619
def remove_quotes(s):
    """Strip one matching pair of single or double quotes around *s*."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1627
1628
def url_basename(url):
    """Last path component of *url* (query string and fragment excluded)."""
    path = compat_urlparse.urlparse(url).path
    return path.rstrip('/').rpartition('/')[2]
1632
1633
class HEADRequest(compat_urllib_request.Request):
    """Request subclass that issues HTTP HEAD instead of GET."""
    def get_method(self):
        return 'HEAD'
1637
1638
class PUTRequest(compat_urllib_request.Request):
    """Request subclass that issues HTTP PUT instead of GET."""
    def get_method(self):
        return 'PUT'
1642
1643
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce *v* (optionally an attribute of it) to a scaled int;
    returns *default* for None, '' or unparsable values."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '' or v is None:
        return default
    try:
        return int(v) * invscale // scale
    except ValueError:
        return default
1656
1657
def str_or_none(v, default=None):
    """compat_str(v), or *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
1660
1661
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and stray plus signs before converting.
    return int(re.sub(r'[,\.\+]', '', int_str))
1668
1669
def float_or_none(v, scale=1, invscale=1, default=None):
    """float(v) scaled by invscale/scale; *default* when None or unparsable."""
    if v is None:
        return default
    try:
        result = float(v) * invscale / scale
    except ValueError:
        return default
    return result
1677
1678
def strip_or_none(v):
    """v.strip(), passing None through unchanged."""
    if v is None:
        return None
    return v.strip()
1681
1682
def parse_duration(s):
    """Parse a duration expression ('1:02:03.5', '2h 3min', 'PT1M30S', ...)
    into seconds as a float, or None when unrecognized."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days = hours = mins = secs = ms = None
    # 1) Colon-separated clock notation, optionally with fractional seconds.
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        # 2) Unit-suffixed notation, also covering ISO-8601-ish 'PT...' forms.
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # 3) Fractional hours/minutes with spelled-out units.
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            if not m:
                return None
            hours, mins = m.groups()

    # Sum whichever components matched (same accumulation order as before).
    total = 0
    for value, factor in ((secs, 1), (mins, 60), (hours, 3600), (days, 86400), (ms, 1)):
        if value:
            total += float(value) * factor
    return total
1729
1730
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension; when the actual extension
    differs from *expected_real_ext*, append it instead."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1737
1738
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the file extension for *ext*; when the current extension does not
    match *expected_real_ext*, append instead of replacing."""
    name, real_ext = os.path.splitext(filename)
    base = name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename
    return '{0}.{1}'.format(base, ext)
1744
1745
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        proc = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
1754
1755
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        proc = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = proc.communicate()
    except OSError:
        return False
    # Python 2 returns bytes from communicate().
    if isinstance(out, bytes):
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1769
1770
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from program *output* via *version_re*;
    returns *unrecognized* when nothing matches."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1780
1781
class PagedList(object):
    """Base class for lazily paged sequences; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1786
1787
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages on demand via pagefunc(pagenum),
    optionally caching fetched pages."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the elements [start:end) across page boundaries."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # First and one-past-last element ids covered by this page.
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset of the first wanted element within this page.
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # One past the last wanted element within this page (or None).
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1838
1839
class InAdvancePagedList(PagedList):
    """PagedList where the total number of pages is known in advance."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the elements [start:end), fetching only the needed pages."""
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Elements to drop from the head of the first fetched page.
        skip_elems = start - start_page * self._pagesize
        # Remaining number of wanted elements (None = take everything).
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page contains the last wanted element.
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1867
1868
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
1875
1876
def lowercase_escape(s):
    """Decode literal '\\uXXXX' escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
1883
1884
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() wants a byte string.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1890
1891
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    return parts._replace(
        # The host is IDNA-encoded rather than percent-escaped.
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    ).geturl()
1902
1903
def read_batch_urls(batch_fd):
    """Read one URL per line from *batch_fd*, dropping a UTF-8 BOM, blank
    lines and comment lines (starting with '#', ';' or ']')."""
    def _clean(line):
        if not isinstance(line, compat_str):
            line = line.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if line.startswith(BOM_UTF8):
            line = line[len(BOM_UTF8):]
        line = line.strip()
        return False if line.startswith(('#', ';', ']')) else line

    with contextlib.closing(batch_fd) as fd:
        return [u for u in map(_clean, fd) if u]
1918
1919
def urlencode_postdata(*args, **kargs):
    """urlencode() the arguments and return ASCII bytes ready for a POST body."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1922
1923
def update_url_query(url, query):
    """Merge the *query* mapping into the URL's existing query string."""
    if not query:
        return url
    parsed = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parsed.query)
    params.update(query)
    return compat_urlparse.urlunparse(parsed._replace(
        query=compat_urllib_parse_urlencode(params, True)))
1932
1933
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Clone *req*, optionally overriding URL, body, headers and query,
    preserving the HTTP method (HEAD/PUT/GET...)."""
    new_headers = req.headers.copy()
    new_headers.update(headers)
    new_data = data or req.data
    new_url = update_url_query(url or req.get_full_url(), query)
    method = req.get_method()
    if method == 'HEAD':
        req_type = HEADRequest
    elif method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        new_url, data=new_data, headers=new_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # urllib Requests only carry a timeout once one has been set explicitly.
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
1952
1953
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """First usable value from *d* for the given key(s); None values are
    always skipped, falsy ones only when skip_false_values is set."""
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        value = d.get(key)
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
1962
1963
def try_get(src, getter, expected_type=None):
    """Apply *getter* to *src*, swallowing common lookup errors; optionally
    require the result to be an instance of *expected_type*."""
    try:
        v = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is None or isinstance(v, expected_type):
        return v
1972
1973
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Ensure *string* is a compat_str, decoding byte input with *encoding*."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1976
1977
# MPAA rating -> minimum viewer age, used by parse_age_limit().
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1985
1986
def parse_age_limit(s):
    """Parse an age limit ('18', '18+' or an MPAA rating) into an int.

    Integer input is passed through unchanged (previously a TypeError);
    returns None for None or unrecognized strings.
    """
    if s is None:
        return None
    if isinstance(s, int):
        # Callers sometimes already hold a numeric age limit.
        return s
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
1992
1993
def strip_jsonp(code):
    """Remove a JSONP callback wrapper, leaving the raw JSON payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$',
        r'\1', code)
1997
1998
def js_to_json(code):
    """Convert a JavaScript object/value literal into valid JSON text."""
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            # Block comments and trailing commas are dropped entirely.
            return ""

        if v[0] in ("'", '"'):
            # Normalize string escapes to their JSON equivalents.
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])

        INTEGER_TABLE = (
            (r'^0[xX][0-9a-fA-F]+', 16),
            (r'^0+[0-7]+', 8),
        )

        # Hex/octal integers become decimal; as object keys they get quoted.
        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(0), base)
                return '"%d":' % i if v.endswith(':') else '%d' % i

        # Bare identifiers (and anything left) are emitted as quoted strings.
        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
2036
2037
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality score; unknown ids rank lowest.
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
2046
2047
2048 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2049
2050
2051 def limit_length(s, length):
2052 """ Add ellipses to overly long strings """
2053 if s is None:
2054 return None
2055 ELLIPSES = '...'
2056 if len(s) > length:
2057 return s[:length - len(ELLIPSES)] + ELLIPSES
2058 return s
2059
2060
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(int(part) for part in re.split(r'[-.]', v))
2063
2064
def is_outdated_version(version, limit, assume_new=True):
    """True when *version* < *limit*; a missing/unparsable version yields
    (not assume_new)."""
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
2072
2073
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return running_from_zip or hasattr(sys, 'frozen')
2079
2080
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2084
2085
def error_to_compat_str(err):
    """str(err), decoded with the locale encoding on Python 2."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
2093
2094
def mimetype2ext(mt):
    """Map a MIME type to a file extension; None input passes through."""
    if mt is None:
        return None

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    # Otherwise map on the subtype alone (the part after '/').
    subtype = mt.rpartition('/')[2].lower()
    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
    }
    return SUBTYPE_MAP.get(subtype, subtype)
2130
2131
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs= string into {'vcodec': ..., 'acodec': ...}."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    splited_codecs = list(filter(None, map(
        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        # The leading fourcc identifies the codec family.
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        if len(splited_codecs) == 2:
            # NOTE(review): both values are None at this point, so this
            # returns {'vcodec': None, 'acodec': None} - presumably the raw
            # codec strings were intended here; confirm upstream.
            return {
                'vcodec': vcodec,
                'acodec': acodec,
            }
        elif len(splited_codecs) == 1:
            # NOTE(review): vcodec is always None here, so acodec ends up
            # None as well - looks suspicious, verify intent.
            return {
                'vcodec': 'none',
                'acodec': vcodec,
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
2166
2167
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a completed request, preferring the
    Content-Disposition filename over the Content-Type."""
    headers = url_handle.headers

    cd = headers.get('Content-Disposition')
    if cd:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
2180
2181
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI embedding *data* as base64."""
    encoded = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, encoded)
2184
2185
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # Without both a viewer age limit and a content rating there is
    # nothing to compare against, so the content stays available.
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2194
2195
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Decode according to a recognized byte-order mark, defaulting to UTF-8.
    BOMS = (
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    )
    decoded = None
    for bom, encoding in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(encoding, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    # HTML-ish content starts with '<' after optional whitespace.
    return re.match(r'^\s*<', decoded)
2214
2215
def determine_protocol(info_dict):
    """Infer the download protocol for a format dict, preferring an explicit
    'protocol' entry over URL-based heuristics."""
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    # Streaming schemes are recognizable from the URL prefix alone.
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2236
2237
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Pad every column (except the last) to its widest cell plus one space.
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    fmt = ' '.join('%-' + compat_str(width + 1) + 's' for width in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
2244
2245
2246 def _match_one(filter_part, dct):
2247 COMPARISON_OPERATORS = {
2248 '<': operator.lt,
2249 '<=': operator.le,
2250 '>': operator.gt,
2251 '>=': operator.ge,
2252 '=': operator.eq,
2253 '!=': operator.ne,
2254 }
2255 operator_rex = re.compile(r'''(?x)\s*
2256 (?P<key>[a-z_]+)
2257 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2258 (?:
2259 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2260 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2261 )
2262 \s*$
2263 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2264 m = operator_rex.search(filter_part)
2265 if m:
2266 op = COMPARISON_OPERATORS[m.group('op')]
2267 if m.group('strval') is not None:
2268 if m.group('op') not in ('=', '!='):
2269 raise ValueError(
2270 'Operator %s does not support string values!' % m.group('op'))
2271 comparison_value = m.group('strval')
2272 else:
2273 try:
2274 comparison_value = int(m.group('intval'))
2275 except ValueError:
2276 comparison_value = parse_filesize(m.group('intval'))
2277 if comparison_value is None:
2278 comparison_value = parse_filesize(m.group('intval') + 'B')
2279 if comparison_value is None:
2280 raise ValueError(
2281 'Invalid integer value %r in filter part %r' % (
2282 m.group('intval'), filter_part))
2283 actual_value = dct.get(m.group('key'))
2284 if actual_value is None:
2285 return m.group('none_inclusive')
2286 return op(actual_value, comparison_value)
2287
2288 UNARY_OPERATORS = {
2289 '': lambda v: v is not None,
2290 '!': lambda v: v is None,
2291 }
2292 operator_rex = re.compile(r'''(?x)\s*
2293 (?P<op>%s)\s*(?P<key>[a-z_]+)
2294 \s*$
2295 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2296 m = operator_rex.search(filter_part)
2297 if m:
2298 op = UNARY_OPERATORS[m.group('op')]
2299 actual_value = dct.get(m.group('key'))
2300 return op(actual_value)
2301
2302 raise ValueError('Invalid filter part %r' % filter_part)
2303
2304
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # Every '&'-separated clause must hold.
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2310
2311
def match_filter_func(filter_str):
    """Build a match-filter callback: it returns None to accept a video, or a
    human-readable skip message otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2320
2321
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression into seconds (None if unparseable)."""
    if not time_expr:
        return

    # Plain offset, optionally suffixed with 's': e.g. '12.5s'
    offset_match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    # Clock time 'HH:MM:SS(.fff)'; some files separate the fraction with ':'.
    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours = int(clock_match.group(1))
        minutes = int(clock_match.group(2))
        seconds = float(clock_match.group(3).replace(':', '.'))
        return 3600 * hours + 60 * minutes + seconds
2333
2334
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode 'HH:MM:SS,mmm'."""
    hours = int(seconds / 3600)
    minutes = int((seconds % 3600) / 60)
    secs = int(seconds % 60)
    millis = int((seconds % 1) * 1000)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2337
2338
def dfxp2srt(dfxp_data):
    """Convert a TTML/DFXP subtitle document (unicode string) to SRT text."""
    # Helper that expands 'prefix:tag' names against the known TTML namespaces.
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Event-driven target that flattens a <p> element to plain text,
        # turning <br/> (in any supported namespace) into newlines.
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Re-serialize the node and feed it through the target parser above
        # to extract its text content.
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    # Paragraphs may live in any of the known TTML/TTAF namespaces, or none.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        # Cues without a usable begin time (or neither end nor dur) are dropped.
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            # No explicit end: derive it from the duration.
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2392
2393
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2397
2398
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean option for an external command line.

    Returns [] when the parameter is unset; otherwise emits command_option
    with true_value/false_value, joined by separator if one is given.
    """
    param = params.get(param)
    if param is None:
        # An unset option should simply be omitted instead of tripping the
        # isinstance assertion below.
        return []
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
2405
2406
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit command_option alone when params[param] equals expected_value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2410
2411
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra CLI args stored under *param*, or *default*
    when unset."""
    # NOTE(review): the mutable default is shared across calls; callers must
    # not mutate the fallback list in place.
    configured = params.get(param)
    if configured is None:
        return default
    assert isinstance(configured, list)
    return configured
2418
2419
class ISO639Utils(object):
    """Translate between ISO 639-1 (two-letter) and ISO 639-2/T
    (three-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Maps ISO 639-1 -> ISO 639-2/T.
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two letters are significant (e.g. 'en-US' -> 'en');
        # returns None for unknown codes.
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup over the map; returns None implicitly when unknown.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
2620
2621
class ISO3166Utils(object):
    """Resolve ISO 3166-1 alpha-2 country codes to full country names."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive lookup; returns None for unknown codes.
        return cls._country_map.get(code.upper())
2880
2881
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets each request override the configured proxy via
    the internal 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        # The lambda's keyword defaults deliberately bind proxy/type/meth at
        # definition time so each scheme's handler captures its own values
        # (late-binding closure workaround).
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy (set through the internal header) overrides the
        # handler-level one; the header is stripped before the request is sent.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        # '__noproxy__' is the sentinel meaning "connect directly".
        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            # SOCKS proxies are handed off via another internal header;
            # youtube-dl's http/https handlers do wrapping the socket with socks
            req.add_header('Ytdl-socks-proxy', proxy)
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
2905
2906
2907 def ohdave_rsa_encrypt(data, exponent, modulus):
2908 '''
2909 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
2910
2911 Input:
2912 data: data to encrypt, bytes-like object
2913 exponent, modulus: parameter e and N of RSA algorithm, both integer
2914 Output: hex string of encrypted data
2915
2916 Limitation: supports one block encryption only
2917 '''
2918
2919 payload = int(binascii.hexlify(data[::-1]), 16)
2920 encrypted = pow(payload, exponent, modulus)
2921 return '%x' % encrypted
2922
2923
def encode_base_n(num, n, table=None):
    """Encode a non-negative integer in base n using the given digit table
    (defaults to 0-9, a-z, A-Z)."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    table = table or FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Collect digits least-significant first, then reverse.
    digits = []
    while num:
        num, rem = divmod(num, n)
        digits.append(table[rem])
    return ''.join(reversed(digits))
2940
2941
def decode_packed_codes(code):
    """Undo Dean Edwards style 'p.a.c.k.e.r' JavaScript obfuscation."""
    mobj = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Map each base-n token back to its original symbol; an empty symbol
    # means the token stands for itself.
    symbol_table = {}
    for index in range(count - 1, -1, -1):
        token = encode_base_n(index, base)
        symbol_table[token] = symbols[index] or token

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
2960
2961
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list into a dict, unquoting quoted values."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        # Strip the surrounding double quotes from quoted values.
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
2969
2970
def urshift(val, n):
    """Unsigned 32-bit right shift (JavaScript's '>>>' operator)."""
    if val < 0:
        # Reinterpret a negative value as its unsigned 32-bit equivalent.
        val += 0x100000000
    return val >> n
2973
2974
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    # Reference: https://www.w3.org/TR/PNG/
    """Decode a PNG byte string into (width, height, pixels).

    pixels is a list of rows; each row is a flat list of channel bytes
    (three per pixel) with the PNG scanline filters already reversed.

    NOTE(review): the stride computation assumes 8-bit RGB without an alpha
    channel -- confirm for other color types/bit depths.
    """
    header = png_data[8:]

    # Every valid PNG starts with the fixed signature followed by IHDR.
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Big-endian unsigned ints of 1, 2 or 4 bytes.
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Walk the chunk stream: 4-byte length, 4-byte type, payload, 4-byte CRC.
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # IHDR is the first chunk; its payload starts with width and height.
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Image data may be split over several IDAT chunks; concatenate them
    # before inflating.
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Bytes per scanline (3 channel bytes per pixel).
    stride = width * 3
    pixels = []

    # Return the already-reconstructed channel byte at a flat byte offset
    # into the image (offsets exclude the per-scanline filter bytes).
    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Each scanline is prefixed with one filter-type byte.
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Neighbor bytes used by the filters: same channel of the pixel
            # to the left (3 bytes back) and of the pixel above; 0 at edges.
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            # Filter type 0 (None) leaves the byte unchanged.
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Choose the predictor (left, up, upper-left) closest to p.
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels