]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
[youtube] Skip download for multiple v= test
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import calendar
7 import codecs
8 import contextlib
9 import ctypes
10 import datetime
11 import email.utils
12 import errno
13 import functools
14 import gzip
15 import itertools
16 import io
17 import json
18 import locale
19 import math
20 import operator
21 import os
22 import pipes
23 import platform
24 import re
25 import ssl
26 import socket
27 import struct
28 import subprocess
29 import sys
30 import tempfile
31 import traceback
32 import xml.etree.ElementTree
33 import zlib
34
35 from .compat import (
36 compat_basestring,
37 compat_chr,
38 compat_html_entities,
39 compat_http_client,
40 compat_kwargs,
41 compat_parse_qs,
42 compat_socket_create_connection,
43 compat_str,
44 compat_urllib_error,
45 compat_urllib_parse,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
48 compat_urlparse,
49 shlex_quote,
50 )
51
52
# The type of a compiled regular expression. The re module does not expose
# this type directly, so derive it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))
55
# Default HTTP headers added to every outgoing request
# (see YoutubeDLHandler.http_request below).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
63
64
# Unique sentinel meaning "no default was supplied" (None is a legitimate
# default value, so it cannot be used for this purpose).
NO_DEFAULT = object()

# English month names, independent of the current locale.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
70
71
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Probe that the reported codec actually exists and is usable.
        'TEST'.encode(encoding)
    except Exception:
        return 'UTF-8'
    return encoding
85
86
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible.

    The data is first written to a temporary file in the same directory and
    then renamed over fn, so readers never observe a partially written file.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object.
        # (Fixed: the lambdas now use their argument f instead of silently
        # ignoring it and closing over fn.)
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort removal of the temporary file; re-raise the original
        # error so the caller sees what actually went wrong.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
139
140
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z-]+$', key)
        # Validate whenever a value is supplied, including the empty string.
        # (The old `if val:` guard skipped validation for '' even though the
        # expression below still used the value.)
        if val is not None:
            assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        for f in node.findall(xpath):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
162
163 # On python2.6 the xml.etree.ElementTree.Element methods don't support
164 # the namespace parameter
165
166
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into ElementTree '{uri}tag' form."""
    def expand(step):
        if ':' not in step:
            return step
        prefix, tag = step.split(':')
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
177
178
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the first element matching xpath.

    When nothing is found: return default if one was given, raise
    ExtractorError if fatal is set, otherwise return None.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None and found.text is not None:
        return found.text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML element %s' % (xpath if name is None else name))
    return None
193
194
def get_element_by_id(id, html):
    """Return the content of the tag carrying the given id attribute in html."""
    return get_element_by_attribute('id', id, html)
198
199
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # Verbose regex: an opening tag with any attributes before/after the one
    # we look for, the (non-greedy) content, then the matching closing tag.
    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if not m:
        return None
    res = m.group('content')

    # Strip one level of surrounding quotes, if present.
    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    return unescapeHTML(res)
221
222
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Turn <br> and paragraph boundaries into newlines instead of literal ones.
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Drop every remaining tag, then decode HTML entities.
    text = re.sub('<.*?>', '', text)
    text = unescapeHTML(text)
    return text.strip()
238
239
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            # '-' means standard output; on Windows switch stdout to binary
            # mode first so byte output is not mangled by CRLF translation.
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission error will not be fixed by renaming: re-raise as-is.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
270
271
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
279
280
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def _map_char(ch):
        code = ord(ch)
        if ch == '?' or code < 32 or code == 127:
            return ''
        if ch == '"':
            return '' if restricted else '\''
        if ch == ':':
            return '_-' if restricted else ' -'
        if ch in '\\/|*<>':
            return '_'
        if restricted and (ch in '!&\'()[]{}$;`^,#' or ch.isspace() or code > 127):
            return '_'
        return ch

    # Handle timestamps: keep the digits of 12:34:56 together, replacing the
    # colons with underscores before per-character sanitization runs.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(_map_char(ch) for ch in s)
    if is_id:
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[1:]
    result = result.lstrip('.')
    return result or '_'
317
318
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        # Only Windows imposes the character restrictions handled below.
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    parts = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        parts.pop(0)
    sanitized_parts = []
    for part in parts:
        if part in ('.', '..'):
            sanitized_parts.append(part)
        else:
            # Replace forbidden characters (and a trailing dot) with '#'.
            sanitized_parts.append(re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', part))
    if drive_or_unc:
        sanitized_parts.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_parts)
335
336
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first-seen order.

    Membership is equality-based (list scan), so unhashable elements work too.
    """
    result = []
    for item in iterable:
        if item in result:
            continue
        result.append(item)
    return result
344
345
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    codepoint = compat_html_entities.name2codepoint.get(entity)
    if codepoint is not None:
        return compat_chr(codepoint)

    # Numeric entity: decimal (#160) or hexadecimal (#xA0).
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            return compat_chr(int(numstr[1:], 16))
        return compat_chr(int(numstr, 10))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
364
365
def unescapeHTML(s):
    """Replace all HTML entities in s with their character equivalents."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _replace(mobj):
        return _htmlentity_transform(mobj.group(1))

    return re.sub(r'&([^;]+);', _replace, s)
373
374
def get_subprocess_encoding():
    """Return the text encoding to use for subprocess arguments."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
385
386
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file (must be text, i.e. compat_str)
    @param for_subprocess Encode for passing to a subprocess (Python 2 only)
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Python 2 on non-Windows (or for subprocess use): encode to bytes,
    # dropping characters the encoding cannot represent.
    return s.encode(get_subprocess_encoding(), 'ignore')
405
406
def decodeFilename(b, for_subprocess=False):
    """Decode a byte filename to text on Python 2; pass anything else through."""

    if sys.version_info >= (3, 0):
        # Python 3 filenames are already text.
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
416
417
def encodeArgument(s):
    """Encode a command-line argument for handing to a subprocess."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, for_subprocess=True)
425
426
def decodeArgument(b):
    """Decode a subprocess argument back to text (Python 2 only)."""
    return decodeFilename(b, for_subprocess=True)
429
430
def decodeOption(optval):
    """Decode a command-line option value to text; None passes through."""
    if optval is None:
        return None
    decoded = (optval.decode(preferredencoding())
               if isinstance(optval, bytes) else optval)

    assert isinstance(decoded, compat_str)
    return decoded
439
440
def formatSeconds(secs):
    """Format a duration in seconds as 'h:mm:ss', 'm:ss' or plain seconds.

    Boundary fix: exactly 3600 s is now '1:00:00' (was '60:00') and exactly
    60 s is now '1:00' (was '60') -- the comparisons were '>' instead of '>='.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
448
449
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler configured from the user params.

    Honours params['nocheckcertificate'] and picks the best SSL context
    construction available on the running Python version.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # No usable SSLContext support in HTTPSHandler on these versions.
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
473
474
def bug_reports_message():
    """Build the standard "please report this issue" suffix for error text."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return (
        '; please report this issue on https://yt-dl.org/bug .'
        ' Make sure you are using the latest version; %s.'
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        % update_cmd)
484
485
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is the underlying exception; video_id, if given, is
        prepended to the message.
        """

        # Network-level failures are always "expected" (not youtube-dl bugs).
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors get the "please report this" boilerplate.
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback rendered as a string, or None."""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
513
514
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""
    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
520
521
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match anything it was expected to."""
    pass
525
526
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Keep the original (type, value, traceback) triple for later display.
        self.exc_info = exc_info
539
540
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
548
549
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Also pass msg to the Exception base class so that str(exc) and
        # exc.args carry the message (previously they were empty).
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
559
560
class MaxDownloadsReached(Exception):
    """ Raised once the --max-downloads limit has been reached. """
    pass
564
565
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
573
574
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both values are byte counts: what was actually received vs what the
        # server's headers announced.
        self.downloaded = downloaded
        self.expected = expected
587
588
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class, honouring the configured 'source_address'.

    ydl_handler supplies the user parameters; is_https selects SSL wrapping
    in the Python 2.6 fallback path.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address support: replace connect() with a version that
            # binds the socket to the requested source address itself.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
609
610
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params  # user options, e.g. 'source_address'

    def http_open(self, req):
        """Open req over a connection built by _create_http_connection."""
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, accepting both raw and zlib-wrapped streams."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response; old Pythons lack the code argument."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Prepare an outgoing request: escape the URL, add std_headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
            new_req = req_type(
                url_escaped, data=req.data, headers=req.headers,
                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
            new_req.timeout = req.timeout
            req = new_req

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            # Internal marker header: strip it and disable compression.
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Transparently decompress the response and re-escape redirects."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
733
734
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS counterpart of YoutubeDLHandler.

    Routes connections through _create_http_connection so options such as
    'source_address' are honoured; an alternative connection class may be
    supplied via https_conn_class.
    """
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # Forward the SSL context / hostname checking configured on the base
        # handler, where this Python version supports them.
        kwargs = {}
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
            req, **kwargs)
750
751
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    if timezone is None:
        # Extract an explicit UTC offset (or trailing Z) from the string,
        # also swallowing any fractional-second part in front of it.
        timezone = datetime.timedelta()
        m = re.search(
            r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if m:
            date_str = date_str[:-len(m.group(0))]
            if m.group('sign'):
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
776
777
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD.

    day_first selects whether ambiguous numeric dates are tried day-month
    (True) or month-day (False). Returns None if nothing parses.
    """

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    # NOTE(review): there is no break in this loop, so every expression is
    # tried and the LAST one that parses wins -- confirm this ordering is
    # intentional before changing it.
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Fall back to RFC 2822 parsing (e.g. 'Wed, 14 May 2014 00:00:00 +0000').
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
841
842
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL, falling back to default_ext."""
    if url is None:
        return default_ext
    # Drop any query string, then take whatever follows the last dot.
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    return default_ext
851
852
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name: <base>.<language>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
855
856
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Plain absolute date.
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # Months and years are approximated as 30 and 365 days respectively.
    if unit == 'month':
        unit, amount = 'day', amount * 30
    elif unit == 'year':
        unit, amount = 'day', amount * 365
    return today + datetime.timedelta(**{unit + 's': amount})
884
885
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        # Not in YYYYMMDD form: hand back unchanged.
        return date_str
    return '-'.join(match.groups())
894
895
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = (date_from_str(start) if start is not None
                      else datetime.datetime.min.date())
        self.end = (date_from_str(end) if end is not None
                    else datetime.datetime.max.date())
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
925
926
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Python 2 may hand back a byte string in the locale encoding.
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
935
936
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    Writes Unicode text to a real Windows console via WriteConsoleW, which
    avoids the console codepage mangling non-ASCII characters."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Map C file descriptors (stdout=1, stderr=2) to GetStdHandle ids.
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # Only a local character device on which GetConsoleMode succeeds is
        # a real console; anything else must use the normal write path.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        # (needs a UTF-16 surrogate pair), or len(s) if there is none.
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write in chunks of at most 1024 BMP characters; a non-BMP
        # character is written on its own as two UTF-16 code units.
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1010
1011
def write_string(s, out=None, encoding=None):
    """Write the text s to out (default sys.stderr), handling both streams
    that expect bytes and streams that expect text."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        # Try the Windows console fast path first (proper Unicode output).
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode ourselves so
        # we control the codec and the error handling.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
1032
1033
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    # On Python 3, indexing bytes already yields ints; on Python 2 it
    # yields 1-character strings that need ord()
    if isinstance(bs[0], int):
        return list(bs)
    return [ord(ch) for ch in bs]
1041
1042
def intlist_to_bytes(xs):
    """Convert a list of integer byte values (0-255) into a byte string.

    Inverse of bytes_to_intlist.  Goes through bytearray instead of the
    struct module: it works identically on Python 2 and 3, needs no
    format-spec shim, and has no limit on the number of values.
    """
    if not xs:
        return b''
    return bytes(bytearray(xs))
1047
1048
# Cross-platform file locking
# On Windows, fcntl does not exist, so LockFileEx/UnlockFileEx from
# kernel32 are bound via ctypes; elsewhere fcntl.flock is used.  Both
# branches define the same two helpers, _lock_file(f, exclusive) and
# _unlock_file(f), consumed by the locked_file wrapper below.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirror of the Win32 OVERLAPPED struct required by LockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the entire possible file range (low/high 32-bit halves)
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for the later unlock call
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
1112
1113
class locked_file(object):
    """File wrapper holding an OS-level lock for the duration of a `with`
    block: shared lock for 'r', exclusive lock for 'a'/'w'."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            # Readers share the lock; writers/appenders get it exclusively
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1143
1144
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to 'utf-8' when unknown."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
1148
1149
def shell_quote(args):
    """Join *args* into a single, shell-safe command-line string."""
    fs_encoding = get_filesystem_encoding()

    def _as_text(a):
        # We may get a filename encoded with 'encodeFilename'
        return a.decode(fs_encoding) if isinstance(a, bytes) else a

    return ' '.join(pipes.quote(_as_text(a)) for a in args)
1159
1160
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    fragment = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, fragment)
1167
1168
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url(): return (url, data), or (url, default)
    when no smuggled payload is present."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, encoded = smug_url.rpartition('#')
    payload = compat_parse_qs(encoded)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1176
1177
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. '1.50KiB'.

    Accepts None (returns 'N/A'), numeric strings, ints and floats.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    _SUFFIXES = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # Clamp so values >= 1024**9 no longer raise IndexError on the
        # suffix lookup; they render as multiples of the largest unit
        exponent = min(int(math.log(bytes, 1024.0)), len(_SUFFIXES) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, _SUFFIXES[exponent])
1190
1191
def parse_filesize(s):
    """Parse a human-readable file size ('5MiB', '1,5KB') into bytes.

    Returns None for None input or unrecognized strings.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
    }

    units_re = '|'.join(re.escape(unit) for unit in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if m is None:
        return None

    # Accept both '.' and ',' as the decimal separator
    value = float(m.group('num').replace(',', '.'))
    return int(value * _UNIT_TABLE[m.group('unit')])
1244
1245
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1253
1254
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    if abbrev in abbreviations:
        return abbreviations.index(abbrev) + 1
    return None
1263
1264
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, leaving existing entities
    (named and numeric) untouched."""
    stray_amp = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(stray_amp, '&amp;', xml_str)
1271
1272
def setproctitle(title):
    """Best-effort: rename the current process via glibc prctl(PR_SET_NAME).

    Silently does nothing when glibc is unavailable or lacks prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    encoded = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(encoded))
    buf.value = encoded
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1286
1287
def remove_start(s, start):
    """Strip the prefix *start* from *s* if present."""
    return s[len(start):] if s.startswith(start) else s
1292
1293
def remove_end(s, end):
    """Strip the suffix *end* from *s* if present."""
    return s[:-len(end)] if s.endswith(end) else s
1298
1299
def url_basename(url):
    """Return the last path component of *url*, ignoring query and fragment."""
    parsed_path = compat_urlparse.urlparse(url).path
    return parsed_path.strip('/').split('/')[-1]
1303
1304
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request that issues an HTTP HEAD instead of GET."""

    def get_method(self):
        return "HEAD"
1308
1309
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce *v* to an int, optionally reading attribute *get_attr* first
    and applying invscale/scale; return *default* for None or ''."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1317
1318
def str_or_none(v, default=None):
    """Stringify *v*, or return *default* when it is None."""
    if v is None:
        return default
    return compat_str(v)
1321
1322
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Strip thousands separators and '+' before converting
    return int(re.sub(r'[,\.\+]', '', int_str))
1329
1330
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to a float with invscale/scale applied; *default* for None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1333
1334
def parse_duration(s):
    """Parse a duration string ('3:12', '1h30m5s', 'PT1M30S', '5 min') into
    seconds, or None when *s* is not a string or does not match."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    # Fractional "X minutes" / "X hours" forms short-circuit
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    duration = 0
    for field, scale in (
            ('secs', 1),
            ('mins_reversed', 60),
            ('mins', 60),
            ('hours', 60 * 60),
            ('hours_reversed', 60 * 60),
            ('days', 24 * 60 * 60)):
        if m.group(field):
            duration += int(m.group(field)) * scale
    if m.group('ms'):
        # Keep the result an int unless milliseconds are present
        duration += float(m.group('ms'))
    return duration
1379
1380
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension of *filename*; when
    *expected_real_ext* is given and does not match, append instead."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1387
1388
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of *filename* with *ext*; when
    *expected_real_ext* is given and does not match, append instead."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}'.format(name, ext)
1394
1395
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        proc = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
1404
1405
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        proc = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = proc.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1419
1420
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's *output*, returning
    *unrecognized* when nothing matches."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1430
1431
class PagedList(object):
    """Base class for paged result lists; subclasses implement getslice()."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1436
1437
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages lazily via a callback.

    pagefunc(pagenum) must return an iterable with the entries of the
    0-based page *pagenum*; pagesize is the nominal entries-per-page count.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # Return the entries in [start, end) as a list, querying only the
        # pages that overlap that interval.
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Index just past the last wanted entry within this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1479
1480
class InAdvancePagedList(PagedList):
    """PagedList for which the total page count is known in advance."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # Return the entries in [start, end) as a list.
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Offset into the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Number of entries still wanted (None = all remaining)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page satisfies the request; trim and stop
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1508
1509
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escape sequences in *s* into real characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
1516
1517
def lowercase_escape(s):
    """Decode \\uXXXX escape sequences in *s* into real characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
1524
1525
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    # Python 2's quote() wants UTF-8 bytes for non-ASCII input
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, safe_chars)
1531
1532
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    return parts._replace(
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment),
    ).geturl()
1542
# Probe struct.pack once at import time: Python 2.6 (and some 2.7 builds)
# rejects text format specs, in which case wrappers that encode the spec
# to ASCII bytes are installed as struct_pack/struct_unpack.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Modern struct accepts text specs; use it directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1559
1560
def read_batch_urls(batch_fd):
    """Read a batch file object into a list of URLs, dropping BOMs,
    surrounding whitespace, blank lines and comment lines."""
    def _clean(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # Lines starting with '#', ';' or ']' are comments
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in (_clean(line) for line in fd) if url]
1575
1576
def urlencode_postdata(*args, **kargs):
    """urlencode the arguments and return ASCII bytes suitable for POST data."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1579
1580
# Element.iter() appeared in Python 2.7; emulate it via findall() on 2.6
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
1585
1586
def parse_xml(s):
    """Parse the XML string *s*, ignoring doctype declarations and fixing
    up Python 2's byte-string text nodes to unicode."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            # Ignore doctypes
            pass

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return tree
1602
1603
# US (MPAA-style) content ratings mapped to the minimum viewer age
# they imply; consumed by parse_age_limit()
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1611
1612
def parse_age_limit(s):
    """Parse an age limit into an int.

    Accepts None (returns None), ints (passed through when they are a
    plausible age, 0-21), strings like '18' or '18+', and US content
    ratings such as 'PG-13'.  Returns None when nothing matches.
    """
    if s is None:
        return None
    if isinstance(s, int):
        # Some callers already hold a numeric limit; previously this
        # raised TypeError inside re.match
        return s if 0 <= s <= 21 else None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
1618
1619
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback name, parentheses, trailing ';'
    and trailing '//' comments) from *code*."""
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper, r'\1', code)
1623
1624
def js_to_json(code):
    """Convert JavaScript object notation into valid JSON text
    (single-quoted strings, bare keys, trailing commas)."""
    def _fix_token(m):
        tok = m.group(0)
        # JSON keywords pass through unchanged
        if tok in ('true', 'false', 'null'):
            return tok
        # Already a double-quoted string
        if tok.startswith('"'):
            return tok
        if tok.startswith("'"):
            # Re-quote a single-quoted string, fixing its escapes
            inner = tok[1:-1]
            inner = re.sub(r"\\\\|\\'|\"", lambda esc: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[esc.group(0)], inner)
            return '"%s"' % inner
        # Bare identifier (e.g. an unquoted key)
        return '"%s"' % tok

    converted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', _fix_token, code)
    # Drop trailing commas before closing brackets/braces
    return re.sub(r',(\s*[\]}])', lambda m: m.group(1), converted)
1648
1649
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality; unknown ids rank lowest
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1658
1659
# Default output filename template (the --output option)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1661
1662
1663 def limit_length(s, length):
1664 """ Add ellipses to overly long strings """
1665 if s is None:
1666 return None
1667 ELLIPSES = '...'
1668 if len(s) > length:
1669 return s[:length - len(ELLIPSES)] + ELLIPSES
1670 return s
1671
1672
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    return tuple(int(part) for part in re.split(r'[-.]', v))
1675
1676
def is_outdated_version(version, limit, assume_new=True):
    """Return True when *version* is older than *limit*; when either string
    is missing or unparseable, fall back to (not assume_new)."""
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
1684
1685
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updatable when running from a zipball or a frozen executable
    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
1691
1692
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(shlex_quote(arg) for arg in args)
1696
1697
def mimetype2ext(mt):
    """Map a MIME type to a file extension; unknown subtypes fall through
    unchanged (e.g. 'video/mp4' -> 'mp4')."""
    _, _, subtype = mt.rpartition('/')

    special_cases = {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
        'ttml+xml': 'ttml',
    }
    return special_cases.get(subtype, subtype)
1706
1707
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a urllib response, preferring the
    Content-Disposition filename over the Content-Type."""
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    content_disposition = getheader('Content-Disposition')
    if content_disposition:
        m = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"',
            content_disposition)
        if m:
            ext = determine_ext(m.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
1724
1725
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:
        # No limit set
        return False
    if content_limit is None:
        # Content available for everyone
        return False
    return age_limit < content_limit
1734
1735
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    decoded = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if decoded is None:
        # No BOM: assume UTF-8
        decoded = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', decoded)
1754
1755
def determine_protocol(info_dict):
    """Work out the download protocol: explicit 'protocol' field, known URL
    prefix, known extension, or the URL scheme as a fallback."""
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
1776
1777
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Pad every column (except the last) to its widest cell plus one space
    widths = [max(len(compat_str(cell)) for cell in col) for col in zip(*rows)]
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
1784
1785
def _match_one(filter_part, dct):
    """Evaluate one filter expression (e.g. 'duration > 60', 'title=abc',
    '!is_live') against the dict *dct*; helper for match_str()."""
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    # key OP value, where value is an integer (optionally with a filesize
    # suffix like '500k') or a plain alphanumeric string; a trailing '?'
    # after the operator makes missing keys pass the filter
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # String values only make sense for equality comparisons
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain int: try as a filesize ('500KiB'), then with
                # an implied trailing 'B' ('500K')
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # Missing key: only passes when the '?' suffix was given
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    # Unary form: bare 'key' (must be present) or '!key' (must be absent)
    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
1843
1844
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&' joins sub-filters; all of them must pass
    return all(
        _match_one(part, dct) for part in filter_str.split('&'))
1850
1851
def match_filter_func(filter_str):
    """Build a --match-filter callback: returns None to keep a video and a
    skip message string otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
1860
1861
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('12.5s' or 'HH:MM:SS.m') into
    seconds; empty input yields 0.0, unrecognized input None (implicitly)."""
    if not time_expr:
        return 0.0

    m = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if m:
        return float(m.group('time_offset'))

    m = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if m:
        hours, mins, secs = m.groups()
        return 3600 * int(hours) + 60 * int(mins) + float(secs)
1873
1874
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as an SRT timecode: HH:MM:SS,mmm."""
    whole = int(seconds)
    hours, rem = divmod(whole, 3600)
    mins, secs = divmod(rem, 60)
    millis = int((seconds % 1) * 1000)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millis)
1877
1878
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document into SRT text.

    Raises ValueError when the document contains no <p> cues.
    """
    # Helper resolving tag names in both the TTML and older TTAF1 namespaces
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })

    def parse_node(node):
        # Flatten a cue node (including <br> and nested <span>) into text
        str_or_empty = functools.partial(str_or_none, default='')

        out = str_or_empty(node.text)

        for child in node:
            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                out += '\n' + str_or_empty(child.tail)
            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                out += str_or_empty(parse_node(child))
            else:
                # Unknown markup: keep its serialized form verbatim
                out += str_or_empty(xml.etree.ElementTree.tostring(child))

        return out

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
    out = []
    # Try each namespace variant, then un-namespaced <p>
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        if not end_time:
            # No explicit end attribute: derive it from begin + dur
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
1919
1920
class ISO639Utils(object):
    """Conversions between 2-letter (ISO 639-1) and 3-letter (ISO 639-2/T)
    language codes, backed by a static lookup table."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are significant (tolerates
        # region-qualified codes like 'en-US'); None when unknown
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Linear reverse lookup; returns None (implicitly) when unknown
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
2121
2122
2123 class ISO3166Utils(object):
2124 # From http://data.okfn.org/data/core/country-list
2125 _country_map = {
2126 'AF': 'Afghanistan',
2127 'AX': 'Åland Islands',
2128 'AL': 'Albania',
2129 'DZ': 'Algeria',
2130 'AS': 'American Samoa',
2131 'AD': 'Andorra',
2132 'AO': 'Angola',
2133 'AI': 'Anguilla',
2134 'AQ': 'Antarctica',
2135 'AG': 'Antigua and Barbuda',
2136 'AR': 'Argentina',
2137 'AM': 'Armenia',
2138 'AW': 'Aruba',
2139 'AU': 'Australia',
2140 'AT': 'Austria',
2141 'AZ': 'Azerbaijan',
2142 'BS': 'Bahamas',
2143 'BH': 'Bahrain',
2144 'BD': 'Bangladesh',
2145 'BB': 'Barbados',
2146 'BY': 'Belarus',
2147 'BE': 'Belgium',
2148 'BZ': 'Belize',
2149 'BJ': 'Benin',
2150 'BM': 'Bermuda',
2151 'BT': 'Bhutan',
2152 'BO': 'Bolivia, Plurinational State of',
2153 'BQ': 'Bonaire, Sint Eustatius and Saba',
2154 'BA': 'Bosnia and Herzegovina',
2155 'BW': 'Botswana',
2156 'BV': 'Bouvet Island',
2157 'BR': 'Brazil',
2158 'IO': 'British Indian Ocean Territory',
2159 'BN': 'Brunei Darussalam',
2160 'BG': 'Bulgaria',
2161 'BF': 'Burkina Faso',
2162 'BI': 'Burundi',
2163 'KH': 'Cambodia',
2164 'CM': 'Cameroon',
2165 'CA': 'Canada',
2166 'CV': 'Cape Verde',
2167 'KY': 'Cayman Islands',
2168 'CF': 'Central African Republic',
2169 'TD': 'Chad',
2170 'CL': 'Chile',
2171 'CN': 'China',
2172 'CX': 'Christmas Island',
2173 'CC': 'Cocos (Keeling) Islands',
2174 'CO': 'Colombia',
2175 'KM': 'Comoros',
2176 'CG': 'Congo',
2177 'CD': 'Congo, the Democratic Republic of the',
2178 'CK': 'Cook Islands',
2179 'CR': 'Costa Rica',
2180 'CI': 'Côte d\'Ivoire',
2181 'HR': 'Croatia',
2182 'CU': 'Cuba',
2183 'CW': 'Curaçao',
2184 'CY': 'Cyprus',
2185 'CZ': 'Czech Republic',
2186 'DK': 'Denmark',
2187 'DJ': 'Djibouti',
2188 'DM': 'Dominica',
2189 'DO': 'Dominican Republic',
2190 'EC': 'Ecuador',
2191 'EG': 'Egypt',
2192 'SV': 'El Salvador',
2193 'GQ': 'Equatorial Guinea',
2194 'ER': 'Eritrea',
2195 'EE': 'Estonia',
2196 'ET': 'Ethiopia',
2197 'FK': 'Falkland Islands (Malvinas)',
2198 'FO': 'Faroe Islands',
2199 'FJ': 'Fiji',
2200 'FI': 'Finland',
2201 'FR': 'France',
2202 'GF': 'French Guiana',
2203 'PF': 'French Polynesia',
2204 'TF': 'French Southern Territories',
2205 'GA': 'Gabon',
2206 'GM': 'Gambia',
2207 'GE': 'Georgia',
2208 'DE': 'Germany',
2209 'GH': 'Ghana',
2210 'GI': 'Gibraltar',
2211 'GR': 'Greece',
2212 'GL': 'Greenland',
2213 'GD': 'Grenada',
2214 'GP': 'Guadeloupe',
2215 'GU': 'Guam',
2216 'GT': 'Guatemala',
2217 'GG': 'Guernsey',
2218 'GN': 'Guinea',
2219 'GW': 'Guinea-Bissau',
2220 'GY': 'Guyana',
2221 'HT': 'Haiti',
2222 'HM': 'Heard Island and McDonald Islands',
2223 'VA': 'Holy See (Vatican City State)',
2224 'HN': 'Honduras',
2225 'HK': 'Hong Kong',
2226 'HU': 'Hungary',
2227 'IS': 'Iceland',
2228 'IN': 'India',
2229 'ID': 'Indonesia',
2230 'IR': 'Iran, Islamic Republic of',
2231 'IQ': 'Iraq',
2232 'IE': 'Ireland',
2233 'IM': 'Isle of Man',
2234 'IL': 'Israel',
2235 'IT': 'Italy',
2236 'JM': 'Jamaica',
2237 'JP': 'Japan',
2238 'JE': 'Jersey',
2239 'JO': 'Jordan',
2240 'KZ': 'Kazakhstan',
2241 'KE': 'Kenya',
2242 'KI': 'Kiribati',
2243 'KP': 'Korea, Democratic People\'s Republic of',
2244 'KR': 'Korea, Republic of',
2245 'KW': 'Kuwait',
2246 'KG': 'Kyrgyzstan',
2247 'LA': 'Lao People\'s Democratic Republic',
2248 'LV': 'Latvia',
2249 'LB': 'Lebanon',
2250 'LS': 'Lesotho',
2251 'LR': 'Liberia',
2252 'LY': 'Libya',
2253 'LI': 'Liechtenstein',
2254 'LT': 'Lithuania',
2255 'LU': 'Luxembourg',
2256 'MO': 'Macao',
2257 'MK': 'Macedonia, the Former Yugoslav Republic of',
2258 'MG': 'Madagascar',
2259 'MW': 'Malawi',
2260 'MY': 'Malaysia',
2261 'MV': 'Maldives',
2262 'ML': 'Mali',
2263 'MT': 'Malta',
2264 'MH': 'Marshall Islands',
2265 'MQ': 'Martinique',
2266 'MR': 'Mauritania',
2267 'MU': 'Mauritius',
2268 'YT': 'Mayotte',
2269 'MX': 'Mexico',
2270 'FM': 'Micronesia, Federated States of',
2271 'MD': 'Moldova, Republic of',
2272 'MC': 'Monaco',
2273 'MN': 'Mongolia',
2274 'ME': 'Montenegro',
2275 'MS': 'Montserrat',
2276 'MA': 'Morocco',
2277 'MZ': 'Mozambique',
2278 'MM': 'Myanmar',
2279 'NA': 'Namibia',
2280 'NR': 'Nauru',
2281 'NP': 'Nepal',
2282 'NL': 'Netherlands',
2283 'NC': 'New Caledonia',
2284 'NZ': 'New Zealand',
2285 'NI': 'Nicaragua',
2286 'NE': 'Niger',
2287 'NG': 'Nigeria',
2288 'NU': 'Niue',
2289 'NF': 'Norfolk Island',
2290 'MP': 'Northern Mariana Islands',
2291 'NO': 'Norway',
2292 'OM': 'Oman',
2293 'PK': 'Pakistan',
2294 'PW': 'Palau',
2295 'PS': 'Palestine, State of',
2296 'PA': 'Panama',
2297 'PG': 'Papua New Guinea',
2298 'PY': 'Paraguay',
2299 'PE': 'Peru',
2300 'PH': 'Philippines',
2301 'PN': 'Pitcairn',
2302 'PL': 'Poland',
2303 'PT': 'Portugal',
2304 'PR': 'Puerto Rico',
2305 'QA': 'Qatar',
2306 'RE': 'Réunion',
2307 'RO': 'Romania',
2308 'RU': 'Russian Federation',
2309 'RW': 'Rwanda',
2310 'BL': 'Saint Barthélemy',
2311 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2312 'KN': 'Saint Kitts and Nevis',
2313 'LC': 'Saint Lucia',
2314 'MF': 'Saint Martin (French part)',
2315 'PM': 'Saint Pierre and Miquelon',
2316 'VC': 'Saint Vincent and the Grenadines',
2317 'WS': 'Samoa',
2318 'SM': 'San Marino',
2319 'ST': 'Sao Tome and Principe',
2320 'SA': 'Saudi Arabia',
2321 'SN': 'Senegal',
2322 'RS': 'Serbia',
2323 'SC': 'Seychelles',
2324 'SL': 'Sierra Leone',
2325 'SG': 'Singapore',
2326 'SX': 'Sint Maarten (Dutch part)',
2327 'SK': 'Slovakia',
2328 'SI': 'Slovenia',
2329 'SB': 'Solomon Islands',
2330 'SO': 'Somalia',
2331 'ZA': 'South Africa',
2332 'GS': 'South Georgia and the South Sandwich Islands',
2333 'SS': 'South Sudan',
2334 'ES': 'Spain',
2335 'LK': 'Sri Lanka',
2336 'SD': 'Sudan',
2337 'SR': 'Suriname',
2338 'SJ': 'Svalbard and Jan Mayen',
2339 'SZ': 'Swaziland',
2340 'SE': 'Sweden',
2341 'CH': 'Switzerland',
2342 'SY': 'Syrian Arab Republic',
2343 'TW': 'Taiwan, Province of China',
2344 'TJ': 'Tajikistan',
2345 'TZ': 'Tanzania, United Republic of',
2346 'TH': 'Thailand',
2347 'TL': 'Timor-Leste',
2348 'TG': 'Togo',
2349 'TK': 'Tokelau',
2350 'TO': 'Tonga',
2351 'TT': 'Trinidad and Tobago',
2352 'TN': 'Tunisia',
2353 'TR': 'Turkey',
2354 'TM': 'Turkmenistan',
2355 'TC': 'Turks and Caicos Islands',
2356 'TV': 'Tuvalu',
2357 'UG': 'Uganda',
2358 'UA': 'Ukraine',
2359 'AE': 'United Arab Emirates',
2360 'GB': 'United Kingdom',
2361 'US': 'United States',
2362 'UM': 'United States Minor Outlying Islands',
2363 'UY': 'Uruguay',
2364 'UZ': 'Uzbekistan',
2365 'VU': 'Vanuatu',
2366 'VE': 'Venezuela, Bolivarian Republic of',
2367 'VN': 'Viet Nam',
2368 'VG': 'Virgin Islands, British',
2369 'VI': 'Virgin Islands, U.S.',
2370 'WF': 'Wallis and Futuna',
2371 'EH': 'Western Sahara',
2372 'YE': 'Yemen',
2373 'ZM': 'Zambia',
2374 'ZW': 'Zimbabwe',
2375 }
2376
2377 @classmethod
2378 def short2full(cls, code):
2379 """Convert an ISO 3166-2 country code to the corresponding full name"""
2380 return cls._country_map.get(code.upper())
2381
2382
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler variant that honours a per-request proxy override.

    A request may carry a ``Ytdl-request-proxy`` header naming the proxy to
    use for that request only; the special value ``'__noproxy__'`` disables
    proxying entirely for the request.
    """

    def __init__(self, proxies=None):
        # Install default http/https openers before the parent constructor
        # runs.  Each scheme's value is captured via a default argument so
        # the lambdas do not all see the last loop value (late binding).
        for scheme in ('http', 'https'):
            setattr(
                self, '%s_open' % scheme,
                lambda r, proxy='__noproxy__', scheme=scheme, meth=self.proxy_open:
                    meth(r, proxy, scheme))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A Ytdl-request-proxy header overrides whatever proxy was chosen;
        # the header is stripped so it is never sent over the wire.
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # fall through to a direct (unproxied) connection
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)