1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import calendar
7 import codecs
8 import contextlib
9 import ctypes
10 import datetime
11 import email.utils
12 import errno
13 import functools
14 import gzip
15 import itertools
16 import io
17 import json
18 import locale
19 import math
20 import operator
21 import os
22 import pipes
23 import platform
24 import re
25 import ssl
26 import socket
27 import struct
28 import subprocess
29 import sys
30 import tempfile
31 import traceback
32 import xml.etree.ElementTree
33 import zlib
34
35 from .compat import (
36 compat_basestring,
37 compat_chr,
38 compat_html_entities,
39 compat_http_client,
40 compat_kwargs,
41 compat_parse_qs,
42 compat_socket_create_connection,
43 compat_str,
44 compat_urllib_error,
45 compat_urllib_parse,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
48 compat_urlparse,
49 shlex_quote,
50 )
51
52
53 # This is not clearly defined otherwise
54 compiled_regex_type = type(re.compile(''))
55
56 std_headers = {
57 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
58 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
59 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
60 'Accept-Encoding': 'gzip, deflate',
61 'Accept-Language': 'en-us,en;q=0.5',
62 }
63
64
65 ENGLISH_MONTH_NAMES = [
66 'January', 'February', 'March', 'April', 'May', 'June',
67 'July', 'August', 'September', 'October', 'November', 'December']
68
69
70 def preferredencoding():
71 """Get preferred encoding.
72
73 Returns the best encoding scheme for the system, based on
74 locale.getpreferredencoding() and some further tweaks.
75 """
76 try:
77 pref = locale.getpreferredencoding()
78 'TEST'.encode(pref)
79 except Exception:
80 pref = 'UTF-8'
81
82 return pref
83
84
85 def write_json_file(obj, fn):
86 """ Encode obj as JSON and write it to fn, atomically if possible """
87
88 fn = encodeFilename(fn)
89 if sys.version_info < (3, 0) and sys.platform != 'win32':
90 encoding = get_filesystem_encoding()
91 # os.path.basename returns a bytes object, but NamedTemporaryFile
92 # will fail if the filename contains non-ASCII characters unless we
93 # pass it a unicode object
94 path_basename = lambda f: os.path.basename(f).decode(encoding)
95 # the same for os.path.dirname
96 path_dirname = lambda f: os.path.dirname(f).decode(encoding)
97 else:
98 path_basename = os.path.basename
99 path_dirname = os.path.dirname
100
101 args = {
102 'suffix': '.tmp',
103 'prefix': path_basename(fn) + '.',
104 'dir': path_dirname(fn),
105 'delete': False,
106 }
107
108 # In Python 2.x, json.dump expects a bytestream.
109 # In Python 3.x, it writes to a character stream
110 if sys.version_info < (3, 0):
111 args['mode'] = 'wb'
112 else:
113 args.update({
114 'mode': 'w',
115 'encoding': 'utf-8',
116 })
117
118 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
119
120 try:
121 with tf:
122 json.dump(obj, tf)
123 if sys.platform == 'win32':
124 # Need to remove existing file on Windows, else os.rename raises
125 # WindowsError or FileExistsError.
126 try:
127 os.unlink(fn)
128 except OSError:
129 pass
130 os.rename(tf.name, fn)
131 except Exception:
132 try:
133 os.remove(tf.name)
134 except OSError:
135 pass
136 raise
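# Illustrative usage (example added for clarity; the file name is made up):
#     write_json_file({'id': 'abc123', 'title': 'Test'}, 'video.info.json')
# serializes the dict to a temporary file in the same directory and then
# renames it into place, so the target is replaced atomically where possible.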
137
138
139 if sys.version_info >= (2, 7):
140 def find_xpath_attr(node, xpath, key, val):
141 """ Find the xpath xpath[@key=val] """
142 assert re.match(r'^[a-zA-Z-]+$', key)
143 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
144 expr = xpath + "[@%s='%s']" % (key, val)
145 return node.find(expr)
146 else:
147 def find_xpath_attr(node, xpath, key, val):
148 # Here comes the crazy part: in Python 2.6, if the xpath is a unicode
149 # string, .//node does not match nodes that are direct children of . !
150 if isinstance(xpath, compat_str):
151 xpath = xpath.encode('ascii')
152
153 for f in node.findall(xpath):
154 if f.attrib.get(key) == val:
155 return f
156 return None
157
158 # On python2.6 the xml.etree.ElementTree.Element methods don't support
159 # the namespace parameter
160
161
162 def xpath_with_ns(path, ns_map):
163 components = [c.split(':') for c in path.split('/')]
164 replaced = []
165 for c in components:
166 if len(c) == 1:
167 replaced.append(c[0])
168 else:
169 ns, tag = c
170 replaced.append('{%s}%s' % (ns_map[ns], tag))
171 return '/'.join(replaced)
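# Illustrative usage (example added for clarity; the TTML namespace URI is
# the one used elsewhere in this module):
#     xpath_with_ns('.//ttml:p', {'ttml': 'http://www.w3.org/ns/ttml'})
#     # -> './/{http://www.w3.org/ns/ttml}p'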
172
173
174 def xpath_text(node, xpath, name=None, fatal=False):
175 if sys.version_info < (2, 7): # Crazy 2.6
176 xpath = xpath.encode('ascii')
177
178 n = node.find(xpath)
179 if n is None or n.text is None:
180 if fatal:
181 name = xpath if name is None else name
182 raise ExtractorError('Could not find XML element %s' % name)
183 else:
184 return None
185 return n.text
186
187
188 def get_element_by_id(id, html):
189 """Return the content of the tag with the specified ID in the passed HTML document"""
190 return get_element_by_attribute("id", id, html)
191
192
193 def get_element_by_attribute(attribute, value, html):
194 """Return the content of the tag with the specified attribute in the passed HTML document"""
195
196 m = re.search(r'''(?xs)
197 <([a-zA-Z0-9:._-]+)
198 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
199 \s+%s=['"]?%s['"]?
200 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
201 \s*>
202 (?P<content>.*?)
203 </\1>
204 ''' % (re.escape(attribute), re.escape(value)), html)
205
206 if not m:
207 return None
208 res = m.group('content')
209
210 if res.startswith('"') or res.startswith("'"):
211 res = res[1:-1]
212
213 return unescapeHTML(res)
214
215
216 def clean_html(html):
217 """Clean an HTML snippet into a readable string"""
218
219 if html is None: # Convenience for sanitizing descriptions etc.
220 return html
221
222 # Newline vs <br />
223 html = html.replace('\n', ' ')
224 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
225 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
226 # Strip html tags
227 html = re.sub('<.*?>', '', html)
228 # Replace html entities
229 html = unescapeHTML(html)
230 return html.strip()
231
232
233 def sanitize_open(filename, open_mode):
234 """Try to open the given filename, and slightly tweak it if this fails.
235
236 Attempts to open the given filename. If this fails, it tries to change
237 the filename slightly, step by step, until it's either able to open it
238 or it fails and raises a final exception, like the standard open()
239 function.
240
241 It returns the tuple (stream, definitive_file_name).
242 """
243 try:
244 if filename == '-':
245 if sys.platform == 'win32':
246 import msvcrt
247 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
248 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
249 stream = open(encodeFilename(filename), open_mode)
250 return (stream, filename)
251 except (IOError, OSError) as err:
252 if err.errno in (errno.EACCES,):
253 raise
254
255 # In case of error, try to remove win32 forbidden chars
256 alt_filename = sanitize_path(filename)
257 if alt_filename == filename:
258 raise
259 else:
260 # An exception here should be caught in the caller
261 stream = open(encodeFilename(alt_filename), open_mode)
262 return (stream, alt_filename)
263
264
265 def timeconvert(timestr):
266 """Convert RFC 2822 defined time string into system timestamp"""
267 timestamp = None
268 timetuple = email.utils.parsedate_tz(timestr)
269 if timetuple is not None:
270 timestamp = email.utils.mktime_tz(timetuple)
271 return timestamp
272
273
274 def sanitize_filename(s, restricted=False, is_id=False):
275 """Sanitizes a string so it could be used as part of a filename.
276 If restricted is set, use a stricter subset of allowed characters.
277 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
278 """
279 def replace_insane(char):
280 if char == '?' or ord(char) < 32 or ord(char) == 127:
281 return ''
282 elif char == '"':
283 return '' if restricted else '\''
284 elif char == ':':
285 return '_-' if restricted else ' -'
286 elif char in '\\/|*<>':
287 return '_'
288 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
289 return '_'
290 if restricted and ord(char) > 127:
291 return '_'
292 return char
293
294 # Handle timestamps
295 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
296 result = ''.join(map(replace_insane, s))
297 if not is_id:
298 while '__' in result:
299 result = result.replace('__', '_')
300 result = result.strip('_')
301 # Common case of "Foreign band name - English song title"
302 if restricted and result.startswith('-_'):
303 result = result[2:]
304 if result.startswith('-'):
305 result = '_' + result[len('-'):]
306 result = result.lstrip('.')
307 if not result:
308 result = '_'
309 return result
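# Illustrative usage (examples added for clarity; input strings are made up):
#     sanitize_filename('foo/bar: baz?')                   # -> 'foo_bar - baz'
#     sanitize_filename('foo/bar: baz?', restricted=True)  # -> 'foo_bar_-_baz'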
310
311
312 def sanitize_path(s):
313 """Sanitizes and normalizes path on Windows"""
314 if sys.platform != 'win32':
315 return s
316 drive_or_unc, _ = os.path.splitdrive(s)
317 if sys.version_info < (2, 7) and not drive_or_unc:
318 drive_or_unc, _ = os.path.splitunc(s)
319 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
320 if drive_or_unc:
321 norm_path.pop(0)
322 sanitized_path = [
323 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
324 for path_part in norm_path]
325 if drive_or_unc:
326 sanitized_path.insert(0, drive_or_unc + os.path.sep)
327 return os.path.join(*sanitized_path)
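# Illustrative behaviour on Windows (example added for clarity; on other
# platforms the path is returned unchanged):
#     sanitize_path('C:\\foo\\bar?.mp4')  # -> 'C:\\foo\\bar#.mp4'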
328
329
330 def sanitize_url_path_consecutive_slashes(url):
331 """Collapses consecutive slashes in URLs' path"""
332 parsed_url = list(compat_urlparse.urlparse(url))
333 parsed_url[2] = re.sub(r'/{2,}', '/', parsed_url[2])
334 return compat_urlparse.urlunparse(parsed_url)
335
336
337 def orderedSet(iterable):
338 """ Remove all duplicates from the input iterable """
339 res = []
340 for el in iterable:
341 if el not in res:
342 res.append(el)
343 return res
344
345
346 def _htmlentity_transform(entity):
347 """Transforms an HTML entity to a character."""
348 # Known non-numeric HTML entity
349 if entity in compat_html_entities.name2codepoint:
350 return compat_chr(compat_html_entities.name2codepoint[entity])
351
352 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
353 if mobj is not None:
354 numstr = mobj.group(1)
355 if numstr.startswith('x'):
356 base = 16
357 numstr = '0%s' % numstr
358 else:
359 base = 10
360 return compat_chr(int(numstr, base))
361
362 # Unknown entity in name, return its literal representation
363 return ('&%s;' % entity)
364
365
366 def unescapeHTML(s):
367 if s is None:
368 return None
369 assert type(s) == compat_str
370
371 return re.sub(
372 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
373
374
375 def get_subprocess_encoding():
376 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
377 # For subprocess calls, encode with locale encoding
378 # Refer to http://stackoverflow.com/a/9951851/35070
379 encoding = preferredencoding()
380 else:
381 encoding = sys.getfilesystemencoding()
382 if encoding is None:
383 encoding = 'utf-8'
384 return encoding
385
386
387 def encodeFilename(s, for_subprocess=False):
388 """
389 @param s The name of the file
390 """
391
392 assert type(s) == compat_str
393
394 # Python 3 has a Unicode API
395 if sys.version_info >= (3, 0):
396 return s
397
398 # Pass '' directly to use Unicode APIs on Windows 2000 and up
399 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
400 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
401 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
402 return s
403
404 return s.encode(get_subprocess_encoding(), 'ignore')
405
406
407 def decodeFilename(b, for_subprocess=False):
408
409 if sys.version_info >= (3, 0):
410 return b
411
412 if not isinstance(b, bytes):
413 return b
414
415 return b.decode(get_subprocess_encoding(), 'ignore')
416
417
418 def encodeArgument(s):
419 if not isinstance(s, compat_str):
420 # Legacy code that uses byte strings
421 # Uncomment the following line after fixing all post processors
422 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
423 s = s.decode('ascii')
424 return encodeFilename(s, True)
425
426
427 def decodeArgument(b):
428 return decodeFilename(b, True)
429
430
431 def decodeOption(optval):
432 if optval is None:
433 return optval
434 if isinstance(optval, bytes):
435 optval = optval.decode(preferredencoding())
436
437 assert isinstance(optval, compat_str)
438 return optval
439
440
441 def formatSeconds(secs):
442 if secs > 3600:
443 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
444 elif secs > 60:
445 return '%d:%02d' % (secs // 60, secs % 60)
446 else:
447 return '%d' % secs
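# Illustrative usage (examples added for clarity):
#     formatSeconds(3661)  # -> '1:01:01'
#     formatSeconds(75)    # -> '1:15'
#     formatSeconds(42)    # -> '42'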
448
449
450 def make_HTTPS_handler(params, **kwargs):
451 opts_no_check_certificate = params.get('nocheckcertificate', False)
452 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
453 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
454 if opts_no_check_certificate:
455 context.check_hostname = False
456 context.verify_mode = ssl.CERT_NONE
457 try:
458 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
459 except TypeError:
460 # Python 2.7.8
461 # (create_default_context present but HTTPSHandler has no context=)
462 pass
463
464 if sys.version_info < (3, 2):
465 return YoutubeDLHTTPSHandler(params, **kwargs)
466 else: # Python < 3.4
467 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
468 context.verify_mode = (ssl.CERT_NONE
469 if opts_no_check_certificate
470 else ssl.CERT_REQUIRED)
471 context.set_default_verify_paths()
472 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
473
474
475 def bug_reports_message():
476 if ytdl_is_updateable():
477 update_cmd = 'type youtube-dl -U to update'
478 else:
479 update_cmd = 'see https://yt-dl.org/update on how to update'
480 msg = '; please report this issue on https://yt-dl.org/bug .'
481 msg += ' Make sure you are using the latest version; %s.' % update_cmd
482 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
483 return msg
484
485
486 class ExtractorError(Exception):
487 """Error during info extraction."""
488
489 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
490 """ tb, if given, is the original traceback (so that it can be printed out).
491 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
492 """
493
494 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
495 expected = True
496 if video_id is not None:
497 msg = video_id + ': ' + msg
498 if cause:
499 msg += ' (caused by %r)' % cause
500 if not expected:
501 msg += bug_reports_message()
502 super(ExtractorError, self).__init__(msg)
503
504 self.traceback = tb
505 self.exc_info = sys.exc_info() # preserve original exception
506 self.cause = cause
507 self.video_id = video_id
508
509 def format_traceback(self):
510 if self.traceback is None:
511 return None
512 return ''.join(traceback.format_tb(self.traceback))
513
514
515 class UnsupportedError(ExtractorError):
516 def __init__(self, url):
517 super(UnsupportedError, self).__init__(
518 'Unsupported URL: %s' % url, expected=True)
519 self.url = url
520
521
522 class RegexNotFoundError(ExtractorError):
523 """Error when a regex didn't match"""
524 pass
525
526
527 class DownloadError(Exception):
528 """Download Error exception.
529
530 This exception may be thrown by FileDownloader objects if they are not
531 configured to continue on errors. They will contain the appropriate
532 error message.
533 """
534
535 def __init__(self, msg, exc_info=None):
536 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
537 super(DownloadError, self).__init__(msg)
538 self.exc_info = exc_info
539
540
541 class SameFileError(Exception):
542 """Same File exception.
543
544 This exception will be thrown by FileDownloader objects if they detect
545 multiple files would have to be downloaded to the same file on disk.
546 """
547 pass
548
549
550 class PostProcessingError(Exception):
551 """Post Processing exception.
552
553 This exception may be raised by PostProcessor's .run() method to
554 indicate an error in the postprocessing task.
555 """
556
557 def __init__(self, msg):
558 self.msg = msg
559
560
561 class MaxDownloadsReached(Exception):
562 """ --max-downloads limit has been reached. """
563 pass
564
565
566 class UnavailableVideoError(Exception):
567 """Unavailable Format exception.
568
569 This exception will be thrown when a video is requested
570 in a format that is not available for that video.
571 """
572 pass
573
574
575 class ContentTooShortError(Exception):
576 """Content Too Short exception.
577
578 This exception may be raised by FileDownloader objects when a file they
579 download is too small for what the server announced first, indicating
580 the connection was probably interrupted.
581 """
582 # Both in bytes
583 downloaded = None
584 expected = None
585
586 def __init__(self, downloaded, expected):
587 self.downloaded = downloaded
588 self.expected = expected
589
590
591 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
592 hc = http_class(*args, **kwargs)
593 source_address = ydl_handler._params.get('source_address')
594 if source_address is not None:
595 sa = (source_address, 0)
596 if hasattr(hc, 'source_address'): # Python 2.7+
597 hc.source_address = sa
598 else: # Python 2.6
599 def _hc_connect(self, *args, **kwargs):
600 sock = compat_socket_create_connection(
601 (self.host, self.port), self.timeout, sa)
602 if is_https:
603 self.sock = ssl.wrap_socket(
604 sock, self.key_file, self.cert_file,
605 ssl_version=ssl.PROTOCOL_TLSv1)
606 else:
607 self.sock = sock
608 hc.connect = functools.partial(_hc_connect, hc)
609
610 return hc
611
612
613 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
614 """Handler for HTTP requests and responses.
615
616 This class, when installed with an OpenerDirector, automatically adds
617 the standard headers to every HTTP request and handles gzipped and
618 deflated responses from web servers. If compression is to be avoided in
619 a particular request, the original request in the program code only has
620 to include the HTTP header "Youtubedl-No-Compression", which will be
621 removed before making the real request.
622
623 Part of this code was copied from:
624
625 http://techknack.net/python-urllib2-handlers/
626
627 Andrew Rowls, the author of that code, agreed to release it to the
628 public domain.
629 """
630
631 def __init__(self, params, *args, **kwargs):
632 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
633 self._params = params
634
635 def http_open(self, req):
636 return self.do_open(functools.partial(
637 _create_http_connection, self, compat_http_client.HTTPConnection, False),
638 req)
639
640 @staticmethod
641 def deflate(data):
642 try:
643 return zlib.decompress(data, -zlib.MAX_WBITS)
644 except zlib.error:
645 return zlib.decompress(data)
646
647 @staticmethod
648 def addinfourl_wrapper(stream, headers, url, code):
649 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
650 return compat_urllib_request.addinfourl(stream, headers, url, code)
651 ret = compat_urllib_request.addinfourl(stream, headers, url)
652 ret.code = code
653 return ret
654
655 def http_request(self, req):
656 for h, v in std_headers.items():
657 # Capitalization is needed because of Python bug 2275: http://bugs.python.org/issue2275
658 # (urllib capitalizes the dict keys because of this bug)
659 if h.capitalize() not in req.headers:
660 req.add_header(h, v)
661 if 'Youtubedl-no-compression' in req.headers:
662 if 'Accept-encoding' in req.headers:
663 del req.headers['Accept-encoding']
664 del req.headers['Youtubedl-no-compression']
665
666 if sys.version_info < (2, 7) and '#' in req.get_full_url():
667 # Python 2.6 is brain-dead when it comes to fragments
668 req._Request__original = req._Request__original.partition('#')[0]
669 req._Request__r_type = req._Request__r_type.partition('#')[0]
670
671 return req
672
673 def http_response(self, req, resp):
674 old_resp = resp
675 # gzip
676 if resp.headers.get('Content-encoding', '') == 'gzip':
677 content = resp.read()
678 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
679 try:
680 uncompressed = io.BytesIO(gz.read())
681 except IOError as original_ioerror:
682 # There may be junk at the end of the file
683 # See http://stackoverflow.com/q/4928560/35070 for details
684 for i in range(1, 1024):
685 try:
686 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
687 uncompressed = io.BytesIO(gz.read())
688 except IOError:
689 continue
690 break
691 else:
692 raise original_ioerror
693 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
694 resp.msg = old_resp.msg
695 # deflate
696 if resp.headers.get('Content-encoding', '') == 'deflate':
697 gz = io.BytesIO(self.deflate(resp.read()))
698 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
699 resp.msg = old_resp.msg
700 return resp
701
702 https_request = http_request
703 https_response = http_response
704
705
706 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
707 def __init__(self, params, https_conn_class=None, *args, **kwargs):
708 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
709 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
710 self._params = params
711
712 def https_open(self, req):
713 kwargs = {}
714 if hasattr(self, '_context'): # python > 2.6
715 kwargs['context'] = self._context
716 if hasattr(self, '_check_hostname'): # python 3.x
717 kwargs['check_hostname'] = self._check_hostname
718 return self.do_open(functools.partial(
719 _create_http_connection, self, self._https_conn_class, True),
720 req, **kwargs)
721
722
723 def parse_iso8601(date_str, delimiter='T', timezone=None):
724 """ Return a UNIX timestamp from the given date """
725
726 if date_str is None:
727 return None
728
729 if timezone is None:
730 m = re.search(
731 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
732 date_str)
733 if not m:
734 timezone = datetime.timedelta()
735 else:
736 date_str = date_str[:-len(m.group(0))]
737 if not m.group('sign'):
738 timezone = datetime.timedelta()
739 else:
740 sign = 1 if m.group('sign') == '+' else -1
741 timezone = datetime.timedelta(
742 hours=sign * int(m.group('hours')),
743 minutes=sign * int(m.group('minutes')))
744 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
745 dt = datetime.datetime.strptime(date_str, date_format) - timezone
746 return calendar.timegm(dt.timetuple())
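# Illustrative usage (example added for clarity; the date is made up):
#     parse_iso8601('2015-03-25T12:00:00+01:00')  # -> 1427281200
# i.e. 2015-03-25 11:00:00 UTC expressed as a UNIX timestamp.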
747
748
749 def unified_strdate(date_str, day_first=True):
750 """Return a string with the date in the format YYYYMMDD"""
751
752 if date_str is None:
753 return None
754 upload_date = None
755 # Replace commas
756 date_str = date_str.replace(',', ' ')
757 # %z (UTC offset) is only supported in python>=3.2
758 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
759 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
760 # Remove AM/PM + timezone
761 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
762
763 format_expressions = [
764 '%d %B %Y',
765 '%d %b %Y',
766 '%B %d %Y',
767 '%b %d %Y',
768 '%b %dst %Y %I:%M%p',
769 '%b %dnd %Y %I:%M%p',
770 '%b %dth %Y %I:%M%p',
771 '%Y %m %d',
772 '%Y-%m-%d',
773 '%Y/%m/%d',
774 '%Y/%m/%d %H:%M:%S',
775 '%Y-%m-%d %H:%M:%S',
776 '%Y-%m-%d %H:%M:%S.%f',
777 '%d.%m.%Y %H:%M',
778 '%d.%m.%Y %H.%M',
779 '%Y-%m-%dT%H:%M:%SZ',
780 '%Y-%m-%dT%H:%M:%S.%fZ',
781 '%Y-%m-%dT%H:%M:%S.%f0Z',
782 '%Y-%m-%dT%H:%M:%S',
783 '%Y-%m-%dT%H:%M:%S.%f',
784 '%Y-%m-%dT%H:%M',
785 ]
786 if day_first:
787 format_expressions.extend([
788 '%d-%m-%Y',
789 '%d.%m.%Y',
790 '%d/%m/%Y',
791 '%d/%m/%y',
792 '%d/%m/%Y %H:%M:%S',
793 ])
794 else:
795 format_expressions.extend([
796 '%m-%d-%Y',
797 '%m.%d.%Y',
798 '%m/%d/%Y',
799 '%m/%d/%y',
800 '%m/%d/%Y %H:%M:%S',
801 ])
802 for expression in format_expressions:
803 try:
804 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
805 except ValueError:
806 pass
807 if upload_date is None:
808 timetuple = email.utils.parsedate_tz(date_str)
809 if timetuple:
810 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
811 return upload_date
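# Illustrative usage (examples added for clarity; dates are made up):
#     unified_strdate('December 21, 2014')            # -> '20141221'
#     unified_strdate('11/12/2014')                   # -> '20141211' (day first)
#     unified_strdate('11/12/2014', day_first=False)  # -> '20141112'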
812
813
814 def determine_ext(url, default_ext='unknown_video'):
815 if url is None:
816 return default_ext
817 guess = url.partition('?')[0].rpartition('.')[2]
818 if re.match(r'^[A-Za-z0-9]+$', guess):
819 return guess
820 else:
821 return default_ext
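# Illustrative usage (examples added for clarity; URLs are made up):
#     determine_ext('http://example.com/video.mp4?start=10')  # -> 'mp4'
#     determine_ext('http://example.com/watch?v=abc123')      # -> 'unknown_video'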
822
823
824 def subtitles_filename(filename, sub_lang, sub_format):
825 return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
826
827
828 def date_from_str(date_str):
829 """
830 Return a datetime object from a string in the format YYYYMMDD or
831 (now|today)[+-][0-9](day|week|month|year)(s)?"""
832 today = datetime.date.today()
833 if date_str in ('now', 'today'):
834 return today
835 if date_str == 'yesterday':
836 return today - datetime.timedelta(days=1)
837 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
838 if match is not None:
839 sign = match.group('sign')
840 time = int(match.group('time'))
841 if sign == '-':
842 time = -time
843 unit = match.group('unit')
844 # A bad approximation?
845 if unit == 'month':
846 unit = 'day'
847 time *= 30
848 elif unit == 'year':
849 unit = 'day'
850 time *= 365
851 unit += 's'
852 delta = datetime.timedelta(**{unit: time})
853 return today + delta
854 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
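# Illustrative usage (examples added for clarity):
#     date_from_str('20150101')   # -> datetime.date(2015, 1, 1)
#     date_from_str('now-1week')  # -> today's date minus seven days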
855
856
857 def hyphenate_date(date_str):
858 """
859 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
860 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
861 if match is not None:
862 return '-'.join(match.groups())
863 else:
864 return date_str
865
866
867 class DateRange(object):
868 """Represents a time interval between two dates"""
869
870 def __init__(self, start=None, end=None):
871 """start and end must be strings in the format accepted by date"""
872 if start is not None:
873 self.start = date_from_str(start)
874 else:
875 self.start = datetime.datetime.min.date()
876 if end is not None:
877 self.end = date_from_str(end)
878 else:
879 self.end = datetime.datetime.max.date()
880 if self.start > self.end:
881 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
882
883 @classmethod
884 def day(cls, day):
885 """Returns a range that only contains the given day"""
886 return cls(day, day)
887
888 def __contains__(self, date):
889 """Check if the date is in the range"""
890 if not isinstance(date, datetime.date):
891 date = date_from_str(date)
892 return self.start <= date <= self.end
893
894 def __str__(self):
895 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
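# Illustrative usage (example added for clarity; dates are made up):
#     '20150115' in DateRange('20150101', '20150131')  # -> True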
896
897
898 def platform_name():
899 """ Returns the platform name as a compat_str """
900 res = platform.platform()
901 if isinstance(res, bytes):
902 res = res.decode(preferredencoding())
903
904 assert isinstance(res, compat_str)
905 return res
906
907
908 def _windows_write_string(s, out):
909 """ Returns True if the string was written using special methods,
910 False if it has yet to be written out."""
911 # Adapted from http://stackoverflow.com/a/3259271/35070
912
913 import ctypes
914 import ctypes.wintypes
915
916 WIN_OUTPUT_IDS = {
917 1: -11,
918 2: -12,
919 }
920
921 try:
922 fileno = out.fileno()
923 except AttributeError:
924 # If the output stream doesn't have a fileno, it's virtual
925 return False
926 except io.UnsupportedOperation:
927 # Some strange Windows pseudo files?
928 return False
929 if fileno not in WIN_OUTPUT_IDS:
930 return False
931
932 GetStdHandle = ctypes.WINFUNCTYPE(
933 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
934 (b"GetStdHandle", ctypes.windll.kernel32))
935 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
936
937 WriteConsoleW = ctypes.WINFUNCTYPE(
938 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
939 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
940 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
941 written = ctypes.wintypes.DWORD(0)
942
943 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
944 FILE_TYPE_CHAR = 0x0002
945 FILE_TYPE_REMOTE = 0x8000
946 GetConsoleMode = ctypes.WINFUNCTYPE(
947 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
948 ctypes.POINTER(ctypes.wintypes.DWORD))(
949 (b"GetConsoleMode", ctypes.windll.kernel32))
950 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
951
952 def not_a_console(handle):
953 if handle == INVALID_HANDLE_VALUE or handle is None:
954 return True
955 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
956 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
957
958 if not_a_console(h):
959 return False
960
961 def next_nonbmp_pos(s):
962 try:
963 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
964 except StopIteration:
965 return len(s)
966
967 while s:
968 count = min(next_nonbmp_pos(s), 1024)
969
970 ret = WriteConsoleW(
971 h, s, count if count else 2, ctypes.byref(written), None)
972 if ret == 0:
973 raise OSError('Failed to write string')
974 if not count: # We just wrote a non-BMP character
975 assert written.value == 2
976 s = s[1:]
977 else:
978 assert written.value > 0
979 s = s[written.value:]
980 return True
981
982
983 def write_string(s, out=None, encoding=None):
984 if out is None:
985 out = sys.stderr
986 assert type(s) == compat_str
987
988 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
989 if _windows_write_string(s, out):
990 return
991
992 if ('b' in getattr(out, 'mode', '') or
993 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
994 byt = s.encode(encoding or preferredencoding(), 'ignore')
995 out.write(byt)
996 elif hasattr(out, 'buffer'):
997 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
998 byt = s.encode(enc, 'ignore')
999 out.buffer.write(byt)
1000 else:
1001 out.write(s)
1002 out.flush()
1003
1004
1005 def bytes_to_intlist(bs):
1006 if not bs:
1007 return []
1008 if isinstance(bs[0], int): # Python 3
1009 return list(bs)
1010 else:
1011 return [ord(c) for c in bs]
1012
1013
1014 def intlist_to_bytes(xs):
1015 if not xs:
1016 return b''
1017 return struct_pack('%dB' % len(xs), *xs)
1018
1019
1020 # Cross-platform file locking
1021 if sys.platform == 'win32':
1022 import ctypes.wintypes
1023 import msvcrt
1024
1025 class OVERLAPPED(ctypes.Structure):
1026 _fields_ = [
1027 ('Internal', ctypes.wintypes.LPVOID),
1028 ('InternalHigh', ctypes.wintypes.LPVOID),
1029 ('Offset', ctypes.wintypes.DWORD),
1030 ('OffsetHigh', ctypes.wintypes.DWORD),
1031 ('hEvent', ctypes.wintypes.HANDLE),
1032 ]
1033
1034 kernel32 = ctypes.windll.kernel32
1035 LockFileEx = kernel32.LockFileEx
1036 LockFileEx.argtypes = [
1037 ctypes.wintypes.HANDLE, # hFile
1038 ctypes.wintypes.DWORD, # dwFlags
1039 ctypes.wintypes.DWORD, # dwReserved
1040 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1041 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1042 ctypes.POINTER(OVERLAPPED) # Overlapped
1043 ]
1044 LockFileEx.restype = ctypes.wintypes.BOOL
1045 UnlockFileEx = kernel32.UnlockFileEx
1046 UnlockFileEx.argtypes = [
1047 ctypes.wintypes.HANDLE, # hFile
1048 ctypes.wintypes.DWORD, # dwReserved
1049 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1050 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1051 ctypes.POINTER(OVERLAPPED) # Overlapped
1052 ]
1053 UnlockFileEx.restype = ctypes.wintypes.BOOL
1054 whole_low = 0xffffffff
1055 whole_high = 0x7fffffff
1056
1057 def _lock_file(f, exclusive):
1058 overlapped = OVERLAPPED()
1059 overlapped.Offset = 0
1060 overlapped.OffsetHigh = 0
1061 overlapped.hEvent = 0
1062 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1063 handle = msvcrt.get_osfhandle(f.fileno())
1064 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1065 whole_low, whole_high, f._lock_file_overlapped_p):
1066 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1067
1068 def _unlock_file(f):
1069 assert f._lock_file_overlapped_p
1070 handle = msvcrt.get_osfhandle(f.fileno())
1071 if not UnlockFileEx(handle, 0,
1072 whole_low, whole_high, f._lock_file_overlapped_p):
1073 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1074
1075 else:
1076 import fcntl
1077
1078 def _lock_file(f, exclusive):
1079 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1080
1081 def _unlock_file(f):
1082 fcntl.flock(f, fcntl.LOCK_UN)
1083
1084
1085 class locked_file(object):
1086 def __init__(self, filename, mode, encoding=None):
1087 assert mode in ['r', 'a', 'w']
1088 self.f = io.open(filename, mode, encoding=encoding)
1089 self.mode = mode
1090
1091 def __enter__(self):
1092 exclusive = self.mode != 'r'
1093 try:
1094 _lock_file(self.f, exclusive)
1095 except IOError:
1096 self.f.close()
1097 raise
1098 return self
1099
1100 def __exit__(self, etype, value, traceback):
1101 try:
1102 _unlock_file(self.f)
1103 finally:
1104 self.f.close()
1105
1106 def __iter__(self):
1107 return iter(self.f)
1108
1109 def write(self, *args):
1110 return self.f.write(*args)
1111
1112 def read(self, *args):
1113 return self.f.read(*args)
1114
1115
1116 def get_filesystem_encoding():
1117 encoding = sys.getfilesystemencoding()
1118 return encoding if encoding is not None else 'utf-8'
1119
1120
1121 def shell_quote(args):
1122 quoted_args = []
1123 encoding = get_filesystem_encoding()
1124 for a in args:
1125 if isinstance(a, bytes):
1126 # We may get a filename encoded with 'encodeFilename'
1127 a = a.decode(encoding)
1128 quoted_args.append(pipes.quote(a))
1129 return ' '.join(quoted_args)
1130
1131
1132 def smuggle_url(url, data):
1133 """ Pass additional data in a URL for internal use. """
1134
1135 sdata = compat_urllib_parse.urlencode(
1136 {'__youtubedl_smuggle': json.dumps(data)})
1137 return url + '#' + sdata
1138
1139
1140 def unsmuggle_url(smug_url, default=None):
1141 if '#__youtubedl_smuggle' not in smug_url:
1142 return smug_url, default
1143 url, _, sdata = smug_url.rpartition('#')
1144 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1145 data = json.loads(jsond)
1146 return url, data
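# Illustrative round trip (example added for clarity; the payload dict is
# arbitrary made-up data):
#     url = smuggle_url('http://example.com/video', {'force': True})
#     unsmuggle_url(url)  # -> ('http://example.com/video', {'force': True})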
1147
1148
1149 def format_bytes(bytes):
1150 if bytes is None:
1151 return 'N/A'
1152 if type(bytes) is str:
1153 bytes = float(bytes)
1154 if bytes == 0.0:
1155 exponent = 0
1156 else:
1157 exponent = int(math.log(bytes, 1024.0))
1158 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1159 converted = float(bytes) / float(1024 ** exponent)
1160 return '%.2f%s' % (converted, suffix)
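# Illustrative usage (examples added for clarity):
#     format_bytes(1536)  # -> '1.50KiB'
#     format_bytes(None)  # -> 'N/A'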
1161
1162
1163 def parse_filesize(s):
1164 if s is None:
1165 return None
1166
1167 # The lower-case forms are of course incorrect and unofficial,
1168 # but we support those too
1169 _UNIT_TABLE = {
1170 'B': 1,
1171 'b': 1,
1172 'KiB': 1024,
1173 'KB': 1000,
1174 'kB': 1024,
1175 'Kb': 1000,
1176 'MiB': 1024 ** 2,
1177 'MB': 1000 ** 2,
1178 'mB': 1024 ** 2,
1179 'Mb': 1000 ** 2,
1180 'GiB': 1024 ** 3,
1181 'GB': 1000 ** 3,
1182 'gB': 1024 ** 3,
1183 'Gb': 1000 ** 3,
1184 'TiB': 1024 ** 4,
1185 'TB': 1000 ** 4,
1186 'tB': 1024 ** 4,
1187 'Tb': 1000 ** 4,
1188 'PiB': 1024 ** 5,
1189 'PB': 1000 ** 5,
1190 'pB': 1024 ** 5,
1191 'Pb': 1000 ** 5,
1192 'EiB': 1024 ** 6,
1193 'EB': 1000 ** 6,
1194 'eB': 1024 ** 6,
1195 'Eb': 1000 ** 6,
1196 'ZiB': 1024 ** 7,
1197 'ZB': 1000 ** 7,
1198 'zB': 1024 ** 7,
1199 'Zb': 1000 ** 7,
1200 'YiB': 1024 ** 8,
1201 'YB': 1000 ** 8,
1202 'yB': 1024 ** 8,
1203 'Yb': 1000 ** 8,
1204 }
1205
1206 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1207 m = re.match(
1208 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1209 if not m:
1210 return None
1211
1212 num_str = m.group('num').replace(',', '.')
1213 mult = _UNIT_TABLE[m.group('unit')]
1214 return int(float(num_str) * mult)
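# Illustrative usage (examples added for clarity):
#     parse_filesize('5 MiB')   # -> 5242880
#     parse_filesize('500 KB')  # -> 500000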
1215
1216
1217 def month_by_name(name):
1218 """ Return the number of a month by (locale-independently) English name """
1219
1220 try:
1221 return ENGLISH_MONTH_NAMES.index(name) + 1
1222 except ValueError:
1223 return None
1224
1225
1226 def month_by_abbreviation(abbrev):
1227 """ Return the number of a month by (locale-independently) English
1228 abbreviations """
1229
1230 try:
1231 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1232 except ValueError:
1233 return None
1234
1235
1236 def fix_xml_ampersands(xml_str):
1237 """Replace all the '&' by '&amp;' in XML"""
1238 return re.sub(
1239 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1240 '&amp;',
1241 xml_str)
1242
1243
1244 def setproctitle(title):
1245 assert isinstance(title, compat_str)
1246 try:
1247 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1248 except OSError:
1249 return
1250 title_bytes = title.encode('utf-8')
1251 buf = ctypes.create_string_buffer(len(title_bytes))
1252 buf.value = title_bytes
1253 try:
1254 libc.prctl(15, buf, 0, 0, 0)
1255 except AttributeError:
1256 return # Strange libc, just skip this
1257
1258
1259 def remove_start(s, start):
1260 if s.startswith(start):
1261 return s[len(start):]
1262 return s
1263
1264
1265 def remove_end(s, end):
1266 if s.endswith(end):
1267 return s[:-len(end)]
1268 return s
1269
1270
1271 def url_basename(url):
1272 path = compat_urlparse.urlparse(url).path
1273 return path.strip('/').split('/')[-1]
1274
1275
1276 class HEADRequest(compat_urllib_request.Request):
1277 def get_method(self):
1278 return "HEAD"
1279
1280
1281 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1282 if get_attr:
1283 if v is not None:
1284 v = getattr(v, get_attr, None)
1285 if v == '':
1286 v = None
1287 return default if v is None else (int(v) * invscale // scale)
1288
1289
1290 def str_or_none(v, default=None):
1291 return default if v is None else compat_str(v)
1292
1293
1294 def str_to_int(int_str):
1295 """ A more relaxed version of int_or_none """
1296 if int_str is None:
1297 return None
1298 int_str = re.sub(r'[,\.\+]', '', int_str)
1299 return int(int_str)
1300
1301
1302 def float_or_none(v, scale=1, invscale=1, default=None):
1303 return default if v is None else (float(v) * invscale / scale)
1304
1305
1306 def parse_duration(s):
1307 if not isinstance(s, compat_basestring):
1308 return None
1309
1310 s = s.strip()
1311
1312 m = re.match(
1313 r'''(?ix)(?:P?T)?
1314 (?:
1315 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1316 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1317
1318 \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
1319 (?:
1320 (?:
1321 (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1322 (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1323 )?
1324 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1325 )?
1326 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1327 )$''', s)
1328 if not m:
1329 return None
1330 res = 0
1331 if m.group('only_mins'):
1332 return float_or_none(m.group('only_mins'), invscale=60)
1333 if m.group('only_hours'):
1334 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1335 if m.group('secs'):
1336 res += int(m.group('secs'))
1337 if m.group('mins_reversed'):
1338 res += int(m.group('mins_reversed')) * 60
1339 if m.group('mins'):
1340 res += int(m.group('mins')) * 60
1341 if m.group('hours'):
1342 res += int(m.group('hours')) * 60 * 60
1343 if m.group('hours_reversed'):
1344 res += int(m.group('hours_reversed')) * 60 * 60
1345 if m.group('days'):
1346 res += int(m.group('days')) * 24 * 60 * 60
1347 if m.group('ms'):
1348 res += float(m.group('ms'))
1349 return res
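# Illustrative usage (examples added for clarity):
#     parse_duration('1:30:45')  # -> 5445
#     parse_duration('3 min')    # -> 180.0
#     parse_duration('PT2H30M')  # -> 9000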
1350
1351
1352 def prepend_extension(filename, ext, expected_real_ext=None):
1353 name, real_ext = os.path.splitext(filename)
1354 return (
1355 '{0}.{1}{2}'.format(name, ext, real_ext)
1356 if not expected_real_ext or real_ext[1:] == expected_real_ext
1357 else '{0}.{1}'.format(filename, ext))
1358
1359
1360 def replace_extension(filename, ext, expected_real_ext=None):
1361 name, real_ext = os.path.splitext(filename)
1362 return '{0}.{1}'.format(
1363 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1364 ext)
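# Illustrative usage (examples added for clarity; file names are made up):
#     prepend_extension('video.mp4', 'temp')  # -> 'video.temp.mp4'
#     replace_extension('video.mp4', 'mkv')   # -> 'video.mkv'
#     replace_extension('video.unknown_video', 'mp4', expected_real_ext='webm')
#     # -> 'video.unknown_video.mp4' (only replaced when the real ext matches)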
1365
1366
1367 def check_executable(exe, args=[]):
1368 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1369 args can be a list of arguments for a short output (like -version) """
1370 try:
1371 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1372 except OSError:
1373 return False
1374 return exe
1375
1376
1377 def get_exe_version(exe, args=['--version'],
1378 version_re=None, unrecognized='present'):
1379 """ Returns the version of the specified executable,
1380 or False if the executable is not present """
1381 try:
1382 out, _ = subprocess.Popen(
1383 [exe] + args,
1384 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1385 except OSError:
1386 return False
1387 if isinstance(out, bytes): # Python 2.x
1388 out = out.decode('ascii', 'ignore')
1389 return detect_exe_version(out, version_re, unrecognized)
1390
1391
1392 def detect_exe_version(output, version_re=None, unrecognized='present'):
1393 assert isinstance(output, compat_str)
1394 if version_re is None:
1395 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1396 m = re.search(version_re, output)
1397 if m:
1398 return m.group(1)
1399 else:
1400 return unrecognized
1401
1402
1403 class PagedList(object):
1404 def __len__(self):
1405 # This is only useful for tests
1406 return len(self.getslice())
1407
1408
1409 class OnDemandPagedList(PagedList):
1410 def __init__(self, pagefunc, pagesize):
1411 self._pagefunc = pagefunc
1412 self._pagesize = pagesize
1413
1414 def getslice(self, start=0, end=None):
1415 res = []
1416 for pagenum in itertools.count(start // self._pagesize):
1417 firstid = pagenum * self._pagesize
1418 nextfirstid = pagenum * self._pagesize + self._pagesize
1419 if start >= nextfirstid:
1420 continue
1421
1422 page_results = list(self._pagefunc(pagenum))
1423
1424 startv = (
1425 start % self._pagesize
1426 if firstid <= start < nextfirstid
1427 else 0)
1428
1429 endv = (
1430 ((end - 1) % self._pagesize) + 1
1431 if (end is not None and firstid <= end <= nextfirstid)
1432 else None)
1433
1434 if startv != 0 or endv is not None:
1435 page_results = page_results[startv:endv]
1436 res.extend(page_results)
1437
1438 # A little optimization - if the current page is not "full", i.e. does
1439 # not contain page_size videos, then we can assume that this page
1440 # is the last one - there are no more ids on further pages -
1441 # so there is no need to query again.
1442 if len(page_results) + startv < self._pagesize:
1443 break
1444
1445 # If we got the whole page, but the next page is not interesting,
1446 # break out early as well
1447 if end == nextfirstid:
1448 break
1449 return res
1450
1451
1452 class InAdvancePagedList(PagedList):
1453 def __init__(self, pagefunc, pagecount, pagesize):
1454 self._pagefunc = pagefunc
1455 self._pagecount = pagecount
1456 self._pagesize = pagesize
1457
1458 def getslice(self, start=0, end=None):
1459 res = []
1460 start_page = start // self._pagesize
1461 end_page = (
1462 self._pagecount if end is None else (end // self._pagesize + 1))
1463 skip_elems = start - start_page * self._pagesize
1464 only_more = None if end is None else end - start
1465 for pagenum in range(start_page, end_page):
1466 page = list(self._pagefunc(pagenum))
1467 if skip_elems:
1468 page = page[skip_elems:]
1469 skip_elems = None
1470 if only_more is not None:
1471 if len(page) < only_more:
1472 only_more -= len(page)
1473 else:
1474 page = page[:only_more]
1475 res.extend(page)
1476 break
1477 res.extend(page)
1478 return res
1479
1480
1481 def uppercase_escape(s):
1482 unicode_escape = codecs.getdecoder('unicode_escape')
1483 return re.sub(
1484 r'\\U[0-9a-fA-F]{8}',
1485 lambda m: unicode_escape(m.group(0))[0],
1486 s)
1487
1488
1489 def escape_rfc3986(s):
1490 """Escape non-ASCII characters as suggested by RFC 3986"""
1491 if sys.version_info < (3, 0) and isinstance(s, compat_str):
1492 s = s.encode('utf-8')
1493 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1494
1495
1496 def escape_url(url):
1497 """Escape URL as suggested by RFC 3986"""
1498 url_parsed = compat_urllib_parse_urlparse(url)
1499 return url_parsed._replace(
1500 path=escape_rfc3986(url_parsed.path),
1501 params=escape_rfc3986(url_parsed.params),
1502 query=escape_rfc3986(url_parsed.query),
1503 fragment=escape_rfc3986(url_parsed.fragment)
1504 ).geturl()
1505
1506 try:
1507 struct.pack('!I', 0)
1508 except TypeError:
1509 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1510 def struct_pack(spec, *args):
1511 if isinstance(spec, compat_str):
1512 spec = spec.encode('ascii')
1513 return struct.pack(spec, *args)
1514
1515 def struct_unpack(spec, *args):
1516 if isinstance(spec, compat_str):
1517 spec = spec.encode('ascii')
1518 return struct.unpack(spec, *args)
1519 else:
1520 struct_pack = struct.pack
1521 struct_unpack = struct.unpack
1522
1523
1524 def read_batch_urls(batch_fd):
1525 def fixup(url):
1526 if not isinstance(url, compat_str):
1527 url = url.decode('utf-8', 'replace')
1528 BOM_UTF8 = '\xef\xbb\xbf'
1529 if url.startswith(BOM_UTF8):
1530 url = url[len(BOM_UTF8):]
1531 url = url.strip()
1532 if url.startswith(('#', ';', ']')):
1533 return False
1534 return url
1535
1536 with contextlib.closing(batch_fd) as fd:
1537 return [url for url in map(fixup, fd) if url]
1538
1539
1540 def urlencode_postdata(*args, **kargs):
1541 return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1542
1543
1544 try:
1545 etree_iter = xml.etree.ElementTree.Element.iter
1546 except AttributeError: # Python <=2.6
1547 etree_iter = lambda n: n.findall('.//*')
1548
1549
1550 def parse_xml(s):
1551 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1552 def doctype(self, name, pubid, system):
1553 pass # Ignore doctypes
1554
1555 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1556 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1557 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1558 # Fix up XML parser in Python 2.x
1559 if sys.version_info < (3, 0):
1560 for n in etree_iter(tree):
1561 if n.text is not None:
1562 if not isinstance(n.text, compat_str):
1563 n.text = n.text.decode('utf-8')
1564 return tree
1565
1566
1567 US_RATINGS = {
1568 'G': 0,
1569 'PG': 10,
1570 'PG-13': 13,
1571 'R': 16,
1572 'NC': 18,
1573 }
1574
1575
1576 def parse_age_limit(s):
1577 if s is None:
1578 return None
1579 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1580 return int(m.group('age')) if m else US_RATINGS.get(s, None)
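# Illustrative usage (examples added for clarity):
#     parse_age_limit('18+')    # -> 18
#     parse_age_limit('PG-13')  # -> 13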
1581
1582
1583 def strip_jsonp(code):
1584 return re.sub(
1585 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1586
1587
1588 def js_to_json(code):
1589 def fix_kv(m):
1590 v = m.group(0)
1591 if v in ('true', 'false', 'null'):
1592 return v
1593 if v.startswith('"'):
1594 return v
1595 if v.startswith("'"):
1596 v = v[1:-1]
1597 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1598 '\\\\': '\\\\',
1599 "\\'": "'",
1600 '"': '\\"',
1601 }[m.group(0)], v)
1602 return '"%s"' % v
1603
1604 res = re.sub(r'''(?x)
1605 "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
1606 '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
1607 [a-zA-Z_][.a-zA-Z_0-9]*
1608 ''', fix_kv, code)
1609 res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
1610 return res
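# Illustrative usage (example added for clarity; the input is made up):
#     js_to_json("{foo: 'bar', baz: [1, 2,]}")
#     # -> '{"foo": "bar", "baz": [1, 2]}'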
1611
1612
1613 def qualities(quality_ids):
1614 """ Get a numeric quality value out of a list of possible values """
1615 def q(qid):
1616 try:
1617 return quality_ids.index(qid)
1618 except ValueError:
1619 return -1
1620 return q
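# Illustrative usage (example added for clarity; the format ids are made up):
#     q = qualities(['240p', '360p', '720p'])
#     q('360p')  # -> 1
#     q('4k')    # -> -1 (unknown qualities sort lowest)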
1621
1622
1623 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1624
1625
1626 def limit_length(s, length):
1627 """ Add ellipses to overly long strings """
1628 if s is None:
1629 return None
1630 ELLIPSES = '...'
1631 if len(s) > length:
1632 return s[:length - len(ELLIPSES)] + ELLIPSES
1633 return s
1634
1635
1636 def version_tuple(v):
1637 return tuple(int(e) for e in re.split(r'[-.]', v))
1638
1639
1640 def is_outdated_version(version, limit, assume_new=True):
1641 if not version:
1642 return not assume_new
1643 try:
1644 return version_tuple(version) < version_tuple(limit)
1645 except ValueError:
1646 return not assume_new
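# Illustrative usage (examples added for clarity; version strings are made up):
#     version_tuple('2015.04.17')                      # -> (2015, 4, 17)
#     is_outdated_version('2015.01.01', '2015.04.17')  # -> True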
1647
1648
1649 def ytdl_is_updateable():
1650 """ Returns if youtube-dl can be updated with -U """
1651 from zipimport import zipimporter
1652
1653 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
1654
1655
1656 def args_to_str(args):
1657 # Get a short string representation for a subprocess command
1658 return ' '.join(shlex_quote(a) for a in args)
1659
1660
1661 def mimetype2ext(mt):
1662 _, _, res = mt.rpartition('/')
1663
1664 return {
1665 'x-ms-wmv': 'wmv',
1666 'x-mp4-fragmented': 'mp4',
1667 }.get(res, res)
1668
1669
1670 def urlhandle_detect_ext(url_handle):
1671 try:
1672 url_handle.headers
1673 getheader = lambda h: url_handle.headers[h]
1674 except AttributeError: # Python < 3
1675 getheader = url_handle.info().getheader
1676
1677 cd = getheader('Content-Disposition')
1678 if cd:
1679 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1680 if m:
1681 e = determine_ext(m.group('filename'), default_ext=None)
1682 if e:
1683 return e
1684
1685 return mimetype2ext(getheader('Content-Type'))
1686
1687
1688 def age_restricted(content_limit, age_limit):
1689 """ Returns True iff the content should be blocked """
1690
1691 if age_limit is None: # No limit set
1692 return False
1693 if content_limit is None:
1694 return False # Content available for everyone
1695 return age_limit < content_limit
1696
1697
1698 def is_html(first_bytes):
1699 """ Detect whether a file contains HTML by examining its first bytes. """
1700
1701 BOMS = [
1702 (b'\xef\xbb\xbf', 'utf-8'),
1703 (b'\x00\x00\xfe\xff', 'utf-32-be'),
1704 (b'\xff\xfe\x00\x00', 'utf-32-le'),
1705 (b'\xff\xfe', 'utf-16-le'),
1706 (b'\xfe\xff', 'utf-16-be'),
1707 ]
1708 for bom, enc in BOMS:
1709 if first_bytes.startswith(bom):
1710 s = first_bytes[len(bom):].decode(enc, 'replace')
1711 break
1712 else:
1713 s = first_bytes.decode('utf-8', 'replace')
1714
1715 return re.match(r'^\s*<', s)
1716
1717
1718 def determine_protocol(info_dict):
1719 protocol = info_dict.get('protocol')
1720 if protocol is not None:
1721 return protocol
1722
1723 url = info_dict['url']
1724 if url.startswith('rtmp'):
1725 return 'rtmp'
1726 elif url.startswith('mms'):
1727 return 'mms'
1728 elif url.startswith('rtsp'):
1729 return 'rtsp'
1730
1731 ext = determine_ext(url)
1732 if ext == 'm3u8':
1733 return 'm3u8'
1734 elif ext == 'f4m':
1735 return 'f4m'
1736
1737 return compat_urllib_parse_urlparse(url).scheme
1738
1739
1740 def render_table(header_row, data):
1741 """ Render a list of rows, each as a list of values """
1742 table = [header_row] + data
1743 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
1744 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
1745 return '\n'.join(format_str % tuple(row) for row in table)
1746
1747
1748 def _match_one(filter_part, dct):
1749 COMPARISON_OPERATORS = {
1750 '<': operator.lt,
1751 '<=': operator.le,
1752 '>': operator.gt,
1753 '>=': operator.ge,
1754 '=': operator.eq,
1755 '!=': operator.ne,
1756 }
1757 operator_rex = re.compile(r'''(?x)\s*
1758 (?P<key>[a-z_]+)
1759 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1760 (?:
1761 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1762 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1763 )
1764 \s*$
1765 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1766 m = operator_rex.search(filter_part)
1767 if m:
1768 op = COMPARISON_OPERATORS[m.group('op')]
1769 if m.group('strval') is not None:
1770 if m.group('op') not in ('=', '!='):
1771 raise ValueError(
1772 'Operator %s does not support string values!' % m.group('op'))
1773 comparison_value = m.group('strval')
1774 else:
1775 try:
1776 comparison_value = int(m.group('intval'))
1777 except ValueError:
1778 comparison_value = parse_filesize(m.group('intval'))
1779 if comparison_value is None:
1780 comparison_value = parse_filesize(m.group('intval') + 'B')
1781 if comparison_value is None:
1782 raise ValueError(
1783 'Invalid integer value %r in filter part %r' % (
1784 m.group('intval'), filter_part))
1785 actual_value = dct.get(m.group('key'))
1786 if actual_value is None:
1787 return m.group('none_inclusive')
1788 return op(actual_value, comparison_value)
1789
1790 UNARY_OPERATORS = {
1791 '': lambda v: v is not None,
1792 '!': lambda v: v is None,
1793 }
1794 operator_rex = re.compile(r'''(?x)\s*
1795 (?P<op>%s)\s*(?P<key>[a-z_]+)
1796 \s*$
1797 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1798 m = operator_rex.search(filter_part)
1799 if m:
1800 op = UNARY_OPERATORS[m.group('op')]
1801 actual_value = dct.get(m.group('key'))
1802 return op(actual_value)
1803
1804 raise ValueError('Invalid filter part %r' % filter_part)
1805
1806
1807 def match_str(filter_str, dct):
1808 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
1809
1810 return all(
1811 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
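# Illustrative usage (example added for clarity; the field values are made up):
#     match_str('duration > 600 & like_count >? 100',
#               {'duration': 700, 'like_count': None})  # -> True
# ('>?' makes a comparison pass when the field is missing or None)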
1812
1813
1814 def match_filter_func(filter_str):
1815 def _match_func(info_dict):
1816 if match_str(filter_str, info_dict):
1817 return None
1818 else:
1819 video_title = info_dict.get('title', info_dict.get('id', 'video'))
1820 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
1821 return _match_func
1822
1823
1824 def parse_dfxp_time_expr(time_expr):
1825 if not time_expr:
1826 return 0.0
1827
1828 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
1829 if mobj:
1830 return float(mobj.group('time_offset'))
1831
1832 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
1833 if mobj:
1834 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
1835
1836
1837 def format_srt_time(seconds):
1838 (mins, secs) = divmod(seconds, 60)
1839 (hours, mins) = divmod(mins, 60)
1840 millisecs = (secs - int(secs)) * 1000
1841 secs = int(secs)
1842 return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millisecs)
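# Illustrative usage of the two helpers above (examples added for clarity):
#     parse_dfxp_time_expr('00:01:30.5')  # -> 90.5
#     format_srt_time(90.5)               # -> '00:01:30,500'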
1843
1844
1845 def dfxp2srt(dfxp_data):
1846 _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})
1847
1848 def parse_node(node):
1849 str_or_empty = functools.partial(str_or_none, default='')
1850
1851 out = str_or_empty(node.text)
1852
1853 for child in node:
1854 if child.tag == _x('ttml:br'):
1855 out += '\n' + str_or_empty(child.tail)
1856 elif child.tag == _x('ttml:span'):
1857 out += str_or_empty(parse_node(child))
1858 else:
1859 out += str_or_empty(xml.etree.ElementTree.tostring(child))
1860
1861 return out
1862
1863 dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
1864 out = []
1865 paras = dfxp.findall(_x('.//ttml:p'))
1866
1867 for para, index in zip(paras, itertools.count(1)):
1868 out.append('%d\n%s --> %s\n%s\n\n' % (
1869 index,
1870 format_srt_time(parse_dfxp_time_expr(para.attrib.get('begin'))),
1871 format_srt_time(parse_dfxp_time_expr(para.attrib.get('end'))),
1872 parse_node(para)))
1873
1874 return ''.join(out)
1875
1876
1877 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
1878 def __init__(self, proxies=None):
1879 # Set default handlers
1880 for type in ('http', 'https'):
1881 setattr(self, '%s_open' % type,
1882 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
1883 meth(r, proxy, type))
1884 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
1885
1886 def proxy_open(self, req, proxy, type):
1887 req_proxy = req.headers.get('Ytdl-request-proxy')
1888 if req_proxy is not None:
1889 proxy = req_proxy
1890 del req.headers['Ytdl-request-proxy']
1891
1892 if proxy == '__noproxy__':
1893 return None # No Proxy
1894 return compat_urllib_request.ProxyHandler.proxy_open(
1895 self, req, proxy, type)