]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
[vimeo] Fix password protected videos again (#5082)
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import calendar
7 import codecs
8 import contextlib
9 import ctypes
10 import datetime
11 import email.utils
12 import errno
13 import functools
14 import gzip
15 import itertools
16 import io
17 import json
18 import locale
19 import math
20 import operator
21 import os
22 import pipes
23 import platform
24 import re
25 import ssl
26 import socket
27 import struct
28 import subprocess
29 import sys
30 import tempfile
31 import traceback
32 import xml.etree.ElementTree
33 import zlib
34
35 from .compat import (
36 compat_basestring,
37 compat_chr,
38 compat_html_entities,
39 compat_http_client,
40 compat_kwargs,
41 compat_parse_qs,
42 compat_socket_create_connection,
43 compat_str,
44 compat_urllib_error,
45 compat_urllib_parse,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
48 compat_urlparse,
49 shlex_quote,
50 )
51
52
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers; YoutubeDLHandler.http_request adds any of these that
# the caller did not already set on the request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}

# English month names, indexed by (month number - 1)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
68
69
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        enc = locale.getpreferredencoding()
        # Make sure the reported encoding actually works before trusting it
        'TEST'.encode(enc)
    except Exception:
        enc = 'UTF-8'
    return enc
83
84
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible.

    The JSON is first written to a temporary file in the same directory
    and then renamed over fn, so readers never observe a half-written
    file.  On failure the temporary file is removed and the exception
    re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        # (fix: operate on the lambda argument f instead of silently
        # closing over fn, which made the parameter dead code)
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temporary file, then propagate
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
137
138
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the XPath expression, so they
        # are restricted to safe character sets first
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + "[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] (manual scan for Python 2.6) """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        # 2.6 ElementTree lacks attribute predicates; scan matches by hand
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None
157
158 # On python2.6 the xml.etree.ElementTree.Element methods don't support
159 # the namespace parameter
160
161
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath using the prefix->URI ns_map.

    E.g. xpath_with_ns('ns:a/b', {'ns': 'http://x'}) -> '{http://x}a/b'.
    """
    expanded = []
    for step in path.split('/'):
        pieces = step.split(':')
        if len(pieces) == 1:
            # No namespace prefix on this step
            expanded.append(pieces[0])
        else:
            prefix, tag = pieces
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
172
173
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath, or None.

    With fatal=True, a missing element (or one without text) raises
    ExtractorError, labelled with name (falling back to the xpath itself).
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None and found.text is not None:
        return found.text
    if not fatal:
        return None
    raise ExtractorError(
        'Could not find XML element %s' % (xpath if name is None else name))
186
187
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin convenience wrapper over the generic attribute lookup
    return get_element_by_attribute('id', id, html)
191
192
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # Match an opening tag that carries attribute=value (any quoting style),
    # allowing other attributes before/after, and capture everything up to
    # the matching closing tag (\1 backreference on the tag name).
    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s+%s=['"]?%s['"]?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if not m:
        return None
    res = m.group('content')

    # Strip one level of surrounding quotes, if present
    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    # Resolve HTML entities before returning
    return unescapeHTML(res)
214
215
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />: literal newlines are layout noise, <br> and
    # paragraph breaks are the real line breaks
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
231
232
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            # '-' means standard output
            if sys.platform == 'win32':
                import msvcrt
                # Switch stdout to binary mode so byte output is not mangled
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission error will not be fixed by renaming; re-raise as-is
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
263
264
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not a parseable RFC 2822 date
        return None
    return email.utils.mktime_tz(parsed)
272
273
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Control characters and '?' are always dropped
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    # Handle timestamps: turn 12:34:56 into 12_34_56 before the general pass
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(c) for c in s)
    if not is_id:
        # Collapse runs of underscores and trim them from the ends
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[1:]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
310
311
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # No-op on non-Windows platforms
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        # Old Pythons: splitdrive does not handle UNC paths
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters forbidden in Windows path components (and a
    # trailing dot) with '#', leaving '.'/'..' components intact
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
328
329
def sanitize_url_path_consecutive_slashes(url):
    """Collapses consecutive slashes in URLs' path"""
    parts = list(compat_urlparse.urlparse(url))
    # Index 2 is the path component; scheme/netloc/query are untouched
    parts[2] = re.sub(r'/{2,}', '/', parts[2])
    return compat_urlparse.urlunparse(parts)
335
336
def orderedSet(iterable):
    """Return the iterable's elements as a list with duplicates removed,
    keeping the first occurrence of each.

    Uses a linear membership scan on purpose so unhashable elements work.
    """
    unique = []
    for item in iterable:
        if item not in unique:
            unique.append(item)
    return unique
344
345
346 def _htmlentity_transform(entity):
347 """Transforms an HTML entity to a character."""
348 # Known non-numeric HTML entity
349 if entity in compat_html_entities.name2codepoint:
350 return compat_chr(compat_html_entities.name2codepoint[entity])
351
352 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
353 if mobj is not None:
354 numstr = mobj.group(1)
355 if numstr.startswith('x'):
356 base = 16
357 numstr = '0%s' % numstr
358 else:
359 base = 10
360 return compat_chr(int(numstr, base))
361
362 # Unknown entity in name, return its literal representation
363 return ('&%s;' % entity)
364
365
def unescapeHTML(s):
    """Resolve all HTML entities in s; None passes through unchanged."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _replace(m):
        return _htmlentity_transform(m.group(1))

    return re.sub(r'&([^;]+);', _replace, s)
373
374
def get_subprocess_encoding():
    """Return the encoding used to talk to subprocesses on this platform."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    encoding = sys.getfilesystemencoding()
    # Fall back to UTF-8 when the platform reports no encoding
    return encoding if encoding is not None else 'utf-8'
385
386
def encodeFilename(s, for_subprocess=False):
    """Encode a filename for the OS (or a subprocess command line).

    @param s The name of the file
    """
    assert type(s) == compat_str

    if sys.version_info >= (3, 0):
        # Python 3 has a Unicode API
        return s

    # Windows NT 5+ (2000 and up) takes unicode filenames directly, except
    # when the value is destined for a subprocess command line.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5 and not for_subprocess:
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
405
406
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename: decode byte filenames on Python 2;
    everything passes through unchanged on Python 3."""
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
416
417
def encodeArgument(s):
    """Encode a command-line argument for subprocess use."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
425
426
def decodeArgument(b):
    """Decode a command-line argument (inverse of encodeArgument)."""
    return decodeFilename(b, True)
429
430
def decodeOption(optval):
    """Return an option value as unicode text; None passes through."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        # Byte-string options come from the locale-encoded command line
        optval = optval.decode(preferredencoding())
    assert isinstance(optval, compat_str)
    return optval
439
440
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    Boundary fix: exactly 3600 seconds now renders as '1:00:00'
    (previously '60:00') and exactly 60 seconds as '1:00' (previously '60'),
    because the original comparisons were strict (>).
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
448
449
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler whose TLS context matches the running
    Python version and the 'nocheckcertificate' option in params."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # Disable both hostname and certificate verification
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # No usable SSLContext support in HTTPSHandler on these versions
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        # Build a TLS context by hand with system default CA paths
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
473
474
def bug_reports_message():
    """Build the standard bug-report footer appended to unexpected errors."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    parts = [
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ]
    return ''.join(parts)
484
485
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network-level failures are always "expected" (not youtube-dl bugs);
        # sys.exc_info() must be inspected here, at construction time
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors carry the bug-report footer
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback rendered as a string, or None."""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
513
514
class UnsupportedError(ExtractorError):
    """Raised when no extractor recognizes the given URL."""

    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url
520
521
class RegexNotFoundError(ExtractorError):
    """Raised when an expected regular expression did not match."""
    pass
525
526
class DownloadError(Exception):
    """Download Error exception.

    Raised by FileDownloader objects when they are not configured to
    continue on errors; carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
539
540
class SameFileError(Exception):
    """Same File exception.

    Raised by FileDownloader objects when multiple files would have to be
    downloaded to the same file on disk.
    """
    pass
548
549
class PostProcessingError(Exception):
    """Post Processing exception.

    Raised by a PostProcessor's .run() method to signal an error in the
    postprocessing task; the message is stored on .msg.
    """

    def __init__(self, msg):
        self.msg = msg
559
560
class MaxDownloadsReached(Exception):
    """Raised once the --max-downloads limit has been reached."""
    pass
564
565
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Raised when a video is requested in a format that is not available
    for that video.
    """
    pass
573
574
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller
    than the size the server announced, which usually means the
    connection was interrupted.
    """
    # Both counts are in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
589
590
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class, honouring the 'source_address' option.

    On Python 2.7+ the connection object supports source_address natively;
    on 2.6 the connect() method is monkey-patched to bind the socket itself.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)  # port 0: let the OS choose
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    # Wrap the bound socket in TLS by hand
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
611
612
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        # Route connection creation through _create_http_connection so the
        # 'source_address' option is honoured
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress a 'deflate' body (raw stream first, zlib-wrapped fallback)."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl carrying the HTTP status code on both old
        and new urllib versions."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Fill in any std_headers the caller did not set explicitly
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            # Marker header: strip Accept-encoding and the marker itself
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry decompression, truncating up to 1023 trailing bytes
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS requests/responses go through the same hooks
    https_request = http_request
    https_response = http_response
704
705
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS counterpart of YoutubeDLHandler: forwards whatever SSL
    context / hostname-check settings the running Python supports."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        # _context/_check_hostname are only set by the base-class
        # constructor on Pythons that support them
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
            req, **kwargs)
721
722
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    if timezone is None:
        timezone = datetime.timedelta()
        # A trailing 'Z' or +HH:MM / -HHMM offset, optionally preceded by
        # fractional seconds; strip it and convert to a timedelta
        mobj = re.search(
            r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if mobj:
            date_str = date_str[:-len(mobj.group(0))]
            if mobj.group('sign'):
                sign = 1 if mobj.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(mobj.group('hours')),
                    minutes=sign * int(mobj.group('minutes')))
    fmt = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, fmt) - timezone
    return calendar.timegm(dt.timetuple())
747
748
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Candidate formats; every one is tried and a later successful parse
    # overrides an earlier one, so the list order is significant
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    # Ambiguous all-numeric dates are interpreted according to day_first
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822 parsing
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
812
813
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL, falling back to default_ext."""
    if url is None:
        return default_ext
    # Whatever follows the last '.' before any query string
    candidate = url.partition('?')[0].rpartition('.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
822
823
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
826
827
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Plain absolute date
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # Rough approximations: a month is 30 days, a year 365
    if unit == 'month':
        unit, amount = 'day', amount * 30
    elif unit == 'year':
        unit, amount = 'day', amount * 365
    # timedelta wants plural keyword arguments
    return today + datetime.timedelta(**{unit + 's': amount})
855
856
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    anything else passes through unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(match.groups()) if match else date_str
865
866
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Unbounded ends default to the min/max representable dates
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
896
897
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Python 2 may return a byte string; decode with the locale encoding
        name = name.decode(preferredencoding())
    assert isinstance(name, compat_str)
    return name
906
907
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Map C file descriptors to the matching GetStdHandle IDs
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # WriteConsoleW only works on a real console handle; reject
        # invalid, redirected or remote handles
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 chars at a time, stopping before a non-BMP char
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
981
982
def write_string(s, out=None, encoding=None):
    """Write the unicode string s to out (default sys.stderr), choosing a
    method that survives the platform's console-encoding quirks."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        # Prefer the native wide-char console API on Windows
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        # Binary stream: encode explicitly
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: bypass the
        # stream's own (possibly limited) encoding
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
1003
1004
def bytes_to_intlist(bs):
    """Convert a byte string to a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3: indexing bytes yields ints
        return list(bs)
    # Python 2: indexing yields 1-char strings
    return [ord(c) for c in bs]
1012
1013
def intlist_to_bytes(xs):
    """Pack a list of integer byte values back into a byte string
    (inverse of bytes_to_intlist)."""
    if not xs:
        return b''
    return struct_pack('%dB' % len(xs), *xs)
1018
1019
# Cross-platform file locking: _lock_file/_unlock_file via Win32
# (Un)LockFileEx on Windows, fcntl.flock elsewhere
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure required by (Un)LockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: byte count split into low/high DWORD halves
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
1083
1084
class locked_file(object):
    """File wrapper holding an OS-level lock while used as a context
    manager (shared lock for reads, exclusive for writes/appends)."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Readers may share the lock; writers and appenders need exclusivity.
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1114
1115
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to UTF-8 when unknown."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
1119
1120
def shell_quote(args):
    """Join command-line arguments into one display string, quoting each."""
    fs_encoding = get_filesystem_encoding()
    quoted = []
    for arg in args:
        # We may get a filename encoded with 'encodeFilename'
        if isinstance(arg, bytes):
            arg = arg.decode(fs_encoding)
        quoted.append(pipes.quote(arg))
    return ' '.join(quoted)
1130
1131
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return url + '#' + sdata
1138
1139
def unsmuggle_url(smug_url, default=None):
    """Extract data embedded by smuggle_url(); returns (url, data)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    smuggled = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(smuggled)
1147
1148
def format_bytes(bytes):
    """Format a byte count as a human-readable string (e.g. '1.00MiB').

    Accepts None (rendered as 'N/A'), numeric strings, ints and floats.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    SUFFIXES = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # Clamp the exponent: fractional byte counts would otherwise give a
        # negative index, and values beyond YiB an IndexError.
        exponent = min(max(int(math.log(bytes, 1024.0)), 0), len(SUFFIXES) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, SUFFIXES[exponent])
1161
1162
def parse_filesize(s):
    """Parse a human-readable file size ('5.5MiB', '10 MB') into bytes."""
    if s is None:
        return None

    # Decimal (KB/MB/...) and binary (KiB/MiB/...) units; the lower-case
    # forms are of course incorrect and inofficial, but we support those too.
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for exp, letter in enumerate('KMGTPEZY', start=1):
        _UNIT_TABLE[letter + 'iB'] = 1024 ** exp
        _UNIT_TABLE[letter + 'B'] = 1000 ** exp
        _UNIT_TABLE[letter.lower() + 'B'] = 1024 ** exp
        _UNIT_TABLE[letter + 'b'] = 1000 ** exp

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if not m:
        return None

    # Accept ',' as a decimal separator as well.
    num = float(m.group('num').replace(',', '.'))
    return int(num * _UNIT_TABLE[m.group('unit')])
1215
1216
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1224
1225
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
1234
1235
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Leave existing entity and (hex) character references untouched.
    bare_amp = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_amp, '&amp;', xml_str)
1242
1243
def setproctitle(title):
    # Set the process name shown by ps/top via the Linux prctl syscall.
    # Silently does nothing when libc.so.6 cannot be loaded (non-glibc
    # platforms) or when the libc has no prctl.
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME (see prctl(2))
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1257
1258
def remove_start(s, start):
    """Return s with the prefix `start` removed, if present."""
    return s[len(start):] if s.startswith(start) else s
1263
1264
def remove_end(s, end):
    """Return s with the suffix `end` removed, if present.

    An empty `end` must be a no-op: the naive s[:-len(end)] would return
    '' because s[:-0] is s[:0].
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1269
1270
def url_basename(url):
    """Return the last path component of a URL ('http://x/a/b?q=1' -> 'b')."""
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip('/').split('/')
    return components[-1]
1274
1275
class HEADRequest(compat_urllib_request.Request):
    # Request subclass that issues an HTTP HEAD instead of the default GET
    def get_method(self):
        return "HEAD"
1279
1280
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int scaled by invscale/scale; `default` for None/''.

    With get_attr set, the named attribute of v is converted instead.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    return int(v) * invscale // scale
1288
1289
def str_or_none(v, default=None):
    """Stringify v via compat_str, passing None through as `default`."""
    if v is None:
        return default
    return compat_str(v)
1292
1293
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and plus signs before converting.
    return int(re.sub(r'[,\.\+]', '', int_str))
1300
1301
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float scaled by invscale/scale; `default` when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1304
1305
def parse_duration(s):
    """Parse a duration expression ('1:23:45', '3 min', 'PT2H10M', '5.3s',
    ...) into a number of seconds, or None when unrecognized."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    # Pure "N minutes" / "N hours" forms may be fractional.
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    # Otherwise sum up whichever integral components matched.
    duration = 0
    for group, multiplier in (
            ('secs', 1), ('mins_reversed', 60), ('mins', 60),
            ('hours', 60 * 60), ('hours_reversed', 60 * 60),
            ('days', 24 * 60 * 60)):
        if m.group(group):
            duration += int(m.group(group)) * multiplier
    if m.group('ms'):
        duration += float(m.group('ms'))
    return duration
1350
1351
def prepend_extension(filename, ext):
    """Insert `ext` before the real extension: ('a.mp4', 'temp') -> 'a.temp.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(base, ext, real_ext)
1355
1356
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        proc = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
1365
1366
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        proc = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out = proc.communicate()[0]
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1380
1381
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from program output, or `unrecognized`."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1391
1392
class PagedList(object):
    # Abstract base; subclasses provide getslice(start, end)
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1397
1398
class OnDemandPagedList(PagedList):
    # Paged list that fetches pages lazily by calling pagefunc(pagenum),
    # which returns an iterable of at most pagesize entries.
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # Collect entries in [start, end) by querying pages on demand.
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry, when end falls here
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1440
1441
class InAdvancePagedList(PagedList):
    """Paged list where the total number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        first_page = start // self._pagesize
        if end is None:
            last_page = self._pagecount
            remaining = None
        else:
            last_page = end // self._pagesize + 1
            remaining = end - start
        # Entries to drop from the first fetched page
        to_skip = start - first_page * self._pagesize
        for pagenum in range(first_page, last_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) < remaining:
                    remaining -= len(page)
                else:
                    res.extend(page[:remaining])
                    break
            res.extend(page)
        return res
1469
1470
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escapes (8 hex digits) found in a string."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
1477
1478
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # Keep RFC 3986 reserved/unreserved punctuation intact.
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1484
1485
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    escaped = parts._replace(
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
1495
# Compatibility shims: some old interpreters reject text format strings.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Text format strings work; use the stdlib functions directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1512
1513
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, skipping comments and BOMs.

    Lines starting with '#', ';' or ']' are treated as comments; empty
    lines are dropped. The file object is closed afterwards.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM decodes to U+FEFF, not to the raw byte sequence, so
        # the original '\xef\xbb\xbf'-only check never matched decoded
        # input; strip both representations.
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1528
1529
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1532
1533
# Element.iter is unavailable on very old Pythons; emulate it with findall
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
1538
1539
def parse_xml(s):
    """Parse an XML string into an ElementTree element, ignoring any
    doctype and decoding byte text nodes on Python 2."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None and not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
    return tree
1555
1556
# US content ratings mapped to the age limits used by parse_age_limit()
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1564
1565
def parse_age_limit(s):
    """Parse '18', '18+' or a US rating string ('PG-13') into an age, or None."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
1571
1572
def strip_jsonp(code):
    """Strip a JSONP wrapper: 'callback({...});' -> '{...}'."""
    wrapper_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(wrapper_re, r'\1', code)
1576
1577
def js_to_json(code):
    """Convert JavaScript-flavoured object notation into valid JSON."""
    def fix_kv(m):
        v = m.group(0)
        # Keywords and double-quoted strings are already valid JSON.
        if v in ('true', 'false', 'null') or v.startswith('"'):
            return v
        if v.startswith("'"):
            # Re-quote single-quoted strings, translating the escapes.
            v = v[1:-1]
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        # Bare identifiers (object keys) get double quotes too.
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas before closing brackets/braces.
    return re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
1601
1602
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality rank; unknown ids rank lowest.
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1611
1612
# Default output filename template: title, video id and extension
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1614
1615
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
1624
1625
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(int(part) for part in re.split(r'[-.]', v))
1628
1629
def is_outdated_version(version, limit, assume_new=True):
    """Return True when `version` is strictly older than `limit`.

    Missing or unparsable versions yield `not assume_new`.
    """
    if not version:
        return not assume_new

    def _as_tuple(v):
        return tuple(int(e) for e in re.split(r'[-.]', v))

    try:
        return _as_tuple(version) < _as_tuple(limit)
    except ValueError:
        return not assume_new
1637
1638
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updateable when running from a zip bundle or a frozen executable.
    if hasattr(sys, 'frozen'):
        return True
    return isinstance(globals().get('__loader__'), zipimporter)
1644
1645
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
1649
1650
def mimetype2ext(mt):
    """Map a MIME type to a file extension ('video/x-ms-wmv' -> 'wmv')."""
    subtype = mt.rpartition('/')[2]
    # Subtypes whose name is not itself the extension
    SPECIAL_CASES = {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
    }
    return SPECIAL_CASES.get(subtype, subtype)
1658
1659
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a urllib response's headers."""
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    # Prefer an explicit filename from Content-Disposition ...
    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            ext = determine_ext(m.group('filename'), default_ext=None)
            if ext:
                return ext

    # ... otherwise fall back to the Content-Type.
    return mimetype2ext(getheader('Content-Type'))
1676
1677
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # No viewer limit set, or content available for everyone
        return False
    return age_limit < content_limit
1686
1687
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Longer BOMs first so UTF-32 is not mistaken for UTF-16.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    decoded = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', decoded)
1706
1707
def determine_protocol(info_dict):
    """Work out the download protocol for an info dict."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    # Scheme-based protocols first ...
    for scheme in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(scheme):
            return scheme

    # ... then extension-based ones ...
    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    # ... and finally fall back to the URL scheme itself.
    return compat_urllib_parse_urlparse(url).scheme
1728
1729
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Pad every column but the last to its widest cell plus one space.
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*table)]
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in table)
1736
1737
def _match_one(filter_part, dct):
    # Evaluate one filter expression (e.g. 'duration > 600', 'uploader = x'
    # or '!is_live') against the values in dct.
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    # key <op>[?] value — value is either a number (optionally with a size
    # suffix like 500KiB) or a bare alphanumeric string.
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # Strings only support equality comparisons
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain integer: try it as a file size, with and
                # without an implicit 'B' suffix (e.g. '500K' -> '500KB')
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # A '?' after the operator makes missing values pass the filter
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    # Unary form: 'key' (present) or '!key' (absent)
    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
1795
1796
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # All '&'-separated parts must match.
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
1802
1803
def match_filter_func(filter_str):
    """Build a match-filter callback: returns None to accept an entry, or a
    skip message when the filter does not match."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
1812
1813
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('12.3s' or 'HH:MM:SS.mmm') into
    seconds; empty input yields 0.0, unrecognized input None."""
    if not time_expr:
        return 0.0

    # Plain offset in seconds, optionally suffixed with 's'
    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    # Clock time: hours:minutes:seconds(.fraction)
    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if mobj:
        hours, minutes, seconds = mobj.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds)
1825
1826
def format_srt_time(seconds):
    """Format a float second count as an SRT timestamp 'HH:MM:SS,mmm'."""
    mins, secs = divmod(seconds, 60)
    hours, mins = divmod(mins, 60)
    millisecs = (secs - int(secs)) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, mins, int(secs), millisecs)
1833
1834
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup into SRT text."""
    _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})

    def parse_node(node):
        str_or_empty = functools.partial(str_or_none, default='')

        text = str_or_empty(node.text)
        for child in node:
            if child.tag == _x('ttml:br'):
                # Line breaks become newlines; keep any trailing text.
                text += '\n' + str_or_empty(child.tail)
            elif child.tag == _x('ttml:span'):
                text += str_or_empty(parse_node(child))
            else:
                # Unknown tags are kept as raw markup.
                text += str_or_empty(xml.etree.ElementTree.tostring(child))
        return text

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
    srt_chunks = []
    for index, para in enumerate(dfxp.findall(_x('.//ttml:p')), 1):
        srt_chunks.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            format_srt_time(parse_dfxp_time_expr(para.attrib.get('begin'))),
            format_srt_time(parse_dfxp_time_expr(para.attrib.get('end'))),
            parse_node(para)))
    return ''.join(srt_chunks)
1865
1866
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    # ProxyHandler that lets an individual request override the configured
    # proxy via the internal 'Ytdl-request-proxy' header.
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    # Default arguments bind the current loop values of
                    # `type` and `self.proxy_open` into each lambda
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # The per-request header wins; strip it so it is never sent out
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)