1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import calendar
7 import codecs
8 import contextlib
9 import ctypes
10 import datetime
11 import email.utils
12 import errno
13 import functools
14 import gzip
15 import itertools
16 import io
17 import json
18 import locale
19 import math
20 import operator
21 import os
22 import pipes
23 import platform
24 import re
25 import ssl
26 import socket
27 import struct
28 import subprocess
29 import sys
30 import tempfile
31 import traceback
32 import xml.etree.ElementTree
33 import zlib
34
35 from .compat import (
36 compat_basestring,
37 compat_chr,
38 compat_getenv,
39 compat_html_entities,
40 compat_http_client,
41 compat_parse_qs,
42 compat_socket_create_connection,
43 compat_str,
44 compat_urllib_error,
45 compat_urllib_parse,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
48 compat_urlparse,
49 shlex_quote,
50 )
51
52
53 # This is not clearly defined otherwise
54 compiled_regex_type = type(re.compile(''))
55
56 std_headers = {
57 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
58 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
59 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
60 'Accept-Encoding': 'gzip, deflate',
61 'Accept-Language': 'en-us,en;q=0.5',
62 }
63
64
65 def preferredencoding():
66 """Get preferred encoding.
67
68 Returns the best encoding scheme for the system, based on
69 locale.getpreferredencoding() and some further tweaks.
70 """
71 try:
72 pref = locale.getpreferredencoding()
73 'TEST'.encode(pref)
74 except:
75 pref = 'UTF-8'
76
77 return pref
78
79
80 def write_json_file(obj, fn):
81 """ Encode obj as JSON and write it to fn, atomically if possible """
82
83 fn = encodeFilename(fn)
84 if sys.version_info < (3, 0) and sys.platform != 'win32':
85 encoding = get_filesystem_encoding()
86 # os.path.basename returns a bytes object, but NamedTemporaryFile
87 # will fail if the filename contains non ascii characters unless we
88 # use a unicode object
89 path_basename = lambda f: os.path.basename(f).decode(encoding)
90 # the same for os.path.dirname
91 path_dirname = lambda f: os.path.dirname(f).decode(encoding)
92 else:
93 path_basename = os.path.basename
94 path_dirname = os.path.dirname
95
96 args = {
97 'suffix': '.tmp',
98 'prefix': path_basename(fn) + '.',
99 'dir': path_dirname(fn),
100 'delete': False,
101 }
102
103 # In Python 2.x, json.dump expects a bytestream.
104 # In Python 3.x, it writes to a character stream
105 if sys.version_info < (3, 0):
106 args['mode'] = 'wb'
107 else:
108 args.update({
109 'mode': 'w',
110 'encoding': 'utf-8',
111 })
112
113 tf = tempfile.NamedTemporaryFile(**args)
114
115 try:
116 with tf:
117 json.dump(obj, tf)
118 if sys.platform == 'win32':
119 # Need to remove existing file on Windows, else os.rename raises
120 # WindowsError or FileExistsError.
121 try:
122 os.unlink(fn)
123 except OSError:
124 pass
125 os.rename(tf.name, fn)
126 except:
127 try:
128 os.remove(tf.name)
129 except OSError:
130 pass
131 raise
132
133
134 if sys.version_info >= (2, 7):
135 def find_xpath_attr(node, xpath, key, val):
136 """ Find the xpath xpath[@key=val] """
137 assert re.match(r'^[a-zA-Z-]+$', key)
138 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
139 expr = xpath + "[@%s='%s']" % (key, val)
140 return node.find(expr)
141 else:
142 def find_xpath_attr(node, xpath, key, val):
143 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
144 # .//node does not match if a node is a direct child of . !
145 if isinstance(xpath, compat_str):
146 xpath = xpath.encode('ascii')
147
148 for f in node.findall(xpath):
149 if f.attrib.get(key) == val:
150 return f
151 return None
152
153 # On python2.6 the xml.etree.ElementTree.Element methods don't support
154 # the namespace parameter
155
156
157 def xpath_with_ns(path, ns_map):
158 components = [c.split(':') for c in path.split('/')]
159 replaced = []
160 for c in components:
161 if len(c) == 1:
162 replaced.append(c[0])
163 else:
164 ns, tag = c
165 replaced.append('{%s}%s' % (ns_map[ns], tag))
166 return '/'.join(replaced)
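
# Illustrative usage sketch (editor's addition, never called): shows how xpath_with_ns
# expands a prefixed path using a caller-supplied namespace map. The URI below is assumed.
def _xpath_with_ns_example():
    ns_map = {'media': 'http://example.com/ns'}
    expanded = xpath_with_ns('media:song/media:url', ns_map)
    assert expanded == '{http://example.com/ns}song/{http://example.com/ns}url'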
167
168
169 def xpath_text(node, xpath, name=None, fatal=False):
170 if sys.version_info < (2, 7): # Crazy 2.6
171 xpath = xpath.encode('ascii')
172
173 n = node.find(xpath)
174 if n is None or n.text is None:
175 if fatal:
176 name = xpath if name is None else name
177 raise ExtractorError('Could not find XML element %s' % name)
178 else:
179 return None
180 return n.text
181
182
183 def get_element_by_id(id, html):
184 """Return the content of the tag with the specified ID in the passed HTML document"""
185 return get_element_by_attribute("id", id, html)
186
187
188 def get_element_by_attribute(attribute, value, html):
189 """Return the content of the tag with the specified attribute in the passed HTML document"""
190
191 m = re.search(r'''(?xs)
192 <([a-zA-Z0-9:._-]+)
193 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
194 \s+%s=['"]?%s['"]?
195 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
196 \s*>
197 (?P<content>.*?)
198 </\1>
199 ''' % (re.escape(attribute), re.escape(value)), html)
200
201 if not m:
202 return None
203 res = m.group('content')
204
205 if res.startswith('"') or res.startswith("'"):
206 res = res[1:-1]
207
208 return unescapeHTML(res)
209
210
211 def clean_html(html):
212 """Clean an HTML snippet into a readable string"""
213
214 if html is None: # Convenience for sanitizing descriptions etc.
215 return html
216
217 # Newline vs <br />
218 html = html.replace('\n', ' ')
219 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
220 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
221 # Strip html tags
222 html = re.sub('<.*?>', '', html)
223 # Replace html entities
224 html = unescapeHTML(html)
225 return html.strip()
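
# Illustrative usage sketch (editor's addition, never called): <br> becomes a newline,
# remaining tags are stripped and entities are decoded. The input string is assumed.
def _clean_html_example():
    assert clean_html('<p>Hello &amp; goodbye<br>world</p>') == 'Hello & goodbye\nworld'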
226
227
228 def sanitize_open(filename, open_mode):
229 """Try to open the given filename, and slightly tweak it if this fails.
230
231 Attempts to open the given filename. If this fails, it tries to change
232 the filename slightly, step by step, until it's either able to open it
233 or it fails and raises a final exception, like the standard open()
234 function.
235
236 It returns the tuple (stream, definitive_file_name).
237 """
238 try:
239 if filename == '-':
240 if sys.platform == 'win32':
241 import msvcrt
242 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
243 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
244 stream = open(encodeFilename(filename), open_mode)
245 return (stream, filename)
246 except (IOError, OSError) as err:
247 if err.errno in (errno.EACCES,):
248 raise
249
250 # In case of error, try to remove win32 forbidden chars
251 alt_filename = os.path.join(*(
252 re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
253 for path_part in os.path.split(filename)
254 ))
255 if alt_filename == filename:
256 raise
257 else:
258 # An exception here should be caught in the caller
259 stream = open(encodeFilename(filename), open_mode)
260 return (stream, alt_filename)
261
262
263 def timeconvert(timestr):
264 """Convert RFC 2822 defined time string into system timestamp"""
265 timestamp = None
266 timetuple = email.utils.parsedate_tz(timestr)
267 if timetuple is not None:
268 timestamp = email.utils.mktime_tz(timetuple)
269 return timestamp
270
271
272 def sanitize_filename(s, restricted=False, is_id=False):
273 """Sanitizes a string so it could be used as part of a filename.
274 If restricted is set, use a stricter subset of allowed characters.
275 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
276 """
277 def replace_insane(char):
278 if char == '?' or ord(char) < 32 or ord(char) == 127:
279 return ''
280 elif char == '"':
281 return '' if restricted else '\''
282 elif char == ':':
283 return '_-' if restricted else ' -'
284 elif char in '\\/|*<>':
285 return '_'
286 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
287 return '_'
288 if restricted and ord(char) > 127:
289 return '_'
290 return char
291
292 # Handle timestamps
293 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
294 result = ''.join(map(replace_insane, s))
295 if not is_id:
296 while '__' in result:
297 result = result.replace('__', '_')
298 result = result.strip('_')
299 # Common case of "Foreign band name - English song title"
300 if restricted and result.startswith('-_'):
301 result = result[2:]
302 if not result:
303 result = '_'
304 return result
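
# Illustrative usage sketch (editor's addition, never called): the expected values follow
# from replace_insane() above; restricted mode maps spaces to '_' and ':' to '_-'.
def _sanitize_filename_example():
    assert sanitize_filename('AC/DC: Back In Black') == 'AC_DC - Back In Black'
    assert sanitize_filename('AC/DC: Back In Black', restricted=True) == 'AC_DC_-_Back_In_Black'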
305
306
307 def orderedSet(iterable):
308 """ Remove all duplicates from the input iterable """
309 res = []
310 for el in iterable:
311 if el not in res:
312 res.append(el)
313 return res
314
315
316 def _htmlentity_transform(entity):
317 """Transforms an HTML entity to a character."""
318 # Known non-numeric HTML entity
319 if entity in compat_html_entities.name2codepoint:
320 return compat_chr(compat_html_entities.name2codepoint[entity])
321
322 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
323 if mobj is not None:
324 numstr = mobj.group(1)
325 if numstr.startswith('x'):
326 base = 16
327 numstr = '0%s' % numstr
328 else:
329 base = 10
330 return compat_chr(int(numstr, base))
331
332 # Unknown entity in name, return its literal representation
333 return ('&%s;' % entity)
334
335
336 def unescapeHTML(s):
337 if s is None:
338 return None
339 assert type(s) == compat_str
340
341 return re.sub(
342 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
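
# Illustrative usage sketch (editor's addition, never called): both named and numeric
# entities are resolved through _htmlentity_transform(). The input string is assumed.
def _unescapeHTML_example():
    assert unescapeHTML('Fish &amp; Chips &#38; more') == 'Fish & Chips & more'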
343
344
345 def encodeFilename(s, for_subprocess=False):
346 """
347 @param s The name of the file
348 """
349
350 assert type(s) == compat_str
351
352 # Python 3 has a Unicode API
353 if sys.version_info >= (3, 0):
354 return s
355
356 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
357 # Pass '' directly to use Unicode APIs on Windows 2000 and up
358 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
359 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
360 if not for_subprocess:
361 return s
362 else:
363 # For subprocess calls, encode with locale encoding
364 # Refer to http://stackoverflow.com/a/9951851/35070
365 encoding = preferredencoding()
366 else:
367 encoding = sys.getfilesystemencoding()
368 if encoding is None:
369 encoding = 'utf-8'
370 return s.encode(encoding, 'ignore')
371
372
373 def encodeArgument(s):
374 if not isinstance(s, compat_str):
375 # Legacy code that uses byte strings
376 # Uncomment the following line after fixing all post processors
377 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
378 s = s.decode('ascii')
379 return encodeFilename(s, True)
380
381
382 def decodeOption(optval):
383 if optval is None:
384 return optval
385 if isinstance(optval, bytes):
386 optval = optval.decode(preferredencoding())
387
388 assert isinstance(optval, compat_str)
389 return optval
390
391
392 def formatSeconds(secs):
393 if secs > 3600:
394 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
395 elif secs > 60:
396 return '%d:%02d' % (secs // 60, secs % 60)
397 else:
398 return '%d' % secs
399
400
401 def make_HTTPS_handler(params, **kwargs):
402 opts_no_check_certificate = params.get('nocheckcertificate', False)
403 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
404 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
405 if opts_no_check_certificate:
406 context.check_hostname = False
407 context.verify_mode = ssl.CERT_NONE
408 try:
409 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
410 except TypeError:
411 # Python 2.7.8
412 # (create_default_context present but HTTPSHandler has no context=)
413 pass
414
415 if sys.version_info < (3, 2):
416 return YoutubeDLHTTPSHandler(params, **kwargs)
417 else: # Python < 3.4
418 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
419 context.verify_mode = (ssl.CERT_NONE
420 if opts_no_check_certificate
421 else ssl.CERT_REQUIRED)
422 context.set_default_verify_paths()
423 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
424
425
426 class ExtractorError(Exception):
427 """Error during info extraction."""
428
429 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
430 """ tb, if given, is the original traceback (so that it can be printed out).
431 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
432 """
433
434 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
435 expected = True
436 if video_id is not None:
437 msg = video_id + ': ' + msg
438 if cause:
439 msg += ' (caused by %r)' % cause
440 if not expected:
441 if ytdl_is_updateable():
442 update_cmd = 'type youtube-dl -U to update'
443 else:
444 update_cmd = 'see https://yt-dl.org/update on how to update'
445 msg += '; please report this issue on https://yt-dl.org/bug .'
446 msg += ' Make sure you are using the latest version; %s.' % update_cmd
447 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
448 super(ExtractorError, self).__init__(msg)
449
450 self.traceback = tb
451 self.exc_info = sys.exc_info() # preserve original exception
452 self.cause = cause
453 self.video_id = video_id
454
455 def format_traceback(self):
456 if self.traceback is None:
457 return None
458 return ''.join(traceback.format_tb(self.traceback))
459
460
461 class UnsupportedError(ExtractorError):
462 def __init__(self, url):
463 super(UnsupportedError, self).__init__(
464 'Unsupported URL: %s' % url, expected=True)
465 self.url = url
466
467
468 class RegexNotFoundError(ExtractorError):
469 """Error when a regex didn't match"""
470 pass
471
472
473 class DownloadError(Exception):
474 """Download Error exception.
475
476 This exception may be thrown by FileDownloader objects if they are not
477 configured to continue on errors. They will contain the appropriate
478 error message.
479 """
480
481 def __init__(self, msg, exc_info=None):
482 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
483 super(DownloadError, self).__init__(msg)
484 self.exc_info = exc_info
485
486
487 class SameFileError(Exception):
488 """Same File exception.
489
490 This exception will be thrown by FileDownloader objects if they detect
491 multiple files would have to be downloaded to the same file on disk.
492 """
493 pass
494
495
496 class PostProcessingError(Exception):
497 """Post Processing exception.
498
499 This exception may be raised by PostProcessor's .run() method to
500 indicate an error in the postprocessing task.
501 """
502
503 def __init__(self, msg):
504 self.msg = msg
505
506
507 class MaxDownloadsReached(Exception):
508 """ --max-downloads limit has been reached. """
509 pass
510
511
512 class UnavailableVideoError(Exception):
513 """Unavailable Format exception.
514
515 This exception will be thrown when a video is requested
516 in a format that is not available for that video.
517 """
518 pass
519
520
521 class ContentTooShortError(Exception):
522 """Content Too Short exception.
523
524 This exception may be raised by FileDownloader objects when a file they
525 download is too small for what the server announced first, indicating
526 the connection was probably interrupted.
527 """
528 # Both in bytes
529 downloaded = None
530 expected = None
531
532 def __init__(self, downloaded, expected):
533 self.downloaded = downloaded
534 self.expected = expected
535
536
537 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
538 hc = http_class(*args, **kwargs)
539 source_address = ydl_handler._params.get('source_address')
540 if source_address is not None:
541 sa = (source_address, 0)
542 if hasattr(hc, 'source_address'): # Python 2.7+
543 hc.source_address = sa
544 else: # Python 2.6
545 def _hc_connect(self, *args, **kwargs):
546 sock = compat_socket_create_connection(
547 (self.host, self.port), self.timeout, sa)
548 if is_https:
549 self.sock = ssl.wrap_socket(
550 sock, self.key_file, self.cert_file,
551 ssl_version=ssl.PROTOCOL_TLSv1)
552 else:
553 self.sock = sock
554 hc.connect = functools.partial(_hc_connect, hc)
555
556 return hc
557
558
559 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
560 """Handler for HTTP requests and responses.
561
562 This class, when installed with an OpenerDirector, automatically adds
563 the standard headers to every HTTP request and handles gzipped and
564 deflated responses from web servers. If compression is to be avoided in
565 a particular request, the original request in the program code only has
566 to include the HTTP header "Youtubedl-No-Compression", which will be
567 removed before making the real request.
568
569 Part of this code was copied from:
570
571 http://techknack.net/python-urllib2-handlers/
572
573 Andrew Rowls, the author of that code, agreed to release it to the
574 public domain.
575 """
576
577 def __init__(self, params, *args, **kwargs):
578 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
579 self._params = params
580
581 def http_open(self, req):
582 return self.do_open(functools.partial(
583 _create_http_connection, self, compat_http_client.HTTPConnection, False),
584 req)
585
586 @staticmethod
587 def deflate(data):
588 try:
589 return zlib.decompress(data, -zlib.MAX_WBITS)
590 except zlib.error:
591 return zlib.decompress(data)
592
593 @staticmethod
594 def addinfourl_wrapper(stream, headers, url, code):
595 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
596 return compat_urllib_request.addinfourl(stream, headers, url, code)
597 ret = compat_urllib_request.addinfourl(stream, headers, url)
598 ret.code = code
599 return ret
600
601 def http_request(self, req):
602 for h, v in std_headers.items():
603 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
604 # The dict keys are capitalized because of this bug by urllib
605 if h.capitalize() not in req.headers:
606 req.add_header(h, v)
607 if 'Youtubedl-no-compression' in req.headers:
608 if 'Accept-encoding' in req.headers:
609 del req.headers['Accept-encoding']
610 del req.headers['Youtubedl-no-compression']
611
612 if sys.version_info < (2, 7) and '#' in req.get_full_url():
613 # Python 2.6 is brain-dead when it comes to fragments
614 req._Request__original = req._Request__original.partition('#')[0]
615 req._Request__r_type = req._Request__r_type.partition('#')[0]
616
617 return req
618
619 def http_response(self, req, resp):
620 old_resp = resp
621 # gzip
622 if resp.headers.get('Content-encoding', '') == 'gzip':
623 content = resp.read()
624 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
625 try:
626 uncompressed = io.BytesIO(gz.read())
627 except IOError as original_ioerror:
628 # There may be junk at the end of the file
629 # See http://stackoverflow.com/q/4928560/35070 for details
630 for i in range(1, 1024):
631 try:
632 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
633 uncompressed = io.BytesIO(gz.read())
634 except IOError:
635 continue
636 break
637 else:
638 raise original_ioerror
639 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
640 resp.msg = old_resp.msg
641 # deflate
642 if resp.headers.get('Content-encoding', '') == 'deflate':
643 gz = io.BytesIO(self.deflate(resp.read()))
644 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
645 resp.msg = old_resp.msg
646 return resp
647
648 https_request = http_request
649 https_response = http_response
650
651
652 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
653 def __init__(self, params, https_conn_class=None, *args, **kwargs):
654 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
655 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
656 self._params = params
657
658 def https_open(self, req):
659 kwargs = {}
660 if hasattr(self, '_context'): # python > 2.6
661 kwargs['context'] = self._context
662 if hasattr(self, '_check_hostname'): # python 3.x
663 kwargs['check_hostname'] = self._check_hostname
664 return self.do_open(functools.partial(
665 _create_http_connection, self, self._https_conn_class, True),
666 req, **kwargs)
667
668
669 def parse_iso8601(date_str, delimiter='T'):
670 """ Return a UNIX timestamp from the given date """
671
672 if date_str is None:
673 return None
674
675 m = re.search(
676 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
677 date_str)
678 if not m:
679 timezone = datetime.timedelta()
680 else:
681 date_str = date_str[:-len(m.group(0))]
682 if not m.group('sign'):
683 timezone = datetime.timedelta()
684 else:
685 sign = 1 if m.group('sign') == '+' else -1
686 timezone = datetime.timedelta(
687 hours=sign * int(m.group('hours')),
688 minutes=sign * int(m.group('minutes')))
689 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
690 dt = datetime.datetime.strptime(date_str, date_format) - timezone
691 return calendar.timegm(dt.timetuple())
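
# Illustrative usage sketch (editor's addition, never called): a trailing 'Z' means UTC,
# so the result is the plain UNIX timestamp of the given wall-clock time.
def _parse_iso8601_example():
    assert parse_iso8601('2015-02-10T13:00:00Z') == 1423573200
    assert parse_iso8601(None) is None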
692
693
694 def unified_strdate(date_str, day_first=True):
695 """Return a string with the date in the format YYYYMMDD"""
696
697 if date_str is None:
698 return None
699 upload_date = None
700 # Replace commas
701 date_str = date_str.replace(',', ' ')
702 # %z (UTC offset) is only supported in python>=3.2
703 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
704 # Remove AM/PM + timezone
705 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
706
707 format_expressions = [
708 '%d %B %Y',
709 '%d %b %Y',
710 '%B %d %Y',
711 '%b %d %Y',
712 '%b %dst %Y %I:%M%p',
713 '%b %dnd %Y %I:%M%p',
714 '%b %dth %Y %I:%M%p',
715 '%Y %m %d',
716 '%Y-%m-%d',
717 '%Y/%m/%d',
718 '%Y/%m/%d %H:%M:%S',
719 '%Y-%m-%d %H:%M:%S',
720 '%Y-%m-%d %H:%M:%S.%f',
721 '%d.%m.%Y %H:%M',
722 '%d.%m.%Y %H.%M',
723 '%Y-%m-%dT%H:%M:%SZ',
724 '%Y-%m-%dT%H:%M:%S.%fZ',
725 '%Y-%m-%dT%H:%M:%S.%f0Z',
726 '%Y-%m-%dT%H:%M:%S',
727 '%Y-%m-%dT%H:%M:%S.%f',
728 '%Y-%m-%dT%H:%M',
729 ]
730 if day_first:
731 format_expressions.extend([
732 '%d.%m.%Y',
733 '%d/%m/%Y',
734 '%d/%m/%y',
735 '%d/%m/%Y %H:%M:%S',
736 ])
737 else:
738 format_expressions.extend([
739 '%m.%d.%Y',
740 '%m/%d/%Y',
741 '%m/%d/%y',
742 '%m/%d/%Y %H:%M:%S',
743 ])
744 for expression in format_expressions:
745 try:
746 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
747 except ValueError:
748 pass
749 if upload_date is None:
750 timetuple = email.utils.parsedate_tz(date_str)
751 if timetuple:
752 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
753 return upload_date
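
# Illustrative usage sketch (editor's addition, never called): day_first decides how an
# ambiguous numeric date is read; the textual form is unambiguous.
def _unified_strdate_example():
    assert unified_strdate('December 21, 2014') == '20141221'
    assert unified_strdate('11/12/2014') == '20141211'                   # day first (default)
    assert unified_strdate('11/12/2014', day_first=False) == '20141112'  # month first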
754
755
756 def determine_ext(url, default_ext='unknown_video'):
757 if url is None:
758 return default_ext
759 guess = url.partition('?')[0].rpartition('.')[2]
760 if re.match(r'^[A-Za-z0-9]+$', guess):
761 return guess
762 else:
763 return default_ext
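
# Illustrative usage sketch (editor's addition, never called): the extension is taken from
# the path before any query string; unrecognizable URLs fall back to the default.
def _determine_ext_example():
    assert determine_ext('http://example.com/video.mp4?start=10') == 'mp4'
    assert determine_ext(None) == 'unknown_video'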
764
765
766 def subtitles_filename(filename, sub_lang, sub_format):
767 return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
768
769
770 def date_from_str(date_str):
771 """
772 Return a datetime object from a string in the format YYYYMMDD or
773 (now|today)[+-][0-9](day|week|month|year)(s)?"""
774 today = datetime.date.today()
775 if date_str in ('now', 'today'):
776 return today
777 if date_str == 'yesterday':
778 return today - datetime.timedelta(days=1)
779 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
780 if match is not None:
781 sign = match.group('sign')
782 time = int(match.group('time'))
783 if sign == '-':
784 time = -time
785 unit = match.group('unit')
786 # A rough approximation (month and year lengths vary)
787 if unit == 'month':
788 unit = 'day'
789 time *= 30
790 elif unit == 'year':
791 unit = 'day'
792 time *= 365
793 unit += 's'
794 delta = datetime.timedelta(**{unit: time})
795 return today + delta
796 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
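
# Illustrative usage sketch (editor's addition, never called): both the absolute YYYYMMDD
# form and the relative now/today form described in the docstring are accepted.
def _date_from_str_example():
    assert date_from_str('20150210') == datetime.date(2015, 2, 10)
    assert date_from_str('now-1week') == datetime.date.today() - datetime.timedelta(weeks=1)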
797
798
799 def hyphenate_date(date_str):
800 """
801 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
802 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
803 if match is not None:
804 return '-'.join(match.groups())
805 else:
806 return date_str
807
808
809 class DateRange(object):
810 """Represents a time interval between two dates"""
811
812 def __init__(self, start=None, end=None):
813 """start and end must be strings in the format accepted by date"""
814 if start is not None:
815 self.start = date_from_str(start)
816 else:
817 self.start = datetime.datetime.min.date()
818 if end is not None:
819 self.end = date_from_str(end)
820 else:
821 self.end = datetime.datetime.max.date()
822 if self.start > self.end:
823 raise ValueError('Date range: "%s": the start date must be before the end date' % self)
824
825 @classmethod
826 def day(cls, day):
827 """Returns a range that only contains the given day"""
828 return cls(day, day)
829
830 def __contains__(self, date):
831 """Check if the date is in the range"""
832 if not isinstance(date, datetime.date):
833 date = date_from_str(date)
834 return self.start <= date <= self.end
835
836 def __str__(self):
837 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
838
839
840 def platform_name():
841 """ Returns the platform name as a compat_str """
842 res = platform.platform()
843 if isinstance(res, bytes):
844 res = res.decode(preferredencoding())
845
846 assert isinstance(res, compat_str)
847 return res
848
849
850 def _windows_write_string(s, out):
851 """ Returns True if the string was written using special methods,
852 False if it has yet to be written out."""
853 # Adapted from http://stackoverflow.com/a/3259271/35070
854
855 import ctypes
856 import ctypes.wintypes
857
858 WIN_OUTPUT_IDS = {
859 1: -11,
860 2: -12,
861 }
862
863 try:
864 fileno = out.fileno()
865 except AttributeError:
866 # If the output stream doesn't have a fileno, it's virtual
867 return False
868 except io.UnsupportedOperation:
869 # Some strange Windows pseudo files?
870 return False
871 if fileno not in WIN_OUTPUT_IDS:
872 return False
873
874 GetStdHandle = ctypes.WINFUNCTYPE(
875 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
876 (b"GetStdHandle", ctypes.windll.kernel32))
877 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
878
879 WriteConsoleW = ctypes.WINFUNCTYPE(
880 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
881 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
882 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
883 written = ctypes.wintypes.DWORD(0)
884
885 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
886 FILE_TYPE_CHAR = 0x0002
887 FILE_TYPE_REMOTE = 0x8000
888 GetConsoleMode = ctypes.WINFUNCTYPE(
889 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
890 ctypes.POINTER(ctypes.wintypes.DWORD))(
891 (b"GetConsoleMode", ctypes.windll.kernel32))
892 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
893
894 def not_a_console(handle):
895 if handle == INVALID_HANDLE_VALUE or handle is None:
896 return True
897 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
898 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
899
900 if not_a_console(h):
901 return False
902
903 def next_nonbmp_pos(s):
904 try:
905 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
906 except StopIteration:
907 return len(s)
908
909 while s:
910 count = min(next_nonbmp_pos(s), 1024)
911
912 ret = WriteConsoleW(
913 h, s, count if count else 2, ctypes.byref(written), None)
914 if ret == 0:
915 raise OSError('Failed to write string')
916 if not count: # We just wrote a non-BMP character
917 assert written.value == 2
918 s = s[1:]
919 else:
920 assert written.value > 0
921 s = s[written.value:]
922 return True
923
924
925 def write_string(s, out=None, encoding=None):
926 if out is None:
927 out = sys.stderr
928 assert type(s) == compat_str
929
930 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
931 if _windows_write_string(s, out):
932 return
933
934 if ('b' in getattr(out, 'mode', '') or
935 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
936 byt = s.encode(encoding or preferredencoding(), 'ignore')
937 out.write(byt)
938 elif hasattr(out, 'buffer'):
939 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
940 byt = s.encode(enc, 'ignore')
941 out.buffer.write(byt)
942 else:
943 out.write(s)
944 out.flush()
945
946
947 def bytes_to_intlist(bs):
948 if not bs:
949 return []
950 if isinstance(bs[0], int): # Python 3
951 return list(bs)
952 else:
953 return [ord(c) for c in bs]
954
955
956 def intlist_to_bytes(xs):
957 if not xs:
958 return b''
959 return struct_pack('%dB' % len(xs), *xs)
960
961
962 # Cross-platform file locking
963 if sys.platform == 'win32':
964 import ctypes.wintypes
965 import msvcrt
966
967 class OVERLAPPED(ctypes.Structure):
968 _fields_ = [
969 ('Internal', ctypes.wintypes.LPVOID),
970 ('InternalHigh', ctypes.wintypes.LPVOID),
971 ('Offset', ctypes.wintypes.DWORD),
972 ('OffsetHigh', ctypes.wintypes.DWORD),
973 ('hEvent', ctypes.wintypes.HANDLE),
974 ]
975
976 kernel32 = ctypes.windll.kernel32
977 LockFileEx = kernel32.LockFileEx
978 LockFileEx.argtypes = [
979 ctypes.wintypes.HANDLE, # hFile
980 ctypes.wintypes.DWORD, # dwFlags
981 ctypes.wintypes.DWORD, # dwReserved
982 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
983 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
984 ctypes.POINTER(OVERLAPPED) # Overlapped
985 ]
986 LockFileEx.restype = ctypes.wintypes.BOOL
987 UnlockFileEx = kernel32.UnlockFileEx
988 UnlockFileEx.argtypes = [
989 ctypes.wintypes.HANDLE, # hFile
990 ctypes.wintypes.DWORD, # dwReserved
991 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
992 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
993 ctypes.POINTER(OVERLAPPED) # Overlapped
994 ]
995 UnlockFileEx.restype = ctypes.wintypes.BOOL
996 whole_low = 0xffffffff
997 whole_high = 0x7fffffff
998
999 def _lock_file(f, exclusive):
1000 overlapped = OVERLAPPED()
1001 overlapped.Offset = 0
1002 overlapped.OffsetHigh = 0
1003 overlapped.hEvent = 0
1004 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1005 handle = msvcrt.get_osfhandle(f.fileno())
1006 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1007 whole_low, whole_high, f._lock_file_overlapped_p):
1008 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1009
1010 def _unlock_file(f):
1011 assert f._lock_file_overlapped_p
1012 handle = msvcrt.get_osfhandle(f.fileno())
1013 if not UnlockFileEx(handle, 0,
1014 whole_low, whole_high, f._lock_file_overlapped_p):
1015 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1016
1017 else:
1018 import fcntl
1019
1020 def _lock_file(f, exclusive):
1021 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1022
1023 def _unlock_file(f):
1024 fcntl.flock(f, fcntl.LOCK_UN)
1025
1026
1027 class locked_file(object):
1028 def __init__(self, filename, mode, encoding=None):
1029 assert mode in ['r', 'a', 'w']
1030 self.f = io.open(filename, mode, encoding=encoding)
1031 self.mode = mode
1032
1033 def __enter__(self):
1034 exclusive = self.mode != 'r'
1035 try:
1036 _lock_file(self.f, exclusive)
1037 except IOError:
1038 self.f.close()
1039 raise
1040 return self
1041
1042 def __exit__(self, etype, value, traceback):
1043 try:
1044 _unlock_file(self.f)
1045 finally:
1046 self.f.close()
1047
1048 def __iter__(self):
1049 return iter(self.f)
1050
1051 def write(self, *args):
1052 return self.f.write(*args)
1053
1054 def read(self, *args):
1055 return self.f.read(*args)
1056
1057
1058 def get_filesystem_encoding():
1059 encoding = sys.getfilesystemencoding()
1060 return encoding if encoding is not None else 'utf-8'
1061
1062
1063 def shell_quote(args):
1064 quoted_args = []
1065 encoding = get_filesystem_encoding()
1066 for a in args:
1067 if isinstance(a, bytes):
1068 # We may get a filename encoded with 'encodeFilename'
1069 a = a.decode(encoding)
1070 quoted_args.append(pipes.quote(a))
1071 return ' '.join(quoted_args)
1072
1073
1074 def takewhile_inclusive(pred, seq):
1075 """ Like itertools.takewhile, but include the latest evaluated element
1076 (the first element for which pred(e) is false) """
1077 for e in seq:
1078 yield e
1079 if not pred(e):
1080 return
1081
1082
1083 def smuggle_url(url, data):
1084 """ Pass additional data in a URL for internal use. """
1085
1086 sdata = compat_urllib_parse.urlencode(
1087 {'__youtubedl_smuggle': json.dumps(data)})
1088 return url + '#' + sdata
1089
1090
1091 def unsmuggle_url(smug_url, default=None):
1092 if '#__youtubedl_smuggle' not in smug_url:
1093 return smug_url, default
1094 url, _, sdata = smug_url.rpartition('#')
1095 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1096 data = json.loads(jsond)
1097 return url, data
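
# Illustrative usage sketch (editor's addition, never called): smuggle_url/unsmuggle_url
# round-trip arbitrary JSON-serializable data through the URL fragment.
def _smuggle_url_example():
    smugged = smuggle_url('http://example.com/video', {'referrer': 'http://embedder.example'})
    url, data = unsmuggle_url(smugged)
    assert url == 'http://example.com/video'
    assert data == {'referrer': 'http://embedder.example'}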
1098
1099
1100 def format_bytes(bytes):
1101 if bytes is None:
1102 return 'N/A'
1103 if type(bytes) is str:
1104 bytes = float(bytes)
1105 if bytes == 0.0:
1106 exponent = 0
1107 else:
1108 exponent = int(math.log(bytes, 1024.0))
1109 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1110 converted = float(bytes) / float(1024 ** exponent)
1111 return '%.2f%s' % (converted, suffix)
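
# Illustrative usage sketch (editor's addition, never called): sizes are rendered with
# binary (1024-based) suffixes and two decimals.
def _format_bytes_example():
    assert format_bytes(1536) == '1.50KiB'
    assert format_bytes(None) == 'N/A'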
1112
1113
1114 def parse_filesize(s):
1115 if s is None:
1116 return None
1117
1118 # The lower-case forms are of course incorrect and unofficial,
1119 # but we support those too
1120 _UNIT_TABLE = {
1121 'B': 1,
1122 'b': 1,
1123 'KiB': 1024,
1124 'KB': 1000,
1125 'kB': 1024,
1126 'Kb': 1000,
1127 'MiB': 1024 ** 2,
1128 'MB': 1000 ** 2,
1129 'mB': 1024 ** 2,
1130 'Mb': 1000 ** 2,
1131 'GiB': 1024 ** 3,
1132 'GB': 1000 ** 3,
1133 'gB': 1024 ** 3,
1134 'Gb': 1000 ** 3,
1135 'TiB': 1024 ** 4,
1136 'TB': 1000 ** 4,
1137 'tB': 1024 ** 4,
1138 'Tb': 1000 ** 4,
1139 'PiB': 1024 ** 5,
1140 'PB': 1000 ** 5,
1141 'pB': 1024 ** 5,
1142 'Pb': 1000 ** 5,
1143 'EiB': 1024 ** 6,
1144 'EB': 1000 ** 6,
1145 'eB': 1024 ** 6,
1146 'Eb': 1000 ** 6,
1147 'ZiB': 1024 ** 7,
1148 'ZB': 1000 ** 7,
1149 'zB': 1024 ** 7,
1150 'Zb': 1000 ** 7,
1151 'YiB': 1024 ** 8,
1152 'YB': 1000 ** 8,
1153 'yB': 1024 ** 8,
1154 'Yb': 1000 ** 8,
1155 }
1156
1157 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1158 m = re.match(
1159 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1160 if not m:
1161 return None
1162
1163 num_str = m.group('num').replace(',', '.')
1164 mult = _UNIT_TABLE[m.group('unit')]
1165 return int(float(num_str) * mult)
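
# Illustrative usage sketch (editor's addition, never called): binary suffixes use 1024,
# decimal suffixes use 1000, and ',' is accepted as a decimal separator.
def _parse_filesize_example():
    assert parse_filesize('5 MiB') == 5 * 1024 ** 2
    assert parse_filesize('1,5GB') == 1500000000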
1166
1167
1168 def get_term_width():
1169 columns = compat_getenv('COLUMNS', None)
1170 if columns:
1171 return int(columns)
1172
1173 try:
1174 sp = subprocess.Popen(
1175 ['stty', 'size'],
1176 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1177 out, err = sp.communicate()
1178 return int(out.split()[1])
1179 except:
1180 pass
1181 return None
1182
1183
1184 def month_by_name(name):
1185 """ Return the number of a month by (locale-independently) English name """
1186
1187 ENGLISH_NAMES = [
1188 'January', 'February', 'March', 'April', 'May', 'June',
1189 'July', 'August', 'September', 'October', 'November', 'December']
1190 try:
1191 return ENGLISH_NAMES.index(name) + 1
1192 except ValueError:
1193 return None
1194
1195
1196 def fix_xml_ampersands(xml_str):
1197 """Replace all the '&' by '&amp;' in XML"""
1198 return re.sub(
1199 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1200 '&amp;',
1201 xml_str)
1202
1203
1204 def setproctitle(title):
1205 assert isinstance(title, compat_str)
1206 try:
1207 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1208 except OSError:
1209 return
1210 title_bytes = title.encode('utf-8')
1211 buf = ctypes.create_string_buffer(len(title_bytes))
1212 buf.value = title_bytes
1213 try:
1214 libc.prctl(15, buf, 0, 0, 0)
1215 except AttributeError:
1216 return # Strange libc, just skip this
1217
1218
1219 def remove_start(s, start):
1220 if s.startswith(start):
1221 return s[len(start):]
1222 return s
1223
1224
1225 def remove_end(s, end):
1226 if s.endswith(end):
1227 return s[:-len(end)]
1228 return s
1229
1230
1231 def url_basename(url):
1232 path = compat_urlparse.urlparse(url).path
1233 return path.strip('/').split('/')[-1]
1234
1235
1236 class HEADRequest(compat_urllib_request.Request):
1237 def get_method(self):
1238 return "HEAD"
1239
1240
1241 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1242 if get_attr:
1243 if v is not None:
1244 v = getattr(v, get_attr, None)
1245 if v == '':
1246 v = None
1247 return default if v is None else (int(v) * invscale // scale)
1248
1249
1250 def str_or_none(v, default=None):
1251 return default if v is None else compat_str(v)
1252
1253
1254 def str_to_int(int_str):
1255 """ A more relaxed version of int_or_none """
1256 if int_str is None:
1257 return None
1258 int_str = re.sub(r'[,\.\+]', '', int_str)
1259 return int(int_str)
1260
1261
1262 def float_or_none(v, scale=1, invscale=1, default=None):
1263 return default if v is None else (float(v) * invscale / scale)
1264
1265
1266 def parse_duration(s):
1267 if not isinstance(s, compat_basestring):
1268 return None
1269
1270 s = s.strip()
1271
1272 m = re.match(
1273 r'''(?ix)(?:P?T)?
1274 (?:
1275 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1276 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1277
1278 (?:
1279 (?:
1280 (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1281 (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1282 )?
1283 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1284 )?
1285 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1286 )$''', s)
1287 if not m:
1288 return None
1289 res = 0
1290 if m.group('only_mins'):
1291 return float_or_none(m.group('only_mins'), invscale=60)
1292 if m.group('only_hours'):
1293 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1294 if m.group('secs'):
1295 res += int(m.group('secs'))
1296 if m.group('mins'):
1297 res += int(m.group('mins')) * 60
1298 if m.group('hours'):
1299 res += int(m.group('hours')) * 60 * 60
1300 if m.group('days'):
1301 res += int(m.group('days')) * 24 * 60 * 60
1302 if m.group('ms'):
1303 res += float(m.group('ms'))
1304 return res
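
# Illustrative usage sketch (editor's addition, never called): both colon-separated and
# "<number> <unit>" forms are normalized to seconds.
def _parse_duration_example():
    assert parse_duration('1:02:03') == 3723
    assert parse_duration('90 min') == 5400.0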
1305
1306
1307 def prepend_extension(filename, ext):
1308 name, real_ext = os.path.splitext(filename)
1309 return '{0}.{1}{2}'.format(name, ext, real_ext)
1310
1311
1312 def check_executable(exe, args=[]):
1313 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1314 args can be a list of arguments for a short output (like -version) """
1315 try:
1316 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1317 except OSError:
1318 return False
1319 return exe
1320
1321
1322 def get_exe_version(exe, args=['--version'],
1323 version_re=None, unrecognized='present'):
1324 """ Returns the version of the specified executable,
1325 or False if the executable is not present """
1326 try:
1327 out, _ = subprocess.Popen(
1328 [exe] + args,
1329 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1330 except OSError:
1331 return False
1332 if isinstance(out, bytes): # Python 2.x
1333 out = out.decode('ascii', 'ignore')
1334 return detect_exe_version(out, version_re, unrecognized)
1335
1336
1337 def detect_exe_version(output, version_re=None, unrecognized='present'):
1338 assert isinstance(output, compat_str)
1339 if version_re is None:
1340 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1341 m = re.search(version_re, output)
1342 if m:
1343 return m.group(1)
1344 else:
1345 return unrecognized
1346
1347
1348 class PagedList(object):
1349 def __len__(self):
1350 # This is only useful for tests
1351 return len(self.getslice())
1352
1353
1354 class OnDemandPagedList(PagedList):
1355 def __init__(self, pagefunc, pagesize):
1356 self._pagefunc = pagefunc
1357 self._pagesize = pagesize
1358
1359 def getslice(self, start=0, end=None):
1360 res = []
1361 for pagenum in itertools.count(start // self._pagesize):
1362 firstid = pagenum * self._pagesize
1363 nextfirstid = pagenum * self._pagesize + self._pagesize
1364 if start >= nextfirstid:
1365 continue
1366
1367 page_results = list(self._pagefunc(pagenum))
1368
1369 startv = (
1370 start % self._pagesize
1371 if firstid <= start < nextfirstid
1372 else 0)
1373
1374 endv = (
1375 ((end - 1) % self._pagesize) + 1
1376 if (end is not None and firstid <= end <= nextfirstid)
1377 else None)
1378
1379 if startv != 0 or endv is not None:
1380 page_results = page_results[startv:endv]
1381 res.extend(page_results)
1382
1383 # A little optimization - if the current page is not "full", i.e. does
1384 # not contain page_size videos, then we can assume that this page
1385 # is the last one - there are no more ids on further pages,
1386 # i.e. no need to query again.
1387 if len(page_results) + startv < self._pagesize:
1388 break
1389
1390 # If we got the whole page, but the next page is not interesting,
1391 # break out early as well
1392 if end == nextfirstid:
1393 break
1394 return res
1395
1396
1397 class InAdvancePagedList(PagedList):
1398 def __init__(self, pagefunc, pagecount, pagesize):
1399 self._pagefunc = pagefunc
1400 self._pagecount = pagecount
1401 self._pagesize = pagesize
1402
1403 def getslice(self, start=0, end=None):
1404 res = []
1405 start_page = start // self._pagesize
1406 end_page = (
1407 self._pagecount if end is None else (end // self._pagesize + 1))
1408 skip_elems = start - start_page * self._pagesize
1409 only_more = None if end is None else end - start
1410 for pagenum in range(start_page, end_page):
1411 page = list(self._pagefunc(pagenum))
1412 if skip_elems:
1413 page = page[skip_elems:]
1414 skip_elems = None
1415 if only_more is not None:
1416 if len(page) < only_more:
1417 only_more -= len(page)
1418 else:
1419 page = page[:only_more]
1420 res.extend(page)
1421 break
1422 res.extend(page)
1423 return res
1424
1425
1426 def uppercase_escape(s):
1427 unicode_escape = codecs.getdecoder('unicode_escape')
1428 return re.sub(
1429 r'\\U[0-9a-fA-F]{8}',
1430 lambda m: unicode_escape(m.group(0))[0],
1431 s)
1432
1433
1434 def escape_rfc3986(s):
1435 """Escape non-ASCII characters as suggested by RFC 3986"""
1436 if sys.version_info < (3, 0) and isinstance(s, compat_str):
1437 s = s.encode('utf-8')
1438 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1439
1440
1441 def escape_url(url):
1442 """Escape URL as suggested by RFC 3986"""
1443 url_parsed = compat_urllib_parse_urlparse(url)
1444 return url_parsed._replace(
1445 path=escape_rfc3986(url_parsed.path),
1446 params=escape_rfc3986(url_parsed.params),
1447 query=escape_rfc3986(url_parsed.query),
1448 fragment=escape_rfc3986(url_parsed.fragment)
1449 ).geturl()
1450
1451 try:
1452 struct.pack('!I', 0)
1453 except TypeError:
1454 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1455 def struct_pack(spec, *args):
1456 if isinstance(spec, compat_str):
1457 spec = spec.encode('ascii')
1458 return struct.pack(spec, *args)
1459
1460 def struct_unpack(spec, *args):
1461 if isinstance(spec, compat_str):
1462 spec = spec.encode('ascii')
1463 return struct.unpack(spec, *args)
1464 else:
1465 struct_pack = struct.pack
1466 struct_unpack = struct.unpack
1467
1468
1469 def read_batch_urls(batch_fd):
1470 def fixup(url):
1471 if not isinstance(url, compat_str):
1472 url = url.decode('utf-8', 'replace')
1473 BOM_UTF8 = '\xef\xbb\xbf'
1474 if url.startswith(BOM_UTF8):
1475 url = url[len(BOM_UTF8):]
1476 url = url.strip()
1477 if url.startswith(('#', ';', ']')):
1478 return False
1479 return url
1480
1481 with contextlib.closing(batch_fd) as fd:
1482 return [url for url in map(fixup, fd) if url]
1483
1484
1485 def urlencode_postdata(*args, **kargs):
1486 return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1487
1488
1489 try:
1490 etree_iter = xml.etree.ElementTree.Element.iter
1491 except AttributeError: # Python <=2.6
1492 etree_iter = lambda n: n.findall('.//*')
1493
1494
1495 def parse_xml(s):
1496 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1497 def doctype(self, name, pubid, system):
1498 pass # Ignore doctypes
1499
1500 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1501 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1502 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1503 # Fix up XML parser in Python 2.x
1504 if sys.version_info < (3, 0):
1505 for n in etree_iter(tree):
1506 if n.text is not None:
1507 if not isinstance(n.text, compat_str):
1508 n.text = n.text.decode('utf-8')
1509 return tree
1510
1511
1512 US_RATINGS = {
1513 'G': 0,
1514 'PG': 10,
1515 'PG-13': 13,
1516 'R': 16,
1517 'NC': 18,
1518 }
1519
1520
1521 def parse_age_limit(s):
1522 if s is None:
1523 return None
1524 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1525 return int(m.group('age')) if m else US_RATINGS.get(s, None)
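
# Illustrative usage sketch (editor's addition, never called): bare "<age>+" strings and
# US MPAA-style ratings both map to a numeric age limit.
def _parse_age_limit_example():
    assert parse_age_limit('18+') == 18
    assert parse_age_limit('PG-13') == 13
    assert parse_age_limit(None) is None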
1526
1527
1528 def strip_jsonp(code):
1529 return re.sub(
1530 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1531
1532
1533 def js_to_json(code):
1534 def fix_kv(m):
1535 v = m.group(0)
1536 if v in ('true', 'false', 'null'):
1537 return v
1538 if v.startswith('"'):
1539 return v
1540 if v.startswith("'"):
1541 v = v[1:-1]
1542 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1543 '\\\\': '\\\\',
1544 "\\'": "'",
1545 '"': '\\"',
1546 }[m.group(0)], v)
1547 return '"%s"' % v
1548
1549 res = re.sub(r'''(?x)
1550 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1551 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1552 [a-zA-Z_][.a-zA-Z_0-9]*
1553 ''', fix_kv, code)
1554 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1555 return res
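
# Illustrative usage sketch (editor's addition, never called): unquoted keys and
# single-quoted strings become valid JSON; trailing commas in arrays are dropped.
def _js_to_json_example():
    assert json.loads(js_to_json("{abc: 'def'}")) == {'abc': 'def'}
    assert json.loads(js_to_json('[1, 2,]')) == [1, 2]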
1556
1557
1558 def qualities(quality_ids):
1559 """ Get a numeric quality value out of a list of possible values """
1560 def q(qid):
1561 try:
1562 return quality_ids.index(qid)
1563 except ValueError:
1564 return -1
1565 return q
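
# Illustrative usage sketch (editor's addition, never called): higher list position means
# higher quality; unknown identifiers sort below everything else.
def _qualities_example():
    q = qualities(['small', 'medium', 'hd720'])
    assert q('hd720') == 2
    assert q('unknown') == -1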
1566
1567
1568 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1569
1570
1571 def limit_length(s, length):
1572 """ Add ellipses to overly long strings """
1573 if s is None:
1574 return None
1575 ELLIPSES = '...'
1576 if len(s) > length:
1577 return s[:length - len(ELLIPSES)] + ELLIPSES
1578 return s
1579
1580
1581 def version_tuple(v):
1582 return tuple(int(e) for e in re.split(r'[-.]', v))
1583
1584
1585 def is_outdated_version(version, limit, assume_new=True):
1586 if not version:
1587 return not assume_new
1588 try:
1589 return version_tuple(version) < version_tuple(limit)
1590 except ValueError:
1591 return not assume_new
1592
1593
1594 def ytdl_is_updateable():
1595 """ Returns if youtube-dl can be updated with -U """
1596 from zipimport import zipimporter
1597
1598 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
1599
1600
1601 def args_to_str(args):
1602 # Get a short string representation for a subprocess command
1603 return ' '.join(shlex_quote(a) for a in args)
1604
1605
1606 def urlhandle_detect_ext(url_handle):
1607 try:
1608 url_handle.headers
1609 getheader = lambda h: url_handle.headers[h]
1610 except AttributeError: # Python < 3
1611 getheader = url_handle.info().getheader
1612
1613 cd = getheader('Content-Disposition')
1614 if cd:
1615 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1616 if m:
1617 e = determine_ext(m.group('filename'), default_ext=None)
1618 if e:
1619 return e
1620
1621 return getheader('Content-Type').split("/")[1]
1622
1623
1624 def age_restricted(content_limit, age_limit):
1625 """ Returns True iff the content should be blocked """
1626
1627 if age_limit is None: # No limit set
1628 return False
1629 if content_limit is None:
1630 return False # Content available for everyone
1631 return age_limit < content_limit
1632
1633
1634 def is_html(first_bytes):
1635 """ Detect whether a file contains HTML by examining its first bytes. """
1636
1637 BOMS = [
1638 (b'\xef\xbb\xbf', 'utf-8'),
1639 (b'\x00\x00\xfe\xff', 'utf-32-be'),
1640 (b'\xff\xfe\x00\x00', 'utf-32-le'),
1641 (b'\xff\xfe', 'utf-16-le'),
1642 (b'\xfe\xff', 'utf-16-be'),
1643 ]
1644 for bom, enc in BOMS:
1645 if first_bytes.startswith(bom):
1646 s = first_bytes[len(bom):].decode(enc, 'replace')
1647 break
1648 else:
1649 s = first_bytes.decode('utf-8', 'replace')
1650
1651 return re.match(r'^\s*<', s)
1652
1653
1654 def determine_protocol(info_dict):
1655 protocol = info_dict.get('protocol')
1656 if protocol is not None:
1657 return protocol
1658
1659 url = info_dict['url']
1660 if url.startswith('rtmp'):
1661 return 'rtmp'
1662 elif url.startswith('mms'):
1663 return 'mms'
1664 elif url.startswith('rtsp'):
1665 return 'rtsp'
1666
1667 ext = determine_ext(url)
1668 if ext == 'm3u8':
1669 return 'm3u8'
1670 elif ext == 'f4m':
1671 return 'f4m'
1672
1673 return compat_urllib_parse_urlparse(url).scheme
1674
1675
1676 def render_table(header_row, data):
1677 """ Render a list of rows, each as a list of values """
1678 table = [header_row] + data
1679 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
1680 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
1681 return '\n'.join(format_str % tuple(row) for row in table)
1682
1683
1684 def _match_one(filter_part, dct):
1685 COMPARISON_OPERATORS = {
1686 '<': operator.lt,
1687 '<=': operator.le,
1688 '>': operator.gt,
1689 '>=': operator.ge,
1690 '=': operator.eq,
1691 '!=': operator.ne,
1692 }
1693 operator_rex = re.compile(r'''(?x)\s*
1694 (?P<key>[a-z_]+)
1695 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1696 (?:
1697 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1698 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1699 )
1700 \s*$
1701 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1702 m = operator_rex.search(filter_part)
1703 if m:
1704 op = COMPARISON_OPERATORS[m.group('op')]
1705 if m.group('strval') is not None:
1706 if m.group('op') not in ('=', '!='):
1707 raise ValueError(
1708 'Operator %s does not support string values!' % m.group('op'))
1709 comparison_value = m.group('strval')
1710 else:
1711 try:
1712 comparison_value = int(m.group('intval'))
1713 except ValueError:
1714 comparison_value = parse_filesize(m.group('intval'))
1715 if comparison_value is None:
1716 comparison_value = parse_filesize(m.group('intval') + 'B')
1717 if comparison_value is None:
1718 raise ValueError(
1719 'Invalid integer value %r in filter part %r' % (
1720 m.group('intval'), filter_part))
1721 actual_value = dct.get(m.group('key'))
1722 if actual_value is None:
1723 return m.group('none_inclusive')
1724 return op(actual_value, comparison_value)
1725
1726 UNARY_OPERATORS = {
1727 '': lambda v: v is not None,
1728 '!': lambda v: v is None,
1729 }
1730 operator_rex = re.compile(r'''(?x)\s*
1731 (?P<op>%s)\s*(?P<key>[a-z_]+)
1732 \s*$
1733 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1734 m = operator_rex.search(filter_part)
1735 if m:
1736 op = UNARY_OPERATORS[m.group('op')]
1737 actual_value = dct.get(m.group('key'))
1738 return op(actual_value)
1739
1740 raise ValueError('Invalid filter part %r' % filter_part)
1741
1742
1743 def match_str(filter_str, dct):
1744 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
1745
1746 return all(
1747 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
1748
1749
1750 def match_filter_func(filter_str):
1751 def _match_func(info_dict):
1752 if match_str(filter_str, info_dict):
1753 return None
1754 else:
1755 video_title = info_dict.get('title', info_dict.get('id', 'video'))
1756 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
1757 return _match_func
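
# Illustrative usage sketch (editor's addition, never called): the generic video filter
# syntax used by _match_one/match_str above; '<?' also passes when the field is missing.
def _match_filter_example():
    video = {'title': 'demo', 'like_count': 190, 'dislike_count': 10}
    assert match_str('like_count > 100 & dislike_count <? 50', video) is True
    assert match_str('dislike_count <? 50', {'title': 'demo'}) is True
    reject = match_filter_func('like_count > 100')
    assert reject(video) is None                       # None means the video passes
    assert 'does not pass filter' in reject({'title': 'demo', 'like_count': 5})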