1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import calendar
7 import codecs
8 import contextlib
9 import ctypes
10 import datetime
11 import email.utils
12 import errno
13 import gzip
14 import itertools
15 import io
16 import json
17 import locale
18 import math
19 import os
20 import pipes
21 import platform
22 import re
23 import ssl
24 import socket
25 import struct
26 import subprocess
27 import sys
28 import tempfile
29 import traceback
30 import xml.etree.ElementTree
31 import zlib
32
33 from .compat import (
34 compat_chr,
35 compat_getenv,
36 compat_html_entities,
37 compat_parse_qs,
38 compat_str,
39 compat_urllib_error,
40 compat_urllib_parse,
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
43 compat_urlparse,
44 shlex_quote,
45 )
46
47
48 # This is not clearly defined otherwise
49 compiled_regex_type = type(re.compile(''))
50
51 std_headers = {
52 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
53 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
54 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
55 'Accept-Encoding': 'gzip, deflate',
56 'Accept-Language': 'en-us,en;q=0.5',
57 }
58
59
60 def preferredencoding():
61 """Get preferred encoding.
62
63 Returns the best encoding scheme for the system, based on
64 locale.getpreferredencoding() and some further tweaks.
65 """
66 try:
67 pref = locale.getpreferredencoding()
68 'TEST'.encode(pref)
    except Exception:
70 pref = 'UTF-8'
71
72 return pref
73
74
75 def write_json_file(obj, fn):
76 """ Encode obj as JSON and write it to fn, atomically if possible """
77
78 fn = encodeFilename(fn)
79 if sys.version_info < (3, 0) and sys.platform != 'win32':
80 encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non-ASCII characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
87 else:
88 path_basename = os.path.basename
89 path_dirname = os.path.dirname
90
91 args = {
92 'suffix': '.tmp',
93 'prefix': path_basename(fn) + '.',
94 'dir': path_dirname(fn),
95 'delete': False,
96 }
97
98 # In Python 2.x, json.dump expects a bytestream.
99 # In Python 3.x, it writes to a character stream
100 if sys.version_info < (3, 0):
101 args['mode'] = 'wb'
102 else:
103 args.update({
104 'mode': 'w',
105 'encoding': 'utf-8',
106 })
107
108 tf = tempfile.NamedTemporaryFile(**args)
109
110 try:
111 with tf:
112 json.dump(obj, tf)
113 if sys.platform == 'win32':
114 # Need to remove existing file on Windows, else os.rename raises
115 # WindowsError or FileExistsError.
116 try:
117 os.unlink(fn)
118 except OSError:
119 pass
120 os.rename(tf.name, fn)
121 except:
122 try:
123 os.remove(tf.name)
124 except OSError:
125 pass
126 raise
127
128
129 if sys.version_info >= (2, 7):
130 def find_xpath_attr(node, xpath, key, val):
131 """ Find the xpath xpath[@key=val] """
132 assert re.match(r'^[a-zA-Z-]+$', key)
133 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
134 expr = xpath + "[@%s='%s']" % (key, val)
135 return node.find(expr)
136 else:
137 def find_xpath_attr(node, xpath, key, val):
138 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
139 # .//node does not match if a node is a direct child of . !
140 if isinstance(xpath, unicode):
141 xpath = xpath.encode('ascii')
142
143 for f in node.findall(xpath):
144 if f.attrib.get(key) == val:
145 return f
146 return None
147
148 # On python2.6 the xml.etree.ElementTree.Element methods don't support
149 # the namespace parameter
150
151
152 def xpath_with_ns(path, ns_map):
153 components = [c.split(':') for c in path.split('/')]
154 replaced = []
155 for c in components:
156 if len(c) == 1:
157 replaced.append(c[0])
158 else:
159 ns, tag = c
160 replaced.append('{%s}%s' % (ns_map[ns], tag))
161 return '/'.join(replaced)
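# Illustrative usage (comment only, with a made-up namespace mapping):
#   xpath_with_ns('media:song/url', {'media': 'http://example.com/ns'})
#   returns '{http://example.com/ns}song/url'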
162
163
164 def xpath_text(node, xpath, name=None, fatal=False):
165 if sys.version_info < (2, 7): # Crazy 2.6
166 xpath = xpath.encode('ascii')
167
168 n = node.find(xpath)
169 if n is None or n.text is None:
170 if fatal:
171 name = xpath if name is None else name
172 raise ExtractorError('Could not find XML element %s' % name)
173 else:
174 return None
175 return n.text
176
177
178 def get_element_by_id(id, html):
179 """Return the content of the tag with the specified ID in the passed HTML document"""
180 return get_element_by_attribute("id", id, html)
181
182
183 def get_element_by_attribute(attribute, value, html):
184 """Return the content of the tag with the specified attribute in the passed HTML document"""
185
186 m = re.search(r'''(?xs)
187 <([a-zA-Z0-9:._-]+)
188 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
189 \s+%s=['"]?%s['"]?
190 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
191 \s*>
192 (?P<content>.*?)
193 </\1>
194 ''' % (re.escape(attribute), re.escape(value)), html)
195
196 if not m:
197 return None
198 res = m.group('content')
199
200 if res.startswith('"') or res.startswith("'"):
201 res = res[1:-1]
202
203 return unescapeHTML(res)
204
205
206 def clean_html(html):
207 """Clean an HTML snippet into a readable string"""
208
209 if html is None: # Convenience for sanitizing descriptions etc.
210 return html
211
212 # Newline vs <br />
213 html = html.replace('\n', ' ')
214 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
215 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
216 # Strip html tags
217 html = re.sub('<.*?>', '', html)
218 # Replace html entities
219 html = unescapeHTML(html)
220 return html.strip()
221
222
223 def sanitize_open(filename, open_mode):
224 """Try to open the given filename, and slightly tweak it if this fails.
225
226 Attempts to open the given filename. If this fails, it tries to change
227 the filename slightly, step by step, until it's either able to open it
228 or it fails and raises a final exception, like the standard open()
229 function.
230
231 It returns the tuple (stream, definitive_file_name).
232 """
233 try:
234 if filename == '-':
235 if sys.platform == 'win32':
236 import msvcrt
237 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
238 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
239 stream = open(encodeFilename(filename), open_mode)
240 return (stream, filename)
241 except (IOError, OSError) as err:
242 if err.errno in (errno.EACCES,):
243 raise
244
        # In case of error, try to remove win32 forbidden chars
        alt_filename = os.path.join(*(
            re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
            for path_part in os.path.split(filename)))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
256
257
258 def timeconvert(timestr):
259 """Convert RFC 2822 defined time string into system timestamp"""
260 timestamp = None
261 timetuple = email.utils.parsedate_tz(timestr)
262 if timetuple is not None:
263 timestamp = email.utils.mktime_tz(timetuple)
264 return timestamp
265
266
267 def sanitize_filename(s, restricted=False, is_id=False):
268 """Sanitizes a string so it could be used as part of a filename.
269 If restricted is set, use a stricter subset of allowed characters.
270 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
271 """
272 def replace_insane(char):
273 if char == '?' or ord(char) < 32 or ord(char) == 127:
274 return ''
275 elif char == '"':
276 return '' if restricted else '\''
277 elif char == ':':
278 return '_-' if restricted else ' -'
279 elif char in '\\/|*<>':
280 return '_'
281 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
282 return '_'
283 if restricted and ord(char) > 127:
284 return '_'
285 return char
286
287 result = ''.join(map(replace_insane, s))
288 if not is_id:
289 while '__' in result:
290 result = result.replace('__', '_')
291 result = result.strip('_')
292 # Common case of "Foreign band name - English song title"
293 if restricted and result.startswith('-_'):
294 result = result[2:]
295 if not result:
296 result = '_'
297 return result
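# Illustrative behaviour (comment only; results follow from replace_insane above):
#   sanitize_filename('AC/DC: Back in Black?')                  -> 'AC_DC - Back in Black'
#   sanitize_filename('AC/DC: Back in Black?', restricted=True) -> 'AC_DC_-_Back_in_Black'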
298
299
300 def orderedSet(iterable):
301 """ Remove all duplicates from the input iterable """
302 res = []
303 for el in iterable:
304 if el not in res:
305 res.append(el)
306 return res
307
308
309 def _htmlentity_transform(entity):
310 """Transforms an HTML entity to a character."""
311 # Known non-numeric HTML entity
312 if entity in compat_html_entities.name2codepoint:
313 return compat_chr(compat_html_entities.name2codepoint[entity])
314
    # Hexadecimal entities (e.g. &#x2F;) may also contain the digits a-f
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
316 if mobj is not None:
317 numstr = mobj.group(1)
318 if numstr.startswith('x'):
319 base = 16
320 numstr = '0%s' % numstr
321 else:
322 base = 10
323 return compat_chr(int(numstr, base))
324
325 # Unknown entity in name, return its literal representation
326 return ('&%s;' % entity)
327
328
329 def unescapeHTML(s):
330 if s is None:
331 return None
332 assert type(s) == compat_str
333
334 return re.sub(
335 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
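# Illustrative examples (comment only):
#   unescapeHTML('&amp;')  -> '&'
#   unescapeHTML('&#39;')  -> "'"
#   unescapeHTML('&#x27;') -> "'"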
336
337
338 def encodeFilename(s, for_subprocess=False):
339 """
340 @param s The name of the file
341 """
342
343 assert type(s) == compat_str
344
345 # Python 3 has a Unicode API
346 if sys.version_info >= (3, 0):
347 return s
348
349 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
350 # Pass '' directly to use Unicode APIs on Windows 2000 and up
351 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
352 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
353 if not for_subprocess:
354 return s
355 else:
356 # For subprocess calls, encode with locale encoding
357 # Refer to http://stackoverflow.com/a/9951851/35070
358 encoding = preferredencoding()
359 else:
360 encoding = sys.getfilesystemencoding()
361 if encoding is None:
362 encoding = 'utf-8'
363 return s.encode(encoding, 'ignore')
364
365
366 def encodeArgument(s):
367 if not isinstance(s, compat_str):
368 # Legacy code that uses byte strings
369 # Uncomment the following line after fixing all post processors
370 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
371 s = s.decode('ascii')
372 return encodeFilename(s, True)
373
374
375 def decodeOption(optval):
376 if optval is None:
377 return optval
378 if isinstance(optval, bytes):
379 optval = optval.decode(preferredencoding())
380
381 assert isinstance(optval, compat_str)
382 return optval
383
384
385 def formatSeconds(secs):
386 if secs > 3600:
387 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
388 elif secs > 60:
389 return '%d:%02d' % (secs // 60, secs % 60)
390 else:
391 return '%d' % secs
392
393
394 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
395 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
396 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
397 if opts_no_check_certificate:
398 context.verify_mode = ssl.CERT_NONE
399 try:
400 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
401 except TypeError:
402 # Python 2.7.8
403 # (create_default_context present but HTTPSHandler has no context=)
404 pass
405
406 if sys.version_info < (3, 2):
407 import httplib
408
409 class HTTPSConnectionV3(httplib.HTTPSConnection):
410 def __init__(self, *args, **kwargs):
411 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
412
413 def connect(self):
414 sock = socket.create_connection((self.host, self.port), self.timeout)
415 if getattr(self, '_tunnel_host', False):
416 self.sock = sock
417 self._tunnel()
418 try:
419 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
420 except ssl.SSLError:
421 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
422
423 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
424 def https_open(self, req):
425 return self.do_open(HTTPSConnectionV3, req)
426 return HTTPSHandlerV3(**kwargs)
427 else: # Python < 3.4
428 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
429 context.verify_mode = (ssl.CERT_NONE
430 if opts_no_check_certificate
431 else ssl.CERT_REQUIRED)
432 context.set_default_verify_paths()
433 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
434
435
436 class ExtractorError(Exception):
437 """Error during info extraction."""
438
439 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
440 """ tb, if given, is the original traceback (so that it can be printed out).
441 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
442 """
443
444 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
445 expected = True
446 if video_id is not None:
447 msg = video_id + ': ' + msg
448 if cause:
449 msg += ' (caused by %r)' % cause
450 if not expected:
451 if ytdl_is_updateable():
452 update_cmd = 'type youtube-dl -U to update'
453 else:
454 update_cmd = 'see https://yt-dl.org/update on how to update'
455 msg += '; please report this issue on https://yt-dl.org/bug .'
456 msg += ' Make sure you are using the latest version; %s.' % update_cmd
457 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
458 super(ExtractorError, self).__init__(msg)
459
460 self.traceback = tb
461 self.exc_info = sys.exc_info() # preserve original exception
462 self.cause = cause
463 self.video_id = video_id
464
465 def format_traceback(self):
466 if self.traceback is None:
467 return None
468 return ''.join(traceback.format_tb(self.traceback))
469
470
471 class UnsupportedError(ExtractorError):
472 def __init__(self, url):
473 super(UnsupportedError, self).__init__(
474 'Unsupported URL: %s' % url, expected=True)
475 self.url = url
476
477
478 class RegexNotFoundError(ExtractorError):
479 """Error when a regex didn't match"""
480 pass
481
482
483 class DownloadError(Exception):
484 """Download Error exception.
485
486 This exception may be thrown by FileDownloader objects if they are not
487 configured to continue on errors. They will contain the appropriate
488 error message.
489 """
490
491 def __init__(self, msg, exc_info=None):
492 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
493 super(DownloadError, self).__init__(msg)
494 self.exc_info = exc_info
495
496
497 class SameFileError(Exception):
498 """Same File exception.
499
500 This exception will be thrown by FileDownloader objects if they detect
501 multiple files would have to be downloaded to the same file on disk.
502 """
503 pass
504
505
506 class PostProcessingError(Exception):
507 """Post Processing exception.
508
509 This exception may be raised by PostProcessor's .run() method to
510 indicate an error in the postprocessing task.
511 """
512
513 def __init__(self, msg):
514 self.msg = msg
515
516
517 class MaxDownloadsReached(Exception):
518 """ --max-downloads limit has been reached. """
519 pass
520
521
522 class UnavailableVideoError(Exception):
523 """Unavailable Format exception.
524
525 This exception will be thrown when a video is requested
526 in a format that is not available for that video.
527 """
528 pass
529
530
531 class ContentTooShortError(Exception):
532 """Content Too Short exception.
533
534 This exception may be raised by FileDownloader objects when a file they
535 download is too small for what the server announced first, indicating
536 the connection was probably interrupted.
537 """
538 # Both in bytes
539 downloaded = None
540 expected = None
541
542 def __init__(self, downloaded, expected):
543 self.downloaded = downloaded
544 self.expected = expected
545
546
547 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
548 """Handler for HTTP requests and responses.
549
550 This class, when installed with an OpenerDirector, automatically adds
551 the standard headers to every HTTP request and handles gzipped and
552 deflated responses from web servers. If compression is to be avoided in
553 a particular request, the original request in the program code only has
554 to include the HTTP header "Youtubedl-No-Compression", which will be
555 removed before making the real request.
556
557 Part of this code was copied from:
558
559 http://techknack.net/python-urllib2-handlers/
560
561 Andrew Rowls, the author of that code, agreed to release it to the
562 public domain.
563 """
564
565 @staticmethod
566 def deflate(data):
567 try:
568 return zlib.decompress(data, -zlib.MAX_WBITS)
569 except zlib.error:
570 return zlib.decompress(data)
571
572 @staticmethod
573 def addinfourl_wrapper(stream, headers, url, code):
574 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
575 return compat_urllib_request.addinfourl(stream, headers, url, code)
576 ret = compat_urllib_request.addinfourl(stream, headers, url)
577 ret.code = code
578 return ret
579
580 def http_request(self, req):
581 for h, v in std_headers.items():
582 if h not in req.headers:
583 req.add_header(h, v)
584 if 'Youtubedl-no-compression' in req.headers:
585 if 'Accept-encoding' in req.headers:
586 del req.headers['Accept-encoding']
587 del req.headers['Youtubedl-no-compression']
588 if 'Youtubedl-user-agent' in req.headers:
589 if 'User-agent' in req.headers:
590 del req.headers['User-agent']
591 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
592 del req.headers['Youtubedl-user-agent']
593
594 if sys.version_info < (2, 7) and '#' in req.get_full_url():
595 # Python 2.6 is brain-dead when it comes to fragments
596 req._Request__original = req._Request__original.partition('#')[0]
597 req._Request__r_type = req._Request__r_type.partition('#')[0]
598
599 return req
600
601 def http_response(self, req, resp):
602 old_resp = resp
603 # gzip
604 if resp.headers.get('Content-encoding', '') == 'gzip':
605 content = resp.read()
606 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
607 try:
608 uncompressed = io.BytesIO(gz.read())
609 except IOError as original_ioerror:
                # There may be junk at the end of the file
611 # See http://stackoverflow.com/q/4928560/35070 for details
612 for i in range(1, 1024):
613 try:
614 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
615 uncompressed = io.BytesIO(gz.read())
616 except IOError:
617 continue
618 break
619 else:
620 raise original_ioerror
621 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
622 resp.msg = old_resp.msg
623 # deflate
624 if resp.headers.get('Content-encoding', '') == 'deflate':
625 gz = io.BytesIO(self.deflate(resp.read()))
626 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
627 resp.msg = old_resp.msg
628 return resp
629
630 https_request = http_request
631 https_response = http_response
632
633
634 def parse_iso8601(date_str, delimiter='T'):
635 """ Return a UNIX timestamp from the given date """
636
637 if date_str is None:
638 return None
639
640 m = re.search(
641 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
642 date_str)
643 if not m:
644 timezone = datetime.timedelta()
645 else:
646 date_str = date_str[:-len(m.group(0))]
647 if not m.group('sign'):
648 timezone = datetime.timedelta()
649 else:
650 sign = 1 if m.group('sign') == '+' else -1
651 timezone = datetime.timedelta(
652 hours=sign * int(m.group('hours')),
653 minutes=sign * int(m.group('minutes')))
654 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
655 dt = datetime.datetime.strptime(date_str, date_format) - timezone
656 return calendar.timegm(dt.timetuple())
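# Illustrative example (comment only): the UTC offset is applied before conversion,
#   parse_iso8601('2014-12-17T08:30:00+02:00') -> 1418797800  (i.e. 06:30 UTC)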
657
658
659 def unified_strdate(date_str, day_first=True):
660 """Return a string with the date in the format YYYYMMDD"""
661
662 if date_str is None:
663 return None
664 upload_date = None
665 # Replace commas
666 date_str = date_str.replace(',', ' ')
667 # %z (UTC offset) is only supported in python>=3.2
668 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
669 # Remove AM/PM + timezone
670 date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
671
672 format_expressions = [
673 '%d %B %Y',
674 '%d %b %Y',
675 '%B %d %Y',
676 '%b %d %Y',
677 '%b %dst %Y %I:%M%p',
678 '%b %dnd %Y %I:%M%p',
679 '%b %dth %Y %I:%M%p',
680 '%Y-%m-%d',
681 '%Y/%m/%d',
682 '%d.%m.%Y',
683 '%d/%m/%Y',
684 '%d/%m/%y',
685 '%Y/%m/%d %H:%M:%S',
686 '%Y-%m-%d %H:%M:%S',
687 '%Y-%m-%d %H:%M:%S.%f',
688 '%d.%m.%Y %H:%M',
689 '%d.%m.%Y %H.%M',
690 '%Y-%m-%dT%H:%M:%SZ',
691 '%Y-%m-%dT%H:%M:%S.%fZ',
692 '%Y-%m-%dT%H:%M:%S.%f0Z',
693 '%Y-%m-%dT%H:%M:%S',
694 '%Y-%m-%dT%H:%M:%S.%f',
695 '%Y-%m-%dT%H:%M',
696 ]
697 if day_first:
698 format_expressions.extend([
699 '%d/%m/%Y %H:%M:%S',
700 ])
701 else:
702 format_expressions.extend([
703 '%m/%d/%Y %H:%M:%S',
704 ])
705 for expression in format_expressions:
706 try:
707 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
708 except ValueError:
709 pass
710 if upload_date is None:
711 timetuple = email.utils.parsedate_tz(date_str)
712 if timetuple:
713 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
714 return upload_date
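# Illustrative examples (comment only):
#   unified_strdate('December 21, 2010') -> '20101221'
#   unified_strdate('8/7/2009')          -> '20090708'   (day-first by default)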
715
716
717 def determine_ext(url, default_ext='unknown_video'):
718 if url is None:
719 return default_ext
720 guess = url.partition('?')[0].rpartition('.')[2]
721 if re.match(r'^[A-Za-z0-9]+$', guess):
722 return guess
723 else:
724 return default_ext
725
726
727 def subtitles_filename(filename, sub_lang, sub_format):
728 return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
729
730
731 def date_from_str(date_str):
732 """
733 Return a datetime object from a string in the format YYYYMMDD or
734 (now|today)[+-][0-9](day|week|month|year)(s)?"""
735 today = datetime.date.today()
736 if date_str in ('now', 'today'):
737 return today
738 if date_str == 'yesterday':
739 return today - datetime.timedelta(days=1)
740 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
741 if match is not None:
742 sign = match.group('sign')
743 time = int(match.group('time'))
744 if sign == '-':
745 time = -time
746 unit = match.group('unit')
        # A bad approximation?
748 if unit == 'month':
749 unit = 'day'
750 time *= 30
751 elif unit == 'year':
752 unit = 'day'
753 time *= 365
754 unit += 's'
755 delta = datetime.timedelta(**{unit: time})
756 return today + delta
757 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
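# Illustrative examples (comment only):
#   date_from_str('now-1week') -> today's date minus 7 days
#   date_from_str('20141221')  -> datetime.date(2014, 12, 21)
# Note that 'month' and 'year' offsets are approximated as 30 and 365 days.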
758
759
760 def hyphenate_date(date_str):
761 """
762 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
763 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
764 if match is not None:
765 return '-'.join(match.groups())
766 else:
767 return date_str
768
769
770 class DateRange(object):
771 """Represents a time interval between two dates"""
772
773 def __init__(self, start=None, end=None):
774 """start and end must be strings in the format accepted by date"""
775 if start is not None:
776 self.start = date_from_str(start)
777 else:
778 self.start = datetime.datetime.min.date()
779 if end is not None:
780 self.end = date_from_str(end)
781 else:
782 self.end = datetime.datetime.max.date()
783 if self.start > self.end:
            raise ValueError('Date range: "%s", the start date must be before the end date' % self)
785
786 @classmethod
787 def day(cls, day):
788 """Returns a range that only contains the given day"""
789 return cls(day, day)
790
791 def __contains__(self, date):
792 """Check if the date is in the range"""
793 if not isinstance(date, datetime.date):
794 date = date_from_str(date)
795 return self.start <= date <= self.end
796
797 def __str__(self):
798 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
799
800
801 def platform_name():
802 """ Returns the platform name as a compat_str """
803 res = platform.platform()
804 if isinstance(res, bytes):
805 res = res.decode(preferredencoding())
806
807 assert isinstance(res, compat_str)
808 return res
809
810
811 def _windows_write_string(s, out):
812 """ Returns True if the string was written using special methods,
813 False if it has yet to be written out."""
814 # Adapted from http://stackoverflow.com/a/3259271/35070
815
816 import ctypes
817 import ctypes.wintypes
818
819 WIN_OUTPUT_IDS = {
820 1: -11,
821 2: -12,
822 }
823
824 try:
825 fileno = out.fileno()
826 except AttributeError:
827 # If the output stream doesn't have a fileno, it's virtual
828 return False
829 if fileno not in WIN_OUTPUT_IDS:
830 return False
831
832 GetStdHandle = ctypes.WINFUNCTYPE(
833 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
834 (b"GetStdHandle", ctypes.windll.kernel32))
835 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
836
837 WriteConsoleW = ctypes.WINFUNCTYPE(
838 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
839 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
840 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
841 written = ctypes.wintypes.DWORD(0)
842
843 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
844 FILE_TYPE_CHAR = 0x0002
845 FILE_TYPE_REMOTE = 0x8000
846 GetConsoleMode = ctypes.WINFUNCTYPE(
847 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
848 ctypes.POINTER(ctypes.wintypes.DWORD))(
849 (b"GetConsoleMode", ctypes.windll.kernel32))
850 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
851
852 def not_a_console(handle):
853 if handle == INVALID_HANDLE_VALUE or handle is None:
854 return True
855 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
856 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
857
858 if not_a_console(h):
859 return False
860
861 def next_nonbmp_pos(s):
862 try:
863 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
864 except StopIteration:
865 return len(s)
866
867 while s:
868 count = min(next_nonbmp_pos(s), 1024)
869
870 ret = WriteConsoleW(
871 h, s, count if count else 2, ctypes.byref(written), None)
872 if ret == 0:
873 raise OSError('Failed to write string')
874 if not count: # We just wrote a non-BMP character
875 assert written.value == 2
876 s = s[1:]
877 else:
878 assert written.value > 0
879 s = s[written.value:]
880 return True
881
882
883 def write_string(s, out=None, encoding=None):
884 if out is None:
885 out = sys.stderr
886 assert type(s) == compat_str
887
888 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
889 if _windows_write_string(s, out):
890 return
891
892 if ('b' in getattr(out, 'mode', '') or
893 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
894 byt = s.encode(encoding or preferredencoding(), 'ignore')
895 out.write(byt)
896 elif hasattr(out, 'buffer'):
897 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
898 byt = s.encode(enc, 'ignore')
899 out.buffer.write(byt)
900 else:
901 out.write(s)
902 out.flush()
903
904
905 def bytes_to_intlist(bs):
906 if not bs:
907 return []
908 if isinstance(bs[0], int): # Python 3
909 return list(bs)
910 else:
911 return [ord(c) for c in bs]
912
913
914 def intlist_to_bytes(xs):
915 if not xs:
916 return b''
917 return struct_pack('%dB' % len(xs), *xs)
918
919
920 # Cross-platform file locking
921 if sys.platform == 'win32':
922 import ctypes.wintypes
923 import msvcrt
924
925 class OVERLAPPED(ctypes.Structure):
926 _fields_ = [
927 ('Internal', ctypes.wintypes.LPVOID),
928 ('InternalHigh', ctypes.wintypes.LPVOID),
929 ('Offset', ctypes.wintypes.DWORD),
930 ('OffsetHigh', ctypes.wintypes.DWORD),
931 ('hEvent', ctypes.wintypes.HANDLE),
932 ]
933
934 kernel32 = ctypes.windll.kernel32
935 LockFileEx = kernel32.LockFileEx
936 LockFileEx.argtypes = [
937 ctypes.wintypes.HANDLE, # hFile
938 ctypes.wintypes.DWORD, # dwFlags
939 ctypes.wintypes.DWORD, # dwReserved
940 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
941 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
942 ctypes.POINTER(OVERLAPPED) # Overlapped
943 ]
944 LockFileEx.restype = ctypes.wintypes.BOOL
945 UnlockFileEx = kernel32.UnlockFileEx
946 UnlockFileEx.argtypes = [
947 ctypes.wintypes.HANDLE, # hFile
948 ctypes.wintypes.DWORD, # dwReserved
949 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
950 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
951 ctypes.POINTER(OVERLAPPED) # Overlapped
952 ]
953 UnlockFileEx.restype = ctypes.wintypes.BOOL
954 whole_low = 0xffffffff
955 whole_high = 0x7fffffff
956
957 def _lock_file(f, exclusive):
958 overlapped = OVERLAPPED()
959 overlapped.Offset = 0
960 overlapped.OffsetHigh = 0
961 overlapped.hEvent = 0
962 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
963 handle = msvcrt.get_osfhandle(f.fileno())
964 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
965 whole_low, whole_high, f._lock_file_overlapped_p):
966 raise OSError('Locking file failed: %r' % ctypes.FormatError())
967
968 def _unlock_file(f):
969 assert f._lock_file_overlapped_p
970 handle = msvcrt.get_osfhandle(f.fileno())
971 if not UnlockFileEx(handle, 0,
972 whole_low, whole_high, f._lock_file_overlapped_p):
973 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
974
975 else:
976 import fcntl
977
978 def _lock_file(f, exclusive):
979 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
980
981 def _unlock_file(f):
982 fcntl.flock(f, fcntl.LOCK_UN)
983
984
985 class locked_file(object):
986 def __init__(self, filename, mode, encoding=None):
987 assert mode in ['r', 'a', 'w']
988 self.f = io.open(filename, mode, encoding=encoding)
989 self.mode = mode
990
991 def __enter__(self):
992 exclusive = self.mode != 'r'
993 try:
994 _lock_file(self.f, exclusive)
995 except IOError:
996 self.f.close()
997 raise
998 return self
999
1000 def __exit__(self, etype, value, traceback):
1001 try:
1002 _unlock_file(self.f)
1003 finally:
1004 self.f.close()
1005
1006 def __iter__(self):
1007 return iter(self.f)
1008
1009 def write(self, *args):
1010 return self.f.write(*args)
1011
1012 def read(self, *args):
1013 return self.f.read(*args)
1014
1015
1016 def get_filesystem_encoding():
1017 encoding = sys.getfilesystemencoding()
1018 return encoding if encoding is not None else 'utf-8'
1019
1020
1021 def shell_quote(args):
1022 quoted_args = []
1023 encoding = get_filesystem_encoding()
1024 for a in args:
1025 if isinstance(a, bytes):
1026 # We may get a filename encoded with 'encodeFilename'
1027 a = a.decode(encoding)
1028 quoted_args.append(pipes.quote(a))
1029 return ' '.join(quoted_args)
1030
1031
1032 def takewhile_inclusive(pred, seq):
1033 """ Like itertools.takewhile, but include the latest evaluated element
1034 (the first element so that Not pred(e)) """
1035 for e in seq:
1036 yield e
1037 if not pred(e):
1038 return
1039
1040
1041 def smuggle_url(url, data):
1042 """ Pass additional data in a URL for internal use. """
1043
1044 sdata = compat_urllib_parse.urlencode(
1045 {'__youtubedl_smuggle': json.dumps(data)})
1046 return url + '#' + sdata
1047
1048
1049 def unsmuggle_url(smug_url, default=None):
1050 if '#__youtubedl_smuggle' not in smug_url:
1051 return smug_url, default
1052 url, _, sdata = smug_url.rpartition('#')
1053 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1054 data = json.loads(jsond)
1055 return url, data
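# Illustrative round trip (comment only, with a made-up payload):
#   smuggled = smuggle_url('http://example.com/video', {'force_ext': 'mp4'})
#   unsmuggle_url(smuggled) -> ('http://example.com/video', {'force_ext': 'mp4'})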
1056
1057
1058 def format_bytes(bytes):
1059 if bytes is None:
1060 return 'N/A'
1061 if type(bytes) is str:
1062 bytes = float(bytes)
1063 if bytes == 0.0:
1064 exponent = 0
1065 else:
1066 exponent = int(math.log(bytes, 1024.0))
1067 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1068 converted = float(bytes) / float(1024 ** exponent)
1069 return '%.2f%s' % (converted, suffix)
1070
1071
1072 def parse_filesize(s):
1073 if s is None:
1074 return None
1075
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
1078 _UNIT_TABLE = {
1079 'B': 1,
1080 'b': 1,
1081 'KiB': 1024,
1082 'KB': 1000,
1083 'kB': 1024,
1084 'Kb': 1000,
1085 'MiB': 1024 ** 2,
1086 'MB': 1000 ** 2,
1087 'mB': 1024 ** 2,
1088 'Mb': 1000 ** 2,
1089 'GiB': 1024 ** 3,
1090 'GB': 1000 ** 3,
1091 'gB': 1024 ** 3,
1092 'Gb': 1000 ** 3,
1093 'TiB': 1024 ** 4,
1094 'TB': 1000 ** 4,
1095 'tB': 1024 ** 4,
1096 'Tb': 1000 ** 4,
1097 'PiB': 1024 ** 5,
1098 'PB': 1000 ** 5,
1099 'pB': 1024 ** 5,
1100 'Pb': 1000 ** 5,
1101 'EiB': 1024 ** 6,
1102 'EB': 1000 ** 6,
1103 'eB': 1024 ** 6,
1104 'Eb': 1000 ** 6,
1105 'ZiB': 1024 ** 7,
1106 'ZB': 1000 ** 7,
1107 'zB': 1024 ** 7,
1108 'Zb': 1000 ** 7,
1109 'YiB': 1024 ** 8,
1110 'YB': 1000 ** 8,
1111 'yB': 1024 ** 8,
1112 'Yb': 1000 ** 8,
1113 }
1114
1115 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1116 m = re.match(
1117 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1118 if not m:
1119 return None
1120
1121 num_str = m.group('num').replace(',', '.')
1122 mult = _UNIT_TABLE[m.group('unit')]
1123 return int(float(num_str) * mult)
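# Illustrative examples (comment only):
#   parse_filesize('12 MiB') -> 12582912    (12 * 1024 ** 2)
#   parse_filesize('5 GB')   -> 5000000000  (5 * 1000 ** 3)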
1124
1125
1126 def get_term_width():
1127 columns = compat_getenv('COLUMNS', None)
1128 if columns:
1129 return int(columns)
1130
1131 try:
1132 sp = subprocess.Popen(
1133 ['stty', 'size'],
1134 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1135 out, err = sp.communicate()
1136 return int(out.split()[1])
    except Exception:
1138 pass
1139 return None
1140
1141
1142 def month_by_name(name):
1143 """ Return the number of a month by (locale-independently) English name """
1144
1145 ENGLISH_NAMES = [
1146 'January', 'February', 'March', 'April', 'May', 'June',
1147 'July', 'August', 'September', 'October', 'November', 'December']
1148 try:
1149 return ENGLISH_NAMES.index(name) + 1
1150 except ValueError:
1151 return None
1152
1153
1154 def fix_xml_ampersands(xml_str):
1155 """Replace all the '&' by '&amp;' in XML"""
1156 return re.sub(
1157 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1158 '&amp;',
1159 xml_str)
1160
1161
1162 def setproctitle(title):
1163 assert isinstance(title, compat_str)
1164 try:
1165 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1166 except OSError:
1167 return
1168 title_bytes = title.encode('utf-8')
1169 buf = ctypes.create_string_buffer(len(title_bytes))
1170 buf.value = title_bytes
1171 try:
1172 libc.prctl(15, buf, 0, 0, 0)
1173 except AttributeError:
1174 return # Strange libc, just skip this
1175
1176
1177 def remove_start(s, start):
1178 if s.startswith(start):
1179 return s[len(start):]
1180 return s
1181
1182
1183 def remove_end(s, end):
1184 if s.endswith(end):
1185 return s[:-len(end)]
1186 return s
1187
1188
1189 def url_basename(url):
1190 path = compat_urlparse.urlparse(url).path
1191 return path.strip('/').split('/')[-1]
1192
1193
1194 class HEADRequest(compat_urllib_request.Request):
1195 def get_method(self):
1196 return "HEAD"
1197
1198
1199 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1200 if get_attr:
1201 if v is not None:
1202 v = getattr(v, get_attr, None)
1203 if v == '':
1204 v = None
1205 return default if v is None else (int(v) * invscale // scale)
1206
1207
1208 def str_or_none(v, default=None):
1209 return default if v is None else compat_str(v)
1210
1211
1212 def str_to_int(int_str):
1213 """ A more relaxed version of int_or_none """
1214 if int_str is None:
1215 return None
1216 int_str = re.sub(r'[,\.\+]', '', int_str)
1217 return int(int_str)
1218
1219
1220 def float_or_none(v, scale=1, invscale=1, default=None):
1221 return default if v is None else (float(v) * invscale / scale)
1222
1223
1224 def parse_duration(s):
1225 if s is None:
1226 return None
1227
1228 s = s.strip()
1229
1230 m = re.match(
1231 r'''(?ix)T?
1232 (?:
1233 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1234 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1235
1236 (?:
1237 (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
1238 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1239 )?
1240 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1241 )$''', s)
1242 if not m:
1243 return None
1244 res = 0
1245 if m.group('only_mins'):
1246 return float_or_none(m.group('only_mins'), invscale=60)
1247 if m.group('only_hours'):
1248 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1249 if m.group('secs'):
1250 res += int(m.group('secs'))
1251 if m.group('mins'):
1252 res += int(m.group('mins')) * 60
1253 if m.group('hours'):
1254 res += int(m.group('hours')) * 60 * 60
1255 if m.group('ms'):
1256 res += float(m.group('ms'))
1257 return res
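# Illustrative examples (comment only):
#   parse_duration('1:30:45')   -> 5445
#   parse_duration('9min 3sec') -> 543
#   parse_duration('3.5 hours') -> 12600.0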
1258
1259
1260 def prepend_extension(filename, ext):
1261 name, real_ext = os.path.splitext(filename)
1262 return '{0}.{1}{2}'.format(name, ext, real_ext)
1263
1264
1265 def check_executable(exe, args=[]):
1266 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1267 args can be a list of arguments for a short output (like -version) """
1268 try:
1269 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1270 except OSError:
1271 return False
1272 return exe
1273
1274
1275 def get_exe_version(exe, args=['--version'],
1276 version_re=None, unrecognized='present'):
1277 """ Returns the version of the specified executable,
1278 or False if the executable is not present """
1279 try:
1280 out, _ = subprocess.Popen(
1281 [exe] + args,
1282 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1283 except OSError:
1284 return False
1285 if isinstance(out, bytes): # Python 2.x
1286 out = out.decode('ascii', 'ignore')
1287 return detect_exe_version(out, version_re, unrecognized)
1288
1289
1290 def detect_exe_version(output, version_re=None, unrecognized='present'):
1291 assert isinstance(output, compat_str)
1292 if version_re is None:
1293 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1294 m = re.search(version_re, output)
1295 if m:
1296 return m.group(1)
1297 else:
1298 return unrecognized
1299
1300
1301 class PagedList(object):
1302 def __len__(self):
1303 # This is only useful for tests
1304 return len(self.getslice())
1305
1306
1307 class OnDemandPagedList(PagedList):
1308 def __init__(self, pagefunc, pagesize):
1309 self._pagefunc = pagefunc
1310 self._pagesize = pagesize
1311
1312 def getslice(self, start=0, end=None):
1313 res = []
1314 for pagenum in itertools.count(start // self._pagesize):
1315 firstid = pagenum * self._pagesize
1316 nextfirstid = pagenum * self._pagesize + self._pagesize
1317 if start >= nextfirstid:
1318 continue
1319
1320 page_results = list(self._pagefunc(pagenum))
1321
1322 startv = (
1323 start % self._pagesize
1324 if firstid <= start < nextfirstid
1325 else 0)
1326
1327 endv = (
1328 ((end - 1) % self._pagesize) + 1
1329 if (end is not None and firstid <= end <= nextfirstid)
1330 else None)
1331
1332 if startv != 0 or endv is not None:
1333 page_results = page_results[startv:endv]
1334 res.extend(page_results)
1335
            # A little optimization - if the current page is not "full", i.e. does
            # not contain page_size videos, then we can assume that this page
            # is the last one - there are no more ids on further pages,
            # so there is no need to query again.
1340 if len(page_results) + startv < self._pagesize:
1341 break
1342
1343 # If we got the whole page, but the next page is not interesting,
1344 # break out early as well
1345 if end == nextfirstid:
1346 break
1347 return res
1348
1349
1350 class InAdvancePagedList(PagedList):
1351 def __init__(self, pagefunc, pagecount, pagesize):
1352 self._pagefunc = pagefunc
1353 self._pagecount = pagecount
1354 self._pagesize = pagesize
1355
1356 def getslice(self, start=0, end=None):
1357 res = []
1358 start_page = start // self._pagesize
1359 end_page = (
1360 self._pagecount if end is None else (end // self._pagesize + 1))
1361 skip_elems = start - start_page * self._pagesize
1362 only_more = None if end is None else end - start
1363 for pagenum in range(start_page, end_page):
1364 page = list(self._pagefunc(pagenum))
1365 if skip_elems:
1366 page = page[skip_elems:]
1367 skip_elems = None
1368 if only_more is not None:
1369 if len(page) < only_more:
1370 only_more -= len(page)
1371 else:
1372 page = page[:only_more]
1373 res.extend(page)
1374 break
1375 res.extend(page)
1376 return res
1377
1378
1379 def uppercase_escape(s):
1380 unicode_escape = codecs.getdecoder('unicode_escape')
1381 return re.sub(
1382 r'\\U[0-9a-fA-F]{8}',
1383 lambda m: unicode_escape(m.group(0))[0],
1384 s)
1385
1386
1387 def escape_rfc3986(s):
1388 """Escape non-ASCII characters as suggested by RFC 3986"""
1389 if sys.version_info < (3, 0) and isinstance(s, unicode):
1390 s = s.encode('utf-8')
1391 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1392
1393
1394 def escape_url(url):
1395 """Escape URL as suggested by RFC 3986"""
1396 url_parsed = compat_urllib_parse_urlparse(url)
1397 return url_parsed._replace(
1398 path=escape_rfc3986(url_parsed.path),
1399 params=escape_rfc3986(url_parsed.params),
1400 query=escape_rfc3986(url_parsed.query),
1401 fragment=escape_rfc3986(url_parsed.fragment)
1402 ).geturl()
1403
1404 try:
1405 struct.pack('!I', 0)
1406 except TypeError:
1407 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1408 def struct_pack(spec, *args):
1409 if isinstance(spec, compat_str):
1410 spec = spec.encode('ascii')
1411 return struct.pack(spec, *args)
1412
1413 def struct_unpack(spec, *args):
1414 if isinstance(spec, compat_str):
1415 spec = spec.encode('ascii')
1416 return struct.unpack(spec, *args)
1417 else:
1418 struct_pack = struct.pack
1419 struct_unpack = struct.unpack
1420
1421
1422 def read_batch_urls(batch_fd):
1423 def fixup(url):
1424 if not isinstance(url, compat_str):
1425 url = url.decode('utf-8', 'replace')
1426 BOM_UTF8 = '\xef\xbb\xbf'
1427 if url.startswith(BOM_UTF8):
1428 url = url[len(BOM_UTF8):]
1429 url = url.strip()
1430 if url.startswith(('#', ';', ']')):
1431 return False
1432 return url
1433
1434 with contextlib.closing(batch_fd) as fd:
1435 return [url for url in map(fixup, fd) if url]
1436
1437
1438 def urlencode_postdata(*args, **kargs):
1439 return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1440
1441
1442 try:
1443 etree_iter = xml.etree.ElementTree.Element.iter
1444 except AttributeError: # Python <=2.6
1445 etree_iter = lambda n: n.findall('.//*')
1446
1447
1448 def parse_xml(s):
1449 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1450 def doctype(self, name, pubid, system):
1451 pass # Ignore doctypes
1452
1453 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1454 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1455 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1456 # Fix up XML parser in Python 2.x
1457 if sys.version_info < (3, 0):
1458 for n in etree_iter(tree):
1459 if n.text is not None:
1460 if not isinstance(n.text, compat_str):
1461 n.text = n.text.decode('utf-8')
1462 return tree
1463
1464
1465 US_RATINGS = {
1466 'G': 0,
1467 'PG': 10,
1468 'PG-13': 13,
1469 'R': 16,
1470 'NC': 18,
1471 }
1472
1473
1474 def parse_age_limit(s):
1475 if s is None:
1476 return None
1477 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1478 return int(m.group('age')) if m else US_RATINGS.get(s, None)
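# Illustrative examples (comment only):
#   parse_age_limit('18+')   -> 18
#   parse_age_limit('PG-13') -> 13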
1479
1480
1481 def strip_jsonp(code):
1482 return re.sub(
1483 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
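# Illustrative example (comment only):
#   strip_jsonp('callback({"status": "ok"});') -> '{"status": "ok"}'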
1484
1485
1486 def js_to_json(code):
1487 def fix_kv(m):
1488 v = m.group(0)
1489 if v in ('true', 'false', 'null'):
1490 return v
1491 if v.startswith('"'):
1492 return v
1493 if v.startswith("'"):
1494 v = v[1:-1]
1495 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1496 '\\\\': '\\\\',
1497 "\\'": "'",
1498 '"': '\\"',
1499 }[m.group(0)], v)
1500 return '"%s"' % v
1501
1502 res = re.sub(r'''(?x)
1503 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1504 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1505 [a-zA-Z_][a-zA-Z_0-9]*
1506 ''', fix_kv, code)
1507 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1508 return res
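# Illustrative example (comment only):
#   js_to_json("{'abc': true, 'def': [1, 2,]}") -> '{"abc": true, "def": [1, 2]}'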
1509
1510
1511 def qualities(quality_ids):
1512 """ Get a numeric quality value out of a list of possible values """
1513 def q(qid):
1514 try:
1515 return quality_ids.index(qid)
1516 except ValueError:
1517 return -1
1518 return q
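# Illustrative usage (comment only, with made-up format identifiers):
#   q = qualities(['flv', 'sd', 'hd'])
#   q('hd') -> 2, q('sd') -> 1, q('unknown') -> -1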
1519
1520
1521 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1522
1523
1524 def limit_length(s, length):
1525 """ Add ellipses to overly long strings """
1526 if s is None:
1527 return None
1528 ELLIPSES = '...'
1529 if len(s) > length:
1530 return s[:length - len(ELLIPSES)] + ELLIPSES
1531 return s
1532
1533
1534 def version_tuple(v):
1535 return tuple(int(e) for e in re.split(r'[-.]', v))
1536
1537
1538 def is_outdated_version(version, limit, assume_new=True):
1539 if not version:
1540 return not assume_new
1541 try:
1542 return version_tuple(version) < version_tuple(limit)
1543 except ValueError:
1544 return not assume_new
1545
1546
1547 def ytdl_is_updateable():
1548 """ Returns if youtube-dl can be updated with -U """
1549 from zipimport import zipimporter
1550
1551 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
1552
1553
1554 def args_to_str(args):
1555 # Get a short string representation for a subprocess command
1556 return ' '.join(shlex_quote(a) for a in args)
1557
1558
1559 def urlhandle_detect_ext(url_handle):
1560 try:
1561 url_handle.headers
1562 getheader = lambda h: url_handle.headers[h]
1563 except AttributeError: # Python < 3
1564 getheader = url_handle.info().getheader
1565
1566 return getheader('Content-Type').split("/")[1]
1567
1568
1569 def age_restricted(content_limit, age_limit):
1570 """ Returns True iff the content should be blocked """
1571
1572 if age_limit is None: # No limit set
1573 return False
1574 if content_limit is None:
1575 return False # Content available for everyone
1576 return age_limit < content_limit