]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
Add format to unified_strdate
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import contextlib
5 import ctypes
6 import datetime
7 import email.utils
8 import errno
9 import gzip
10 import itertools
11 import io
12 import json
13 import locale
14 import math
15 import os
16 import pipes
17 import platform
18 import re
19 import ssl
20 import socket
21 import struct
22 import subprocess
23 import sys
24 import traceback
25 import xml.etree.ElementTree
26 import zlib
27
28 try:
29 import urllib.request as compat_urllib_request
30 except ImportError: # Python 2
31 import urllib2 as compat_urllib_request
32
33 try:
34 import urllib.error as compat_urllib_error
35 except ImportError: # Python 2
36 import urllib2 as compat_urllib_error
37
38 try:
39 import urllib.parse as compat_urllib_parse
40 except ImportError: # Python 2
41 import urllib as compat_urllib_parse
42
43 try:
44 from urllib.parse import urlparse as compat_urllib_parse_urlparse
45 except ImportError: # Python 2
46 from urlparse import urlparse as compat_urllib_parse_urlparse
47
48 try:
49 import urllib.parse as compat_urlparse
50 except ImportError: # Python 2
51 import urlparse as compat_urlparse
52
53 try:
54 import http.cookiejar as compat_cookiejar
55 except ImportError: # Python 2
56 import cookielib as compat_cookiejar
57
58 try:
59 import html.entities as compat_html_entities
60 except ImportError: # Python 2
61 import htmlentitydefs as compat_html_entities
62
63 try:
64 import html.parser as compat_html_parser
65 except ImportError: # Python 2
66 import HTMLParser as compat_html_parser
67
68 try:
69 import http.client as compat_http_client
70 except ImportError: # Python 2
71 import httplib as compat_http_client
72
73 try:
74 from urllib.error import HTTPError as compat_HTTPError
75 except ImportError: # Python 2
76 from urllib2 import HTTPError as compat_HTTPError
77
78 try:
79 from urllib.request import urlretrieve as compat_urlretrieve
80 except ImportError: # Python 2
81 from urllib import urlretrieve as compat_urlretrieve
82
83
84 try:
85 from subprocess import DEVNULL
86 compat_subprocess_get_DEVNULL = lambda: DEVNULL
87 except ImportError:
88 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
89
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Percent-decode *string* (backport of Python 3's urllib.parse.unquote).

        Contiguous %XX escapes are collected into one byte sequence before
        decoding, so multi-byte UTF-8 escapes decode correctly.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No percent escapes at all; nothing to do.
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Python 2 str.decode('hex'): two hex digits -> one raw byte.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Split a query string into a list of (name, value) pairs."""
        # _coerce_result: hand unicode strings back to Python 2 callers.
        qs, _coerce_result = qs, unicode
        # Both '&' and ';' are accepted as pair separators.
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                # Repeated keys accumulate all of their values.
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
168
try:
    compat_str = unicode  # Python 2
except NameError:
    # Python 3: the unified str type is already unicode.
    compat_str = str

try:
    compat_chr = unichr  # Python 2
except NameError:
    # Python 3: chr() covers the full Unicode range.
    compat_chr = chr

try:
    from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError:  # Python 2.6
    from xml.parsers.expat import ExpatError as compat_xml_parse_error
183
def compat_ord(c):
    """Return the integer value of *c*.

    Iterating a Python 3 bytes object yields ints (returned unchanged),
    while Python 2 byte strings yield 1-character strings (converted with
    ord()). isinstance() is the idiomatic type check.
    """
    if isinstance(c, int):
        return c
    return ord(c)
187
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default headers applied to every outgoing HTTP request (any header the
# caller already set with the same name is replaced; see YoutubeDLHandler).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
198
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported codec actually works; some platforms report
        # bogus or unknown codec names.
        u'TEST'.encode(pref)
    except Exception:
        # Narrowed from a bare `except:` (which also caught SystemExit and
        # KeyboardInterrupt); fall back to UTF-8 on any broken locale.
        pref = 'UTF-8'

    return pref
212
if sys.version_info < (3, 0):
    def compat_print(s):
        # Python 2: stdout expects bytes; characters the locale encoding
        # cannot represent become &#NNN; numeric references.
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        # Python 3 prints unicode natively; just reject byte strings.
        assert type(s) == type(u'')
        print(s)
220
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3, 0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON to the file named fn (binary stream)."""
        with open(fn, 'wb') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON to the file named fn (UTF-8 text stream)."""
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
231
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key/val are interpolated into the XPath expression, so restrict
        # them to characters that cannot change the expression's meaning.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    # Python 2.6's ElementTree does not support attribute predicates in
    # find(); emulate them with a manual scan over findall().
    def find_xpath_attr(node, xpath, key, val):
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None
245
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into '{uri}tag' form using ns_map."""
    expanded = []
    for step in path.split('/'):
        pieces = step.split(':')
        if len(pieces) == 1:
            # No namespace prefix on this step; keep it verbatim.
            expanded.append(pieces[0])
        else:
            prefix, tag = pieces
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
258
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (&#47;) or hexadecimal (&#x2F;).
    # The previous pattern x?\d+ could not match hex digits a-f, so
    # references like &#x2F; were returned literally instead of decoded.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr  # 'x2F' -> '0x2F' so int() accepts it
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
283
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser base that remembers the full document for later slicing."""
    def __init__(self):
        # BUGFIX: this was `def __init(self)` (missing trailing underscores),
        # so the constructor never ran and self.html was never initialized.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None  # full document text, set by loads()

    def loads(self, html):
        """Feed the complete document and finish parsing."""
        self.html = html
        self.feed(html)
        self.close()
294
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute        # attribute name to look for
        self.value = value                # required attribute value
        self.result = None                # becomes [tag, startpos, endpos]
        self.started = False              # True while inside the target tag
        self.depth = {}                   # per-tag-name open-tag counters
        self.watch_startpos = False       # next parse event records content start
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        # Recover from malformed HTML by skipping the offending line and
        # resuming; give up after 10 errors or once the target tag started.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:])  # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # Any start tag after the opening one fixes the content start.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The target tag is closed once its own nesting level hits zero.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every event kind immediately following the opening tag can mark the
    # start of the tag's content.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        # Requires tag name plus both positions to have been recorded.
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # getpos() positions are (1-based line, 0-based column).
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
360
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Convenience wrapper over the generic attribute-based lookup.
    return get_element_by_attribute("id", id, html)
364
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    extractor = AttrParser(attribute, value)
    try:
        extractor.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort on malformed input: whatever was captured before the
        # parser gave up is still returned below.
        pass
    return extractor.get_result()
373
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name        # target value of the meta tag's name attr
        self.content = None
        self.result = None      # captured content attribute, if found

    def handle_starttag(self, tag, attrs):
        # Only <meta> elements are of interest; skip everything else early.
        if tag != 'meta':
            return
        attr_map = dict(attrs)
        if attr_map.get('name') == self.name:
            self.result = attr_map.get('content')

    def get_result(self):
        """Return the captured content attribute, or None."""
        return self.result
394
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        pass  # a partial parse may already hold the answer
    return meta_parser.get_result()
405
406
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Strip html tags
    text = re.sub('<.*?>', '', text)
    # Replace html entities
    return unescapeHTML(text).strip()
418
419
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors won't be fixed by renaming; re-raise immediately.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        # BUGFIX: os.path.join was called with a single generator argument
        # (a TypeError at runtime); the components must be unpacked.
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            # BUGFIX: previously re-opened the original (failing) filename
            # instead of the sanitized alternative.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
453
454
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparseable input yields None rather than raising.
        return None
    return email.utils.mktime_tz(parsed)
462
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Control characters, DEL and '?' are dropped entirely.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            # Restricted mode flattens all non-ASCII characters.
            return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    if not is_id:
        # Collapse the underscore runs introduced above and trim the ends.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
494
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Keeps the first occurrence of each element, preserving order.
    # List membership (not a set) so unhashable elements keep working.
    unique = []
    for item in iterable:
        if item not in unique:
            unique.append(item)
    return unique
502
def unescapeHTML(s):
    """
    @param s a string
    """
    assert type(s) == type(u'')

    # Every &name; / &#N; entity is decoded by htmlentity_transform.
    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
511
512
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
            return s
        else:
            # For subprocess calls, encode with locale encoding
            # Refer to http://stackoverflow.com/a/9951851/35070
            encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    # Characters the target encoding cannot represent are silently dropped.
    return s.encode(encoding, 'ignore')
539
540
def decodeOption(optval):
    """Decode a command-line option value to a unicode string (None passes through)."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        # Byte strings arrive from Python 2 argv; decode with the locale.
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
549
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS or S text.

    Boundaries use >= so that exactly one hour renders as '1:00:00' and
    exactly one minute as '1:00' (previously `>` let 3600 fall through to
    the minute form, producing '60:00').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
557
558
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build an HTTPS handler that connects with SSLv3 where possible.

    NOTE(review): PROTOCOL_SSLv3 is insecure and missing from modern ssl
    builds -- confirm against the Python/OpenSSL versions being supported.
    """
    if sys.version_info < (3, 2):
        # Python < 3.2 has no ssl.SSLContext; use a custom connection class
        # that retries the handshake with SSLv23 if SSLv3 fails.
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    # Proxy tunnelling (CONNECT) happens before the TLS handshake.
                    self.sock = sock
                    self._tunnel()
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    # Server refused SSLv3; let OpenSSL negotiate a version.
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
        # Certificate checking is optional (--no-check-certificate).
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
592
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network-level failures are always treated as expected (i.e. not
        # youtube-dl bugs), regardless of what the caller passed.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Render the stored traceback as a string, or None if absent."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
614
615
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    # Distinct subclass so callers can catch missing-pattern failures
    # separately from other ExtractorErrors.
    pass
619
620
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
632
633
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    # Marker exception only; carries no extra state.
    pass
641
642
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # Initialize Exception too, so str(exc) shows the message (the
        # previous implementation left it empty); keep the .msg attribute
        # that existing callers read.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
651
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    # Used as a control-flow signal to stop further downloads.
    pass
655
656
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    # Marker exception only; carries no extra state.
    pass
664
665
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Record actual vs announced size for the caller's error message.
        self.downloaded, self.expected = downloaded, expected
680
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Try raw deflate first; some servers send zlib-wrapped data instead.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older urllib versions do not accept `code` in the constructor.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Re-apply the standard headers so they override urllib's defaults
        # (delete-then-add avoids case-variant duplicates).
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Internal marker header: strip it and disable compression.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Internal marker header: per-request User-Agent override.
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with progressively shorter payloads until it parses.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS requests/responses get identical treatment.
    https_request = http_request
    https_response = http_response
761
762
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD"""
    # Commas are noise for all supported formats.
    cleaned = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    cleaned = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', cleaned)
    known_formats = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    upload_date = None
    for fmt in known_formats:
        try:
            upload_date = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822 style dates.
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
799
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from a URL, falling back to default_ext."""
    # Chop off the query string, then take whatever follows the last dot.
    candidate = url.partition(u'?')[0].rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    return default_ext
806
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<language>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
809
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # Months and years are approximated as fixed day counts.
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        # timedelta wants the plural keyword (days=, weeks=).
        return today + datetime.timedelta(**{unit + 's': amount})
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
835
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        # Anything that is not exactly eight digits passes through untouched.
        return date_str
    return '-'.join(match.groups())
844
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the widest representable interval.
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
870
871
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Python 2 may hand back a byte string; normalize to unicode.
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
880
881
def write_string(s, out=None):
    """Write the unicode string s to out (default: sys.stderr), working
    around platform- and version-specific encoding quirks."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if ('b' in getattr(out, 'mode', '') or
        sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
        # Binary streams need bytes; unencodable characters are dropped.
        s = s.encode(preferredencoding(), 'ignore')
    try:
        out.write(s)
    except UnicodeEncodeError:
        # In Windows shells, this can fail even when the codec is just charmap!?
        # See https://wiki.python.org/moin/PrintFails#Issue
        if sys.platform == 'win32' and hasattr(out, 'encoding'):
            # Round-trip through the stream's own codec, dropping anything
            # it cannot represent, then retry the write.
            s = s.encode(out.encoding, 'ignore').decode(out.encoding)
            out.write(s)
        else:
            raise

    out.flush()
902
903
def bytes_to_intlist(bs):
    """Turn a byte string into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):
        # Python 3: indexing bytes already yields ints.
        return list(bs)
    # Python 2: indexing yields 1-character strings.
    return [ord(c) for c in bs]
911
912
def intlist_to_bytes(xs):
    """Turn a list of integer byte values back into a byte string."""
    if not xs:
        return b''
    if isinstance(chr(0), bytes):
        # Python 2: chr() already produces byte strings.
        return ''.join([chr(x) for x in xs])
    return bytes(xs)
920
921
def get_cachedir(params=None):
    """Return the cache directory youtube-dl should use.

    Honors params['cachedir'] when present; otherwise falls back to
    $XDG_CACHE_HOME/youtube-dl (or ~/.cache/youtube-dl).

    The old signature used a mutable default argument (params={});
    None is the safe idiom and is fully backward-compatible.
    """
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
926
927
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Win32 OVERLAPPED struct passed to LockFileEx/UnlockFileEx.
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte-range length: covers the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep a reference on the file object so the struct outlives the call.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 requests an exclusive lock; 0x0 a shared one.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        # POSIX: advisory lock over the whole file.
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.lockf(f, fcntl.LOCK_UN)
991
992
class locked_file(object):
    """File wrapper that holds an OS-level lock for the duration of a `with` block."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Writers ('a'/'w') need an exclusive lock; readers can share.
        want_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, want_exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1022
1023
def shell_quote(args):
    """Return a single shell-escaped command line for the given argv list."""
    quoted_args = []
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # NOTE(review): pipes.quote is undocumented and deprecated;
        # shlex.quote is the Python 3 equivalent — kept as-is here for
        # Python 2 compatibility. Confirm before migrating.
        quoted_args.append(pipes.quote(a))
    return u' '.join(quoted_args)
1035
1036
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
    for element in seq:
        yield element
        if not pred(element):
            break
1044
1045
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    smuggled = json.dumps(data)
    sdata = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': smuggled})
    return u'#'.join((url, sdata))
1052
1053
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url: return (url, data).

    When the URL carries no smuggled payload, return (smug_url, default)
    unchanged.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
1061
1062
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. 1024 -> u'1.00KiB'.

    Accepts None (rendered as u'N/A'), ints, floats, and numeric strings.
    """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # Clamp: values in (0, 1) would otherwise yield a negative index
        # (picking u'YiB' from the end), and values >= 1024**9 would
        # overflow the suffix table with an IndexError.
        exponent = min(max(int(math.log(bytes, 1024.0)), 0), len(suffixes) - 1)
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
1075
1076
def str_to_int(int_str):
    """Parse an integer from a string, ignoring ',' and '.' group separators."""
    return int(re.sub(r'[,\.]', u'', int_str))
1080
1081
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be determined.

    Honours the COLUMNS environment variable first, then falls back to
    asking ``stty size``.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # stty is missing (e.g. on Windows), we are not attached to a
        # terminal, or its output was unparsable.  Note: the previous
        # bare `except:` also swallowed KeyboardInterrupt/SystemExit.
        pass
    return None
1096
1097
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, english_name in enumerate(ENGLISH_NAMES, start=1):
        if english_name == name:
            return number
    return None
1108
1109
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Only bare ampersands are escaped; existing entity and character
    # references are left alone.
    lone_amp = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return lone_amp.sub(u'&amp;', xml_str)
1116
1117
def setproctitle(title):
    """Set the process name shown by tools like ps/top (Linux only).

    Uses prctl(PR_SET_NAME) via libc; silently does nothing when libc or
    prctl is unavailable (non-Linux platforms).
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Size the buffer by the *encoded* byte length: sizing it by the
    # character count (as before) raised ValueError for non-ASCII titles,
    # since their UTF-8 encoding is longer than len(title).
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1131
1132
def remove_start(s, start):
    """Strip the prefix *start* from *s*, if present."""
    return s[len(start):] if s.startswith(start) else s
1137
1138
def url_basename(url):
    """Return the last component of the URL's path, e.g.
    'http://x/a/b.mp4?q=1' -> u'b.mp4'."""
    path = compat_urlparse.urlparse(url).path
    components = path.strip(u'/').split(u'/')
    return components[-1]
1142
1143
class HEADRequest(compat_urllib_request.Request):
    """A Request subclass that issues a HEAD request instead of GET."""
    def get_method(self):
        return "HEAD"
1147
1148
def int_or_none(v, scale=1):
    """Convert *v* to an int divided by *scale*; pass None through unchanged."""
    if v is None:
        return None
    return int(v) // scale
1151
1152
def parse_duration(s):
    """Parse a duration string like '1:02:03', '3m10s' or '45' into seconds.

    Returns None when *s* is None or does not look like a duration.
    """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
    if m is None:
        return None

    hours, mins, secs = m.group('hours'), m.group('mins'), m.group('secs')
    duration = int(secs)
    if mins is not None:
        duration += 60 * int(mins)
    if hours is not None:
        duration += 3600 * int(hours)
    return duration
1167
1168
def prepend_extension(filename, ext):
    """Insert *ext* before the real extension:
    ('video.mp4', 'temp') -> u'video.temp.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(base, ext, real_ext)
1172
1173
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # args=None sentinel instead of a mutable default list (the shared
    # default-list anti-pattern); behavior is unchanged for all callers.
    if args is None:
        args = []
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
1182
1183
class PagedList(object):
    """Lazy list over a paginated data source.

    pagefunc(pagenum) must return an iterable with the contents of page
    ``pagenum`` (0-based); every page except possibly the last is assumed
    to hold exactly ``pagesize`` entries.
    """

    def __init__(self, pagefunc, pagesize):
        # pagefunc: callable pagenum -> iterable of that page's results
        # pagesize: number of entries per full page
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the results with indices in [start, end) as a list."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # Index range [firstid, nextfirstid) covered by this page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of `start` within this page (0 unless start falls here)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # One-past-the-end offset within this page, or None for "all"
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1229
1230
1231 def uppercase_escape(s):
1232 return re.sub(
1233 r'\\U([0-9a-fA-F]{8})',
1234 lambda m: compat_chr(int(m.group(1), base=16)), s)
1235
# Feature probe: on Python 2.6 (and some 2.7 builds) struct.pack rejects
# a unicode format spec, so wrap it; elsewhere use struct directly.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        # Encode the text format spec to bytes before delegating
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        # Same workaround as struct_pack, for unpacking
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1252
1253
def read_batch_urls(batch_fd):
    """Read a batch file descriptor and return the list of URLs in it.

    Lines are stripped, a leading BOM is removed, and empty lines as well
    as comment lines (starting with '#', ';' or ']') are skipped.
    The descriptor is closed afterwards.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM decodes to U+FEFF, so the old comparison against the
        # raw bytes u'\xef\xbb\xbf' never matched properly decoded input;
        # strip both the decoded codepoint and the raw-bytes (mojibake) form.
        for bom in (u'\ufeff', u'\xef\xbb\xbf'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1268
1269
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return it as ASCII bytes, ready to
    be used as a POST body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1272
1273
def parse_xml(s):
    """Parse the XML document in the text string *s*, ignoring any DOCTYPE
    declaration, and return the root element."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    if sys.version_info >= (2, 7):
        kwargs = {'parser': parser}
    else:
        kwargs = {}
    return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)