# youtube_dl/utils.py (from the yt-dlp git mirror; commit: "toypics.net support")
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import contextlib
5 import ctypes
6 import datetime
7 import email.utils
8 import errno
9 import getpass
10 import gzip
11 import itertools
12 import io
13 import json
14 import locale
15 import math
16 import os
17 import pipes
18 import platform
19 import re
20 import ssl
21 import socket
22 import struct
23 import subprocess
24 import sys
25 import traceback
26 import xml.etree.ElementTree
27 import zlib
28
29 try:
30 import urllib.request as compat_urllib_request
31 except ImportError: # Python 2
32 import urllib2 as compat_urllib_request
33
34 try:
35 import urllib.error as compat_urllib_error
36 except ImportError: # Python 2
37 import urllib2 as compat_urllib_error
38
39 try:
40 import urllib.parse as compat_urllib_parse
41 except ImportError: # Python 2
42 import urllib as compat_urllib_parse
43
44 try:
45 from urllib.parse import urlparse as compat_urllib_parse_urlparse
46 except ImportError: # Python 2
47 from urlparse import urlparse as compat_urllib_parse_urlparse
48
49 try:
50 import urllib.parse as compat_urlparse
51 except ImportError: # Python 2
52 import urlparse as compat_urlparse
53
54 try:
55 import http.cookiejar as compat_cookiejar
56 except ImportError: # Python 2
57 import cookielib as compat_cookiejar
58
59 try:
60 import html.entities as compat_html_entities
61 except ImportError: # Python 2
62 import htmlentitydefs as compat_html_entities
63
64 try:
65 import html.parser as compat_html_parser
66 except ImportError: # Python 2
67 import HTMLParser as compat_html_parser
68
69 try:
70 import http.client as compat_http_client
71 except ImportError: # Python 2
72 import httplib as compat_http_client
73
74 try:
75 from urllib.error import HTTPError as compat_HTTPError
76 except ImportError: # Python 2
77 from urllib2 import HTTPError as compat_HTTPError
78
79 try:
80 from urllib.request import urlretrieve as compat_urlretrieve
81 except ImportError: # Python 2
82 from urllib import urlretrieve as compat_urlretrieve
83
84
85 try:
86 from subprocess import DEVNULL
87 compat_subprocess_get_DEVNULL = lambda: DEVNULL
88 except ImportError:
89 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
90
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        # Percent-decode `string`; runs of %XX bytes are collected and
        # decoded together with `encoding` so multi-byte UTF-8 works.
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Python 2 str.decode('hex'): '2f' -> '/'
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        # Split a query string on '&'/';' into a list of (name, value)
        # unicode pairs, percent-decoding both sides.
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        # Same interface as urllib.parse.parse_qs: name -> list of values.
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
            encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
169
# Unicode text type: `unicode` on Python 2, `str` on Python 3.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point -> character: `unichr` on Python 2, `chr` on Python 3.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr

# ElementTree raises ParseError on 2.7+/3.x; 2.6 surfaces expat's ExpatError.
try:
    from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError: # Python 2.6
    from xml.parsers.expat import ExpatError as compat_xml_parse_error
184
def compat_ord(c):
    """Return the integer value of a byte/character.

    Indexing a Python 3 bytes object already yields an int; Python 2 str
    (and 1-char text strings) need ord(). Uses isinstance instead of the
    `type(c) is int` anti-idiom.
    """
    if isinstance(c, int):
        return c
    return ord(c)
188
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers attached to every request (see YoutubeDLHandler).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
199
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    Falls back to UTF-8 when the locale encoding is missing or unusable.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding actually works before trusting it.
        u'TEST'.encode(pref)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed here.
        pref = 'UTF-8'

    return pref
213
# print() of a unicode string: Python 2 must encode for the terminal first
# (unencodable characters become XML character references).
if sys.version_info < (3,0):
    def compat_print(s):
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        # Only unicode strings are accepted.
        assert type(s) == type(u'')
        print(s)

# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3,0):
    def write_json_file(obj, fn):
        # Serialize obj as JSON into the file named fn.
        with open(fn, 'wb') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        # Serialize obj as JSON into the file named fn (UTF-8 text).
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
232
# ElementTree supports [@attr='value'] XPath predicates only from 2.7 on;
# emulate them with a linear scan on older interpreters.
if sys.version_info >= (2,7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Restrict key/val to harmless characters so the string
        # interpolation below cannot break the XPath expression.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val):
        # Fallback: scan every match of xpath and compare the attribute.
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None
246
247 # On python2.6 the xml.etree.ElementTree.Element methods don't support
248 # the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into '{uri}tag' form.

    ns_map maps namespace prefixes to their URIs; steps without a prefix
    pass through unchanged.
    """
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
259
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (&#47;) or hexadecimal (&#x2F;).
    # The hex alternative must accept the digits a-f/A-F; the previous
    # pattern `x?\d+` could never match entities such as &#x2F;.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
284
285 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """Common base for the HTML scrapers below: parses a whole document at
    once and keeps the raw HTML around for position-based slicing."""

    def __init__(self):
        # Fixes the previous `__init` misspelling: that name was never
        # invoked as the constructor, so self.html was only ever set
        # inside loads().
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        # Remember the raw document (subclass error recovery needs it),
        # then feed it through the parser in one go.
        self.html = html
        self.feed(html)
        self.close()
295
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        # Target attribute/value; self.result grows into
        # [tag, start_pos, end_pos] as parsing proceeds.
        self.attribute = attribute
        self.value = value
        self.result = None
        self.started = False
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        # Best-effort recovery from malformed HTML: drop the offending line
        # and resume, up to 10 times - unless we are already inside the
        # target tag, in which case the result would be corrupted anyway.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # First event after the opening tag records the start position.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            # Track nesting per tag name so the matching close is found.
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            if self.depth[self.result[0]] == 0:
                # The element that opened the result just closed.
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any event following the opening tag fixes the start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        # Returns the text between start and end positions, or None if the
        # target tag was never found / never closed.
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # NOTE(review): for single-line results the end column is
            # offset-adjusted here and then truncated again below - looks
            # suspicious but kept as-is; verify against callers.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# Python < 2.7.3 chokes on the "</scr'+'ipt>" token inside inline scripts;
# treat exactly that token as an end tag of the right length.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
361
def get_element_by_id(id, html):
    """Shortcut for get_element_by_attribute("id", ...): return the content
    of the tag carrying the given ID, or None."""
    return get_element_by_attribute("id", id, html)
365
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag in *html* whose *attribute*
    equals *value*, or None when no such tag is found."""
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Malformed documents are tolerated; return whatever was captured.
        pass
    return finder.get_result()
374
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        # Only <meta name=... content=...> tags are interesting.
        if tag != 'meta':
            return
        attributes = dict(attrs)
        if attributes.get('name') == self.name:
            self.result = attributes.get('content')

    def get_result(self):
        # Content of the last matching meta tag, or None.
        return self.result
395
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        pass  # tolerate malformed markup; return whatever was captured
    return meta_parser.get_result()
406
407
def clean_html(html):
    """Clean an HTML snippet into a readable string.

    Converts <br> and paragraph boundaries to newlines, strips all other
    tags, and resolves HTML entities. None passes through unchanged
    (convenient for optional description fields).
    """
    if html is None:
        return html
    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
419
420
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission problems cannot be fixed by renaming.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # NOTE: os.path.join takes *separate* components; the previous code
        # passed a single generator object, which never built a usable path.
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller.
            # The previous code mistakenly reopened the original (failing)
            # filename instead of the sanitized alternative.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
454
455
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
463
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Control characters and '?' are dropped outright.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    if not is_id:
        # Collapse runs of underscores and trim them from the ends.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
495
def orderedSet(iterable):
    """Return a list of the unique elements of *iterable*, keeping
    first-seen order. (List membership is used on purpose so unhashable
    elements work too.)"""
    unique = []
    for item in iterable:
        if item not in unique:
            unique.append(item)
    return unique
503
def unescapeHTML(s):
    """
    @param s a string
    Replace all HTML entities in s; s must be a unicode string.
    """
    assert type(s) == type(u'')
    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
512
513
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess True when the result is passed to a subprocess
           on Windows, in which case the locale encoding is used.
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
            return s
        else:
            # For subprocess calls, encode with locale encoding
            # Refer to http://stackoverflow.com/a/9951851/35070
            encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    # 'ignore' silently drops characters the encoding cannot represent.
    return s.encode(encoding, 'ignore')
540
541
def decodeOption(optval):
    """Decode a command-line option value to unicode; None passes through."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())
    assert isinstance(optval, compat_str)
    return optval
550
def formatSeconds(secs):
    """Render a duration: H:MM:SS above an hour, M:SS above a minute,
    otherwise a bare number of seconds."""
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    if secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    return '%d' % secs
558
559
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    # Build the HTTPS handler used by the opener; opts_no_check_certificate
    # disables certificate verification (Python >= 3.2 branch only).
    # NOTE(review): both branches prefer SSLv3 (PROTOCOL_SSLv3), which is
    # insecure and absent from modern OpenSSL builds - confirm before reuse.
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                # Set up the CONNECT tunnel first when using a proxy.
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                try:
                    # Try SSLv3 first, fall back to auto-negotiation.
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
593
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network-level failures are always "expected" - not youtube-dl bugs.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if not expected:
            # Unexpected errors get the bug-report boilerplate appended.
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        # Render the stored traceback as a string, or None when absent.
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
615
616
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match (raised by _search_regex helpers)."""
    pass
620
621
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept for verbose traceback reporting by the caller.
        self.exc_info = exc_info
633
634
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
642
643
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # Forward to Exception so str(exc) and tracebacks show the message
        # (previously only self.msg was set and str(exc) was empty).
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
652
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
656
657
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
665
666
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # downloaded: bytes actually received; expected: bytes announced.
        self.downloaded = downloaded
        self.expected = expected
681
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Raw deflate streams (no zlib header) need the -MAX_WBITS variant;
        # fall back to the standard zlib wrapper format.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Python 2.6's addinfourl cannot take a status code argument.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force our standard headers over whatever the caller supplied.
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Internal pseudo-headers are stripped before the real request.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with progressively more trailing bytes trimmed.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # The same processing applies to HTTPS traffic.
    https_request = http_request
    https_response = http_response
762
763
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None

    # Commas get in the way of every format below.
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    upload_date = None
    # Deliberately no break: the last matching expression wins, as before.
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822 style dates.
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
804
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from the path part of *url*; fall back to
    *default_ext* when the candidate is not purely alphanumeric."""
    candidate = url.partition(u'?')[0].rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
811
def subtitles_filename(filename, sub_lang, sub_format):
    """Build "<base>.<lang>.<format>" from a media filename."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
814
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # Months and years are approximated as 30 and 365 days respectively.
    if unit == 'month':
        unit, amount = 'day', amount * 30
    elif unit == 'year':
        unit, amount = 'day', amount * 365
    return today + datetime.timedelta(**{unit + 's': amount})
840
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    anything else is returned unchanged."""
    m = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(m.groups()) if m else date_str
849
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Omitted bounds fall back to the extremes of the date type.
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
875
876
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Python 2 may hand back a byte string; normalize to unicode.
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())
    assert isinstance(name, compat_str)
    return name
885
886
def write_string(s, out=None):
    # Robustly write the unicode string s to out (default: stderr),
    # working around Python 2 byte streams and Windows console codecs.
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if ('b' in getattr(out, 'mode', '') or
        sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
        s = s.encode(preferredencoding(), 'ignore')
    try:
        out.write(s)
    except UnicodeEncodeError:
        # In Windows shells, this can fail even when the codec is just charmap!?
        # See https://wiki.python.org/moin/PrintFails#Issue
        if sys.platform == 'win32' and hasattr(out, 'encoding'):
            # Round-trip through the console encoding, dropping whatever
            # it cannot represent, then retry.
            s = s.encode(out.encoding, 'ignore').decode(out.encoding)
            out.write(s)
        else:
            raise

    out.flush()
907
908
def bytes_to_intlist(bs):
    """Turn a byte string into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):
        # Python 3: indexing bytes already yields ints.
        return list(bs)
    # Python 2: indexing a str yields 1-char strings.
    return [ord(ch) for ch in bs]
916
917
def intlist_to_bytes(xs):
    """Inverse of bytes_to_intlist: list of ints -> byte string."""
    if not xs:
        return b''
    if isinstance(chr(0), bytes):
        # Python 2: chr() already produces (byte) strings.
        return ''.join(chr(x) for x in xs)
    return bytes(xs)
925
926
def get_cachedir(params=None):
    """Return the cache directory: the 'cachedir' option from *params*
    when present, otherwise $XDG_CACHE_HOME/youtube-dl (defaulting to
    ~/.cache/youtube-dl).

    The default for params is None instead of a shared mutable dict
    (classic mutable-default-argument pitfall).
    """
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return (params or {}).get('cachedir', os.path.join(cache_root, 'youtube-dl'))
931
932
933 # Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # Mirrors the Win32 OVERLAPPED structure used by LockFileEx/UnlockFileEx.
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range covering the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED alive on the file object for the later unlock.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        # POSIX advisory lock: exclusive for writers, shared for readers.
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.lockf(f, fcntl.LOCK_UN)
996
997
class locked_file(object):
    """File wrapper that holds an OS-level lock for the duration of a
    `with` block: shared for reads, exclusive for writes/appends."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        want_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, want_exclusive)
        except IOError:
            # Never leak the descriptor when locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1027
1028
def shell_quote(args):
    """Quote a list of arguments so they can be pasted into a shell."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'

    def to_text(a):
        # Filenames may arrive already encoded by encodeFilename.
        return a.decode(fs_encoding) if isinstance(a, bytes) else a

    return u' '.join(pipes.quote(to_text(a)) for a in args)
1040
1041
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
    for item in seq:
        yield item
        if not pred(item):
            break
1049
1050
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    smuggled = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return u'%s#%s' % (url, smuggled)
1057
1058
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url(): return (url, data).

    When the URL carries no smuggled payload, data falls back to `default`.
    """
    # Idiomatic membership test ('not in') instead of 'not ... in ...'.
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # The smuggled data is urlencoded, so it cannot itself contain '#';
    # rpartition safely splits off the payload fragment.
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
1066
1067
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. 1536 -> u'1.50KiB'.

    Accepts None (returns u'N/A'), numeric strings, ints and floats.
    """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):  # isinstance instead of `type(...) is str`
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    # Clamp: values in (0, 1) gave a negative exponent (indexing the suffix
    # list from the end), and values >= 1024**9 raised IndexError.
    exponent = min(max(exponent, 0), len(suffixes) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
1080
1081
def str_to_int(int_str):
    """Parse an integer from a string that may use ',' or '.' as thousands
    separators, e.g. u'1,000,000' -> 1000000."""
    cleaned = re.sub(r'[,\.]', u'', int_str)
    return int(cleaned)
1085
1086
def get_term_width():
    """Best-effort terminal width in columns; None if it cannot be determined.

    Prefers the COLUMNS environment variable, then falls back to `stty size`.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # stty missing, not a tty, or unparsable output -> width unknown.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
        pass
    return None
1101
1102
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
1113
1114
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Only bare ampersands are escaped; existing named or numeric entity
    # references are skipped via the negative lookahead.
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
1121
1122
def setproctitle(title):
    """Set the process name (as shown by ps/top) via prctl(PR_SET_NAME).

    Silently does nothing on platforms without libc.so.6 or without prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return  # non-glibc platform
    # Size the buffer by the *encoded* byte length: sizing by len(title)
    # raised ValueError for non-ASCII titles (UTF-8 is multi-byte).
    # (Also dropped the no-op statement `title = title`.)
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME (linux/prctl.h)
        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1136
1137
def remove_start(s, start):
    """Return s with the prefix `start` removed, or s unchanged if the
    prefix is absent."""
    if not s.startswith(start):
        return s
    return s[len(start):]
1142
1143
def url_basename(url):
    """Return the last path component of a URL (query and fragment are
    excluded by urlparse)."""
    path = compat_urlparse.urlparse(url).path
    components = path.strip(u'/').split(u'/')
    return components[-1]
1147
1148
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request subclass that issues HTTP HEAD instead of GET."""
    def get_method(self):
        return "HEAD"
1152
1153
def int_or_none(v, scale=1, default=None):
    """Convert v to an int divided by `scale`; return `default` (None by
    default, for backward compatibility) when v is None."""
    if v is None:
        return default
    return int(v) // scale
1156
1157
def parse_duration(s):
    """Parse durations like u'3:11:53', u'9:05', u'3h11m53s' or u'87' into
    a total number of seconds (int); return None for None or unparsable
    input."""
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
    if not m:
        return None
    total = int(m.group('secs'))
    for unit, factor in ((u'mins', 60), (u'hours', 3600)):
        value = m.group(unit)
        if value:
            total += int(value) * factor
    return total
1172
1173
def prepend_extension(filename, ext):
    """Insert `ext` just before the real extension:
    (u'video.mp4', u'temp') -> u'video.temp.mp4'."""
    name, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (name, ext, real_ext)
1177
1178
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # None sentinel instead of a mutable default list (shared-default pitfall).
    if args is None:
        args = []
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found / not executable.
        return False
    return exe
1187
1188
class PagedList(object):
    # Lazily evaluated list of items that are fetched page by page through
    # a user-supplied page function.
    def __init__(self, pagefunc, pagesize):
        # pagefunc: callable(pagenum) -> iterable with that page's items
        # pagesize: number of items per page
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the items in the half-open range [start, end) as a list,
        fetching only the pages that intersect the range."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # [firstid, nextfirstid) is the absolute index range this page covers.
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Index of the first wanted item within this page (0 unless the
            # requested range starts mid-page).
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Index one past the last wanted item within this page, or None
            # to take the page through to its end.
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1234
1235
def uppercase_escape(s):
    """Expand literal '\\UXXXXXXXX' escape sequences in s into the
    corresponding characters."""
    def _expand(m):
        return compat_chr(int(m.group(1), base=16))
    return re.sub(r'\\U([0-9a-fA-F]{8})', _expand, s)
1240
try:
    # Probe whether this interpreter's struct module accepts a unicode
    # format string (fails with TypeError on Python 2.6 and some 2.7s).
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        # Encode a unicode format spec to ASCII bytes before delegating.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        # Encode a unicode format spec to ASCII bytes before delegating.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # struct handles unicode format strings natively; use it directly.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1257
1258
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, one per line.

    Byte lines are decoded as UTF-8, a leading UTF-8 BOM is stripped,
    whitespace is trimmed, and comment lines (starting with '#', ';' or ']')
    as well as empty lines are dropped.  batch_fd is closed when done.
    """
    def fixup(url):
        # Equivalent to the old `not isinstance(url, compat_str)` check on
        # both Python 2 (str is bytes) and Python 3.
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        # A decoded UTF-8 BOM appears as U+FEFF; the raw byte sequence
        # u'\xef\xbb\xbf' only survives when the line was never decoded.
        # The original compared only against the raw bytes, so a properly
        # decoded BOM was never stripped.
        for bom in (u'\ufeff', u'\xef\xbb\xbf'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1273
1274
def urlencode_postdata(*args, **kargs):
    """urlencode the given data and return it as ASCII bytes, ready to be
    used as a POST body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1277
1278
def parse_xml(s):
    """Parse the XML document in the unicode string s into an Element,
    silently ignoring any DOCTYPE declaration."""
    class _DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # swallow doctypes instead of choking on them

    parser = xml.etree.ElementTree.XMLParser(target=_DoctypeIgnoringBuilder())
    if sys.version_info >= (2, 7):
        kwargs = {'parser': parser}
    else:
        # Python 2.6's ElementTree.XML does not accept a parser argument.
        kwargs = {}
    return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1287
1288
if sys.version_info < (3, 0) and sys.platform == 'win32':
    # Python 2 on Windows: encode unicode prompts with the preferred console
    # encoding before handing them to getpass (which presumably expects a
    # byte string there -- the shim exists only for this platform combo).
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    # Everywhere else the stock getpass works as-is.
    compat_getpass = getpass.getpass
1296
1297
# US (MPAA-style) content rating -> minimum viewer age.
# NOTE(review): presumably consumed as age_limit values by extractors --
# verify against callers; also confirm the 'R' -> 16 and 'NC' -> 18 cutoffs
# are the intended policy.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}