]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
Added '--xattrs' option which writes metadata to the file's extended attributes using...
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import ctypes
5 import datetime
6 import email.utils
7 import errno
8 import gzip
9 import io
10 import json
11 import locale
12 import math
13 import os
14 import pipes
15 import platform
16 import re
17 import ssl
18 import socket
19 import subprocess
20 import sys
21 import traceback
22 import zlib
23
24 try:
25 import urllib.request as compat_urllib_request
26 except ImportError: # Python 2
27 import urllib2 as compat_urllib_request
28
29 try:
30 import urllib.error as compat_urllib_error
31 except ImportError: # Python 2
32 import urllib2 as compat_urllib_error
33
34 try:
35 import urllib.parse as compat_urllib_parse
36 except ImportError: # Python 2
37 import urllib as compat_urllib_parse
38
39 try:
40 from urllib.parse import urlparse as compat_urllib_parse_urlparse
41 except ImportError: # Python 2
42 from urlparse import urlparse as compat_urllib_parse_urlparse
43
44 try:
45 import urllib.parse as compat_urlparse
46 except ImportError: # Python 2
47 import urlparse as compat_urlparse
48
49 try:
50 import http.cookiejar as compat_cookiejar
51 except ImportError: # Python 2
52 import cookielib as compat_cookiejar
53
54 try:
55 import html.entities as compat_html_entities
56 except ImportError: # Python 2
57 import htmlentitydefs as compat_html_entities
58
59 try:
60 import html.parser as compat_html_parser
61 except ImportError: # Python 2
62 import HTMLParser as compat_html_parser
63
64 try:
65 import http.client as compat_http_client
66 except ImportError: # Python 2
67 import httplib as compat_http_client
68
69 try:
70 from urllib.error import HTTPError as compat_HTTPError
71 except ImportError: # Python 2
72 from urllib2 import HTTPError as compat_HTTPError
73
74 try:
75 from urllib.request import urlretrieve as compat_urlretrieve
76 except ImportError: # Python 2
77 from urllib import urlretrieve as compat_urlretrieve
78
79
80 try:
81 from subprocess import DEVNULL
82 compat_subprocess_get_DEVNULL = lambda: DEVNULL
83 except ImportError:
84 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
85
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        # Percent-decode a query-string component (py2-only backport helper).
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Each item starts with the two hex digits of a %XX escape.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        # Backport of urllib.parse.parse_qsl: returns a list of (name, value).
        qs, _coerce_result = qs, unicode
        # Both '&' and ';' are accepted as pair separators.
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        # Backport of urllib.parse.parse_qs: maps each name to a list of values.
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
164
165 try:
166 compat_str = unicode # Python 2
167 except NameError:
168 compat_str = str
169
170 try:
171 compat_chr = unichr # Python 2
172 except NameError:
173 compat_chr = chr
174
def compat_ord(c):
    """Return *c* unchanged when it is already an int, otherwise ord(c).

    Smooths over the Python 2/3 difference where indexing a byte string
    yields a 1-char string on 2.x but an int on 3.x.
    """
    return c if type(c) is int else ord(c)
178
179 # This is not clearly defined otherwise
180 compiled_regex_type = type(re.compile(''))
181
182 std_headers = {
183 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
184 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
185 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
186 'Accept-Encoding': 'gzip, deflate',
187 'Accept-Language': 'en-us,en;q=0.5',
188 }
189
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Sanity-check that the reported encoding actually works.
        u'TEST'.encode(pref)
    except Exception:
        # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
        # are no longer swallowed. Any lookup/encode failure -> UTF-8.
        pref = 'UTF-8'

    return pref
203
# Printing unicode requires explicit encoding on Python 2; Python 3's
# print() handles unicode natively.
if sys.version_info < (3,0):
    def compat_print(s):
        # Unencodable characters are emitted as XML character references.
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        # Only unicode strings are accepted, by design.
        assert type(s) == type(u'')
        print(s)
211
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3,0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON to the file named fn (Python 2 variant)."""
        with open(fn, 'wb') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON to the file named fn (Python 3 variant)."""
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
222
if sys.version_info >= (2,7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key/val are spliced directly into the XPath expression below, so
        # restrict them to characters that cannot break the syntax.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val):
        # Python 2.6's ElementTree lacks [@key='val'] support; filter manually.
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None
236
237 # On python2.6 the xml.etree.ElementTree.Element methods don't support
238 # the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps in *path* to '{uri}tag' using *ns_map*.

    Needed on Python 2.6, where the ElementTree methods do not support the
    namespaces parameter.
    """
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
249
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (&#65;) or hexadecimal (&#x41;).
    # The previous pattern x?\d+ never matched the hex digits a-f, so
    # entities such as &#x2019; fell through to the literal branch below.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
274
275 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser subclass that keeps the whole document around in self.html.

    Subclasses (AttrParser, MetaParser) rely on self.html for their
    error-recovery and slicing logic.
    """
    def __init__(self):
        # Was misspelled '__init', so it never ran as the constructor and
        # self.html did not exist until loads() was called.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Parse the given HTML document string."""
        self.html = html
        self.feed(html)
        self.close()
285
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        # Target attribute/value pair to search for, e.g. ('id', 'player').
        self.attribute = attribute
        self.value = value
        # Grows to [tag_name, start_pos, end_pos]; consumed by get_result().
        self.result = None
        self.started = False
        # Per-tag-name open-tag nesting count, used to find the matching end tag.
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        # Error recovery: drop the offending line and resume parsing, but
        # give up after 10 errors or once inside the target tag.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # First event after the target's opening tag records its start position.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # Nesting of the target tag dropped to zero: its end was reached.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any parser event right after the opening tag may establish the start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        # Return the raw text between the recorded start and end positions,
        # or None if no complete match [tag, start, end] was collected.
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Single-line result: end column must be shifted by the start
            # column removed above. NOTE(review): the unconditional slice
            # below then uses a larger bound and is a no-op in this case.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# HTMLParser before Python 2.7.3 chokes on the literal "</scr'+'ipt>"
# sequence found in some pages' inline JavaScript; skip over it as text.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
351
def get_element_by_id(id, html):
    """Return the content of the element whose id attribute equals *id*.

    Thin convenience wrapper around get_element_by_attribute().
    """
    return get_element_by_attribute("id", id, html)
355
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag in *html* carrying the given
    attribute/value pair, or None when there is no such tag."""
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Malformed documents are tolerated; whatever was found so far wins.
        pass
    return finder.get_result()
364
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates the <meta> tag whose name attribute
    matches the requested name and captures its content attribute.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        # Only <meta> elements matter; skip everything else.
        if tag != 'meta':
            return
        attr_map = dict(attrs)
        if attr_map.get('name') == self.name:
            self.result = attr_map.get('content')

    def get_result(self):
        """Return the matched content value, or None when nothing matched."""
        return self.result
385
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name
    attribute, or None if the document has no such tag.
    """
    finder = MetaParser(name)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort on malformed HTML.
        pass
    return finder.get_result()
396
397
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Literal newlines become spaces; <br> and </p><p> become real breaks.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Drop the remaining markup, then resolve entities.
    html = re.sub('<.*?>', '', html)
    return unescapeHTML(html).strip()
409
410
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission problems will not be fixed by renaming; re-raise.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # os.path.join() takes the path parts as separate arguments;
        # previously a single generator was passed, which raised TypeError.
        alt_filename = os.path.join(*(
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller.
            # Open the sanitized name, not the name that just failed.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
444
445
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparsable input yields None, mirroring parsedate_tz.
        return None
    return email.utils.mktime_tz(parsed)
453
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Control characters and '?' are never representable in a filename.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    result = u''.join(replace_insane(c) for c in s)
    if not is_id:
        # Collapse runs of underscores and trim them from both ends.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
485
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first-seen order. """
    # List membership (not a set) keeps unhashable elements working.
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
493
def unescapeHTML(s):
    """Replace HTML entities in the unicode string *s* with their characters.

    @param s a string
    """
    assert type(s) == type(u'')
    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
502
def encodeFilename(s):
    """Encode a unicode filename for the current platform/Python version.

    @param s The name of the file
    """
    assert type(s) == type(u'')

    if sys.version_info >= (3, 0):
        # Python 3 has a Unicode API
        return s
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
524
def decodeOption(optval):
    """Decode a command-line option value to unicode; None passes through."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())
    assert isinstance(optval, compat_str)
    return optval
533
def formatSeconds(secs):
    """Format a duration in seconds as '[H:]M[M]:SS'-style text.

    The boundaries are inclusive: previously exact multiples fell through
    (60 -> '60', 3600 -> '60:00').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
541
def make_HTTPS_handler(opts_no_check_certificate):
    """Build an HTTPS handler that prefers SSLv3, falling back as needed.

    On Python < 3.2 there is no usable SSLContext, so a custom connection
    class wraps the socket by hand; newer versions use a configured context.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try SSLv3 first, then fall back to auto-negotiation (SSLv23).
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3()
    else:
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
        # Certificate checking is opt-out via --no-check-certificate.
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context)
575
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network errors are always "expected" (not a youtube-dl bug); this
        # relies on being constructed inside the original except block.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info() # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback formatted as unicode, or None."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
597
598
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    # Raised by extractors when a required pattern is absent from the page.
    pass
602
603
class DownloadError(Exception):
    """Download Error exception.

    Raised by FileDownloader objects that are not configured to continue
    on errors; carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
615
616
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
624
625
class PostProcessingError(Exception):
    """Post Processing exception.

    Raised by a PostProcessor's .run() method to signal a failure in the
    postprocessing task. Note: the message is kept on .msg only and is not
    forwarded to Exception.__init__ (matching historical behaviour).
    """

    def __init__(self, msg):
        self.msg = msg
634
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
638
639
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
647
648
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller than
    the size the server announced first, indicating the connection was
    probably interrupted.
    """
    # Both in bytes; class-level defaults kept from the original definition.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
663
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Try raw deflate (no zlib header) first, then standard zlib format.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older addinfourl implementations do not accept a code argument.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Re-add the standard headers so they override any urllib defaults.
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Internal marker header: strip compression support for this request.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Internal marker header: per-request User-Agent override.
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with up to 1023 trailing bytes shaved off.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response
744
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD, or None if
    *date_str* matches none of the known formats."""
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',',' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # Was a bare `except:`; strptime only raises ValueError on a
            # format mismatch, so nothing else should be swallowed here.
            pass
    if upload_date is None:
        # Last resort: RFC 2822 style dates.
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
775
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess a file extension from *url*; fall back to *default_ext* when
    the candidate contains anything but alphanumerics."""
    candidate = url.partition(u'?')[0].rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    return default_ext
782
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle filename: '<base>.<lang>.<format>'."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
785
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # Rough approximation: months/years as fixed day counts.
        if unit == 'month':
            unit, amount = 'day', amount * 30
        elif unit == 'year':
            unit, amount = 'day', amount * 365
        return today + datetime.timedelta(**{unit + 's': amount})
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
811
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        # Anything that is not exactly 8 digits passes through untouched.
        return date_str
    return '-'.join(parts.groups())
820
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the extreme representable dates.
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
846
847
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())
    assert isinstance(name, compat_str)
    return name
856
857
def write_string(s, out=None):
    """Write the unicode string *s* to *out* (default sys.stderr), encoding
    it first for byte streams (and always on Python 2, whose sys.stderr
    lies about its mode)."""
    if out is None:
        out = sys.stderr
    assert type(s) == type(u'')

    needs_encoding = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if needs_encoding:
        s = s.encode(preferredencoding(), 'ignore')
    out.write(s)
    out.flush()
868
869
def bytes_to_intlist(bs):
    """Turn a byte string into the list of its integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):
        # Python 3: indexing bytes already yields ints.
        return list(bs)
    # Python 2: indexing yields 1-char strings.
    return [ord(ch) for ch in bs]
877
878
def intlist_to_bytes(xs):
    """Turn a list of integer byte values back into a byte string."""
    if not xs:
        return b''
    if isinstance(chr(0), bytes):
        # Python 2: chr() already produces bytes.
        return ''.join(chr(x) for x in xs)
    return bytes(xs)
886
887
def get_cachedir(params={}):
    """Return the cache directory: the 'cachedir' param when present,
    otherwise $XDG_CACHE_HOME/youtube-dl (with ~/.cache as XDG fallback)."""
    default_root = os.environ.get('XDG_CACHE_HOME',
                                  os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(default_root, 'youtube-dl'))
892
893
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED struct required by Lock/UnlockFileEx.
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: byte range [0, 0x7fffffffffffffff).
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK; shared lock otherwise.
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Reuses the OVERLAPPED pointer stashed on the file object by _lock_file.
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        # POSIX advisory locking: exclusive for writers, shared for readers.
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.lockf(f, fcntl.LOCK_UN)
957
958
class locked_file(object):
    """Context manager wrapping io.open() with an advisory file lock.

    Read mode acquires a shared lock; write/append modes an exclusive one.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        want_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, want_exclusive)
        except IOError:
            # Never leak the file descriptor when locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
988
989
def shell_quote(args):
    """Join *args* into one shell-safe command-line string."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'
    quoted = []
    for arg in args:
        if isinstance(arg, bytes):
            # We may get a filename encoded with 'encodeFilename'
            arg = arg.decode(fs_encoding)
        quoted.append(pipes.quote(arg))
    return u' '.join(quoted)
1001
1002
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
    for element in seq:
        yield element
        if not pred(element):
            break
1010
1011
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The payload travels JSON-encoded inside the fragment's query string.
    payload = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return u'%s#%s' % (url, payload)
1018
1019
def unsmuggle_url(smug_url):
    """Inverse of smuggle_url(): return (url, data), with data None for
    URLs carrying no smuggled payload."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, None
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
1027
1028
def format_bytes(bytes):
    """Human-readable byte count: 1024 -> '1.00KiB'; None -> 'N/A'."""
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
1041
1042
def str_to_int(int_str):
    """Parse an int from a string that may use ',' or '.' as thousands separators."""
    return int(re.sub(r'[,\.]', u'', int_str))
1046
1047
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be determined.

    Checks the COLUMNS environment variable first, then falls back to
    asking `stty size`.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except (OSError, ValueError, IndexError):
        # Was a bare `except:`: stty missing, not attached to a tty, or
        # garbled output — in every case fall through to None.
        pass
    return None
1062
1063
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name in ENGLISH_NAMES:
        return ENGLISH_NAMES.index(name) + 1
    return None
1074
1075
def fix_xml_all_ampersand(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Deliberately unconditional: existing '&amp;' becomes '&amp;amp;' too.
    return xml_str.replace(u'&', u'&amp;')
1079
1080
def setproctitle(title):
    """Best-effort: set the process name via glibc prctl(PR_SET_NAME).

    Silently does nothing when libc.so.6 cannot be loaded (non-glibc
    platforms) or when the loaded libc lacks prctl.
    """
    assert isinstance(title, type(u''))
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Size the buffer by the encoded byte length, not the character count:
    # multi-byte UTF-8 titles used to overflow the buffer (ValueError).
    encoded = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(encoded) + 1)
    buf.value = encoded
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
    except AttributeError:
        return # Strange libc, just skip this
1094
1095
def remove_start(s, start):
    """Return *s* without the prefix *start* (unchanged when absent)."""
    return s[len(start):] if s.startswith(start) else s
1100
1101
def url_basename(url):
    """Return the last path component of *url* (query/fragment excluded by
    urlparse; trailing slashes stripped)."""
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip(u'/').split(u'/')
    return components[-1]
1105
1106
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request that issues HEAD instead of the default GET."""

    def get_method(self):
        return "HEAD"