]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
Merge remote-tracking branch 'jaimeMF/load-info'
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import email.utils
6 import errno
7 import gzip
8 import io
9 import json
10 import locale
11 import math
12 import os
13 import pipes
14 import platform
15 import re
16 import ssl
17 import socket
18 import sys
19 import traceback
20 import zlib
21
22 try:
23 import urllib.request as compat_urllib_request
24 except ImportError: # Python 2
25 import urllib2 as compat_urllib_request
26
27 try:
28 import urllib.error as compat_urllib_error
29 except ImportError: # Python 2
30 import urllib2 as compat_urllib_error
31
32 try:
33 import urllib.parse as compat_urllib_parse
34 except ImportError: # Python 2
35 import urllib as compat_urllib_parse
36
37 try:
38 from urllib.parse import urlparse as compat_urllib_parse_urlparse
39 except ImportError: # Python 2
40 from urlparse import urlparse as compat_urllib_parse_urlparse
41
42 try:
43 import urllib.parse as compat_urlparse
44 except ImportError: # Python 2
45 import urlparse as compat_urlparse
46
47 try:
48 import http.cookiejar as compat_cookiejar
49 except ImportError: # Python 2
50 import cookielib as compat_cookiejar
51
52 try:
53 import html.entities as compat_html_entities
54 except ImportError: # Python 2
55 import htmlentitydefs as compat_html_entities
56
57 try:
58 import html.parser as compat_html_parser
59 except ImportError: # Python 2
60 import HTMLParser as compat_html_parser
61
62 try:
63 import http.client as compat_http_client
64 except ImportError: # Python 2
65 import httplib as compat_http_client
66
67 try:
68 from urllib.error import HTTPError as compat_HTTPError
69 except ImportError: # Python 2
70 from urllib2 import HTTPError as compat_HTTPError
71
72 try:
73 from urllib.request import urlretrieve as compat_urlretrieve
74 except ImportError: # Python 2
75 from urllib import urlretrieve as compat_urlretrieve
76
77
# subprocess.DEVNULL is not available on older interpreters; fall back to
# opening os.devnull by hand there. Exposed as a callable so the fallback
# opens a fresh handle per call.
try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
83
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        # Percent-decode `string`. Consecutive %XX escapes are accumulated
        # as raw bytes in pct_sequence so that multi-byte (e.g. UTF-8)
        # sequences are decoded as a whole, not byte by byte.
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    # A bare '%' with nothing after it is not an escape.
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        # Split a query string into a list of (name, value) pairs.
        # Both '&' and ';' act as field separators, matching cpython 3.
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                # '+' means space in query strings; decode before unquoting.
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        # Group repeated field names into a list of values, like
        # urllib.parse.parse_qs on Python 3.
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
162
# Text-type helpers: 'unicode' and 'unichr' exist only on Python 2; on
# Python 3 the plain str/chr builtins already handle all of Unicode.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
172
def compat_ord(c):
    """Return the integer code of *c*.

    Indexing bytes yields an int on Python 3 but a one-character string on
    Python 2; accept both and normalise to the integer value.
    """
    if type(c) is int:
        return c
    return ord(c)
176
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default headers added to every HTTP request (see YoutubeDLHandler);
# the User-Agent is a desktop browser string.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
187
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding is actually usable; broken locale
        # setups can report encodings that cannot encode anything.
        u'TEST'.encode(pref)
    except Exception:
        # Was a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit; any locale failure is an Exception.
        pref = 'UTF-8'

    return pref
201
# print() that is safe on both major Python versions: on 2.x the text is
# encoded to the locale's preferred encoding first (unencodable characters
# become XML character references); on 3.x print handles text natively.
if sys.version_info < (3,0):
    def compat_print(s):
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        assert type(s) == type(u'')
        print(s)
209
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3,0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (binary mode)."""
        with open(fn, 'wb') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
220
if sys.version_info >= (2,7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Restrict key/val to characters that cannot break out of the
        # predicate string below (no quoting/escaping is performed).
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val):
        # Fallback for older interpreters: the version gate suggests their
        # ElementTree cannot evaluate attribute predicates, so filter the
        # candidates manually.
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None
234
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter, so the mapping is applied up front here.
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of *path* into '{uri}tag' form via ns_map.

    Steps without a prefix are passed through untouched; unknown prefixes
    raise KeyError.
    """
    expanded = []
    for step in path.split('/'):
        pieces = step.split(':')
        if len(pieces) == 1:
            expanded.append(pieces[0])
        else:
            prefix, tag = pieces
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
247
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (&#39;) or hex (&#x27; / &#X27;).
    # The previous pattern `#(x?\d+)` could not match the hex digits a-f,
    # so e.g. &#x2F; was truncated to 'x2' and decoded to the wrong char.
    mobj = re.match(u'(?u)#([xX][0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith((u'x', u'X')):
            base = 16
            numstr = u'0x%s' % numstr[1:]
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
272
# Replace HTMLParser's internal start-tag regex wholesale.
# NOTE(review): apparently a backport of a cpython parsing fix for
# attribute values seen in real-world markup — confirm against cpython
# history before touching the pattern.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser subclass that keeps the full document around.

    Subclasses (AttrParser, MetaParser) rely on self.html, e.g. for
    error recovery and for slicing the matched region back out.
    """
    def __init__(self):
        # Was misspelled `__init` (missing trailing underscores), so this
        # constructor never ran and self.html was not initialised here.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None  # full document text, set by loads()

    def loads(self, html):
        """Feed the complete document *html* through the parser."""
        self.html = html
        self.feed(html)
        self.close()
283
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute          # attribute name to match (e.g. 'id')
        self.value = value                  # required attribute value
        self.result = None                  # [tag, startpos, endpos] once found
        self.started = False                # currently inside the target element?
        self.depth = {}                     # per-tag nesting counts while started
        self.watch_startpos = False         # next parser event marks content start
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        # Parse errors before the target element is found are retried (up
        # to 10 times) by dropping a line from the parser's internal buffer
        # and resuming its goahead() loop; errors inside the element, or
        # past the retry budget, are fatal.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # First event after the opening tag records the content start
            # position (see find_startpos below).
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # When the element's own tag count balances, it has closed.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Whatever event follows the opening tag — text, entity, comment, … —
    # marks the start of the element's content.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        if self.result is None:
            return None
        if len(self.result) != 3:
            # Element never closed (or start position never recorded).
            return None
        # result[1]/result[2] are (line, column) pairs from getpos();
        # slice the matched region back out of the original document.
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Single-line match: the end column must be shifted left by the
            # amount just trimmed from the front. The unconditional slice
            # below is then a no-op (the string is already short enough).
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# Pre-2.7.3 HTMLParser apparently trips over the literal "</scr'+'ipt>"
# (script tags assembled in JavaScript source); skip that token verbatim
# instead of letting parse_endtag try to interpret it.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
349
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed
    HTML document, or None if no such tag exists."""
    # An element id is just an attribute match.
    return get_element_by_attribute("id", id, html)
353
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag whose *attribute* equals *value*
    in the passed HTML document, or None when nothing matches."""
    extractor = AttrParser(attribute, value)
    try:
        extractor.loads(html)
    except compat_html_parser.HTMLParseError:
        # Tolerate broken markup: return whatever was isolated so far.
        pass
    return extractor.get_result()
362
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """

    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name        # the name= attribute to look for
        self.content = None
        self.result = None      # content= of the matching meta tag

    def handle_starttag(self, tag, attrs):
        # Only <meta> tags are of interest; everything else is ignored.
        if tag != 'meta':
            return
        attributes = dict(attrs)
        if attributes.get('name') == self.name:
            self.result = attributes.get('content')

    def get_result(self):
        return self.result
383
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name
    attribute, or None when the document has no such tag.
    """
    extractor = MetaParser(name)
    try:
        extractor.loads(html)
    except compat_html_parser.HTMLParseError:
        # Broken markup is tolerated; keep whatever was found.
        pass
    return extractor.get_result()
394
395
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Literal newlines are mere whitespace; <br> and </p><p> are the real
    # line breaks.
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Drop all remaining tags, then decode the HTML entities.
    text = re.sub('<.*?>', '', text)
    return unescapeHTML(text).strip()
407
408
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            # A permission problem will not be fixed by renaming.
            raise

        # In case of error, try to remove win32 forbidden chars.
        # NOTE: os.path.join takes separate positional components — the
        # previous code passed a single generator object, which raised a
        # TypeError instead of producing the alternate name.
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller.
            # (Previously this re-opened the original `filename`, returning
            # a stream that did not correspond to alt_filename.)
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
442
443
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparseable input: keep the "no result" contract.
        return None
    return email.utils.mktime_tz(parsed)
451
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Control characters and '?' are never representable.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            # Restricted mode keeps the name pure ASCII.
            return '_'
        return char

    result = u''.join(replace_insane(c) for c in s)
    if not is_id:
        # Collapse underscore runs produced by the substitutions above.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
483
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # List membership (not a set) is deliberate: it keeps support for
    # unhashable elements at the cost of O(n^2) worst case.
    unique = []
    for element in iterable:
        if element in unique:
            continue
        unique.append(element)
    return unique
491
def unescapeHTML(s):
    """
    @param s a string
    """
    assert type(s) == type(u'')

    # Every &entity; (named or numeric) is decoded by htmlentity_transform.
    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
500
def encodeFilename(s):
    """
    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s

    # Python 2 on non-Windows: encode with the filesystem encoding.
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
522
def decodeOption(optval):
    """Normalise a command-line option value to text; None passes through."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        # Byte strings (Python 2 argv) are decoded with the locale encoding.
        optval = optval.decode(preferredencoding())
    assert isinstance(optval, compat_str)
    return optval
531
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS, or plain seconds.

    Uses '>=' at the unit boundaries: the previous strict '>' comparisons
    rendered exactly one hour as '60:00' and exactly one minute as '60'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
539
def make_HTTPS_handler(opts_no_check_certificate):
    """Build an HTTPS handler for urllib, forcing the SSL protocol version.

    SECURITY NOTE(review): both branches try ssl.PROTOCOL_SSLv3 first;
    SSLv3 is obsolete and widely disabled — worth revisiting, behavior
    left unchanged here.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                # Create the TCP connection manually so the SSL protocol
                # version passed to wrap_socket can be controlled.
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    # Peer refused SSLv3: fall back to auto-negotiation.
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3()
    else:
        # Python >= 3.2: an SSLContext can carry the verification policy.
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context)
573
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network-level failures are always treated as expected: they are
        # environmental, not bugs worth a bug report.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.cause = cause
        self.exc_info = sys.exc_info() # preserve original exception

    def format_traceback(self):
        """Render the stored traceback as text, or None if none was kept."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
595
596
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    # Subclasses ExtractorError so callers can treat it like any other
    # extraction failure while still being able to catch it specifically.
    pass
600
601
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept alongside the message so callers can inspect the root cause.
        self.exc_info = exc_info
613
614
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    # Marker exception only; carries no extra state.
    pass
622
623
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # Forward msg to Exception so str(e)/logging show the message;
        # previously only self.msg was set and str(e) was empty.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg  # kept for existing callers that read .msg
632
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    # Used as a control-flow signal to stop further downloads.
    pass
636
637
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    # Marker exception; also recognised by ExtractorError as "expected".
    pass
645
646
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Actual vs announced sizes, so callers can decide whether to retry.
        self.downloaded = downloaded
        self.expected = expected
661
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Try a raw deflate stream first (negative wbits = no zlib header);
        # fall back to a standard zlib stream.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older addinfourl objects take no status code; attach it manually
        # when getcode() is not available.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Re-apply the standard headers so they override whatever urllib
        # put there by default.
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Internal pseudo-headers: act on them, then strip them before the
        # request goes out on the wire.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with up to 1023 trailing bytes stripped off.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same treatment.
    https_request = http_request
    https_response = http_response
742
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD, or None when
    date_str matches none of the known formats."""
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',',' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # Was a bare `except:`; strptime signals a mismatch with
            # ValueError, so catch only that and try the next format.
            continue
        # First successful parse wins; no point in trying further formats.
        break
    return upload_date
769
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess a file extension from *url*, falling back to *default_ext*."""
    # Drop the query string, then take whatever follows the last dot.
    candidate = url.partition(u'?')[0].rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
776
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle filename: <base>.<language>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
779
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Absolute date.
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # timedelta knows no months or years; approximate with 30/365 days.
    if unit == 'month':
        unit = 'day'
        amount *= 30
    elif unit == 'year':
        unit = 'day'
        amount *= 365
    return today + datetime.timedelta(**{unit + 's': amount})
805
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the widest representable range.
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
831
832
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Python 2 can hand back bytes; decode with the locale's encoding.
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
841
842
def write_string(s, out=None):
    """Write the text *s* to *out* (default sys.stderr) and flush."""
    if out is None:
        out = sys.stderr
    assert type(s) == type(u'')

    needs_encoding = ('b' in getattr(out, 'mode', '') or
                      sys.version_info[0] < 3)  # Python 2 lies about mode of sys.stderr
    if needs_encoding:
        s = s.encode(preferredencoding(), 'ignore')
    out.write(s)
    out.flush()
853
854
def bytes_to_intlist(bs):
    """Convert a bytes/str sequence into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int): # Python 3: indexing bytes yields ints
        return list(bs)
    # Python 2: indexing yields one-character strings.
    return [ord(c) for c in bs]
862
863
def intlist_to_bytes(xs):
    """Convert a list of integer byte values back into a bytes object."""
    if not xs:
        return b''
    if isinstance(chr(0), bytes): # Python 2: bytes is str
        return ''.join([chr(x) for x in xs])
    return bytes(xs)
871
872
def get_cachedir(params=None):
    """Return the cache directory, honouring the 'cachedir' option and the
    XDG_CACHE_HOME environment variable.

    The previous signature used `params={}`; a mutable default argument is
    shared between calls, so it is replaced by the None sentinel (the dict
    is only read here, but the idiom is unsafe to leave in place).
    """
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
877
878
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure passed to
        # LockFileEx/UnlockFileEx.
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte-range length (low/high halves) covering the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED pointer alive on the file object: unlocking
        # must pass the same structure back to UnlockFileEx.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 requests an exclusive lock; 0x0 a shared one.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # POSIX: fcntl advisory locks, released with LOCK_UN.
    import fcntl

    def _lock_file(f, exclusive):
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.lockf(f, fcntl.LOCK_UN)
942
943
class locked_file(object):
    """Context manager around io.open() that holds an OS-level file lock
    for the lifetime of the `with` block."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Readers can share the lock; writers/appenders need exclusivity.
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            # Locking failed: do not leak the open handle.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
973
974
def shell_quote(args):
    """Return *args* joined into a single shell-safe command line."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'

    def as_text(a):
        # We may get a filename encoded with 'encodeFilename'.
        return a.decode(encoding) if isinstance(a, bytes) else a

    return u' '.join(pipes.quote(as_text(a)) for a in args)
986
987
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
    for element in seq:
        yield element
        if not pred(element):
            # The failing element has already been yielded; stop here.
            break
995
996
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The data rides in the fragment, which servers never see.
    payload = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return u'%s#%s' % (url, payload)
1003
1004
def unsmuggle_url(smug_url):
    """Inverse of smuggle_url(): return (url, data), or (url, None) when
    the URL carries no smuggled payload."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, None
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
1012
1013
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. '1.00KiB'."""
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # log(0) is undefined: zero bytes simply uses the 'B' scale.
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
1026
def str_to_int(int_str):
    """Parse an integer that may use ',' or '.' as thousands separators."""
    return int(re.sub(r'[,\.]', u'', int_str))