1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import calendar
5 import codecs
6 import contextlib
7 import ctypes
8 import datetime
9 import email.utils
10 import errno
11 import getpass
12 import gzip
13 import itertools
14 import io
15 import json
16 import locale
17 import math
18 import os
19 import pipes
20 import platform
21 import re
22 import ssl
23 import socket
24 import struct
25 import subprocess
26 import sys
27 import tempfile
28 import traceback
29 import xml.etree.ElementTree
30 import zlib
31
32 try:
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
36
37 try:
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
41
42 try:
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
46
47 try:
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
51
52 try:
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
56
57 try:
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
61
62 try:
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
66
67 try:
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
71
72 try:
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
76
77 try:
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
81
82 try:
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
86
87
88 try:
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
91 except ImportError:
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
93
94 try:
95 from urllib.parse import unquote as compat_urllib_parse_unquote
96 except ImportError:
97 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
98 if string == '':
99 return string
100 res = string.split('%')
101 if len(res) == 1:
102 return string
103 if encoding is None:
104 encoding = 'utf-8'
105 if errors is None:
106 errors = 'replace'
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
108 pct_sequence = b''
109 string = res[0]
110 for item in res[1:]:
111 try:
112 if not item:
113 raise ValueError
114 pct_sequence += item[:2].decode('hex')
115 rest = item[2:]
116 if not rest:
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
120 continue
121 except ValueError:
122 rest = '%' + item
123 # Encountered non-percent-encoded characters. Flush the current
124 # pct_sequence.
125 string += pct_sequence.decode(encoding, errors) + rest
126 pct_sequence = b''
127 if pct_sequence:
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
130 return string
131
132
133 try:
134 from urllib.parse import parse_qs as compat_parse_qs
135 except ImportError: # Python 2
136 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
137 # Python 2's version is apparently totally broken
138
139 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
140 encoding='utf-8', errors='replace'):
141 qs, _coerce_result = qs, unicode
142 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
143 r = []
144 for name_value in pairs:
145 if not name_value and not strict_parsing:
146 continue
147 nv = name_value.split('=', 1)
148 if len(nv) != 2:
149 if strict_parsing:
150 raise ValueError("bad query field: %r" % (name_value,))
151 # Handle case of a control-name with no equal sign
152 if keep_blank_values:
153 nv.append('')
154 else:
155 continue
156 if len(nv[1]) or keep_blank_values:
157 name = nv[0].replace('+', ' ')
158 name = compat_urllib_parse_unquote(
159 name, encoding=encoding, errors=errors)
160 name = _coerce_result(name)
161 value = nv[1].replace('+', ' ')
162 value = compat_urllib_parse_unquote(
163 value, encoding=encoding, errors=errors)
164 value = _coerce_result(value)
165 r.append((name, value))
166 return r
167
168 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
169 encoding='utf-8', errors='replace'):
170 parsed_result = {}
171 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
172 encoding=encoding, errors=errors)
173 for name, value in pairs:
174 if name in parsed_result:
175 parsed_result[name].append(value)
176 else:
177 parsed_result[name] = [value]
178 return parsed_result
179
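# Usage sketch (illustrative, made-up query string): repeated keys are grouped into
# lists, and blank values are dropped unless keep_blank_values is set:
#
#     compat_parse_qs(u'v=abc&t=1m30s&t=2m')
#     # -> {u'v': [u'abc'], u't': [u'1m30s', u'2m']}
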
180 try:
181 compat_str = unicode # Python 2
182 except NameError:
183 compat_str = str
184
185 try:
186 compat_chr = unichr # Python 2
187 except NameError:
188 compat_chr = chr
189
190 try:
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192 except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
194
195 try:
196 from shlex import quote as shlex_quote
197 except ImportError: # Python < 3.3
198 def shlex_quote(s):
199 return "'" + s.replace("'", "'\"'\"'") + "'"
200
201
202 def compat_ord(c):
203 # bytes iteration yields ints on Python 3 but 1-char strings on Python 2
204 return c if type(c) is int else ord(c)
205
206 # This is not clearly defined otherwise
207 compiled_regex_type = type(re.compile(''))
208
209 std_headers = {
210 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
211 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
212 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
213 'Accept-Encoding': 'gzip, deflate',
214 'Accept-Language': 'en-us,en;q=0.5',
215 }
216
217 def preferredencoding():
218 """Get preferred encoding.
219
220 Returns the best encoding scheme for the system, based on
221 locale.getpreferredencoding() and some further tweaks.
222 """
223 try:
224 pref = locale.getpreferredencoding()
225 u'TEST'.encode(pref)
226 except:
227 pref = 'UTF-8'
228
229 return pref
230
231 if sys.version_info < (3,0):
232 def compat_print(s):
233 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
234 else:
235 def compat_print(s):
236 assert type(s) == type(u'')
237 print(s)
238
239
240 def write_json_file(obj, fn):
241 """ Encode obj as JSON and write it to fn, atomically """
242
243 args = {
244 'suffix': '.tmp',
245 'prefix': os.path.basename(fn) + '.',
246 'dir': os.path.dirname(fn),
247 'delete': False,
248 }
249
250 # In Python 2.x, json.dump expects a bytestream.
251 # In Python 3.x, it writes to a character stream
252 if sys.version_info < (3, 0):
253 args['mode'] = 'wb'
254 else:
255 args.update({
256 'mode': 'w',
257 'encoding': 'utf-8',
258 })
259
260 tf = tempfile.NamedTemporaryFile(**args)
261
262 try:
263 with tf:
264 json.dump(obj, tf)
265 os.rename(tf.name, fn)
266 except:
267 try:
268 os.remove(tf.name)
269 except OSError:
270 pass
271 raise
272
273
274 if sys.version_info >= (2, 7):
275 def find_xpath_attr(node, xpath, key, val):
276 """ Find the xpath xpath[@key=val] """
277 assert re.match(r'^[a-zA-Z-]+$', key)
278 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
279 expr = xpath + u"[@%s='%s']" % (key, val)
280 return node.find(expr)
281 else:
282 def find_xpath_attr(node, xpath, key, val):
283 for f in node.findall(xpath):
284 if f.attrib.get(key) == val:
285 return f
286 return None
287
288 # On python2.6 the xml.etree.ElementTree.Element methods don't support
289 # the namespace parameter
290 def xpath_with_ns(path, ns_map):
291 components = [c.split(':') for c in path.split('/')]
292 replaced = []
293 for c in components:
294 if len(c) == 1:
295 replaced.append(c[0])
296 else:
297 ns, tag = c
298 replaced.append('{%s}%s' % (ns_map[ns], tag))
299 return '/'.join(replaced)
300
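# Usage sketch (illustrative; the namespace mapping below is just an example):
#
#     xpath_with_ns('media:group/media:title', {'media': 'http://search.yahoo.com/mrss/'})
#     # -> '{http://search.yahoo.com/mrss/}group/{http://search.yahoo.com/mrss/}title'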
301
302 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
303 class BaseHTMLParser(compat_html_parser.HTMLParser):
304 def __init__(self):
305 compat_html_parser.HTMLParser.__init__(self)
306 self.html = None
307
308 def loads(self, html):
309 self.html = html
310 self.feed(html)
311 self.close()
312
313 class AttrParser(BaseHTMLParser):
314 """Modified HTMLParser that isolates a tag with the specified attribute"""
315 def __init__(self, attribute, value):
316 self.attribute = attribute
317 self.value = value
318 self.result = None
319 self.started = False
320 self.depth = {}
321 self.watch_startpos = False
322 self.error_count = 0
323 BaseHTMLParser.__init__(self)
324
325 def error(self, message):
326 if self.error_count > 10 or self.started:
327 raise compat_html_parser.HTMLParseError(message, self.getpos())
328 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
329 self.error_count += 1
330 self.goahead(1)
331
332 def handle_starttag(self, tag, attrs):
333 attrs = dict(attrs)
334 if self.started:
335 self.find_startpos(None)
336 if self.attribute in attrs and attrs[self.attribute] == self.value:
337 self.result = [tag]
338 self.started = True
339 self.watch_startpos = True
340 if self.started:
341 if tag not in self.depth: self.depth[tag] = 0
342 self.depth[tag] += 1
343
344 def handle_endtag(self, tag):
345 if self.started:
346 if tag in self.depth: self.depth[tag] -= 1
347 if self.depth[self.result[0]] == 0:
348 self.started = False
349 self.result.append(self.getpos())
350
351 def find_startpos(self, x):
352 """Needed to put the start position of the result (self.result[1])
353 after the opening tag with the requested id"""
354 if self.watch_startpos:
355 self.watch_startpos = False
356 self.result.append(self.getpos())
357 handle_entityref = handle_charref = handle_data = handle_comment = \
358 handle_decl = handle_pi = unknown_decl = find_startpos
359
360 def get_result(self):
361 if self.result is None:
362 return None
363 if len(self.result) != 3:
364 return None
365 lines = self.html.split('\n')
366 lines = lines[self.result[1][0]-1:self.result[2][0]]
367 lines[0] = lines[0][self.result[1][1]:]
368 if len(lines) == 1:
369 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
370 lines[-1] = lines[-1][:self.result[2][1]]
371 return '\n'.join(lines).strip()
372 # Hack for https://github.com/rg3/youtube-dl/issues/662
373 if sys.version_info < (2, 7, 3):
374 AttrParser.parse_endtag = (lambda self, i:
375 i + len("</scr'+'ipt>")
376 if self.rawdata[i:].startswith("</scr'+'ipt>")
377 else compat_html_parser.HTMLParser.parse_endtag(self, i))
378
379 def get_element_by_id(id, html):
380 """Return the content of the tag with the specified ID in the passed HTML document"""
381 return get_element_by_attribute("id", id, html)
382
383 def get_element_by_attribute(attribute, value, html):
384 """Return the content of the tag with the specified attribute in the passed HTML document"""
385 parser = AttrParser(attribute, value)
386 try:
387 parser.loads(html)
388 except compat_html_parser.HTMLParseError:
389 pass
390 return parser.get_result()
391
392 class MetaParser(BaseHTMLParser):
393 """
394 Modified HTMLParser that isolates a meta tag with the specified name
395 attribute.
396 """
397 def __init__(self, name):
398 BaseHTMLParser.__init__(self)
399 self.name = name
400 self.content = None
401 self.result = None
402
403 def handle_starttag(self, tag, attrs):
404 if tag != 'meta':
405 return
406 attrs = dict(attrs)
407 if attrs.get('name') == self.name:
408 self.result = attrs.get('content')
409
410 def get_result(self):
411 return self.result
412
413 def get_meta_content(name, html):
414 """
415 Return the content attribute from the meta tag with the given name attribute.
416 """
417 parser = MetaParser(name)
418 try:
419 parser.loads(html)
420 except compat_html_parser.HTMLParseError:
421 pass
422 return parser.get_result()
423
424
425 def clean_html(html):
426 """Clean an HTML snippet into a readable string"""
427 # Newline vs <br />
428 html = html.replace('\n', ' ')
429 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
430 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
431 # Strip html tags
432 html = re.sub('<.*?>', '', html)
433 # Replace html entities
434 html = unescapeHTML(html)
435 return html.strip()
436
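# Usage sketch (illustrative input): <br/> tags become newlines, remaining markup is
# stripped and entities are unescaped:
#
#     clean_html(u'<p>First line<br/>Second &amp; last</p>')
#     # -> u'First line\nSecond & last'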
437
438 def sanitize_open(filename, open_mode):
439 """Try to open the given filename, and slightly tweak it if this fails.
440
441 Attempts to open the given filename. If this fails, it tries to change
442 the filename slightly, step by step, until it's either able to open it
443 or it fails and raises a final exception, like the standard open()
444 function.
445
446 It returns the tuple (stream, definitive_file_name).
447 """
448 try:
449 if filename == u'-':
450 if sys.platform == 'win32':
451 import msvcrt
452 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
453 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
454 stream = open(encodeFilename(filename), open_mode)
455 return (stream, filename)
456 except (IOError, OSError) as err:
457 if err.errno in (errno.EACCES,):
458 raise
459
460 # In case of error, try to remove win32 forbidden chars
461 alt_filename = os.path.join(*(
462 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
463 for path_part in os.path.split(filename)
464 ))
465 if alt_filename == filename:
466 raise
467 else:
468 # An exception here should be caught in the caller
469 stream = open(encodeFilename(alt_filename), open_mode)
470 return (stream, alt_filename)
471
472
473 def timeconvert(timestr):
474 """Convert RFC 2822 defined time string into system timestamp"""
475 timestamp = None
476 timetuple = email.utils.parsedate_tz(timestr)
477 if timetuple is not None:
478 timestamp = email.utils.mktime_tz(timetuple)
479 return timestamp
480
481 def sanitize_filename(s, restricted=False, is_id=False):
482 """Sanitizes a string so it could be used as part of a filename.
483 If restricted is set, use a stricter subset of allowed characters.
484 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
485 """
486 def replace_insane(char):
487 if char == '?' or ord(char) < 32 or ord(char) == 127:
488 return ''
489 elif char == '"':
490 return '' if restricted else '\''
491 elif char == ':':
492 return '_-' if restricted else ' -'
493 elif char in '\\/|*<>':
494 return '_'
495 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
496 return '_'
497 if restricted and ord(char) > 127:
498 return '_'
499 return char
500
501 result = u''.join(map(replace_insane, s))
502 if not is_id:
503 while '__' in result:
504 result = result.replace('__', '_')
505 result = result.strip('_')
506 # Common case of "Foreign band name - English song title"
507 if restricted and result.startswith('-_'):
508 result = result[2:]
509 if not result:
510 result = '_'
511 return result
512
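# Usage sketch (illustrative title): in restricted mode, spaces become underscores,
# quotes are dropped and ':' turns into '_-':
#
#     sanitize_filename(u'New: the "best" video?', restricted=True)
#     # -> u'New_-_the_best_video'
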
513 def orderedSet(iterable):
514 """ Remove all duplicates from the input iterable """
515 res = []
516 for el in iterable:
517 if el not in res:
518 res.append(el)
519 return res
520
521
522 def _htmlentity_transform(entity):
523 """Transforms an HTML entity to a character."""
524 # Known non-numeric HTML entity
525 if entity in compat_html_entities.name2codepoint:
526 return compat_chr(compat_html_entities.name2codepoint[entity])
527
528 mobj = re.match(r'#(x?[0-9]+)', entity)
529 if mobj is not None:
530 numstr = mobj.group(1)
531 if numstr.startswith(u'x'):
532 base = 16
533 numstr = u'0%s' % numstr
534 else:
535 base = 10
536 return compat_chr(int(numstr, base))
537
538 # Unknown entity in name, return its literal representation
539 return (u'&%s;' % entity)
540
541
542 def unescapeHTML(s):
543 if s is None:
544 return None
545 assert type(s) == compat_str
546
547 return re.sub(
548 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
549
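# Usage sketch (illustrative string): both named and numeric entities are resolved:
#
#     unescapeHTML(u'Ben &amp; Jerry&#39;s')
#     # -> u"Ben & Jerry's"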
550
551 def encodeFilename(s, for_subprocess=False):
552 """
553 @param s The name of the file
554 """
555
556 assert type(s) == compat_str
557
558 # Python 3 has a Unicode API
559 if sys.version_info >= (3, 0):
560 return s
561
562 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
563 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
564 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
565 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
566 if not for_subprocess:
567 return s
568 else:
569 # For subprocess calls, encode with locale encoding
570 # Refer to http://stackoverflow.com/a/9951851/35070
571 encoding = preferredencoding()
572 else:
573 encoding = sys.getfilesystemencoding()
574 if encoding is None:
575 encoding = 'utf-8'
576 return s.encode(encoding, 'ignore')
577
578
579 def encodeArgument(s):
580 if not isinstance(s, compat_str):
581 # Legacy code that uses byte strings
582 # Uncomment the following line after fixing all post processors
583 #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
584 s = s.decode('ascii')
585 return encodeFilename(s, True)
586
587
588 def decodeOption(optval):
589 if optval is None:
590 return optval
591 if isinstance(optval, bytes):
592 optval = optval.decode(preferredencoding())
593
594 assert isinstance(optval, compat_str)
595 return optval
596
597 def formatSeconds(secs):
598 if secs > 3600:
599 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
600 elif secs > 60:
601 return '%d:%02d' % (secs // 60, secs % 60)
602 else:
603 return '%d' % secs
604
605
606 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
607 if sys.version_info < (3, 2):
608 import httplib
609
610 class HTTPSConnectionV3(httplib.HTTPSConnection):
611 def __init__(self, *args, **kwargs):
612 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
613
614 def connect(self):
615 sock = socket.create_connection((self.host, self.port), self.timeout)
616 if getattr(self, '_tunnel_host', False):
617 self.sock = sock
618 self._tunnel()
619 try:
620 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
621 except ssl.SSLError:
622 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
623
624 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
625 def https_open(self, req):
626 return self.do_open(HTTPSConnectionV3, req)
627 return HTTPSHandlerV3(**kwargs)
628 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
629 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
630 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
631 if opts_no_check_certificate:
632 context.verify_mode = ssl.CERT_NONE
633 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
634 else: # Python < 3.4
635 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
636 context.verify_mode = (ssl.CERT_NONE
637 if opts_no_check_certificate
638 else ssl.CERT_REQUIRED)
639 context.set_default_verify_paths()
640 try:
641 context.load_default_certs()
642 except AttributeError:
643 pass # Python < 3.4
644 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
645
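# Usage sketch (illustrative; the example URL is made up): the returned handler is
# meant to be installed into a urllib opener, roughly like this:
#
#     https_handler = make_HTTPS_handler(opts_no_check_certificate=False)
#     opener = compat_urllib_request.build_opener(https_handler)
#     opener.open('https://example.com/')
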
646 class ExtractorError(Exception):
647 """Error during info extraction."""
648 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
649 """ tb, if given, is the original traceback (so that it can be printed out).
650 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
651 """
652
653 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
654 expected = True
655 if video_id is not None:
656 msg = video_id + ': ' + msg
657 if not expected:
658 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
659 super(ExtractorError, self).__init__(msg)
660
661 self.traceback = tb
662 self.exc_info = sys.exc_info() # preserve original exception
663 self.cause = cause
664 self.video_id = video_id
665
666 def format_traceback(self):
667 if self.traceback is None:
668 return None
669 return u''.join(traceback.format_tb(self.traceback))
670
671
672 class RegexNotFoundError(ExtractorError):
673 """Error when a regex didn't match"""
674 pass
675
676
677 class DownloadError(Exception):
678 """Download Error exception.
679
680 This exception may be thrown by FileDownloader objects if they are not
681 configured to continue on errors. They will contain the appropriate
682 error message.
683 """
684 def __init__(self, msg, exc_info=None):
685 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
686 super(DownloadError, self).__init__(msg)
687 self.exc_info = exc_info
688
689
690 class SameFileError(Exception):
691 """Same File exception.
692
693 This exception will be thrown by FileDownloader objects if they detect
694 multiple files would have to be downloaded to the same file on disk.
695 """
696 pass
697
698
699 class PostProcessingError(Exception):
700 """Post Processing exception.
701
702 This exception may be raised by PostProcessor's .run() method to
703 indicate an error in the postprocessing task.
704 """
705 def __init__(self, msg):
706 self.msg = msg
707
708 class MaxDownloadsReached(Exception):
709 """ --max-downloads limit has been reached. """
710 pass
711
712
713 class UnavailableVideoError(Exception):
714 """Unavailable Format exception.
715
716 This exception will be thrown when a video is requested
717 in a format that is not available for that video.
718 """
719 pass
720
721
722 class ContentTooShortError(Exception):
723 """Content Too Short exception.
724
725 This exception may be raised by FileDownloader objects when a file they
726 download is too small for what the server announced first, indicating
727 the connection was probably interrupted.
728 """
729 # Both in bytes
730 downloaded = None
731 expected = None
732
733 def __init__(self, downloaded, expected):
734 self.downloaded = downloaded
735 self.expected = expected
736
737 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
738 """Handler for HTTP requests and responses.
739
740 This class, when installed with an OpenerDirector, automatically adds
741 the standard headers to every HTTP request and handles gzipped and
742 deflated responses from web servers. If compression is to be avoided in
743 a particular request, the original request in the program code only has
744 to include the HTTP header "Youtubedl-No-Compression", which will be
745 removed before making the real request.
746
747 Part of this code was copied from:
748
749 http://techknack.net/python-urllib2-handlers/
750
751 Andrew Rowls, the author of that code, agreed to release it to the
752 public domain.
753 """
754
755 @staticmethod
756 def deflate(data):
757 try:
758 return zlib.decompress(data, -zlib.MAX_WBITS)
759 except zlib.error:
760 return zlib.decompress(data)
761
762 @staticmethod
763 def addinfourl_wrapper(stream, headers, url, code):
764 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
765 return compat_urllib_request.addinfourl(stream, headers, url, code)
766 ret = compat_urllib_request.addinfourl(stream, headers, url)
767 ret.code = code
768 return ret
769
770 def http_request(self, req):
771 for h, v in std_headers.items():
772 if h not in req.headers:
773 req.add_header(h, v)
774 if 'Youtubedl-no-compression' in req.headers:
775 if 'Accept-encoding' in req.headers:
776 del req.headers['Accept-encoding']
777 del req.headers['Youtubedl-no-compression']
778 if 'Youtubedl-user-agent' in req.headers:
779 if 'User-agent' in req.headers:
780 del req.headers['User-agent']
781 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
782 del req.headers['Youtubedl-user-agent']
783 return req
784
785 def http_response(self, req, resp):
786 old_resp = resp
787 # gzip
788 if resp.headers.get('Content-encoding', '') == 'gzip':
789 content = resp.read()
790 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
791 try:
792 uncompressed = io.BytesIO(gz.read())
793 except IOError as original_ioerror:
794 # There may be junk at the end of the file
795 # See http://stackoverflow.com/q/4928560/35070 for details
796 for i in range(1, 1024):
797 try:
798 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
799 uncompressed = io.BytesIO(gz.read())
800 except IOError:
801 continue
802 break
803 else:
804 raise original_ioerror
805 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
806 resp.msg = old_resp.msg
807 # deflate
808 if resp.headers.get('Content-encoding', '') == 'deflate':
809 gz = io.BytesIO(self.deflate(resp.read()))
810 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
811 resp.msg = old_resp.msg
812 return resp
813
814 https_request = http_request
815 https_response = http_response
816
817
818 def parse_iso8601(date_str, delimiter='T'):
819 """ Return a UNIX timestamp from the given date """
820
821 if date_str is None:
822 return None
823
824 m = re.search(
825 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
826 date_str)
827 if not m:
828 timezone = datetime.timedelta()
829 else:
830 date_str = date_str[:-len(m.group(0))]
831 if not m.group('sign'):
832 timezone = datetime.timedelta()
833 else:
834 sign = 1 if m.group('sign') == '+' else -1
835 timezone = datetime.timedelta(
836 hours=sign * int(m.group('hours')),
837 minutes=sign * int(m.group('minutes')))
838 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
839 dt = datetime.datetime.strptime(date_str, date_format) - timezone
840 return calendar.timegm(dt.timetuple())
841
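# Usage sketch (illustrative timestamp): the UTC offset is folded into the result,
# which is a POSIX timestamp:
#
#     parse_iso8601(u'2014-09-27T12:00:00+02:00')
#     # -> 1411812000 (i.e. 2014-09-27 10:00:00 UTC)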
842
843 def unified_strdate(date_str):
844 """Return a string with the date in the format YYYYMMDD"""
845
846 if date_str is None:
847 return None
848
849 upload_date = None
850 #Replace commas
851 date_str = date_str.replace(',', ' ')
852 # %z (UTC offset) is only supported in python>=3.2
853 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
854 format_expressions = [
855 '%d %B %Y',
856 '%d %b %Y',
857 '%B %d %Y',
858 '%b %d %Y',
859 '%b %dst %Y %I:%M%p',
860 '%b %dnd %Y %I:%M%p',
861 '%b %dth %Y %I:%M%p',
862 '%Y-%m-%d',
863 '%Y/%m/%d',
864 '%d.%m.%Y',
865 '%d/%m/%Y',
866 '%d/%m/%y',
867 '%Y/%m/%d %H:%M:%S',
868 '%Y-%m-%d %H:%M:%S',
869 '%d.%m.%Y %H:%M',
870 '%d.%m.%Y %H.%M',
871 '%Y-%m-%dT%H:%M:%SZ',
872 '%Y-%m-%dT%H:%M:%S.%fZ',
873 '%Y-%m-%dT%H:%M:%S.%f0Z',
874 '%Y-%m-%dT%H:%M:%S',
875 '%Y-%m-%dT%H:%M:%S.%f',
876 '%Y-%m-%dT%H:%M',
877 ]
878 for expression in format_expressions:
879 try:
880 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
881 except ValueError:
882 pass
883 if upload_date is None:
884 timetuple = email.utils.parsedate_tz(date_str)
885 if timetuple:
886 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
887 return upload_date
888
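# Usage sketch (illustrative date strings): whichever input format matches, the
# result is always YYYYMMDD:
#
#     unified_strdate('2012/10/11')   # -> '20121011'
#     unified_strdate('11.10.2012')   # -> '20121011'
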
889 def determine_ext(url, default_ext=u'unknown_video'):
890 if url is None:
891 return default_ext
892 guess = url.partition(u'?')[0].rpartition(u'.')[2]
893 if re.match(r'^[A-Za-z0-9]+$', guess):
894 return guess
895 else:
896 return default_ext
897
898 def subtitles_filename(filename, sub_lang, sub_format):
899 return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
900
901 def date_from_str(date_str):
902 """
903 Return a datetime object from a string in the format YYYYMMDD or
904 (now|today)[+-][0-9](day|week|month|year)(s)?"""
905 today = datetime.date.today()
906 if date_str == 'now' or date_str == 'today':
907 return today
908 match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
909 if match is not None:
910 sign = match.group('sign')
911 time = int(match.group('time'))
912 if sign == '-':
913 time = -time
914 unit = match.group('unit')
915 # A bad approximation?
916 if unit == 'month':
917 unit = 'day'
918 time *= 30
919 elif unit == 'year':
920 unit = 'day'
921 time *= 365
922 unit += 's'
923 delta = datetime.timedelta(**{unit: time})
924 return today + delta
925 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
926
927 def hyphenate_date(date_str):
928 """
929 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
930 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
931 if match is not None:
932 return '-'.join(match.groups())
933 else:
934 return date_str
935
936 class DateRange(object):
937 """Represents a time interval between two dates"""
938 def __init__(self, start=None, end=None):
939 """start and end must be strings in the format accepted by date"""
940 if start is not None:
941 self.start = date_from_str(start)
942 else:
943 self.start = datetime.datetime.min.date()
944 if end is not None:
945 self.end = date_from_str(end)
946 else:
947 self.end = datetime.datetime.max.date()
948 if self.start > self.end:
949 raise ValueError('Date range: "%s"; the start date must be before the end date' % self)
950 @classmethod
951 def day(cls, day):
952 """Returns a range that only contains the given day"""
953 return cls(day,day)
954 def __contains__(self, date):
955 """Check if the date is in the range"""
956 if not isinstance(date, datetime.date):
957 date = date_from_str(date)
958 return self.start <= date <= self.end
959 def __str__(self):
960 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
961
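# Usage sketch (illustrative dates): ranges are built from the same strings that
# date_from_str() accepts and support containment tests:
#
#     '20140704' in DateRange('20140101', '20141231')        # -> True
#     datetime.date.today() in DateRange(start='now-1week')  # -> True (end defaults to max)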
962
963 def platform_name():
964 """ Returns the platform name as a compat_str """
965 res = platform.platform()
966 if isinstance(res, bytes):
967 res = res.decode(preferredencoding())
968
969 assert isinstance(res, compat_str)
970 return res
971
972
973 def _windows_write_string(s, out):
974 """ Returns True if the string was written using special methods,
975 False if it has yet to be written out."""
976 # Adapted from http://stackoverflow.com/a/3259271/35070
977
978 import ctypes
979 import ctypes.wintypes
980
981 WIN_OUTPUT_IDS = {
982 1: -11,
983 2: -12,
984 }
985
986 try:
987 fileno = out.fileno()
988 except AttributeError:
989 # If the output stream doesn't have a fileno, it's virtual
990 return False
991 if fileno not in WIN_OUTPUT_IDS:
992 return False
993
994 GetStdHandle = ctypes.WINFUNCTYPE(
995 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
996 ("GetStdHandle", ctypes.windll.kernel32))
997 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
998
999 WriteConsoleW = ctypes.WINFUNCTYPE(
1000 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1001 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1002 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
1003 written = ctypes.wintypes.DWORD(0)
1004
1005 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
1006 FILE_TYPE_CHAR = 0x0002
1007 FILE_TYPE_REMOTE = 0x8000
1008 GetConsoleMode = ctypes.WINFUNCTYPE(
1009 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1010 ctypes.POINTER(ctypes.wintypes.DWORD))(
1011 ("GetConsoleMode", ctypes.windll.kernel32))
1012 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1013
1014 def not_a_console(handle):
1015 if handle == INVALID_HANDLE_VALUE or handle is None:
1016 return True
1017 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1018 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1019
1020 if not_a_console(h):
1021 return False
1022
1023 def next_nonbmp_pos(s):
1024 try:
1025 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1026 except StopIteration:
1027 return len(s)
1028
1029 while s:
1030 count = min(next_nonbmp_pos(s), 1024)
1031
1032 ret = WriteConsoleW(
1033 h, s, count if count else 2, ctypes.byref(written), None)
1034 if ret == 0:
1035 raise OSError('Failed to write string')
1036 if not count: # We just wrote a non-BMP character
1037 assert written.value == 2
1038 s = s[1:]
1039 else:
1040 assert written.value > 0
1041 s = s[written.value:]
1042 return True
1043
1044
1045 def write_string(s, out=None, encoding=None):
1046 if out is None:
1047 out = sys.stderr
1048 assert type(s) == compat_str
1049
1050 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1051 if _windows_write_string(s, out):
1052 return
1053
1054 if ('b' in getattr(out, 'mode', '') or
1055 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1056 byt = s.encode(encoding or preferredencoding(), 'ignore')
1057 out.write(byt)
1058 elif hasattr(out, 'buffer'):
1059 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1060 byt = s.encode(enc, 'ignore')
1061 out.buffer.write(byt)
1062 else:
1063 out.write(s)
1064 out.flush()
1065
1066
1067 def bytes_to_intlist(bs):
1068 if not bs:
1069 return []
1070 if isinstance(bs[0], int): # Python 3
1071 return list(bs)
1072 else:
1073 return [ord(c) for c in bs]
1074
1075
1076 def intlist_to_bytes(xs):
1077 if not xs:
1078 return b''
1079 if isinstance(chr(0), bytes): # Python 2
1080 return ''.join([chr(x) for x in xs])
1081 else:
1082 return bytes(xs)
1083
1084
1085 # Cross-platform file locking
1086 if sys.platform == 'win32':
1087 import ctypes.wintypes
1088 import msvcrt
1089
1090 class OVERLAPPED(ctypes.Structure):
1091 _fields_ = [
1092 ('Internal', ctypes.wintypes.LPVOID),
1093 ('InternalHigh', ctypes.wintypes.LPVOID),
1094 ('Offset', ctypes.wintypes.DWORD),
1095 ('OffsetHigh', ctypes.wintypes.DWORD),
1096 ('hEvent', ctypes.wintypes.HANDLE),
1097 ]
1098
1099 kernel32 = ctypes.windll.kernel32
1100 LockFileEx = kernel32.LockFileEx
1101 LockFileEx.argtypes = [
1102 ctypes.wintypes.HANDLE, # hFile
1103 ctypes.wintypes.DWORD, # dwFlags
1104 ctypes.wintypes.DWORD, # dwReserved
1105 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1106 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1107 ctypes.POINTER(OVERLAPPED) # Overlapped
1108 ]
1109 LockFileEx.restype = ctypes.wintypes.BOOL
1110 UnlockFileEx = kernel32.UnlockFileEx
1111 UnlockFileEx.argtypes = [
1112 ctypes.wintypes.HANDLE, # hFile
1113 ctypes.wintypes.DWORD, # dwReserved
1114 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1115 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1116 ctypes.POINTER(OVERLAPPED) # Overlapped
1117 ]
1118 UnlockFileEx.restype = ctypes.wintypes.BOOL
1119 whole_low = 0xffffffff
1120 whole_high = 0x7fffffff
1121
1122 def _lock_file(f, exclusive):
1123 overlapped = OVERLAPPED()
1124 overlapped.Offset = 0
1125 overlapped.OffsetHigh = 0
1126 overlapped.hEvent = 0
1127 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1128 handle = msvcrt.get_osfhandle(f.fileno())
1129 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1130 whole_low, whole_high, f._lock_file_overlapped_p):
1131 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1132
1133 def _unlock_file(f):
1134 assert f._lock_file_overlapped_p
1135 handle = msvcrt.get_osfhandle(f.fileno())
1136 if not UnlockFileEx(handle, 0,
1137 whole_low, whole_high, f._lock_file_overlapped_p):
1138 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1139
1140 else:
1141 import fcntl
1142
1143 def _lock_file(f, exclusive):
1144 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1145
1146 def _unlock_file(f):
1147 fcntl.flock(f, fcntl.LOCK_UN)
1148
1149
1150 class locked_file(object):
1151 def __init__(self, filename, mode, encoding=None):
1152 assert mode in ['r', 'a', 'w']
1153 self.f = io.open(filename, mode, encoding=encoding)
1154 self.mode = mode
1155
1156 def __enter__(self):
1157 exclusive = self.mode != 'r'
1158 try:
1159 _lock_file(self.f, exclusive)
1160 except IOError:
1161 self.f.close()
1162 raise
1163 return self
1164
1165 def __exit__(self, etype, value, traceback):
1166 try:
1167 _unlock_file(self.f)
1168 finally:
1169 self.f.close()
1170
1171 def __iter__(self):
1172 return iter(self.f)
1173
1174 def write(self, *args):
1175 return self.f.write(*args)
1176
1177 def read(self, *args):
1178 return self.f.read(*args)
1179
1180
1181 def shell_quote(args):
1182 quoted_args = []
1183 encoding = sys.getfilesystemencoding()
1184 if encoding is None:
1185 encoding = 'utf-8'
1186 for a in args:
1187 if isinstance(a, bytes):
1188 # We may get a filename encoded with 'encodeFilename'
1189 a = a.decode(encoding)
1190 quoted_args.append(pipes.quote(a))
1191 return u' '.join(quoted_args)
1192
1193
1194 def takewhile_inclusive(pred, seq):
1195 """ Like itertools.takewhile, but include the latest evaluated element
1196 (the first element for which pred(e) is false) """
1197 for e in seq:
1198 yield e
1199 if not pred(e):
1200 return
1201
1202
1203 def smuggle_url(url, data):
1204 """ Pass additional data in a URL for internal use. """
1205
1206 sdata = compat_urllib_parse.urlencode(
1207 {u'__youtubedl_smuggle': json.dumps(data)})
1208 return url + u'#' + sdata
1209
1210
1211 def unsmuggle_url(smug_url, default=None):
1212 if '#__youtubedl_smuggle' not in smug_url:
1213 return smug_url, default
1214 url, _, sdata = smug_url.rpartition(u'#')
1215 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1216 data = json.loads(jsond)
1217 return url, data
1218
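# Usage sketch (illustrative URL and payload): the two helpers round-trip extra data
# through the URL fragment:
#
#     url = smuggle_url(u'http://example.com/video', {u'source': u'embed_page'})
#     unsmuggle_url(url)
#     # -> (u'http://example.com/video', {u'source': u'embed_page'})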
1219
1220 def format_bytes(bytes):
1221 if bytes is None:
1222 return u'N/A'
1223 if type(bytes) is str:
1224 bytes = float(bytes)
1225 if bytes == 0.0:
1226 exponent = 0
1227 else:
1228 exponent = int(math.log(bytes, 1024.0))
1229 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1230 converted = float(bytes) / float(1024 ** exponent)
1231 return u'%.2f%s' % (converted, suffix)
1232
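# Usage sketch (illustrative sizes): output uses binary (1024-based) units:
#
#     format_bytes(1536)   # -> u'1.50KiB'
#     format_bytes(None)   # -> u'N/A'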
1233
1234 def get_term_width():
1235 columns = os.environ.get('COLUMNS', None)
1236 if columns:
1237 return int(columns)
1238
1239 try:
1240 sp = subprocess.Popen(
1241 ['stty', 'size'],
1242 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1243 out, err = sp.communicate()
1244 return int(out.split()[1])
1245 except:
1246 pass
1247 return None
1248
1249
1250 def month_by_name(name):
1251 """ Return the number of a month by (locale-independently) English name """
1252
1253 ENGLISH_NAMES = [
1254 u'January', u'February', u'March', u'April', u'May', u'June',
1255 u'July', u'August', u'September', u'October', u'November', u'December']
1256 try:
1257 return ENGLISH_NAMES.index(name) + 1
1258 except ValueError:
1259 return None
1260
1261
1262 def fix_xml_ampersands(xml_str):
1263 """Replace all the '&' by '&amp;' in XML"""
1264 return re.sub(
1265 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1266 u'&amp;',
1267 xml_str)
1268
1269
1270 def setproctitle(title):
1271 assert isinstance(title, compat_str)
1272 try:
1273 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1274 except OSError:
1275 return
1276 title_bytes = title.encode('utf-8')
1277 buf = ctypes.create_string_buffer(len(title_bytes) + 1)  # leave room for a trailing NUL
1278 buf.value = title_bytes
1279 try:
1280 libc.prctl(15, buf, 0, 0, 0)
1281 except AttributeError:
1282 return # Strange libc, just skip this
1283
1284
1285 def remove_start(s, start):
1286 if s.startswith(start):
1287 return s[len(start):]
1288 return s
1289
1290
1291 def remove_end(s, end):
1292 if s.endswith(end):
1293 return s[:-len(end)]
1294 return s
1295
1296
1297 def url_basename(url):
1298 path = compat_urlparse.urlparse(url).path
1299 return path.strip(u'/').split(u'/')[-1]
1300
1301
1302 class HEADRequest(compat_urllib_request.Request):
1303 def get_method(self):
1304 return "HEAD"
1305
1306
1307 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1308 if get_attr:
1309 if v is not None:
1310 v = getattr(v, get_attr, None)
1311 if v == '':
1312 v = None
1313 return default if v is None else (int(v) * invscale // scale)
1314
1315
1316 def str_or_none(v, default=None):
1317 return default if v is None else compat_str(v)
1318
1319
1320 def str_to_int(int_str):
1321 """ A more relaxed version of int_or_none """
1322 if int_str is None:
1323 return None
1324 int_str = re.sub(r'[,\.\+]', u'', int_str)
1325 return int(int_str)
1326
1327
1328 def float_or_none(v, scale=1, invscale=1, default=None):
1329 return default if v is None else (float(v) * invscale / scale)
1330
1331
1332 def parse_duration(s):
1333 if s is None:
1334 return None
1335
1336 s = s.strip()
1337
1338 m = re.match(
1339 r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
1340 if not m:
1341 return None
1342 res = int(m.group('secs'))
1343 if m.group('mins'):
1344 res += int(m.group('mins')) * 60
1345 if m.group('hours'):
1346 res += int(m.group('hours')) * 60 * 60
1347 if m.group('ms'):
1348 res += float(m.group('ms'))
1349 return res
1350
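# Usage sketch (illustrative duration strings): the result is always in seconds:
#
#     parse_duration('1:02:03')     # -> 3723
#     parse_duration('3 min 15 s')  # -> 195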
1351
1352 def prepend_extension(filename, ext):
1353 name, real_ext = os.path.splitext(filename)
1354 return u'{0}.{1}{2}'.format(name, ext, real_ext)
1355
1356
1357 def check_executable(exe, args=[]):
1358 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1359 args can be a list of arguments for a short output (like -version) """
1360 try:
1361 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1362 except OSError:
1363 return False
1364 return exe
1365
1366
1367 class PagedList(object):
1368 def __init__(self, pagefunc, pagesize):
1369 self._pagefunc = pagefunc
1370 self._pagesize = pagesize
1371
1372 def __len__(self):
1373 # This is only useful for tests
1374 return len(self.getslice())
1375
1376 def getslice(self, start=0, end=None):
1377 res = []
1378 for pagenum in itertools.count(start // self._pagesize):
1379 firstid = pagenum * self._pagesize
1380 nextfirstid = pagenum * self._pagesize + self._pagesize
1381 if start >= nextfirstid:
1382 continue
1383
1384 page_results = list(self._pagefunc(pagenum))
1385
1386 startv = (
1387 start % self._pagesize
1388 if firstid <= start < nextfirstid
1389 else 0)
1390
1391 endv = (
1392 ((end - 1) % self._pagesize) + 1
1393 if (end is not None and firstid <= end <= nextfirstid)
1394 else None)
1395
1396 if startv != 0 or endv is not None:
1397 page_results = page_results[startv:endv]
1398 res.extend(page_results)
1399
1400 # A little optimization: if the current page is not "full", i.e. does
1401 # not contain page_size videos, then we can assume that this page
1402 # is the last one - there are no more ids on further pages -
1403 # so there is no need to query again.
1404 if len(page_results) + startv < self._pagesize:
1405 break
1406
1407 # If we got the whole page, but the next page is not interesting,
1408 # break out early as well
1409 if end == nextfirstid:
1410 break
1411 return res
1412
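# Usage sketch (illustrative pagefunc): pagefunc(n) must return the items of page n;
# getslice() then stitches the requested [start, end) range together:
#
#     pl = PagedList(lambda n: [n * 3, n * 3 + 1, n * 3 + 2], pagesize=3)
#     pl.getslice(2, 7)
#     # -> [2, 3, 4, 5, 6]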
1413
1414 def uppercase_escape(s):
1415 unicode_escape = codecs.getdecoder('unicode_escape')
1416 return re.sub(
1417 r'\\U[0-9a-fA-F]{8}',
1418 lambda m: unicode_escape(m.group(0))[0],
1419 s)
1420
1421 try:
1422 struct.pack(u'!I', 0)
1423 except TypeError:
1424 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1425 def struct_pack(spec, *args):
1426 if isinstance(spec, compat_str):
1427 spec = spec.encode('ascii')
1428 return struct.pack(spec, *args)
1429
1430 def struct_unpack(spec, *args):
1431 if isinstance(spec, compat_str):
1432 spec = spec.encode('ascii')
1433 return struct.unpack(spec, *args)
1434 else:
1435 struct_pack = struct.pack
1436 struct_unpack = struct.unpack
1437
1438
1439 def read_batch_urls(batch_fd):
1440 def fixup(url):
1441 if not isinstance(url, compat_str):
1442 url = url.decode('utf-8', 'replace')
1443 for bom in (u'\xef\xbb\xbf', u'\ufeff'):  # UTF-8 BOM, byte-wise or decoded form
1444 if url.startswith(bom):
1445 url = url[len(bom):]
1446 url = url.strip()
1447 if url.startswith(('#', ';', ']')):
1448 return False
1449 return url
1450
1451 with contextlib.closing(batch_fd) as fd:
1452 return [url for url in map(fixup, fd) if url]
1453
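# Usage sketch (illustrative batch file): comment lines and blank lines are skipped;
# any file-like object works (io is imported above):
#
#     batch = io.StringIO(u'# my queue\nhttp://example.com/a\n\nhttp://example.com/b\n')
#     read_batch_urls(batch)
#     # -> [u'http://example.com/a', u'http://example.com/b']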
1454
1455 def urlencode_postdata(*args, **kargs):
1456 return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1457
1458
1459 try:
1460 etree_iter = xml.etree.ElementTree.Element.iter
1461 except AttributeError: # Python <=2.6
1462 etree_iter = lambda n: n.findall('.//*')
1463
1464
1465 def parse_xml(s):
1466 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1467 def doctype(self, name, pubid, system):
1468 pass # Ignore doctypes
1469
1470 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1471 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1472 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1473 # Fix up XML parser in Python 2.x
1474 if sys.version_info < (3, 0):
1475 for n in etree_iter(tree):
1476 if n.text is not None:
1477 if not isinstance(n.text, compat_str):
1478 n.text = n.text.decode('utf-8')
1479 return tree
1480
1481
1482 if sys.version_info < (3, 0) and sys.platform == 'win32':
1483 def compat_getpass(prompt, *args, **kwargs):
1484 if isinstance(prompt, compat_str):
1485 prompt = prompt.encode(preferredencoding())
1486 return getpass.getpass(prompt, *args, **kwargs)
1487 else:
1488 compat_getpass = getpass.getpass
1489
1490
1491 US_RATINGS = {
1492 'G': 0,
1493 'PG': 10,
1494 'PG-13': 13,
1495 'R': 16,
1496 'NC': 18,
1497 }
1498
1499
1500 def strip_jsonp(code):
1501 return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code)
1502
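# Usage sketch (illustrative JSONP response): the padding callback is stripped so the
# payload can be fed to json.loads():
#
#     strip_jsonp('callback({"status": "ok"});')
#     # -> '{"status": "ok"}'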
1503
1504 def js_to_json(code):
1505 def fix_kv(m):
1506 key = m.group(2)
1507 if key.startswith("'"):
1508 assert key.endswith("'")
1509 assert '"' not in key
1510 key = '"%s"' % key[1:-1]
1511 elif not key.startswith('"'):
1512 key = '"%s"' % key
1513
1514 value = m.group(4)
1515 if value.startswith("'"):
1516 assert value.endswith("'")
1517 assert '"' not in value
1518 value = '"%s"' % value[1:-1]
1519
1520 return m.group(1) + key + m.group(3) + value
1521
1522 res = re.sub(r'''(?x)
1523 ([{,]\s*)
1524 ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
1525 (:\s*)
1526 ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
1527 ''', fix_kv, code)
1528 res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)  # drop trailing commas before ] or }
1529 return res
1530
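# Usage sketch (illustrative JS object literal): bare keys and single-quoted strings
# are rewritten so the result parses as JSON:
#
#     js_to_json("{abc: 'def', 'x': 1}")
#     # -> '{"abc": "def", "x": 1}'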
1531
1532 def qualities(quality_ids):
1533 """ Get a numeric quality value out of a list of possible values """
1534 def q(qid):
1535 try:
1536 return quality_ids.index(qid)
1537 except ValueError:
1538 return -1
1539 return q
1540
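# Usage sketch (illustrative format list, worst quality first): unknown ids sort
# below everything else:
#
#     q = qualities([u'240p', u'360p', u'720p'])
#     q(u'720p')   # -> 2
#     q(u'1080p')  # -> -1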
1541
1542 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1543
1544 try:
1545 subprocess_check_output = subprocess.check_output
1546 except AttributeError:
1547 def subprocess_check_output(*args, **kwargs):
1548 assert 'input' not in kwargs
1549 p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
1550 output, _ = p.communicate()
1551 ret = p.poll()
1552 if ret:
1553 raise subprocess.CalledProcessError(ret, args[0], output=output)  # Popen has no .args before Python 3.3
1554 return output