]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
Add __len__ to PagedLists
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
e3946f98 4import ctypes
c496ca96
PH
5import datetime
6import email.utils
f45c185f 7import errno
d77c3dfd 8import gzip
b7ab0590 9import itertools
03f9daab 10import io
f4bfd65f 11import json
d77c3dfd 12import locale
02dbf93f 13import math
d77c3dfd 14import os
4eb7f1d1 15import pipes
c496ca96 16import platform
d77c3dfd 17import re
13ebea79 18import ssl
c496ca96 19import socket
1c088fa8 20import subprocess
d77c3dfd 21import sys
01951dda 22import traceback
d77c3dfd 23import zlib
d77c3dfd 24
01ba00ca 25try:
59ae15a5 26 import urllib.request as compat_urllib_request
01ba00ca 27except ImportError: # Python 2
59ae15a5 28 import urllib2 as compat_urllib_request
01ba00ca
PH
29
30try:
59ae15a5 31 import urllib.error as compat_urllib_error
01ba00ca 32except ImportError: # Python 2
59ae15a5 33 import urllib2 as compat_urllib_error
01ba00ca
PH
34
35try:
59ae15a5 36 import urllib.parse as compat_urllib_parse
01ba00ca 37except ImportError: # Python 2
59ae15a5 38 import urllib as compat_urllib_parse
01ba00ca 39
799c0763
PH
40try:
41 from urllib.parse import urlparse as compat_urllib_parse_urlparse
42except ImportError: # Python 2
43 from urlparse import urlparse as compat_urllib_parse_urlparse
44
6543f0dc
JMF
45try:
46 import urllib.parse as compat_urlparse
47except ImportError: # Python 2
48 import urlparse as compat_urlparse
49
01ba00ca 50try:
59ae15a5 51 import http.cookiejar as compat_cookiejar
01ba00ca 52except ImportError: # Python 2
59ae15a5 53 import cookielib as compat_cookiejar
01ba00ca 54
3e669f36 55try:
59ae15a5 56 import html.entities as compat_html_entities
9f37a959 57except ImportError: # Python 2
59ae15a5 58 import htmlentitydefs as compat_html_entities
3e669f36 59
a8156c1d 60try:
59ae15a5 61 import html.parser as compat_html_parser
9f37a959 62except ImportError: # Python 2
59ae15a5 63 import HTMLParser as compat_html_parser
a8156c1d 64
348d0a7a 65try:
59ae15a5 66 import http.client as compat_http_client
9f37a959 67except ImportError: # Python 2
59ae15a5 68 import httplib as compat_http_client
348d0a7a 69
2eabb802 70try:
0e283428 71 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
72except ImportError: # Python 2
73 from urllib2 import HTTPError as compat_HTTPError
74
e0df6211
PH
75try:
76 from urllib.request import urlretrieve as compat_urlretrieve
77except ImportError: # Python 2
78 from urllib import urlretrieve as compat_urlretrieve
79
80
5910e210
PH
try:
    from subprocess import DEVNULL

    def compat_subprocess_get_DEVNULL():
        """Return the DEVNULL object provided by the subprocess module."""
        return DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        """Return a newly opened, writable file object on os.path.devnull."""
        return open(os.path.devnull, 'w')
86
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _unquote(string, encoding='utf-8', errors='replace'):
        """Decode the %xx escapes found in string and return the result."""
        if string == '':
            return string
        chunks = string.split('%')
        if len(chunks) == 1:
            # No '%' at all, nothing to decode
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pending: contiguous run of percent-encoded bytes awaiting decoding
        pending = b''
        string = chunks[0]
        for chunk in chunks[1:]:
            try:
                if not chunk:
                    raise ValueError
                pending += chunk[:2].decode('hex')
                tail = chunk[2:]
                if not tail:
                    # The chunk was a lone escape; it may belong to a longer
                    # sequence of code units, so keep it in pending for now.
                    continue
            except ValueError:
                tail = '%' + chunk
            # Plain characters follow: flush the escapes collected so far.
            string += pending.decode(encoding, errors) + tail
            pending = b''
        if pending:
            # Flush whatever is left after the last chunk
            string += pending.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Split a query string into a list of (name, value) pairs."""
        _coerce_result = unicode
        fields = [part for group in qs.split('&') for part in group.split(';')]
        result = []
        for field in fields:
            if not field and not strict_parsing:
                continue
            nv = field.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # Handle case of a control-name with no equal sign
                if not keep_blank_values:
                    continue
                nv.append('')
            if len(nv[1]) or keep_blank_values:
                name = _unquote(
                    nv[0].replace('+', ' '), encoding=encoding, errors=errors)
                value = _unquote(
                    nv[1].replace('+', ' '), encoding=encoding, errors=errors)
                result.append((_coerce_result(name), _coerce_result(value)))
        return result

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to its list of values."""
        parsed_result = {}
        for name, value in _parse_qsl(qs, keep_blank_values, strict_parsing,
                                      encoding=encoding, errors=errors):
            parsed_result.setdefault(name, []).append(value)
        return parsed_result
348d0a7a 165
# Text type and "code point -> character" function for the running interpreter
try:
    compat_str, compat_chr = unicode, unichr  # Python 2
except NameError:
    compat_str, compat_chr = str, chr
3e669f36 175
b31756c1
FV
def compat_ord(c):
    """Return c unchanged when it is a plain int, otherwise return ord(c)."""
    return c if type(c) is int else ord(c)
179
468e2e92
FV
# The type of compiled patterns is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Header name -> value; YoutubeDLHandler.http_request adds these to each request
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 190
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: when the locale
    lookup fails, or names a codec that cannot encode a simple test string,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported codec exists and is usable
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (any failure means the locale value is unusable),
        # but no longer a bare except: KeyboardInterrupt and SystemExit
        # must keep propagating.
        pref = 'UTF-8'

    return pref
d77c3dfd 204
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string s."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print s encoded with the preferred encoding, using XML character
        references for anything that encoding cannot represent."""
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd 212
f4bfd65f
PH
# json.dump writes to a character stream in Python 3.x,
# whereas in Python 2.x it expects a bytestream.
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (UTF-8 text mode)."""
        with open(fn, 'w', encoding='utf-8') as json_file:
            json.dump(obj, json_file)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (binary mode)."""
        with open(fn, 'wb') as json_file:
            json.dump(obj, json_file)
223
59ae56fa
PH
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """Return the first element matching xpath whose attribute key equals
        val, or None when there is no such element."""
        hits = [el for el in node.findall(xpath) if el.attrib.get(key) == val]
        return hits[0] if hits else None
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are pasted into the expression, so restrict their alphabet
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
        return node.find(xpath + u"[@%s='%s']" % (key, val))
237
d7e66d39
JMF
# The xml.etree.ElementTree.Element methods of python2.6 do not accept
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand every 'prefix:tag' step of path into '{uri}tag' using ns_map.

    Steps without a prefix are left untouched. A prefix missing from
    ns_map raises KeyError.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        ns, tag = pieces
        return '{%s}%s' % (ns_map[ns], tag)

    return '/'.join([expand(step) for step in path.split('/')])
250
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities, decimal references (#47) and hexadecimal references
    (#x2F or #X2f) are decoded. Anything else, including numeric references
    outside the range compat_chr accepts, is returned in its literal
    '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hexadecimal references need the full hex alphabet; matching only
    # decimal digits would cut '#x2F' short and decode it as U+0002.
    mobj = re.match(u'#([xX][0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            codepoint = int(numstr[1:], 16)
        else:
            codepoint = int(numstr, 10)
        try:
            return compat_chr(codepoint)
        except (ValueError, OverflowError):
            # Not a representable code point: keep the text as it was
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 275
# backport bugfix: replace the pattern the HTML parser module uses to locate
# the end of a start tag
compat_html_parser.locatestarttagend = re.compile(
    r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""",
    re.VERBOSE)
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document it was last asked to parse."""

    # This used to be spelled '__init', so it never ran as the constructor
    # and self.html stayed undefined until loads() was called.
    def __init__(self):
        compat_html_parser.HTMLParser.__init__(self)
        # Source text of the last document handed to loads(), None before that
        self.html = None

    def loads(self, html):
        """Parse the complete document html: store it, feed it, then close."""
        self.html = html
        self.feed(html)
        self.close()
286
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""

    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # [tag, start position, end position] once fully located
        self.result = None
        # True while the parser is inside the wanted element
        self.started = False
        # Number of currently open elements per tag name
        self.depth = {}
        # True until the position right after the opening tag is recorded
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Skip the offending line and resume parsing; give up after too many
        errors or once the wanted element has been entered."""
        if self.started or self.error_count > 10:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        remaining_lines = self.html.split('\n')[self.getpos()[0]:]  # skip one line
        self.rawdata = '\n'.join(remaining_lines)
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attributes = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attributes and attributes[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The element is complete when every instance of its tag is closed
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if not self.watch_startpos:
            return
        self.watch_startpos = False
        self.result.append(self.getpos())

    # Any kind of content following the opening tag marks the start position
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None when the element was not fully located."""
        if self.result is None or len(self.result) != 3:
            return None
        (start_line, start_col), (end_line, end_col) = self.result[1], self.result[2]
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # The start column was already cut off from the only line
            lines[-1] = lines[-1][:end_col - start_col]
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
3b024e17
PH
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, step over a literal "</scr'+'ipt>"
# instead of handing it to the stock end-tag parser.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (
        lambda self, i:
            compat_html_parser.HTMLParser.parse_endtag(self, i)
            if not self.rawdata[i:].startswith("</scr'+'ipt>")
            else i + len("</scr'+'ipt>"))
9e6dd238
FV
352
def get_element_by_id(id, html):
    """Return the content of the tag whose "id" attribute equals id in the
    passed HTML document (see get_element_by_attribute)"""
    return get_element_by_attribute("id", id, html)
356
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Report whatever was collected before the parser gave up
        pass
    return finder.get_result()
9e6dd238 365
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """

    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        # "content" attribute of the last matching meta tag seen so far
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attributes = dict(attrs)
            if attributes.get('name') == self.name:
                self.result = attributes.get('content')

    def get_result(self):
        """Return the content found by the parse, or None."""
        return self.result
386
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Report whatever was collected before the parser gave up
        pass
    return meta_parser.get_result()
397
9e6dd238
FV
398
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />: literal newlines become spaces, line breaks come
    # from <br> tags and paragraph boundaries only
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Strip html tags
    text = re.sub('<.*?>', '', text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
410
411
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special filename u'-' stands for standard output (its binary buffer
    when there is one).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be cured by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # os.path.join() takes the components as separate arguments, so the
        # list has to be unpacked; passing a generator raised TypeError.
        # NOTE(review): separators inside the directory part are replaced as
        # well - confirm this is intended for filenames that carry a directory.
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)])
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller.
        # Retry with the sanitized name; reopening the original name would
        # only fail again.
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
d77c3dfd
FV
445
446
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not a parseable date string
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 454
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    result = u''.join([replace_insane(char) for char in s])
    if is_id:
        # IDs are kept as they come out of the per-character replacement
        return result

    # Collapse runs of underscores and drop them at both ends
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result or '_'
d77c3dfd
FV
486
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd
FV
494
def unescapeHTML(s):
    """
    Replace each '&...;' sequence in s with the output of htmlentity_transform.

    @param s a string
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd 503
8bf48f23
PH
504
def encodeFilename(s, for_subprocess=False):
    """
    Return the form of the file name s to hand to the operating system:
    s itself where Unicode names can be passed directly, an encoded byte
    string otherwise.

    @param s The name of the file
    @param for_subprocess Whether the result is meant for a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return s.encode('utf-8' if encoding is None else encoding, 'ignore')
531
d77c3dfd 532
8271226a
PH
def decodeOption(optval):
    """Return optval as text: bytes are decoded with the preferred encoding,
    None is passed through unchanged."""
    if optval is not None:
        if isinstance(optval, bytes):
            optval = optval.decode(preferredencoding())

        assert isinstance(optval, compat_str)
    return optval
1c256f70 541
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The shortest form that fits is used: durations of one hour or more
    show hours, durations of one minute or more show minutes.
    """
    # The comparisons are inclusive so that exactly 3600 gives '1:00:00'
    # (not '60:00') and exactly 60 gives '1:00' (not '60').
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
549
a0ddb8a2
PH
550
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the HTTPS handler used for opening https URLs.

    opts_no_check_certificate switches certificate verification off on
    interpreters that provide ssl.SSLContext; kwargs are passed on to the
    handler constructor.
    """
    if sys.version_info >= (3, 2):
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
        if opts_no_check_certificate:
            context.verify_mode = ssl.CERT_NONE
        else:
            context.verify_mode = ssl.CERT_REQUIRED
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)

    # No SSLContext here: wrap the socket by hand inside the connection class
    import httplib

    class HTTPSConnectionV3(httplib.HTTPSConnection):
        def connect(self):
            raw_sock = socket.create_connection((self.host, self.port), self.timeout)
            if getattr(self, '_tunnel_host', False):
                self.sock = raw_sock
                self._tunnel()
            try:
                self.sock = ssl.wrap_socket(raw_sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
            except ssl.SSLError:
                # Retry with the SSLv23 protocol setting
                self.sock = ssl.wrap_socket(raw_sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

    class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
        def https_open(self, req):
            return self.do_open(HTTPSConnectionV3, req)

    return HTTPSHandlerV3(**kwargs)
ea6d901e 584
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Errors raised while handling one of these exceptions count as expected
        active_exc_type = sys.exc_info()[0]
        if active_exc_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback as one string, or None without one."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
606
1c256f70 607
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
611
612
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
d77c3dfd
FV
624
625
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
633
634
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        """msg is the error description; it is also kept in self.msg."""
        # Hand the message to Exception so that str(exc) and exc.args carry
        # it too, instead of being empty.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd
FV
643
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
d77c3dfd
FV
647
648
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
656
657
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        """downloaded and expected are the received and the announced sizes."""
        # Give the exception a message; without this call str(exc) was empty.
        super(ContentTooShortError, self).__init__(
            u'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 672
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data, trying a headerless deflate stream first and a
        regular zlib stream second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Create an addinfourl response carrying the given status code."""
        response_cls = compat_urllib_request.addinfourl
        if hasattr(response_cls, 'getcode'):
            return response_cls(stream, headers, url, code)
        # This addinfourl takes no code argument: attach it afterwards
        response = response_cls(stream, headers, url)
        response.code = code
        return response

    def http_request(self, req):
        # The standard headers replace same-named entries of the request
        for name, value in std_headers.items():
            req.headers.pop(name, None)
            req.add_header(name, value)
        # Internal marker header: do not ask for a compressed response
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']
        # Internal marker header: use its value as the user agent
        if 'Youtubedl-user-agent' in req.headers:
            req.headers.pop('User-agent', None)
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        original = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No amount of trimming helped: report the first failure
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, original.headers, original.url, original.code)
            resp.msg = original.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, original.headers, original.url, original.code)
            resp.msg = original.msg
        return resp

    https_request = http_request
    https_response = http_response
bf50b038
JMF
753
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    date_str is tried against a list of known layouts and, failing those,
    parsed as an RFC 2822 date. Returns None when date_str is None or when
    nothing matches.
    """
    if date_str is None:
        return None

    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # strptime signals a layout mismatch with ValueError; anything
            # else is a real error and must not be hidden.
            continue
        # Layouts that accept the same string yield the same day, so the
        # first match is final.
        break
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
785
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url: the text after the last '.' that
    precedes the query string. default_ext is returned when that text is
    not purely alphanumeric."""
    before_query = url.partition(u'?')[0]
    candidate = before_query.rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
73e79f2a 792
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by
    '.<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return u'.'.join([stem, sub_lang, sub_format])
795
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days. A string in
    neither format raises ValueError."""
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' inside a normal literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        # timedelta takes the plural form as keyword (days=, weeks=)
        unit += 's'
        delta = datetime.timedelta(**{unit: amount})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
821
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.
    Any other string is returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
830
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str;
        a bound left as None is open (earliest / latest representable date)"""
        if start is None:
            self.start = datetime.datetime.min.date()
        else:
            self.start = date_from_str(start)
        if end is None:
            self.end = datetime.datetime.max.date()
        else:
            self.end = date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Strings are converted first; date objects are compared as they are
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
856
857
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte strings are decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
866
867
7459e3a2
PH
def write_string(s, out=None):
    """Write the text s to the stream out (sys.stderr by default) and flush it."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # Binary streams need bytes; so does Python 2, which lies about the
    # mode of sys.stderr
    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):
        s = s.encode(preferredencoding(), 'ignore')
    try:
        out.write(s)
    except UnicodeEncodeError:
        # In Windows shells, this can fail even when the codec is just charmap!?
        # See https://wiki.python.org/moin/PrintFails#Issue
        if not (sys.platform == 'win32' and hasattr(out, 'encoding')):
            raise
        # Drop what the stream's codec cannot represent and try once more
        s = s.encode(out.encoding, 'ignore').decode(out.encoding)
        out.write(s)

    out.flush()
888
889
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of ints."""
    if not bs:
        return []
    # Indexing yields ints on Python 3 and one-character strings on Python 2
    return list(bs) if isinstance(bs[0], int) else [ord(c) for c in bs]
897
c257baff 898
def intlist_to_bytes(xs):
    """Return a byte string built from the ints in xs."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already produces byte strings
    return ''.join(map(chr, xs))
c38b1e77
PH
906
907
def get_cachedir(params=None):
    """Return the cache directory to use.

    params is an optional mapping; its 'cachedir' entry wins when present
    (whatever its value). Otherwise the result is 'youtube-dl' below
    $XDG_CACHE_HOME, or below ~/.cache when that variable is not set.
    """
    # None replaces the former mutable default argument ({})
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
c1c9a79c
PH
912
913
# Cross-platform file locking: both branches define
# _lock_file(f, exclusive) and _unlock_file(f)
if sys.platform != 'win32':
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or a shared lock on the open file f."""
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.lockf(f, operation)

    def _unlock_file(f):
        """Release the lock held on the open file f."""
        fcntl.lockf(f, fcntl.LOCK_UN)

else:
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL

    # Low and high parts of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raise OSError when LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object because _unlock_file needs it again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(handle, flags, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock the open file f; raise OSError when UnlockFileEx fails."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(handle, 0,
                                whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
977
978
class locked_file(object):
    """ Context manager around a file that is locked while it is in use.

    The file is opened right away in __init__ (mode must be 'r', 'a' or
    'w').  Entering the context locks it via _lock_file - exclusively unless
    the mode is 'r' - and leaving the context unlocks and closes it.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is not an IOError
            # on Python 2; catch both so the file never stays open.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1008
1009
def shell_quote(args):
    """ Return a single shell-escaped command line string for args.

    args is an iterable of arguments; byte strings are decoded with the
    file system encoding (utf-8 if that is unknown) before quoting.
    """
    # pipes.quote is undocumented and the pipes module is gone in
    # Python 3.13; shlex.quote is the supported equivalent.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'

    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
9d4660ca
PH
1021
1022
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Yield the elements of seq up to and including the first element
        for which pred is false (itertools.takewhile drops that element) """
    for item in seq:
        yield item
        if pred(item):
            continue
        # item was the first one failing the predicate: stop after it
        break
1030
1031
9d4660ca
PH
def smuggle_url(url, data):
    """ Attach data to url for internal use.

    data is serialized as JSON and appended as a URL-encoded
    '__youtubedl_smuggle' fragment.
    """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1038
1039
def unsmuggle_url(smug_url, default=None):
    """ Split a URL carrying a '__youtubedl_smuggle' fragment.

    Returns (url, data) with the JSON-decoded payload, or
    (smug_url, default) when the URL carries no such fragment.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    plain_url, _sep, fragment = smug_url.rpartition(u'#')
    payload = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return plain_url, json.loads(payload)
02dbf93f
PH
1047
1048
02dbf93f
PH
def format_bytes(bytes):
    """ Format a byte count with a binary unit suffix, e.g. u'1.50KiB'.

    bytes may be a number, a numeric string, or None (returns u'N/A').
    Values too small or too large for the suffix table use the first or
    last suffix instead of picking a wrong one or raising IndexError.
    """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)

    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    # Tiny fractions give a negative exponent (which would index the table
    # from the end) and huge values run past it; clamp to the valid range.
    exponent = max(0, min(exponent, len(suffixes) - 1))

    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
f53c966a 1061
1c088fa8 1062
f53c966a
JMF
def str_to_int(int_str):
    """ Parse an integer written with ',' or '.' separators, e.g. '1,234' """
    digits_only = re.sub(r'[,\.]', u'', int_str)
    return int(digits_only)
1c088fa8
PH
1066
1067
def get_term_width():
    """ Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable is tried first, then the output of
    `stty size`.  This is best effort: any failure yields None.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Malformed COLUMNS value; fall back to asking stty

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # stty missing, not a terminal, unexpected output, ...
        # (KeyboardInterrupt and SystemExit are deliberately not caught)
        return None
caefb1de
PH
1082
1083
def month_by_name(name):
    """ Return the 1-based number of the month with the given English name
        (independent of the locale), or None for an unknown name """

    english_months = (
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December')
    if name not in english_months:
        return None
    return english_months.index(name) + 1
18258362
JMF
1094
1095
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in XML by '&amp;'.

    An ampersand that already starts one of the predefined entities
    (amp, lt, gt, apos, quot) or a numeric character reference is kept.
    """
    # Numeric references may need up to 6 hex or 7 decimal digits
    # (U+10FFFF == 1114111); a 4-digit limit would corrupt references to
    # characters outside the Basic Multilingual Plane.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,6};|#[0-9]{,7};)',
        u'&amp;',
        xml_str)
e3946f98
PH
1102
1103
def setproctitle(title):
    """ Set the name of the current process via libc's prctl, best effort.

    title must be a text string.  Nothing happens (and None is returned)
    when libc.so.6 cannot be loaded or does not provide prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Size the buffer from the encoded bytes: a non-ASCII title is longer
    # in UTF-8 than its character count, which would overflow a buffer of
    # len(title) + 1 bytes.  Initializing from bytes adds the trailing NUL.
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME in <linux/prctl.h>
        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1117
1118
def remove_start(s, start):
    """ Return s without the prefix start; s is returned as is when it
        does not begin with start """
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1123
1124
def url_basename(url):
    """ Return the last component of the path of url ('' if it has none) """
    trimmed_path = compat_urlparse.urlparse(url).path.strip(u'/')
    return trimmed_path.rsplit(u'/', 1)[-1]
aa94a6d3
PH
1128
1129
class HEADRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is always HEAD """

    def get_method(self):
        return 'HEAD'
7217e148
PH
1133
1134
dd27fd17
PH
def int_or_none(v, scale=1):
    """ Return int(v) floor-divided by scale; None is passed through """
    if v is None:
        return None
    return int(v) // scale
608d11f5
PH
1137
1138
def parse_duration(s):
    """ Parse a duration written as [[hours:]mins:]secs into seconds.

    Returns None when s is None or does not have that form.
    """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)$', s)
    if m is None:
        return None

    # Groups that did not take part in the match are None and count as 0
    hours, mins, secs = m.group('hours', 'mins', 'secs')
    return int(secs) + int(mins or 0) * 60 + int(hours or 0) * 60 * 60
91d7d0b3
JMF
1153
1154
def prepend_extension(filename, ext):
    """ Insert ext in front of the extension of filename,
        e.g. ('video.mp4', 'temp') -> 'video.temp.mp4' """
    stem, original_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(stem, ext, original_ext)
d70ad093
PH
1158
1159
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False when the binary cannot be started. """
    # None instead of a shared mutable default; list() also accepts tuples
    cmd = [exe] + list(args or [])
    try:
        subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1168
1169
class PagedList(object):
    """ List of results that is fetched page by page, on demand.

    pagefunc(pagenum) is called with a zero-based page number and must
    return an iterable with the entries of that page.  A page holding
    fewer than pagesize entries is taken to be the last one.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """ Return the entries with indices start <= i < end as a list.

        end=None means "up to the last entry".  Only the pages
        overlapping the requested range are fetched.
        """
        if end is not None and end <= start:
            # Empty range.  Without this check an end on a page boundary
            # (e.g. getslice(5, 5) with pagesize 5) returned whole pages.
            return []

        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry, if it is on this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res