]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[orf] Use new extraction method (Fixes #2057)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
e3946f98 4import ctypes
c496ca96
PH
5import datetime
6import email.utils
f45c185f 7import errno
d77c3dfd 8import gzip
03f9daab 9import io
f4bfd65f 10import json
d77c3dfd 11import locale
02dbf93f 12import math
d77c3dfd 13import os
4eb7f1d1 14import pipes
c496ca96 15import platform
d77c3dfd 16import re
13ebea79 17import ssl
c496ca96 18import socket
1c088fa8 19import subprocess
d77c3dfd 20import sys
01951dda 21import traceback
d77c3dfd 22import zlib
d77c3dfd 23
01ba00ca 24try:
59ae15a5 25 import urllib.request as compat_urllib_request
01ba00ca 26except ImportError: # Python 2
59ae15a5 27 import urllib2 as compat_urllib_request
01ba00ca
PH
28
29try:
59ae15a5 30 import urllib.error as compat_urllib_error
01ba00ca 31except ImportError: # Python 2
59ae15a5 32 import urllib2 as compat_urllib_error
01ba00ca
PH
33
34try:
59ae15a5 35 import urllib.parse as compat_urllib_parse
01ba00ca 36except ImportError: # Python 2
59ae15a5 37 import urllib as compat_urllib_parse
01ba00ca 38
799c0763
PH
39try:
40 from urllib.parse import urlparse as compat_urllib_parse_urlparse
41except ImportError: # Python 2
42 from urlparse import urlparse as compat_urllib_parse_urlparse
43
6543f0dc
JMF
44try:
45 import urllib.parse as compat_urlparse
46except ImportError: # Python 2
47 import urlparse as compat_urlparse
48
01ba00ca 49try:
59ae15a5 50 import http.cookiejar as compat_cookiejar
01ba00ca 51except ImportError: # Python 2
59ae15a5 52 import cookielib as compat_cookiejar
01ba00ca 53
3e669f36 54try:
59ae15a5 55 import html.entities as compat_html_entities
9f37a959 56except ImportError: # Python 2
59ae15a5 57 import htmlentitydefs as compat_html_entities
3e669f36 58
a8156c1d 59try:
59ae15a5 60 import html.parser as compat_html_parser
9f37a959 61except ImportError: # Python 2
59ae15a5 62 import HTMLParser as compat_html_parser
a8156c1d 63
348d0a7a 64try:
59ae15a5 65 import http.client as compat_http_client
9f37a959 66except ImportError: # Python 2
59ae15a5 67 import httplib as compat_http_client
348d0a7a 68
2eabb802 69try:
0e283428 70 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
71except ImportError: # Python 2
72 from urllib2 import HTTPError as compat_HTTPError
73
e0df6211
PH
74try:
75 from urllib.request import urlretrieve as compat_urlretrieve
76except ImportError: # Python 2
77 from urllib import urlretrieve as compat_urlretrieve
78
79
5910e210
PH
80try:
81 from subprocess import DEVNULL
82 compat_subprocess_get_DEVNULL = lambda: DEVNULL
83except ImportError:
84 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
85
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from
    # CPython 3's stdlib. Python 2's version is apparently totally broken.

    def _unquote(string, encoding='utf-8', errors='replace'):
        """Decode the %xx escapes in ``string`` and return the result."""
        if string == '':
            return string
        chunks = string.split('%')
        if len(chunks) == 1:
            # No percent sign at all, nothing to decode
            return string
        encoding = 'utf-8' if encoding is None else encoding
        errors = 'replace' if errors is None else errors
        # pending: contiguous run of percent-encoded bytes, not yet decoded
        pending = b''
        string = chunks[0]
        for chunk in chunks[1:]:
            try:
                if not chunk:
                    raise ValueError
                pending += chunk[:2].decode('hex')
                tail = chunk[2:]
                if not tail:
                    # Only a single escaped byte in this chunk. It may belong
                    # to a multi-byte sequence, so keep collecting.
                    continue
            except ValueError:
                tail = '%' + chunk
            # Plain characters follow: flush the collected bytes first.
            string += pending.decode(encoding, errors) + tail
            pending = b''
        if pending:
            # Flush whatever is left at the end of the input
            string += pending.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs."""
        _coerce_result = unicode
        fields = [part for group in qs.split('&') for part in group.split(';')]
        parsed = []
        for field in fields:
            if not field and not strict_parsing:
                continue
            parts = field.split('=', 1)
            if len(parts) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # Handle case of a control-name with no equal sign
                if not keep_blank_values:
                    continue
                parts.append('')
            if not (len(parts[1]) or keep_blank_values):
                continue
            name = _coerce_result(_unquote(
                parts[0].replace('+', ' '), encoding=encoding, errors=errors))
            value = _coerce_result(_unquote(
                parts[1].replace('+', ' '), encoding=encoding, errors=errors))
            parsed.append((name, value))
        return parsed

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping names to value lists."""
        grouped = {}
        for name, value in _parse_qsl(qs, keep_blank_values, strict_parsing,
                                      encoding=encoding, errors=errors):
            grouped.setdefault(name, []).append(value)
        return grouped
348d0a7a 164
3e669f36 165try:
59ae15a5 166 compat_str = unicode # Python 2
3e669f36 167except NameError:
59ae15a5 168 compat_str = str
3e669f36
PH
169
170try:
59ae15a5 171 compat_chr = unichr # Python 2
3e669f36 172except NameError:
59ae15a5 173 compat_chr = chr
3e669f36 174
b31756c1
FV
def compat_ord(c):
    """Return the integer code of ``c``.

    A value that already is an ``int`` is returned untouched; anything else
    is passed through ``ord()``.
    """
    return c if type(c) is int else ord(c)
178
468e2e92
FV
179# This is not clearly defined otherwise
180compiled_regex_type = type(re.compile(''))
181
3e669f36 182std_headers = {
ae8f7871 183 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
59ae15a5
PH
184 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
185 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
186 'Accept-Encoding': 'gzip, deflate',
187 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 188}
f427df17 189
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or the reported codec cannot encode a trivial string,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the codec actually exists and is usable
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (locale.Error, LookupError, ...), but no longer
        # a bare except, so KeyboardInterrupt/SystemExit still propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 203
def compat_print(s):
    """Print the text string ``s`` on both Python 2 and Python 3.

    On Python 2 the string is encoded with the preferred encoding first
    (unencodable characters become XML character references); on Python 3
    it must be a text string and is printed as is.
    """
    if sys.version_info < (3, 0):
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
    else:
        assert type(s) == type(u'')
        print(s)
d77c3dfd 211
f4bfd65f
PH
212# In Python 2.x, json.dump expects a bytestream.
213# In Python 3.x, it writes to a character stream
def write_json_file(obj, fn):
    """Serialize ``obj`` as JSON into the file named ``fn``.

    The file is opened in binary mode on Python 2 and as UTF-8 text on
    Python 3, matching what json.dump expects on each version.
    """
    if sys.version_info < (3, 0):
        stream = open(fn, 'wb')
    else:
        stream = open(fn, 'w', encoding='utf-8')
    with stream as f:
        json.dump(obj, f)
222
59ae56fa
PH
def find_xpath_attr(node, xpath, key, val):
    """ Find the xpath xpath[@key=val] """
    if sys.version_info >= (2, 7):
        # Attribute predicates are supported by ElementTree here; key and
        # val are checked so they cannot break out of the expression.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
        return node.find(xpath + u"[@%s='%s']" % (key, val))

    # Older ElementTree: filter the candidates by hand
    for candidate in node.findall(xpath):
        if candidate.attrib.get(key) == val:
            return candidate
    return None
236
d7e66d39
JMF
237# On python2.6 the xml.etree.ElementTree.Element methods don't support
238# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand ``prefix:tag`` steps of ``path`` to ``{uri}tag`` form.

    ``ns_map`` maps each prefix to its namespace URI. Steps without a prefix
    are left unchanged.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
249
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity name
    without the surrounding '&' and ';'.

    Named entities and decimal/hexadecimal character references are
    decoded; anything else (including references to code points that do not
    exist) is returned literally as '&entity;'.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference. The hexadecimal form must accept the
    # digits a-f, and the whole entity has to match so that e.g. '#x2F' is
    # not mistaken for '#x2'.
    mobj = re.match(r'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))\Z', entity)
    if mobj is not None:
        hex_digits, dec_digits = mobj.groups()
        try:
            if hex_digits is not None:
                return compat_chr(int(hex_digits, 16))
            return compat_chr(int(dec_digits, 10))
        except (ValueError, OverflowError):
            # Code point outside the valid range: keep the literal text
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 274
a8156c1d 275compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document it was fed.

    The raw markup is kept in ``self.html`` so that subclasses can slice the
    original text using the positions reported by the parser.
    """

    def __init__(self):
        # Was misspelled ``__init`` before, so it never ran as a constructor
        # and ``self.html`` did not exist until loads() was called.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Parse the complete document ``html`` in one go."""
        self.html = html
        self.feed(html)
        self.close()
285
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""

    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start_position, end_position] once fully collected
        self.result = None
        self.started = False
        # Number of currently open elements per tag name since the match
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Try to recover from a parse error by skipping the current line.

        Gives up (raises HTMLParseError) after more than 10 errors, or when
        the wanted element is already being collected.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        current_line = self.getpos()[0]
        remaining = self.html.split('\n')[current_line:]  # skip one line
        self.rawdata = '\n'.join(remaining)
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The element is complete once its own tag is balanced again
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if not self.watch_startpos:
            return
        self.watch_startpos = False
        self.result.append(self.getpos())

    # Every other parser event may be the first one after the opening tag
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the text between the matched tags, or None if incomplete."""
        if self.result is None or len(self.result) != 3:
            return None
        start, end = self.result[1], self.result[2]
        # Positions are (line, column) pairs with 1-based line numbers
        lines = self.html.split('\n')[start[0] - 1:end[0]]
        lines[0] = lines[0][start[1]:]
        if len(lines) == 1:
            lines[-1] = lines[-1][:end[1] - start[1]]
        lines[-1] = lines[-1][:end[1]]
        return '\n'.join(lines).strip()
3b024e17
PH
345# Hack for https://github.com/rg3/youtube-dl/issues/662
346if sys.version_info < (2, 7, 3):
347 AttrParser.parse_endtag = (lambda self, i:
348 i + len("</scr'+'ipt>")
349 if self.rawdata[i:].startswith("</scr'+'ipt>")
350 else compat_html_parser.HTMLParser.parse_endtag(self, i))
9e6dd238
FV
351
def get_element_by_id(id, html):
    """Return the content of the tag whose "id" attribute equals ``id``.

    Thin wrapper around get_element_by_attribute().
    """
    return get_element_by_attribute("id", id, html)
355
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag carrying ``attribute`` == ``value``.

    Parse errors are ignored; whatever the parser collected up to that point
    decides the result (None when no complete element was found).
    """
    attr_parser = AttrParser(attribute, value)
    try:
        attr_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: fall through with what was parsed so far
        pass
    return attr_parser.get_result()
9e6dd238 364
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that picks out the <meta> tag whose name attribute
    equals the requested name.
    """

    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                # A later matching tag overrides an earlier one
                self.result = attr_map.get('content')

    def get_result(self):
        """Return the content attribute of the matching tag, or None."""
        return self.result
385
def get_meta_content(name, html):
    """
    Return the content attribute of the meta tag whose name attribute is
    ``name``. Parse errors are ignored.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was found before the error
        pass
    return meta_parser.get_result()
396
9e6dd238
FV
397
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    substitutions = (
        # <br> and paragraph boundaries become newlines
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip all remaining html tags
        ('<.*?>', ''),
    )
    # Literal newlines carry no meaning in HTML
    html = html.replace('\n', ' ')
    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)
    # Replace html entities
    return unescapeHTML(html).strip()
9e6dd238
FV
409
410
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, the characters that
    are forbidden on win32 are replaced by '#' in the last path component
    and the open is retried once. If that is not possible or fails too, an
    exception is raised, like the standard open() function does.

    The special filename u'-' stands for standard output (binary if
    available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem is not going to be fixed by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars. Only the
        # file name itself is rewritten: the directory part contains path
        # separators (and possibly a drive colon) that must survive.
        head, tail = os.path.split(filename)
        alt_tail = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', tail)
        if alt_tail == tail:
            # Nothing to change, so retrying would fail the same way
            raise
        alt_filename = os.path.join(head, alt_tail)
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
d77c3dfd
FV
444
445
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 453
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be
    kept if possible (the underscore clean-up is skipped for IDs).
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped entirely
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    result = u''.join([replace_insane(char) for char in s])
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends
    result = re.sub(r'_{2,}', '_', result).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
d77c3dfd
FV
485
def orderedSet(iterable):
    """ Return the elements of iterable as a list without duplicates,
    keeping the order of first occurrence """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd
FV
493
def unescapeHTML(s):
    """
    Replace the HTML entities in s by the characters they stand for.

    @param s a string
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd 502
8bf48f23
PH
503
def encodeFilename(s, for_subprocess=False):
    """
    Return the file name s in the form expected by the file APIs.

    @param s The name of the file
    @param for_subprocess Set if the name is passed to a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up have Unicode APIs as well.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = (
        sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)
    if unicode_windows and not for_subprocess:
        # Pass u'' directly to use the Unicode APIs
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
530
d77c3dfd 531
8271226a
PH
def decodeOption(optval):
    """Return the option value optval as a text string.

    Byte strings are decoded with the preferred encoding; None is passed
    through unchanged.
    """
    if optval is not None:
        if isinstance(optval, bytes):
            optval = optval.decode(preferredencoding())
        assert isinstance(optval, compat_str)
    return optval
1c256f70 540
4539dd30
PH
def formatSeconds(secs):
    """Format a duration given in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that fits is used. Exactly one minute or one hour now
    switches to the longer form ('1:00', '1:00:00'); previously these
    boundary values were rendered as '60' and '60:00'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
548
a0ddb8a2
PH
549
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Create the urllib HTTPS handler.

    opts_no_check_certificate disables certificate verification on Python
    versions that build an SSLContext (3.2 and newer). All further keyword
    arguments are passed on to the handler class.

    SSLv3 is no longer requested explicitly: it is insecure, and the
    ssl.PROTOCOL_SSLv3 constant is missing when OpenSSL is built without
    it, which made this function fail outright. PROTOCOL_SSLv23 lets both
    sides negotiate the best protocol version they share.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    # Connection goes through a proxy tunnel
                    self.sock = sock
                    self._tunnel()
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    # Fall back to protocol negotiation
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 583
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause is stored as given.
        """
        # Errors raised while handling one of these exceptions are not bugs
        expected_types = (
            compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_types:
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback as one string, or None without one."""
        if self.traceback is not None:
            return u''.join(traceback.format_tb(self.traceback))
        return None
605
1c256f70 606
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Extraction error raised when a regular expression did not match."""
610
611
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ Store msg as the exception message; exc_info, if given, is the
        original exception that caused the trouble (as returned by
        sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
d77c3dfd
FV
623
624
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
632
633
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        """ msg describes the failure; it is available as .msg and as the
        regular exception message. """
        # Initialise the base class too, so that str(error) and error.args
        # carry the message on every Python version.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd
FV
642
class MaxDownloadsReached(Exception):
    """ Signals that the --max-downloads limit has been reached. """
d77c3dfd
FV
646
647
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
655
656
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a file they
    download is smaller than what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Class-level defaults; both values are byte counts
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        """ Remember the received and the announced size (in bytes). """
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 671
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, with or without the zlib header."""
        try:
            # Raw deflate stream (no header) first
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code."""
        if not hasattr(compat_urllib_request.addinfourl, 'getcode'):
            # Constructor without a code parameter: set it afterwards
            response = compat_urllib_request.addinfourl(stream, headers, url)
            response.code = code
            return response
        return compat_urllib_request.addinfourl(stream, headers, url, code)

    def http_request(self, req):
        """Add the standard headers and apply the Youtubedl-* pseudo headers."""
        for name, value in std_headers.items():
            req.headers.pop(name, None)
            req.add_header(name, value)
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            # The pseudo header replaces the real User-agent header
            req.headers.pop('User-agent', None)
            req.headers['User-agent'] = req.headers.pop('Youtubedl-user-agent')
        return req

    def http_response(self, req, resp):
        """Transparently decompress gzip and deflate response bodies."""
        old_resp = resp
        content_encoding = resp.headers.get('Content-encoding', '')
        if content_encoding == 'gzip':
            content = resp.read()
            try:
                uncompressed = io.BytesIO(
                    gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb').read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with more and more trailing bytes cut off.
                for cut in range(1, 1024):
                    try:
                        truncated = io.BytesIO(content[:-cut])
                        uncompressed = io.BytesIO(
                            gzip.GzipFile(fileobj=truncated, mode='rb').read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(
                uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        elif content_encoding == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(
                inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets exactly the same treatment
    https_request = http_request
    https_response = http_response
bf50b038
JMF
752
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    A list of known date formats is tried first; if none fits, the string
    is parsed as an RFC 2822 date. Returns None if the date is not
    recognised at all.
    """
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # strptime signals a format mismatch with ValueError; anything
            # else is a real error and no longer silently swallowed.
            continue
        # First matching format wins, no need to try the others
        break
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
784
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url.

    The text after the last dot of the part before the first '?' is taken.
    If it is not purely alphanumeric, default_ext is returned instead.
    """
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
73e79f2a 791
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
794
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated by 30 and 365 days.
    Raises ValueError if the string has neither format.
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword arguments (days=, weeks=)
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
820
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound leaves the range open on that side"""
        if start is None:
            self.start = datetime.datetime.min.date()
        else:
            self.start = date_from_str(start)
        if end is None:
            self.end = datetime.datetime.max.date()
        else:
            self.end = date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a date object or a string) is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
846
847
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte string result: decode it
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
856
857
7459e3a2
PH
def write_string(s, out=None):
    """Write the text string s to out (default: sys.stderr) and flush it.

    The string is encoded first for binary streams and on Python 2.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    is_binary = 'b' in getattr(out, 'mode', '')
    # Python 2 lies about mode of sys.stderr
    if is_binary or sys.version_info[0] < 3:
        s = s.encode(preferredencoding(), 'ignore')
    try:
        out.write(s)
    except UnicodeEncodeError:
        # In Windows shells, this can fail even when the codec is just charmap!?
        # See https://wiki.python.org/moin/PrintFails#Issue
        if not (sys.platform == 'win32' and hasattr(out, 'encoding')):
            raise
        # Drop whatever the stream's own encoding cannot represent
        s = s.encode(out.encoding, 'ignore').decode(out.encoding)
        out.write(s)

    out.flush()
878
879
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert the byte string bs into a list of integer byte values."""
    if not bs:
        return []
    # On Python 3, indexing bytes already yields integers
    if isinstance(bs[0], int):
        return list(bs)
    return [ord(ch) for ch in bs]
887
c257baff 888
def intlist_to_bytes(xs):
    """Convert the list of integer byte values xs into a byte string."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() yields one-character byte strings
    return ''.join([chr(x) for x in xs])
c38b1e77
PH
896
897
def get_cachedir(params=None):
    """Return the cache directory to use.

    params is an optional dict of options; its 'cachedir' entry wins if
    present. Otherwise the directory is 'youtube-dl' below $XDG_CACHE_HOME,
    or below ~/.cache when that variable is not set.
    """
    # None instead of a shared mutable {} as the default value
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
c1c9a79c
PH
902
903
904# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure handed to LockFileEx/UnlockFileEx; it carries the file
        # offset at which the locked region starts.
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL

    # Low and high part of the byte count that is locked
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f, exclusively if exclusive is set."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object because unlocking needs it again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(handle, flags, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file on f."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(handle, 0,
                                whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock the open file f, exclusively if exclusive is set."""
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.lockf(f, operation)

    def _unlock_file(f):
        """Release the lock taken by _lock_file on f."""
        fcntl.lockf(f, fcntl.LOCK_UN)
967
968
class locked_file(object):
    """Open a file and hold a lock on it for the duration of a with-block.

    filename -- path handed to io.open
    mode     -- one of 'r', 'a' or 'w'; 'r' takes a shared lock, the others
                an exclusive one
    encoding -- text encoding handed to io.open

    The file is opened on construction, locked on entering the context and
    unlocked and closed on leaving it.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is not an IOError
            # on Python 2; catch both so the file never stays open.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Always close the file, even if unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
998
999
def shell_quote(args):
    """Quote each argument with pipes.quote and join them with spaces.

    Byte-string arguments are first decoded with the filesystem encoding
    (utf-8 if that is None).
    """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'

    def _quote_one(arg):
        if isinstance(arg, bytes):
            # We may get a filename encoded with 'encodeFilename'
            arg = arg.decode(fs_encoding)
        return pipes.quote(arg)

    return u' '.join([_quote_one(arg) for arg in args])
9d4660ca
PH
1011
1012
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that stops the
    iteration (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1020
1021
9d4660ca
PH
def smuggle_url(url, data):
    """ Append JSON-encoded data to url as a fragment, for internal use. """
    payload = json.dumps(data)
    fragment = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': payload})
    return url + u'#' + fragment
1028
1029
def unsmuggle_url(smug_url):
    """ Split a URL into (url, data), decoding a '__youtubedl_smuggle'
    fragment if one is present; data is None when there is none. """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, None
    plain_url, _, fragment = smug_url.rpartition(u'#')
    payload = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return plain_url, json.loads(payload)
02dbf93f
PH
1037
1038
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a string with a binary (1024-based) suffix.

    bytes -- a number, a numeric string, or None

    Returns u'N/A' for None, otherwise the value scaled to the largest
    fitting unit with two decimals, e.g. u'1.50KiB'. Values smaller than one
    byte are reported in B, and values beyond the largest unit in YiB.
    """
    if bytes is None:
        return u'N/A'
    # Accept native and unicode strings alike (type(u'') is unicode on Py2)
    if isinstance(bytes, (str, type(u''))):
        bytes = float(bytes)

    suffixes = [
        u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    last = len(suffixes) - 1
    # Use the magnitude so negative values do not break math.log
    magnitude = abs(bytes)
    if magnitude == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(magnitude, 1024.0))
        # Clamp: tiny fractions give a negative exponent (which would index
        # the suffix list from the end), huge values run past the last unit
        exponent = max(0, min(exponent, last))
        # Guard against math.log rounding just below an exact power of 1024
        if exponent < last and magnitude >= 1024 ** (exponent + 1):
            exponent += 1
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
f53c966a 1051
1c088fa8 1052
f53c966a
JMF
def str_to_int(int_str):
    """Parse an integer, ignoring any ',' and '.' characters in int_str."""
    return int(re.sub(r'[,\.]', u'', int_str))
1c088fa8
PH
1056
1057
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable is used when it holds an integer;
    otherwise the second field printed by `stty size` is used.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            # Malformed COLUMNS: fall back to asking stty
            pass

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort: stty may be missing or produce no usable output.
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        pass
    return None
caefb1de
PH
1072
1073
def month_by_name(name):
    """ Return the 1-based number of the month with the given English name
    (independent of the locale), or None if name is not a month name """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name not in ENGLISH_NAMES:
        return None
    return 1 + ENGLISH_NAMES.index(name)
18258362
JMF
1084
1085
def fix_xml_all_ampersand(xml_str):
    """Return xml_str with every '&' replaced by '&amp;'"""
    escaped = xml_str.replace(u'&', u'&amp;')
    return escaped
e3946f98
PH
1089
1090
def setproctitle(title):
    """Try to set the process name to title via libc's prctl.

    title -- text string (compat_str)

    Returns silently when libc.so.6 cannot be loaded or has no prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Size the buffer from the encoded bytes, not the character count:
    # non-ASCII characters take several bytes in UTF-8 and would overflow
    # a buffer of len(title) + 1.
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        # 15 is PR_SET_NAME in <linux/prctl.h>
        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1104
1105
def remove_start(s, start):
    """Return s without the prefix start; s itself if it lacks that prefix."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1110
1111
def url_basename(url):
    """Return the last '/'-separated component of url's path.

    Leading and trailing slashes of the path are ignored.
    """
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip(u'/').split(u'/')
    return segments[-1]
aa94a6d3
PH
1115
1116
class HEADRequest(compat_urllib_request.Request):
    """Request whose get_method() always reports "HEAD"."""

    def get_method(self):
        return "HEAD"
7217e148
PH
1120
1121
def int_or_none(v):
    """Return int(v), or None when v is None."""
    if v is None:
        return None
    return int(v)
608d11f5
PH
1124
1125
def parse_duration(s):
    """Convert a '[[H:]M:]S' duration string to a number of seconds.

    Returns None when s is None or does not match that format.
    """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)$', s)
    if not m:
        return None
    hours, mins, secs = m.group('hours', 'mins', 'secs')
    total = int(secs)
    if mins:
        total += 60 * int(mins)
    if hours:
        total += 60 * 60 * int(hours)
    return total
91d7d0b3
JMF
1140
1141
def prepend_extension(filename, ext):
    """Insert ext in front of filename's extension ('a.mp4' -> 'a.EXT.mp4')."""
    root, real_ext = os.path.splitext(filename)
    parts = (root, ext, real_ext)
    return u'{0}.{1}{2}'.format(*parts)