]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[viki] Fix subtitles extraction
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
c496ca96
PH
4import datetime
5import email.utils
f45c185f 6import errno
d77c3dfd 7import gzip
03f9daab 8import io
f4bfd65f 9import json
d77c3dfd 10import locale
02dbf93f 11import math
d77c3dfd 12import os
4eb7f1d1 13import pipes
c496ca96 14import platform
d77c3dfd 15import re
13ebea79 16import ssl
c496ca96 17import socket
d77c3dfd 18import sys
01951dda 19import traceback
02dbf93f 20import xml.etree.ElementTree
d77c3dfd 21import zlib
d77c3dfd 22
01ba00ca 23try:
59ae15a5 24 import urllib.request as compat_urllib_request
01ba00ca 25except ImportError: # Python 2
59ae15a5 26 import urllib2 as compat_urllib_request
01ba00ca
PH
27
28try:
59ae15a5 29 import urllib.error as compat_urllib_error
01ba00ca 30except ImportError: # Python 2
59ae15a5 31 import urllib2 as compat_urllib_error
01ba00ca
PH
32
33try:
59ae15a5 34 import urllib.parse as compat_urllib_parse
01ba00ca 35except ImportError: # Python 2
59ae15a5 36 import urllib as compat_urllib_parse
01ba00ca 37
799c0763
PH
38try:
39 from urllib.parse import urlparse as compat_urllib_parse_urlparse
40except ImportError: # Python 2
41 from urlparse import urlparse as compat_urllib_parse_urlparse
42
6543f0dc
JMF
43try:
44 import urllib.parse as compat_urlparse
45except ImportError: # Python 2
46 import urlparse as compat_urlparse
47
01ba00ca 48try:
59ae15a5 49 import http.cookiejar as compat_cookiejar
01ba00ca 50except ImportError: # Python 2
59ae15a5 51 import cookielib as compat_cookiejar
01ba00ca 52
3e669f36 53try:
59ae15a5 54 import html.entities as compat_html_entities
9f37a959 55except ImportError: # Python 2
59ae15a5 56 import htmlentitydefs as compat_html_entities
3e669f36 57
a8156c1d 58try:
59ae15a5 59 import html.parser as compat_html_parser
9f37a959 60except ImportError: # Python 2
59ae15a5 61 import HTMLParser as compat_html_parser
a8156c1d 62
348d0a7a 63try:
59ae15a5 64 import http.client as compat_http_client
9f37a959 65except ImportError: # Python 2
59ae15a5 66 import httplib as compat_http_client
348d0a7a 67
2eabb802 68try:
0e283428 69 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
70except ImportError: # Python 2
71 from urllib2 import HTTPError as compat_HTTPError
72
e0df6211
PH
73try:
74 from urllib.request import urlretrieve as compat_urlretrieve
75except ImportError: # Python 2
76 from urllib import urlretrieve as compat_urlretrieve
77
78
5910e210
PH
try:
    from subprocess import DEVNULL

    def compat_subprocess_get_DEVNULL():
        """Return the subprocess.DEVNULL marker provided by the stdlib."""
        return DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        """Return a newly opened, writable handle on os.path.devnull."""
        return open(os.path.devnull, 'w')
84
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace the %XX escapes in string by the characters they encode."""
        if string == '':
            return string
        chunks = string.split('%')
        if len(chunks) == 1:
            # No percent sign at all, nothing to decode
            return string
        encoding = 'utf-8' if encoding is None else encoding
        errors = 'replace' if errors is None else errors
        # Consecutive percent-encoded bytes are buffered here, so that a
        # multi-byte character is decoded in one piece.
        pending = b''
        string = chunks[0]
        for chunk in chunks[1:]:
            try:
                if not chunk:
                    raise ValueError
                pending += chunk[:2].decode('hex')
                rest = chunk[2:]
                if not rest:
                    # Only an escape in this chunk; the next one may continue
                    # the same character, so keep buffering.
                    continue
            except ValueError:
                rest = '%' + chunk
            # Literal text follows: flush the buffered bytes first.
            string += pending.decode(encoding, errors) + rest
            pending = b''
        if pending:
            # Flush whatever is left in the buffer
            string += pending.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Split a query string into a list of (name, value) pairs."""
        _coerce_result = unicode
        fields = [field
                  for amp_part in qs.split('&')
                  for field in amp_part.split(';')]
        collected = []
        for field in fields:
            if not field and not strict_parsing:
                continue
            nv = field.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # Handle case of a control-name with no equal sign
                if not keep_blank_values:
                    continue
                nv.append('')
            if len(nv[1]) or keep_blank_values:
                name = _unquote(
                    nv[0].replace('+', ' '), encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = _unquote(
                    nv[1].replace('+', ' '), encoding=encoding, errors=errors)
                value = _coerce_result(value)
                collected.append((name, value))
        return collected

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values."""
        grouped = {}
        for name, value in _parse_qsl(qs, keep_blank_values, strict_parsing,
                                      encoding=encoding, errors=errors):
            grouped.setdefault(name, []).append(value)
        return grouped
348d0a7a 163
# Text string type of the running interpreter
try:
    compat_str = unicode  # Python 2
except NameError:  # the name "unicode" does not exist here
    compat_str = str

# Callable turning a code point into a one-character text string
try:
    compat_chr = unichr  # Python 2
except NameError:  # the name "unichr" does not exist here
    compat_chr = chr
3e669f36 173
b31756c1
FV
def compat_ord(c):
    """Return c unchanged if it is exactly an int, otherwise return ord(c)."""
    return c if type(c) is int else ord(c)
177
468e2e92
FV
178# This is not clearly defined otherwise
179compiled_regex_type = type(re.compile(''))
180
3e669f36 181std_headers = {
ae8f7871 182 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
59ae15a5
PH
183 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
184 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
185 'Accept-Encoding': 'gzip, deflate',
186 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 187}
f427df17 188
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or the reported encoding cannot be used to encode text,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec; an unknown or unusable name raises here
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (any failure means "fall back"), but unlike a
        # bare except this lets KeyboardInterrupt/SystemExit propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 202
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string s."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print s encoded with the preferred encoding.

        Characters the encoding cannot represent are written as XML
        character references.
        """
        encoded = s.encode(preferredencoding(), 'xmlcharrefreplace')
        print(encoded)
d77c3dfd 210
f4bfd65f
PH
# json.dump wants a byte stream on Python 2.x and a character stream on
# Python 3.x, hence the two variants.
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as json_file:
            json.dump(obj, json_file)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (binary mode)."""
        with open(fn, 'wb') as json_file:
            json.dump(obj, json_file)
221
59ae56fa
PH
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Filter the candidates by hand instead of using an attribute predicate
        for candidate in node.findall(xpath):
            if candidate.attrib.get(key) == val:
                return candidate
        return None
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Both parts are pasted into the expression, so only plain
        # characters are accepted.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
235
d7e66d39
JMF
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand every "prefix:tag" step of path to "{uri}tag" using ns_map.

    Steps without a prefix are kept as they are. A prefix missing from
    ns_map raises KeyError.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        ns, tag = pieces
        return '{%s}%s' % (ns_map[ns], tag)

    return '/'.join(expand(step) for step in path.split('/'))
248
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the leading '&' and the trailing ';'.

    Named entities and decimal/hexadecimal character references are
    decoded; anything else is returned in its literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric reference. The whole entity must match: hexadecimal digits
    # include a-f, and trailing garbage must not be silently dropped.
    mobj = re.match(u'(?u)#([xX][0-9a-fA-F]+|[0-9]+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            base = 16
            numstr = numstr[1:]
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Not a valid code point; fall through to the literal form
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 273
a8156c1d 274compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that keeps a reference to the document it parsed."""

    def __init__(self):
        # This used to be spelled "__init", so it never ran as the
        # constructor and self.html stayed undefined until loads().
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Store html on the instance, then parse it completely."""
        self.html = html
        self.feed(html)
        self.close()
284
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start position, end position] while parsing
        self.result = None
        self.started = False
        # Number of currently open tags, per tag name
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Try to recover from a parse error by resuming on a later line.

        Gives up and raises HTMLParseError after more than 10 errors, or
        when the error occurs inside the element being extracted.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        current_line = self.getpos()[0]
        remaining_lines = self.html.split('\n')[current_line:]  # skip one line
        self.rawdata = '\n'.join(remaining_lines)
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attr_map = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attr_map and attr_map[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The element is complete once its own tag is balanced again
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if not self.watch_startpos:
            return
        self.watch_startpos = False
        self.result.append(self.getpos())
    # Every other parser event can mark the start position as well
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the text between the recorded positions, or None."""
        if self.result is None or len(self.result) != 3:
            return None
        (start_line, start_col), (end_line, end_col) = self.result[1:]
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # Same line: the end column still counts from the original
            # beginning of the line.
            lines[-1] = lines[-1][:end_col - start_col]
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
3b024e17
PH
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    def _parse_endtag_hack(self, i):
        """Skip over the literal "</scr'+'ipt>", else parse the end tag as usual."""
        marker = "</scr'+'ipt>"
        if self.rawdata[i:].startswith(marker):
            return i + len(marker)
        return compat_html_parser.HTMLParser.parse_endtag(self, i)

    AttrParser.parse_endtag = _parse_endtag_hack
9e6dd238
FV
350
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin shortcut: an ID lookup is an attribute lookup on "id"
    return get_element_by_attribute(attribute="id", value=id, html=html)
354
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: keep whatever was located before the parser gave up
        pass
    return finder.get_result()
9e6dd238 363
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                # A later matching tag overwrites an earlier one
                self.result = attr_map.get('content')

    def get_result(self):
        """Return the content attribute found, or None."""
        return self.result
384
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    meta_finder = MetaParser(name)
    try:
        meta_finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: report whatever was seen before the parse error
        pass
    return meta_finder.get_result()
395
9e6dd238
FV
396
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Source newlines carry no meaning; <br> and paragraph breaks do
    html = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)
    # Replace html entities
    return unescapeHTML(html).strip()
9e6dd238
FV
408
409
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, the characters
    win32 forbids in file names are replaced by '#' in the last path
    component and the open is retried once with that name. If the name is
    unchanged by the replacement, or the failure was a permission error
    (EACCES), the original exception is re-raised, like the standard
    open() function would.

    The filename u'-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the final component is rewritten: running the substitution
        # over the directory part would also replace its path separators
        # (and a drive colon) and so point somewhere else entirely.
        head, tail = os.path.split(filename)
        alt_tail = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', tail)
        if alt_tail == tail:
            raise
        alt_filename = os.path.join(head, alt_tail)
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
d77c3dfd
FV
443
444
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # The string could not be parsed as a date
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 452
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    if is_id:
        # IDs are kept as close to the input as possible
        return result

    # Collapse runs of underscores and trim them at both ends
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result or '_'
d77c3dfd
FV
484
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # A list (not a set) is used for the membership test, so the elements
    # need not be hashable and first-seen order is kept.
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd
FV
492
def unescapeHTML(s):
    """
    Replace every '&...;' sequence in s using htmlentity_transform.

    @param s a string
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd
FV
501
def encodeFilename(s):
    """
    Return s in the form expected by the file APIs of this interpreter.

    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'
    # Characters the encoding cannot represent are dropped
    return s.encode(fs_encoding, 'ignore')
d77c3dfd 523
8271226a
PH
def decodeOption(optval):
    """Return optval as a text string; None is passed through unchanged.

    Byte strings are decoded with the preferred encoding.
    """
    if optval is None:
        return None
    text = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval

    assert isinstance(text, compat_str)
    return text
1c256f70 532
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The shortest form that fits is used. The boundaries are inclusive, so
    exactly one minute is '1:00' and exactly one hour is '1:00:00'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
540
def make_HTTPS_handler(opts_no_check_certificate):
    """Build the HTTPS handler to install in the URL opener.

    On Python < 3.2 a handler with a custom connection class is returned;
    it tries an SSLv3 handshake first and falls back to SSLv23. That
    branch never verifies certificates and ignores the argument.

    On newer versions an SSLContext is used. Certificates are verified
    against the default paths unless opts_no_check_certificate is true.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if self._tunnel_host:
                    self.sock = sock
                    self._tunnel()
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3()
    else:
        # PROTOCOL_SSLv23 negotiates the best version both sides support.
        # PROTOCOL_SSLv3 is not defined at all when the interpreter's SSL
        # library was built without SSLv3, and a context pinned to it can
        # only talk to servers that still accept that protocol.
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.set_default_verify_paths()

        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        return compat_urllib_request.HTTPSHandler(context=context)
ea6d901e 571
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # An error raised while one of these exceptions is being handled
        # is always treated as expected.
        active_exc_type = sys.exc_info()[0]
        implied_expected = active_exc_type in (
            compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if not (expected or implied_expected):
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None."""
        if self.traceback is None:
            return None
        formatted_frames = traceback.format_tb(self.traceback)
        return u''.join(formatted_frames)
593
1c256f70 594
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
598
599
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
611
612
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
620
621
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # Hand the message to Exception as well, so that str(exc) and
        # exc.args are not empty.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd
FV
630
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
634
635
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
643
644
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file
    is smaller than what the server announced first, which indicates that
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 659
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data as a raw deflate stream, else as a zlib stream."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Create an addinfourl response, also when it takes no code argument."""
        response_class = compat_urllib_request.addinfourl
        if hasattr(response_class, 'getcode'):
            return response_class(stream, headers, url, code)
        wrapped = response_class(stream, headers, url)
        wrapped.code = code
        return wrapped

    def http_request(self, req):
        # Force the standard headers onto the request
        for header, header_value in std_headers.items():
            req.headers.pop(header, None)
            req.add_header(header, header_value)
        # Internal marker header: do not ask for a compressed response
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']
        # Internal marker header: user agent to use for this request
        if 'Youtubedl-user-agent' in req.headers:
            req.headers.pop('User-agent', None)
            req.headers['User-agent'] = req.headers.pop('Youtubedl-user-agent')
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            try:
                gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with up to 1023 trailing bytes cut off, one more
                # byte per attempt.
                for cut in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-cut]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(
                uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(
                inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response
bf50b038
JMF
740
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    The known date formats are tried in order and the first one that
    matches is used. Returns None if date_str is None or matches none of
    them.
    """
    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # Not this format; anything other than a mismatch propagates
            continue
        break
    return upload_date
767
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url, falling back to default_ext.

    The guess is the text after the last dot of the part before the first
    '?'; it is only accepted if it consists of letters and digits.
    """
    before_query = url.partition(u'?')[0]
    candidate = before_query.rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
73e79f2a 774
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the extension of filename by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
777
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months count as 30 days and years as 365 days.
    Raises ValueError if the string has neither format.
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: "\d" is not a valid escape in a normal string literal
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # timedelta knows neither months nor years, so both are
        # approximated with a fixed number of days.
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta keyword arguments are plural ("days", "weeks")
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
803
class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # A missing bound leaves the range open on that side
        self.start = (datetime.datetime.min.date() if start is None
                      else date_from_str(start))
        self.end = (datetime.datetime.max.date() if end is None
                    else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if isinstance(date, datetime.date):
            candidate = date
        else:
            candidate = date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
829
830
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte string: decode it with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
839
840
7459e3a2
PH
def write_string(s, out=None):
    """Write the text string s to out (sys.stderr by default) and flush it.

    s is encoded with the preferred encoding first when out is opened in
    binary mode or when running on Python 2.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == type(u'')

    binary_stream = 'b' in getattr(out, 'mode', '')
    # Python 2 lies about mode of sys.stderr
    if binary_stream or sys.version_info[0] < 3:
        s = s.encode(preferredencoding(), 'ignore')
    out.write(s)
    out.flush()
851
852
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a byte string into the list of its byte values."""
    if not bs:
        return []
    # Indexing yields ints on Python 3 and one-character strings on Python 2
    yields_ints = isinstance(bs[0], int)
    return list(bs) if yields_ints else [ord(c) for c in bs]
860
c257baff 861
def intlist_to_bytes(xs):
    """Convert a list of byte values into a byte string."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already returns a byte string
    return ''.join([chr(x) for x in xs])
c38b1e77
PH
869
870
def get_cachedir(params=None):
    """Return the cache directory to use.

    params['cachedir'] wins if that key is present. Otherwise the result
    is 'youtube-dl' below $XDG_CACHE_HOME, or below ~/.cache when that
    variable is not set. params may be omitted or None.
    """
    if params is None:
        # Fresh dict per call instead of a shared mutable default
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
c1c9a79c
PH
875
876
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL

    # Length of the locked region, as low and high part
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the file f; raise OSError if the system call fails."""
        region = OVERLAPPED()
        region.Offset = 0
        region.OffsetHigh = 0
        region.hEvent = 0
        # Stored on the file object; _unlock_file passes it back
        f._lock_file_overlapped_p = ctypes.pointer(region)
        handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(handle, flags, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock a file locked by _lock_file; raise OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(handle, 0,
                                whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock the file f with lockf: LOCK_EX if exclusive, else LOCK_SH."""
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.lockf(f, operation)

    def _unlock_file(f):
        """Release the lockf lock held on the file f."""
        fcntl.lockf(f, fcntl.LOCK_UN)
940
941
class locked_file(object):
    """File opened with io.open that is locked while used as a context manager.

    The lock is exclusive for the modes 'a' and 'w', and non-exclusive
    for the mode 'r'.
    """
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        wants_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, wants_exclusive)
        except IOError:
            # Do not leave the file open when the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # The file gets closed even if unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        """Forward to the write method of the underlying file."""
        return self.f.write(*args)

    def read(self, *args):
        """Forward to the read method of the underlying file."""
        return self.f.read(*args)
4eb7f1d1
JMF
971
972
def shell_quote(args):
    """Return the arguments in args quoted for a shell, joined by spaces.

    Byte strings are decoded with the filesystem encoding first (utf-8 if
    that is unknown).
    """
    # pipes.quote does not exist any more on current Python versions;
    # shlex.quote is its replacement, so pipes is only the fallback.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    quoted_args = []
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
9d4660ca
PH
984
985
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
        (the first element so that Not pred(e)) """
    for element in seq:
        # Yield first, test afterwards: the failing element is included
        yield element
        if pred(element):
            continue
        break
993
994
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # data is JSON-encoded and appended to the URL as its fragment
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return url + u'#' + compat_urllib_parse.urlencode(payload)
1001
1002
def unsmuggle_url(smug_url):
    """Undo smuggle_url: return the tuple (url, data).

    data is None if the URL carries no smuggled data.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, None
    url, _, fragment = smug_url.rpartition(u'#')
    encoded = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
02dbf93f
PH
1010
1011
def parse_xml_doc(s):
    """Parse the text string s as XML and return the root element."""
    assert isinstance(s, type(u''))
    # The parser is fed UTF-8 encoded bytes
    utf8_doc = s.encode('utf-8')
    return xml.etree.ElementTree.fromstring(utf8_doc)
1015
1016
def format_bytes(bytes):
    """Format a number of bytes with a binary unit suffix, e.g. u'1.46KiB'.

    bytes may be a number, a string holding a number, or None, for which
    u'N/A' is returned.
    """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the exponent inside the suffix table: values below one byte
        # would otherwise index it from the end, and huge values would
        # run past it.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)