]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[--load-info] Always read file as UTF-8
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
c496ca96
PH
4import datetime
5import email.utils
f45c185f 6import errno
d77c3dfd 7import gzip
03f9daab 8import io
f4bfd65f 9import json
d77c3dfd 10import locale
02dbf93f 11import math
d77c3dfd 12import os
4eb7f1d1 13import pipes
c496ca96 14import platform
d77c3dfd 15import re
13ebea79 16import ssl
c496ca96 17import socket
d77c3dfd 18import sys
01951dda 19import traceback
d77c3dfd 20import zlib
d77c3dfd 21
01ba00ca 22try:
59ae15a5 23 import urllib.request as compat_urllib_request
01ba00ca 24except ImportError: # Python 2
59ae15a5 25 import urllib2 as compat_urllib_request
01ba00ca
PH
26
27try:
59ae15a5 28 import urllib.error as compat_urllib_error
01ba00ca 29except ImportError: # Python 2
59ae15a5 30 import urllib2 as compat_urllib_error
01ba00ca
PH
31
32try:
59ae15a5 33 import urllib.parse as compat_urllib_parse
01ba00ca 34except ImportError: # Python 2
59ae15a5 35 import urllib as compat_urllib_parse
01ba00ca 36
799c0763
PH
37try:
38 from urllib.parse import urlparse as compat_urllib_parse_urlparse
39except ImportError: # Python 2
40 from urlparse import urlparse as compat_urllib_parse_urlparse
41
6543f0dc
JMF
42try:
43 import urllib.parse as compat_urlparse
44except ImportError: # Python 2
45 import urlparse as compat_urlparse
46
01ba00ca 47try:
59ae15a5 48 import http.cookiejar as compat_cookiejar
01ba00ca 49except ImportError: # Python 2
59ae15a5 50 import cookielib as compat_cookiejar
01ba00ca 51
3e669f36 52try:
59ae15a5 53 import html.entities as compat_html_entities
9f37a959 54except ImportError: # Python 2
59ae15a5 55 import htmlentitydefs as compat_html_entities
3e669f36 56
a8156c1d 57try:
59ae15a5 58 import html.parser as compat_html_parser
9f37a959 59except ImportError: # Python 2
59ae15a5 60 import HTMLParser as compat_html_parser
a8156c1d 61
348d0a7a 62try:
59ae15a5 63 import http.client as compat_http_client
9f37a959 64except ImportError: # Python 2
59ae15a5 65 import httplib as compat_http_client
348d0a7a 66
2eabb802 67try:
0e283428 68 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
69except ImportError: # Python 2
70 from urllib2 import HTTPError as compat_HTTPError
71
e0df6211
PH
72try:
73 from urllib.request import urlretrieve as compat_urlretrieve
74except ImportError: # Python 2
75 from urllib import urlretrieve as compat_urlretrieve
76
77
5910e210
PH
try:
    from subprocess import DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        """Return a freshly opened, writable handle on the null device.

        Used when ``subprocess.DEVNULL`` cannot be imported.
        """
        return open(os.path.devnull, 'w')
else:
    def compat_subprocess_get_DEVNULL():
        """Return ``subprocess.DEVNULL``."""
        return DEVNULL
83
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in *string* by the characters they encode.

        Runs of consecutive escapes are collected as bytes and decoded
        together using *encoding* / *errors* (None selects 'utf-8' /
        'replace'). Malformed escapes are kept literally.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' at all: nothing to decode.
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Python 2 only: str.decode('hex') turns two hex digits into one byte.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid escape: put the '%' back and keep the text as is.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse the query string *qs* into a list of (name, value) pairs.

        Fields are separated by '&' or ';'. Fields without a value are
        dropped unless *keep_blank_values* is set. With *strict_parsing*,
        a field lacking '=' raises ValueError instead of being skipped.
        Names and values are returned as ``unicode`` objects.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                # '+' encodes a space in query strings; undo it before unquoting.
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse the query string *qs* into a dict mapping each name to the
        list of its values, in order of appearance."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
348d0a7a 162
# Text string type: ``unicode`` where that name exists, ``str`` otherwise.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point -> one-character text string: ``unichr`` where that name
# exists, ``chr`` otherwise.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
3e669f36 172
b31756c1
FV
def compat_ord(c):
    """Return *c* unchanged if it is exactly an int, otherwise ``ord(c)``."""
    return c if type(c) is int else ord(c)
176
468e2e92
FV
# This is not clearly defined otherwise
# (the type of the object returned by re.compile, taken from an empty pattern).
compiled_regex_type = type(re.compile(''))
179
# Default HTTP request headers. YoutubeDLHandler.http_request() sets each of
# these on the requests it processes, replacing any same-named header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 187
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the
    reported encoding cannot be looked up or cannot encode a short
    test string, 'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec; an unknown or unusable name raises here.
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (any failure means "fall back"), but unlike a
        # bare except it lets KeyboardInterrupt/SystemExit propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 201
def compat_print(s):
    """Print the text string *s* to standard output.

    On Python 3 *s* must be a text string and is printed directly.
    On Python 2 it is first encoded with the preferred encoding,
    replacing unencodable characters by XML character references.
    """
    if sys.version_info >= (3, 0):
        assert type(s) == type(u'')
        print(s)
    else:
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd 209
f4bfd65f
PH
def write_json_file(obj, fn):
    """Serialize *obj* as JSON into the file named *fn*.

    In Python 2.x, json.dump expects a bytestream, so the file is opened
    in binary mode. In Python 3.x it writes to a character stream, so the
    file is opened as UTF-8 text.
    """
    if sys.version_info < (3, 0):
        stream = open(fn, 'wb')
    else:
        stream = open(fn, 'w', encoding='utf-8')
    with stream as f:
        json.dump(obj, f)
220
59ae56fa
PH
def find_xpath_attr(node, xpath, key, val):
    """ Find the xpath xpath[@key=val]

    Returns the first matching element below *node*, or None.
    """
    if sys.version_info >= (2, 7):
        # The attribute predicate is interpolated into the expression, so
        # only plain keys and simple values are accepted.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
        return node.find(xpath + u"[@%s='%s']" % (key, val))

    # Older interpreters: filter the candidates by hand.
    for candidate in node.findall(xpath):
        if candidate.attrib.get(key) == val:
            return candidate
    return None
234
d7e66d39
JMF
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand ``prefix:tag`` steps of *path* to ``{uri}tag`` using *ns_map*.

    Steps without a prefix are kept unchanged.
    """
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, local_name = parts
        return '{%s}%s' % (ns_map[prefix], local_name)

    return '/'.join([expand(step) for step in path.split('/')])
247
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity name
    without the surrounding '&' and ';'.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric entity: decimal (#65) or hexadecimal (#x2F). The hex form must
    # accept the letters a-f, otherwise "#x2F" would be read as "#x2".
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 272
# Replace the HTML parser module's module-level ``locatestarttagend`` pattern
# with the one below.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document it was asked to parse."""

    def __init__(self):
        # Was spelled ``__init`` before, so it never ran as the constructor
        # and ``self.html`` did not exist until loads() was called.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Store *html* on the instance, feed it to the parser and close it."""
        self.html = html
        self.feed(html)
        self.close()
283
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start_pos, end_pos] once the element has been seen
        # completely; positions are (line, column) pairs from getpos().
        self.result = None
        # True while parsing inside the matched element.
        self.started = False
        # Per-tag count of currently open tags, tracked while started.
        self.depth = {}
        # True right after the matching start tag, until the next event
        # records the start position of the content.
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by skipping ahead and resuming.

        Gives up (raises HTMLParseError) after more than 10 errors or when
        the error occurs inside the matched element.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # Any event after the matching start tag fixes the start position.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element ends when every open tag of its own type is closed.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event only serves to record the start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None if no complete element was recorded."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # getpos() lines are 1-based; slice out the lines spanned by the element.
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end share a line: the end column must be shifted by
            # the prefix that was just cut off.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
3b024e17
PH
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, step over the literal text "</scr'+'ipt>"
# instead of handing it to the stock end-tag parser.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
9e6dd238
FV
349
def get_element_by_id(id, html):
    """Return the content of the tag whose ``id`` attribute equals *id* in the passed HTML document"""
    return get_element_by_attribute(attribute="id", value=id, html=html)
353
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag in *html* whose *attribute* equals *value*.

    Parse errors are ignored; whatever the parser collected up to that
    point is returned (possibly None).
    """
    attr_parser = AttrParser(attribute, value)
    try:
        attr_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: fall through and use what was parsed so far.
        pass
    return attr_parser.get_result()
9e6dd238 362
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    HTMLParser variant that picks out the ``content`` attribute of the
    meta tag carrying the requested ``name`` attribute.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attributes = dict(attrs)
            if attributes.get('name') == self.name:
                # A later matching tag overwrites an earlier one.
                self.result = attributes.get('content')

    def get_result(self):
        """Return the content found so far, or None."""
        return self.result
383
def get_meta_content(name, html):
    """
    Return the ``content`` attribute of the meta tag in *html* whose
    ``name`` attribute equals *name*. Parse errors are ignored.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was found before the error.
        pass
    return meta_parser.get_result()
394
9e6dd238
FV
395
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Source newlines carry no meaning; only <br> and paragraph breaks do.
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),          # <br> -> newline
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),    # </p><p> -> newline
        ('<.*?>', ''),                             # strip remaining tags
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
407
408
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries once
    with the characters that are forbidden on win32 replaced by '#' in the
    final path component. If that fails as well, the exception of the
    second attempt propagates, like with the standard open() function.

    The special name u'-' selects standard output (its binary buffer where
    available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Renaming cannot help with a permission problem.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the file name itself is rewritten: replacing separators or a
        # drive colon in the directory part would point to another location.
        head, tail = os.path.split(filename)
        alt_filename = os.path.join(
            head, re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', tail))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
442
443
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 451
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped outright.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    result = u''.join([replace_insane(char) for char in s])
    if is_id:
        # IDs are kept as close to the original as possible.
        return result

    # Collapse runs of underscores and trim them from both ends.
    result = re.sub(r'_{2,}', '_', result).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
d77c3dfd
FV
483
def orderedSet(iterable):
    """ Return a list of the elements of iterable without duplicates,
    keeping the first occurrence of each in its original order """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd
FV
491
def unescapeHTML(s):
    """
    Replace every ``&...;`` sequence in s via htmlentity_transform.

    @param s a string
    """
    assert type(s) == type(u'')
    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd
FV
500
def encodeFilename(s):
    """
    Return s in the form expected by the file functions of this interpreter.

    @param s The name of the file
    """
    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'
    # Characters the encoding cannot represent are dropped.
    return s.encode(fs_encoding, 'ignore')
d77c3dfd 522
8271226a
PH
def decodeOption(optval):
    """Return the option value *optval* as a text string.

    None is passed through; bytes are decoded with the preferred encoding.
    """
    if optval is None:
        return optval

    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 531
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The boundaries are inclusive, so exactly one minute is '1:00' and
    exactly one hour is '1:00:00' (rather than '60' and '60:00').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
539
def make_HTTPS_handler(opts_no_check_certificate):
    """Build the urllib HTTPS handler used for secure connections.

    opts_no_check_certificate: if true, server certificates are not
    verified (only honoured on Python >= 3.2).

    On Python < 3.2 a handler is returned whose connections try SSLv3 first
    and fall back to SSLv23. On newer versions an SSLContext is used; it is
    created with the most general negotiating protocol the ssl module
    offers instead of the hard-coded, insecure SSLv3 constant, which is
    absent from current Python builds.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3()
    else:
        # Newest first; PROTOCOL_SSLv23 is the oldest spelling of
        # "negotiate the best version both sides support".
        for protocol_name in ('PROTOCOL_TLS_CLIENT', 'PROTOCOL_TLS', 'PROTOCOL_SSLv23'):
            protocol = getattr(ssl, protocol_name, None)
            if protocol is not None:
                break
        context = ssl.SSLContext(protocol)
        if opts_no_check_certificate:
            # Hostname checking has to be switched off before verify_mode
            # may be set to CERT_NONE (older versions lack the attribute).
            if getattr(context, 'check_hostname', False):
                context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        else:
            context.verify_mode = ssl.CERT_REQUIRED
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context)
ea6d901e 573
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Raised while one of these is being handled: treat as expected.
        expected_error_types = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_error_types:
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None."""
        if self.traceback is not None:
            return u''.join(traceback.format_tb(self.traceback))
        return None
595
1c256f70 596
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression did not match."""
600
601
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
613
614
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
622
623
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # The message is kept on the instance as ``msg``.
        self.msg = msg
FV
632
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
d77c3dfd
FV
636
637
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not
    available for that video.
    """
d77c3dfd
FV
645
646
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 661
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a
        zlib-wrapped stream if that fails."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status *code*, whether
        or not the constructor accepts it as an argument."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Set the standard headers on *req* and process the internal
        Youtubedl-* control headers; returns the request."""
        for h,v in std_headers.items():
            # Replace any existing header of the same name.
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            # Internal marker: do not ask for compression, then drop the marker.
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            # Internal marker: use its value as User-agent, then drop the marker.
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return *resp* with a gzip- or deflate-encoded body replaced by
        its decompressed content; other responses pass through."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes removed; if none of the
                # attempts works, re-raise the first error.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic goes through the same processing.
    https_request = http_request
    https_response = http_response
bf50b038
JMF
742
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    The input is tried against a list of known date formats; None is
    returned when none of them matches.
    """
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',',' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # The string does not fit this format; try the next one.
            continue
        break
    return upload_date
769
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from *url*.

    Takes the text after the last dot of the part before the first '?'.
    If that is not purely alphanumeric, *default_ext* is returned.
    """
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    if not re.match(r'^[A-Za-z0-9]+$', candidate):
        return default_ext
    return candidate
73e79f2a 776
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Return *filename* with its last extension replaced by
    ``.<sub_lang>.<sub_format>``."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
779
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days. A string in
    neither format raises ValueError.
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names ('days', 'weeks').
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
805
class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound defaults to the earliest / latest
        representable date"""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(start=day, end=day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
831
832
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
841
842
7459e3a2
PH
def write_string(s, out=None):
    """Write the text string *s* to *out* (standard error by default) and
    flush it.

    The string is encoded with the preferred encoding first when the
    stream reports a binary mode, and always on Python 2.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == type(u'')

    wants_bytes = ('b' in getattr(stream, 'mode', '') or
                   sys.version_info[0] < 3)  # Python 2 lies about mode of sys.stderr
    payload = s.encode(preferredencoding(), 'ignore') if wants_bytes else s
    stream.write(payload)
    stream.flush()
853
854
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the byte values of *bs* as a list of ints."""
    if not bs:
        return []
    # Python 3 already yields ints when indexing; otherwise convert each item.
    return list(bs) if isinstance(bs[0], int) else [ord(c) for c in bs]
862
c257baff 863
def intlist_to_bytes(xs):
    """Return a byte string made of the byte values in *xs*."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already produces one-byte strings.
    return ''.join(map(chr, xs))
c38b1e77
PH
871
872
def get_cachedir(params=None):
    """Return the cache directory.

    params: optional options mapping; its 'cachedir' entry, if present,
    is returned as is. Otherwise the result is 'youtube-dl' below
    $XDG_CACHE_HOME, or below ~/.cache when that variable is unset.
    """
    if params is None:
        # Avoids a shared mutable default and accepts an explicit None.
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
c1c9a79c
PH
877
878
# Cross-platform file locking
# Both branches define _lock_file(f, exclusive) and _unlock_file(f).
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes layout for the overlapped argument of LockFileEx/UnlockFileEx.
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high halves of the byte count passed to both calls.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file *f* starting at offset 0; raises OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file passes it again.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flag 0x2 is passed for an exclusive lock, 0x0 otherwise.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared fcntl lock on the open file *f*."""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the fcntl lock on *f*."""
        fcntl.lockf(f, fcntl.LOCK_UN)
942
943
class locked_file(object):
    """Context manager around a file that is locked while in use.

    The file is opened on construction. Entering the context takes the
    lock (shared for mode 'r', exclusive otherwise); leaving it releases
    the lock and closes the file.
    """
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 implementation raises OSError, which is a separate
            # class from IOError on Python 2; close the file in both cases.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        """Forward to the underlying file's write()."""
        return self.f.write(*args)

    def read(self, *args):
        """Forward to the underlying file's read()."""
        return self.f.read(*args)
4eb7f1d1
JMF
973
974
def shell_quote(args):
    """Return *args* joined into one shell-escaped command line string.

    Byte strings are decoded with the file system encoding first.
    """
    # shlex.quote is the documented API; pipes.quote is only the fallback
    # for interpreters without it (the pipes module is gone in Python 3.13).
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    quoted_args = []
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
9d4660ca
PH
986
987
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that ends
    the run (the first element e for which pred(e) is false) """
    for element in seq:
        yield element
        if pred(element):
            continue
        break
995
996
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    The data is JSON-encoded and appended to the URL as a fragment.
    """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return url + u'#' + sdata
1003
1004
def unsmuggle_url(smug_url):
    """Split a URL produced by smuggle_url into (url, data).

    A URL without smuggled data is returned unchanged, with None as data.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, None
    url, _, sdata = smug_url.rpartition(u'#')
    encoded = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
02dbf93f
PH
1012
1013
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix and two decimals.

    None gives u'N/A'; a str is converted with float() first.
    """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # Power of 1024 that the value is expressed in.
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    units = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    suffix = units[exponent]
    scaled = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (scaled, suffix)
f53c966a
JMF
1026
def str_to_int(int_str):
    """Convert a number string to int, ignoring ',' and '.' separators."""
    return int(re.sub(r'[,\.]', u'', int_str))