]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
Improve parse_duration
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
e3946f98 4import ctypes
c496ca96
PH
5import datetime
6import email.utils
f45c185f 7import errno
d77c3dfd 8import gzip
b7ab0590 9import itertools
03f9daab 10import io
f4bfd65f 11import json
d77c3dfd 12import locale
02dbf93f 13import math
d77c3dfd 14import os
4eb7f1d1 15import pipes
c496ca96 16import platform
d77c3dfd 17import re
13ebea79 18import ssl
c496ca96 19import socket
b53466e1 20import struct
1c088fa8 21import subprocess
d77c3dfd 22import sys
01951dda 23import traceback
d77c3dfd 24import zlib
d77c3dfd 25
01ba00ca 26try:
59ae15a5 27 import urllib.request as compat_urllib_request
01ba00ca 28except ImportError: # Python 2
59ae15a5 29 import urllib2 as compat_urllib_request
01ba00ca
PH
30
31try:
59ae15a5 32 import urllib.error as compat_urllib_error
01ba00ca 33except ImportError: # Python 2
59ae15a5 34 import urllib2 as compat_urllib_error
01ba00ca
PH
35
36try:
59ae15a5 37 import urllib.parse as compat_urllib_parse
01ba00ca 38except ImportError: # Python 2
59ae15a5 39 import urllib as compat_urllib_parse
01ba00ca 40
799c0763
PH
41try:
42 from urllib.parse import urlparse as compat_urllib_parse_urlparse
43except ImportError: # Python 2
44 from urlparse import urlparse as compat_urllib_parse_urlparse
45
6543f0dc
JMF
46try:
47 import urllib.parse as compat_urlparse
48except ImportError: # Python 2
49 import urlparse as compat_urlparse
50
01ba00ca 51try:
59ae15a5 52 import http.cookiejar as compat_cookiejar
01ba00ca 53except ImportError: # Python 2
59ae15a5 54 import cookielib as compat_cookiejar
01ba00ca 55
3e669f36 56try:
59ae15a5 57 import html.entities as compat_html_entities
9f37a959 58except ImportError: # Python 2
59ae15a5 59 import htmlentitydefs as compat_html_entities
3e669f36 60
a8156c1d 61try:
59ae15a5 62 import html.parser as compat_html_parser
9f37a959 63except ImportError: # Python 2
59ae15a5 64 import HTMLParser as compat_html_parser
a8156c1d 65
348d0a7a 66try:
59ae15a5 67 import http.client as compat_http_client
9f37a959 68except ImportError: # Python 2
59ae15a5 69 import httplib as compat_http_client
348d0a7a 70
2eabb802 71try:
0e283428 72 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
73except ImportError: # Python 2
74 from urllib2 import HTTPError as compat_HTTPError
75
e0df6211
PH
76try:
77 from urllib.request import urlretrieve as compat_urlretrieve
78except ImportError: # Python 2
79 from urllib import urlretrieve as compat_urlretrieve
80
81
5910e210
PH
try:
    from subprocess import DEVNULL

    def compat_subprocess_get_DEVNULL():
        """Return subprocess.DEVNULL (interpreters that provide it)."""
        return DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        """Return a newly opened, writable file object on os.path.devnull."""
        return open(os.path.devnull, 'w')
87
# compat_parse_qs: query-string parser that returns a dict mapping each
# field name to the list of its values.  Python 3 supplies it directly;
# Python 2 gets the backported implementation below.
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string by the characters they encode.

        Runs of consecutive escapes are collected as bytes and decoded
        together with the given encoding/errors policy.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' at all: nothing to decode
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # NOTE(review): relies on the Python 2 only 'hex' codec; confirm
                # which exception it raises for non-hex input, since only
                # ValueError is handled below.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid escape: keep the text literally, '%' included
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs.

        Fields are separated by '&' or ';'.  Fields with an empty value are
        dropped unless keep_blank_values is set; fields without '=' raise
        ValueError when strict_parsing is set.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                # '+' encodes a space in query strings
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a dict of name -> list of values.

        Values of repeated names are appended in order of appearance.
        """
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
348d0a7a 166
# compat_str: the text (unicode) string type of the running interpreter.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# compat_chr: code point -> one-character text string.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
3e669f36 176
b31756c1
FV
def compat_ord(c):
    """Return the integer value of c.

    c is returned unchanged when it already is an int (what indexing a
    bytes object yields on Python 3); otherwise ord(c) is returned.
    """
    return c if type(c) is int else ord(c)
180
468e2e92
FV
181# This is not clearly defined otherwise
182compiled_regex_type = type(re.compile(''))
183
# Default HTTP headers. YoutubeDLHandler.http_request() adds every entry of
# this dict to each outgoing request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 191
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: when the locale
    lookup fails or names a codec that cannot encode text, 'UTF-8' is
    returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: unknown or unusable names raise here
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (locale.Error, LookupError, TypeError, ...), but
        # no longer a bare except, so KeyboardInterrupt/SystemExit propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 205
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string s (Python 3: must be a str)."""
        assert type(s) is type(u'')
        print(s)
else:
    def compat_print(s):
        """Print s encoded with the preferred encoding; characters that
        cannot be encoded are written as XML character references."""
        encoded = s.encode(preferredencoding(), 'xmlcharrefreplace')
        print(encoded)
d77c3dfd 213
f4bfd65f
PH
def write_json_file(obj, fn):
    """Serialize obj as JSON into the file named fn.

    json.dump expects a byte stream on Python 2.x and a character stream on
    Python 3.x, so the file is opened in the mode matching the interpreter.
    """
    if sys.version_info < (3, 0):
        stream = open(fn, 'wb')
    else:
        stream = open(fn, 'w', encoding='utf-8')
    with stream as f:
        json.dump(obj, f)
224
59ae56fa
PH
def find_xpath_attr(node, xpath, key, val):
    """ Find the xpath xpath[@key=val] """
    if sys.version_info >= (2, 7):
        # ElementTree understands attribute predicates here; key and val are
        # checked first because they are spliced into the expression.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
        return node.find(xpath + u"[@%s='%s']" % (key, val))

    # Older interpreters: filter the candidates by hand
    for candidate in node.findall(xpath):
        if candidate.attrib.get(key) == val:
            return candidate
    return None
238
d7e66d39
JMF
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{namespace-uri}tag' form.

    ns_map maps each prefix to its namespace URI; steps without a prefix
    are copied unchanged.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, local_name = pieces
        return '{%s}%s' % (ns_map[prefix], local_name)

    return '/'.join(expand(step) for step in path.split('/'))
251
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.  Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities and decimal (#47) or hexadecimal (#x2F) character
    references are decoded.  Anything else, including references to code
    points the interpreter cannot represent, is returned literally.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference.  The hex branch accepts a-f digits and the
    # pattern is anchored, so '#x2F' decodes to '/' rather than to chr(2).
    mobj = re.match(r'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$', entity)
    if mobj is not None:
        hex_digits, dec_digits = mobj.groups()
        try:
            if hex_digits is not None:
                return compat_chr(int(hex_digits, 16))
            return compat_chr(int(dec_digits, 10))
        except (ValueError, OverflowError):
            # Code point out of range: fall through to the literal form
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 276
a8156c1d 277compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document it was asked to parse.

    Subclasses can read the raw text back from self.html.
    """
    def __init__(self):
        # Was misspelled '__init' (name-mangled, never invoked), which left
        # self.html undefined until loads() was called.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Parse the complete document html in one go."""
        self.html = html
        self.feed(html)
        self.close()
287
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start_pos, end_pos] once the element has been found
        # and closed; positions are the values returned by getpos()
        self.result = None
        # True while the parser is inside the matched element
        self.started = False
        # Per tag name: number of currently open elements (counted only
        # while inside the matched element)
        self.depth = {}
        # True right after the match until the next parser event, which
        # records the start position
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by restarting one line further down.

        Gives up (raises HTMLParseError) after more than 10 errors, or when
        the error occurs inside the matched element.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            # Found the wanted element (a later match replaces an earlier one)
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The matched element is closed once the count for its own tag
            # name drops back to zero
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event is routed to find_startpos as well
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None when no complete element was recorded."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Line numbers in the recorded positions are 1-based
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end on the same line: the end offset must be made
            # relative to the already shortened line
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
3b024e17
PH
347# Hack for https://github.com/rg3/youtube-dl/issues/662
348if sys.version_info < (2, 7, 3):
349 AttrParser.parse_endtag = (lambda self, i:
350 i + len("</scr'+'ipt>")
351 if self.rawdata[i:].startswith("</scr'+'ipt>")
352 else compat_html_parser.HTMLParser.parse_endtag(self, i))
9e6dd238
FV
353
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the
    passed HTML document (see get_element_by_attribute)"""
    return get_element_by_attribute(attribute="id", value=id, html=html)
357
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag carrying attribute=value in the passed
    HTML document.

    Parse errors are ignored; whatever the parser collected up to that
    point is returned.
    """
    attr_parser = AttrParser(attribute, value)
    try:
        attr_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: fall through with what has been collected
        pass
    return attr_parser.get_result()
9e6dd238 366
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    HTMLParser variant that looks for <meta> tags whose name attribute
    equals the requested name and keeps their content attribute.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                # A later matching tag overrides an earlier one
                self.result = attr_map.get('content')

    def get_result(self):
        """Return the content found so far, or None."""
        return self.result
387
def get_meta_content(name, html):
    """
    Return the content attribute of the <meta> tag whose name attribute is
    name, or None if the document has no such tag. Parse errors are ignored.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: report whatever was found before the error
        pass
    return meta_parser.get_result()
398
9e6dd238
FV
399
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Source newlines carry no meaning in HTML; line breaks come from markup
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),           # <br> -> newline
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),     # </p><p> -> newline
        ('<.*?>', ''),                              # strip remaining tags
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
411
412
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries once with
    the characters that are forbidden on win32 replaced by '#' in the final
    path component. If that fails too (or nothing needed replacing) the
    exception is raised, like the standard open() function would.

    The filename u'-' selects standard output (its binary buffer when
    available) instead of a file on disk.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem is not fixed by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.  Only the
        # final component is rewritten: the directory part contains the path
        # separators (and possibly a drive colon), which must stay intact.
        head, tail = os.path.split(filename)
        alt_tail = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', tail)
        if alt_tail == tail:
            raise
        alt_filename = os.path.join(head, alt_tail)
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
d77c3dfd
FV
446
447
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when timestr cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 455
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped entirely
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace():
                return '_'
            if code > 127:
                return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
d77c3dfd
FV
487
def orderedSet(iterable):
    """ Return a list with the elements of iterable in first-seen order,
    duplicates removed (compared by equality, so unhashable items work) """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd
FV
495
def unescapeHTML(s):
    """
    Replace every '&...;' sequence in s using htmlentity_transform.

    @param s a (unicode) text string
    """
    assert type(s) is type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd 504
8bf48f23
PH
505
def encodeFilename(s, for_subprocess=False):
    """
    Return the file name s in the form expected by the file APIs of the
    running interpreter.

    @param s The name of the file
    @param for_subprocess Whether the result is passed to a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
532
d77c3dfd 533
8271226a
PH
def decodeOption(optval):
    """Return the option value optval as a text string.

    None is passed through; bytes are decoded with the preferred encoding.
    """
    if optval is None:
        return None

    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval
    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 542
4539dd30
PH
def formatSeconds(secs):
    """Format a duration given in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The boundaries are inclusive, so exactly one hour is '1:00:00' and
    exactly one minute is '1:00' (previously '60:00' and '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
550
a0ddb8a2
PH
551
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the urllib HTTPS handler used for all https requests.

    opts_no_check_certificate disables certificate verification when true
    (honoured on Python >= 3.2 only). Remaining keyword arguments are passed
    on to the handler constructor.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Legacy path, left as it was: try SSLv3 first and fall back
                # to protocol negotiation when the handshake fails
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        # PROTOCOL_SSLv23 negotiates the best protocol both sides support.
        # Pinning PROTOCOL_SSLv3 forced an insecure protocol and raises
        # AttributeError on ssl builds that no longer ship SSLv3.
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 585
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is stored on the instance as self.cause.
        """

        # Network problems and unavailable videos are never youtube-dl bugs
        active_exc_type = sys.exc_info()[0]
        if active_exc_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None."""
        tb = self.traceback
        return None if tb is None else u''.join(traceback.format_tb(tb))
607
1c256f70 608
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression did not match"""
612
613
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
d77c3dfd
FV
625
626
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
634
635
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to indicate
    an error in the postprocessing task. The message is kept in self.msg.
    """
    def __init__(self, msg):
        self.msg = msg
d77c3dfd
FV
644
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
d77c3dfd
FV
648
649
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
657
658
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file
    is smaller than what the server announced first, indicating that the
    connection was probably interrupted.
    """
    # Both in bytes
    downloaded = expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 673
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Likewise, a "Youtubedl-user-agent" header on a request replaces the
    User-agent header for that request and is then removed.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data: raw deflate is tried first, then the
        zlib-wrapped form."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code, also when
        the addinfourl class has no getcode (the code is then set as an
        attribute)."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and process the internal
        Youtubedl-* control headers. Returns req."""
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return resp, replaced by a response over the decompressed body
        when its Content-encoding is gzip or deflate."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the first error
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # https traffic gets exactly the same treatment
    https_request = http_request
    https_response = http_response
bf50b038 754
5de90176 755
bf50b038
JMF
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    A list of known formats is tried first, then RFC 2822 parsing.
    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            continue
        # The first matching format wins; no need to try the others
        break
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
788
cbdbb766 789def determine_ext(url, default_ext=u'unknown_video'):
73e79f2a
PH
790 guess = url.partition(u'?')[0].rpartition(u'.')[2]
791 if re.match(r'^[A-Za-z0-9]+$', guess):
792 return guess
793 else:
cbdbb766 794 return default_ext
73e79f2a 795
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by
    '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
798
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError for strings in neither format.
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # timedelta knows neither months nor years, so approximate them
        # in days (a rough approximation)
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        # timedelta keyword arguments are plural: days, weeks
        unit += 's'
        delta = datetime.timedelta(**{unit: amount})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
824
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format; any other
    string is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
833
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound means the earliest/latest
        representable date"""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a datetime.date or a date string) is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
859
860
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Byte strings are decoded with the preferred encoding
    text = name.decode(preferredencoding()) if isinstance(name, bytes) else name

    assert isinstance(text, compat_str)
    return text
c257baff
PH
869
870
7459e3a2
PH
def write_string(s, out=None):
    """Write the text string s to out (sys.stderr by default) and flush.

    The text is encoded with the preferred encoding first when out was
    opened in binary mode, and always on Python 2.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    binary_mode = 'b' in getattr(stream, 'mode', '')
    # Python 2 lies about mode of sys.stderr
    if binary_mode or sys.version_info[0] < 3:
        s = s.encode(preferredencoding(), 'ignore')
    try:
        stream.write(s)
    except UnicodeEncodeError:
        # In Windows shells, this can fail even when the codec is just charmap!?
        # See https://wiki.python.org/moin/PrintFails#Issue
        if not (sys.platform == 'win32' and hasattr(stream, 'encoding')):
            raise
        # Drop whatever the console codec cannot represent and retry
        s = s.encode(stream.encoding, 'ignore').decode(stream.encoding)
        stream.write(s)

    stream.flush()
891
892
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the items of bs as a list of integers.

    Items that already are ints (Python 3 bytes) are copied as they are;
    otherwise ord() is applied to each item.
    """
    if not bs:
        return []
    items_are_ints = isinstance(bs[0], int)  # Python 3
    return list(bs) if items_are_ints else [ord(ch) for ch in bs]
900
c257baff 901
def intlist_to_bytes(xs):
    """Return the byte string made of the integer values in xs."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already yields one-byte strings
    return ''.join([chr(x) for x in xs])
c38b1e77
PH
909
910
def get_cachedir(params={}):
    """Return the cache directory.

    params['cachedir'] wins when present; otherwise the result is
    'youtube-dl' below $XDG_CACHE_HOME, or below ~/.cache when that
    variable is not set.
    """
    fallback_root = os.path.expanduser('~/.cache')
    cache_root = os.environ.get('XDG_CACHE_HOME', fallback_root)
    default_dir = os.path.join(cache_root, 'youtube-dl')
    return params.get('cachedir', default_dir)
c1c9a79c
PH
915
916
# Cross-platform file locking
# Both branches define the same pair of helpers:
#   _lock_file(f, exclusive) - lock the open file f, exclusively or shared
#   _unlock_file(f)          - release a lock taken with _lock_file
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes declaration of the OVERLAPPED structure passed to
    # LockFileEx/UnlockFileEx below
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f via LockFileEx, starting at offset 0; raises OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can pass it again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 requests an exclusive lock (presumably LOCKFILE_EXCLUSIVE_LOCK
        # - verify against the Win32 headers)
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock f with fcntl.lockf (LOCK_EX when exclusive, else LOCK_SH)."""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the fcntl.lockf lock on f."""
        fcntl.lockf(f, fcntl.LOCK_UN)
980
981
class locked_file(object):
    """File wrapper that keeps the file locked for the duration of a with block.

    The file is opened with io.open() as soon as the object is created.
    Entering the context locks it through _lock_file (non-exclusively for
    mode 'r', exclusively for 'a' and 'w'); leaving the context unlocks and
    closes it.  Iteration, read() and write() are forwarded to the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which on Python 2 is not
            # an IOError; catch both so the file never stays open after a
            # failed lock attempt.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1011
1012
def shell_quote(args):
    """Return the arguments in args as one shell-escaped command line.

    args is an iterable of text or byte strings.  Byte strings (for example
    file names produced by 'encodeFilename') are decoded with the file
    system encoding, falling back to UTF-8 when that is unknown.  The quoted
    arguments are joined with single spaces.
    """
    # pipes.quote is undocumented and the pipes module is gone in
    # Python 3.13; shlex.quote is the supported spelling where available.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'

    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
9d4660ca
PH
1024
1025
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """Like itertools.takewhile, but also yield the element that ends the run.

    Elements of seq are yielded up to and including the first element e for
    which pred(e) is false.
    """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1033
1034
9d4660ca
PH
def smuggle_url(url, data):
    """Pass additional data in a URL for internal use.

    data is serialized as JSON, URL-encoded under the key
    '__youtubedl_smuggle' and appended to url after a '#'.
    """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return url + u'#' + sdata
1041
1042
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    If smug_url carries no smuggled data, (smug_url, default) is returned.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # The payload is everything after the last '#'
    url, _, sdata = smug_url.rpartition(u'#')
    fields = compat_parse_qs(sdata)
    jsond = fields[u'__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
02dbf93f
PH
1050
1051
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a human-readable string such as u'1.50KiB'.

    bytes may be None (yielding u'N/A'), a number, or a str holding a
    number.  The value is scaled by the largest power of 1024 that does not
    exceed it, limited to the range B .. YiB, and printed with two decimals.
    Negative values are not supported (math.log raises ValueError).
    """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the index inside the suffix table: values below 1/1024 would
        # otherwise produce a negative index (and be reported in YiB), and
        # values of 1024**9 and above would raise IndexError.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
f53c966a 1064
1c088fa8 1065
f53c966a
JMF
def str_to_int(int_str):
    """Parse an integer written with ',' or '.' as digit group separators."""
    digits_only = re.sub(r'[,\.]', u'', int_str)
    return int(digits_only)
1c088fa8
PH
1069
1070
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be found.

    The COLUMNS environment variable is used when it holds an integer;
    otherwise the second field of the output of 'stty size' is used.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            # Malformed COLUMNS: fall back to asking stty instead of crashing
            pass

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except (OSError, ValueError, IndexError):
        # stty missing, or its output is not "<rows> <columns>".  Only these
        # errors are swallowed so that KeyboardInterrupt/SystemExit propagate.
        return None
caefb1de
PH
1085
1086
def month_by_name(name):
    """ Return the number (1-12) of a month given its English name,
    independently of the locale, or None if name is not a month name """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name not in ENGLISH_NAMES:
        return None
    return ENGLISH_NAMES.index(name) + 1
18258362
JMF
1097
1098
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in XML by '&amp;'.

    Ampersands that already start one of the entities amp, lt, gt, apos or
    quot, or a numeric character reference of at most four digits, are
    left untouched.
    """
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
e3946f98
PH
1105
1106
def setproctitle(title):
    """Try to set the name of the current process to title (best effort).

    title must be a text string.  The name is set by calling prctl() from
    libc.so.6 with option 15; if that library cannot be loaded or has no
    prctl, the function returns without doing anything.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Size the buffer from the encoded bytes, not the character count:
    # non-ASCII titles take more than one byte per character in UTF-8 and
    # would not fit into a buffer of len(title) + 1 bytes.
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        # NOTE(review): 15 is presumably PR_SET_NAME -- confirm in <sys/prctl.h>
        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1120
1121
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged if it
    does not begin with start."""
    if not s.startswith(start):
        return s
    return s[len(start):]
29eb5174
PH
1126
1127
def url_basename(url):
    """Return the last segment of the path of url, ignoring slashes at
    either end of the path."""
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip(u'/').split(u'/')
    return segments[-1]
aa94a6d3
PH
1131
1132
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        # Overrides the method name chosen by the base Request class
        return 'HEAD'
7217e148
PH
1136
1137
dd27fd17
PH
def int_or_none(v, scale=1):
    """Return int(v) floor-divided by scale, or None if v is None."""
    if v is None:
        return v
    return int(v) // scale
608d11f5
PH
1140
1141
def parse_duration(s):
    """Parse a duration such as '1:02:03', '1h2m3s' or '45' into seconds.

    Returns None if s is None or does not match the expected format.
    """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
    if m is None:
        return None

    hours, mins, secs = m.group('hours', 'mins', 'secs')
    total = int(secs)
    # Hours and minutes are optional; unmatched groups are None
    if mins:
        total += int(mins) * 60
    if hours:
        total += int(hours) * 60 * 60
    return total
91d7d0b3
JMF
1156
1157
def prepend_extension(filename, ext):
    """Insert ext in front of the extension of filename.

    For example ('abc.ext', 'temp') gives 'abc.temp.ext'.
    """
    stem, suffix = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(stem, ext, suffix)
d70ad093
PH
1161
1162
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    False is returned if the binary cannot be started. """
    try:
        command = [exe] + args
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Drain the output and wait for the process to finish
        process.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1171
1172
class PagedList(object):
    """List-like view over results that are fetched one page at a time.

    pagefunc(pagenum) must return an iterable with the entries of the
    zero-based page pagenum; pagesize is the number of entries on a full
    page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the entries with indices start (inclusive) to end
        (exclusive) as a list; end=None means up to the last entry."""
        collected = []
        size = self._pagesize
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            page_limit = page_first + size
            if start >= page_limit:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            if page_first <= start < page_limit:
                lo = start % size
            else:
                lo = 0

            # Offset just past the last wanted entry, if the slice ends here
            if end is not None and page_first <= end <= page_limit:
                hi = ((end - 1) % size) + 1
            else:
                hi = None

            if lo != 0 or hi is not None:
                entries = entries[lo:hi]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(entries) + lo < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == page_limit:
                break
        return collected
81c2f20b
PH
1218
1219
def uppercase_escape(s):
    """Replace each escape of the form \\UXXXXXXXX (eight hex digits) in s
    by the character with that code point."""
    def decode(match):
        return compat_chr(int(match.group(1), 16))

    return re.sub(r'\\U([0-9a-fA-F]{8})', decode, s)
b53466e1
PH
1224
# struct_pack / struct_unpack: versions of struct.pack / struct.unpack that
# accept a text format spec on every supported Python version.
try:
    # Probe whether struct accepts a text (unicode) format spec
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """struct.pack that also accepts a text format spec."""
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        """struct.unpack that also accepts a text format spec."""
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    # Text specs work natively, so use the struct functions directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack