]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[podomatic] Use unicode_literals
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
e3946f98 4import ctypes
c496ca96
PH
5import datetime
6import email.utils
f45c185f 7import errno
d77c3dfd 8import gzip
b7ab0590 9import itertools
03f9daab 10import io
f4bfd65f 11import json
d77c3dfd 12import locale
02dbf93f 13import math
d77c3dfd 14import os
4eb7f1d1 15import pipes
c496ca96 16import platform
d77c3dfd 17import re
13ebea79 18import ssl
c496ca96 19import socket
b53466e1 20import struct
1c088fa8 21import subprocess
d77c3dfd 22import sys
01951dda 23import traceback
d77c3dfd 24import zlib
d77c3dfd 25
01ba00ca 26try:
59ae15a5 27 import urllib.request as compat_urllib_request
01ba00ca 28except ImportError: # Python 2
59ae15a5 29 import urllib2 as compat_urllib_request
01ba00ca
PH
30
31try:
59ae15a5 32 import urllib.error as compat_urllib_error
01ba00ca 33except ImportError: # Python 2
59ae15a5 34 import urllib2 as compat_urllib_error
01ba00ca
PH
35
36try:
59ae15a5 37 import urllib.parse as compat_urllib_parse
01ba00ca 38except ImportError: # Python 2
59ae15a5 39 import urllib as compat_urllib_parse
01ba00ca 40
799c0763
PH
41try:
42 from urllib.parse import urlparse as compat_urllib_parse_urlparse
43except ImportError: # Python 2
44 from urlparse import urlparse as compat_urllib_parse_urlparse
45
6543f0dc
JMF
46try:
47 import urllib.parse as compat_urlparse
48except ImportError: # Python 2
49 import urlparse as compat_urlparse
50
01ba00ca 51try:
59ae15a5 52 import http.cookiejar as compat_cookiejar
01ba00ca 53except ImportError: # Python 2
59ae15a5 54 import cookielib as compat_cookiejar
01ba00ca 55
3e669f36 56try:
59ae15a5 57 import html.entities as compat_html_entities
9f37a959 58except ImportError: # Python 2
59ae15a5 59 import htmlentitydefs as compat_html_entities
3e669f36 60
a8156c1d 61try:
59ae15a5 62 import html.parser as compat_html_parser
9f37a959 63except ImportError: # Python 2
59ae15a5 64 import HTMLParser as compat_html_parser
a8156c1d 65
348d0a7a 66try:
59ae15a5 67 import http.client as compat_http_client
9f37a959 68except ImportError: # Python 2
59ae15a5 69 import httplib as compat_http_client
348d0a7a 70
2eabb802 71try:
0e283428 72 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
73except ImportError: # Python 2
74 from urllib2 import HTTPError as compat_HTTPError
75
e0df6211
PH
76try:
77 from urllib.request import urlretrieve as compat_urlretrieve
78except ImportError: # Python 2
79 from urllib import urlretrieve as compat_urlretrieve
80
81
5910e210
PH
82try:
83 from subprocess import DEVNULL
84 compat_subprocess_get_DEVNULL = lambda: DEVNULL
85except ImportError:
86 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
87
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: Python 2's own parse_qs is unreliable, so the helpers below are a
    # port of the implementation shipped with CPython 3's standard library.

    def _unquote(string, encoding='utf-8', errors='replace'):
        """Return string with its %xx escapes decoded using encoding."""
        if string == '':
            return string
        chunks = string.split('%')
        if len(chunks) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # Raw bytes of the current run of consecutive percent escapes; they
        # are decoded together because one character may span several bytes.
        pending = b''
        string = chunks[0]
        for chunk in chunks[1:]:
            try:
                if not chunk:
                    raise ValueError
                pending += chunk[:2].decode('hex')
                tail = chunk[2:]
                if not tail:
                    # Nothing but an escape here; keep collecting bytes.
                    continue
            except ValueError:
                tail = '%' + chunk
            # Literal text follows, so flush the collected bytes first.
            string += pending.decode(encoding, errors) + tail
            pending = b''
        if pending:
            # Flush whatever is left after the last chunk.
            string += pending.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs."""
        _coerce_result = unicode
        fields = []
        for amp_part in qs.split('&'):
            fields.extend(amp_part.split(';'))
        parsed = []
        for field in fields:
            if not field and not strict_parsing:
                continue
            pair = field.split('=', 1)
            if len(pair) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control name without an equal sign
                if not keep_blank_values:
                    continue
                pair.append('')
            if len(pair[1]) or keep_blank_values:
                decoded = [
                    _coerce_result(_unquote(
                        part.replace('+', ' '),
                        encoding=encoding, errors=errors))
                    for part in pair
                ]
                parsed.append((decoded[0], decoded[1]))
        return parsed

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping names to value lists."""
        grouped = {}
        for name, value in _parse_qsl(qs, keep_blank_values, strict_parsing,
                                      encoding=encoding, errors=errors):
            grouped.setdefault(name, []).append(value)
        return grouped
348d0a7a 166
# Text type: ``unicode`` only exists on Python 2.
try:
    compat_str = unicode  # Python 2
except NameError:
    compat_str = str

# Code point -> one-character text string.
try:
    compat_chr = unichr  # Python 2
except NameError:
    compat_chr = chr

# Exception type raised by ElementTree for malformed XML.
try:
    from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError:  # Python 2.6
    from xml.parsers.expat import ExpatError as compat_xml_parse_error
181
b31756c1
FV
def compat_ord(c):
    """Return the integer value of c, which is an int or a 1-char string.

    Indexing a bytes object yields ints on Python 3 and 1-char strings on
    Python 2; this helper accepts either.
    """
    return c if type(c) is int else ord(c)
185
468e2e92
FV
# The type of a compiled regular expression has no public name in ``re``,
# so take it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default headers; YoutubeDLHandler adds them to every request it handles.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 196
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or names a codec that cannot encode a simple test
    string, 'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported codec exists and is usable.
        u'TEST'.encode(pref)
    except Exception:
        # Not a bare except: KeyboardInterrupt and SystemExit must propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 210
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string s."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print s encoded in the preferred encoding.

        Characters the encoding cannot represent are emitted as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd 218
f4bfd65f
PH
# json.dump needs a byte stream on Python 2.x and a character stream on
# Python 3.x, so the target file is opened differently on each.
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as json_file:
            json.dump(obj, json_file)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (binary mode)."""
        with open(fn, 'wb') as json_file:
            json.dump(obj, json_file)
229
59ae56fa
PH
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """Return the first element under xpath whose key attribute is val.

        Filters by hand instead of using an [@key='val'] predicate;
        returns None when nothing matches.
        """
        for candidate in node.findall(xpath):
            if candidate.attrib.get(key) == val:
                return candidate
        return None
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are pasted into the expression, so only plain
        # characters that cannot break out of the predicate are accepted.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
243
d7e66d39
JMF
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand the prefix:tag steps of path into {namespace-uri}tag form.

    ns_map maps each prefix to its namespace URI; steps without a prefix
    are left unchanged.
    """
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
256
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity body
    (the text between '&' and ';').

    Named entities and decimal/hexadecimal character references are
    decoded; anything else is returned in its literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric reference: '#' followed by decimal digits, or by 'x'/'X' and
    # hexadecimal digits. The whole entity has to match, so that '#x2F' is
    # read as 0x2F rather than as its leading digits only.
    mobj = re.match(r'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$', entity)
    if mobj is not None:
        hex_digits, dec_digits = mobj.groups()
        try:
            if hex_digits is not None:
                return compat_chr(int(hex_digits, 16))
            return compat_chr(int(dec_digits, 10))
        except (ValueError, OverflowError):
            # Not a valid code point; fall through to the literal form.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 281
a8156c1d 282compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that keeps the fed document available as self.html."""

    def __init__(self):
        # This method used to be misspelled "__init", so it never ran and
        # self.html stayed undefined until loads() was called.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Parse the complete document html in one go."""
        self.html = html
        self.feed(html)
        self.close()
292
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""

    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start_pos, end_pos] once the element is isolated.
        self.result = None
        self.started = False
        # Number of currently open tags, per tag name, since the match.
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by skipping a line.

        Gives up (raises HTMLParseError) after more than 10 errors or
        once the wanted element has been entered.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        current_line = self.getpos()[0]
        remaining = self.html.split('\n')[current_line:]  # skip one line
        self.rawdata = '\n'.join(remaining)
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The element is complete once its own tag name is balanced again.
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Whatever comes first after the opening tag marks the start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text of the isolated element, or None."""
        if self.result is None or len(self.result) != 3:
            return None
        (start_line, start_col), (end_line, end_col) = self.result[1:]
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # Start and end share a line, which has already been shifted
            # left by start_col.
            lines[-1] = lines[-1][:end_col - start_col]
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
3b024e17
PH
352# Hack for https://github.com/rg3/youtube-dl/issues/662
353if sys.version_info < (2, 7, 3):
354 AttrParser.parse_endtag = (lambda self, i:
355 i + len("</scr'+'ipt>")
356 if self.rawdata[i:].startswith("</scr'+'ipt>")
357 else compat_html_parser.HTMLParser.parse_endtag(self, i))
9e6dd238
FV
358
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in html."""
    return get_element_by_attribute("id", id, html)
362
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag whose attribute equals value in html.

    Parse errors are ignored; whatever the parser isolated before the
    error (possibly nothing, i.e. None) is returned.
    """
    attr_parser = AttrParser(attribute, value)
    try:
        attr_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        pass
    return attr_parser.get_result()
9e6dd238 371
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """

    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                # A later matching tag overwrites an earlier one.
                self.result = attr_map.get('content')

    def get_result(self):
        """Return the content attribute recorded so far (None if none)."""
        return self.result
392
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.

    Parse errors are ignored; the value found before the error, if any,
    is returned.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        pass
    return meta_parser.get_result()
403
9e6dd238
FV
404
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Source newlines carry no meaning; only <br> and paragraph breaks do.
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),          # <br> -> newline
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),    # </p><p> -> newline
        ('<.*?>', ''),                             # strip remaining tags
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
416
417
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails for a reason other
    than missing permissions, the characters win32 forbids in file names
    are replaced with '#' in the last path component and the open is
    retried once. If that fails as well, the exception is raised to the
    caller, like the standard open() function would.

    The special name '-' selects standard output (its binary buffer when
    available) and opens no file.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars. Only the
        # final component is rewritten: touching the directory part would
        # replace path separators and point at a different location.
        dirname, basename = os.path.split(filename)
        alt_basename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', basename)
        if alt_basename == basename:
            raise
        alt_filename = os.path.join(dirname, alt_basename)
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
d77c3dfd
FV
451
452
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when timestr cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 460
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def _replacement(ch):
        # '?' and control characters are dropped entirely.
        if ch == '?' or ord(ch) < 32 or ord(ch) == 127:
            return ''
        if ch == '"':
            return '' if restricted else '\''
        if ch == ':':
            return '_-' if restricted else ' -'
        if ch in '\\/|*<>':
            return '_'
        if restricted:
            if ch in '!&\'()[]{}$;`^,#' or ch.isspace():
                return '_'
            if ord(ch) > 127:
                return '_'
        return ch

    cleaned = u''.join([_replacement(ch) for ch in s])
    if is_id:
        # IDs are kept verbatim apart from the per-character replacements.
        return cleaned

    while '__' in cleaned:
        cleaned = cleaned.replace('__', '_')
    cleaned = cleaned.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and cleaned.startswith('-_'):
        cleaned = cleaned[2:]
    return cleaned if cleaned else '_'
d77c3dfd
FV
492
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of every element, in the
    original order. Elements are compared with ==.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd
FV
500
def unescapeHTML(s):
    """
    Replace the HTML entities in s with the characters they stand for.

    @param s a string
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd 509
8bf48f23
PH
510
def encodeFilename(s, for_subprocess=False):
    """
    Return s in the form expected by the file system APIs.

    @param s The name of the file
    @param for_subprocess Whether the result is passed to a subprocess
                          rather than to a file API
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Detecting Windows NT 4 is tricky because 'major >= 4' would match
    # the Windows 9x series as well. Besides, NT 4 is obsolete.
    unicode_windows = (
        sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)
    if unicode_windows and not for_subprocess:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
537
d77c3dfd 538
8271226a
PH
def decodeOption(optval):
    """Return the option value optval as a text string.

    None is passed through; byte strings are decoded with the preferred
    encoding.
    """
    if optval is not None:
        if isinstance(optval, bytes):
            optval = optval.decode(preferredencoding())
        assert isinstance(optval, compat_str)
    return optval
1c256f70 547
4539dd30
PH
def formatSeconds(secs):
    """Format a duration given in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that fits is used. The boundaries are inclusive, so
    exactly one hour gives '1:00:00' (not '60:00') and exactly one minute
    gives '1:00' (not '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
555
a0ddb8a2
PH
556
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the urllib HTTPS handler.

    opts_no_check_certificate -- if true, server certificates are not
                                 verified (modern branch only).
    kwargs                    -- passed on to the handler constructor.

    On Python < 3.2 a handler with a custom connection class is returned;
    on later versions a standard HTTPSHandler with an explicit SSLContext.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # NOTE(review): SSLv3 is tried first here and is considered
                # insecure; left as is because this branch only runs on
                # interpreters older than 3.2.
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        # Let the library negotiate the protocol version. Pinning
        # PROTOCOL_SSLv3 is insecure, and that constant is absent from
        # ssl builds without SSLv3 support (AttributeError).
        protocol = getattr(ssl, 'PROTOCOL_TLS_CLIENT', None)
        if protocol is None:
            protocol = ssl.PROTOCOL_SSLv23
        context = ssl.SSLContext(protocol)
        if opts_no_check_certificate:
            # Host name checking has to be switched off before certificate
            # verification can be disabled.
            if hasattr(context, 'check_hostname'):
                context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        else:
            context.verify_mode = ssl.CERT_REQUIRED
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 590
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network problems and unavailable videos are never program bugs,
        # whatever the caller passed for expected.
        handled_exc_type = sys.exc_info()[0]
        if handled_exc_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback as text, or None if there is none."""
        if self.traceback is not None:
            return u''.join(traceback.format_tb(self.traceback))
        return None
612
1c256f70 613
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression did not match."""
617
618
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
630
631
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
639
640
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to indicate
    an error in the postprocessing task. The message is also available as
    the msg attribute.
    """

    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg
d77c3dfd
FV
649
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
d77c3dfd
FV
653
654
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
662
663
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded
    file is smaller than what the server announced first, indicating the
    connection was probably interrupted.
    """
    # Sizes in bytes: what was actually received / what was announced.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 678
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    When installed with an OpenerDirector, this class automatically adds
    the standard headers to every HTTP request and transparently decodes
    gzipped and deflated responses from web servers. To avoid compression
    in a particular request, the calling code only has to include the HTTP
    header "Youtubedl-No-Compression", which is removed before the real
    request is made. Likewise, a "Youtubedl-user-agent" header replaces
    the standard User-Agent.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data, trying a raw deflate stream first and a
        zlib-wrapped stream second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Create an addinfourl response carrying the status code, even
        where addinfourl itself does not take a code argument."""
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        response = addinfourl(stream, headers, url)
        response.code = code
        return response

    def http_request(self, req):
        for name, value in std_headers.items():
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, value)
        headers = req.headers
        if 'Youtubedl-no-compression' in headers:
            headers.pop('Accept-encoding', None)
            del headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in headers:
            headers.pop('User-agent', None)
            headers['User-agent'] = headers['Youtubedl-user-agent']
            del headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        original = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            payload = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(payload), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file; retry with up
                # to 1023 trailing bytes cut off.
                # See http://stackoverflow.com/q/4928560/35070 for details
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(payload[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(
                uncompressed, original.headers, original.url, original.code)
            resp.msg = original.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(
                inflated, original.headers, original.url, original.code)
            resp.msg = original.msg
        return resp

    https_request = http_request
    https_response = http_response
bf50b038 759
5de90176 760
bf50b038
JMF
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    A fixed list of date formats is tried first, then RFC 2822 parsing.
    Returns None if date_str is None or cannot be parsed at all.
    """
    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2, so drop the offset
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            continue
        # The formats are mutually consistent, so the first hit is final.
        break
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
795
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url.

    The candidate is the text after the last '.' in the part of url that
    precedes any '?'. If it is not purely alphanumeric, default_ext is
    returned instead.
    """
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    return default_ext
73e79f2a 802
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
805
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError if date_str matches neither format.
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' is an invalid escape in a normal string literal.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # timedelta has no month or year unit, so approximate in days
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
831
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    year, month, day = match.groups()
    return '-'.join((year, month, day))
840
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound means "unbounded" on that side."""
        self.start = (datetime.datetime.min.date() if start is None
                      else date_from_str(start))
        self.end = (datetime.datetime.max.date() if end is None
                    else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Strings are accepted too and parsed first.
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
866
867
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte string (Python 2): decode with the preferred encoding.
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
876
877
7459e3a2
PH
def write_string(s, out=None):
    """Write the text string s to out (default: sys.stderr) and flush."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    binary_stream = 'b' in getattr(out, 'mode', '')
    # Python 2 lies about mode of sys.stderr
    if binary_stream or sys.version_info[0] < 3:
        s = s.encode(preferredencoding(), 'ignore')
    try:
        out.write(s)
    except UnicodeEncodeError:
        # In Windows shells, this can fail even when the codec is just charmap!?
        # See https://wiki.python.org/moin/PrintFails#Issue
        if not (sys.platform == 'win32' and hasattr(out, 'encoding')):
            raise
        # Drop whatever the console encoding cannot represent and retry.
        s = s.encode(out.encoding, 'ignore').decode(out.encoding)
        out.write(s)

    out.flush()
898
899
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the byte values of the byte string bs as a list of ints."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    # Python 2: indexing gives 1-character strings
    return [ord(ch) for ch in bs]
907
c257baff 908
def intlist_to_bytes(xs):
    """Return a byte string built from the list of byte values xs."""
    if not xs:
        return b''
    python2 = isinstance(chr(0), bytes)
    if python2:
        return ''.join([chr(x) for x in xs])
    return bytes(xs)
c38b1e77
PH
916
917
def get_cachedir(params=None):
    """Return the cache directory to use.

    params -- optional options dict; its 'cachedir' entry, if present,
              is returned as is.

    Otherwise the result is the 'youtube-dl' directory below
    $XDG_CACHE_HOME, or below ~/.cache when that variable is unset.
    """
    # None instead of a shared mutable default dict.
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
c1c9a79c
PH
922
923
# Cross-platform file locking: _lock_file(f, exclusive) and _unlock_file(f)
# are defined for the current platform.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high DWORDs of the byte count passed to both calls.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raise OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can reuse it.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(handle, flags, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock a file locked with _lock_file; raise OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(handle, 0,
                                whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock the open file f, exclusively or shared, via fcntl.lockf."""
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.lockf(f, operation)

    def _unlock_file(f):
        """Release the lock held on the open file f."""
        fcntl.lockf(f, fcntl.LOCK_UN)
987
988
class locked_file(object):
    """ Context manager around a file that is locked while the block runs.

    The file is opened with io.open() in __init__; entering the context
    takes a shared lock for mode 'r' and an exclusive lock for 'a' and 'w'.
    Leaving the context unlocks and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 implementation raises OSError, which is not an
            # IOError on Python 2; catch both so the handle never leaks.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        """ Forward to the underlying file's write() """
        return self.f.write(*args)

    def read(self, *args):
        """ Forward to the underlying file's read() """
        return self.f.read(*args)
4eb7f1d1
JMF
1018
1019
def shell_quote(args):
    """ Return args joined into a single shell-escaped command line string.

    Byte-string arguments are decoded with the filesystem encoding
    (UTF-8 when that is unknown) before being quoted.
    """
    # pipes.quote is an undocumented alias and the pipes module is gone in
    # recent Python versions; prefer the documented shlex.quote.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'

    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
9d4660ca
PH
1031
1032
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that stops
    the iteration (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        # item was the first failing element; it has been yielded already
        break
1040
1041
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded and appended to url as a '#'-separated,
    URL-encoded '__youtubedl_smuggle' parameter.
    """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1048
1049
def unsmuggle_url(smug_url, default=None):
    """ Split a URL produced by smuggle_url into (url, data).

    When smug_url carries no smuggled data, (smug_url, default) is returned.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # The smuggled payload is everything after the last '#'
    url, _, fragment = smug_url.rpartition(u'#')
    encoded = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
02dbf93f
PH
1057
1058
02dbf93f
PH
def format_bytes(bytes):
    """ Format a byte count as a string with a binary unit suffix.

    bytes may be None (giving u'N/A'), a number, or a numeric string.
    The value is scaled by a power of 1024 and printed with two decimals,
    e.g. u'1.50KiB'.  Values beyond the largest known unit are expressed
    in YiB; positive values below one byte are expressed in B.

    Raises ValueError for negative values (from math.log) and for strings
    that float() cannot parse.
    """
    SUFFIXES = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']

    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the available suffixes: a negative exponent would index
        # from the end of the list, a too large one would raise IndexError.
        exponent = max(0, min(exponent, len(SUFFIXES) - 1))
    suffix = SUFFIXES[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
f53c966a 1071
1c088fa8 1072
f53c966a
JMF
def str_to_int(int_str):
    """ Parse an integer from a string, ignoring ',' and '.' separators """
    digits = re.sub(r'[,\.]', u'', int_str)
    return int(digits)
1c088fa8
PH
1076
1077
def get_term_width():
    """ Return the terminal width in columns, or None if it is unknown.

    $COLUMNS is consulted first; if it is unset, empty or not an integer,
    the second field of the output of `stty size` is used instead.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Malformed $COLUMNS: fall back to asking stty

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty may be missing or print nothing usable),
        # but do not swallow KeyboardInterrupt or SystemExit.
        pass
    return None
caefb1de
PH
1092
1093
def month_by_name(name):
    """ Return the number (1-12) of the month with the given English name,
    independently of the locale; None if the name is not recognized """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name not in ENGLISH_NAMES:
        return None
    return ENGLISH_NAMES.index(name) + 1
18258362
JMF
1104
1105
def fix_xml_ampersands(xml_str):
    """Replace by '&amp;' every '&' in XML that does not already start one
    of the five predefined entities or a numeric character reference of at
    most four digits"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
e3946f98
PH
1112
1113
def setproctitle(title):
    """ Try to set the process name to title through prctl() from libc.so.6.

    title must be a compat_str.  Nothing happens when libc.so.6 cannot be
    loaded or does not provide prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Size the buffer by the encoded length: a non-ASCII title takes more
    # bytes than characters and would otherwise not fit.
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1127
1128
def remove_start(s, start):
    """ Return s without the prefix start; s itself if it lacks that prefix """
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1133
1134
def url_basename(url):
    """ Return the last segment of the URL's path, ignoring leading and
    trailing slashes (an empty string when the path has no segment) """
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip(u'/').split(u'/')
    return segments[-1]
aa94a6d3
PH
1138
1139
class HEADRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is always HEAD """

    def get_method(self):
        return 'HEAD'
7217e148
PH
1143
1144
dd27fd17
PH
def int_or_none(v, scale=1):
    """ Return int(v) floor-divided by scale, or None when v is None """
    if v is None:
        return None
    return int(v) // scale
608d11f5
PH
1147
1148
def parse_duration(s):
    """ Parse a duration such as '1:02:03', '5m30s' or '42' into seconds.

    Returns None when s is None or does not match the expected format.
    """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
    if not m:
        return None

    total = int(m.group('secs'))
    # Minutes and hours are optional groups; add whichever matched
    for group_name, seconds_per_unit in (('mins', 60), ('hours', 60 * 60)):
        value = m.group(group_name)
        if value:
            total += int(value) * seconds_per_unit
    return total
91d7d0b3
JMF
1163
1164
def prepend_extension(filename, ext):
    """ Insert ext in front of the filename's real extension,
    e.g. ('video.mp4', 'temp') gives 'video.temp.mp4' """
    parts = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(parts[0], ext, parts[1])
d70ad093
PH
1168
1169
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False when launching the binary fails with an OSError. """
    # None instead of a shared mutable default; list() also accepts tuples
    command = [exe] + list(args or [])
    try:
        subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1178
1179
class PagedList(object):
    """ Sequence-like view over results that are fetched page by page.

    pagefunc(pagenum) must return an iterable with the entries of the
    zero-based page pagenum; pagesize is the number of entries on a full
    page.  A page with fewer entries is taken to be the last one.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """ Return the entries with index in [start, end) as a list.

        end=None means "up to the last entry".  Only the pages overlapping
        the requested range are fetched.
        """
        res = []
        if end is not None and end <= start:
            # Empty range: the page arithmetic below assumes end > start
            # and would otherwise return entries of the page holding start.
            return res

        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry inside this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry, if the range ends here
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1225
1226
def uppercase_escape(s):
    """ Replace every backslash-U escape followed by eight hex digits in s
    with the character that has this code point """
    def _decode(match):
        return compat_chr(int(match.group(1), base=16))

    return re.sub(r'\\U([0-9a-fA-F]{8})', _decode, s)
b53466e1
PH
1231
# Probe whether struct accepts a text (unicode) format string
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """ struct.pack that also accepts a text format string """
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        """ struct.unpack that also accepts a text format string """
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    # Text format strings work natively: use the stdlib functions directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack