]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[test/test_http] Fix getsockname() on Jython
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
1e399778 6import base64
5bc880b9 7import binascii
912b38b4 8import calendar
676eb3f2 9import codecs
62e609ab 10import contextlib
e3946f98 11import ctypes
c496ca96
PH
12import datetime
13import email.utils
f45c185f 14import errno
be4a824d 15import functools
d77c3dfd 16import gzip
03f9daab 17import io
79a2e94e 18import itertools
f4bfd65f 19import json
d77c3dfd 20import locale
02dbf93f 21import math
347de493 22import operator
d77c3dfd 23import os
4eb7f1d1 24import pipes
c496ca96 25import platform
d77c3dfd 26import re
c496ca96 27import socket
79a2e94e 28import ssl
1c088fa8 29import subprocess
d77c3dfd 30import sys
181c8655 31import tempfile
01951dda 32import traceback
bcf89ce6 33import xml.etree.ElementTree
d77c3dfd 34import zlib
d77c3dfd 35
8c25f81b 36from .compat import (
8bb56eee 37 compat_HTMLParser,
8f9312c3 38 compat_basestring,
8c25f81b 39 compat_chr,
36e6f62c 40 compat_etree_fromstring,
8c25f81b 41 compat_html_entities,
be4a824d 42 compat_http_client,
c86b6142 43 compat_kwargs,
8c25f81b 44 compat_parse_qs,
702ccf2d 45 compat_shlex_quote,
be4a824d 46 compat_socket_create_connection,
8c25f81b 47 compat_str,
edaa23f8 48 compat_struct_pack,
8c25f81b
PH
49 compat_urllib_error,
50 compat_urllib_parse,
15707c7e 51 compat_urllib_parse_urlencode,
8c25f81b 52 compat_urllib_parse_urlparse,
7581bfc9 53 compat_urllib_parse_unquote_plus,
8c25f81b
PH
54 compat_urllib_request,
55 compat_urlparse,
810c10ba 56 compat_xpath,
8c25f81b 57)
4644ac55 58
71aff188
YCH
59from .socks import (
60 ProxyType,
61 sockssocket,
62)
63
4644ac55 64
51fb4995
YCH
def register_socks_protocols():
    """Register the SOCKS proxy schemes as netloc-bearing URL schemes.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs with protocols not listed in
    urlparse.uses_netloc are not handled correctly.
    """
    netloc_schemes = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme in netloc_schemes:
            continue
        netloc_schemes.append(scheme)
72
73
468e2e92
FV
# The type of compiled patterns is not clearly defined otherwise, so take it
# from an actual compiled pattern
compiled_regex_type = type(re.compile(''))

# Default HTTP headers; YoutubeDLHandler.http_request() adds each of them to
# requests that do not already carry it
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Sentinel for "no default supplied" in functions where None is a valid default
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil',
)

# needed for sanitizing filenames in restricted mode: maps each accented
# character to its ASCII replacement (one or two characters long)
ACCENT_CHARS = dict(zip(
    'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØŒÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøœùúûüýþÿ',
    itertools.chain(
        'AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOO', ['OE'], 'UUUUYP', ['ss'],
        'aaaaaa', ['ae'], 'ceeeeiiiionoooooo', ['oe'], 'uuuuypy')))
c587cbb7 111
7105440c 112
def preferredencoding():
    """Get preferred encoding.

    Returns the encoding reported by locale.getpreferredencoding(), or
    'UTF-8' when the locale cannot be queried or names a codec that cannot
    actually be used for encoding.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Probe the codec: an unknown or broken codec name raises here
        'TEST'.encode(encoding)
        return encoding
    except Exception:
        return 'UTF-8'
d77c3dfd 126
f4bfd65f 127
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    target = encodeFilename(fn)
    is_py2 = sys.version_info < (3, 0)

    tmp_prefix = os.path.basename(target)
    tmp_dir = os.path.dirname(target)
    if is_py2 and sys.platform != 'win32':
        # os.path.basename/dirname return bytes objects here, but
        # NamedTemporaryFile will fail if the filename contains non ascii
        # characters unless we use a unicode object
        fs_encoding = get_filesystem_encoding()
        tmp_prefix = tmp_prefix.decode(fs_encoding)
        tmp_dir = tmp_dir.decode(fs_encoding)

    tmp_options = {
        'suffix': '.tmp',
        'prefix': tmp_prefix + '.',
        'dir': tmp_dir,
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if is_py2:
        tmp_options['mode'] = 'wb'
    else:
        tmp_options['mode'] = 'w'
        tmp_options['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(tmp_options))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(target)
            except OSError:
                pass
        os.rename(tf.name, target)
    except Exception:
        # Do not leave the temporary file behind; the original error is
        # re-raised regardless of whether the cleanup succeeded
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
180
181
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val]

        Returns the first element matching xpath that carries attribute key
        (with value val when val is not None), or None if there is none.
        """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            return node.find('%s[@%s]' % (xpath, key))
        val_text = '%s' % val
        # ElementPath has no escaping inside quoted predicate values, so pick
        # the quote character that does not occur in the value
        if "'" not in val_text:
            return node.find("%s[@%s='%s']" % (xpath, key, val_text))
        if '"' not in val_text:
            return node.find('%s[@%s="%s"]' % (xpath, key, val_text))
        # Both quote characters occur: compare the attribute values manually
        for candidate in node.findall('%s[@%s]' % (xpath, key)):
            if candidate.attrib.get(key) == val_text:
                return candidate
        return None
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] without attribute predicates """
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
196
d7e66d39
JMF
197# On python2.6 the xml.etree.ElementTree.Element methods don't support
198# the namespace parameter
5f6a1245
JW
199
200
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{namespace-uri}tag' form.

    ns_map maps each prefix to its namespace URI. Steps without a colon are
    kept unchanged.
    """
    def _expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(_expand(step) for step in path.split('/'))
211
d77c3dfd 212
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element of node matching xpath.

    xpath is either a single XPath string or an iterable of XPath strings
    that are tried in order. When nothing matches, default is returned if
    given; otherwise ExtractorError is raised if fatal is set (using name,
    or xpath itself, in the message), else None is returned.
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    # Start from "not found" so that an empty iterable of xpaths is treated
    # like a failed lookup instead of leaving n unbound
    n = None
    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
234
235
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element that xpath_element() finds.

    A missing element is handled by xpath_element(). An element without
    text is handled the same way: default is returned if given, otherwise
    ExtractorError is raised if fatal is set, else None is returned.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element

    text = element.text
    if text is not None:
        return text

    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
a41fb80c
S
249
250
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath that has it.

    When there is no such element, default is returned if given; otherwise
    ExtractorError is raised if fatal is set, else None is returned.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]

    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
bf0ff932
PH
262
263
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the
    passed HTML document (see get_element_by_attribute)"""
    attribute = 'id'
    return get_element_by_attribute(attribute, id, html)
43e8fafd 267
12ea2f30 268
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document

    Returns None when no tag has attribute set to value. The content is
    returned with HTML entities unescaped.
    """

    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s*>
         (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    mobj = re.search(pattern, html)
    if mobj is None:
        return None

    content = mobj.group('content')
    # Content that starts with a quote character loses its first and last
    # character
    if content[:1] in ('"', "'"):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 290
c5229f39 291
8bb56eee
BF
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        # Attributes of the most recently seen start tag, as a dict
        self.attrs = {}

    def handle_starttag(self, tag, attrs):
        # attrs arrives as (name, value) pairs; every start tag replaces the
        # previously gathered attributes
        self.attrs = dict(attrs)
300
c5229f39 301
8bb56eee
BF
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attribute_parser = HTMLAttributeParser()
    attribute_parser.feed(html_element)
    # close() makes the parser process any input it is still buffering
    attribute_parser.close()
    return attribute_parser.attrs
9e6dd238 322
c5229f39 323
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Literal newlines carry no meaning in HTML; <br> and paragraph
    # boundaries are what become newlines in the result
    text = html.replace('\n', ' ')
    for pattern, replacement in (
            (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
            (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
            # Strip html tags
            ('<.*?>', '')):
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
339
340
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries once
    with the name returned by sanitize_path(); if that name is no different
    or the failure was a permission error, the original exception is
    re-raised, like the standard open() function would.

    The filename '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            # Prefer the underlying binary stream where stdout has one
            return (getattr(sys.stdout, 'buffer', sys.stdout), filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
d77c3dfd
FV
371
372
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when timestr cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 380
5f6a1245 381
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def _replace_char(ch):
        code = ord(ch)
        if restricted and ch in ACCENT_CHARS:
            return ACCENT_CHARS[ch]
        if ch == '?' or code < 32 or code == 127:
            return ''
        if ch == '"':
            return '' if restricted else '\''
        if ch == ':':
            return '_-' if restricted else ' -'
        if ch in '\\/|*<>':
            return '_'
        if restricted:
            if ch in '!&\'()[]{}$;`^,#' or ch.isspace() or code > 127:
                return '_'
        return ch

    # Handle timestamps: keep 12:34:56 readable as 12_34_56
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    cleaned = ''.join(_replace_char(ch) for ch in s)
    if is_id:
        return cleaned

    # Collapse runs of underscores and drop them from both ends
    cleaned = re.sub(r'_+', '_', cleaned).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and cleaned.startswith('-_'):
        cleaned = cleaned[2:]
    if cleaned.startswith('-'):
        cleaned = '_' + cleaned[len('-'):]
    cleaned = cleaned.lstrip('.')
    return cleaned if cleaned else '_'
d77c3dfd 420
5f6a1245 421
a2aaf4db
S
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On every other platform s is returned unchanged. On Windows the path is
    normalized and, in every component other than '.' and '..', each
    character from /<>:"|\\?* as well as a trailing whitespace character or
    dot is replaced with '#'. A drive or UNC prefix is kept as is.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the empty component in front of the first separator; the
        # drive prefix is put back below
        norm_path.pop(0)
    # Raw string: the pattern contains \s, which is an invalid escape
    # sequence in a normal string literal
    forbidden = r'(?:[/<>:"\|\\?\*]|[\s.]$)'
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(forbidden, '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
438
439
67dda517
S
440# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
441# unwanted failures due to missing protocol
17bcc626
S
def sanitize_url(url):
    """Return url with the http: scheme prepended if it is protocol-less,
    i.e. if it starts with '//'; any other URL is returned unchanged"""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
444
445
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request for url after passing it through
    sanitize_url(); all further arguments are handed to Request as is"""
    safe_url = sanitize_url(url)
    return compat_urllib_request.Request(safe_url, *args, **kwargs)
67dda517
S
448
449
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list that keeps the first occurrence of every element in its
    original position. Elements are compared by equality, so unhashable
    elements are supported.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 457
912b38b4 458
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Entities that cannot be
    resolved are returned in their literal '&...;' form.
    """
    # Known non-numeric HTML entity
    named_entities = compat_html_entities.name2codepoint
    if entity in named_entities:
        return compat_chr(named_entities[entity])

    # Numeric entity, decimal (#38) or hexadecimal (#x26)
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        digits = mobj.group(1)
        if digits.startswith('x'):
            # '0x...' is what int() accepts for base 16
            base, digits = 16, '0%s' % digits
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(digits, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
4e408e47
PH
481
482
def unescapeHTML(s):
    """Replace the HTML entities in s with the characters they stand for.

    None is passed through; any other value must be a compat_str.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def _replace_entity(mobj):
        return _htmlentity_transform(mobj.group(1))

    return re.sub(r'&([^;]+);', _replace_entity, s)
d77c3dfd 490
8bf48f23 491
aa49acd1
S
def get_subprocess_encoding():
    """Return the encoding to use for subprocess arguments and filenames.

    Falls back to 'utf-8' when the platform does not report an encoding.
    """
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
502
503
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file

    Returns s unchanged wherever the platform takes Unicode filenames,
    otherwise s encoded with get_subprocess_encoding(), ignoring characters
    that cannot be encoded.
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    uses_windows_unicode_api = (
        not for_subprocess
        and sys.platform == 'win32'
        and sys.getwindowsversion()[0] >= 5)
    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if uses_windows_unicode_api or sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
526
527
def decodeFilename(b, for_subprocess=False):
    """Counterpart of encodeFilename().

    On Python 3, and for anything that is not a bytes object, b is returned
    unchanged; otherwise it is decoded with get_subprocess_encoding(),
    ignoring undecodable bytes. for_subprocess is accepted but not used.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 537
f07b74fc
PH
538
def encodeArgument(s):
    """Encode a command line argument for a subprocess call.

    Byte strings are still accepted and decoded as ASCII first.
    """
    # Legacy code that uses byte strings: once all post processors are
    # fixed, receiving anything but a compat_str should become an internal
    # error ('%r should be of type %r, is %r') instead
    text = s if isinstance(s, compat_str) else s.decode('ascii')
    return encodeFilename(text, True)
546
547
aa49acd1
S
def decodeArgument(b):
    """Decode a subprocess argument; shorthand for decodeFilename() with
    for_subprocess set"""
    for_subprocess = True
    return decodeFilename(b, for_subprocess)
550
551
8271226a
PH
def decodeOption(optval):
    """Return an option value as text.

    None is passed through; bytes are decoded with preferredencoding().
    The result must be a compat_str.
    """
    if optval is None:
        return optval

    text = optval
    if isinstance(text, bytes):
        text = text.decode(preferredencoding())

    assert isinstance(text, compat_str)
    return text
1c256f70 560
5f6a1245 561
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that fits is used: hours appear from 3600 seconds on
    and minutes from 60 seconds on, so 3600 gives '1:00:00' and 60 gives
    '1:00'.
    """
    # The bounds are inclusive: an exact hour or minute must use the longer
    # form instead of overflowing the field below it ('60:00', '60')
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
569
a0ddb8a2 570
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler suited to the running Python version.

    Certificate verification is switched off when params has a true
    'nocheckcertificate' entry. kwargs are passed on to the handler.
    """
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            # check_hostname must be off before verify_mode may be lowered
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 594
732ea2f0 595
08f2a92c
JMF
def bug_reports_message():
    """Return the text asking the user to report a bug, including a hint on
    how to update that depends on ytdl_is_updateable()"""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return ''.join((
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ))
605
606
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prepended to it.
        """

        # Network errors and unavailable videos that are being handled right
        # now are never bugs in youtube-dl
        currently_handled = sys.exc_info()[0]
        if currently_handled in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback as a string, or None without one"""
        if self.traceback is not None:
            return ''.join(traceback.format_tb(self.traceback))
        return None
01951dda 634
1c256f70 635
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected ExtractorError for a URL that is not supported; the URL is
    available as the url attribute"""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
641
642
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
646
647
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
660
661
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
669
670
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # The message is kept on the instance for handlers to read
        self.msg = msg
d77c3dfd 680
5f6a1245 681
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
685
686
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
694
695
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both in bytes
        self.expected = expected
        self.downloaded = downloaded
d77c3dfd 708
5f6a1245 709
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and bind it to the handler's source address.

    The address is taken from the 'source_address' entry of
    ydl_handler._params; without one the connection is returned as created.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)

    source_address = ydl_handler._params.get('source_address')
    if source_address is None:
        return hc

    sa = (source_address, 0)
    if hasattr(hc, 'source_address'):  # Python 2.7+
        hc.source_address = sa
        return hc

    # Python 2.6: the connection has no source_address attribute, so its
    # connect() is replaced by one that creates the socket itself
    def _hc_connect(self, *args, **kwargs):
        sock = compat_socket_create_connection(
            (self.host, self.port), self.timeout, sa)
        if is_https:
            self.sock = ssl.wrap_socket(
                sock, self.key_file, self.cert_file,
                ssl_version=ssl.PROTOCOL_TLSv1)
        else:
            self.sock = sock

    hc.connect = functools.partial(_hc_connect, hc)
    return hc
735
736
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker header.

    Without the marker, headers is returned as is. With it, a new dict is
    returned that lacks both the marker and every Accept-Encoding header
    (matched case-insensitively); headers itself is not modified.
    """
    marker = 'Youtubedl-no-compression'
    if marker not in headers:
        return headers

    return dict(
        (name, value) for name, value in headers.items()
        if name.lower() != 'accept-encoding' and name != marker)
87f0e62d
YCH
745
746
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        # The internal Ytdl-socks-proxy header selects a SOCKS connection
        # class and is removed again before the request is made
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        # Try a raw deflate stream first, then one with a zlib header
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # addinfourl only takes the status code as an argument where it has
        # a getcode() method; otherwise the code is set afterwards
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for header_name, header_value in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if header_name.capitalize() not in req.headers:
                req.add_header(header_name, header_value)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
bf50b038 874
5de90176 875
71aff188
YCH
def make_socks_conn_class(base_class, socks_proxy):
    """Create a subclass of base_class that connects through a SOCKS proxy.

    base_class must be HTTPConnection, HTTPSConnection or a subclass of
    either. socks_proxy is the proxy URL; its scheme (socks, socks4,
    socks4a or socks5) selects the protocol, its port defaults to 1080 and
    percent-encoded credentials in it are decoded.

    Raises ValueError for a proxy URL with any other scheme.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Fail with a clear message instead of an UnboundLocalError on
        # socks_type further down
        raise ValueError('Unsupported SOCKS proxy scheme: %r' % url_components.scheme)

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # HTTPS is layered on top of the established proxy connection
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
917
918
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """ HTTPS handler that opens connections with a configurable
    connection class and honours the Ytdl-socks-proxy request header """

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # Forward only the SSL options that exist on this handler:
        # _context on python > 2.6, _check_hostname on python 3.x
        ssl_options = (
            ('context', '_context'),
            ('check_hostname', '_check_hostname'),
        )
        open_kwargs = {}
        for kwarg_name, attr_name in ssl_options:
            if hasattr(self, attr_name):
                open_kwargs[kwarg_name] = getattr(self, attr_name)

        conn_class = self._https_conn_class
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            # The header is only meant for this handler, drop it
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
be4a824d
PH
942
943
a6420bf5
S
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """ Cookie processor whose http handlers are reused for https """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # The workaround (percent-encoding the Set-Cookie header before
        # HTTPCookieProcessor processes it) is kept below but is disabled:
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
966
967
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # Strip every '.<digits>' run, e.g. fractional seconds
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        tz_match = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        # No suffix at all and a bare 'Z' both mean a zero offset
        timezone = datetime.timedelta()
        if tz_match:
            date_str = date_str[:-len(tz_match.group(0))]
            if tz_match.group('sign'):
                direction = 1 if tz_match.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=direction * int(tz_match.group('hours')),
                    minutes=direction * int(tz_match.group('minutes')))

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(utc_dt.timetuple())
    except ValueError:
        # Unparsable input yields None
        return None
912b38b4
PH
997
998
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    unambiguous_formats = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M',
        '%b %dnd %Y %I:%M',
        '%b %dth %Y %I:%M',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    # Formats whose reading depends on whether the day or the month
    # is written first
    if day_first:
        ambiguous_formats = [
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d.%m.%y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ]
    else:
        ambiguous_formats = [
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ]

    upload_date = None
    # Every format is tried; the last one that parses wins
    for date_format in unambiguous_formats + ambiguous_formats:
        try:
            upload_date = datetime.datetime.strptime(date_str, date_format).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Fall back to the e-mail date parser
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if upload_date is None:
        return None
    return compat_str(upload_date)
bf50b038 1067
5f6a1245 1068
def determine_ext(url, default_ext='unknown_video'):
    """ Guess the file extension from url; return default_ext when
    url is None or no plausible extension is found """
    if url is None:
        return default_ext

    without_query = url.partition('?')[0]
    guess = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed_guess = guess.rstrip('/')
    if trimmed_guess in KNOWN_EXTENSIONS:
        return trimmed_guess
    return default_ext
73e79f2a 1080
5f6a1245 1081
def subtitles_filename(filename, sub_lang, sub_format):
    """ Replace the part of filename after its last dot with
    '<sub_lang>.<sub_format>' """
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
d4051a8e 1084
5f6a1245 1085
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    'now', 'today', 'yesterday' or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Raises ValueError when date_str matches none of these forms."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # timedelta knows neither months nor years, so approximate them
        # as 30 and 365 days respectively
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        # timedelta keyword names are plural ('days', 'weeks')
        delta = datetime.timedelta(**{unit + 's': amount})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
5f6a1245
JW
1113
1114
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    any other string is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1123
5f6a1245 1124
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date;
        a missing bound defaults to the earliest/latest representable date"""
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed like the bounds are
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1154
1155
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Decode if the platform module handed back bytes
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
1164
1165
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument passed to GetStdHandle
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except (AttributeError, io.UnsupportedOperation):
        # AttributeError: the output stream doesn't have a fileno, it's virtual
        # UnsupportedOperation: some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    kernel32 = ctypes.windll.kernel32

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(
        ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(
        (b'GetFileType', kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        is_char_device = (
            GetFileType(handle) & ~FILE_TYPE_REMOTE) == FILE_TYPE_CHAR
        return (
            not is_char_device or
            GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        return next(
            (i for i, c in enumerate(s) if ord(c) > 0xffff), len(s))

    while s:
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character,
        # which is written as 2 units
        ret = WriteConsoleW(
            h, s, count or 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if count:
            assert written.value > 0
            s = s[written.value:]
        else:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
    return True
1239
1240
def write_string(s, out=None, encoding=None):
    """ Write the text s to out (sys.stderr by default) and flush it """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # Try the Windows console path first; it reports whether it wrote s
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    binary_stream = (
        'b' in getattr(out, 'mode', '') or
        sys.version_info[0] < 3)  # Python 2 lies about mode of sys.stderr
    if binary_stream:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream over a binary buffer: encode here and bypass the
        # text layer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1261
1262
48ea9cea
PH
def bytes_to_intlist(bs):
    """ Convert a byte string into a list of integer byte values """
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Indexing gave a one-character string, so each item needs ord()
        return [ord(c) for c in bs]
    return list(bs)  # Python 3
1270
c257baff 1271
def intlist_to_bytes(xs):
    """ Pack a sequence of integer byte values into a byte string """
    if xs:
        # One unsigned-char ('B') field per element
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
c38b1e77
PH
1276
1277
c1c9a79c
PH
# Cross-platform file locking: defines _lock_file(f, exclusive) and
# _unlock_file(f) for the current platform
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL

    # Low/high halves of the byte count passed to both calls
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object because _unlock_file() passes the
        # same pointer back
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(
            handle, flags, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(
            handle, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
    else:
        def _lock_file(f, exclusive):
            operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            fcntl.flock(f, operation)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
1351
1352
class locked_file(object):
    """ Context manager around a file opened with io.open() that holds a
    file lock (shared for mode 'r', exclusive otherwise) while inside the
    with block """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        wants_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, wants_exclusive)
        except IOError:
            # Don't leak the open file when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Close the file even when unlocking raises
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1382
1383
4644ac55
S
def get_filesystem_encoding():
    """ Return sys.getfilesystemencoding(), or 'utf-8' when that is None """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1387
1388
def shell_quote(args):
    """ Quote each argument for the shell and join them with spaces """
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join([pipes.quote(as_text(a)) for a in args])
9d4660ca
PH
1398
1399
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # data is JSON-encoded and appended to url as a URL fragment
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse_urlencode(payload)
9d4660ca
PH
1406
1407
def unsmuggle_url(smug_url, default=None):
    """ Split a URL produced by smuggle_url into (url, data);
    a URL without smuggled data yields (smug_url, default) """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    encoded_data = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(encoded_data)
02dbf93f
PH
1415
1416
02dbf93f
PH
def format_bytes(bytes):
    """ Format a byte count as a string with a binary unit suffix,
    e.g. 1536 -> '1.50KiB'.

    bytes may be a number, a str holding a number, or None (which gives
    'N/A'). Negative values raise ValueError from math.log. """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the suffix table: values below 1/1024 would otherwise
        # produce a negative index (wrapping round to 'YiB') and values of
        # 1024 YiB and more would raise IndexError
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
f53c966a 1429
1c088fa8 1430
fb47597b
S
def lookup_unit_table(unit_table, s):
    """ Parse '<number><unit>' at the start of s and return the number
    multiplied by unit_table[unit] as an int, or None if s doesn't match """
    units_re = '|'.join(map(re.escape, unit_table))
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re
    m = re.match(pattern, s)
    if not m:
        return None
    # A comma is accepted as the decimal separator
    number = float(m.group('num').replace(',', '.'))
    multiplier = unit_table[m.group('unit')]
    return int(number * multiplier)
1440
1441
be64b5b0
PH
def parse_filesize(s):
    """ Parse a file size such as '1.5 MiB' into a number of bytes;
    None for None input or an unrecognized string """
    if s is None:
        return None

    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
    }
    # For every prefix: 'XiB' and 'xB' are binary multiples, 'XB' and 'Xb'
    # decimal ones.
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    for power, prefix in enumerate('KMGTPEZY', 1):
        _UNIT_TABLE[prefix + 'iB'] = 1024 ** power
        _UNIT_TABLE[prefix + 'B'] = 1000 ** power
        _UNIT_TABLE[prefix.lower() + 'B'] = 1024 ** power
        _UNIT_TABLE[prefix + 'b'] = 1000 ** power

    return lookup_unit_table(_UNIT_TABLE, s)
1486
1487
def parse_count(s):
    """ Parse a count such as '1,000' or '1.2M' into an int;
    None for None input or an unrecognized string """
    if s is None:
        return None

    s = s.strip()

    # Only digits and separators: no unit suffix to interpret
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    thousand, million = 1000, 1000 ** 2
    _UNIT_TABLE = {
        'k': thousand,
        'K': thousand,
        'm': million,
        'M': million,
        'kk': million,
        'KK': million,
    }
    return lookup_unit_table(_UNIT_TABLE, s)
be64b5b0 1507
2f7ae819 1508
caefb1de
PH
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    if name not in ENGLISH_MONTH_NAMES:
        return None
    return ENGLISH_MONTH_NAMES.index(name) + 1
1516
1517
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """

    # An abbreviation is the first three letters of the month name
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    if abbrev not in abbreviations:
        return None
    return abbreviations.index(abbrev) + 1
18258362
JMF
1526
1527
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, leaving ampersands that
    already start an entity or character reference alone"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
e3946f98
PH
1534
1535
def setproctitle(title):
    """ Best-effort attempt to set the process name to title via libc's
    prctl(); silently does nothing where that is unavailable """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initializing the buffer from the bytes makes it one item larger than
    # the title, so it always ends in a NUL terminator. Sizing it to exactly
    # len(title_bytes) left the string unterminated for prctl to read.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1555
1556
def remove_start(s, start):
    """ Return s without the prefix start; s is returned unchanged when
    it is None or doesn't begin with start """
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
29eb5174
PH
1559
1560
def remove_end(s, end):
    """ Return s without the suffix end; s is returned unchanged when
    it is None, doesn't finish with end, or end is empty """
    if s is None or not s.endswith(end):
        return s
    # Every string ends with '', but s[:-0] is '' rather than s, so an
    # empty suffix must leave s untouched
    return s[:-len(end)] if end else s
2b9faf55
PH
1563
1564
31b2051e
S
def remove_quotes(s):
    """ Strip one pair of matching single or double quotes
    surrounding s, if present """
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1572
1573
def url_basename(url):
    """ Return the last segment of the URL's path, ignoring leading and
    trailing slashes """
    path = compat_urlparse.urlparse(url).path
    segments = path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1577
1578
class HEADRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is always HEAD """

    def get_method(self):
        return 'HEAD'
7217e148
PH
1582
1583
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to an int, multiplied by invscale and floor-divided by
    scale; return default when v is None, '' or not convertible.

    If get_attr is given, the attribute of that name is read from v first
    (a missing attribute counts as None). """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        # int() raises TypeError for objects such as lists or dicts and
        # OverflowError for infinities; treat them like unparsable strings
        return default
9732d77e 1596
9572013d 1597
40a90862
JMF
def str_or_none(v, default=None):
    """ Return v converted with compat_str, or default when v is None """
    if v is None:
        return default
    return compat_str(v)
1600
9732d77e
PH
1601
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Commas, dots and plus signs are dropped before conversion
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1608
1609
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to a float, multiplied by invscale and divided by scale;
    return default when v is None or not convertible """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # float() raises TypeError for objects such as lists or dicts;
        # treat them like unparsable strings
        return default
43f775e4
PH
1617
1618
def parse_duration(s):
    """ Parse a duration such as '1:02:03', '3h 11m 53s' or '87 Min.'
    into a number of seconds; None if s is not a string or is not
    recognized """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days = hours = mins = secs = ms = None

    # Colon-separated clock notation first, then unit-suffixed notation;
    # both expose the same five groups in the same order
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if not m:
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        # Last resort: a lone, possibly fractional, hours or minutes value
        m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
        if not m:
            return None
        hours, mins = m.groups()

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
91d7d0b3
JMF
1665
1666
def prepend_extension(filename, ext, expected_real_ext=None):
    """ Insert ext before the real extension of filename; when
    expected_real_ext is given and doesn't match the real extension,
    append ext to the whole filename instead """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
d70ad093
PH
1673
1674
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """ Replace the extension of filename with ext; when
    expected_real_ext is given and doesn't match the real extension,
    append ext to the whole filename instead """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return '{0}.{1}'.format(name, ext)
1680
1681
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        process = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Output is discarded; only being able to launch matters
        process.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1690
1691
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # stderr is merged into stdout so the version is found on either
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1705
1706
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Return the first group of version_re found in output, or
    unrecognized when there is no match """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    found = re.search(version_re, output)
    return found.group(1) if found else unrecognized
1716
1717
class PagedList(object):
    """ Base class for paged lists; subclasses provide getslice() """

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1722
9c44d242
PH
1723
class OnDemandPagedList(PagedList):
    """ Paged list that calls pagefunc(pagenum) for pages only as they
    are needed, optionally caching the results per page """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        collected = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Use the cached page when there is one, fetch it otherwise
            page_results = (
                self._cache.get(pagenum) if self._use_cache else None)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
                if self._use_cache:
                    self._cache[pagenum] = page_results

            # Offsets within this page; only the page containing start
            # (resp. end) gets trimmed
            startv = 0
            if firstid <= start < nextfirstid:
                startv = start % self._pagesize

            endv = None
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % self._pagesize) + 1

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            collected.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            page_is_short = len(page_results) + startv < self._pagesize
            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if page_is_short or end == nextfirstid:
                break
        return collected
81c2f20b
PH
1774
1775
9c44d242
PH
class InAdvancePagedList(PagedList):
    """ Paged list for which the number of pages is known up front """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        collected = []
        # Page holding start, and the number of leading elements of that
        # page to drop
        start_page, skip_elems = divmod(start, self._pagesize)
        if end is None:
            end_page = self._pagecount
            only_more = None
        else:
            end_page = end // self._pagesize + 1
            only_more = end - start

        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) >= only_more:
                    # This page completes the requested slice
                    collected.extend(page[:only_more])
                    break
                only_more -= len(page)
            collected.extend(page)
        return collected
1803
1804
def uppercase_escape(s):
    """ Decode every \\UXXXXXXXX (8 hex digits) escape found in s """
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
0fe2ff78
YCH
1811
1812
def lowercase_escape(s):
    """ Decode every \\uXXXX (4 hex digits) escape found in s """
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
b53466e1 1819
d05cfe06
S
1820
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 text strings are UTF-8 encoded before quoting
    is_py2_text = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if is_py2_text:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1826
1827
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host is IDNA-encoded; every other component except the scheme
    # is percent-escaped
    idna_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped_parts = parts._replace(
        netloc=idna_netloc,
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    )
    return escaped_parts.geturl()
1838
62e609ab
PH
1839
def read_batch_urls(batch_fd):
    """ Read the URLs from the lines of batch_fd, skipping blank lines and
    lines starting with '#', ';' or ']'; batch_fd is closed afterwards """

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # False marks a comment line; the caller drops every falsy result
        return False if url.startswith(('#', ';', ']')) else url

    with contextlib.closing(batch_fd) as fd:
        cleaned = [fixup(line) for line in fd]
        return [url for url in cleaned if url]
b74fa8cd
JMF
1854
1855
def urlencode_postdata(*args, **kargs):
    """ URL-encode the arguments and return the result as ASCII bytes """
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1858
1859
def update_url_query(url, query):
    """ Return url with the entries of the query dict merged into its
    query string; an empty query leaves url unchanged """
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parsed_url.query)
    merged.update(query)
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parsed_url._replace(query=new_query))
16392824 1868
8e60dc75 1869
ed0291d1
S
def update_Request(req, url=None, data=None, headers={}, query={}):
    """ Build a new request from req with its URL, data, headers and
    query parameters overridden or extended by the given arguments """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    # A HEAD request has to stay a HEAD request
    if req.get_method() == 'HEAD':
        req_type = HEADRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
1882
1883
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key out of several, in d.

    For a single key this is d.get(key, default).  For a list or tuple of
    keys the value of the first key is returned that is present, not None
    and (unless skip_false_values is disabled) not false; default is
    returned when no key qualifies.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)

    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
1892
1893
8e60dc75
S
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as compat_str, decoding it with encoding if necessary."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1896
16392824 1897
a1a530b0
PH
# Age limit associated with each US rating label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Turn an age limit string into a number.

    Accepts one or two digits with an optional trailing '+' (e.g. '18+')
    or a key of US_RATINGS.  Returns None for None and for anything else
    that is not recognized.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric is None:
        return US_RATINGS.get(s)
    return int(numeric.group('age'))
146c80e2
S
1912
1913
def strip_jsonp(code):
    """Remove a JSONP wrapper of the form callback(...); from code.

    An optional trailing semicolon and trailing // comments are removed
    as well.  Input that does not look like a JSONP call is returned
    unchanged.
    """
    jsonp_rex = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_rex, r'\1', code)
478c2c61
PH
1917
1918
e05f6939
PH
1919def js_to_json(code):
1920 def fix_kv(m):
e7b6d122
PH
1921 v = m.group(0)
1922 if v in ('true', 'false', 'null'):
1923 return v
bd1e4844 1924 elif v.startswith('/*') or v == ',':
1925 return ""
1926
1927 if v[0] in ("'", '"'):
1928 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 1929 '"': '\\"',
bd1e4844 1930 "\\'": "'",
1931 '\\\n': '',
1932 '\\x': '\\u00',
1933 }.get(m.group(0), m.group(0)), v[1:-1])
1934
89ac4a19 1935 INTEGER_TABLE = (
cda6d47a
S
1936 (r'^0[xX][0-9a-fA-F]+', 16),
1937 (r'^0+[0-7]+', 8),
89ac4a19
S
1938 )
1939
1940 for regex, base in INTEGER_TABLE:
1941 im = re.match(regex, v)
1942 if im:
cda6d47a 1943 i = int(im.group(0), base)
89ac4a19
S
1944 return '"%d":' % i if v.endswith(':') else '%d' % i
1945
e7b6d122 1946 return '"%s"' % v
e05f6939 1947
bd1e4844 1948 return re.sub(r'''(?sx)
1949 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
1950 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
1951 /\*.*?\*/|,(?=\s*[\]}])|
1952 [a-zA-Z_][.a-zA-Z_0-9]*|
89ac4a19 1953 (?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
bd1e4844 1954 [0-9]+(?=\s*:)
e05f6939 1955 ''', fix_kv, code)
e05f6939
PH
1956
1957
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # The position in quality_ids is the quality; unknown ids rank lowest
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1966
acd69589
PH
1967
# Default output template: a %-style format string with the mapping keys
# title, id and ext (presumably used to build output file names; confirm
# against the callers).
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1969
a020a0dc
PH
1970
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    # Cut so that the result, ellipses included, is exactly length long
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
1979
1980
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of integers."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
48844745
PH
1983
1984
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    When version is missing or either version cannot be parsed, the answer
    is the opposite of assume_new.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
732ea2f0
PH
1992
1993
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when sys has a 'frozen' attribute or this module was loaded by a
    # zipimporter
    if hasattr(sys, 'frozen'):
        return True
    return isinstance(globals().get('__loader__'), zipimporter)
7d4111ed
PH
1999
2000
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2ccd1b10
PH
2004
2005
def error_to_compat_str(err):
    """Return the message of err as a text string."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2013
2014
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    mt may be a bare media type ('video/mp4') or a complete Content-Type
    value with parameters ('video/mp4; charset=utf-8').  Parameters are
    ignored and the comparison is case-insensitive.

    Returns None for None; otherwise the extension from the lookup tables
    below or, for unknown types, the lowercased subtype itself.
    """
    if mt is None:
        return None

    # Keep only the media type: drop parameters such as '; charset=...'
    # and normalise the case (media types are case-insensitive)
    mt = mt.partition(';')[0].strip().lower()

    # Types whose extension depends on the complete type, not the subtype
    ext = {
        'audio/mp4': 'm4a',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }.get(res, res)
2038
2039
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of url_handle.

    The file name of an 'attachment' Content-Disposition header takes
    precedence; otherwise the Content-Type header is mapped with
    mimetype2ext().
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        attachment = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if attachment:
            ext = determine_ext(attachment.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
05900629
PH
2052
2053
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Return a base64 'data:' URI carrying data with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2056
2057
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None or content_limit is None:
        # No limit set, or content available for everyone
        return False
    return age_limit < content_limit
61ca9a80
PH
2066
2067
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # The UTF-32 marks are listed before the UTF-16 ones because the
    # UTF-32-LE mark starts with the UTF-16-LE mark.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a byte order mark the data is taken as UTF-8
    encoding, skip = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, skip = enc, len(bom)
            break
    text = first_bytes[skip:].decode(encoding, 'replace')

    return re.match(r'^\s*<', text)
a055469f
PH
2086
2087
def determine_protocol(info_dict):
    """Return the protocol of the format described by info_dict.

    An explicit 'protocol' entry wins.  Otherwise the protocol is derived
    from the 'url' entry: a known streaming prefix, then the extension for
    manifest formats, and finally the URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2108
2109
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*rows)]
    # Every column but the last is left-aligned and padded to its widest
    # cell plus one; the last column is emitted as is.
    padded = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
347de493
PH
2116
2117
2118def _match_one(filter_part, dct):
2119 COMPARISON_OPERATORS = {
2120 '<': operator.lt,
2121 '<=': operator.le,
2122 '>': operator.gt,
2123 '>=': operator.ge,
2124 '=': operator.eq,
2125 '!=': operator.ne,
2126 }
2127 operator_rex = re.compile(r'''(?x)\s*
2128 (?P<key>[a-z_]+)
2129 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2130 (?:
2131 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2132 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2133 )
2134 \s*$
2135 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2136 m = operator_rex.search(filter_part)
2137 if m:
2138 op = COMPARISON_OPERATORS[m.group('op')]
2139 if m.group('strval') is not None:
2140 if m.group('op') not in ('=', '!='):
2141 raise ValueError(
2142 'Operator %s does not support string values!' % m.group('op'))
2143 comparison_value = m.group('strval')
2144 else:
2145 try:
2146 comparison_value = int(m.group('intval'))
2147 except ValueError:
2148 comparison_value = parse_filesize(m.group('intval'))
2149 if comparison_value is None:
2150 comparison_value = parse_filesize(m.group('intval') + 'B')
2151 if comparison_value is None:
2152 raise ValueError(
2153 'Invalid integer value %r in filter part %r' % (
2154 m.group('intval'), filter_part))
2155 actual_value = dct.get(m.group('key'))
2156 if actual_value is None:
2157 return m.group('none_inclusive')
2158 return op(actual_value, comparison_value)
2159
2160 UNARY_OPERATORS = {
2161 '': lambda v: v is not None,
2162 '!': lambda v: v is None,
2163 }
2164 operator_rex = re.compile(r'''(?x)\s*
2165 (?P<op>%s)\s*(?P<key>[a-z_]+)
2166 \s*$
2167 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2168 m = operator_rex.search(filter_part)
2169 if m:
2170 op = UNARY_OPERATORS[m.group('op')]
2171 actual_value = dct.get(m.group('key'))
2172 return op(actual_value)
2173
2174 raise ValueError('Invalid filter part %r' % filter_part)
2175
2176
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The parts separated by '&' must all match; stop at the first failure
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2182
2183
def match_filter_func(filter_str):
    """Build a function that checks an info dict against filter_str.

    The returned function yields None when the info dict passes the filter
    and a message explaining the skip otherwise.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
91410c9b
PH
2192
2193
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP/TTML time expression into seconds.

    Understands an offset in seconds with an optional 's' suffix ('12.5s')
    and a clock time 'H:MM:SS' whose seconds may carry a fraction after
    '.' or ':'.  Returns a float, or None for empty or unrecognized input.
    """
    if not time_expr:
        return

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
bf6427d2
YCH
2205
2206
c1c924ab
YCH
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as an SRT timecode 'HH:MM:SS,mmm'.

    The time is rounded to the nearest millisecond before it is split into
    fields.  Truncating the float fields individually would lose a
    millisecond for values such as 1.001, whose fractional part is stored
    as 0.000999...
    """
    total_ms = int(round(seconds * 1000))
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
bf6427d2
YCH
2209
2210
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document (text) into SRT text.

    Every <p> element with a usable begin time and either an end time or
    a duration becomes one SRT cue; other paragraphs are skipped but still
    consume a cue number.

    Raises ValueError when the document contains no <p> element.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Parser target collecting the text of a paragraph; <br> elements
        # are turned into line breaks
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Serialize the node again and feed it through the collecting parser
        xml_parser = xml.etree.ElementTree.XMLParser(target=TTMLPElementParser())
        xml_parser.feed(xml.etree.ElementTree.tostring(node))
        return xml_parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))

    # Look for paragraphs in each known namespace, then without namespace
    for p_path in ('.//ttml:p', './/ttaf1:p', './/ttaf1_0604:p'):
        paras = dfxp.findall(_x(p_path))
        if paras:
            break
    else:
        paras = dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(cues)
2264
2265
66e289ba
S
def cli_option(params, command_option, param):
    """Return [command_option, value] for params[param], or [] if unset."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2269
2270
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render the boolean params[param] as a command line option.

    Without separator the result is [command_option, value]; with one the
    option and the value are joined into a single argument.  The parameter
    must be a bool.
    """
    flag = params.get(param)
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
2277
2278
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2282
2283
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra command line arguments stored in params.

    When params[param] is missing or None, default is used instead.  A
    list default is returned as a fresh copy, so a caller that modifies
    the result cannot alter the shared default list seen by later calls.
    The stored value must be a list.
    """
    ex_args = params.get(param)
    if ex_args is None:
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2290
2291
39672624
YCH
class ISO639Utils(object):
    """Conversions between ISO 639-1 and ISO 639-2/T language codes"""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked up, so anything after
        # them (e.g. in 'en-US') is ignored
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        candidates = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(candidates, None)
2492
2493
4eb10f66
YCH
class ISO3166Utils(object):
    """Lookup of country names by two-letter (ISO 3166-1 alpha-2) code"""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter country code (ISO 3166-1 alpha-2, any case) to the corresponding full name"""
        normalized = code.upper()
        return cls._country_map.get(normalized)
2752
2753
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets a single request choose its own proxy.

    A request selects a proxy through the 'Ytdl-request-proxy' header; the
    value '__noproxy__' stands for a direct connection.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        # Each lambda binds the current scheme and the bound proxy_open
        # method as default arguments, so the two handlers do not share the
        # loop variable.  They call proxy_open with '__noproxy__'.
        # NOTE(review): the parent __init__ below is expected to install its
        # own <scheme>_open handlers for the schemes configured in proxies,
        # replacing these defaults - confirm against the standard library.
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Apply the proxy for req; a per-request proxy header overrides proxy.

        Returns None when no proxy is to be used or when a SOCKS proxy was
        recorded on the request; otherwise returns whatever the parent
        class' proxy_open returns.
        """
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            # The header has been consumed here, so it is removed from req
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers take care of wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5bc880b9
YCH
2777
2778
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The bytes are reversed before conversion, i.e. data is read as a
    # little-endian integer
    reversed_hex = binascii.hexlify(data[::-1])
    plain_number = int(reversed_hex, 16)
    return '%x' % pow(plain_number, exponent, modulus)
81bdc8fd
YCH
2794
2795
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer num as a string in base n.

    table supplies the digit characters; by default the first n characters
    of 0-9, a-z, A-Z are used, which limits the default table to base 62.

    Raises ValueError if n exceeds the table length, if num is negative,
    or if a non-zero num is to be encoded in a base below 2 (those inputs
    would otherwise never terminate or fail with an unrelated error).
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    if num < 0:
        # Floor division never brings a negative number to zero
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError('base %d is not supported' % n)

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    # Digits were produced least significant first
    return ''.join(reversed(digits))
f52354a8
YCH
2812
2813
def decode_packed_codes(code):
    """Undo a packer-style obfuscation found in code.

    The packed payload, the base, the symbol count and the '|'-separated
    symbol list are extracted from code; every word of the payload is then
    replaced by the symbol stored under that word's base-n index.  An empty
    symbol stands for the word itself.
    """
    packed = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    payload, base, count, symbols = packed.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for index in reversed(range(count)):
        word = encode_base_n(index, base)
        symbol_table[word] = symbols[index] or word

    return re.sub(
        r'\b(\w+)\b', lambda found: symbol_table[found.group(0)],
        payload)