]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[xvideos] Extract html5 player formats (Closes #9495)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
1e399778 6import base64
5bc880b9 7import binascii
912b38b4 8import calendar
676eb3f2 9import codecs
62e609ab 10import contextlib
e3946f98 11import ctypes
c496ca96
PH
12import datetime
13import email.utils
f45c185f 14import errno
be4a824d 15import functools
d77c3dfd 16import gzip
03f9daab 17import io
79a2e94e 18import itertools
f4bfd65f 19import json
d77c3dfd 20import locale
02dbf93f 21import math
347de493 22import operator
d77c3dfd 23import os
4eb7f1d1 24import pipes
c496ca96 25import platform
d77c3dfd 26import re
c496ca96 27import socket
79a2e94e 28import ssl
1c088fa8 29import subprocess
d77c3dfd 30import sys
181c8655 31import tempfile
01951dda 32import traceback
bcf89ce6 33import xml.etree.ElementTree
d77c3dfd 34import zlib
d77c3dfd 35
8c25f81b 36from .compat import (
8bb56eee 37 compat_HTMLParser,
8f9312c3 38 compat_basestring,
8c25f81b 39 compat_chr,
36e6f62c 40 compat_etree_fromstring,
8c25f81b 41 compat_html_entities,
be4a824d 42 compat_http_client,
c86b6142 43 compat_kwargs,
8c25f81b 44 compat_parse_qs,
702ccf2d 45 compat_shlex_quote,
be4a824d 46 compat_socket_create_connection,
8c25f81b 47 compat_str,
edaa23f8 48 compat_struct_pack,
8c25f81b
PH
49 compat_urllib_error,
50 compat_urllib_parse,
15707c7e 51 compat_urllib_parse_urlencode,
8c25f81b 52 compat_urllib_parse_urlparse,
7581bfc9 53 compat_urllib_parse_unquote_plus,
8c25f81b
PH
54 compat_urllib_request,
55 compat_urlparse,
810c10ba 56 compat_xpath,
8c25f81b 57)
4644ac55 58
71aff188
YCH
59from .socks import (
60 ProxyType,
61 sockssocket,
62)
63
4644ac55 64
51fb4995
YCH
def register_socks_protocols():
    """Add the SOCKS URL schemes to ``compat_urlparse.uses_netloc``.

    Schemes that are already present are left alone, so calling this more
    than once does not create duplicates.
    """
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)
72
73
468e2e92
FV
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default headers; YoutubeDLHandler.http_request adds each one to a request
# that does not already carry it.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Sentinel meaning "no default supplied" (None is a legitimate default value)
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# (maps each accented character to its ASCII replacement, which may be
# more than one character long, e.g. 'AE' or 'ss')
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØŒÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøœùúûüýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOO', ['OE'], 'UUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionoooooo', ['oe'], 'uuuuypy')))
c587cbb7 111
7105440c 112
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported encoding is actually usable
        'TEST'.encode(pref)
    except Exception:
        # Deliberately broad: any failure falls back to UTF-8
        pref = 'UTF-8'

    return pref
d77c3dfd 126
f4bfd65f 127
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The data is first written to a temporary file created next to fn and
    then moved over fn. If anything fails, the temporary file is removed
    (best effort) and the original exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()

        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        def path_basename(f):
            return os.path.basename(f).decode(encoding)

        # the same for os.path.dirname
        def path_dirname(f):
            return os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if hasattr(os, 'replace'):
            # Python >= 3.3: overwrites an existing destination on every
            # platform, so no unlink-then-rename window is needed
            os.replace(tf.name, fn)
        else:
            if sys.platform == 'win32':
                # Need to remove existing file on Windows, else os.rename raises
                # WindowsError or FileExistsError.
                try:
                    os.unlink(fn)
                except OSError:
                    pass
            os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
180
181
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val]

        Returns the first element matched by xpath that has attribute key
        (equal to val when val is given), or None if there is none.
        """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        # Only the attribute's presence goes into the XPath expression; the
        # value is compared in Python so that values containing quotes cannot
        # produce a malformed predicate.
        for candidate in node.findall(xpath + '[@%s]' % key):
            if val is None or candidate.get(key) == val:
                return candidate
        return None
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] (variant without XPath predicates) """
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
196
d7e66d39
JMF
197# On python2.6 the xml.etree.ElementTree.Element methods don't support
198# the namespace parameter
5f6a1245
JW
199
200
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand ``prefix:tag`` path components to ``{uri}tag`` form.

    path is split on '/'; every component of the form ``prefix:tag`` is
    rewritten using the URI that ns_map maps prefix to. Components without
    a ':' are kept unchanged.
    """
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
211
d77c3dfd 212
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element under node matching xpath.

    xpath may be a single expression or an iterable of expressions that are
    tried in order. When nothing matches: default is returned if one was
    given, otherwise ExtractorError is raised if fatal is set, otherwise
    None is returned. name replaces xpath in the error message.
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    # Start from None so that an empty iterable of expressions is treated
    # as "not found" instead of leaving n unbound
    n = None
    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
234
235
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element found by xpath_element().

    If no element is found, the result of xpath_element() (None or default)
    is returned as is. If the element has no text: default is returned if
    one was given, otherwise ExtractorError is raised if fatal is set,
    otherwise None is returned.
    """
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text
a41fb80c
S
249
250
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath that has it.

    When no such element exists: default is returned if one was given,
    otherwise ExtractorError is raised if fatal is set, otherwise None is
    returned. name replaces the generated ``xpath[@key]`` in the error
    message.
    """
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
bf0ff932
PH
262
263
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)
43e8fafd 267
12ea2f30 268
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document

    The first tag whose attribute equals value (quoted or unquoted) is
    located with a regular expression; the text up to the first closing tag
    of the same name is returned with HTML entities unescaped. Returns None
    when no such tag is found.
    """

    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s*>
         (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if not m:
        return None
    res = m.group('content')

    # Content that starts with a quote character loses its first and last
    # character
    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    return unescapeHTML(res)
a921f407 290
c5229f39 291
8bb56eee
BF
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        # Attributes of the most recently seen start tag
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs is the (name, value) sequence handed over by the parser
        self.attrs = dict(attrs)
300
c5229f39 301
8bb56eee
BF
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    parser.feed(html_element)
    parser.close()
    return parser.attrs
9e6dd238 322
c5229f39 323
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Newlines become spaces, <br> tags and paragraph boundaries become
    newlines, all remaining tags are removed and HTML entities are
    unescaped. None is passed through unchanged.
    """

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
9e6dd238
FV
339
340
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    A filename of '-' selects standard output (its binary buffer when
    there is one).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be fixed by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
371
372
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when timestr cannot be parsed.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
1c469a94 380
5f6a1245 381
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Per-character replacement; the result may be empty or longer than
        # one character
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        # Collapse runs of underscores and trim them from both ends
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
d77c3dfd 420
5f6a1245 421
a2aaf4db
S
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On every other platform s is returned unchanged. On Windows the path is
    normalized and, in every component other than '.' and '..', the
    characters / < > : " | \\ ? * as well as a trailing whitespace character
    or dot are replaced with '#'. A drive or UNC prefix is kept as is.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Raw string: same pattern as before, without relying on the invalid
    # '\s' escape inside an ordinary string literal
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
438
439
67dda517
S
440# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
441# unwanted failures due to missing protocol
17bcc626
S
def sanitize_url(url):
    """Return url with 'http:' prepended when it starts with '//'."""
    return 'http:%s' % url if url.startswith('//') else url
444
445
def sanitized_Request(url, *args, **kwargs):
    """Build a Request for url after passing it through sanitize_url()."""
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
67dda517
S
448
449
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of each element, in the
    original order. Elements are compared with ==, so they do not need to
    be hashable.
    """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
d77c3dfd 457
912b38b4 458
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities and decimal or
    hexadecimal numeric references are decoded; anything else is returned
    in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point out of range; very large numbers raise OverflowError
            # rather than ValueError. Fall through to the literal form.
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
4e408e47
PH
481
482
def unescapeHTML(s):
    """Replace every '&...;' entity in s with its character.

    None is passed through; any other input must be a compat_str.
    Entities that cannot be decoded are left as they are.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 490
8bf48f23 491
aa49acd1
S
def get_subprocess_encoding():
    """Return the encoding to use for file names and subprocess arguments.

    Falls back to 'utf-8' when no encoding can be determined.
    """
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding
502
503
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file

    Returns s unchanged wherever Unicode file names can be used directly
    (Python 3, Windows 2000 and up unless for_subprocess is set, Jython);
    otherwise returns s encoded with get_subprocess_encoding(), dropping
    characters that cannot be encoded.
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
526
527
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename().

    On Python 3, and for anything that is not a bytes object, b is returned
    unchanged; otherwise it is decoded with get_subprocess_encoding(),
    dropping undecodable bytes. for_subprocess is accepted but not used.
    """

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 537
f07b74fc
PH
538
def encodeArgument(s):
    """Encode a command line argument via encodeFilename(s, True).

    Byte strings are still accepted and are decoded as ASCII first.
    """
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
546
547
aa49acd1
S
def decodeArgument(b):
    """Decode a command line argument via decodeFilename(b, True)."""
    return decodeFilename(b, True)
550
551
8271226a
PH
def decodeOption(optval):
    """Return an option value as text.

    None is passed through; bytes are decoded with preferredencoding().
    The result must be a compat_str.
    """
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
1c256f70 560
5f6a1245 561
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'S', 'M:SS' or 'H:MM:SS'.

    The shortest form that fits is used: hours appear from 3600 seconds on
    and minutes from 60 seconds on.
    """
    # '>=' so that exactly one hour is '1:00:00' (not '60:00') and exactly
    # one minute is '1:00' (not '60')
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
569
a0ddb8a2 570
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler suited to the running Python version.

    Certificate verification is disabled when params has a true
    'nocheckcertificate' entry. Extra keyword arguments are passed on to
    YoutubeDLHTTPSHandler.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname has to be switched off before CERT_NONE is set
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 594
732ea2f0 595
08f2a92c
JMF
def bug_reports_message():
    """Return the text appended to error messages that may indicate a bug.

    The update hint depends on ytdl_is_updateable().
    """
    if ytdl_is_updateable():
        update_cmd = 'type  youtube-dl -U  to update'
    else:
        update_cmd = 'see  https://yt-dl.org/update  on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg
605
606
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.

        video_id, if given, is prepended to the message; cause, if given,
        is appended to it. Unexpected errors additionally get the text of
        bug_reports_message().
        """

        # Network errors and unavailable videos being handled right now are
        # never treated as bugs
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None."""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
01951dda 634
1c256f70 635
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected error raised for a URL that is not supported."""

    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url
641
642
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
646
647
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
d77c3dfd
FV
660
661
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
d77c3dfd
FV
669
670
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Initialise the base class explicitly so that the exception is set
        # up through the normal Exception protocol
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd 680
5f6a1245 681
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
d77c3dfd
FV
685
686
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
d77c3dfd
FV
694
695
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Give the exception a readable message instead of a bare tuple
        super(ContentTooShortError, self).__init__(
            'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 708
5f6a1245 709
c5a59d93 710def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
e5e78797
S
711 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
712 # expected HTTP responses to meet HTTP/1.0 or later (see also
713 # https://github.com/rg3/youtube-dl/issues/6727)
714 if sys.version_info < (3, 0):
5a1a2e94 715 kwargs[b'strict'] = True
be4a824d
PH
716 hc = http_class(*args, **kwargs)
717 source_address = ydl_handler._params.get('source_address')
718 if source_address is not None:
719 sa = (source_address, 0)
720 if hasattr(hc, 'source_address'): # Python 2.7+
721 hc.source_address = sa
722 else: # Python 2.6
723 def _hc_connect(self, *args, **kwargs):
724 sock = compat_socket_create_connection(
725 (self.host, self.port), self.timeout, sa)
726 if is_https:
d7932313
PH
727 self.sock = ssl.wrap_socket(
728 sock, self.key_file, self.cert_file,
729 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
730 else:
731 self.sock = sock
732 hc.connect = functools.partial(_hc_connect, hc)
733
734 return hc
735
736
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker to headers.

    When the marker is present, a copy of headers without the marker and
    without any Accept-Encoding header (matched case-insensitively) is
    returned. Otherwise headers itself is returned. The input mapping is
    never modified.
    """
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
87f0e62d
YCH
745
746
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req, going through a SOCKS proxy if the request asks for one."""
        conn_class = compat_http_client.HTTPConnection

        # 'Ytdl-socks-proxy' is an internal header: it selects the proxy and
        # is removed before the request is sent
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, with or without a zlib header."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Escape the request URL and add the standard headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Decompress gzip/deflate bodies and escape redirect locations."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with up to 1023 trailing bytes cut off
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same request/response processing
    https_request = http_request
    https_response = http_response
bf50b038 872
5de90176 873
71aff188
YCH
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of base_class that connects through a SOCKS proxy.

    socks_proxy is a proxy URL with scheme socks, socks4, socks4a or socks5.
    The port defaults to 1080 and host names are resolved by the proxy.
    Raises ValueError for any other scheme.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Fail with a clear message instead of an unbound local variable
        raise ValueError('Unsupported SOCKS proxy scheme: %r' % url_components.scheme)

    def unquote_if_non_empty(s):
        # User name and password are None when the URL has no credentials
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
910
911
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler with a pluggable connection class and SOCKS support."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req, routing it through a SOCKS proxy when requested.

        The proxy is requested via the internal Ytdl-socks-proxy header,
        which is removed from the request before it is sent.
        """
        # Forward the TLS settings stored by the base handler, if present
        open_kwargs = {}
        tls_settings = (
            ('_context', 'context'),  # python > 2.6
            ('_check_hostname', 'check_hostname'),  # python 3.x
        )
        for attr_name, kwarg_name in tls_settings:
            if hasattr(self, attr_name):
                open_kwargs[kwarg_name] = getattr(self, attr_name)

        conn_class = self._https_conn_class
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
be4a824d
PH
935
936
a6420bf5
S
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that applies the same handling to HTTP and HTTPS."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # NOTE: the workaround below is currently disabled.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        base_processor = compat_urllib_request.HTTPCookieProcessor
        return base_processor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
959
960
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    delimiter separates the date from the time part.  timezone, if given,
    is a datetime.timedelta offset; otherwise a trailing 'Z' or +HH:MM /
    -HHMM style suffix is detected and stripped.  Returns None for None
    input or when the date cannot be parsed. """

    if date_str is None:
        return None

    # Fractional seconds are not part of the strptime format used below
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone = datetime.timedelta()
        tz_match = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if tz_match:
            date_str = date_str[:-len(tz_match.group(0))]
            # A bare 'Z' has no sign group and means UTC
            if tz_match.group('sign'):
                direction = 1 if tz_match.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=direction * int(tz_match.group('hours')),
                    minutes=direction * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    try:
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        return None
912b38b4
PH
990
991
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    day_first selects how ambiguous numeric dates are read: DD-MM-YYYY
    when true, MM-DD-YYYY otherwise.  Returns None if date_str is None or
    cannot be interpreted as a date."""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M',
        '%b %dnd %Y %I:%M',
        '%b %dth %Y %I:%M',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    # Every format is tried; the last one that matches wins
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                # parsedate_tz() does not range-check its fields, so e.g.
                # day 32 gets this far; treat it as unparseable
                pass
    if upload_date is not None:
        return compat_str(upload_date)
bf50b038 1056
5f6a1245 1057
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url, falling back to default_ext."""
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    guess = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = guess.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
73e79f2a 1069
5f6a1245 1070
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its extension replaced by <sub_lang>.<sub_format>."""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
d4051a8e 1073
5f6a1245 1074
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    one of 'now', 'today' or 'yesterday', or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days.
    Raises ValueError if date_str matches none of these formats."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' is an invalid escape in a plain string literal
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # A bad approximation? timedelta knows neither months nor years
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: amount})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
5f6a1245
JW
1102
1103
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    year, month, day = match.groups()
    return '-'.join((year, month, day))
1112
5f6a1245 1113
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str

        A missing bound defaults to the earliest / latest representable date.
        Raises ValueError if start lies after end."""
        self.start = (
            date_from_str(start) if start is not None
            else datetime.datetime.min.date())
        self.end = (
            date_from_str(end) if end is not None
            else datetime.datetime.max.date())
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed as a date string
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1143
1144
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Decode when the platform module hands back a byte string
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
1153
1154
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s is the text to write and out the target stream.  The special method
    is the WriteConsoleW call of the Windows API; it is only used when out
    is file descriptor 1 or 2 and is attached to a real console.
    Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle (in the Windows API -11
    # is STD_OUTPUT_HANDLE and -12 is STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True unless the handle is a character device for which
        # GetConsoleMode succeeds
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters that contain no non-BMP
    # character; a leading non-BMP character is written on its own
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1228
1229
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    encoding, when given, overrides the encoding used to turn s into bytes
    for byte-oriented streams; characters that cannot be encoded are
    dropped."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    # Python 2 lies about mode of sys.stderr
    if 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream wrapping a binary one: encode ourselves and bypass
        # the text layer
        target_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1250
1251
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a byte string into the list of its byte values."""
    if not bs:
        return []
    # Indexing bytes yields ints on Python 3 but 1-char strings on Python 2
    if not isinstance(bs[0], int):
        return [ord(c) for c in bs]
    return list(bs)
1259
c257baff 1260
def intlist_to_bytes(xs):
    """Pack a sequence of byte values into a byte string."""
    if xs:
        fmt = '%dB' % len(xs)
        return compat_struct_pack(fmt, *xs)
    return b''
c38b1e77
PH
1265
1266
c1c9a79c
PH
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) using whatever the
# current platform offers: LockFileEx/UnlockFileEx on Windows, fcntl.flock
# elsewhere, or stubs that always raise IOError when fcntl is missing.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can pass the same
        # structure to UnlockFileEx
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 is LOCKFILE_EXCLUSIVE_LOCK in the Windows API
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock() lock on f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock() lock held on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
1340
1341
class locked_file(object):
    """Context manager wrapping a file that is locked while in use.

    The file is opened on construction; entering the context takes a shared
    lock for mode 'r' and an exclusive lock for 'a' and 'w', leaving it
    unlocks and closes the file."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except EnvironmentError:
            # Covers both IOError and OSError: the two are distinct classes
            # on Python 2, so catching only IOError would leak the handle
            # when locking fails with OSError
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1371
1372
4644ac55
S
def get_filesystem_encoding():
    """Return the filesystem encoding, or 'utf-8' if Python reports none."""
    reported = sys.getfilesystemencoding()
    if reported is None:
        return 'utf-8'
    return reported
1376
1377
def shell_quote(args):
    """Return args joined into a single shell-escaped command line string.

    Byte string arguments are decoded with the filesystem encoding first."""
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # compat_shlex_quote instead of pipes.quote: the pipes module is
        # deprecated and no longer exists in recent Python versions
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1387
1388
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized as JSON and appended to url as a fragment. """

    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse_urlencode(payload)
9d4660ca
PH
1395
1396
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged, paired with default."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
02dbf93f
PH
1404
1405
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a human-readable string with binary suffix.

    Accepts a number or a numeric string; returns 'N/A' for None.  The
    result has two decimals, e.g. '1.50KiB'."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the index inside the table: values below 1 would otherwise
        # yield a negative index (and a bogus 'YiB'), values of 1024**9 and
        # above an IndexError
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
f53c966a 1418
1c088fa8 1419
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number> <unit>' in s using the given unit table.

    unit_table maps unit names to multipliers.  A comma in the number is
    treated as decimal separator.  Returns the scaled value as int, or
    None if s does not start with a number followed by a known unit."""
    alternatives = '|'.join(map(re.escape, unit_table))
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives
    m = re.match(pattern, s)
    if not m:
        return None
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
1429
1430
be64b5b0
PH
def parse_filesize(s):
    """Parse a file size such as '5 MiB' into a number of bytes.

    Returns None for None input or when s cannot be parsed."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
    }
    for exponent, prefix in enumerate('KMGTPEZY', 1):
        binary = 1024 ** exponent
        decimal = 1000 ** exponent
        _UNIT_TABLE[prefix + 'iB'] = binary
        _UNIT_TABLE[prefix + 'B'] = decimal
        _UNIT_TABLE[prefix.lower() + 'B'] = binary
        _UNIT_TABLE[prefix + 'b'] = decimal

    return lookup_unit_table(_UNIT_TABLE, s)
1475
1476
def parse_count(s):
    """Parse a count such as '1,234' or '1.2M' into an int.

    Returns None for None input or when s cannot be parsed."""
    if s is None:
        return None

    s = s.strip()

    # Plain numbers, possibly with thousands separators
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    thousand = 1000
    million = 1000 ** 2
    _UNIT_TABLE = {
        'k': thousand,
        'K': thousand,
        'm': million,
        'M': million,
        'kk': million,
        'KK': million,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
be64b5b0 1496
2f7ae819 1497
caefb1de
PH
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    for number, month in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month == name:
            return number
    return None
1505
1506
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    # The abbreviation is the first three letters of the full name
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    if abbrev not in abbreviations:
        return None
    return abbreviations.index(abbrev) + 1
18258362
JMF
1515
1516
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start a predefined entity or a numeric
    character reference are left untouched."""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
e3946f98
PH
1523
1524
def setproctitle(title):
    """Set the name of the current process to title, where supported.

    Best effort only: silently does nothing on Jython, when libc.so.6
    cannot be loaded or when that libc has no prctl()."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initializing the buffer from the bytes allocates len + 1 bytes, so the
    # name handed to prctl() is always NUL-terminated
    buf = ctypes.create_string_buffer(title_bytes)
    PR_SET_NAME = 15  # prctl() option that sets the calling thread's name
    try:
        libc.prctl(PR_SET_NAME, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1544
1545
def remove_start(s, start):
    """Return s without the prefix start, or s unchanged if it has no such prefix."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1550
1551
2b9faf55
PH
def remove_end(s, end):
    """Return s without the suffix end, or s unchanged if it has no such suffix."""
    # An empty suffix must be special-cased: s[:-0] is s[:0], i.e. ''
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1556
1557
31b2051e
S
def remove_quotes(s):
    """Strip one pair of matching single or double quotes enclosing s."""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == '"' and last == '"':
        return s[1:-1]
    if first == "'" and last == "'":
        return s[1:-1]
    return s
1565
1566
def url_basename(url):
    """Return the last path segment of url, ignoring surrounding slashes."""
    path = compat_urlparse.urlparse(url).path
    segments = path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1570
1571
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
1575
1576
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, scaled as int(v) * invscale // scale.

    If get_attr is given, the attribute of that name is read from v first.
    Returns default when the value is None, empty or not convertible."""
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        number = int(v)
    except (ValueError, TypeError):
        # TypeError covers objects int() cannot handle at all (lists, dicts)
        return default
    return number * invscale // scale
9732d77e 1589
9572013d 1590
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted to compat_str, or default if v is None."""
    if v is None:
        return default
    return compat_str(v)
1593
9732d77e
PH
1594
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are dropped before conversion. """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1601
1602
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, scaled as float(v) * invscale / scale.

    Returns default when v is None or not convertible."""
    if v is None:
        return default
    try:
        number = float(v)
    except (ValueError, TypeError):
        # TypeError covers objects float() cannot handle at all (lists, dicts)
        return default
    return number * invscale / scale
43f775e4
PH
1610
1611
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Understands colon-separated values ('1:02:03.5'), values with unit
    names ('3 hours 2 min', 'PT1H2S') and plain '1.5 hours' / '2 mins'
    forms.  Returns None if s is not a string or is not recognized."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    # [[[DD:]HH:]MM:]SS[.ms]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if not m:
        # Components tagged with unit names, each of them optional
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        # A single, possibly fractional, number of hours or minutes
        m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
        if not m:
            return None
        hours, mins = m.groups()

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
91d7d0b3
JMF
1658
1659
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the real extension of filename.

    If expected_real_ext is given and the real extension differs from it,
    ext is appended to the full filename instead."""
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(name, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
d70ad093
PH
1666
1667
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename with ext.

    If expected_real_ext is given and the real extension differs from it,
    ext is appended to the full filename instead."""
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        stem = name
    else:
        stem = filename
    return '{0}.{1}'.format(stem, ext)
1673
1674
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        process = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Output is discarded; we only care whether the binary could be started
        process.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1683
1684
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    command = [encodeArgument(exe)] + args
    try:
        # stderr is merged into stdout so that either stream is searched
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1698
1699
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of an executable.

    version_re must contain one group capturing the version; by default
    the token following the word 'version' is used.  Returns unrecognized
    if the pattern is not found."""
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    m = re.search(pattern, output)
    if not m:
        return unrecognized
    return m.group(1)
1709
1710
class PagedList(object):
    """Base class of paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1715
9c44d242
PH
1716
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily, one pagefunc call per page.

    pagefunc(pagenum) returns an iterable with the entries of the given
    (zero-based) page, pagesize is the number of entries on a full page.
    With use_cache, fetched pages are remembered per page number."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list."""
        res = []
        # Begin at the page containing start; stop via the breaks below
        for pagenum in itertools.count(start // self._pagesize):
            # Indices of the first entry of this page and of the next one
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset of start within this page (0 unless start is on it)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset of end within this page (None unless end is on it)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1767
1768
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list for sources whose number of pages is known beforehand.

    pagefunc(pagenum) returns an iterable with the entries of the given
    (zero-based) page; pagecount and pagesize give the number of pages and
    the number of entries per page."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list."""
        first_page = start // self._pagesize
        # Entries to drop from the front of the first page
        to_skip = start - first_page * self._pagesize
        if end is None:
            last_page = self._pagecount
            remaining = None
        else:
            last_page = end // self._pagesize + 1
            remaining = end - start

        collected = []
        for pagenum in range(first_page, last_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested slice
                    collected.extend(page[:remaining])
                    break
                remaining -= len(page)
            collected.extend(page)
        return collected
1796
1797
def uppercase_escape(s):
    r"""Decode every \UXXXXXXXX escape sequence (eight hex digits) in s."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
0fe2ff78
YCH
1804
1805
def lowercase_escape(s):
    r"""Decode every \uXXXX escape sequence (four hex digits) in s."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
b53466e1 1812
d05cfe06
S
1813
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # quote() on Python 2 needs a byte string
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
d05cfe06
S
1819
1820
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    The host is IDNA-encoded; all other components are percent-encoded."""
    parts = compat_urllib_parse_urlparse(url)
    escaped = parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
1831
62e609ab
PH
1832
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file and close it.

    batch_fd is an iterable of lines (text or UTF-8 encoded bytes).
    Returns the list of stripped, non-empty lines, without those starting
    with '#', ';' or ']'."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 byte order mark shows up as '\ufeff' in correctly decoded
        # text and as '\xef\xbb\xbf' in text decoded with a single-byte
        # encoding; drop either
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1847
1848
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1851
1852
def update_url_query(url, query):
    """Return url with the parameters of query merged into its query string.

    Parameters already present in url are overridden by those in query."""
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parts.query)
    merged.update(query)
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
16392824 1861
8e60dc75 1862
ed0291d1
S
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Return a copy of req with url, data, headers and query updated.

    headers are merged into those of req, query into the query string of
    the URL.  A HEAD request stays a HEAD request."""
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    new_url = update_url_query(url or req.get_full_url(), query)
    if req.get_method() == 'HEAD':
        request_class = HEADRequest
    else:
        request_class = compat_urllib_request.Request
    new_req = request_class(
        new_url, data=data or req.data, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # The timeout attribute is only present on some requests
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
1875
1876
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key or the first usable one of several keys in d.

    With a list or tuple of keys, the value of the first key that is
    present and not None (and not falsy, unless skip_false_values is
    false) is returned, or default if there is none.  With a single key
    this is just d.get(key, default)."""
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
1885
1886
8e60dc75
S
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as a compat_str, decoding it with encoding if needed.

    Values that already are compat_str instances are returned unchanged.
    Note that the default encoding is evaluated once, at definition time.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1889
16392824 1890
a1a530b0
PH
# Age limits assigned to the rating labels below
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Turn an age limit string into an integer number of years.

    Accepts one or two digits with an optional trailing '+' (e.g. '18+')
    or one of the labels in US_RATINGS. Returns None for None and for
    strings that match neither form.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s)
146c80e2
S
1905
1906
def strip_jsonp(code):
    """Remove a JSONP wrapper such as 'callback(...);' and return the payload.

    Input that does not look like a JSONP call is returned unchanged.
    """
    jsonp_rex = r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_rex, r'\1', code)
478c2c61
PH
1910
1911
e05f6939
PH
def js_to_json(code):
    """Convert a JavaScript object/array literal into JSON text.

    Handles single-quoted strings, unquoted and numeric keys, /* */
    comments, trailing commas and hexadecimal/octal integer literals.
    The result is returned as a string; it is not validated as JSON.
    """
    INTEGER_TABLE = (
        (r'^0[xX][0-9a-fA-F]+', 16),
        (r'^0+[0-7]+', 8),
    )
    # Rewrites applied inside string literals; anything else is kept as is
    STRING_ESCAPES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            # Comments and trailing commas have no JSON counterpart
            return ""

        if v[0] in ("'", '"'):
            v = re.sub(
                r'(?s)\\.|"',
                lambda m: STRING_ESCAPES.get(m.group(0), m.group(0)),
                v[1:-1])
        else:
            # Only bare tokens may be integer literals. The contents of a
            # quoted string (e.g. "0x10" or "0777") must stay a string.
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(0), base)
                    # A trailing colon means the number is used as a key
                    return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        (?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
e05f6939
PH
1949
1950
478c2c61
PH
def qualities(quality_ids):
    """Return a function that ranks a quality id by its position in quality_ids.

    The returned function yields the index of the id within quality_ids,
    or -1 when the id is not part of it.
    """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1959
acd69589
PH
1960
# Default output template, made of %-style named fields: <title>-<id>.<ext>
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1962
a020a0dc
PH
1963
def limit_length(s, length):
    """Shorten s to at most length characters, ending it with '...' if cut.

    None is passed through; strings that already fit are returned as is.
    """
    ELLIPSES = '...'
    if s is None or len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
1972
1973
def version_tuple(v):
    """Split a version string on '.' and '-' and return its parts as a tuple of ints.

    Raises ValueError when a part is not an integer.
    """
    return tuple(map(int, re.split(r'[-.]', v)))
48844745
PH
1976
1977
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    When version is empty or either version cannot be parsed, the answer
    falls back to the opposite of assume_new.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
    return outdated
732ea2f0
PH
1985
1986
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when sys has a 'frozen' attribute or this module was loaded
    # through a zipimporter
    if hasattr(sys, 'frozen'):
        return True
    return isinstance(globals().get('__loader__'), zipimporter)
7d4111ed
PH
1992
1993
def args_to_str(args):
    """Return a short, shell-quoted string representation of a subprocess command."""
    quoted_args = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted_args)
2ccd1b10
PH
1997
1998
def error_to_compat_str(err):
    """Return the message of err as a text string on both Python 2 and 3."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2006
2007
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    Parameters following the media type (e.g. '; charset=utf-8' or
    '; codecs="avc1"') are ignored and the type is compared
    case-insensitively. Unknown subtypes are returned unchanged (in lower
    case) as the extension. Returns None when mt is None.
    """
    if mt is None:
        return None

    # Content-Type header values frequently carry parameters; only the
    # 'type/subtype' part determines the extension.
    mt = mt.split(';')[0].strip().lower()

    # Full types whose subtype alone would give the wrong extension
    ext = {
        'audio/mp4': 'm4a',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }.get(res, res)
2031
2032
2ccd1b10
PH
def urlhandle_detect_ext(url_handle):
    """Guess the file extension of a response from its headers.

    The filename of an attachment Content-Disposition header is preferred;
    otherwise the extension is derived from the Content-Type header.
    """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        getheader = lambda h: url_handle.headers[h]

    disposition = getheader('Content-Disposition')
    if disposition:
        filename_match = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if filename_match:
            ext = determine_ext(filename_match.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
05900629
PH
2049
2050
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Return a base64 'data:' URI holding the bytes data with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2053
2054
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # Nothing is blocked when no limit is set (age_limit is None) or when
    # the content is available for everyone (content_limit is None)
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
2063
2064
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # Byte order marks and the encodings they announce. The UTF-32 marks
    # come before the UTF-16 ones because '\xff\xfe' is a prefix of the
    # UTF-32-LE mark.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    detected = next(
        ((bom, enc) for bom, enc in BOMS if first_bytes.startswith(bom)),
        None)
    if detected is None:
        text = first_bytes.decode('utf-8', 'replace')
    else:
        bom, enc = detected
        text = first_bytes[len(bom):].decode(enc, 'replace')

    # The match object (or None) is returned, not a strict boolean
    return re.match(r'^\s*<', text)
a055469f
PH
2083
2084
def determine_protocol(info_dict):
    """Work out the download protocol for an info dict.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived
    from the URL prefix (rtmp, mms, rtsp), then from the URL's extension
    (m3u8, f4m), and finally from the URL scheme.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2105
2106
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of the widest cell in every column
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*rows)]
    # All columns but the last are left-aligned and padded to width + 1
    padded_columns = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded_columns) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
347de493
PH
2113
2114
def _match_one(filter_part, dct):
    """Evaluate a single filter expression against dct.

    Two forms are understood:
      * '<key> <op> <value>' with op one of <, <=, >, >=, =, != and an
        optional '?' after the operator, which makes the expression pass
        when the key is missing (or None). Values are integers, file sizes
        (as understood by parse_filesize) or, for = and != only, plain
        alphanumeric strings.
      * '<key>' / '!<key>' testing whether the key is set / not set.

    Raises ValueError for malformed expressions.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    comparison_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    comparison = comparison_rex.search(filter_part)
    if comparison:
        op_symbol = comparison.group('op')
        compare = COMPARISON_OPERATORS[op_symbol]
        str_operand = comparison.group('strval')
        if str_operand is not None:
            if op_symbol not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % op_symbol)
            comparison_value = str_operand
        else:
            int_operand = comparison.group('intval')
            try:
                comparison_value = int(int_operand)
            except ValueError:
                # Not a plain integer: try it as a file size, first as
                # written and then with a 'B' suffix appended
                comparison_value = parse_filesize(int_operand)
                if comparison_value is None:
                    comparison_value = parse_filesize(int_operand + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            int_operand, filter_part))
        actual_value = dct.get(comparison.group('key'))
        if actual_value is None:
            # Truthy only when the '?' marker was given
            return comparison.group('none_inclusive')
        return compare(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    presence_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    presence = presence_rex.search(filter_part)
    if presence:
        check = UNARY_OPERATORS[presence.group('op')]
        return check(dct.get(presence.group('key')))

    raise ValueError('Invalid filter part %r' % filter_part)
2172
2173
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The filter is a conjunction: every '&'-separated part has to match
    filter_parts = filter_str.split('&')
    return all(_match_one(part, dct) for part in filter_parts)
2179
2180
def match_filter_func(filter_str):
    """Build a match filter callable from a filter string.

    The returned function takes an info dict and returns None when the
    dict passes the filter, or a message explaining that it is skipped.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
91410c9b
PH
2189
2190
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float).

    Supports offset times ('12.5' or '12.5s') and clock times
    ('HH:MM:SS', 'HH:MM:SS.fff' or 'HH:MM:SS:fff'). Returns None for
    empty input and for expressions in any other format.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        # A colon before the fraction is treated like a decimal point
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
    return None
bf6427d2
YCH
2202
2203
c1c924ab
YCH
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as an SRT timecode 'HH:MM:SS,mmm'.

    The value is rounded to whole milliseconds before being split up.
    Truncating the fractional part instead loses a millisecond whenever
    the float sits just below the intended value (1.001 % 1 is
    0.000999...), which produced ',000' for 1.001.
    """
    total_msec = int(round(seconds * 1000))
    hours, remainder = divmod(total_msec, 3600 * 1000)
    minutes, remainder = divmod(remainder, 60 * 1000)
    secs, msec = divmod(remainder, 1000)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, msec)
bf6427d2
YCH
2206
2207
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document (text) to SRT text.

    Paragraphs are looked up in the ttml, ttaf1 and ttaf1_0604 namespaces
    and finally without a namespace. Paragraphs without a 'begin' time, or
    with neither an 'end' time nor a duration, are skipped.

    Raises ValueError when the document contains no paragraphs at all.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    # Line break tags in every namespace paragraphs are accepted from;
    # ttaf1_0604 is included so that it matches the paragraph lookup below
    BR_TAGS = (_x('ttml:br'), _x('ttaf1:br'), _x('ttaf1_0604:br'), 'br')

    class TTMLPElementParser(object):
        """Parser target collecting the text of one paragraph."""
        out = ''

        def start(self, tag, attrib):
            if tag in BR_TAGS:
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Re-serialise the node and feed it through the target above to
        # flatten it into plain text
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # The index counts every paragraph, including skipped ones
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2261
2262
66e289ba
S
def cli_option(params, command_option, param):
    """Return [command_option, value] for params[param], or [] when it is unset (None)."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2266
2267
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render the boolean params[param] as a command line option.

    Yields [command_option, value], or a single joined argument
    [command_option + separator + value] when separator is given.
    The parameter has to be present and a bool.
    """
    param = params.get(param)
    assert isinstance(param, bool)
    value = true_value if param else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
2274
2275
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2279
2280
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra command line arguments stored in params[param].

    When the parameter is unset (None), default is returned. A list
    default is copied first, so callers extending the result cannot
    modify the shared default list and affect later calls.
    """
    ex_args = params.get(param)
    if ex_args is None:
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2287
2288
39672624
YCH
class ISO639Utils(object):
    """Conversion between ISO 639-1 (two-letter) and ISO 639-2/T (three-letter) language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked up, so anything after
        # them (e.g. the '-US' of 'en-US') is ignored
        two_letter_code = code[:2]
        return cls._lang_map.get(two_letter_code)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup; None when the code is unknown
        return next(
            (short_name for short_name, long_name in cls._lang_map.items()
             if long_name == code),
            None)
2489
2490
4eb10f66
YCH
class ISO3166Utils(object):
    """Lookup of full country names by two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter country code (case-insensitive) to the corresponding full name.

        Returns None for unknown codes.
        """
        normalized_code = code.upper()
        return cls._country_map.get(normalized_code)
2749
2750
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets a single request choose its own proxy.

    The proxy is taken from the request's 'Ytdl-request-proxy' header when
    present; that header is removed before the request is passed on.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        for scheme in ('http', 'https'):
            # The keyword defaults bind the current scheme and the bound
            # method at definition time
            default_open = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, default_open)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            proxy = per_request_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5bc880b9
YCH
2774
2775
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The bytes are reversed before being read as one big hexadecimal number
    reversed_hex = binascii.hexlify(data[::-1])
    plaintext = int(reversed_hex, 16)
    return '%x' % pow(plaintext, exponent, modulus)
81bdc8fd
YCH
2791
2792
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer num in base n.

    Digits are taken from table, which defaults to the first n characters
    of '0-9a-zA-Z' (so bases up to 62 work without a custom table).

    Raises ValueError when the base exceeds the table length, when the
    base is smaller than 2, or when num is negative. The last two cases
    would otherwise never terminate.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))
    if n < 2:
        raise ValueError('base %d is too small, must be at least 2' % n)
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)

    if num == 0:
        return table[0]

    # Collect digits least significant first, then reverse
    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
f52354a8
YCH
2809
2810
def decode_packed_codes(code):
    """Unpack JavaScript code wrapped by a packer of the form
    "}('<payload>',<base>,<count>,'<symbols>'.split('|')".

    Every word of the payload is replaced through a table that maps the
    base-n encoding of an index to the symbol at that index, or to the
    encoding itself when the symbol is empty.
    """
    packed = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    payload, base, count, symbols = packed.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for index in reversed(range(count)):
        encoded_index = encode_base_n(index, base)
        symbol_table[encoded_index] = symbols[index] or encoded_index

    return re.sub(
        r'\b(\w+)\b', lambda word: symbol_table[word.group(0)],
        payload)