]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[formula1] Add new extractor(closes #3617)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
1e399778 6import base64
5bc880b9 7import binascii
912b38b4 8import calendar
676eb3f2 9import codecs
62e609ab 10import contextlib
e3946f98 11import ctypes
c496ca96
PH
12import datetime
13import email.utils
f45c185f 14import errno
be4a824d 15import functools
d77c3dfd 16import gzip
03f9daab 17import io
79a2e94e 18import itertools
f4bfd65f 19import json
d77c3dfd 20import locale
02dbf93f 21import math
347de493 22import operator
d77c3dfd 23import os
4eb7f1d1 24import pipes
c496ca96 25import platform
d77c3dfd 26import re
c496ca96 27import socket
79a2e94e 28import ssl
1c088fa8 29import subprocess
d77c3dfd 30import sys
181c8655 31import tempfile
01951dda 32import traceback
bcf89ce6 33import xml.etree.ElementTree
d77c3dfd 34import zlib
d77c3dfd 35
8c25f81b 36from .compat import (
8bb56eee 37 compat_HTMLParser,
8f9312c3 38 compat_basestring,
8c25f81b 39 compat_chr,
36e6f62c 40 compat_etree_fromstring,
8c25f81b 41 compat_html_entities,
be4a824d 42 compat_http_client,
c86b6142 43 compat_kwargs,
8c25f81b 44 compat_parse_qs,
702ccf2d 45 compat_shlex_quote,
be4a824d 46 compat_socket_create_connection,
8c25f81b 47 compat_str,
edaa23f8 48 compat_struct_pack,
8c25f81b
PH
49 compat_urllib_error,
50 compat_urllib_parse,
15707c7e 51 compat_urllib_parse_urlencode,
8c25f81b 52 compat_urllib_parse_urlparse,
7581bfc9 53 compat_urllib_parse_unquote_plus,
8c25f81b
PH
54 compat_urllib_request,
55 compat_urlparse,
810c10ba 56 compat_xpath,
8c25f81b 57)
4644ac55 58
71aff188
YCH
59from .socks import (
60 ProxyType,
61 sockssocket,
62)
63
4644ac55 64
51fb4995
YCH
def register_socks_protocols():
    """Teach the URL parser that SOCKS proxy schemes carry a network location."""
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    netloc_schemes = compat_urlparse.uses_netloc
    for socks_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if socks_scheme in netloc_schemes:
            continue
        netloc_schemes.append(socks_scheme)
72
73
468e2e92
FV
# This is not clearly defined otherwise
# (the type of a compiled pattern, obtained by compiling an empty pattern)
compiled_regex_type = type(re.compile(''))

# Default HTTP headers; YoutubeDLHandler.http_request adds each one to a
# request that does not already carry it.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Unique sentinel used as the "no default supplied" marker in signatures,
# so that None remains usable as a real default value.
NO_DEFAULT = object()

# English month names in calendar order (index 0 is January).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# File extensions (without the leading dot) treated as known media or
# manifest formats.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. The two sequences are
# zipped position by position; multi-character replacements ('AE', 'OE', 'ss',
# 'ae', 'oe') are wrapped in lists so they count as a single item.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØŒÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøœùúûüýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOO', ['OE'], 'UUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionoooooo', ['oe'], 'uuuuypy')))
c587cbb7 111
7105440c 112
def preferredencoding():
    """Return the encoding to use for text on this system.

    The locale's preferred encoding is used when it names a codec that can
    actually encode text; otherwise 'UTF-8' is returned.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable name raises here
        'TEST'.encode(encoding)
    except Exception:
        return 'UTF-8'
    return encoding
d77c3dfd 126
f4bfd65f 127
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created next to fn
    (same directory, fn's base name as prefix, '.tmp' suffix) and then
    renamed over fn. If anything fails, the temporary file is removed
    (best effort) and the original exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        # NOTE: the lambdas ignore their argument and read fn from the
        # enclosing scope; they are only ever called with fn below.
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    # Keyword arguments for NamedTemporaryFile; delete=False because the
    # file is renamed into place (or removed explicitly) further down.
    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        # Leaving the with-block closes the temporary file before the rename
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Clean up the temporary file, then propagate the original error
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
180
181
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the first element matching xpath[@key] or xpath[@key='val'] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the first element under xpath that has attribute key (equal to val if given) """
        for candidate in node.findall(compat_xpath(xpath)):
            if key in candidate.attrib and (val is None or candidate.attrib.get(key) == val):
                return candidate
        return None
196
d7e66d39
JMF
197# On python2.6 the xml.etree.ElementTree.Element methods don't support
198# the namespace parameter
5f6a1245
JW
199
200
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{namespace-uri}tag' form.

    ns_map maps each prefix used in path to its namespace URI. Steps
    without a prefix are kept unchanged.
    """
    def _expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(_expand(step) for step in path.split('/'))
211
d77c3dfd 212
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element under node matching xpath.

    xpath is either a single XPath string or an iterable of XPath strings
    that are tried in order until one matches.

    When nothing matches: default is returned if one was given; otherwise
    ExtractorError is raised if fatal is set (name, or xpath itself, is used
    in the message); otherwise None is returned.
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    # Start from "not found" so that an empty iterable of candidates is
    # handled like any other miss instead of leaving n unbound.
    n = None
    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
234
235
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element matching xpath.

    A missing element is handled by xpath_element. An element without text
    yields default if one was given, raises ExtractorError if fatal is set,
    and otherwise yields None.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element
    text = element.text
    if text is not None:
        return text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
a41fb80c
S
249
250
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath that has it.

    When no such element exists: default is returned if one was given;
    otherwise ExtractorError is raised if fatal is set; otherwise None.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
bf0ff932
PH
262
263
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id, looked up in the HTML document html"""
    return get_element_by_attribute(attribute='id', value=id, html=html)
43e8fafd 267
12ea2f30 268
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the unescaped inner content of the first tag in html whose attribute equals value, or None"""

    match = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if match is None:
        return None

    content = match.group('content')
    # Content that opens with a quote has its first and last characters dropped
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 290
c5229f39 291
8bb56eee
BF
class HTMLAttributeParser(compat_HTMLParser):
    """Minimal HTML parser that records the attributes of the start tag it is fed.

    After feeding, ``attrs`` holds a dict of the attributes of the most
    recently seen start tag (empty if no start tag was seen).
    """

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs arrives as (name, value) pairs; keep them as a mapping
        self.attrs = dict(attrs)
300
c5229f39 301
8bb56eee
BF
def extract_attributes(html_element):
    """Parse the string of a single HTML element and return its attributes as a dict.

    For example, given
        <el
            a="foo" B="bar" c="&#98;az" d=boz
            empty= noval entity="&amp;"
            sq='"' dq="'"
        >
    the result is
        {
            'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
            'empty': '', 'noval': None, 'entity': '&',
            'sq': '"', 'dq': '\''
        }
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attribute_parser = HTMLAttributeParser()
    attribute_parser.feed(html_element)
    attribute_parser.close()
    return attribute_parser.attrs
9e6dd238 322
c5229f39 323
def clean_html(html):
    """Turn an HTML snippet into plain readable text (None is passed through)"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Literal newlines carry no meaning in HTML; line breaks come from markup
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),  # <br> becomes a newline
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),  # paragraph boundary becomes a newline
        ('<.*?>', ''),  # strip every remaining tag
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
339
340
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special filename '-' selects standard output (its binary buffer
    when available) instead of opening a file.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                # Switch stdout to binary mode on Windows
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission error is re-raised as is; no alternative name is tried
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            # Sanitizing changed nothing, so there is nothing else to try
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
371
372
def timeconvert(timestr):
    """Convert an RFC 2822 date string into a timestamp, or None if it cannot be parsed"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 380
5f6a1245 381
def sanitize_filename(s, restricted=False, is_id=False):
    """Make a string safe to use as part of a filename.

    With restricted set, only a stricter subset of characters is allowed.
    Set is_id when s is an ID rather than arbitrary text, so that it is
    kept as close to the original as possible (no trimming or collapsing).
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    # Handle timestamps: colons between digit groups become underscores
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(char) for char in s)
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends
    result = re.sub(r'_{2,}', '_', result).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[1:]
    result = result.lstrip('.')
    return result if result else '_'
d77c3dfd 420
5f6a1245 421
a2aaf4db
S
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On any other platform the path is returned unchanged. On Windows the
    path is normalized, and in every component other than '.' and '..'
    each forbidden character (/ < > : " | \\ ? *) and a trailing
    whitespace character or dot is replaced with '#'. A drive or UNC
    prefix is preserved as is.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the empty component left in front of the first separator
        norm_path.pop(0)
    # Raw string: same pattern as before, without the invalid '\s' escape
    # that a plain string literal triggers a warning for.
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
438
439
67dda517
S
440# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
441# unwanted failures due to missing protocol
17bcc626
S
def sanitize_url(url):
    """Give a protocol-relative URL (one starting with '//') an explicit 'http:' scheme"""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
444
445
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request for url after passing it through sanitize_url"""
    safe_url = sanitize_url(url)
    return compat_urllib_request.Request(safe_url, *args, **kwargs)
67dda517
S
448
449
def orderedSet(iterable):
    """ Return a list of the items of iterable without duplicates, keeping first-seen order """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 457
912b38b4 458
4e408e47
PH
def _htmlentity_transform(entity):
    """Turn the name of an HTML entity (the part between '&' and ';') into its character."""
    # Known non-numeric HTML entity
    codepoint = compat_html_entities.name2codepoint.get(entity)
    if codepoint is not None:
        return compat_chr(codepoint)

    numeric = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if numeric is not None:
        digits = numeric.group(1)
        if digits.startswith('x'):
            # Hexadecimal reference: prefix a zero so int() accepts '0x...'
            base, digits = 16, '0%s' % digits
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(digits, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
4e408e47
PH
481
482
def unescapeHTML(s):
    """Replace every '&...;' entity in s with the character it stands for (None is passed through)"""
    if s is None:
        return None
    assert type(s) == compat_str

    def _substitute(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', _substitute, s)
d77c3dfd 490
8bf48f23 491
aa49acd1
S
def get_subprocess_encoding():
    """Return the encoding to use for text exchanged with subprocesses"""
    on_modern_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if on_modern_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        chosen = preferredencoding()
    else:
        chosen = sys.getfilesystemencoding()
    return 'utf-8' if chosen is None else chosen
502
503
def encodeFilename(s, for_subprocess=False):
    """
    Return the file name s in the form expected by the platform's file APIs.

    @param s The name of the file
    @param for_subprocess Set when the name is passed to a subprocess
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    uses_unicode_api = sys.version_info >= (3, 0)

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    uses_unicode_api = uses_unicode_api or (
        not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    uses_unicode_api = uses_unicode_api or sys.platform.startswith('java')

    if uses_unicode_api:
        return s
    return s.encode(get_subprocess_encoding(), 'ignore')
526
527
def decodeFilename(b, for_subprocess=False):
    """Decode a byte-string file name to text on Python 2; return anything else unchanged"""
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 537
f07b74fc
PH
538
def encodeArgument(s):
    """Encode a command-line argument for passing to a subprocess"""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
546
547
aa49acd1
S
def decodeArgument(b):
    """Decode a subprocess argument; counterpart of encodeArgument"""
    return decodeFilename(b, for_subprocess=True)
550
551
8271226a
PH
def decodeOption(optval):
    """Return an option value as text, decoding byte strings with the preferred encoding (None is passed through)"""
    if optval is None:
        return optval
    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 560
5f6a1245 561
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'S', 'M:SS' or 'H:MM:SS'.

    The widest unit shown is the largest one that is non-zero, so exactly
    60 seconds is '1:00' and exactly 3600 seconds is '1:00:00'.
    """
    # Inclusive bounds: a full minute or hour must switch to the wider format
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
569
a0ddb8a2 570
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler configured from params.

    Certificate verification is disabled when params['nocheckcertificate']
    is true. How the SSL context is built depends on what the running
    Python's ssl module offers. Extra keyword arguments are forwarded to
    YoutubeDLHTTPSHandler.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname is switched off before verify_mode is relaxed
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # No context is passed on these versions
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 594
732ea2f0 595
08f2a92c
JMF
def bug_reports_message():
    """Return the text appended to error messages asking the user to report the bug"""
    update_cmd = (
        'type youtube-dl -U to update' if ytdl_is_updateable()
        else 'see https://yt-dl.org/update on how to update')
    return ''.join((
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ))
605
606
1c256f70
PH
class ExtractorError(Exception):
    """Raised when information extraction fails."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ Build the error message and remember the error context.

        tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is included in the message; video_id, if given, prefixes it.
        """

        # Errors raised while one of these exceptions is being handled are
        # always treated as expected
        expected_exception_types = (
            compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_exception_types:
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors ask the user to file a bug report
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none"""
        if self.traceback is not None:
            return ''.join(traceback.format_tb(self.traceback))
        return None
01951dda 634
1c256f70 635
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Raised for a URL that is not supported; the URL is kept in ``url``."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
641
642
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression did not match."""
646
647
class DownloadError(Exception):
    """Raised when a download fails.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. The exception carries the
    appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
d77c3dfd
FV
660
661
class SameFileError(Exception):
    """Raised when several downloads would end up in the same file.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
669
670
class PostProcessingError(Exception):
    """Raised when post-processing fails.

    A PostProcessor's .run() method may raise this exception to indicate
    an error in the postprocessing task. The message is available as
    ``msg``.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd 680
5f6a1245 681
class MaxDownloadsReached(Exception):
    """ Raised once the --max-downloads limit has been reached. """
d77c3dfd
FV
685
686
class UnavailableVideoError(Exception):
    """Raised when a requested format is unavailable.

    This exception is thrown when a video is requested in a format
    that is not available for that video.
    """
d77c3dfd
FV
694
695
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """downloaded and expected are sizes in bytes; both are kept as attributes."""
        # Give the exception a readable message instead of a bare tuple repr
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected))
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 708
5f6a1245 709
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and bind it to the configured source address.

    ydl_handler is the handler whose _params dict is consulted for
    'source_address'; is_https tells the Python 2.6 fallback whether the
    socket must be wrapped with SSL. Remaining arguments are passed on to
    http_class.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        # Port 0 in the (address, port) pair
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address attribute: replace connect() with a version
            # that creates the socket from the chosen source address
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
735
736
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker header.

    Without the marker, headers is returned as is. With it, a new dict is
    returned that omits both the marker and any Accept-Encoding header
    (matched case-insensitively).
    """
    marker = 'Youtubedl-no-compression'
    if marker not in headers:
        return headers
    return dict(
        (name, value) for name, value in headers.items()
        if name.lower() != 'accept-encoding' and name != marker)
87f0e62d
YCH
745
746
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        # params is kept as self._params; _create_http_connection reads
        # 'source_address' from it
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req, going through a SOCKS proxy if the request asks for one."""
        conn_class = compat_http_client.HTTPConnection

        # 'Ytdl-socks-proxy' is an internal header: it selects the proxy and
        # is removed so that it is not sent with the request
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a zlib-wrapped one second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying code, whether or not the constructor accepts it."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Prepare req: escape its URL, add the standard headers, apply internal headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Decompress gzip/deflate response bodies and percent-encode redirect locations."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off; if none of
                # the attempts succeeds, the first error is raised
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same request and response processing
    https_request = http_request
    https_response = http_response
bf50b038 872
5de90176 873
71aff188
YCH
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of base_class that connects through a SOCKS proxy.

    base_class must be an HTTPConnection or HTTPSConnection class.
    socks_proxy is the proxy URL; its scheme selects the protocol
    ('socks' and 'socks4' mean SOCKS4, plus 'socks4a' and 'socks5'), the
    port defaults to 1080, and a username and password are taken from the
    URL when present.

    Raises ValueError if the URL scheme is not a supported SOCKS scheme.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Fail with a clear message instead of an unbound local further down
        raise ValueError('Unsupported SOCKS proxy scheme: %r' % url_components.scheme)

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # HTTPS connections wrap the proxied socket with SSL
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
915
916
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler with a replaceable connection class and SOCKS support."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # Forward the TLS settings the base handler stored, when it has them
        open_kwargs = {}
        if hasattr(self, '_context'):  # python > 2.6
            open_kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            open_kwargs['check_hostname'] = self._check_hostname

        # The internal Ytdl-socks-proxy header selects a SOCKS proxy for this
        # request; it is removed so that it is never sent to the server
        connection_class = self._https_conn_class
        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            connection_class = make_socks_conn_class(connection_class, proxy_url)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, connection_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
be4a824d
PH
940
941
a6420bf5
S
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that applies the same handling to HTTP and HTTPS."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 chokes on the next request when the previous response
        # carried non-ASCII characters in Set-Cookie (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # Percent-encoding the Set-Cookie/Set-Cookie2 headers before the base
        # class sees them was the workaround; it is currently disabled, so
        # the response is handed over unchanged.
        base = compat_urllib_request.HTTPCookieProcessor
        return base.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
964
965
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # Fractional seconds are dropped; the strptime format below has no slot
    # for them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        # No offset supplied by the caller: look for a trailing "Z" or
        # "+hh:mm" / "-hhmm" and cut it off the string
        tz_match = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        timezone = datetime.timedelta()
        if tz_match:
            date_str = date_str[:-len(tz_match.group(0))]
            if tz_match.group('sign'):
                factor = 1 if tz_match.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=factor * int(tz_match.group('hours')),
                    minutes=factor * int(tz_match.group('minutes')))

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(utc_dt.timetuple())
    except ValueError:
        # Unparseable input yields None
        return None
912b38b4
PH
995
996
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    date_str is tried against a list of known formats and, failing that,
    parsed as an RFC 2822 date. day_first selects whether ambiguous numeric
    dates are read as day/month (True) or month/day (False).
    Returns None when date_str is None or cannot be parsed.
    """

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    # A plain dd-mm-yyyy date is left alone so that "-yyyy" is not mistaken
    # for a UTC offset
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M',
        '%b %dnd %Y %I:%M',
        '%b %dth %Y %I:%M',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    # Every format is tried; when several match, the last one wins
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            # parsedate_tz does not range-check its fields, so an impossible
            # date such as "31 Feb" only fails here; treat it as unparseable
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
bf50b038 1061
5f6a1245 1062
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension of url, falling back to default_ext."""
    if url is None:
        return default_ext
    # Everything after the last dot of the part before the query string
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
73e79f2a 1074
5f6a1245 1075
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the last extension of filename with '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
d4051a8e 1078
5f6a1245 1079
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    'now', 'today', 'yesterday' or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError if date_str matches none of these forms."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: "\d" in a plain literal is an invalid escape sequence
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names (days=, weeks=)
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
5f6a1245
JW
1107
1108
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    any other string is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1117
5f6a1245 1118
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # A missing bound extends to the earliest/latest representable date
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed as a date string
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return ' - '.join((self.start.isoformat(), self.end.isoformat()))
c496ca96
PH
1148
1149
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Decode with the preferred encoding if the platform module gave bytes
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
1158
1159
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except (AttributeError, io.UnsupportedOperation):
        # No fileno at all (virtual stream), or some strange Windows
        # pseudo file that refuses to give one
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    kernel32 = ctypes.windll.kernel32

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', kernel32))
    console_handle = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(
        ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(
        (b'GetFileType', kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(console_handle):
        return False

    def next_nonbmp_pos(text):
        # Index of the first character above U+FFFF, or len(text) if none
        try:
            return next(i for i, c in enumerate(text) if ord(c) > 0xffff)
        except StopIteration:
            return len(text)

    while s:
        # Write at most 1024 characters at a time, stopping before the next
        # non-BMP character; such a character is then written on its own
        # with a length of 2
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            console_handle, s, count if count else 2,
            ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1233
1234
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # On Windows try the console API first, unless an encoding was requested
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    # Python 2 lies about mode of sys.stderr, so always send bytes there
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream over a binary buffer: encode here and bypass the text
        # layer
        stream_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(stream_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1255
1256
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of ints."""
    if not bs:
        return []
    # Indexing yields ints on Python 3 and one-character strings on Python 2
    return list(bs) if isinstance(bs[0], int) else list(map(ord, bs))
1264
c257baff 1265
def intlist_to_bytes(xs):
    """Pack a sequence of byte values (ints) into a bytes string."""
    if xs:
        # One unsigned char ('B') per element
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
c38b1e77
PH
1270
1271
c1c9a79c
PH
1272# Cross-platform file locking
1273if sys.platform == 'win32':
1274 import ctypes.wintypes
1275 import msvcrt
1276
1277 class OVERLAPPED(ctypes.Structure):
1278 _fields_ = [
1279 ('Internal', ctypes.wintypes.LPVOID),
1280 ('InternalHigh', ctypes.wintypes.LPVOID),
1281 ('Offset', ctypes.wintypes.DWORD),
1282 ('OffsetHigh', ctypes.wintypes.DWORD),
1283 ('hEvent', ctypes.wintypes.HANDLE),
1284 ]
1285
1286 kernel32 = ctypes.windll.kernel32
1287 LockFileEx = kernel32.LockFileEx
1288 LockFileEx.argtypes = [
1289 ctypes.wintypes.HANDLE, # hFile
1290 ctypes.wintypes.DWORD, # dwFlags
1291 ctypes.wintypes.DWORD, # dwReserved
1292 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1293 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1294 ctypes.POINTER(OVERLAPPED) # Overlapped
1295 ]
1296 LockFileEx.restype = ctypes.wintypes.BOOL
1297 UnlockFileEx = kernel32.UnlockFileEx
1298 UnlockFileEx.argtypes = [
1299 ctypes.wintypes.HANDLE, # hFile
1300 ctypes.wintypes.DWORD, # dwReserved
1301 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1302 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1303 ctypes.POINTER(OVERLAPPED) # Overlapped
1304 ]
1305 UnlockFileEx.restype = ctypes.wintypes.BOOL
1306 whole_low = 0xffffffff
1307 whole_high = 0x7fffffff
1308
1309 def _lock_file(f, exclusive):
1310 overlapped = OVERLAPPED()
1311 overlapped.Offset = 0
1312 overlapped.OffsetHigh = 0
1313 overlapped.hEvent = 0
1314 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1315 handle = msvcrt.get_osfhandle(f.fileno())
1316 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1317 whole_low, whole_high, f._lock_file_overlapped_p):
1318 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1319
1320 def _unlock_file(f):
1321 assert f._lock_file_overlapped_p
1322 handle = msvcrt.get_osfhandle(f.fileno())
1323 if not UnlockFileEx(handle, 0,
1324 whole_low, whole_high, f._lock_file_overlapped_p):
1325 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1326
1327else:
399a76e6
YCH
1328 # Some platforms, such as Jython, is missing fcntl
1329 try:
1330 import fcntl
c1c9a79c 1331
399a76e6
YCH
1332 def _lock_file(f, exclusive):
1333 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
c1c9a79c 1334
399a76e6
YCH
1335 def _unlock_file(f):
1336 fcntl.flock(f, fcntl.LOCK_UN)
1337 except ImportError:
1338 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1339
1340 def _lock_file(f, exclusive):
1341 raise IOError(UNSUPPORTED_MSG)
1342
1343 def _unlock_file(f):
1344 raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
1345
1346
class locked_file(object):
    """Context manager for a file that is locked while the block runs.

    The file is opened in the constructor. A shared lock is taken for mode
    'r' and an exclusive lock for 'a' and 'w'. Leaving the block unlocks and
    closes the file. The object can be iterated, read from and written to.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows implementation raises OSError, which is not an
            # IOError on Python 2; catch both so the file never stays open
            # after a failed lock
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even when unlocking fails
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1376
1377
4644ac55
S
def get_filesystem_encoding():
    """Return the filesystem encoding, or 'utf-8' when Python reports none."""
    detected = sys.getfilesystemencoding()
    if detected is None:
        return 'utf-8'
    return detected
1381
1382
def shell_quote(args):
    """Return args joined into one shell-escaped command line string.

    Byte-string arguments are decoded with the filesystem encoding first.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # compat_shlex_quote replaces the undocumented pipes.quote, which
        # disappeared together with the pipes module in Python 3.13
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1392
1393
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # The JSON-encoded payload is appended as a URL fragment
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return '#'.join((url, compat_urllib_parse_urlencode(payload)))
9d4660ca
PH
1400
1401
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    A URL that carries no smuggled data is returned with default as data.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # The payload is the fragment after the last '#'
    url, _, fragment = smug_url.rpartition('#')
    encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
02dbf93f
PH
1409
1410
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. '1.50KiB'.

    bytes may be a number, a str holding a number, or None.
    Returns 'N/A' for None. Values too large for the biggest suffix are
    expressed in YiB; positive values below one byte are expressed in B.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the index inside the suffix table: huge values used to raise
        # IndexError, and tiny ones produced a negative index ('YiB')
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
f53c966a 1423
1c088fa8 1424
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number><unit>' of s using unit_table.

    unit_table maps unit names to multipliers. Returns the product as an int,
    or None when s does not start with a number followed by a known unit.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives
    found = re.match(pattern, s)
    if found is None:
        return None
    # A comma is accepted as decimal separator
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1434
1435
be64b5b0
PH
def parse_filesize(s):
    """Parse a size such as '1.5 MiB' into a number of bytes (None if unparseable)."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
    }
    # For every prefix X: 'XiB' and 'xB' are powers of 1024,
    # 'XB' and 'Xb' are powers of 1000
    for power, prefix in enumerate('KMGTPEZY', 1):
        _UNIT_TABLE[prefix + 'iB'] = 1024 ** power
        _UNIT_TABLE[prefix + 'B'] = 1000 ** power
        _UNIT_TABLE[prefix.lower() + 'B'] = 1024 ** power
        _UNIT_TABLE[prefix + 'b'] = 1000 ** power

    return lookup_unit_table(_UNIT_TABLE, s)
1480
1481
def parse_count(s):
    """Parse a count such as '1,234' or '1.5M' into an int (None if unparseable)."""
    if s is None:
        return None

    text = s.strip()

    # Only digits and separators: no unit suffix to look up
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    thousand = 1000
    million = 1000 ** 2
    _UNIT_TABLE = {
        'k': thousand,
        'K': thousand,
        'm': million,
        'M': million,
        'kk': million,
        'KK': million,
    }

    return lookup_unit_table(_UNIT_TABLE, text)
be64b5b0 1501
2f7ae819 1502
caefb1de
PH
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    # Position in ENGLISH_MONTH_NAMES, counted from 1; None if absent
    for number, month_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month_name == name:
            return number
    return None
1510
1511
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """

    # An abbreviation is the first three letters of the full month name
    for number, month_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month_name[:3] == abbrev:
            return number
    return None
18258362
JMF
1520
1521
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity by '&amp;' in XML"""
    # Left alone: the five predefined entities and numeric character
    # references of up to four digits
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
e3946f98
PH
1528
1529
def setproctitle(title):
    """Try to set the name of the current process to title.

    This is best effort: it returns silently on Jython, when libc.so.6
    cannot be loaded, or when the loaded libc has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    # Initialising the buffer from the bytes makes ctypes allocate one extra
    # byte for the terminating NUL; a buffer of exactly len(title) bytes
    # would hand prctl an unterminated string
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        # Option 15 is PR_SET_NAME in prctl(2)
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1549
1550
def remove_start(s, start):
    """Return s without the prefix start, or s unchanged if it lacks it."""
    if not s.startswith(start):
        return s
    return s[len(start):]
29eb5174
PH
1555
1556
2b9faf55
PH
def remove_end(s, end):
    """Return s without the suffix end, or s unchanged if it lacks it."""
    # An empty suffix must be skipped: s[:-0] is '' rather than s
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1561
1562
31b2051e
S
def remove_quotes(s):
    """Strip one pair of matching single or double quotes around s, if any."""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1570
1571
def url_basename(url):
    """Return the last segment of the path of url."""
    path = compat_urlparse.urlparse(url).path
    segments = path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1575
1576
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
1580
1581
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, returning default when that is not possible.

    If get_attr is given, the attribute of that name is read from v first
    (a missing attribute counts as None). The empty string counts as None.
    The result is int(v) * invscale // scale.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # TypeError covers values int() cannot handle at all, e.g. a list
        return default
9732d77e 1594
9572013d 1595
40a90862
JMF
def str_or_none(v, default=None):
    """Convert v to compat_str; None is replaced by default instead."""
    if v is None:
        return default
    return compat_str(v)
1598
9732d77e
PH
1599
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and plus signs before converting
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1606
1607
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, returning default when that is not possible.

    The result is float(v) * invscale / scale.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError covers values float() cannot handle at all, e.g. a list
        return default
43f775e4
PH
1615
1616
def parse_duration(s):
    """Parse a duration string into seconds; None if it cannot be parsed.

    Three notations are tried in turn: clock style ("1:02:03.5"), a sequence
    of unit-suffixed numbers ("1h 2min 3s", "PT1H2M3S") and a single decimal
    amount of hours or minutes ("1.5 hours").
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    clock = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if clock:
        days, hours, mins, secs, ms = clock.groups()
    else:
        units = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
        if units:
            days, hours, mins, secs, ms = units.groups()
        else:
            single = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            if not single:
                return None
            hours, mins = single.groups()

    # Sum the components that were present; ms is the fractional part of the
    # seconds, captured including its leading dot
    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
91d7d0b3
JMF
1663
1664
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the real extension of filename.

    If expected_real_ext is given and differs from the real extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(name, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
d70ad093
PH
1671
1672
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the real extension of filename by ext.

    If expected_real_ext is given and differs from the real extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        base = name
    else:
        base = filename
    return '{0}.{1}'.format(base, ext)
1678
1679
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        process = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Output is discarded; only whether the binary could be started counts
        process.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1688
1689
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    command = [encodeArgument(exe)] + args
    try:
        # stderr is merged into stdout so the version is found on either
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = process.communicate()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1703
1704
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from program output.

    The first group of version_re is returned, or unrecognized when the
    pattern is not found. Without version_re, the word following "version"
    is looked for.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    if not found:
        return unrecognized
    return found.group(1)
1714
1715
class PagedList(object):
    """Base class of the paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1720
9c44d242
PH
1721
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages one by one until a short page appears."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        collected = []
        for pagenum in itertools.count(start // self._pagesize):
            first_index = pagenum * self._pagesize
            next_first_index = first_index + self._pagesize
            if start >= next_first_index:
                continue

            # Use the cached page if there is one, otherwise fetch it
            entries = self._cache.get(pagenum) if self._use_cache else None
            if entries is None:
                entries = list(self._pagefunc(pagenum))
                if self._use_cache:
                    self._cache[pagenum] = entries

            # Offsets of the requested range inside this page
            if first_index <= start < next_first_index:
                startv = start % self._pagesize
            else:
                startv = 0

            if end is not None and first_index <= end <= next_first_index:
                endv = ((end - 1) % self._pagesize) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                entries = entries[startv:endv]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(entries) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == next_first_index:
                break
        return collected
81c2f20b
PH
1772
1773
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list for which the number of pages is known beforehand."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        collected = []
        first_page = start // self._pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start
        # Entries to drop from the front of the first page
        to_skip = start - first_page * self._pagesize
        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) < remaining:
                    remaining -= len(page)
                else:
                    # This page completes the requested range
                    collected.extend(page[:remaining])
                    break
            collected.extend(page)
        return collected
1801
1802
def uppercase_escape(s):
    """Decode every literal \\UXXXXXXXX escape found in s."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (text, length consumed)
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
0fe2ff78
YCH
1809
1810
def lowercase_escape(s):
    """Decode every literal \\uXXXX escape found in s."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (text, length consumed)
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
b53466e1 1817
d05cfe06
S
1818
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 the text is encoded as UTF-8 before quoting
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    # The second argument lists the characters that are left unquoted
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1824
1825
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host name is IDNA-encoded; the other components are percent-escaped
    escaped = parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
1836
62e609ab
PH
1837
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file object and close it.

    Each line is decoded as UTF-8 if necessary, freed of a leading byte order
    mark and of surrounding whitespace. Empty lines and lines starting with
    '#', ';' or ']' are skipped. Returns the list of remaining URLs.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM decodes to U+FEFF; '\xef\xbb\xbf' is the form the
        # original check looked for and is kept for compatibility
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1852
1853
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1856
1857
def update_url_query(url, query):
    """Return url with the parameters of query merged into its query string."""
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    # Parameters from query replace existing ones of the same name
    params = compat_parse_qs(parts.query)
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
16392824 1866
8e60dc75 1867
ed0291d1
S
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Return a copy of req with URL, data, headers and query updated.

    Arguments that are left out keep the values of req. A HEAD request
    stays a HEAD request.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)
    if req.get_method() == 'HEAD':
        request_class = HEADRequest
    else:
        request_class = compat_urllib_request.Request
    updated = request_class(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # Carry over the timeout when the original request has one
    if hasattr(req, 'timeout'):
        updated.timeout = req.timeout
    return updated
1880
1881
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key out of several, in ``d``.

    When ``key_or_keys`` is a list or tuple, the keys are tried in order and
    the first value that is present and not None (and, if
    ``skip_false_values`` is set, not falsy) is returned; ``default`` is
    returned when none qualifies.  Any other ``key_or_keys`` is a plain
    ``d.get(key_or_keys, default)``.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for candidate in key_or_keys:
        if candidate not in d:
            continue
        value = d[candidate]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
1890
1891
8e60dc75
S
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return ``string`` as a compat_str.

    A value that is already a compat_str is returned unchanged; anything
    else is converted with ``compat_str(string, encoding, errors)``.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1894
16392824 1895
a1a530b0
PH
# Age limit associated with each rating label.  parse_age_limit() falls back
# to this table when its argument is not a plain number such as '18' or '18+'.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
1903
1904
146c80e2
S
def parse_age_limit(s):
    """Convert an age limit string into a number.

    Accepts one or two digits with an optional trailing ``+`` (e.g. ``'16'``
    or ``'18+'``); any other value is looked up in US_RATINGS.  Returns None
    for None and for unknown values.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s)
146c80e2
S
1910
1911
def strip_jsonp(code):
    """Remove a JSONP wrapper such as ``callback(...);`` and return the payload.

    Trailing ``//`` comments after the call are dropped as well.  Input that
    does not look like a JSONP call is returned unchanged.
    """
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper, r'\1', code)
478c2c61
PH
1915
1916
e05f6939
PH
def js_to_json(code):
    """Convert a JavaScript object or array literal into JSON text.

    Handles single-quoted strings, unquoted and numeric keys, block
    comments, trailing commas before a closing bracket, and hexadecimal or
    octal integer literals (converted to decimal).  The contents of string
    literals are only re-escaped; they are never reinterpreted as numbers.
    """
    # Escape sequences of a JS string literal that must be rewritten for JSON
    STRING_ESCAPES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    INTEGER_TABLE = (
        (r'^0[xX][0-9a-fA-F]+', 16),
        (r'^0+[0-7]+', 8),
    )

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            # Comments and trailing commas are dropped
            return ""

        if v[0] in ("'", '"'):
            # Quoted string: fix the escapes and requote.  Return right away
            # so that text like "010" or "0x1F" is not mistaken for an
            # octal/hexadecimal number below.
            v = re.sub(
                r'(?s)\\.|"',
                lambda esc: STRING_ESCAPES.get(esc.group(0), esc.group(0)),
                v[1:-1])
            return '"%s"' % v

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(0), base)
                # A trailing colon means the number is used as an object key
                return '"%d":' % i if v.endswith(':') else '%d' % i

        # Bare identifier or decimal key
        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        (?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
e05f6939
PH
1954
1955
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function that maps a quality id to its position in
    ``quality_ids``, or to -1 when the id is not listed.
    """
    def rank(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return rank
1964
acd69589
PH
1965
# Default output template: a %-style format string with the named fields
# title, id and ext.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1967
a020a0dc
PH
1968
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than ``length`` are cut so that the result, including the
    trailing '...', is ``length`` characters long.  None is passed through.
    """
    ELLIPSES = '...'
    if s is None or len(s) <= length:
        return s
    keep = length - len(ELLIPSES)
    return s[:keep] + ELLIPSES
48844745
PH
1977
1978
def version_tuple(v):
    """Split a version string on '.' and '-' and return its parts as a tuple of ints.

    Raises ValueError when a part is not an integer.
    """
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
48844745
PH
1981
1982
def is_outdated_version(version, limit, assume_new=True):
    """Return whether ``version`` is older than ``limit``.

    When ``version`` is empty or either version cannot be parsed by
    version_tuple, the answer is ``not assume_new``.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, threshold = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < threshold
732ea2f0
PH
1990
1991
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U

    That is the case when this module was loaded through a zipimporter or
    when ``sys.frozen`` is set.
    """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
7d4111ed
PH
1997
1998
def args_to_str(args):
    """Get a short string representation for a subprocess command.

    Each argument is quoted with compat_shlex_quote and the results are
    joined with single spaces.
    """
    quoted = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
2ccd1b10
PH
2002
2003
def error_to_compat_str(err):
    """Return the message of ``err`` as a text string."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2011
2012
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    ``mt`` may carry parameters as found in a Content-Type header (e.g.
    ``'text/vtt; charset=utf-8'``); everything from the first ``;`` on is
    ignored.  Known full types and subtypes are translated through the
    tables below; any other subtype is returned as is.  Returns None when
    ``mt`` is None.
    """
    if mt is None:
        return None

    # Strip parameters first so they can neither end up in the extension nor
    # confuse the '/' split below (parameter values may contain slashes).
    mt = mt.split(';', 1)[0].strip()

    # Full types whose subtype alone would map to the wrong extension
    ext = {
        'audio/mp4': 'm4a',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }.get(res, res)
2036
2037
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of ``url_handle``.

    The filename of an ``attachment`` Content-Disposition header is
    preferred; otherwise the Content-Type header is mapped with
    mimetype2ext.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        filename_match = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if filename_match:
            ext = determine_ext(filename_match.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
05900629
PH
2050
2051
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Return a base64 ``data:`` URI for the bytes ``data`` with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2054
2055
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    Nothing is blocked when no ``age_limit`` is set or when the content has
    no limit; otherwise content is blocked when ``age_limit`` is below
    ``content_limit``.
    """
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
2064
2065
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    A leading byte order mark selects the encoding (UTF-8 otherwise).  The
    result is the match object for optional whitespace followed by ``<``,
    or None.
    """
    # Longer marks first: the UTF-32-LE BOM starts with the UTF-16-LE one
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding, offset = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, offset = enc, len(bom)
            break
    text = first_bytes[offset:].decode(encoding, 'replace')

    return re.match(r'^\s*<', text)
a055469f
PH
2084
2085
def determine_protocol(info_dict):
    """Return the protocol for ``info_dict``.

    An explicit ``protocol`` entry wins.  Otherwise the protocol is derived
    from ``info_dict['url']``: an rtmp/mms/rtsp prefix, then an m3u8 or f4m
    extension, and finally the URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2106
2107
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values

    Every column except the last is left-aligned and padded to its widest
    cell plus one; rows are joined with newlines.
    """
    rows = [header_row] + data
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*rows)]
    padded = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
347de493
PH
2114
2115
2116def _match_one(filter_part, dct):
2117 COMPARISON_OPERATORS = {
2118 '<': operator.lt,
2119 '<=': operator.le,
2120 '>': operator.gt,
2121 '>=': operator.ge,
2122 '=': operator.eq,
2123 '!=': operator.ne,
2124 }
2125 operator_rex = re.compile(r'''(?x)\s*
2126 (?P<key>[a-z_]+)
2127 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2128 (?:
2129 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2130 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2131 )
2132 \s*$
2133 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2134 m = operator_rex.search(filter_part)
2135 if m:
2136 op = COMPARISON_OPERATORS[m.group('op')]
2137 if m.group('strval') is not None:
2138 if m.group('op') not in ('=', '!='):
2139 raise ValueError(
2140 'Operator %s does not support string values!' % m.group('op'))
2141 comparison_value = m.group('strval')
2142 else:
2143 try:
2144 comparison_value = int(m.group('intval'))
2145 except ValueError:
2146 comparison_value = parse_filesize(m.group('intval'))
2147 if comparison_value is None:
2148 comparison_value = parse_filesize(m.group('intval') + 'B')
2149 if comparison_value is None:
2150 raise ValueError(
2151 'Invalid integer value %r in filter part %r' % (
2152 m.group('intval'), filter_part))
2153 actual_value = dct.get(m.group('key'))
2154 if actual_value is None:
2155 return m.group('none_inclusive')
2156 return op(actual_value, comparison_value)
2157
2158 UNARY_OPERATORS = {
2159 '': lambda v: v is not None,
2160 '!': lambda v: v is None,
2161 }
2162 operator_rex = re.compile(r'''(?x)\s*
2163 (?P<op>%s)\s*(?P<key>[a-z_]+)
2164 \s*$
2165 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2166 m = operator_rex.search(filter_part)
2167 if m:
2168 op = UNARY_OPERATORS[m.group('op')]
2169 actual_value = dct.get(m.group('key'))
2170 return op(actual_value)
2171
2172 raise ValueError('Invalid filter part %r' % filter_part)
2173
2174
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false

    ``filter_str`` is split on ``&`` and every part must hold for ``dct``.
    """
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2180
2181
def match_filter_func(filter_str):
    """Build a match filter callback from ``filter_str``.

    The returned function takes an info dict and returns None when it passes
    the filter, or a message explaining that the video is skipped.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
91410c9b
PH
2190
2191
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP time expression into seconds (float).

    Supported forms are an offset such as ``12.5`` or ``12.5s`` and a clock
    value ``H:MM:SS`` whose seconds may carry a fraction after ``.`` or
    ``:``.  Returns None for an empty or unrecognised expression.
    """
    if not time_expr:
        return

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
bf6427d2
YCH
2203
2204
c1c924ab
YCH
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode ``HH:MM:SS,mmm``."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d truncates each component towards zero
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
bf6427d2
YCH
2207
2208
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle text into SRT text.

    Every ``<p>`` element with a parsable ``begin`` and either an ``end`` or
    a ``dur`` attribute becomes one SRT cue, numbered by its position among
    all paragraphs.  Raises ValueError when the document has no ``<p>``
    elements.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Parser target that collects the text of a paragraph, turning
        # <br> elements into newlines
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Re-serialise the node and feed it through the text collector
        collector = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=collector)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    paras = (
        dfxp.findall(_x('.//ttml:p')) or
        dfxp.findall(_x('.//ttaf1:p')) or
        dfxp.findall(_x('.//ttaf1_0604:p')) or
        dfxp.findall('.//p'))

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    for index, para in enumerate(paras, 1):
        attrs = para.attrib
        begin_time = parse_dfxp_time_expr(attrs.get('begin'))
        end_time = parse_dfxp_time_expr(attrs.get('end'))
        dur = parse_dfxp_time_expr(attrs.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # No usable end time: derive it from the duration if there is one
            if not dur:
                continue
            end_time = begin_time + dur
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(cues)
2262
2263
66e289ba
S
def cli_option(params, command_option, param):
    """Return ``[command_option, value]`` for ``params[param]``, or ``[]`` when it is unset or None."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2267
2268
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render the boolean ``params[param]`` as a command line option.

    Returns ``[command_option, value]``, or a single
    ``command_option + separator + value`` item when ``separator`` is given,
    where value is ``true_value`` or ``false_value``.  Asserts that the
    parameter is a bool.
    """
    flag = params.get(param)
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
2275
2276
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return ``[command_option]`` when ``params[param]`` equals ``expected_value``, else ``[]``."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2280
2281
def cli_configuration_args(params, param, default=None):
    """Return the list of extra command line arguments stored in ``params[param]``.

    When the parameter is unset or None, ``default`` is returned, or a new
    empty list if no default was given.  A fresh list is created on every
    call so that callers extending the result cannot affect later calls.
    Asserts that a configured value is a list.
    """
    ex_args = params.get(param)
    if ex_args is None:
        return [] if default is None else default
    assert isinstance(ex_args, list)
    return ex_args
2288
2289
39672624
YCH
class ISO639Utils(object):
    """Conversion between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of ``code`` are looked up; None is
        returned for an unknown code.
        """
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Returns None when ``code`` is not in the table.
        """
        matches = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matches, None)
2490
2491
4eb10f66
YCH
class ISO3166Utils(object):
    """Lookup of country names by two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter country code to the corresponding full name.

        The lookup is case-insensitive; None is returned for an unknown code.
        """
        country_code = code.upper()
        return cls._country_map.get(country_code)
2750
2751
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets an individual request pick its own proxy.

    A request can name its proxy in the ``Ytdl-request-proxy`` header; that
    header is consumed here and takes precedence over the configured proxy.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        for scheme in ('http', 'https'):
            handler = (
                lambda request, proxy='__noproxy__', scheme=scheme, opener=self.proxy_open:
                opener(request, proxy, scheme))
            setattr(self, '%s_open' % scheme, handler)
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            proxy = per_request_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5bc880b9
YCH
2775
2776
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The bytes are reversed before being read as one big hexadecimal number
    reversed_hex = binascii.hexlify(data[::-1])
    plaintext = int(reversed_hex, 16)
    return '%x' % pow(plaintext, exponent, modulus)
81bdc8fd
YCH
2792
2793
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer ``num`` in base ``n``.

    ``table`` supplies the digit characters; by default the first ``n``
    characters of 0-9, a-z, A-Z are used.

    Raises ValueError when ``n`` exceeds the table length, when ``num`` is
    negative, or when ``n`` is below 2 for a non-zero ``num`` (the last two
    cases would otherwise never terminate).
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError('base %d is too small to encode %d' % (n, num))

    # Collect digits least significant first, then reverse
    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
f52354a8
YCH
2810
2811
def decode_packed_codes(code):
    """Unpack JavaScript packed in the ``}('...',base,count,'a|b|c'.split('|')`` form.

    Every word of the packed payload is replaced by the symbol whose index,
    written in base ``base``, equals that word; an empty symbol stands for
    the word itself.
    """
    mobj = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    packed_code, base, count, symbols = mobj.groups()
    base = int(base)
    symbols = symbols.split('|')

    symbol_table = {}
    for index in reversed(range(int(count))):
        key = encode_base_n(index, base)
        symbol_table[key] = symbols[index] or key

    return re.sub(
        r'\b(\w+)\b', lambda word: symbol_table[word.group(0)],
        packed_code)