]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[spankbang:playlist] Add extractor (closes #19145)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd 1#!/usr/bin/env python
dcdb292f 2# coding: utf-8
d77c3dfd 3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
1e399778 6import base64
5bc880b9 7import binascii
912b38b4 8import calendar
676eb3f2 9import codecs
62e609ab 10import contextlib
e3946f98 11import ctypes
c496ca96
PH
12import datetime
13import email.utils
0c265486 14import email.header
f45c185f 15import errno
be4a824d 16import functools
d77c3dfd 17import gzip
03f9daab 18import io
79a2e94e 19import itertools
f4bfd65f 20import json
d77c3dfd 21import locale
02dbf93f 22import math
347de493 23import operator
d77c3dfd 24import os
c496ca96 25import platform
773f291d 26import random
d77c3dfd 27import re
c496ca96 28import socket
79a2e94e 29import ssl
1c088fa8 30import subprocess
d77c3dfd 31import sys
181c8655 32import tempfile
01951dda 33import traceback
bcf89ce6 34import xml.etree.ElementTree
d77c3dfd 35import zlib
d77c3dfd 36
8c25f81b 37from .compat import (
b4a3d461 38 compat_HTMLParseError,
8bb56eee 39 compat_HTMLParser,
8f9312c3 40 compat_basestring,
8c25f81b 41 compat_chr,
1bab3437 42 compat_cookiejar,
d7cd9a9e 43 compat_ctypes_WINFUNCTYPE,
36e6f62c 44 compat_etree_fromstring,
51098426 45 compat_expanduser,
8c25f81b 46 compat_html_entities,
55b2f099 47 compat_html_entities_html5,
be4a824d 48 compat_http_client,
c86b6142 49 compat_kwargs,
efa97bdc 50 compat_os_name,
8c25f81b 51 compat_parse_qs,
702ccf2d 52 compat_shlex_quote,
8c25f81b 53 compat_str,
edaa23f8 54 compat_struct_pack,
d3f8e038 55 compat_struct_unpack,
8c25f81b
PH
56 compat_urllib_error,
57 compat_urllib_parse,
15707c7e 58 compat_urllib_parse_urlencode,
8c25f81b 59 compat_urllib_parse_urlparse,
7581bfc9 60 compat_urllib_parse_unquote_plus,
8c25f81b
PH
61 compat_urllib_request,
62 compat_urlparse,
810c10ba 63 compat_xpath,
8c25f81b 64)
4644ac55 65
71aff188
YCH
66from .socks import (
67 ProxyType,
68 sockssocket,
69)
70
4644ac55 71
51fb4995
YCH
def register_socks_protocols():
    """Add the SOCKS URL schemes to urlparse's list of netloc-bearing schemes.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs with protocols not in
    urlparse.uses_netloc are not handled correctly.
    """
    netloc_schemes = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme in netloc_schemes:
            # Already registered; do not add a duplicate entry
            continue
        netloc_schemes.append(scheme)
79
80
468e2e92
FV
# The type of a compiled regular expression is not clearly defined
# otherwise, so take it from a throwaway pattern
compiled_regex_type = type(re.compile(''))

# Standard HTTP request headers
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Alternative User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Sentinel for "no default supplied", so that None stays usable as a
# real default value
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names (January first) keyed by language code
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# File extensions (without the leading dot) treated as known
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode: maps each accented
# character to its ASCII replacement (one or two characters long)
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
c587cbb7 130
46f59e89
S
# strptime()-style date formats that do not depend on whether the day or
# the month is written first.
# The English ordinal suffixes are spelled out as literal text, one format
# per suffix ("1st", "2nd", "3rd", "4th").
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
)

# The common formats followed by the numeric day-before-month variants
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# The common formats followed by the numeric month-before-day variants
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
185
# Captures four groups from text of the form
# }('<payload>',<int>,<int>,'<a|b|c>'.split('|')
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Captures the body of a <script type="application/ld+json"> element in
# the named group json_ld
JSON_LD_RE = r'(?is)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
06b3fe29 188
7105440c 189
def preferredencoding():
    """Get preferred encoding.

    Returns locale.getpreferredencoding() when that names a codec that can
    actually encode text, and 'UTF-8' otherwise.
    """
    try:
        candidate = locale.getpreferredencoding()
        # Probe the codec: an unknown or broken encoding name raises here
        'TEST'.encode(candidate)
    except Exception:
        return 'UTF-8'
    return candidate
d77c3dfd 203
f4bfd65f 204
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created next to fn,
    which is then renamed over fn. If anything fails, the temporary file
    is removed and the exception is re-raised.
    """

    fn = encodeFilename(fn)
    running_py2 = sys.version_info < (3, 0)

    if running_py2 and sys.platform != 'win32':
        # os.path.basename/dirname return bytes objects here, but
        # NamedTemporaryFile will fail if the filename contains non ascii
        # characters unless we use unicode objects
        fs_encoding = get_filesystem_encoding()
        tmp_prefix = os.path.basename(fn).decode(fs_encoding)
        tmp_dir = os.path.dirname(fn).decode(fs_encoding)
    else:
        tmp_prefix = os.path.basename(fn)
        tmp_dir = os.path.dirname(fn)

    tmp_options = {
        'suffix': '.tmp',
        'prefix': tmp_prefix + '.',
        'dir': tmp_dir,
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if running_py2:
        tmp_options['mode'] = 'wb'
    else:
        tmp_options['mode'] = 'w'
        tmp_options['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(tmp_options))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temporary file before propagating
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
257
258
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val]

        With val left as None, any element carrying the attribute matches.
        """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] by filtering the matches by hand """
        for candidate in node.findall(compat_xpath(xpath)):
            if key in candidate.attrib and (val is None or candidate.attrib.get(key) == val):
                return candidate
        return None
273
d7e66d39
JMF
274# On python2.6 the xml.etree.ElementTree.Element methods don't support
275# the namespace parameter
5f6a1245
JW
276
277
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand every prefix:tag step of path to {namespace}tag.

    ns_map maps prefixes to namespace names; steps without a prefix are
    kept as they are.
    """
    def _expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(_expand(step) for step in path.split('/'))
288
d77c3dfd 289
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element of node matching xpath.

    xpath is either a single xpath string or an iterable of xpath strings
    that are tried in order until one matches.

    When nothing matches: default is returned if one was given; otherwise
    ExtractorError is raised if fatal is set (mentioning name, or the xpath
    itself when name is None); otherwise None is returned.
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        # Start from "not found" so that an empty iterable of candidates
        # takes the regular not-found path instead of leaving n unbound
        n = None
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
311
312
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element found by xpath_element().

    A missing element is handled by xpath_element(). An element without
    text yields default if one was given, raises ExtractorError if fatal
    is set, and yields None otherwise.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        # Nothing was found (None, or the default handed back as is)
        return element

    text = element.text
    if text is not None:
        return text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
a41fb80c
S
326
327
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element at xpath that has it.

    When there is no such element: default is returned if one was given;
    otherwise ExtractorError is raised if fatal is set; otherwise None is
    returned.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if not fatal:
        return None
    name = '%s[@%s]' % (xpath, key) if name is None else name
    raise ExtractorError('Could not find XML attribute %s' % name)
bf0ff932
PH
339
340
def get_element_by_id(id, html):
    """Return the content of the first tag whose id attribute is the given ID, or None"""
    # An ID lookup is an attribute lookup on "id"
    return get_element_by_attribute('id', id, html)
43e8fafd 344
12ea2f30 345
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document, or None"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
350
351
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag with the specified attribute value in the passed HTML document, or None"""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if not matches:
        return None
    return matches[0]
355
356
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # Match class_name as a whole word anywhere inside the class attribute
    # value; this is a regex, so it must not be escaped again downstream
    class_value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute(
        'class', class_value_re, html, escape_value=False)
362
363
def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of all tags with the specified attribute value in the passed HTML document as a list

    value is matched literally unless escape_value is false, in which case
    it is inserted into the pattern as a regular expression.
    """

    value_re = re.escape(value) if escape_value else value

    element_re = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value_re)

    contents = []
    for match in re.finditer(element_re, html):
        content = match.group('content')

        # Content starting with a quote loses its first and last character
        if content.startswith(('"', "'")):
            content = content[1:-1]

        contents.append(unescapeHTML(content))

    return contents
a921f407 387
c5229f39 388
8bb56eee
BF
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element

    After feeding, attrs holds the attributes of the most recently seen
    start tag as a dict.
    """

    def __init__(self):
        # Stays empty if no start tag is ever parsed
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs arrives as (name, value) pairs
        self.attrs = dict(attrs)
397
c5229f39 398
8bb56eee
BF
def extract_attributes(html_element):
    """Decode the attributes of a single HTML element into a dictionary.

    Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    the result is
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': "'"
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attr_parser = HTMLAttributeParser()
    try:
        attr_parser.feed(html_element)
        attr_parser.close()
    except compat_HTMLParseError:
        # Older Python may throw HTMLParseError in case of malformed HTML;
        # whatever was gathered before the error is still returned
        pass
    return attr_parser.attrs
9e6dd238 423
c5229f39 424
def clean_html(html):
    """Clean an HTML snippet into a readable string

    None is passed through unchanged.
    """

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Source newlines carry no meaning; only markup produces line breaks
    text = html.replace('\n', ' ')
    substitutions = (
        # <br> and paragraph boundaries become newlines
        (r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip the remaining html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
440
441
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    '-' stands for standard output. If opening the file fails for a reason
    other than EACCES, the name is passed through sanitize_path() and the
    open is retried once with the result; if sanitizing changes nothing,
    the original error is re-raised.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            # Prefer the underlying binary buffer where stdout has one
            return (getattr(sys.stdout, 'buffer', sys.stdout), filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        if err.errno == errno.EACCES:
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
d77c3dfd
FV
472
473
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 481
5f6a1245 482
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        code = ord(char)
        # Question marks and control characters are dropped
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(char) for char in s)
    if is_id:
        # IDs only get the per-character treatment
        return result

    # Collapse runs of underscores and drop them from both ends
    result = re.sub(r'__+', '_', result).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
    return result if result else '_'
d77c3dfd 522
5f6a1245 523
a2aaf4db
S
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On every other platform the path is returned unchanged.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc = os.path.splitdrive(s)[0]
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc = os.path.splitunc(s)[0]
    parts = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        parts.pop(0)
    cleaned_parts = []
    for part in parts:
        if part not in ('.', '..'):
            # Replace the characters matched below, plus a trailing
            # whitespace character or dot, with '#'
            part = re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', part)
        cleaned_parts.append(part)
    if drive_or_unc:
        cleaned_parts.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*cleaned_parts)
540
541
def sanitize_url(url):
    """Repair a few common defects in url; anything else is returned as is"""
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/rg3/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        # The patterns are anchored at the start, so at most one
        # substitution can happen; only the first applicable fix is made
        fixed_url, hits = re.subn(mistake, fixup, url)
        if hits:
            return fixed_url
    return url
17bcc626
S
558
559
def sanitized_Request(url, *args, **kwargs):
    """Create a Request for url after passing it through sanitize_url()

    All further arguments are handed to the Request constructor untouched.
    """
    cleaned_url = sanitize_url(url)
    return compat_urllib_request.Request(cleaned_url, *args, **kwargs)
67dda517
S
562
563
51098426
S
def expand_path(s):
    """Expand shell variables and ~"""
    # The home directory is expanded first, environment variables second
    with_home = compat_expanduser(s)
    return os.path.expandvars(with_home)
567
568
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of every element, in the
    original order. Elements are compared with ==, so they need not be
    hashable.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 576
912b38b4 577
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character.

    The argument is the entity without the leading '&' but including the
    trailing ';'. Entities that cannot be resolved are returned in their
    literal '&...;' form.
    """
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    codepoints = compat_html_entities.name2codepoint
    if entity in codepoints:
        return compat_chr(codepoints[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric character reference, decimal or (lower-case x) hexadecimal
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base, numstr = 16, '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
4e408e47
PH
607
608
def unescapeHTML(s):
    """Replace the HTML entities in s with the characters they stand for

    None is passed through unchanged.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def _decode(match):
        # group(1) is the entity without '&' but with the trailing ';'
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^&;]+;)', _decode, s)
d77c3dfd 616
8bf48f23 617
aa49acd1
S
def get_subprocess_encoding():
    """Return the name of the encoding to use for subprocess arguments"""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    # Fall back to UTF-8 when no encoding could be determined
    return 'utf-8' if encoding is None else encoding
628
629
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file

    Returns s unchanged wherever unicode filenames can be used directly;
    otherwise returns it encoded with get_subprocess_encoding(), ignoring
    characters that cannot be encoded.
    """

    assert type(s) == compat_str

    keep_unicode = (
        # Python 3 has a Unicode API
        sys.version_info >= (3, 0)
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        or (not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)
        # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
        or sys.platform.startswith('java'))
    if keep_unicode:
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
652
653
def decodeFilename(b, for_subprocess=False):
    """Decode a byte-string filename with get_subprocess_encoding()

    On Python 3, and for anything that is not a bytes object, the argument
    is returned unchanged.
    """

    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 663
f07b74fc
PH
664
def encodeArgument(s):
    """Encode a command line argument the way encodeFilename() does for subprocesses"""
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        text = s.decode('ascii')
    return encodeFilename(text, True)
672
673
aa49acd1
S
def decodeArgument(b):
    """Decode a command line argument; see decodeFilename() with for_subprocess set"""
    return decodeFilename(b, for_subprocess=True)
676
677
8271226a
PH
def decodeOption(optval):
    """Return an option value as text, decoding bytes with the preferred encoding

    None is passed through unchanged.
    """
    if optval is None:
        return optval

    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 686
5f6a1245 687
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as H:MM:SS, M:SS or plain seconds.

    The shortest form that fits is used: durations of at least an hour
    include the hours, durations of at least a minute include the minutes.
    """
    # The boundaries are inclusive so that exactly one hour is rendered as
    # '1:00:00' (not '60:00') and exactly one minute as '1:00' (not '60')
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
695
a0ddb8a2 696
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler for the given params.

    Certificate verification is switched off when params has a true
    'nocheckcertificate' entry. Extra keyword arguments are passed on to
    the handler.
    """
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 720
732ea2f0 721
08f2a92c
JMF
def bug_reports_message():
    """Return the standard "please report this issue" text appended to error messages"""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return ''.join((
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ))
731
732
bf5b9d85
PM
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors.

    Adds nothing to Exception; it is the common base class of the error
    classes defined in this module.
    """
736
737
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        video_id, if given, is prepended to the message; cause, if given, is appended to it.
        """

        # Errors raised while one of these exceptions is being handled are
        # always treated as expected
        expected_exc_types = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_exc_types:
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors ask the user to file a bug report
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none"""
        tb = self.traceback
        if tb is None:
            return None
        return ''.join(traceback.format_tb(tb))
01951dda 765
1c256f70 766
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected error for a URL that is not supported; the URL is kept in self.url"""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
772
773
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match

    Behaves exactly like ExtractorError; only the type differs.
    """
777
778
773f291d
S
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.

    It is always raised as an expected error. The message and the optional
    countries value are kept in self.msg and self.countries.
    """

    def __init__(self, msg, countries=None):
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        self.countries = countries
        self.msg = msg
789
790
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept so that callers can inspect the underlying exception
        self.exc_info = exc_info
d77c3dfd
FV
803
804
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
812
813
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
        # The message is also exposed as an attribute
        self.msg = msg
d77c3dfd 824
5f6a1245 825
class MaxDownloadsReached(YoutubeDLError):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
829
830
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
838
839
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        message = 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        super(ContentTooShortError, self).__init__(message)
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 855
5f6a1245 856
class XAttrMetadataError(YoutubeDLError):
    """Error while writing extended attribute (xattr) metadata.

    code is an errno value if known and msg the error text. Both are kept
    as attributes, and self.reason is set to one of 'NO_SPACE',
    'VALUE_TOO_LONG' or 'NOT_SUPPORTED' based on them.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        # ('Disk quota exceeded' is the text of EDQUOT; the previously
        # matched spelling 'excedded' could never occur in a real message)
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
871
872
class XAttrUnavailableError(YoutubeDLError):
    """Extended attribute (xattr) support is not available."""
875
876
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class, honouring the handler's 'source_address' parameter.

    When ydl_handler._params contains a 'source_address', the connection is
    set up to bind to that address and to connect only to remote addresses
    of the same IP family.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is None:
        return hc

    # This is to workaround _create_connection() from socket where it will try all
    # address data from getaddrinfo() including IPv6. This filters the result from
    # getaddrinfo() based on the source_address value.
    # This is based on the cpython socket.create_connection() function.
    # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
    def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
        host, port = address
        last_error = None
        candidates = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
        # A dot in the source address marks it as IPv4
        wanted_family = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
        usable = [info for info in candidates if info[0] == wanted_family]
        if candidates and not usable:
            ip_version = 'v4' if wanted_family == socket.AF_INET else 'v6'
            raise socket.error(
                "No remote IP%s addresses available for connect, can't use '%s' as source address"
                % (ip_version, source_address[0]))
        for family, socktype, proto, _canonname, remote in usable:
            sock = None
            try:
                sock = socket.socket(family, socktype, proto)
                if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                    sock.settimeout(timeout)
                sock.bind(source_address)
                sock.connect(remote)
                last_error = None  # Explicitly break reference cycle
                return sock
            except socket.error as exc:
                # Remember the failure and move on to the next address
                last_error = exc
                if sock is not None:
                    sock.close()
        if last_error is not None:
            raise last_error
        raise socket.error('getaddrinfo returns an empty list')

    if hasattr(hc, '_create_connection'):
        hc._create_connection = _create_connection
    sa = (source_address, 0)
    if hasattr(hc, 'source_address'):  # Python 2.7+
        hc.source_address = sa
    else:  # Python 2.6
        def _hc_connect(self, *args, **kwargs):
            sock = _create_connection(
                (self.host, self.port), self.timeout, sa)
            if is_https:
                self.sock = ssl.wrap_socket(
                    sock, self.key_file, self.cert_file,
                    ssl_version=ssl.PROTOCOL_TLSv1)
            else:
                self.sock = sock
        hc.connect = functools.partial(_hc_connect, hc)

    return hc
940
941
def handle_youtubedl_headers(headers):
    """Apply the internal "Youtubedl-no-compression" marker to a header dict.

    When the marker header is absent the very same mapping is returned
    untouched. Otherwise a new dict is returned that contains neither the
    marker nor any Accept-Encoding header (matched case-insensitively).
    """
    marker = 'Youtubedl-no-compression'
    if marker not in headers:
        return headers

    kept = {}
    for name, value in headers.items():
        if name.lower() == 'accept-encoding':
            continue
        kept[name] = value
    # The marker is internal only and must not reach the server
    del kept[marker]
    return kept
87f0e62d
YCH
950
951
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store params on the handler; remaining arguments go to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req, going through a SOCKS connection class when the
        internal 'Ytdl-socks-proxy' header is set on the request."""
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            # Internal header, must not be sent to the server
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and falling
        back to a zlib-wrapped one."""
        try:
            # Negative wbits: raw deflate stream without zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        """Prepare an outgoing request: escape the URL, add the standard
        headers that are missing and process the internal headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Post-process a response: transparently decompress gzip/deflate
        bodies and percent-encode the Location header of redirects."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with up to 1023 trailing bytes cut off, one at a time
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets exactly the same pre- and post-processing
    https_request = http_request
    https_response = http_response
bf50b038 1073
5de90176 1074
71aff188
YCH
def make_socks_conn_class(base_class, socks_proxy):
    """Build a connection class that tunnels through a SOCKS proxy.

    base_class must be (a subclass of) HTTPConnection or HTTPSConnection.
    socks_proxy is a proxy URL whose scheme selects the protocol: socks5,
    socks4a, or socks/socks4. The port defaults to 1080; user name and
    password, when present, are percent-decoded.

    Returns a subclass of base_class whose connect() goes through the proxy.
    Raises ValueError for any other scheme (previously this surfaced as an
    UnboundLocalError further down).
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        raise ValueError(
            'Unsupported SOCKS proxy scheme: %r' % url_components.scheme)

    def unquote_if_non_empty(s):
        # Keep None/'' as they are so missing credentials stay missing
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1116
1117
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that opens connections via _create_http_connection
    and can route them through a SOCKS proxy."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._params = params
        if not https_conn_class:
            https_conn_class = compat_http_client.HTTPSConnection
        self._https_conn_class = https_conn_class

    def https_open(self, req):
        # Only forward the TLS settings that this handler actually carries
        open_kwargs = {}
        if hasattr(self, '_context'):  # python > 2.6
            open_kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            open_kwargs['check_hostname'] = self._check_hostname

        connection_class = self._https_conn_class
        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            connection_class = make_socks_conn_class(connection_class, proxy_url)
            # Internal header, strip it before the request is sent
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, connection_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
be4a824d
PH
1141
1142
1bab3437
S
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """MozillaCookieJar that round-trips session cookies as `expires=0`."""

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save the jar, writing session cookies with `expires` set to 0.

        The in-memory cookies are left as they were: `expires` is only
        switched to 0 while the file is written and restored to None
        afterwards. Otherwise the session cookies would count as expired
        (expires <= now) for the rest of the jar's life.
        """
        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        session_cookies = [cookie for cookie in self if cookie.expires is None]
        for cookie in session_cookies:
            cookie.expires = 0
        try:
            compat_cookiejar.MozillaCookieJar.save(self, filename, ignore_discard, ignore_expires)
        finally:
            for cookie in session_cookies:
                cookie.expires = None

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load the jar and turn `expires=0` entries back into session cookies."""
        compat_cookiejar.MozillaCookieJar.load(self, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1168
1169
a6420bf5
S
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also handles HTTPS requests and responses."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Delegate to HTTPCookieProcessor.http_response unchanged (the
        Set-Cookie escaping below is currently disabled)."""
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Reuse the plain-HTTP hooks for HTTPS
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1192
1193
46f59e89
S
def extract_timezone(date_str):
    """Split a trailing UTC offset off a date string.

    Recognizes a final "Z" or "+HH:MM"/"-HHMM" style offset (optionally
    preceded by one space) that follows at least 8 other characters.
    Returns (offset as datetime.timedelta, date_str without the suffix);
    with no recognized suffix the offset is zero and date_str is unchanged.
    """
    tz_match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(tz_match.group('tz'))]
    sign = tz_match.group('sign')
    if not sign:
        # Plain "Z" suffix: UTC
        return datetime.timedelta(), remainder

    direction = -1 if sign == '-' else 1
    offset = datetime.timedelta(
        hours=direction * int(tz_match.group('hours')),
        minutes=direction * int(tz_match.group('minutes')))
    return offset, remainder
1210
1211
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    date_str is expected as YYYY-MM-DD<delimiter>HH:MM:SS; fractional
    seconds are dropped. timezone is the UTC offset as a timedelta; when
    None it is taken from the end of date_str. Returns None when date_str
    is None or does not match the format.
    """
    if date_str is None:
        return None

    # The strptime layout below has no room for fractional seconds
    cleaned = re.sub(r'\.[0-9]+', '', date_str)
    if timezone is None:
        timezone, cleaned = extract_timezone(cleaned)

    try:
        layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_time = datetime.datetime.strptime(cleaned, layout) - timezone
        return calendar.timegm(utc_time.timetuple())
    except ValueError:
        return None
912b38b4
PH
1229
1230
46f59e89
S
def date_formats(day_first=True):
    """Return the day-first or the month-first collection of date formats."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1233
1234
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    result = None
    for fmt in date_formats(day_first):
        try:
            # No early exit: a later matching format overrides an earlier one
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Last resort: RFC 2822 style dates
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
bf50b038 1261
5f6a1245 1262
46f59e89
S
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a loosely formatted date string.

    Tries every format from date_formats(day_first) and then falls back to
    email.utils.parsedate_tz. A trailing numeric UTC offset is honoured and
    AM/PM markers are applied by hand. Returns None when date_str is None
    or nothing matches.

    12-hour clock edge cases are handled: "12:xx PM" stays at noon and
    "12:xx AM" maps to midnight (previously PM always added 12 hours and
    AM was ignored).
    """
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    is_pm = re.search(r'(?i)PM', date_str) is not None
    # AM only matters for hour 12; require it to follow a digit so that
    # ordinary words containing "am" are not mistaken for the marker
    is_am = not is_pm and re.search(r'(?i)\d\s*AM\b', date_str) is not None

    def meridiem_shift(hour):
        # The AM/PM marker is stripped before parsing, so the parsed hour
        # is taken at face value and has to be corrected here
        if is_pm:
            return 0 if hour == 12 else 12
        if is_am and hour == 12:
            return -12
        return 0

    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            parsed = datetime.datetime.strptime(date_str, expression)
            dt = parsed - timezone + datetime.timedelta(hours=meridiem_shift(parsed.hour))
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # timetuple[3] is the hour field
        return calendar.timegm(timetuple) + meridiem_shift(timetuple[3]) * 3600
    return None
46f59e89
S
1294
1295
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL.

    Looks at the text after the last dot in the part of url before '?'.
    Returns default_ext when url is None, has no dot, or the candidate is
    neither purely alphanumeric nor (after dropping trailing slashes) one
    of KNOWN_EXTENSIONS.
    """
    if url is None or '.' not in url:
        return default_ext

    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
73e79f2a 1307
5f6a1245 1308
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the last extension of filename with "<sub_lang>.<sub_format>"."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
d4051a8e 1311
5f6a1245 1312
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD,
    'now', 'today', 'yesterday' or
    (now|today)[+-][0-9]+(day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    relative = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    # A bad approximation? Months count as 30 days and years as 365
    days_per_unit = {'day': 1, 'week': 7, 'month': 30, 'year': 365}
    span = datetime.timedelta(days=amount * days_per_unit[relative.group('unit')])
    return today + span
5f6a1245
JW
1340
1341
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    anything else is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '%s-%s-%s' % parts.groups()
1350
5f6a1245 1351
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; None means unbounded on that side.

        Raises ValueError when start lies after end."""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(start=day, end=day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Strings are accepted too and converted first
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return ' - '.join((self.start.isoformat(), self.end.isoformat()))
c496ca96
PH
1381
1382
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Decode byte strings with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
1391
1392
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps the file descriptor (1 = stdout, 2 = stderr) to the Win32
    # GetStdHandle id (STD_OUTPUT_HANDLE = -11, STD_ERROR_HANDLE = -12)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True unless the handle is a character device for which
        # GetConsoleMode succeeds
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call, stopping before the next
        # non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; it is written
        # on its own with a length of 2
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1466
1467
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On Windows consoles the native console writer is tried first (only
    when no explicit encoding is requested). Byte-oriented streams get s
    encoded with encoding or a detected fallback, dropping characters that
    cannot be encoded.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(stream, 'fileno'):
        if _windows_write_string(s, stream):
            return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        # Text stream on top of a binary one: bypass the text layer
        charset = encoding or getattr(stream, 'encoding', None) or preferredencoding()
        stream.buffer.write(s.encode(charset, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
1488
1489
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a byte string (or any sequence of one-character strings or
    ints) into a list of integers; empty input gives []."""
    if not bs:
        return []
    # Indexing bytes yields ints on Python 3 and 1-char strings on Python 2
    return list(bs) if isinstance(bs[0], int) else list(map(ord, bs))
1497
c257baff 1498
def intlist_to_bytes(xs):
    """Pack a sequence of integers into a byte string, one unsigned byte
    per item; empty input gives b''."""
    if xs:
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
c38b1e77
PH
1503
1504
c1c9a79c
PH
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx on Windows, fcntl.flock elsewhere, and
# stubs raising IOError where fcntl is missing.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure passed to LockFileEx/UnlockFileEx; Offset/OffsetHigh
        # give the start of the byte range
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count, chosen to cover the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the whole file f; raises OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file needs the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock on f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
1578
1579
class locked_file(object):
    """File wrapper that holds a file lock while used as a context manager.

    The file is opened in the constructor; entering the context takes a
    shared lock for mode 'r' and an exclusive one for 'a'/'w', leaving it
    unlocks and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.mode = mode
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the handle opened in __init__
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1609
1610
4644ac55
S
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1614
1615
def shell_quote(args):
    """Quote every argument for the shell and join them with spaces."""
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join(compat_shlex_quote(as_text(arg)) for arg in args)
9d4660ca
PH
1625
1626
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded into a '#__youtubedl_smuggle=...' fragment. Data
    already smuggled into url is kept and wins over equal keys in data.
    The dict passed as data is not modified.
    """
    url, idata = unsmuggle_url(url, {})
    # Merge into a copy: the caller's dict must not be changed as a side effect
    payload = dict(data)
    payload.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(payload)})
    return url + '#' + sdata
9d4660ca
PH
1635
1636
def unsmuggle_url(smug_url, default=None):
    """Undo smuggle_url: return (url, data).

    When smug_url carries no smuggled data it is returned unchanged
    together with default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, fragment = smug_url.rpartition('#')
        encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return url, json.loads(encoded)
    return smug_url, default
02dbf93f
PH
1644
1645
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. '1.50KiB'.

    bytes may be a number or a numeric string; None gives 'N/A'. The unit
    is clamped to the B..YiB range, so tiny positive values are shown in
    bytes and values of 1024 YiB or more in YiB (previously these picked a
    wrong suffix or raised IndexError). Negative values still raise
    ValueError from math.log.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)

    suffixes = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB')
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the index inside the suffix table
        exponent = min(max(exponent, 0), len(suffixes) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
f53c966a 1658
1c088fa8 1659
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse "<number> <unit>" at the start of s using unit_table.

    unit_table maps unit names to multipliers. The number may use ',' or
    '.' as decimal separator. Returns int(number * multiplier), or None
    when s does not start with a number followed by a known unit.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives
    found = re.match(pattern, s)
    if found is None:
        return None
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1669
1670
be64b5b0
PH
def parse_filesize(s):
    """Parse a human readable file size such as '1.5 MiB' into bytes.

    Returns None when s is None or not recognized.
    """
    if s is None:
        return None

    # (symbol letter, decimal name prefix, binary name prefix), in order of
    # increasing power
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1, 'bytes': 1}
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        lower = letter.lower()
        unit_table[letter + 'iB'] = binary
        unit_table[letter + 'B'] = decimal
        unit_table[lower + 'B'] = binary
        unit_table[letter + 'b'] = decimal
        unit_table[lower + 'b'] = decimal
        unit_table[decimal_name + 'bytes'] = decimal
        unit_table[binary_name + 'bytes'] = binary

    return lookup_unit_table(unit_table, s)
1740
1741
def parse_count(s):
    """Parse a count such as '1,234' or '1.5M' into an integer.

    Returns None when s is None or not recognized.
    """
    if s is None:
        return None

    text = s.strip()

    # Plain numbers, possibly with separators, need no unit handling
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    thousand, million = 1000, 1000 ** 2
    multipliers = {
        'k': thousand,
        'K': thousand,
        'm': million,
        'M': million,
        'kk': million,
        'KK': million,
    }
    return lookup_unit_table(multipliers, text)
be64b5b0 1761
2f7ae819 1762
b871d7e9
S
def parse_resolution(s):
    """Extract video dimensions from a string.

    Understands "<width>x<height>" (also with 'X' or '×'), "<height>p" /
    "<height>i" and "4k" / "8k". Returns a dict with 'width' and/or
    'height', or {} when s is None or nothing matches.
    """
    if s is None:
        return {}

    dimensions = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
    if dimensions:
        width, height = dimensions.group('w', 'h')
        return {'width': int(width), 'height': int(height)}

    lines = re.search(r'\b(\d+)[pPiI]\b', s)
    if lines:
        return {'height': int(lines.group(1))}

    kilo = re.search(r'\b([48])[kK]\b', s)
    if kilo:
        # 4k -> 2160 lines, 8k -> 4320 lines
        return {'height': 540 * int(kilo.group(1))}

    return {}
1783
1784
def month_by_name(name, lang='en'):
    """ Return the number of a month by its full name in the given language
    (English names are used for unknown languages); None if not found """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        position = names.index(name)
    except ValueError:
        return None
    return position + 1
1794
1795
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations, i.e. the first three letters of its name; None if
    there is no such month """
    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
18258362
JMF
1804
1805
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity by '&amp;' in XML

    The five predefined entities and numeric character references
    (decimal or hexadecimal, of any length) are left untouched. Longer
    references such as '&#128512;' used to be escaped by mistake.
    """
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]*;|#[0-9]*;)',
        '&amp;',
        xml_str)
e3946f98
PH
1812
1813
def setproctitle(title):
    """Best-effort: set the process name through libc's prctl.

    Silently does nothing on Jython, when libc.so.6 cannot be loaded or
    when the loaded libc has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    # Initialising from the bytes makes the buffer one byte larger than the
    # title, so it is always NUL-terminated as prctl expects. The former
    # create_string_buffer(len(title_bytes)) left no room for the terminator.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1838
1839
def remove_start(s, start):
    """Return `s` with a leading `start` removed.

    `s` is returned unchanged when it is None or does not begin with
    `start`."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
29eb5174
PH
1842
1843
def remove_end(s, end):
    """Return `s` with a trailing `end` removed.

    `s` is returned unchanged when it is None or does not finish with
    `end`. An empty `end` removes nothing."""
    if s is None or not s.endswith(end):
        return s
    # Slice by absolute position: s[:-len(end)] would be s[:-0] == '' for an
    # empty suffix and wipe out the whole string.
    return s[:len(s) - len(end)]
2b9faf55
PH
1846
1847
31b2051e
S
def remove_quotes(s):
    """Strip one pair of matching surrounding quotes (single or double).

    Strings shorter than two characters, None, and strings whose first and
    last characters are not the same quote character are returned as is."""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1855
1856
def url_basename(url):
    """Return the last segment of the path component of `url`.

    Leading and trailing slashes of the path are ignored, so a path that is
    empty or consists only of slashes gives an empty string."""
    url_path = compat_urlparse.urlparse(url).path
    segments = url_path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1860
1861
02dc0a36
S
def base_url(url):
    """Return `url` cut after the last '/' that precedes any '?', '#' or '&'.

    Only http(s) URLs are recognised; anything else makes the match fail and
    the attribute access on the failed match raises AttributeError."""
    prefix = re.match(r'https?://[^?#&]+/', url)
    return prefix.group()
1864
1865
def urljoin(base, path):
    """Join `path` onto `base`, tolerating bytes and invalid input.

    Bytes arguments are decoded as UTF-8. Returns:
      * None when `path` is empty or not a string,
      * `path` itself when it is already absolute or protocol-relative,
      * None when `base` is not a string or is not an http(s) /
        protocol-relative URL,
      * otherwise the standard URL join of the two.
    """
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not isinstance(path, compat_str) or not path:
        return None
    # The hyphen is escaped on purpose: an unescaped "+-." inside a character
    # class is a range from '+' to '.' and would also accept ',' in a scheme.
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+\-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode('utf-8')
    if not isinstance(base, compat_str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
1879
1880
aa94a6d3
PH
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose HTTP method is always HEAD."""

    def get_method(self):
        method = 'HEAD'
        return method
7217e148
PH
1884
1885
95cf60e8
S
class PUTRequest(compat_urllib_request.Request):
    """Request subclass whose HTTP method is always PUT."""

    def get_method(self):
        method = 'PUT'
        return method
1889
1890
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert `v` to an int, returning `default` when that is impossible.

    v:        value to convert; None and '' count as missing
    scale:    the converted value is floor-divided by this
    default:  returned for missing or unconvertible values
    get_attr: if set, the named attribute of `v` is converted instead
              (a missing attribute counts as a missing value)
    invscale: the converted value is multiplied by this before scaling
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # int() raises TypeError (not ValueError) for lists, dicts etc.
        return default
9732d77e 1903
9572013d 1904
40a90862
JMF
def str_or_none(v, default=None):
    """Return `v` converted to a text string, or `default` when `v` is None."""
    if v is None:
        return default
    return compat_str(v)
1907
9732d77e
PH
1908
def str_to_int(int_str):
    """ Parse an integer written with ',', '.' or '+' separators

    None is passed through; the separators are dropped before conversion,
    so any other non-digit content raises ValueError. """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1915
1916
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert `v` to a float, returning `default` when that is impossible.

    The converted value is multiplied by `invscale` and divided by `scale`.
    None and values float() cannot handle give `default`.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # float() raises TypeError (not ValueError) for lists, dicts etc.
        return default
43f775e4
PH
1924
1925
c7e327c4
S
def bool_or_none(v, default=None):
    """Return `v` if it is an actual bool, otherwise `default`."""
    if isinstance(v, bool):
        return v
    return default
1928
1929
b72b4431
S
def strip_or_none(v):
    """Return `v.strip()`, passing None through untouched."""
    if v is None:
        return None
    return v.strip()
1932
1933
af03000a
S
def url_or_none(url):
    """Return the stripped `url` if it looks like a URL, otherwise None.

    A value qualifies when it is a non-empty text string that, once
    stripped, starts with "<scheme>://" or with a protocol-relative "//"."""
    if not url or not isinstance(url, compat_str):
        return None
    candidate = url.strip()
    if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', candidate):
        return candidate
    return None
1939
1940
def parse_duration(s):
    """Parse a textual duration into a number of seconds.

    Three notations are tried in order:
      1. clock style, "[[[DD:]HH:]MM:]SS[.fraction]";
      2. unit style such as "1h 2m 3.5s" or ISO 8601-like "PT1H2M3S"
         (year, month and week parts are accepted but not counted);
      3. a lone, possibly fractional, "N hours" or "N min(ute)s".
    A trailing "Z" is tolerated in every notation. Returns None when `s` is
    not a string or matches none of the notations.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days = hours = mins = secs = ms = None

    match = re.match(
        r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
    if not match:
        match = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)

    if match:
        # Both of the first two notations capture the same five groups
        days, hours, mins, secs, ms = match.groups()
    else:
        match = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
        if not match:
            return None
        hours, mins = match.groups()

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        # `ms` still carries its leading '.', i.e. it is a fraction of a second
        duration += float(ms)
    return duration
91d7d0b3
JMF
1997
1998
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` in front of the existing extension of `filename`.

    If `expected_real_ext` is given and differs from the actual extension
    (compared without the dot), `ext` is appended to the whole name
    instead."""
    stem, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(stem, ext, real_ext)
d70ad093
PH
2005
2006
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the extension of `filename` for `ext`.

    If `expected_real_ext` is given and differs from the actual extension
    (compared without the dot), nothing is removed and `ext` is appended to
    the whole name instead."""
    stem, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    return '{0}.{1}'.format(stem, ext)
2012
2013
d70ad093
PH
def check_executable(exe, args=[]):
    """ Try to run the given binary and return its name on success.
    args can be a list of arguments for a short output (like -version).
    Returns False when starting the process fails with OSError. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
2022
2023
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Run the executable with `args` and extract a version from its output.
    Returns False if the executable cannot be started; otherwise whatever
    detect_exe_version() makes of the combined stdout/stderr output. """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = process.communicate()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
2041
2042
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Find a version string in the text output of a program.

    `version_re` must contain one capturing group; by default the token
    following the word "version" is taken. Returns the captured text, or
    `unrecognized` when the pattern is not found."""
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    if not found:
        return unrecognized
    return found.group(1)
2052
2053
class PagedList(object):
    """Base for page-wise lists; subclasses are expected to provide
    getslice()."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
2058
9c44d242
PH
2059
class OnDemandPagedList(PagedList):
    """Paged list that calls `pagefunc(pagenum)` only for the pages a slice
    needs, optionally remembering fetched pages."""

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return entries [start, end) as a list, fetching pages as needed."""
        pagesize = self._pagesize
        collected = []
        for pagenum in itertools.count(start // pagesize):
            firstid = pagenum * pagesize
            nextfirstid = pagenum * pagesize + pagesize
            if start >= nextfirstid:
                continue

            page_results = self._cache.get(pagenum) if self._use_cache else None
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset of the first wanted entry inside this page
            if firstid <= start < nextfirstid:
                startv = start % pagesize
            else:
                startv = 0

            # Offset just past the last wanted entry, if the slice ends here
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % pagesize) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            collected.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return collected
81c2f20b
PH
2110
2111
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list for sources whose total number of pages is known up
    front; `pagefunc(pagenum)` is called for every page a slice touches."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return entries [start, end) as a list."""
        res = []
        start_page = start // self._pagesize
        # Never go past the known page count: a short page or an `end`
        # beyond the data must not trigger requests for pages that do not
        # exist.
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page has leading entries to drop
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
2139
2140
def uppercase_escape(s):
    r"""Decode every literal "\UXXXXXXXX" escape sequence found in `s`."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
0fe2ff78
YCH
2147
2148
def lowercase_escape(s):
    r"""Decode every literal "\uXXXX" escape sequence found in `s`."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
b53466e1 2155
d05cfe06
S
2156
def escape_rfc3986(s):
    """Percent-encode `s`, leaving RFC 3986 reserved characters and '%' as is"""
    running_python2 = sys.version_info < (3, 0)
    if running_python2 and isinstance(s, compat_str):
        # On Python 2 the text is handed to quote() as UTF-8 bytes
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2162
2163
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    The host is IDNA-encoded; path, params, query and fragment are each
    passed through escape_rfc3986()."""
    parts = compat_urllib_parse_urlparse(url)
    escaped = parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    )
    return escaped.geturl()
2174
62e609ab
PH
2175
def read_batch_urls(batch_fd):
    """Read a batch file and return the list of URLs it contains.

    `batch_fd` is iterated line by line and closed afterwards. Byte lines
    are decoded as UTF-8 (with replacement), a leading byte order mark is
    dropped, surrounding whitespace is stripped, and empty lines as well as
    lines starting with '#', ';' or ']' are skipped.
    """
    # A UTF-8 BOM shows up either as the real U+FEFF character (properly
    # decoded input) or as its three bytes read as latin-1 style characters.
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2190
2191
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
2194
2195
def update_url_query(url, query):
    """Return `url` with the entries of `query` merged into its query string.

    Entries of `query` replace existing parameters of the same name. A
    falsy `query` returns `url` unchanged."""
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
16392824 2204
8e60dc75 2205
ed0291d1
S
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from `req` with selected parts overridden.

    `headers` are merged over the original headers, `data` and `url` replace
    the originals when truthy, and `query` is merged into the URL. HEAD and
    PUT requests keep their method; a `timeout` attribute is carried over
    when present."""
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)

    request_classes = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }
    req_type = request_classes.get(
        req.get_method(), compat_urllib_request.Request)

    new_req = req_type(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2224
2225
def _multipart_encode_impl(data, boundary):
    """Encode `data` as multipart/form-data using the given boundary.

    Returns (body_bytes, content_type). Raises ValueError when the boundary
    occurs inside one of the encoded parts."""
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')

    chunks = []
    for k, v in data.items():
        chunks.append(b'--' + boundary_bytes + b'\r\n')
        if isinstance(k, compat_str):
            k = k.encode('utf-8')
        if isinstance(v, compat_str):
            v = v.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        chunks.append(content)

    chunks.append(b'--' + boundary_bytes + b'--\r\n')

    return b''.join(chunks), content_type
2246
2247
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a (body, content_type) tuple. With an explicit boundary, a
    ValueError is raised if the boundary collides with the data; random
    boundaries are regenerated until one fits.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # The caller insists on this boundary: a collision is an error
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            continue
2276
2277
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable of several keys, in `d`.

    For a list/tuple of keys the first key that is present with a value
    other than None (and, with `skip_false_values`, a truthy value) wins;
    `default` is returned when none qualifies. A single key is a plain
    `d.get(key, default)`."""
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)

    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
2286
2287
def try_get(src, getter, expected_type=None):
    """Apply getter(s) to `src` and return the first acceptable result.

    `getter` is a callable or a list/tuple of callables. A getter that
    raises AttributeError, KeyError, TypeError or IndexError is skipped, as
    is a result that is not an instance of `expected_type` (when given).
    Returns None if no getter yields an acceptable value."""
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for get in getters:
        try:
            value = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None
329ca3be
S
2299
2300
6cc62232
S
def merge_dicts(*dicts):
    """Merge dicts left to right, earlier values taking precedence.

    None values are ignored. A later value only replaces an earlier one when
    the earlier value is an empty text string and the later one a non-empty
    text string."""
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if v is None:
                continue
            if k not in merged:
                merged[k] = v
                continue
            current = merged[k]
            fills_empty_string = (
                isinstance(v, compat_str) and v
                and isinstance(current, compat_str)
                and not current)
            if fills_empty_string:
                merged[k] = v
    return merged
2313
2314
8e60dc75
S
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as a text string, decoding it with `encoding` if it
    is not one already."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2317
16392824 2318
a1a530b0
PH
# US film rating label -> age limit in years
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
2326
2327
# TV Parental Guidelines label -> age limit in years
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2336
2337
def parse_age_limit(s):
    """Turn an age rating into an age limit in years, or None.

    Accepted inputs: an int between 0 and 21, a string of one or two digits
    with an optional trailing '+', a US_RATINGS label, or a TV Parental
    Guidelines label written with '-', '_' or no separator after "TV"."""
    # Exact type check on purpose: bool is a subclass of int
    if type(s) == int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    if s in US_RATINGS:
        return US_RATINGS[s]

    tv_rating = re.match(
        r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
146c80e2
S
2352
2353
def strip_jsonp(code):
    """Unwrap a JSONP response, returning only the callback argument.

    Handles an optional "window." prefix, the "name && name(...)" guard
    form, a trailing semicolon and trailing "//" comments. Input that does
    not look like JSONP is returned unchanged."""
    jsonp_wrapper = r'''(?sx)^
        (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
        (?:\s*&&\s*(?P=func_name))?
        \s*\(\s*(?P<callback_data>.*)\);?
        \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_wrapper, r'\g<callback_data>', code)
478c2c61
PH
2362
2363
def js_to_json(code):
    """Convert JavaScript object-literal source into JSON text.

    Handles single-quoted strings, unquoted identifiers and keys, comments,
    trailing commas, and hexadecimal/octal integer literals (as values or as
    keys). Quoted strings are only re-quoted, never reinterpreted.
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v.startswith('//') or v == ',':
            # Comments and trailing commas are dropped
            return ""

        if v[0] in ("'", '"'):
            # Re-quote the string with double quotes, fixing up escapes
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            # Only bare tokens can be integer literals; a quoted "0x10" or
            # "010" is a string and must stay one.
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
e05f6939
PH
2403
2404
478c2c61
PH
def qualities(quality_ids):
    """ Build a function ranking quality ids by their position in
        `quality_ids`; ids that are not listed rank as -1 """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
2413
acd69589
PH
2414
# Default output filename template: "<title>-<id>.<ext>"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2416
a020a0dc
PH
2417
def limit_length(s, length):
    """ Shorten strings longer than `length` so that they end in '...' and
        are `length` characters in total; None is passed through """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    kept = length - len(ELLIPSES)
    return s[:kept] + ELLIPSES
48844745
PH
2426
2427
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints.

    Raises ValueError if a component is not an integer."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
48844745
PH
2430
2431
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether `version` is older than `limit`.

    When `version` is empty or either string cannot be parsed, the answer is
    the opposite of `assume_new`."""
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
    return outdated
732ea2f0
PH
2439
2440
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U

    That is the case when this module was loaded through zipimport or when
    the interpreter carries a `frozen` attribute on `sys`. """
    from zipimport import zipimporter

    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
7d4111ed
PH
2446
2447
def args_to_str(args):
    """Render a subprocess argument list as one shell-quoted string."""
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2ccd1b10
PH
2451
2452
def error_to_compat_str(err):
    """Return the message of `err` as a text string on Python 2 and 3."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2460
2461
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    A few full types are special-cased; otherwise the subtype (lowercased,
    parameters removed) is translated through a table and used verbatim when
    it is not in the table. None is passed through."""
    if mt is None:
        return None

    FULL_TYPE_EXTS = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    ext = FULL_TYPE_EXTS.get(mt)
    if ext is not None:
        return ext

    SUBTYPE_EXTS = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
    }
    subtype = mt.rpartition('/')[2]
    subtype = subtype.split(';')[0].strip().lower()
    return SUBTYPE_EXTS.get(subtype, subtype)
2497
2498
def parse_codecs(codecs_str):
    """Split an RFC 6381 "codecs" value into video and audio codec.

    Returns {} for empty input, otherwise a dict with 'vcodec' and 'acodec'.
    The first recognised video and audio codec are used, with 'none' for a
    missing kind. Unrecognised codecs produce a warning on stderr. If
    nothing is recognised and exactly two codecs were given, they are taken
    as video and audio in that order.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    splited_codecs = list(filter(None, map(
        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        if len(splited_codecs) == 2:
            # Neither codec is known: assume the usual "video, audio" order
            # instead of reporting two None values.
            return {
                'vcodec': splited_codecs[0],
                'acodec': splited_codecs[1],
            }
        elif len(splited_codecs) == 1:
            return {
                'vcodec': 'none',
                'acodec': None,
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
2533
2534
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of a response.

    A filename in a Content-Disposition "attachment" header is tried first;
    otherwise the Content-Type header is mapped with mimetype2ext()."""
    getheader = url_handle.headers.get

    disposition = getheader('Content-Disposition')
    if disposition:
        attachment = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if attachment:
            ext = determine_ext(attachment.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
05900629
PH
2547
2548
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Return a base64 "data:" URI carrying the bytes `data` as `mime_type`."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2551
2552
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    Nothing is blocked when no age limit is set or when the content carries
    no limit; otherwise content is blocked if its limit exceeds
    `age_limit`. """

    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
2561
2562
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    The bytes are decoded according to a leading byte order mark (UTF-8
    without one) and count as HTML when the first non-whitespace character
    is '<'. The result is a match object or None. """

    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]

    payload, encoding = first_bytes, 'utf-8'
    for bom, bom_encoding in BOMS:
        if first_bytes.startswith(bom):
            payload, encoding = first_bytes[len(bom):], bom_encoding
            break
    text = payload.decode(encoding, 'replace')

    return re.match(r'^\s*<', text)
a055469f
PH
2581
2582
def determine_protocol(info_dict):
    """Work out the download protocol for a format/info dict.

    An explicit 'protocol' entry wins. Otherwise the URL is inspected: an
    rtmp/mms/rtsp prefix, then an m3u8/f4m extension, and finally the URL
    scheme."""
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2603
2604
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values

    Every column but the last is left-aligned and padded to one character
    more than its widest cell; rows are joined with newlines. """
    rows = [header_row] + data
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    padded_columns = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded_columns) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
347de493
PH
2611
2612
def _match_one(filter_part, dct):
    """Evaluate a single filter expression against `dct`.

    Two forms are understood: a comparison "key OP value" (OP being one of
    <, <=, >, >=, =, != and optionally followed by '?' to also accept a
    missing key), and a presence test "key" / "!key". Raises ValueError for
    anything else, for ordering operators applied to string values, and for
    unparsable numbers.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op_symbol = m.group('op')
        op = COMPARISON_OPERATORS[op_symbol]
        actual_value = dct.get(m.group('key'))
        intval = m.group('intval')
        quotedstrval = m.group('quotedstrval')
        strval = m.group('strval')

        # If the original field is a string and matching comparisonvalue is
        # a number we should respect the origin of the original field
        # and process comparison value as a string (see
        # https://github.com/rg3/youtube-dl/issues/11082).
        compare_as_string = (
            quotedstrval is not None
            or strval is not None
            or actual_value is not None and intval is not None
            and isinstance(actual_value, compat_str))

        if compare_as_string:
            if op_symbol not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % op_symbol)
            comparison_value = quotedstrval or strval or intval
            quote = m.group('quote')
            if quote is not None:
                # Unescape quote characters inside a quoted value
                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
        else:
            try:
                comparison_value = int(intval)
            except ValueError:
                # Not a plain integer: try a file size, with and without
                # an explicit trailing 'B'
                comparison_value = parse_filesize(intval)
                if comparison_value is None:
                    comparison_value = parse_filesize(intval + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            intval, filter_part))
        if actual_value is None:
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        presence_test = UNARY_OPERATORS[m.group('op')]
        return presence_test(dct.get(m.group('key')))

    raise ValueError('Invalid filter part %r' % filter_part)
2681
2682
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false

    The filter is split on '&' and every part has to match. """

    filter_parts = filter_str.split('&')
    return all(_match_one(filter_part, dct) for filter_part in filter_parts)
2688
2689
def match_filter_func(filter_str):
    """Build a match-filter callback for `filter_str`.

    The callback takes an info dict and returns None when it passes the
    filter, or a message explaining that the video is skipped."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
91410c9b
PH
2698
2699
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float).

    Understands an offset in seconds with an optional 's' suffix, and clock
    time "H:MM:SS" whose fraction may follow a '.' or a ':'. Returns None
    for empty or unrecognised input."""
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
bf6427d2
YCH
2711
2712
c1c924ab
YCH
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode, "HH:MM:SS,mmm"."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d truncates each component towards zero
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
bf6427d2
YCH
2715
2716
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT, keeping basic styling (color, font
    face/size, bold, italic, underline) as SRT-style markup.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no paragraphs
    '''
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    class TTMLPElementParser(object):
        """XMLParser target that renders one paragraph as SRT text."""

        def __init__(self):
            # Per-instance state: class-level lists would be shared by the
            # parsers of all paragraphs and leak styling between them.
            self._out = ''
            # One (tags to close, whether a style was pushed) pair per open
            # element
            self._unclosed_elements = []
            # Stack of cumulative styles in effect
            self._applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect from a parent
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append((unclosed_elements, bool(style)))

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements, pushed_style = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                # Pop exactly when start() pushed, so the style stack stays
                # balanced even for elements that emitted no tags.
                if pushed_style:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    # Rewrite legacy namespace URIs to the current ones before parsing
    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style definitions. A style may inherit from one that is only
    # defined later in the document, hence the repeated passes.
    while True:
        unresolved = False
        resolved_before = len(styles)
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    unresolved = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        # Stop when everything is resolved, or when a pass resolved nothing
        # new: a reference to a parent that never gets defined would
        # otherwise loop forever.
        if not unresolved or len(styles) == resolved_before:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2879
2880
66e289ba
S
def cli_option(params, command_option, param):
    """Build the command line fragment for an option that takes a value.

    Looks up ``param`` in ``params`` and returns
    ``[command_option, <value as text>]``, or an empty list when the
    parameter is missing or set to None.

    Every value other than None is converted to text, including falsy ones
    such as 0, so the result can always be spliced into an argument list.
    """
    value = params.get(param)
    if value is None:
        return []
    # Convert unconditionally: testing truthiness here would let 0 (or any
    # other falsy non-None value) through unconverted.
    return [command_option, compat_str(value)]
2886
2887
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command line fragment for a boolean option.

    Returns an empty list when ``param`` is missing from ``params`` (or is
    None). Otherwise the boolean is rendered as ``true_value`` or
    ``false_value`` and returned either as two arguments
    (``[command_option, rendered]``) or, when ``separator`` is given, as a
    single joined argument (``[command_option + separator + rendered]``).
    """
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if not separator:
        return [command_option, rendered]
    return [command_option + separator + rendered]
2896
2897
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return ``[command_option]`` when ``params[param]`` equals
    ``expected_value``, otherwise an empty list."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2901
2902
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra command line arguments stored under ``param``.

    When the parameter is missing (or None) ``default`` is returned instead.
    A list default is handed back as a copy, so a caller that extends the
    result cannot modify the shared default list (or the list it passed in).
    """
    ex_args = params.get(param)
    if ex_args is None:
        # The signature's [] is a single object shared by every call; never
        # leak it to callers that might mutate it.
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2909
2910
39672624
YCH
2911class ISO639Utils(object):
2912 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2913 _lang_map = {
2914 'aa': 'aar',
2915 'ab': 'abk',
2916 'ae': 'ave',
2917 'af': 'afr',
2918 'ak': 'aka',
2919 'am': 'amh',
2920 'an': 'arg',
2921 'ar': 'ara',
2922 'as': 'asm',
2923 'av': 'ava',
2924 'ay': 'aym',
2925 'az': 'aze',
2926 'ba': 'bak',
2927 'be': 'bel',
2928 'bg': 'bul',
2929 'bh': 'bih',
2930 'bi': 'bis',
2931 'bm': 'bam',
2932 'bn': 'ben',
2933 'bo': 'bod',
2934 'br': 'bre',
2935 'bs': 'bos',
2936 'ca': 'cat',
2937 'ce': 'che',
2938 'ch': 'cha',
2939 'co': 'cos',
2940 'cr': 'cre',
2941 'cs': 'ces',
2942 'cu': 'chu',
2943 'cv': 'chv',
2944 'cy': 'cym',
2945 'da': 'dan',
2946 'de': 'deu',
2947 'dv': 'div',
2948 'dz': 'dzo',
2949 'ee': 'ewe',
2950 'el': 'ell',
2951 'en': 'eng',
2952 'eo': 'epo',
2953 'es': 'spa',
2954 'et': 'est',
2955 'eu': 'eus',
2956 'fa': 'fas',
2957 'ff': 'ful',
2958 'fi': 'fin',
2959 'fj': 'fij',
2960 'fo': 'fao',
2961 'fr': 'fra',
2962 'fy': 'fry',
2963 'ga': 'gle',
2964 'gd': 'gla',
2965 'gl': 'glg',
2966 'gn': 'grn',
2967 'gu': 'guj',
2968 'gv': 'glv',
2969 'ha': 'hau',
2970 'he': 'heb',
b7acc835 2971 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
2972 'hi': 'hin',
2973 'ho': 'hmo',
2974 'hr': 'hrv',
2975 'ht': 'hat',
2976 'hu': 'hun',
2977 'hy': 'hye',
2978 'hz': 'her',
2979 'ia': 'ina',
2980 'id': 'ind',
b7acc835 2981 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
2982 'ie': 'ile',
2983 'ig': 'ibo',
2984 'ii': 'iii',
2985 'ik': 'ipk',
2986 'io': 'ido',
2987 'is': 'isl',
2988 'it': 'ita',
2989 'iu': 'iku',
2990 'ja': 'jpn',
2991 'jv': 'jav',
2992 'ka': 'kat',
2993 'kg': 'kon',
2994 'ki': 'kik',
2995 'kj': 'kua',
2996 'kk': 'kaz',
2997 'kl': 'kal',
2998 'km': 'khm',
2999 'kn': 'kan',
3000 'ko': 'kor',
3001 'kr': 'kau',
3002 'ks': 'kas',
3003 'ku': 'kur',
3004 'kv': 'kom',
3005 'kw': 'cor',
3006 'ky': 'kir',
3007 'la': 'lat',
3008 'lb': 'ltz',
3009 'lg': 'lug',
3010 'li': 'lim',
3011 'ln': 'lin',
3012 'lo': 'lao',
3013 'lt': 'lit',
3014 'lu': 'lub',
3015 'lv': 'lav',
3016 'mg': 'mlg',
3017 'mh': 'mah',
3018 'mi': 'mri',
3019 'mk': 'mkd',
3020 'ml': 'mal',
3021 'mn': 'mon',
3022 'mr': 'mar',
3023 'ms': 'msa',
3024 'mt': 'mlt',
3025 'my': 'mya',
3026 'na': 'nau',
3027 'nb': 'nob',
3028 'nd': 'nde',
3029 'ne': 'nep',
3030 'ng': 'ndo',
3031 'nl': 'nld',
3032 'nn': 'nno',
3033 'no': 'nor',
3034 'nr': 'nbl',
3035 'nv': 'nav',
3036 'ny': 'nya',
3037 'oc': 'oci',
3038 'oj': 'oji',
3039 'om': 'orm',
3040 'or': 'ori',
3041 'os': 'oss',
3042 'pa': 'pan',
3043 'pi': 'pli',
3044 'pl': 'pol',
3045 'ps': 'pus',
3046 'pt': 'por',
3047 'qu': 'que',
3048 'rm': 'roh',
3049 'rn': 'run',
3050 'ro': 'ron',
3051 'ru': 'rus',
3052 'rw': 'kin',
3053 'sa': 'san',
3054 'sc': 'srd',
3055 'sd': 'snd',
3056 'se': 'sme',
3057 'sg': 'sag',
3058 'si': 'sin',
3059 'sk': 'slk',
3060 'sl': 'slv',
3061 'sm': 'smo',
3062 'sn': 'sna',
3063 'so': 'som',
3064 'sq': 'sqi',
3065 'sr': 'srp',
3066 'ss': 'ssw',
3067 'st': 'sot',
3068 'su': 'sun',
3069 'sv': 'swe',
3070 'sw': 'swa',
3071 'ta': 'tam',
3072 'te': 'tel',
3073 'tg': 'tgk',
3074 'th': 'tha',
3075 'ti': 'tir',
3076 'tk': 'tuk',
3077 'tl': 'tgl',
3078 'tn': 'tsn',
3079 'to': 'ton',
3080 'tr': 'tur',
3081 'ts': 'tso',
3082 'tt': 'tat',
3083 'tw': 'twi',
3084 'ty': 'tah',
3085 'ug': 'uig',
3086 'uk': 'ukr',
3087 'ur': 'urd',
3088 'uz': 'uzb',
3089 've': 'ven',
3090 'vi': 'vie',
3091 'vo': 'vol',
3092 'wa': 'wln',
3093 'wo': 'wol',
3094 'xh': 'xho',
3095 'yi': 'yid',
e9a50fba 3096 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
3097 'yo': 'yor',
3098 'za': 'zha',
3099 'zh': 'zho',
3100 'zu': 'zul',
3101 }
3102
3103 @classmethod
3104 def short2long(cls, code):
3105 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3106 return cls._lang_map.get(code[:2])
3107
3108 @classmethod
3109 def long2short(cls, code):
3110 """Convert language code from ISO 639-2/T to ISO 639-1"""
3111 for short_name, long_name in cls._lang_map.items():
3112 if long_name == code:
3113 return short_name
3114
3115
4eb10f66
YCH
3116class ISO3166Utils(object):
3117 # From http://data.okfn.org/data/core/country-list
3118 _country_map = {
3119 'AF': 'Afghanistan',
3120 'AX': 'Åland Islands',
3121 'AL': 'Albania',
3122 'DZ': 'Algeria',
3123 'AS': 'American Samoa',
3124 'AD': 'Andorra',
3125 'AO': 'Angola',
3126 'AI': 'Anguilla',
3127 'AQ': 'Antarctica',
3128 'AG': 'Antigua and Barbuda',
3129 'AR': 'Argentina',
3130 'AM': 'Armenia',
3131 'AW': 'Aruba',
3132 'AU': 'Australia',
3133 'AT': 'Austria',
3134 'AZ': 'Azerbaijan',
3135 'BS': 'Bahamas',
3136 'BH': 'Bahrain',
3137 'BD': 'Bangladesh',
3138 'BB': 'Barbados',
3139 'BY': 'Belarus',
3140 'BE': 'Belgium',
3141 'BZ': 'Belize',
3142 'BJ': 'Benin',
3143 'BM': 'Bermuda',
3144 'BT': 'Bhutan',
3145 'BO': 'Bolivia, Plurinational State of',
3146 'BQ': 'Bonaire, Sint Eustatius and Saba',
3147 'BA': 'Bosnia and Herzegovina',
3148 'BW': 'Botswana',
3149 'BV': 'Bouvet Island',
3150 'BR': 'Brazil',
3151 'IO': 'British Indian Ocean Territory',
3152 'BN': 'Brunei Darussalam',
3153 'BG': 'Bulgaria',
3154 'BF': 'Burkina Faso',
3155 'BI': 'Burundi',
3156 'KH': 'Cambodia',
3157 'CM': 'Cameroon',
3158 'CA': 'Canada',
3159 'CV': 'Cape Verde',
3160 'KY': 'Cayman Islands',
3161 'CF': 'Central African Republic',
3162 'TD': 'Chad',
3163 'CL': 'Chile',
3164 'CN': 'China',
3165 'CX': 'Christmas Island',
3166 'CC': 'Cocos (Keeling) Islands',
3167 'CO': 'Colombia',
3168 'KM': 'Comoros',
3169 'CG': 'Congo',
3170 'CD': 'Congo, the Democratic Republic of the',
3171 'CK': 'Cook Islands',
3172 'CR': 'Costa Rica',
3173 'CI': 'Côte d\'Ivoire',
3174 'HR': 'Croatia',
3175 'CU': 'Cuba',
3176 'CW': 'Curaçao',
3177 'CY': 'Cyprus',
3178 'CZ': 'Czech Republic',
3179 'DK': 'Denmark',
3180 'DJ': 'Djibouti',
3181 'DM': 'Dominica',
3182 'DO': 'Dominican Republic',
3183 'EC': 'Ecuador',
3184 'EG': 'Egypt',
3185 'SV': 'El Salvador',
3186 'GQ': 'Equatorial Guinea',
3187 'ER': 'Eritrea',
3188 'EE': 'Estonia',
3189 'ET': 'Ethiopia',
3190 'FK': 'Falkland Islands (Malvinas)',
3191 'FO': 'Faroe Islands',
3192 'FJ': 'Fiji',
3193 'FI': 'Finland',
3194 'FR': 'France',
3195 'GF': 'French Guiana',
3196 'PF': 'French Polynesia',
3197 'TF': 'French Southern Territories',
3198 'GA': 'Gabon',
3199 'GM': 'Gambia',
3200 'GE': 'Georgia',
3201 'DE': 'Germany',
3202 'GH': 'Ghana',
3203 'GI': 'Gibraltar',
3204 'GR': 'Greece',
3205 'GL': 'Greenland',
3206 'GD': 'Grenada',
3207 'GP': 'Guadeloupe',
3208 'GU': 'Guam',
3209 'GT': 'Guatemala',
3210 'GG': 'Guernsey',
3211 'GN': 'Guinea',
3212 'GW': 'Guinea-Bissau',
3213 'GY': 'Guyana',
3214 'HT': 'Haiti',
3215 'HM': 'Heard Island and McDonald Islands',
3216 'VA': 'Holy See (Vatican City State)',
3217 'HN': 'Honduras',
3218 'HK': 'Hong Kong',
3219 'HU': 'Hungary',
3220 'IS': 'Iceland',
3221 'IN': 'India',
3222 'ID': 'Indonesia',
3223 'IR': 'Iran, Islamic Republic of',
3224 'IQ': 'Iraq',
3225 'IE': 'Ireland',
3226 'IM': 'Isle of Man',
3227 'IL': 'Israel',
3228 'IT': 'Italy',
3229 'JM': 'Jamaica',
3230 'JP': 'Japan',
3231 'JE': 'Jersey',
3232 'JO': 'Jordan',
3233 'KZ': 'Kazakhstan',
3234 'KE': 'Kenya',
3235 'KI': 'Kiribati',
3236 'KP': 'Korea, Democratic People\'s Republic of',
3237 'KR': 'Korea, Republic of',
3238 'KW': 'Kuwait',
3239 'KG': 'Kyrgyzstan',
3240 'LA': 'Lao People\'s Democratic Republic',
3241 'LV': 'Latvia',
3242 'LB': 'Lebanon',
3243 'LS': 'Lesotho',
3244 'LR': 'Liberia',
3245 'LY': 'Libya',
3246 'LI': 'Liechtenstein',
3247 'LT': 'Lithuania',
3248 'LU': 'Luxembourg',
3249 'MO': 'Macao',
3250 'MK': 'Macedonia, the Former Yugoslav Republic of',
3251 'MG': 'Madagascar',
3252 'MW': 'Malawi',
3253 'MY': 'Malaysia',
3254 'MV': 'Maldives',
3255 'ML': 'Mali',
3256 'MT': 'Malta',
3257 'MH': 'Marshall Islands',
3258 'MQ': 'Martinique',
3259 'MR': 'Mauritania',
3260 'MU': 'Mauritius',
3261 'YT': 'Mayotte',
3262 'MX': 'Mexico',
3263 'FM': 'Micronesia, Federated States of',
3264 'MD': 'Moldova, Republic of',
3265 'MC': 'Monaco',
3266 'MN': 'Mongolia',
3267 'ME': 'Montenegro',
3268 'MS': 'Montserrat',
3269 'MA': 'Morocco',
3270 'MZ': 'Mozambique',
3271 'MM': 'Myanmar',
3272 'NA': 'Namibia',
3273 'NR': 'Nauru',
3274 'NP': 'Nepal',
3275 'NL': 'Netherlands',
3276 'NC': 'New Caledonia',
3277 'NZ': 'New Zealand',
3278 'NI': 'Nicaragua',
3279 'NE': 'Niger',
3280 'NG': 'Nigeria',
3281 'NU': 'Niue',
3282 'NF': 'Norfolk Island',
3283 'MP': 'Northern Mariana Islands',
3284 'NO': 'Norway',
3285 'OM': 'Oman',
3286 'PK': 'Pakistan',
3287 'PW': 'Palau',
3288 'PS': 'Palestine, State of',
3289 'PA': 'Panama',
3290 'PG': 'Papua New Guinea',
3291 'PY': 'Paraguay',
3292 'PE': 'Peru',
3293 'PH': 'Philippines',
3294 'PN': 'Pitcairn',
3295 'PL': 'Poland',
3296 'PT': 'Portugal',
3297 'PR': 'Puerto Rico',
3298 'QA': 'Qatar',
3299 'RE': 'Réunion',
3300 'RO': 'Romania',
3301 'RU': 'Russian Federation',
3302 'RW': 'Rwanda',
3303 'BL': 'Saint Barthélemy',
3304 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3305 'KN': 'Saint Kitts and Nevis',
3306 'LC': 'Saint Lucia',
3307 'MF': 'Saint Martin (French part)',
3308 'PM': 'Saint Pierre and Miquelon',
3309 'VC': 'Saint Vincent and the Grenadines',
3310 'WS': 'Samoa',
3311 'SM': 'San Marino',
3312 'ST': 'Sao Tome and Principe',
3313 'SA': 'Saudi Arabia',
3314 'SN': 'Senegal',
3315 'RS': 'Serbia',
3316 'SC': 'Seychelles',
3317 'SL': 'Sierra Leone',
3318 'SG': 'Singapore',
3319 'SX': 'Sint Maarten (Dutch part)',
3320 'SK': 'Slovakia',
3321 'SI': 'Slovenia',
3322 'SB': 'Solomon Islands',
3323 'SO': 'Somalia',
3324 'ZA': 'South Africa',
3325 'GS': 'South Georgia and the South Sandwich Islands',
3326 'SS': 'South Sudan',
3327 'ES': 'Spain',
3328 'LK': 'Sri Lanka',
3329 'SD': 'Sudan',
3330 'SR': 'Suriname',
3331 'SJ': 'Svalbard and Jan Mayen',
3332 'SZ': 'Swaziland',
3333 'SE': 'Sweden',
3334 'CH': 'Switzerland',
3335 'SY': 'Syrian Arab Republic',
3336 'TW': 'Taiwan, Province of China',
3337 'TJ': 'Tajikistan',
3338 'TZ': 'Tanzania, United Republic of',
3339 'TH': 'Thailand',
3340 'TL': 'Timor-Leste',
3341 'TG': 'Togo',
3342 'TK': 'Tokelau',
3343 'TO': 'Tonga',
3344 'TT': 'Trinidad and Tobago',
3345 'TN': 'Tunisia',
3346 'TR': 'Turkey',
3347 'TM': 'Turkmenistan',
3348 'TC': 'Turks and Caicos Islands',
3349 'TV': 'Tuvalu',
3350 'UG': 'Uganda',
3351 'UA': 'Ukraine',
3352 'AE': 'United Arab Emirates',
3353 'GB': 'United Kingdom',
3354 'US': 'United States',
3355 'UM': 'United States Minor Outlying Islands',
3356 'UY': 'Uruguay',
3357 'UZ': 'Uzbekistan',
3358 'VU': 'Vanuatu',
3359 'VE': 'Venezuela, Bolivarian Republic of',
3360 'VN': 'Viet Nam',
3361 'VG': 'Virgin Islands, British',
3362 'VI': 'Virgin Islands, U.S.',
3363 'WF': 'Wallis and Futuna',
3364 'EH': 'Western Sahara',
3365 'YE': 'Yemen',
3366 'ZM': 'Zambia',
3367 'ZW': 'Zimbabwe',
3368 }
3369
    @classmethod
    def short2full(cls, code):
        """Convert a two-letter ISO 3166-1 alpha-2 country code to the corresponding full name

        The lookup is case-insensitive (the code is upper-cased first);
        None is returned for codes missing from the table.
        """
        return cls._country_map.get(code.upper())
3374
3375
773f291d
S
3376class GeoUtils(object):
3377 # Major IPv4 address blocks per country
3378 _country_ip_map = {
3379 'AD': '85.94.160.0/19',
3380 'AE': '94.200.0.0/13',
3381 'AF': '149.54.0.0/17',
3382 'AG': '209.59.64.0/18',
3383 'AI': '204.14.248.0/21',
3384 'AL': '46.99.0.0/16',
3385 'AM': '46.70.0.0/15',
3386 'AO': '105.168.0.0/13',
3387 'AP': '159.117.192.0/21',
3388 'AR': '181.0.0.0/12',
3389 'AS': '202.70.112.0/20',
3390 'AT': '84.112.0.0/13',
3391 'AU': '1.128.0.0/11',
3392 'AW': '181.41.0.0/18',
3393 'AZ': '5.191.0.0/16',
3394 'BA': '31.176.128.0/17',
3395 'BB': '65.48.128.0/17',
3396 'BD': '114.130.0.0/16',
3397 'BE': '57.0.0.0/8',
3398 'BF': '129.45.128.0/17',
3399 'BG': '95.42.0.0/15',
3400 'BH': '37.131.0.0/17',
3401 'BI': '154.117.192.0/18',
3402 'BJ': '137.255.0.0/16',
3403 'BL': '192.131.134.0/24',
3404 'BM': '196.12.64.0/18',
3405 'BN': '156.31.0.0/16',
3406 'BO': '161.56.0.0/16',
3407 'BQ': '161.0.80.0/20',
3408 'BR': '152.240.0.0/12',
3409 'BS': '24.51.64.0/18',
3410 'BT': '119.2.96.0/19',
3411 'BW': '168.167.0.0/16',
3412 'BY': '178.120.0.0/13',
3413 'BZ': '179.42.192.0/18',
3414 'CA': '99.224.0.0/11',
3415 'CD': '41.243.0.0/16',
3416 'CF': '196.32.200.0/21',
3417 'CG': '197.214.128.0/17',
3418 'CH': '85.0.0.0/13',
3419 'CI': '154.232.0.0/14',
3420 'CK': '202.65.32.0/19',
3421 'CL': '152.172.0.0/14',
3422 'CM': '165.210.0.0/15',
3423 'CN': '36.128.0.0/10',
3424 'CO': '181.240.0.0/12',
3425 'CR': '201.192.0.0/12',
3426 'CU': '152.206.0.0/15',
3427 'CV': '165.90.96.0/19',
3428 'CW': '190.88.128.0/17',
3429 'CY': '46.198.0.0/15',
3430 'CZ': '88.100.0.0/14',
3431 'DE': '53.0.0.0/8',
3432 'DJ': '197.241.0.0/17',
3433 'DK': '87.48.0.0/12',
3434 'DM': '192.243.48.0/20',
3435 'DO': '152.166.0.0/15',
3436 'DZ': '41.96.0.0/12',
3437 'EC': '186.68.0.0/15',
3438 'EE': '90.190.0.0/15',
3439 'EG': '156.160.0.0/11',
3440 'ER': '196.200.96.0/20',
3441 'ES': '88.0.0.0/11',
3442 'ET': '196.188.0.0/14',
3443 'EU': '2.16.0.0/13',
3444 'FI': '91.152.0.0/13',
3445 'FJ': '144.120.0.0/16',
3446 'FM': '119.252.112.0/20',
3447 'FO': '88.85.32.0/19',
3448 'FR': '90.0.0.0/9',
3449 'GA': '41.158.0.0/15',
3450 'GB': '25.0.0.0/8',
3451 'GD': '74.122.88.0/21',
3452 'GE': '31.146.0.0/16',
3453 'GF': '161.22.64.0/18',
3454 'GG': '62.68.160.0/19',
3455 'GH': '45.208.0.0/14',
3456 'GI': '85.115.128.0/19',
3457 'GL': '88.83.0.0/19',
3458 'GM': '160.182.0.0/15',
3459 'GN': '197.149.192.0/18',
3460 'GP': '104.250.0.0/19',
3461 'GQ': '105.235.224.0/20',
3462 'GR': '94.64.0.0/13',
3463 'GT': '168.234.0.0/16',
3464 'GU': '168.123.0.0/16',
3465 'GW': '197.214.80.0/20',
3466 'GY': '181.41.64.0/18',
3467 'HK': '113.252.0.0/14',
3468 'HN': '181.210.0.0/16',
3469 'HR': '93.136.0.0/13',
3470 'HT': '148.102.128.0/17',
3471 'HU': '84.0.0.0/14',
3472 'ID': '39.192.0.0/10',
3473 'IE': '87.32.0.0/12',
3474 'IL': '79.176.0.0/13',
3475 'IM': '5.62.80.0/20',
3476 'IN': '117.192.0.0/10',
3477 'IO': '203.83.48.0/21',
3478 'IQ': '37.236.0.0/14',
3479 'IR': '2.176.0.0/12',
3480 'IS': '82.221.0.0/16',
3481 'IT': '79.0.0.0/10',
3482 'JE': '87.244.64.0/18',
3483 'JM': '72.27.0.0/17',
3484 'JO': '176.29.0.0/16',
3485 'JP': '126.0.0.0/8',
3486 'KE': '105.48.0.0/12',
3487 'KG': '158.181.128.0/17',
3488 'KH': '36.37.128.0/17',
3489 'KI': '103.25.140.0/22',
3490 'KM': '197.255.224.0/20',
3491 'KN': '198.32.32.0/19',
3492 'KP': '175.45.176.0/22',
3493 'KR': '175.192.0.0/10',
3494 'KW': '37.36.0.0/14',
3495 'KY': '64.96.0.0/15',
3496 'KZ': '2.72.0.0/13',
3497 'LA': '115.84.64.0/18',
3498 'LB': '178.135.0.0/16',
3499 'LC': '192.147.231.0/24',
3500 'LI': '82.117.0.0/19',
3501 'LK': '112.134.0.0/15',
3502 'LR': '41.86.0.0/19',
3503 'LS': '129.232.0.0/17',
3504 'LT': '78.56.0.0/13',
3505 'LU': '188.42.0.0/16',
3506 'LV': '46.109.0.0/16',
3507 'LY': '41.252.0.0/14',
3508 'MA': '105.128.0.0/11',
3509 'MC': '88.209.64.0/18',
3510 'MD': '37.246.0.0/16',
3511 'ME': '178.175.0.0/17',
3512 'MF': '74.112.232.0/21',
3513 'MG': '154.126.0.0/17',
3514 'MH': '117.103.88.0/21',
3515 'MK': '77.28.0.0/15',
3516 'ML': '154.118.128.0/18',
3517 'MM': '37.111.0.0/17',
3518 'MN': '49.0.128.0/17',
3519 'MO': '60.246.0.0/16',
3520 'MP': '202.88.64.0/20',
3521 'MQ': '109.203.224.0/19',
3522 'MR': '41.188.64.0/18',
3523 'MS': '208.90.112.0/22',
3524 'MT': '46.11.0.0/16',
3525 'MU': '105.16.0.0/12',
3526 'MV': '27.114.128.0/18',
3527 'MW': '105.234.0.0/16',
3528 'MX': '187.192.0.0/11',
3529 'MY': '175.136.0.0/13',
3530 'MZ': '197.218.0.0/15',
3531 'NA': '41.182.0.0/16',
3532 'NC': '101.101.0.0/18',
3533 'NE': '197.214.0.0/18',
3534 'NF': '203.17.240.0/22',
3535 'NG': '105.112.0.0/12',
3536 'NI': '186.76.0.0/15',
3537 'NL': '145.96.0.0/11',
3538 'NO': '84.208.0.0/13',
3539 'NP': '36.252.0.0/15',
3540 'NR': '203.98.224.0/19',
3541 'NU': '49.156.48.0/22',
3542 'NZ': '49.224.0.0/14',
3543 'OM': '5.36.0.0/15',
3544 'PA': '186.72.0.0/15',
3545 'PE': '186.160.0.0/14',
3546 'PF': '123.50.64.0/18',
3547 'PG': '124.240.192.0/19',
3548 'PH': '49.144.0.0/13',
3549 'PK': '39.32.0.0/11',
3550 'PL': '83.0.0.0/11',
3551 'PM': '70.36.0.0/20',
3552 'PR': '66.50.0.0/16',
3553 'PS': '188.161.0.0/16',
3554 'PT': '85.240.0.0/13',
3555 'PW': '202.124.224.0/20',
3556 'PY': '181.120.0.0/14',
3557 'QA': '37.210.0.0/15',
3558 'RE': '139.26.0.0/16',
3559 'RO': '79.112.0.0/13',
3560 'RS': '178.220.0.0/14',
3561 'RU': '5.136.0.0/13',
3562 'RW': '105.178.0.0/15',
3563 'SA': '188.48.0.0/13',
3564 'SB': '202.1.160.0/19',
3565 'SC': '154.192.0.0/11',
3566 'SD': '154.96.0.0/13',
3567 'SE': '78.64.0.0/12',
3568 'SG': '152.56.0.0/14',
3569 'SI': '188.196.0.0/14',
3570 'SK': '78.98.0.0/15',
3571 'SL': '197.215.0.0/17',
3572 'SM': '89.186.32.0/19',
3573 'SN': '41.82.0.0/15',
3574 'SO': '197.220.64.0/19',
3575 'SR': '186.179.128.0/17',
3576 'SS': '105.235.208.0/21',
3577 'ST': '197.159.160.0/19',
3578 'SV': '168.243.0.0/16',
3579 'SX': '190.102.0.0/20',
3580 'SY': '5.0.0.0/16',
3581 'SZ': '41.84.224.0/19',
3582 'TC': '65.255.48.0/20',
3583 'TD': '154.68.128.0/19',
3584 'TG': '196.168.0.0/14',
3585 'TH': '171.96.0.0/13',
3586 'TJ': '85.9.128.0/18',
3587 'TK': '27.96.24.0/21',
3588 'TL': '180.189.160.0/20',
3589 'TM': '95.85.96.0/19',
3590 'TN': '197.0.0.0/11',
3591 'TO': '175.176.144.0/21',
3592 'TR': '78.160.0.0/11',
3593 'TT': '186.44.0.0/15',
3594 'TV': '202.2.96.0/19',
3595 'TW': '120.96.0.0/11',
3596 'TZ': '156.156.0.0/14',
3597 'UA': '93.72.0.0/13',
3598 'UG': '154.224.0.0/13',
3599 'US': '3.0.0.0/8',
3600 'UY': '167.56.0.0/13',
3601 'UZ': '82.215.64.0/18',
3602 'VA': '212.77.0.0/19',
3603 'VC': '24.92.144.0/20',
3604 'VE': '186.88.0.0/13',
3605 'VG': '172.103.64.0/18',
3606 'VI': '146.226.0.0/16',
3607 'VN': '14.160.0.0/11',
3608 'VU': '202.80.32.0/20',
3609 'WF': '117.20.32.0/21',
3610 'WS': '202.4.32.0/19',
3611 'YE': '134.35.0.0/16',
3612 'YT': '41.242.116.0/22',
3613 'ZA': '41.0.0.0/11',
3614 'ZM': '165.56.0.0/13',
3615 'ZW': '41.85.192.0/19',
3616 }
3617
3618 @classmethod
5f95927a
S
3619 def random_ipv4(cls, code_or_block):
3620 if len(code_or_block) == 2:
3621 block = cls._country_ip_map.get(code_or_block.upper())
3622 if not block:
3623 return None
3624 else:
3625 block = code_or_block
773f291d
S
3626 addr, preflen = block.split('/')
3627 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
3628 addr_max = addr_min | (0xffffffff >> int(preflen))
18a0defa 3629 return compat_str(socket.inet_ntoa(
4248dad9 3630 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
3631
3632
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets an individual request choose its own proxy.

    A request may carry a 'Ytdl-request-proxy' header naming the proxy to use
    for that request only; the header is consumed here. SOCKS proxies are not
    handled by this class: the request is tagged with a 'Ytdl-socks-proxy'
    header and left for other handlers to act on.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        # Each default calls proxy_open() with the '__noproxy__' placeholder.
        # `type` and `meth` are bound as lambda defaults so every handler
        # keeps its own scheme instead of the loop variable's final value.
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        # NOTE(review): the base initializer runs after the defaults are
        # installed, presumably so that handlers it sets up for configured
        # proxies replace them - confirm against urllib's ProxyHandler.
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Resolve the proxy for `req` and delegate to the base class.

        Returns None (no proxying done by this handler) when the effective
        proxy is the '__noproxy__' placeholder or a SOCKS URL.
        """
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # The per-request header overrides the handler's proxy and is
            # removed from the request's headers.
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers wrap the socket with SOCKS themselves
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5bc880b9
YCH
3656
3657
0a5445dd
YCH
3658# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3659# released into Public Domain
3660# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3661
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    If optional blocksize is given and greater than zero, the front of the
    byte string is padded with binary zeros so that the length is a multiple
    of blocksize. Zero (and any non-positive value) encodes as a single
    zero byte before padding.
    """
    n = int(n)
    # Collect 32-bit words, least significant first
    words = []
    while n > 0:
        words.append(compat_struct_pack('>I', n & 0xffffffff))
        n >>= 32
    # Drop the leading zero bytes of the most significant word, but always
    # keep at least one byte
    packed = b''.join(reversed(words)).lstrip(b'\000') or b'\000'
    if blocksize > 0:
        remainder = len(packed) % blocksize
        if remainder:
            packed = (blocksize - remainder) * b'\000' + packed
    return packed
3690
3691
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # Left-pad with zero bytes up to a whole number of 32-bit words
    padding = -len(s) % 4
    if padding:
        s = b'\000' * padding + s
    total = 0
    for offset in range(0, len(s), 4):
        (word,) = compat_struct_unpack('>I', s[offset:offset + 4])
        total = (total << 32) + word
    return total
3707
3708
5bc880b9
YCH
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    The input bytes are read in reverse order before being interpreted as
    a number.

    Limitation: supports one block encryption only
    '''

    reversed_hex = binascii.hexlify(data[::-1])
    plaintext = int(reversed_hex, 16)
    return '%x' % pow(plaintext, exponent, modulus)
81bdc8fd
YCH
3724
3725
f48409c7
YCH
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout ``[0, 2] + padding + [0] + data`` and is
    exactly ``length`` items long.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data leaves room for fewer than 8 padding
                               bytes (len(data) > length - 11)
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Padding bytes must be non-zero: the first zero byte after the 0x02
    # marker is what separates the padding from the message, so a zero inside
    # the padding would make the receiver cut the message in the wrong place.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
3739
3740
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer ``num`` as a string in base ``n``.

    Digits are taken from ``table`` (most significant first in the result);
    when no table is given the first ``n`` characters of
    ``0-9a-zA-Z`` are used.

    Raises ValueError if ``n`` exceeds the table length, or - for a non-zero
    ``num`` - if ``num`` is negative or ``n`` is smaller than 2.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Floor division never brings a negative number to zero, and a base
    # below 2 never shrinks the number, so both would loop forever below.
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError('base %d is too small' % n)

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
f52354a8
YCH
3757
3758
def decode_packed_codes(code):
    """Expand packed JavaScript matched by PACKED_CODES_RE in ``code``.

    The match supplies the obfuscated source, a base, a symbol count and a
    '|'-separated symbol list. Every word of the obfuscated source is
    replaced by the symbol whose index it encodes in the given base; an
    empty symbol stands for the encoded word itself.
    """
    match = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = match.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    # Map each base-n encoded index to its symbol, highest index first
    symbol_table = {}
    while count:
        count -= 1
        encoded_index = encode_base_n(count, base)
        symbol_table[encoded_index] = symbols[count] or encoded_index

    def _substitute(word_match):
        return symbol_table[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', _substitute, obfuscated_code)
e154c651 3775
3776
def parse_m3u8_attributes(attrib):
    """Parse an m3u8 attribute list (``KEY=value,KEY="quoted, value"``)
    into a dict of strings.

    Surrounding double quotes are removed from quoted values; when a key
    occurs more than once the last value wins.
    """
    def _unquote(val):
        return val[1:-1] if val.startswith('"') else val

    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return dict((key, _unquote(val)) for key, val in pairs)
1143535d
YCH
3784
3785
def urshift(val, n):
    """Unsigned (logical) right shift of ``val`` by ``n`` bits, treating a
    negative ``val`` as its 32-bit two's complement value."""
    if val < 0:
        val += 0x100000000
    return val >> n
d3f8e038
YCH
3788
3789
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a PNG file held in memory into rows of raw byte values.

    @param png_data  complete PNG file contents as bytes
    @returns         tuple (width, height, pixels); pixels is a list with one
                     entry per image row, each a list of width * 3 integers
                     (the unfiltered bytes of that row)
    @raises IOError  if the signature or the leading IHDR chunk is missing,
                     or if the file contains no IDAT data

    NOTE(review): only width and height are read from IHDR, and every row is
    treated as width * 3 bytes - i.e. this presumably expects 8-bit RGB,
    non-interlaced images; other formats are not detected. Chunk CRCs are
    skipped, not verified.
    """
    # Reference: https://www.w3.org/TR/PNG/
    # Everything after the 8-byte signature is a sequence of chunks
    header = png_data[8:]

    # The first chunk's type field (bytes 4-8 of the chunk) must be IHDR
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Big-endian unsigned formats keyed by field width in bytes; a slice of
    # any other length raises KeyError
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Chunk layout: 4-byte length, 4-byte type, `length` data bytes, 4-byte CRC
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # Already checked above to be the IHDR chunk
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    # The compressed image data is the concatenation of all IDAT chunks
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Bytes per row, assuming 3 bytes per pixel
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        # idx is a flat index over the already reconstructed bytes
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Each row in the decompressed stream is one filter-type byte
        # followed by `stride` filtered bytes
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        # Appended before it is filled so _get_pixel() can read bytes of the
        # row currently being reconstructed
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # `left` is the byte 3 positions earlier in this row, `up` the
            # byte at the same position in the previous row; both stay 0
            # when they fall outside the image
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            # Any other filter type (including 0) leaves the byte unchanged
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                # c is the byte up and to the left
                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Add whichever of a, b, c is closest to p (ties favour a,
                # then b)
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
efa97bdc
YCH
3895
3896
def write_xattr(path, key, value):
    """Set the extended attribute `key` of the file at `path` to `value`.

    `value` is a byte string: it is written as-is by the Python modules and
    the NTFS fallback, and decoded as UTF-8 before being passed to the
    command line tools.

    Backends are tried in this order:
      1. the `xattr` Python module (either pyxattr or xattr),
      2. if that module cannot be imported: NTFS Alternate Data Streams when
         compat_os_name is 'nt', otherwise the `setfattr` or `xattr`
         executables.

    Raises XAttrUnavailableError when no usable backend is found (or pyxattr
    is too old) and XAttrMetadataError when the chosen backend fails.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                # NOTE: despite the wording of the message, this exception is
                # not caught by the `except ImportError` below, so no other
                # implementation is tried here.
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            # The stream is addressed as "<path>:<key>"
            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                value = value.decode('utf-8')
                # setfattr is preferred when both tools are available
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                # A non-zero exit status is reported with the tool's stderr
                # output as the message
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
0c265486
YCH
3979
3980
def random_birthday(year_field, month_field, day_field):
    """Return a random date between 1950-01-01 and 1995-12-31 (inclusive)
    as a dict mapping the three given field names to the year, month and
    day rendered as strings."""
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, span_days))
    field_names = (year_field, month_field, day_field)
    parts = (birthday.year, birthday.month, birthday.day)
    return dict(zip(field_names, [str(part) for part in parts]))