#!/usr/bin/env python
# coding: utf-8

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import contextlib
import ctypes
import datetime
import email.utils
import email.header
import errno
import functools
import gzip
import io
import itertools
import json
import locale
import math
import operator
import os
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import traceback
import xml.etree.ElementTree
import zlib

from .compat import (
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_basestring,
    compat_chr,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_quote,
    compat_socket_create_connection,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)

# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'

def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref

def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise

if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

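# Illustrative usage of xpath_with_ns (not part of the original module); the
# namespace URL below is an assumption made up for the example:
#   xpath_with_ns('media:song/media:url', {'media': 'http://example.com/ns'})
#   == '{http://example.com/ns}song/{http://example.com/ns}url'
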

def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]

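# Illustrative usage of xpath_text (not part of the original module), assuming
# a small document parsed with compat_etree_fromstring:
#   doc = compat_etree_fromstring('<root><title>foo</title></root>')
#   xpath_text(doc, 'title')                  == 'foo'
#   xpath_text(doc, 'missing', default=None)  is None
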
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    value = re.escape(value) if escape_value else value

    retlist = []
    for m in re.finditer(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html):
        res = m.group('content')

        if res.startswith('"') or res.startswith("'"):
            res = res[1:-1]

        retlist.append(unescapeHTML(res))

    return retlist

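# Illustrative usage of get_element_by_class (not part of the original module):
#   get_element_by_class('foo', '<div class="foo bar">nice</div>') == 'nice'
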
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs

def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

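# Illustrative usage of clean_html (not part of the original module):
#   clean_html('<p>Line one<br/>Line two</p>') == 'Line one\nLine two'
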

def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp

def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

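# Illustrative examples for sanitize_filename (not part of the original module):
#   sanitize_filename('A/B')                    == 'A_B'
#   sanitize_filename('12:34')                  == '12_34'   # timestamps keep ':' as '_'
#   sanitize_filename('aäb', restricted=True)   == 'aab'
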
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)

def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/rg3/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url

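# Illustrative examples for sanitize_url (not part of the original module):
#   sanitize_url('//example.com/video')      == 'http://example.com/video'
#   sanitize_url('httpss://example.com/x')   == 'https://example.com/x'
#   sanitize_url('rmtpe://example.com/live') == 'rtmpe://example.com/live'
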
def sanitized_Request(url, *args, **kwargs):
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res

def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)

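# Illustrative examples for unescapeHTML (not part of the original module):
#   unescapeHTML('&amp;')  == '&'
#   unescapeHTML('&#39;')  == "'"
#   unescapeHTML('&#x2F;') == '/'
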
def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


def formatSeconds(secs):
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs

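# Illustrative examples for formatSeconds (not part of the original module):
#   formatSeconds(45)   == '45'
#   formatSeconds(125)  == '2:05'
#   formatSeconds(3725) == '1:02:05'
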
def make_HTTPS_handler(params, **kwargs):
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message():
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg

class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    pass


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """
    def __init__(self, msg, countries=None):
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        self.msg = msg
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg


class MaxDownloadsReached(YoutubeDLError):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected

class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass

def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        filter_for = socket.AF_INET if '.' in source_address else socket.AF_INET6
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            ip_addrs = [addr for addr in addrs if addr[0] == filter_for]
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise socket.error('Unknown error occurred')
        hc._create_connection = _create_connection

        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc

def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers

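# Illustrative example for handle_youtubedl_headers (not part of the original
# module): the internal marker header and any Accept-Encoding header get dropped:
#   handle_youtubedl_headers({'Youtubedl-no-compression': 'True',
#                             'Accept-Encoding': 'gzip'}) == {}
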
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response

def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection

class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response

def extract_timezone(date_str):
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass


def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST

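# Illustrative examples for parse_iso8601 (not part of the original module):
#   parse_iso8601('1970-01-01T00:00:10Z')      == 10
#   parse_iso8601('1970-01-01T01:00:00+01:00') == 0
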
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)

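# Illustrative examples for unified_strdate (not part of the original module);
# note that numeric dates are interpreted day-first by default:
#   unified_strdate('Dec 14, 2012') == '20121214'
#   unified_strdate('8/7/2009')     == '20090708'
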
def unified_timestamp(date_str, day_first=True):
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600

def determine_ext(url, default_ext='unknown_video'):
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext

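# Illustrative examples for determine_ext (not part of the original module):
#   determine_ext('http://example.com/a/b.mp4?download=1') == 'mp4'
#   determine_ext('http://example.com/a/b.mp4/?download')  == 'mp4'
#   determine_ext('http://example.com/a/b')                == 'unknown_video'
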
def subtitles_filename(filename, sub_lang, sub_format):
    return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format

def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str

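# Illustrative examples for date_from_str and hyphenate_date (not part of the
# original module):
#   date_from_str('20200229')  == datetime.date(2020, 2, 29)
#   date_from_str('now-1week') == datetime.date.today() - datetime.timedelta(weeks=1)
#   hyphenate_date('20141221') == '2014-12-21'
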
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())

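# Illustrative usage of DateRange (not part of the original module):
#   ('20200115' in DateRange('20200101', '20200131'))  -> True
#   ('20200201' in DateRange('20200101', '20200131'))  -> False
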
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res


def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True


def write_string(s, out=None, encoding=None):
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()

def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)

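# Illustrative examples for the two helpers above (not part of the original module):
#   bytes_to_intlist(b'abc')       == [97, 98, 99]
#   intlist_to_bytes([97, 98, 99]) == b'abc'
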
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)


class locked_file(object):
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)

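# Illustrative usage of locked_file (not part of the original module); the
# filename is an assumption made up for the example:
#   with locked_file('state.json', 'w', encoding='utf-8') as f:
#       f.write('{}')
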
def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'


def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)

1595def smuggle_url(url, data):
1596 """ Pass additional data in a URL for internal use. """
1597
81953d1a
RA
1598 url, idata = unsmuggle_url(url, {})
1599 data.update(idata)
15707c7e 1600 sdata = compat_urllib_parse_urlencode(
28e614de
PH
1601 {'__youtubedl_smuggle': json.dumps(data)})
1602 return url + '#' + sdata
9d4660ca
PH
1603
1604
79f82953 1605def unsmuggle_url(smug_url, default=None):
83e865a3 1606 if '#__youtubedl_smuggle' not in smug_url:
79f82953 1607 return smug_url, default
28e614de
PH
1608 url, _, sdata = smug_url.rpartition('#')
1609 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
1610 data = json.loads(jsond)
1611 return url, data
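# Usage sketch (illustrative only; the URL and payload are hypothetical): the
# smuggled data round-trips through the URL fragment.
#
#     >>> url = smuggle_url('http://example.com/video', {'referrer': 'embed'})
#     >>> unsmuggle_url(url)
#     ('http://example.com/video', {'referrer': 'embed'})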
02dbf93f
PH
1612
1613
02dbf93f
PH
1614def format_bytes(bytes):
1615 if bytes is None:
28e614de 1616 return 'N/A'
02dbf93f
PH
1617 if type(bytes) is str:
1618 bytes = float(bytes)
1619 if bytes == 0.0:
1620 exponent = 0
1621 else:
1622 exponent = int(math.log(bytes, 1024.0))
28e614de 1623 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
02dbf93f 1624 converted = float(bytes) / float(1024 ** exponent)
28e614de 1625 return '%.2f%s' % (converted, suffix)
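# Usage sketch (illustrative only): sizes are formatted with binary (1024-based)
# suffixes.
#
#     >>> format_bytes(1536)
#     '1.50KiB'
#     >>> format_bytes(None)
#     'N/A'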
f53c966a 1626
1c088fa8 1627
fb47597b
S
1628def lookup_unit_table(unit_table, s):
1629 units_re = '|'.join(re.escape(u) for u in unit_table)
1630 m = re.match(
782b1b5b 1631 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
fb47597b
S
1632 if not m:
1633 return None
1634 num_str = m.group('num').replace(',', '.')
1635 mult = unit_table[m.group('unit')]
1636 return int(float(num_str) * mult)
1637
1638
be64b5b0
PH
1639def parse_filesize(s):
1640 if s is None:
1641 return None
1642
dfb1b146 1643 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
1644 # but we support those too
1645 _UNIT_TABLE = {
1646 'B': 1,
1647 'b': 1,
70852b47 1648 'bytes': 1,
be64b5b0
PH
1649 'KiB': 1024,
1650 'KB': 1000,
1651 'kB': 1024,
1652 'Kb': 1000,
13585d76 1653 'kb': 1000,
70852b47
YCH
1654 'kilobytes': 1000,
1655 'kibibytes': 1024,
be64b5b0
PH
1656 'MiB': 1024 ** 2,
1657 'MB': 1000 ** 2,
1658 'mB': 1024 ** 2,
1659 'Mb': 1000 ** 2,
13585d76 1660 'mb': 1000 ** 2,
70852b47
YCH
1661 'megabytes': 1000 ** 2,
1662 'mebibytes': 1024 ** 2,
be64b5b0
PH
1663 'GiB': 1024 ** 3,
1664 'GB': 1000 ** 3,
1665 'gB': 1024 ** 3,
1666 'Gb': 1000 ** 3,
13585d76 1667 'gb': 1000 ** 3,
70852b47
YCH
1668 'gigabytes': 1000 ** 3,
1669 'gibibytes': 1024 ** 3,
be64b5b0
PH
1670 'TiB': 1024 ** 4,
1671 'TB': 1000 ** 4,
1672 'tB': 1024 ** 4,
1673 'Tb': 1000 ** 4,
13585d76 1674 'tb': 1000 ** 4,
70852b47
YCH
1675 'terabytes': 1000 ** 4,
1676 'tebibytes': 1024 ** 4,
be64b5b0
PH
1677 'PiB': 1024 ** 5,
1678 'PB': 1000 ** 5,
1679 'pB': 1024 ** 5,
1680 'Pb': 1000 ** 5,
13585d76 1681 'pb': 1000 ** 5,
70852b47
YCH
1682 'petabytes': 1000 ** 5,
1683 'pebibytes': 1024 ** 5,
be64b5b0
PH
1684 'EiB': 1024 ** 6,
1685 'EB': 1000 ** 6,
1686 'eB': 1024 ** 6,
1687 'Eb': 1000 ** 6,
13585d76 1688 'eb': 1000 ** 6,
70852b47
YCH
1689 'exabytes': 1000 ** 6,
1690 'exbibytes': 1024 ** 6,
be64b5b0
PH
1691 'ZiB': 1024 ** 7,
1692 'ZB': 1000 ** 7,
1693 'zB': 1024 ** 7,
1694 'Zb': 1000 ** 7,
13585d76 1695 'zb': 1000 ** 7,
70852b47
YCH
1696 'zettabytes': 1000 ** 7,
1697 'zebibytes': 1024 ** 7,
be64b5b0
PH
1698 'YiB': 1024 ** 8,
1699 'YB': 1000 ** 8,
1700 'yB': 1024 ** 8,
1701 'Yb': 1000 ** 8,
13585d76 1702 'yb': 1000 ** 8,
70852b47
YCH
1703 'yottabytes': 1000 ** 8,
1704 'yobibytes': 1024 ** 8,
be64b5b0
PH
1705 }
1706
fb47597b
S
1707 return lookup_unit_table(_UNIT_TABLE, s)
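# Usage sketch (illustrative only): decimal units (KB, GB, ...) and binary units
# (KiB, GiB, ...) resolve to different byte counts.
#
#     >>> parse_filesize('1.5 GB')
#     1500000000
#     >>> parse_filesize('123 KiB')
#     125952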
1708
1709
1710def parse_count(s):
1711 if s is None:
be64b5b0
PH
1712 return None
1713
fb47597b
S
1714 s = s.strip()
1715
1716 if re.match(r'^[\d,.]+$', s):
1717 return str_to_int(s)
1718
1719 _UNIT_TABLE = {
1720 'k': 1000,
1721 'K': 1000,
1722 'm': 1000 ** 2,
1723 'M': 1000 ** 2,
1724 'kk': 1000 ** 2,
1725 'KK': 1000 ** 2,
1726 }
be64b5b0 1727
fb47597b 1728 return lookup_unit_table(_UNIT_TABLE, s)
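# Usage sketch (illustrative only):
#
#     >>> parse_count('1.2M')
#     1200000
#     >>> parse_count('1,234')
#     1234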
be64b5b0 1729
2f7ae819 1730
b871d7e9
S
1731def parse_resolution(s):
1732 if s is None:
1733 return {}
1734
1735 mobj = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
1736 if mobj:
1737 return {
1738 'width': int(mobj.group('w')),
1739 'height': int(mobj.group('h')),
1740 }
1741
1742 mobj = re.search(r'\b(\d+)[pPiI]\b', s)
1743 if mobj:
1744 return {'height': int(mobj.group(1))}
1745
1746 mobj = re.search(r'\b([48])[kK]\b', s)
1747 if mobj:
1748 return {'height': int(mobj.group(1)) * 540}
1749
1750 return {}
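# Usage sketch (illustrative only; key order in the first result may vary on
# older Pythons):
#
#     >>> parse_resolution('1920x1080') == {'width': 1920, 'height': 1080}
#     True
#     >>> parse_resolution('720p')
#     {'height': 720}
#     >>> parse_resolution('4K')
#     {'height': 2160}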
1751
1752
a942d6cb 1753def month_by_name(name, lang='en'):
caefb1de
PH
1754 """ Return the number of a month by (locale-independently) English name """
1755
f6717dec 1756 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 1757
caefb1de 1758 try:
f6717dec 1759 return month_names.index(name) + 1
7105440c
YCH
1760 except ValueError:
1761 return None
1762
1763
1764def month_by_abbreviation(abbrev):
1765 """ Return the number of a month by (locale-independently) English
1766 abbreviations """
1767
1768 try:
1769 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
1770 except ValueError:
1771 return None
18258362
JMF
1772
1773
5aafe895 1774def fix_xml_ampersands(xml_str):
18258362 1775 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
1776 return re.sub(
1777 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 1778 '&amp;',
5aafe895 1779 xml_str)
e3946f98
PH
1780
1781
1782def setproctitle(title):
8bf48f23 1783 assert isinstance(title, compat_str)
c1c05c67
YCH
1784
1785 # ctypes in Jython is not complete
1786 # http://bugs.jython.org/issue2148
1787 if sys.platform.startswith('java'):
1788 return
1789
e3946f98 1790 try:
611c1dd9 1791 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
1792 except OSError:
1793 return
2f49bcd6
RC
1794 except TypeError:
1795 # LoadLibrary in Windows Python 2.7.13 only expects
1796 # a bytestring, but since unicode_literals turns
1797 # every string into a unicode string, it fails.
1798 return
6eefe533
PH
1799 title_bytes = title.encode('utf-8')
1800 buf = ctypes.create_string_buffer(len(title_bytes))
1801 buf.value = title_bytes
e3946f98 1802 try:
6eefe533 1803 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
1804 except AttributeError:
1805 return # Strange libc, just skip this
d7dda168
PH
1806
1807
1808def remove_start(s, start):
46bc9b7d 1809 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
1810
1811
2b9faf55 1812def remove_end(s, end):
46bc9b7d 1813 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
1814
1815
31b2051e
S
1816def remove_quotes(s):
1817 if s is None or len(s) < 2:
1818 return s
1819 for quote in ('"', "'", ):
1820 if s[0] == quote and s[-1] == quote:
1821 return s[1:-1]
1822 return s
1823
1824
29eb5174 1825def url_basename(url):
9b8aaeed 1826 path = compat_urlparse.urlparse(url).path
28e614de 1827 return path.strip('/').split('/')[-1]
aa94a6d3
PH
1828
1829
02dc0a36
S
1830def base_url(url):
1831 return re.match(r'https?://[^?#&]+/', url).group()
1832
1833
e34c3361 1834def urljoin(base, path):
4b5de77b
S
1835 if isinstance(path, bytes):
1836 path = path.decode('utf-8')
e34c3361
S
1837 if not isinstance(path, compat_str) or not path:
1838 return None
b0c65c67 1839 if re.match(r'^(?:https?:)?//', path):
e34c3361 1840 return path
4b5de77b
S
1841 if isinstance(base, bytes):
1842 base = base.decode('utf-8')
1843 if not isinstance(base, compat_str) or not re.match(
1844 r'^(?:https?:)?//', base):
e34c3361
S
1845 return None
1846 return compat_urlparse.urljoin(base, path)
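# Usage sketch (illustrative only; the URLs are hypothetical): relative paths are
# resolved against the base, while scheme-relative and absolute URLs pass through.
#
#     >>> urljoin('https://example.com/a/', 'b.mp4')
#     'https://example.com/a/b.mp4'
#     >>> urljoin('https://example.com/a/', '//cdn.example.com/b.mp4')
#     '//cdn.example.com/b.mp4'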
1847
1848
aa94a6d3
PH
1849class HEADRequest(compat_urllib_request.Request):
1850 def get_method(self):
611c1dd9 1851 return 'HEAD'
7217e148
PH
1852
1853
95cf60e8
S
1854class PUTRequest(compat_urllib_request.Request):
1855 def get_method(self):
1856 return 'PUT'
1857
1858
9732d77e 1859def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
28746fbd
PH
1860 if get_attr:
1861 if v is not None:
1862 v = getattr(v, get_attr, None)
9572013d
PH
1863 if v == '':
1864 v = None
1812afb7
S
1865 if v is None:
1866 return default
1867 try:
1868 return int(v) * invscale // scale
1869 except ValueError:
af98f8ff 1870 return default
9732d77e 1871
9572013d 1872
40a90862
JMF
1873def str_or_none(v, default=None):
1874 return default if v is None else compat_str(v)
1875
9732d77e
PH
1876
1877def str_to_int(int_str):
48d4681e 1878 """ A more relaxed version of int_or_none """
9732d77e
PH
1879 if int_str is None:
1880 return None
28e614de 1881 int_str = re.sub(r'[,\.\+]', '', int_str)
9732d77e 1882 return int(int_str)
608d11f5
PH
1883
1884
9732d77e 1885def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
1886 if v is None:
1887 return default
1888 try:
1889 return float(v) * invscale / scale
1890 except ValueError:
1891 return default
43f775e4
PH
1892
1893
c7e327c4
S
1894def bool_or_none(v, default=None):
1895 return v if isinstance(v, bool) else default
1896
1897
b72b4431
S
1898def strip_or_none(v):
1899 return None if v is None else v.strip()
1900
1901
af03000a
S
1902def url_or_none(url):
1903 if not url or not isinstance(url, compat_str):
1904 return None
1905 url = url.strip()
1906 return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None
1907
1908
608d11f5 1909def parse_duration(s):
8f9312c3 1910 if not isinstance(s, compat_basestring):
608d11f5
PH
1911 return None
1912
ca7b3246
S
1913 s = s.strip()
1914
acaff495 1915 days, hours, mins, secs, ms = [None] * 5
15846398 1916 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
acaff495 1917 if m:
1918 days, hours, mins, secs, ms = m.groups()
1919 else:
1920 m = re.match(
056653bb
S
1921 r'''(?ix)(?:P?
1922 (?:
1923 [0-9]+\s*y(?:ears?)?\s*
1924 )?
1925 (?:
1926 [0-9]+\s*m(?:onths?)?\s*
1927 )?
1928 (?:
1929 [0-9]+\s*w(?:eeks?)?\s*
1930 )?
8f4b58d7 1931 (?:
acaff495 1932 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
8f4b58d7 1933 )?
056653bb 1934 T)?
acaff495 1935 (?:
1936 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1937 )?
1938 (?:
1939 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1940 )?
1941 (?:
1942 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 1943 )?Z?$''', s)
acaff495 1944 if m:
1945 days, hours, mins, secs, ms = m.groups()
1946 else:
15846398 1947 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 1948 if m:
1949 hours, mins = m.groups()
1950 else:
1951 return None
1952
1953 duration = 0
1954 if secs:
1955 duration += float(secs)
1956 if mins:
1957 duration += float(mins) * 60
1958 if hours:
1959 duration += float(hours) * 60 * 60
1960 if days:
1961 duration += float(days) * 24 * 60 * 60
1962 if ms:
1963 duration += float(ms)
1964 return duration
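# Usage sketch (illustrative only): both clock-style and unit-style durations are
# accepted and returned as seconds.
#
#     >>> parse_duration('1:02:03')
#     3723.0
#     >>> parse_duration('1h30m')
#     5400.0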
91d7d0b3
JMF
1965
1966
e65e4c88 1967def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 1968 name, real_ext = os.path.splitext(filename)
e65e4c88
S
1969 return (
1970 '{0}.{1}{2}'.format(name, ext, real_ext)
1971 if not expected_real_ext or real_ext[1:] == expected_real_ext
1972 else '{0}.{1}'.format(filename, ext))
d70ad093
PH
1973
1974
b3ed15b7
S
1975def replace_extension(filename, ext, expected_real_ext=None):
1976 name, real_ext = os.path.splitext(filename)
1977 return '{0}.{1}'.format(
1978 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1979 ext)
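# Usage sketch (illustrative only; the filenames are hypothetical):
#
#     >>> prepend_extension('video.mp4', 'temp')
#     'video.temp.mp4'
#     >>> replace_extension('video.mp4', 'mkv')
#     'video.mkv'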
1980
1981
d70ad093
PH
1982def check_executable(exe, args=[]):
1983 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1984 args can be a list of arguments for a short output (like -version) """
1985 try:
1986 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1987 except OSError:
1988 return False
1989 return exe
b7ab0590
PH
1990
1991
95807118 1992def get_exe_version(exe, args=['--version'],
cae97f65 1993 version_re=None, unrecognized='present'):
95807118
PH
1994 """ Returns the version of the specified executable,
1995 or False if the executable is not present """
1996 try:
b64d04c1
YCH
1997 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1998 # SIGTTOU if youtube-dl is run in the background.
1999 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
cae97f65 2000 out, _ = subprocess.Popen(
54116803 2001 [encodeArgument(exe)] + args,
00ca7552 2002 stdin=subprocess.PIPE,
95807118
PH
2003 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
2004 except OSError:
2005 return False
cae97f65
PH
2006 if isinstance(out, bytes): # Python 2.x
2007 out = out.decode('ascii', 'ignore')
2008 return detect_exe_version(out, version_re, unrecognized)
2009
2010
2011def detect_exe_version(output, version_re=None, unrecognized='present'):
2012 assert isinstance(output, compat_str)
2013 if version_re is None:
2014 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2015 m = re.search(version_re, output)
95807118
PH
2016 if m:
2017 return m.group(1)
2018 else:
2019 return unrecognized
2020
2021
b7ab0590 2022class PagedList(object):
dd26ced1
PH
2023 def __len__(self):
2024 # This is only useful for tests
2025 return len(self.getslice())
2026
9c44d242
PH
2027
2028class OnDemandPagedList(PagedList):
6be08ce6 2029 def __init__(self, pagefunc, pagesize, use_cache=True):
9c44d242
PH
2030 self._pagefunc = pagefunc
2031 self._pagesize = pagesize
b95dc034
YCH
2032 self._use_cache = use_cache
2033 if use_cache:
2034 self._cache = {}
9c44d242 2035
b7ab0590
PH
2036 def getslice(self, start=0, end=None):
2037 res = []
2038 for pagenum in itertools.count(start // self._pagesize):
2039 firstid = pagenum * self._pagesize
2040 nextfirstid = pagenum * self._pagesize + self._pagesize
2041 if start >= nextfirstid:
2042 continue
2043
b95dc034
YCH
2044 page_results = None
2045 if self._use_cache:
2046 page_results = self._cache.get(pagenum)
2047 if page_results is None:
2048 page_results = list(self._pagefunc(pagenum))
2049 if self._use_cache:
2050 self._cache[pagenum] = page_results
b7ab0590
PH
2051
2052 startv = (
2053 start % self._pagesize
2054 if firstid <= start < nextfirstid
2055 else 0)
2056
2057 endv = (
2058 ((end - 1) % self._pagesize) + 1
2059 if (end is not None and firstid <= end <= nextfirstid)
2060 else None)
2061
2062 if startv != 0 or endv is not None:
2063 page_results = page_results[startv:endv]
2064 res.extend(page_results)
2065
2066 # A little optimization - if the current page is not "full", i.e. does
2067 # not contain page_size videos, then we can assume that this page
2068 # is the last one - there are no more ids on further pages -
2069 # i.e. no need to query again.
2070 if len(page_results) + startv < self._pagesize:
2071 break
2072
2073 # If we got the whole page, but the next page is not interesting,
2074 # break out early as well
2075 if end == nextfirstid:
2076 break
2077 return res
81c2f20b
PH
2078
2079
9c44d242
PH
2080class InAdvancePagedList(PagedList):
2081 def __init__(self, pagefunc, pagecount, pagesize):
2082 self._pagefunc = pagefunc
2083 self._pagecount = pagecount
2084 self._pagesize = pagesize
2085
2086 def getslice(self, start=0, end=None):
2087 res = []
2088 start_page = start // self._pagesize
2089 end_page = (
2090 self._pagecount if end is None else (end // self._pagesize + 1))
2091 skip_elems = start - start_page * self._pagesize
2092 only_more = None if end is None else end - start
2093 for pagenum in range(start_page, end_page):
2094 page = list(self._pagefunc(pagenum))
2095 if skip_elems:
2096 page = page[skip_elems:]
2097 skip_elems = None
2098 if only_more is not None:
2099 if len(page) < only_more:
2100 only_more -= len(page)
2101 else:
2102 page = page[:only_more]
2103 res.extend(page)
2104 break
2105 res.extend(page)
2106 return res
2107
2108
81c2f20b 2109def uppercase_escape(s):
676eb3f2 2110 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2111 return re.sub(
a612753d 2112 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2113 lambda m: unicode_escape(m.group(0))[0],
2114 s)
0fe2ff78
YCH
2115
2116
2117def lowercase_escape(s):
2118 unicode_escape = codecs.getdecoder('unicode_escape')
2119 return re.sub(
2120 r'\\u[0-9a-fA-F]{4}',
2121 lambda m: unicode_escape(m.group(0))[0],
2122 s)
b53466e1 2123
d05cfe06
S
2124
2125def escape_rfc3986(s):
2126 """Escape non-ASCII characters as suggested by RFC 3986"""
8f9312c3 2127 if sys.version_info < (3, 0) and isinstance(s, compat_str):
d05cfe06 2128 s = s.encode('utf-8')
ecc0c5ee 2129 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2130
2131
2132def escape_url(url):
2133 """Escape URL as suggested by RFC 3986"""
2134 url_parsed = compat_urllib_parse_urlparse(url)
2135 return url_parsed._replace(
efbed08d 2136 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
2137 path=escape_rfc3986(url_parsed.path),
2138 params=escape_rfc3986(url_parsed.params),
2139 query=escape_rfc3986(url_parsed.query),
2140 fragment=escape_rfc3986(url_parsed.fragment)
2141 ).geturl()
2142
62e609ab
PH
2143
2144def read_batch_urls(batch_fd):
2145 def fixup(url):
2146 if not isinstance(url, compat_str):
2147 url = url.decode('utf-8', 'replace')
28e614de 2148 BOM_UTF8 = '\xef\xbb\xbf'
62e609ab
PH
2149 if url.startswith(BOM_UTF8):
2150 url = url[len(BOM_UTF8):]
2151 url = url.strip()
2152 if url.startswith(('#', ';', ']')):
2153 return False
2154 return url
2155
2156 with contextlib.closing(batch_fd) as fd:
2157 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2158
2159
2160def urlencode_postdata(*args, **kargs):
15707c7e 2161 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2162
2163
38f9ef31 2164def update_url_query(url, query):
cacd9966
YCH
2165 if not query:
2166 return url
38f9ef31 2167 parsed_url = compat_urlparse.urlparse(url)
2168 qs = compat_parse_qs(parsed_url.query)
2169 qs.update(query)
2170 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 2171 query=compat_urllib_parse_urlencode(qs, True)))
16392824 2172
8e60dc75 2173
ed0291d1
S
2174def update_Request(req, url=None, data=None, headers={}, query={}):
2175 req_headers = req.headers.copy()
2176 req_headers.update(headers)
2177 req_data = data or req.data
2178 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
2179 req_get_method = req.get_method()
2180 if req_get_method == 'HEAD':
2181 req_type = HEADRequest
2182 elif req_get_method == 'PUT':
2183 req_type = PUTRequest
2184 else:
2185 req_type = compat_urllib_request.Request
ed0291d1
S
2186 new_req = req_type(
2187 req_url, data=req_data, headers=req_headers,
2188 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2189 if hasattr(req, 'timeout'):
2190 new_req.timeout = req.timeout
2191 return new_req
2192
2193
10c87c15 2194def _multipart_encode_impl(data, boundary):
0c265486
YCH
2195 content_type = 'multipart/form-data; boundary=%s' % boundary
2196
2197 out = b''
2198 for k, v in data.items():
2199 out += b'--' + boundary.encode('ascii') + b'\r\n'
2200 if isinstance(k, compat_str):
2201 k = k.encode('utf-8')
2202 if isinstance(v, compat_str):
2203 v = v.encode('utf-8')
2204 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2205 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 2206 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
2207 if boundary.encode('ascii') in content:
2208 raise ValueError('Boundary overlaps with data')
2209 out += content
2210
2211 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2212
2213 return out, content_type
2214
2215
2216def multipart_encode(data, boundary=None):
2217 '''
2218 Encode a dict to RFC 7578-compliant form-data
2219
2220 data:
2221 A dict where keys and values can be either Unicode or bytes-like
2222 objects.
2223 boundary:
2224 If specified (as a Unicode object), it is used as the boundary. Otherwise
2225 a random boundary is generated.
2226
2227 Reference: https://tools.ietf.org/html/rfc7578
2228 '''
2229 has_specified_boundary = boundary is not None
2230
2231 while True:
2232 if boundary is None:
2233 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2234
2235 try:
10c87c15 2236 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
2237 break
2238 except ValueError:
2239 if has_specified_boundary:
2240 raise
2241 boundary = None
2242
2243 return out, content_type
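# Usage sketch (illustrative only; the boundary and field are hypothetical):
# passing an explicit boundary keeps the output deterministic.
#
#     >>> body, ctype = multipart_encode({'field': 'value'}, boundary='xxx')
#     >>> ctype
#     'multipart/form-data; boundary=xxx'
#     >>> body
#     b'--xxx\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--xxx--\r\n'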
2244
2245
86296ad2 2246def dict_get(d, key_or_keys, default=None, skip_false_values=True):
cbecc9b9
S
2247 if isinstance(key_or_keys, (list, tuple)):
2248 for key in key_or_keys:
86296ad2
S
2249 if key not in d or d[key] is None or skip_false_values and not d[key]:
2250 continue
2251 return d[key]
cbecc9b9
S
2252 return default
2253 return d.get(key_or_keys, default)
2254
2255
329ca3be 2256def try_get(src, getter, expected_type=None):
a32a9a7e
S
2257 if not isinstance(getter, (list, tuple)):
2258 getter = [getter]
2259 for get in getter:
2260 try:
2261 v = get(src)
2262 except (AttributeError, KeyError, TypeError, IndexError):
2263 pass
2264 else:
2265 if expected_type is None or isinstance(v, expected_type):
2266 return v
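# Usage sketch (illustrative only; the dict mimics extractor metadata): getters
# that raise are skipped, and values of the wrong type are ignored.
#
#     >>> meta = {'items': [{'title': 'foo'}]}
#     >>> try_get(meta, lambda x: x['items'][0]['title'], compat_str)
#     'foo'
#     >>> try_get(meta, lambda x: x['missing'][0]) is None
#     True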
329ca3be
S
2267
2268
6cc62232
S
2269def merge_dicts(*dicts):
2270 merged = {}
2271 for a_dict in dicts:
2272 for k, v in a_dict.items():
2273 if v is None:
2274 continue
2275 if (k not in merged or
2276 (isinstance(v, compat_str) and v and
2277 isinstance(merged[k], compat_str) and
2278 not merged[k])):
2279 merged[k] = v
2280 return merged
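# Usage sketch (illustrative only): earlier dicts win, except that empty strings
# may be overwritten by later non-empty strings and None values are skipped.
#
#     >>> merge_dicts({'a': 1, 'b': None, 'c': ''}, {'a': 2, 'b': 3, 'c': 'x'}) == {'a': 1, 'b': 3, 'c': 'x'}
#     True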
2281
2282
8e60dc75
S
2283def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2284 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2285
16392824 2286
a1a530b0
PH
2287US_RATINGS = {
2288 'G': 0,
2289 'PG': 10,
2290 'PG-13': 13,
2291 'R': 16,
2292 'NC': 18,
2293}
fac55558
PH
2294
2295
a8795327 2296TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
2297 'TV-Y': 0,
2298 'TV-Y7': 7,
2299 'TV-G': 0,
2300 'TV-PG': 0,
2301 'TV-14': 14,
2302 'TV-MA': 17,
a8795327
S
2303}
2304
2305
146c80e2 2306def parse_age_limit(s):
a8795327
S
2307 if type(s) == int:
2308 return s if 0 <= s <= 21 else None
2309 if not isinstance(s, compat_basestring):
d838b1bd 2310 return None
146c80e2 2311 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
2312 if m:
2313 return int(m.group('age'))
2314 if s in US_RATINGS:
2315 return US_RATINGS[s]
5a16c9d9 2316 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 2317 if m:
5a16c9d9 2318 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 2319 return None
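# Usage sketch (illustrative only): integers, "NN+" strings, US MPAA ratings and
# TV parental guidelines are all mapped onto an age value.
#
#     >>> parse_age_limit('18+')
#     18
#     >>> parse_age_limit('PG-13')
#     13
#     >>> parse_age_limit('TV-MA')
#     17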
146c80e2
S
2320
2321
fac55558 2322def strip_jsonp(code):
609a61e3 2323 return re.sub(
5552c9eb 2324 r'''(?sx)^
e9c671d5 2325 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
2326 (?:\s*&&\s*(?P=func_name))?
2327 \s*\(\s*(?P<callback_data>.*)\);?
2328 \s*?(?://[^\n]*)*$''',
2329 r'\g<callback_data>', code)
478c2c61
PH
2330
2331
e05f6939 2332def js_to_json(code):
4195096e
S
2333 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
2334 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2335 INTEGER_TABLE = (
2336 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2337 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2338 )
2339
e05f6939 2340 def fix_kv(m):
e7b6d122
PH
2341 v = m.group(0)
2342 if v in ('true', 'false', 'null'):
2343 return v
b3ee552e 2344 elif v.startswith('/*') or v.startswith('//') or v == ',':
bd1e4844 2345 return ""
2346
2347 if v[0] in ("'", '"'):
2348 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 2349 '"': '\\"',
bd1e4844 2350 "\\'": "'",
2351 '\\\n': '',
2352 '\\x': '\\u00',
2353 }.get(m.group(0), m.group(0)), v[1:-1])
2354
89ac4a19
S
2355 for regex, base in INTEGER_TABLE:
2356 im = re.match(regex, v)
2357 if im:
e4659b45 2358 i = int(im.group(1), base)
89ac4a19
S
2359 return '"%d":' % i if v.endswith(':') else '%d' % i
2360
e7b6d122 2361 return '"%s"' % v
e05f6939 2362
bd1e4844 2363 return re.sub(r'''(?sx)
2364 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2365 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4195096e 2366 {comment}|,(?={skip}[\]}}])|
c384d537 2367 (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
4195096e
S
2368 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
2369 [0-9]+(?={skip}:)
2370 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
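# Usage sketch (illustrative only): unquoted keys, single quotes, hexadecimal
# integers and trailing commas are normalized to strict JSON.
#
#     >>> js_to_json("{foo: 'bar', size: 0x10,}")
#     '{"foo": "bar", "size": 16}'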
e05f6939
PH
2371
2372
478c2c61
PH
2373def qualities(quality_ids):
2374 """ Get a numeric quality value out of a list of possible values """
2375 def q(qid):
2376 try:
2377 return quality_ids.index(qid)
2378 except ValueError:
2379 return -1
2380 return q
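# Usage sketch (illustrative only; the quality ids are hypothetical): the returned
# callable maps known ids to their index (higher is better) and unknown ids to -1.
#
#     >>> q = qualities(['small', 'medium', 'hd720'])
#     >>> q('medium')
#     1
#     >>> q('unknown')
#     -1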
2381
acd69589
PH
2382
2383DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2384
a020a0dc
PH
2385
2386def limit_length(s, length):
2387 """ Add ellipses to overly long strings """
2388 if s is None:
2389 return None
2390 ELLIPSES = '...'
2391 if len(s) > length:
2392 return s[:length - len(ELLIPSES)] + ELLIPSES
2393 return s
48844745
PH
2394
2395
2396def version_tuple(v):
5f9b8394 2397 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
2398
2399
2400def is_outdated_version(version, limit, assume_new=True):
2401 if not version:
2402 return not assume_new
2403 try:
2404 return version_tuple(version) < version_tuple(limit)
2405 except ValueError:
2406 return not assume_new
732ea2f0
PH
2407
2408
2409def ytdl_is_updateable():
2410 """ Returns if youtube-dl can be updated with -U """
2411 from zipimport import zipimporter
2412
2413 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
7d4111ed
PH
2414
2415
2416def args_to_str(args):
2417 # Get a short string representation for a subprocess command
702ccf2d 2418 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
2419
2420
9b9c5355 2421def error_to_compat_str(err):
fdae2358
S
2422 err_str = str(err)
2423 # On Python 2 the error byte string must be decoded with the proper
2424 # encoding rather than ascii
2425 if sys.version_info[0] < 3:
2426 err_str = err_str.decode(preferredencoding())
2427 return err_str
2428
2429
c460bdd5 2430def mimetype2ext(mt):
eb9ee194
S
2431 if mt is None:
2432 return None
2433
765ac263
JMF
2434 ext = {
2435 'audio/mp4': 'm4a',
6c33d24b
YCH
2436 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2437 # it's the most popular one
2438 'audio/mpeg': 'mp3',
765ac263
JMF
2439 }.get(mt)
2440 if ext is not None:
2441 return ext
2442
c460bdd5 2443 _, _, res = mt.rpartition('/')
6562d34a 2444 res = res.split(';')[0].strip().lower()
c460bdd5
PH
2445
2446 return {
f6861ec9 2447 '3gpp': '3gp',
cafcf657 2448 'smptett+xml': 'tt',
cafcf657 2449 'ttaf+xml': 'dfxp',
a0d8d704 2450 'ttml+xml': 'ttml',
f6861ec9 2451 'x-flv': 'flv',
a0d8d704 2452 'x-mp4-fragmented': 'mp4',
d4f05d47 2453 'x-ms-sami': 'sami',
a0d8d704 2454 'x-ms-wmv': 'wmv',
b4173f15
RA
2455 'mpegurl': 'm3u8',
2456 'x-mpegurl': 'm3u8',
2457 'vnd.apple.mpegurl': 'm3u8',
2458 'dash+xml': 'mpd',
b4173f15 2459 'f4m+xml': 'f4m',
f164b971 2460 'hds+xml': 'f4m',
e910fe2f 2461 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 2462 'quicktime': 'mov',
98ce1a3f 2463 'mp2t': 'ts',
c460bdd5
PH
2464 }.get(res, res)
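# Usage sketch (illustrative only): parameters after ';' are ignored and unknown
# subtypes fall through unchanged.
#
#     >>> mimetype2ext('application/x-mpegURL')
#     'm3u8'
#     >>> mimetype2ext('video/mp4; codecs="avc1.42E01E"')
#     'mp4'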
2465
2466
4f3c5e06 2467def parse_codecs(codecs_str):
2468 # http://tools.ietf.org/html/rfc6381
2469 if not codecs_str:
2470 return {}
2471 splited_codecs = list(filter(None, map(
2472 lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
2473 vcodec, acodec = None, None
2474 for full_codec in splited_codecs:
2475 codec = full_codec.split('.')[0]
ffe6979e 2476 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1'):
4f3c5e06 2477 if not vcodec:
2478 vcodec = full_codec
60f5c9fb 2479 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
4f3c5e06 2480 if not acodec:
2481 acodec = full_codec
2482 else:
60f5c9fb 2483 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
4f3c5e06 2484 if not vcodec and not acodec:
2485 if len(splited_codecs) == 2:
2486 return {
2487 'vcodec': vcodec,
2488 'acodec': acodec,
2489 }
2490 elif len(splited_codecs) == 1:
2491 return {
2492 'vcodec': 'none',
2493 'acodec': vcodec,
2494 }
2495 else:
2496 return {
2497 'vcodec': vcodec or 'none',
2498 'acodec': acodec or 'none',
2499 }
2500 return {}
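# Usage sketch (illustrative only): the first video and audio codec found in an
# RFC 6381 codecs string are picked out.
#
#     >>> parse_codecs('avc1.64001f, mp4a.40.2') == {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2'}
#     True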
2501
2502
2ccd1b10 2503def urlhandle_detect_ext(url_handle):
79298173 2504 getheader = url_handle.headers.get
2ccd1b10 2505
b55ee18f
PH
2506 cd = getheader('Content-Disposition')
2507 if cd:
2508 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2509 if m:
2510 e = determine_ext(m.group('filename'), default_ext=None)
2511 if e:
2512 return e
2513
c460bdd5 2514 return mimetype2ext(getheader('Content-Type'))
05900629
PH
2515
2516
1e399778
YCH
2517def encode_data_uri(data, mime_type):
2518 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
2519
2520
05900629 2521def age_restricted(content_limit, age_limit):
6ec6cb4e 2522 """ Returns True iff the content should be blocked """
05900629
PH
2523
2524 if age_limit is None: # No limit set
2525 return False
2526 if content_limit is None:
2527 return False # Content available for everyone
2528 return age_limit < content_limit
61ca9a80
PH
2529
2530
2531def is_html(first_bytes):
2532 """ Detect whether a file contains HTML by examining its first bytes. """
2533
2534 BOMS = [
2535 (b'\xef\xbb\xbf', 'utf-8'),
2536 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2537 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2538 (b'\xff\xfe', 'utf-16-le'),
2539 (b'\xfe\xff', 'utf-16-be'),
2540 ]
2541 for bom, enc in BOMS:
2542 if first_bytes.startswith(bom):
2543 s = first_bytes[len(bom):].decode(enc, 'replace')
2544 break
2545 else:
2546 s = first_bytes.decode('utf-8', 'replace')
2547
2548 return re.match(r'^\s*<', s)
a055469f
PH
2549
2550
2551def determine_protocol(info_dict):
2552 protocol = info_dict.get('protocol')
2553 if protocol is not None:
2554 return protocol
2555
2556 url = info_dict['url']
2557 if url.startswith('rtmp'):
2558 return 'rtmp'
2559 elif url.startswith('mms'):
2560 return 'mms'
2561 elif url.startswith('rtsp'):
2562 return 'rtsp'
2563
2564 ext = determine_ext(url)
2565 if ext == 'm3u8':
2566 return 'm3u8'
2567 elif ext == 'f4m':
2568 return 'f4m'
2569
2570 return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2571
2572
2573def render_table(header_row, data):
2574 """ Render a list of rows, each as a list of values """
2575 table = [header_row] + data
2576 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2577 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2578 return '\n'.join(format_str % tuple(row) for row in table)
347de493
PH
2579
2580
2581def _match_one(filter_part, dct):
2582 COMPARISON_OPERATORS = {
2583 '<': operator.lt,
2584 '<=': operator.le,
2585 '>': operator.gt,
2586 '>=': operator.ge,
2587 '=': operator.eq,
2588 '!=': operator.ne,
2589 }
2590 operator_rex = re.compile(r'''(?x)\s*
2591 (?P<key>[a-z_]+)
2592 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2593 (?:
2594 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
db13c16e 2595 (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
347de493
PH
2596 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2597 )
2598 \s*$
2599 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2600 m = operator_rex.search(filter_part)
2601 if m:
2602 op = COMPARISON_OPERATORS[m.group('op')]
e5a088dc 2603 actual_value = dct.get(m.group('key'))
db13c16e
S
2604 if (m.group('quotedstrval') is not None or
2605 m.group('strval') is not None or
e5a088dc
S
2606 # If the original field is a string and matching comparisonvalue is
2607 # a number we should respect the origin of the original field
2608 # and process comparison value as a string (see
2609 # https://github.com/rg3/youtube-dl/issues/11082).
2610 actual_value is not None and m.group('intval') is not None and
2611 isinstance(actual_value, compat_str)):
347de493
PH
2612 if m.group('op') not in ('=', '!='):
2613 raise ValueError(
2614 'Operator %s does not support string values!' % m.group('op'))
db13c16e
S
2615 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
2616 quote = m.group('quote')
2617 if quote is not None:
2618 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
347de493
PH
2619 else:
2620 try:
2621 comparison_value = int(m.group('intval'))
2622 except ValueError:
2623 comparison_value = parse_filesize(m.group('intval'))
2624 if comparison_value is None:
2625 comparison_value = parse_filesize(m.group('intval') + 'B')
2626 if comparison_value is None:
2627 raise ValueError(
2628 'Invalid integer value %r in filter part %r' % (
2629 m.group('intval'), filter_part))
347de493
PH
2630 if actual_value is None:
2631 return m.group('none_inclusive')
2632 return op(actual_value, comparison_value)
2633
2634 UNARY_OPERATORS = {
1cc47c66
S
2635 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
2636 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493
PH
2637 }
2638 operator_rex = re.compile(r'''(?x)\s*
2639 (?P<op>%s)\s*(?P<key>[a-z_]+)
2640 \s*$
2641 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2642 m = operator_rex.search(filter_part)
2643 if m:
2644 op = UNARY_OPERATORS[m.group('op')]
2645 actual_value = dct.get(m.group('key'))
2646 return op(actual_value)
2647
2648 raise ValueError('Invalid filter part %r' % filter_part)
2649
2650
2651def match_str(filter_str, dct):
2652 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2653
2654 return all(
2655 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
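# Usage sketch (illustrative only; the dict mimics an info_dict and is
# hypothetical): '&'-separated clauses must all hold.
#
#     >>> match_str('like_count > 100 & uploader = someone', {'like_count': 190, 'uploader': 'someone'})
#     True
#     >>> match_str('!is_live', {'is_live': True})
#     False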
2656
2657
2658def match_filter_func(filter_str):
2659 def _match_func(info_dict):
2660 if match_str(filter_str, info_dict):
2661 return None
2662 else:
2663 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2664 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2665 return _match_func
91410c9b
PH
2666
2667
bf6427d2
YCH
2668def parse_dfxp_time_expr(time_expr):
2669 if not time_expr:
d631d5f9 2670 return
bf6427d2
YCH
2671
2672 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2673 if mobj:
2674 return float(mobj.group('time_offset'))
2675
db2fe38b 2676 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 2677 if mobj:
db2fe38b 2678 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
2679
2680
c1c924ab
YCH
2681def srt_subtitles_timecode(seconds):
2682 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
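# Usage sketch (illustrative only):
#
#     >>> srt_subtitles_timecode(3661.5)
#     '01:01:01,500'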
bf6427d2
YCH
2683
2684
2685def dfxp2srt(dfxp_data):
3869028f
YCH
2686 '''
2687 @param dfxp_data A bytes-like object containing DFXP data
2688 @returns A unicode object containing converted SRT data
2689 '''
5b995f71 2690 LEGACY_NAMESPACES = (
3869028f
YCH
2691 (b'http://www.w3.org/ns/ttml', [
2692 b'http://www.w3.org/2004/11/ttaf1',
2693 b'http://www.w3.org/2006/04/ttaf1',
2694 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 2695 ]),
3869028f
YCH
2696 (b'http://www.w3.org/ns/ttml#styling', [
2697 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
2698 ]),
2699 )
2700
2701 SUPPORTED_STYLING = [
2702 'color',
2703 'fontFamily',
2704 'fontSize',
2705 'fontStyle',
2706 'fontWeight',
2707 'textDecoration'
2708 ]
2709
4e335771 2710 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 2711 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 2712 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 2713 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 2714 })
bf6427d2 2715
5b995f71
RA
2716 styles = {}
2717 default_style = {}
2718
87de7069 2719 class TTMLPElementParser(object):
5b995f71
RA
2720 _out = ''
2721 _unclosed_elements = []
2722 _applied_styles = []
bf6427d2 2723
2b14cb56 2724 def start(self, tag, attrib):
5b995f71
RA
2725 if tag in (_x('ttml:br'), 'br'):
2726 self._out += '\n'
2727 else:
2728 unclosed_elements = []
2729 style = {}
2730 element_style_id = attrib.get('style')
2731 if default_style:
2732 style.update(default_style)
2733 if element_style_id:
2734 style.update(styles.get(element_style_id, {}))
2735 for prop in SUPPORTED_STYLING:
2736 prop_val = attrib.get(_x('tts:' + prop))
2737 if prop_val:
2738 style[prop] = prop_val
2739 if style:
2740 font = ''
2741 for k, v in sorted(style.items()):
2742 if self._applied_styles and self._applied_styles[-1].get(k) == v:
2743 continue
2744 if k == 'color':
2745 font += ' color="%s"' % v
2746 elif k == 'fontSize':
2747 font += ' size="%s"' % v
2748 elif k == 'fontFamily':
2749 font += ' face="%s"' % v
2750 elif k == 'fontWeight' and v == 'bold':
2751 self._out += '<b>'
2752 unclosed_elements.append('b')
2753 elif k == 'fontStyle' and v == 'italic':
2754 self._out += '<i>'
2755 unclosed_elements.append('i')
2756 elif k == 'textDecoration' and v == 'underline':
2757 self._out += '<u>'
2758 unclosed_elements.append('u')
2759 if font:
2760 self._out += '<font' + font + '>'
2761 unclosed_elements.append('font')
2762 applied_style = {}
2763 if self._applied_styles:
2764 applied_style.update(self._applied_styles[-1])
2765 applied_style.update(style)
2766 self._applied_styles.append(applied_style)
2767 self._unclosed_elements.append(unclosed_elements)
bf6427d2 2768
2b14cb56 2769 def end(self, tag):
5b995f71
RA
2770 if tag not in (_x('ttml:br'), 'br'):
2771 unclosed_elements = self._unclosed_elements.pop()
2772 for element in reversed(unclosed_elements):
2773 self._out += '</%s>' % element
2774 if unclosed_elements and self._applied_styles:
2775 self._applied_styles.pop()
bf6427d2 2776
2b14cb56 2777 def data(self, data):
5b995f71 2778 self._out += data
2b14cb56 2779
2780 def close(self):
5b995f71 2781 return self._out.strip()
2b14cb56 2782
2783 def parse_node(node):
2784 target = TTMLPElementParser()
2785 parser = xml.etree.ElementTree.XMLParser(target=target)
2786 parser.feed(xml.etree.ElementTree.tostring(node))
2787 return parser.close()
bf6427d2 2788
5b995f71
RA
2789 for k, v in LEGACY_NAMESPACES:
2790 for ns in v:
2791 dfxp_data = dfxp_data.replace(ns, k)
2792
3869028f 2793 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 2794 out = []
5b995f71 2795 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
2796
2797 if not paras:
2798 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 2799
5b995f71
RA
2800 repeat = False
2801 while True:
2802 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
2803 style_id = style.get('id') or style.get(_x('xml:id'))
2804 if not style_id:
2805 continue
5b995f71
RA
2806 parent_style_id = style.get('style')
2807 if parent_style_id:
2808 if parent_style_id not in styles:
2809 repeat = True
2810 continue
2811 styles[style_id] = styles[parent_style_id].copy()
2812 for prop in SUPPORTED_STYLING:
2813 prop_val = style.get(_x('tts:' + prop))
2814 if prop_val:
2815 styles.setdefault(style_id, {})[prop] = prop_val
2816 if repeat:
2817 repeat = False
2818 else:
2819 break
2820
2821 for p in ('body', 'div'):
2822 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
2823 if ele is None:
2824 continue
2825 style = styles.get(ele.get('style'))
2826 if not style:
2827 continue
2828 default_style.update(style)
2829
bf6427d2 2830 for para, index in zip(paras, itertools.count(1)):
d631d5f9 2831 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 2832 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
2833 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2834 if begin_time is None:
2835 continue
7dff0363 2836 if not end_time:
d631d5f9
YCH
2837 if not dur:
2838 continue
2839 end_time = begin_time + dur
bf6427d2
YCH
2840 out.append('%d\n%s --> %s\n%s\n\n' % (
2841 index,
c1c924ab
YCH
2842 srt_subtitles_timecode(begin_time),
2843 srt_subtitles_timecode(end_time),
bf6427d2
YCH
2844 parse_node(para)))
2845
2846 return ''.join(out)
2847
2848
66e289ba
S
2849def cli_option(params, command_option, param):
2850 param = params.get(param)
98e698f1
RA
2851 if param:
2852 param = compat_str(param)
66e289ba
S
2853 return [command_option, param] if param is not None else []
2854
2855
2856def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2857 param = params.get(param)
5b232f46
S
2858 if param is None:
2859 return []
66e289ba
S
2860 assert isinstance(param, bool)
2861 if separator:
2862 return [command_option + separator + (true_value if param else false_value)]
2863 return [command_option, true_value if param else false_value]
2864
2865
2866def cli_valueless_option(params, command_option, param, expected_value=True):
2867 param = params.get(param)
2868 return [command_option] if param == expected_value else []
2869
2870
2871def cli_configuration_args(params, param, default=[]):
2872 ex_args = params.get(param)
2873 if ex_args is None:
2874 return default
2875 assert isinstance(ex_args, list)
2876 return ex_args
2877
2878
39672624
YCH
2879class ISO639Utils(object):
2880 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2881 _lang_map = {
2882 'aa': 'aar',
2883 'ab': 'abk',
2884 'ae': 'ave',
2885 'af': 'afr',
2886 'ak': 'aka',
2887 'am': 'amh',
2888 'an': 'arg',
2889 'ar': 'ara',
2890 'as': 'asm',
2891 'av': 'ava',
2892 'ay': 'aym',
2893 'az': 'aze',
2894 'ba': 'bak',
2895 'be': 'bel',
2896 'bg': 'bul',
2897 'bh': 'bih',
2898 'bi': 'bis',
2899 'bm': 'bam',
2900 'bn': 'ben',
2901 'bo': 'bod',
2902 'br': 'bre',
2903 'bs': 'bos',
2904 'ca': 'cat',
2905 'ce': 'che',
2906 'ch': 'cha',
2907 'co': 'cos',
2908 'cr': 'cre',
2909 'cs': 'ces',
2910 'cu': 'chu',
2911 'cv': 'chv',
2912 'cy': 'cym',
2913 'da': 'dan',
2914 'de': 'deu',
2915 'dv': 'div',
2916 'dz': 'dzo',
2917 'ee': 'ewe',
2918 'el': 'ell',
2919 'en': 'eng',
2920 'eo': 'epo',
2921 'es': 'spa',
2922 'et': 'est',
2923 'eu': 'eus',
2924 'fa': 'fas',
2925 'ff': 'ful',
2926 'fi': 'fin',
2927 'fj': 'fij',
2928 'fo': 'fao',
2929 'fr': 'fra',
2930 'fy': 'fry',
2931 'ga': 'gle',
2932 'gd': 'gla',
2933 'gl': 'glg',
2934 'gn': 'grn',
2935 'gu': 'guj',
2936 'gv': 'glv',
2937 'ha': 'hau',
2938 'he': 'heb',
2939 'hi': 'hin',
2940 'ho': 'hmo',
2941 'hr': 'hrv',
2942 'ht': 'hat',
2943 'hu': 'hun',
2944 'hy': 'hye',
2945 'hz': 'her',
2946 'ia': 'ina',
2947 'id': 'ind',
2948 'ie': 'ile',
2949 'ig': 'ibo',
2950 'ii': 'iii',
2951 'ik': 'ipk',
2952 'io': 'ido',
2953 'is': 'isl',
2954 'it': 'ita',
2955 'iu': 'iku',
2956 'ja': 'jpn',
2957 'jv': 'jav',
2958 'ka': 'kat',
2959 'kg': 'kon',
2960 'ki': 'kik',
2961 'kj': 'kua',
2962 'kk': 'kaz',
2963 'kl': 'kal',
2964 'km': 'khm',
2965 'kn': 'kan',
2966 'ko': 'kor',
2967 'kr': 'kau',
2968 'ks': 'kas',
2969 'ku': 'kur',
2970 'kv': 'kom',
2971 'kw': 'cor',
2972 'ky': 'kir',
2973 'la': 'lat',
2974 'lb': 'ltz',
2975 'lg': 'lug',
2976 'li': 'lim',
2977 'ln': 'lin',
2978 'lo': 'lao',
2979 'lt': 'lit',
2980 'lu': 'lub',
2981 'lv': 'lav',
2982 'mg': 'mlg',
2983 'mh': 'mah',
2984 'mi': 'mri',
2985 'mk': 'mkd',
2986 'ml': 'mal',
2987 'mn': 'mon',
2988 'mr': 'mar',
2989 'ms': 'msa',
2990 'mt': 'mlt',
2991 'my': 'mya',
2992 'na': 'nau',
2993 'nb': 'nob',
2994 'nd': 'nde',
2995 'ne': 'nep',
2996 'ng': 'ndo',
2997 'nl': 'nld',
2998 'nn': 'nno',
2999 'no': 'nor',
3000 'nr': 'nbl',
3001 'nv': 'nav',
3002 'ny': 'nya',
3003 'oc': 'oci',
3004 'oj': 'oji',
3005 'om': 'orm',
3006 'or': 'ori',
3007 'os': 'oss',
3008 'pa': 'pan',
3009 'pi': 'pli',
3010 'pl': 'pol',
3011 'ps': 'pus',
3012 'pt': 'por',
3013 'qu': 'que',
3014 'rm': 'roh',
3015 'rn': 'run',
3016 'ro': 'ron',
3017 'ru': 'rus',
3018 'rw': 'kin',
3019 'sa': 'san',
3020 'sc': 'srd',
3021 'sd': 'snd',
3022 'se': 'sme',
3023 'sg': 'sag',
3024 'si': 'sin',
3025 'sk': 'slk',
3026 'sl': 'slv',
3027 'sm': 'smo',
3028 'sn': 'sna',
3029 'so': 'som',
3030 'sq': 'sqi',
3031 'sr': 'srp',
3032 'ss': 'ssw',
3033 'st': 'sot',
3034 'su': 'sun',
3035 'sv': 'swe',
3036 'sw': 'swa',
3037 'ta': 'tam',
3038 'te': 'tel',
3039 'tg': 'tgk',
3040 'th': 'tha',
3041 'ti': 'tir',
3042 'tk': 'tuk',
3043 'tl': 'tgl',
3044 'tn': 'tsn',
3045 'to': 'ton',
3046 'tr': 'tur',
3047 'ts': 'tso',
3048 'tt': 'tat',
3049 'tw': 'twi',
3050 'ty': 'tah',
3051 'ug': 'uig',
3052 'uk': 'ukr',
3053 'ur': 'urd',
3054 'uz': 'uzb',
3055 've': 'ven',
3056 'vi': 'vie',
3057 'vo': 'vol',
3058 'wa': 'wln',
3059 'wo': 'wol',
3060 'xh': 'xho',
3061 'yi': 'yid',
3062 'yo': 'yor',
3063 'za': 'zha',
3064 'zh': 'zho',
3065 'zu': 'zul',
3066 }
3067
3068 @classmethod
3069 def short2long(cls, code):
3070 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3071 return cls._lang_map.get(code[:2])
3072
3073 @classmethod
3074 def long2short(cls, code):
3075 """Convert language code from ISO 639-2/T to ISO 639-1"""
3076 for short_name, long_name in cls._lang_map.items():
3077 if long_name == code:
3078 return short_name
3079
3080
4eb10f66
YCH
3081class ISO3166Utils(object):
3082 # From http://data.okfn.org/data/core/country-list
3083 _country_map = {
3084 'AF': 'Afghanistan',
3085 'AX': 'Åland Islands',
3086 'AL': 'Albania',
3087 'DZ': 'Algeria',
3088 'AS': 'American Samoa',
3089 'AD': 'Andorra',
3090 'AO': 'Angola',
3091 'AI': 'Anguilla',
3092 'AQ': 'Antarctica',
3093 'AG': 'Antigua and Barbuda',
3094 'AR': 'Argentina',
3095 'AM': 'Armenia',
3096 'AW': 'Aruba',
3097 'AU': 'Australia',
3098 'AT': 'Austria',
3099 'AZ': 'Azerbaijan',
3100 'BS': 'Bahamas',
3101 'BH': 'Bahrain',
3102 'BD': 'Bangladesh',
3103 'BB': 'Barbados',
3104 'BY': 'Belarus',
3105 'BE': 'Belgium',
3106 'BZ': 'Belize',
3107 'BJ': 'Benin',
3108 'BM': 'Bermuda',
3109 'BT': 'Bhutan',
3110 'BO': 'Bolivia, Plurinational State of',
3111 'BQ': 'Bonaire, Sint Eustatius and Saba',
3112 'BA': 'Bosnia and Herzegovina',
3113 'BW': 'Botswana',
3114 'BV': 'Bouvet Island',
3115 'BR': 'Brazil',
3116 'IO': 'British Indian Ocean Territory',
3117 'BN': 'Brunei Darussalam',
3118 'BG': 'Bulgaria',
3119 'BF': 'Burkina Faso',
3120 'BI': 'Burundi',
3121 'KH': 'Cambodia',
3122 'CM': 'Cameroon',
3123 'CA': 'Canada',
3124 'CV': 'Cape Verde',
3125 'KY': 'Cayman Islands',
3126 'CF': 'Central African Republic',
3127 'TD': 'Chad',
3128 'CL': 'Chile',
3129 'CN': 'China',
3130 'CX': 'Christmas Island',
3131 'CC': 'Cocos (Keeling) Islands',
3132 'CO': 'Colombia',
3133 'KM': 'Comoros',
3134 'CG': 'Congo',
3135 'CD': 'Congo, the Democratic Republic of the',
3136 'CK': 'Cook Islands',
3137 'CR': 'Costa Rica',
3138 'CI': 'Côte d\'Ivoire',
3139 'HR': 'Croatia',
3140 'CU': 'Cuba',
3141 'CW': 'Curaçao',
3142 'CY': 'Cyprus',
3143 'CZ': 'Czech Republic',
3144 'DK': 'Denmark',
3145 'DJ': 'Djibouti',
3146 'DM': 'Dominica',
3147 'DO': 'Dominican Republic',
3148 'EC': 'Ecuador',
3149 'EG': 'Egypt',
3150 'SV': 'El Salvador',
3151 'GQ': 'Equatorial Guinea',
3152 'ER': 'Eritrea',
3153 'EE': 'Estonia',
3154 'ET': 'Ethiopia',
3155 'FK': 'Falkland Islands (Malvinas)',
3156 'FO': 'Faroe Islands',
3157 'FJ': 'Fiji',
3158 'FI': 'Finland',
3159 'FR': 'France',
3160 'GF': 'French Guiana',
3161 'PF': 'French Polynesia',
3162 'TF': 'French Southern Territories',
3163 'GA': 'Gabon',
3164 'GM': 'Gambia',
3165 'GE': 'Georgia',
3166 'DE': 'Germany',
3167 'GH': 'Ghana',
3168 'GI': 'Gibraltar',
3169 'GR': 'Greece',
3170 'GL': 'Greenland',
3171 'GD': 'Grenada',
3172 'GP': 'Guadeloupe',
3173 'GU': 'Guam',
3174 'GT': 'Guatemala',
3175 'GG': 'Guernsey',
3176 'GN': 'Guinea',
3177 'GW': 'Guinea-Bissau',
3178 'GY': 'Guyana',
3179 'HT': 'Haiti',
3180 'HM': 'Heard Island and McDonald Islands',
3181 'VA': 'Holy See (Vatican City State)',
3182 'HN': 'Honduras',
3183 'HK': 'Hong Kong',
3184 'HU': 'Hungary',
3185 'IS': 'Iceland',
3186 'IN': 'India',
3187 'ID': 'Indonesia',
3188 'IR': 'Iran, Islamic Republic of',
3189 'IQ': 'Iraq',
3190 'IE': 'Ireland',
3191 'IM': 'Isle of Man',
3192 'IL': 'Israel',
3193 'IT': 'Italy',
3194 'JM': 'Jamaica',
3195 'JP': 'Japan',
3196 'JE': 'Jersey',
3197 'JO': 'Jordan',
3198 'KZ': 'Kazakhstan',
3199 'KE': 'Kenya',
3200 'KI': 'Kiribati',
3201 'KP': 'Korea, Democratic People\'s Republic of',
3202 'KR': 'Korea, Republic of',
3203 'KW': 'Kuwait',
3204 'KG': 'Kyrgyzstan',
3205 'LA': 'Lao People\'s Democratic Republic',
3206 'LV': 'Latvia',
3207 'LB': 'Lebanon',
3208 'LS': 'Lesotho',
3209 'LR': 'Liberia',
3210 'LY': 'Libya',
3211 'LI': 'Liechtenstein',
3212 'LT': 'Lithuania',
3213 'LU': 'Luxembourg',
3214 'MO': 'Macao',
3215 'MK': 'Macedonia, the Former Yugoslav Republic of',
3216 'MG': 'Madagascar',
3217 'MW': 'Malawi',
3218 'MY': 'Malaysia',
3219 'MV': 'Maldives',
3220 'ML': 'Mali',
3221 'MT': 'Malta',
3222 'MH': 'Marshall Islands',
3223 'MQ': 'Martinique',
3224 'MR': 'Mauritania',
3225 'MU': 'Mauritius',
3226 'YT': 'Mayotte',
3227 'MX': 'Mexico',
3228 'FM': 'Micronesia, Federated States of',
3229 'MD': 'Moldova, Republic of',
3230 'MC': 'Monaco',
3231 'MN': 'Mongolia',
3232 'ME': 'Montenegro',
3233 'MS': 'Montserrat',
3234 'MA': 'Morocco',
3235 'MZ': 'Mozambique',
3236 'MM': 'Myanmar',
3237 'NA': 'Namibia',
3238 'NR': 'Nauru',
3239 'NP': 'Nepal',
3240 'NL': 'Netherlands',
3241 'NC': 'New Caledonia',
3242 'NZ': 'New Zealand',
3243 'NI': 'Nicaragua',
3244 'NE': 'Niger',
3245 'NG': 'Nigeria',
3246 'NU': 'Niue',
3247 'NF': 'Norfolk Island',
3248 'MP': 'Northern Mariana Islands',
3249 'NO': 'Norway',
3250 'OM': 'Oman',
3251 'PK': 'Pakistan',
3252 'PW': 'Palau',
3253 'PS': 'Palestine, State of',
3254 'PA': 'Panama',
3255 'PG': 'Papua New Guinea',
3256 'PY': 'Paraguay',
3257 'PE': 'Peru',
3258 'PH': 'Philippines',
3259 'PN': 'Pitcairn',
3260 'PL': 'Poland',
3261 'PT': 'Portugal',
3262 'PR': 'Puerto Rico',
3263 'QA': 'Qatar',
3264 'RE': 'Réunion',
3265 'RO': 'Romania',
3266 'RU': 'Russian Federation',
3267 'RW': 'Rwanda',
3268 'BL': 'Saint Barthélemy',
3269 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3270 'KN': 'Saint Kitts and Nevis',
3271 'LC': 'Saint Lucia',
3272 'MF': 'Saint Martin (French part)',
3273 'PM': 'Saint Pierre and Miquelon',
3274 'VC': 'Saint Vincent and the Grenadines',
3275 'WS': 'Samoa',
3276 'SM': 'San Marino',
3277 'ST': 'Sao Tome and Principe',
3278 'SA': 'Saudi Arabia',
3279 'SN': 'Senegal',
3280 'RS': 'Serbia',
3281 'SC': 'Seychelles',
3282 'SL': 'Sierra Leone',
3283 'SG': 'Singapore',
3284 'SX': 'Sint Maarten (Dutch part)',
3285 'SK': 'Slovakia',
3286 'SI': 'Slovenia',
3287 'SB': 'Solomon Islands',
3288 'SO': 'Somalia',
3289 'ZA': 'South Africa',
3290 'GS': 'South Georgia and the South Sandwich Islands',
3291 'SS': 'South Sudan',
3292 'ES': 'Spain',
3293 'LK': 'Sri Lanka',
3294 'SD': 'Sudan',
3295 'SR': 'Suriname',
3296 'SJ': 'Svalbard and Jan Mayen',
3297 'SZ': 'Swaziland',
3298 'SE': 'Sweden',
3299 'CH': 'Switzerland',
3300 'SY': 'Syrian Arab Republic',
3301 'TW': 'Taiwan, Province of China',
3302 'TJ': 'Tajikistan',
3303 'TZ': 'Tanzania, United Republic of',
3304 'TH': 'Thailand',
3305 'TL': 'Timor-Leste',
3306 'TG': 'Togo',
3307 'TK': 'Tokelau',
3308 'TO': 'Tonga',
3309 'TT': 'Trinidad and Tobago',
3310 'TN': 'Tunisia',
3311 'TR': 'Turkey',
3312 'TM': 'Turkmenistan',
3313 'TC': 'Turks and Caicos Islands',
3314 'TV': 'Tuvalu',
3315 'UG': 'Uganda',
3316 'UA': 'Ukraine',
3317 'AE': 'United Arab Emirates',
3318 'GB': 'United Kingdom',
3319 'US': 'United States',
3320 'UM': 'United States Minor Outlying Islands',
3321 'UY': 'Uruguay',
3322 'UZ': 'Uzbekistan',
3323 'VU': 'Vanuatu',
3324 'VE': 'Venezuela, Bolivarian Republic of',
3325 'VN': 'Viet Nam',
3326 'VG': 'Virgin Islands, British',
3327 'VI': 'Virgin Islands, U.S.',
3328 'WF': 'Wallis and Futuna',
3329 'EH': 'Western Sahara',
3330 'YE': 'Yemen',
3331 'ZM': 'Zambia',
3332 'ZW': 'Zimbabwe',
3333 }
3334
3335 @classmethod
3336 def short2full(cls, code):
3337 """Convert an ISO 3166-2 country code to the corresponding full name"""
3338 return cls._country_map.get(code.upper())
3339
3340
773f291d
S
3341class GeoUtils(object):
3342 # Major IPv4 address blocks per country
3343 _country_ip_map = {
3344 'AD': '85.94.160.0/19',
3345 'AE': '94.200.0.0/13',
3346 'AF': '149.54.0.0/17',
3347 'AG': '209.59.64.0/18',
3348 'AI': '204.14.248.0/21',
3349 'AL': '46.99.0.0/16',
3350 'AM': '46.70.0.0/15',
3351 'AO': '105.168.0.0/13',
3352 'AP': '159.117.192.0/21',
3353 'AR': '181.0.0.0/12',
3354 'AS': '202.70.112.0/20',
3355 'AT': '84.112.0.0/13',
3356 'AU': '1.128.0.0/11',
3357 'AW': '181.41.0.0/18',
3358 'AZ': '5.191.0.0/16',
3359 'BA': '31.176.128.0/17',
3360 'BB': '65.48.128.0/17',
3361 'BD': '114.130.0.0/16',
3362 'BE': '57.0.0.0/8',
3363 'BF': '129.45.128.0/17',
3364 'BG': '95.42.0.0/15',
3365 'BH': '37.131.0.0/17',
3366 'BI': '154.117.192.0/18',
3367 'BJ': '137.255.0.0/16',
3368 'BL': '192.131.134.0/24',
3369 'BM': '196.12.64.0/18',
3370 'BN': '156.31.0.0/16',
3371 'BO': '161.56.0.0/16',
3372 'BQ': '161.0.80.0/20',
3373 'BR': '152.240.0.0/12',
3374 'BS': '24.51.64.0/18',
3375 'BT': '119.2.96.0/19',
3376 'BW': '168.167.0.0/16',
3377 'BY': '178.120.0.0/13',
3378 'BZ': '179.42.192.0/18',
3379 'CA': '99.224.0.0/11',
3380 'CD': '41.243.0.0/16',
3381 'CF': '196.32.200.0/21',
3382 'CG': '197.214.128.0/17',
3383 'CH': '85.0.0.0/13',
3384 'CI': '154.232.0.0/14',
3385 'CK': '202.65.32.0/19',
3386 'CL': '152.172.0.0/14',
3387 'CM': '165.210.0.0/15',
3388 'CN': '36.128.0.0/10',
3389 'CO': '181.240.0.0/12',
3390 'CR': '201.192.0.0/12',
3391 'CU': '152.206.0.0/15',
3392 'CV': '165.90.96.0/19',
3393 'CW': '190.88.128.0/17',
3394 'CY': '46.198.0.0/15',
3395 'CZ': '88.100.0.0/14',
3396 'DE': '53.0.0.0/8',
3397 'DJ': '197.241.0.0/17',
3398 'DK': '87.48.0.0/12',
3399 'DM': '192.243.48.0/20',
3400 'DO': '152.166.0.0/15',
3401 'DZ': '41.96.0.0/12',
3402 'EC': '186.68.0.0/15',
3403 'EE': '90.190.0.0/15',
3404 'EG': '156.160.0.0/11',
3405 'ER': '196.200.96.0/20',
3406 'ES': '88.0.0.0/11',
3407 'ET': '196.188.0.0/14',
3408 'EU': '2.16.0.0/13',
3409 'FI': '91.152.0.0/13',
3410 'FJ': '144.120.0.0/16',
3411 'FM': '119.252.112.0/20',
3412 'FO': '88.85.32.0/19',
3413 'FR': '90.0.0.0/9',
3414 'GA': '41.158.0.0/15',
3415 'GB': '25.0.0.0/8',
3416 'GD': '74.122.88.0/21',
3417 'GE': '31.146.0.0/16',
3418 'GF': '161.22.64.0/18',
3419 'GG': '62.68.160.0/19',
3420 'GH': '45.208.0.0/14',
3421 'GI': '85.115.128.0/19',
3422 'GL': '88.83.0.0/19',
3423 'GM': '160.182.0.0/15',
3424 'GN': '197.149.192.0/18',
3425 'GP': '104.250.0.0/19',
3426 'GQ': '105.235.224.0/20',
3427 'GR': '94.64.0.0/13',
3428 'GT': '168.234.0.0/16',
3429 'GU': '168.123.0.0/16',
3430 'GW': '197.214.80.0/20',
3431 'GY': '181.41.64.0/18',
3432 'HK': '113.252.0.0/14',
3433 'HN': '181.210.0.0/16',
3434 'HR': '93.136.0.0/13',
3435 'HT': '148.102.128.0/17',
3436 'HU': '84.0.0.0/14',
3437 'ID': '39.192.0.0/10',
3438 'IE': '87.32.0.0/12',
3439 'IL': '79.176.0.0/13',
3440 'IM': '5.62.80.0/20',
3441 'IN': '117.192.0.0/10',
3442 'IO': '203.83.48.0/21',
3443 'IQ': '37.236.0.0/14',
3444 'IR': '2.176.0.0/12',
3445 'IS': '82.221.0.0/16',
3446 'IT': '79.0.0.0/10',
3447 'JE': '87.244.64.0/18',
3448 'JM': '72.27.0.0/17',
3449 'JO': '176.29.0.0/16',
3450 'JP': '126.0.0.0/8',
3451 'KE': '105.48.0.0/12',
3452 'KG': '158.181.128.0/17',
3453 'KH': '36.37.128.0/17',
3454 'KI': '103.25.140.0/22',
3455 'KM': '197.255.224.0/20',
3456 'KN': '198.32.32.0/19',
3457 'KP': '175.45.176.0/22',
3458 'KR': '175.192.0.0/10',
3459 'KW': '37.36.0.0/14',
3460 'KY': '64.96.0.0/15',
3461 'KZ': '2.72.0.0/13',
3462 'LA': '115.84.64.0/18',
3463 'LB': '178.135.0.0/16',
3464 'LC': '192.147.231.0/24',
3465 'LI': '82.117.0.0/19',
3466 'LK': '112.134.0.0/15',
3467 'LR': '41.86.0.0/19',
3468 'LS': '129.232.0.0/17',
3469 'LT': '78.56.0.0/13',
3470 'LU': '188.42.0.0/16',
3471 'LV': '46.109.0.0/16',
3472 'LY': '41.252.0.0/14',
3473 'MA': '105.128.0.0/11',
3474 'MC': '88.209.64.0/18',
3475 'MD': '37.246.0.0/16',
3476 'ME': '178.175.0.0/17',
3477 'MF': '74.112.232.0/21',
3478 'MG': '154.126.0.0/17',
3479 'MH': '117.103.88.0/21',
3480 'MK': '77.28.0.0/15',
3481 'ML': '154.118.128.0/18',
3482 'MM': '37.111.0.0/17',
3483 'MN': '49.0.128.0/17',
3484 'MO': '60.246.0.0/16',
3485 'MP': '202.88.64.0/20',
3486 'MQ': '109.203.224.0/19',
3487 'MR': '41.188.64.0/18',
3488 'MS': '208.90.112.0/22',
3489 'MT': '46.11.0.0/16',
3490 'MU': '105.16.0.0/12',
3491 'MV': '27.114.128.0/18',
3492 'MW': '105.234.0.0/16',
3493 'MX': '187.192.0.0/11',
3494 'MY': '175.136.0.0/13',
3495 'MZ': '197.218.0.0/15',
3496 'NA': '41.182.0.0/16',
3497 'NC': '101.101.0.0/18',
3498 'NE': '197.214.0.0/18',
3499 'NF': '203.17.240.0/22',
3500 'NG': '105.112.0.0/12',
3501 'NI': '186.76.0.0/15',
3502 'NL': '145.96.0.0/11',
3503 'NO': '84.208.0.0/13',
3504 'NP': '36.252.0.0/15',
3505 'NR': '203.98.224.0/19',
3506 'NU': '49.156.48.0/22',
3507 'NZ': '49.224.0.0/14',
3508 'OM': '5.36.0.0/15',
3509 'PA': '186.72.0.0/15',
3510 'PE': '186.160.0.0/14',
3511 'PF': '123.50.64.0/18',
3512 'PG': '124.240.192.0/19',
3513 'PH': '49.144.0.0/13',
3514 'PK': '39.32.0.0/11',
3515 'PL': '83.0.0.0/11',
3516 'PM': '70.36.0.0/20',
3517 'PR': '66.50.0.0/16',
3518 'PS': '188.161.0.0/16',
3519 'PT': '85.240.0.0/13',
3520 'PW': '202.124.224.0/20',
3521 'PY': '181.120.0.0/14',
3522 'QA': '37.210.0.0/15',
3523 'RE': '139.26.0.0/16',
3524 'RO': '79.112.0.0/13',
3525 'RS': '178.220.0.0/14',
3526 'RU': '5.136.0.0/13',
3527 'RW': '105.178.0.0/15',
3528 'SA': '188.48.0.0/13',
3529 'SB': '202.1.160.0/19',
3530 'SC': '154.192.0.0/11',
3531 'SD': '154.96.0.0/13',
3532 'SE': '78.64.0.0/12',
3533 'SG': '152.56.0.0/14',
3534 'SI': '188.196.0.0/14',
3535 'SK': '78.98.0.0/15',
3536 'SL': '197.215.0.0/17',
3537 'SM': '89.186.32.0/19',
3538 'SN': '41.82.0.0/15',
3539 'SO': '197.220.64.0/19',
3540 'SR': '186.179.128.0/17',
3541 'SS': '105.235.208.0/21',
3542 'ST': '197.159.160.0/19',
3543 'SV': '168.243.0.0/16',
3544 'SX': '190.102.0.0/20',
3545 'SY': '5.0.0.0/16',
3546 'SZ': '41.84.224.0/19',
3547 'TC': '65.255.48.0/20',
3548 'TD': '154.68.128.0/19',
3549 'TG': '196.168.0.0/14',
3550 'TH': '171.96.0.0/13',
3551 'TJ': '85.9.128.0/18',
3552 'TK': '27.96.24.0/21',
3553 'TL': '180.189.160.0/20',
3554 'TM': '95.85.96.0/19',
3555 'TN': '197.0.0.0/11',
3556 'TO': '175.176.144.0/21',
3557 'TR': '78.160.0.0/11',
3558 'TT': '186.44.0.0/15',
3559 'TV': '202.2.96.0/19',
3560 'TW': '120.96.0.0/11',
3561 'TZ': '156.156.0.0/14',
3562 'UA': '93.72.0.0/13',
3563 'UG': '154.224.0.0/13',
3564 'US': '3.0.0.0/8',
3565 'UY': '167.56.0.0/13',
3566 'UZ': '82.215.64.0/18',
3567 'VA': '212.77.0.0/19',
3568 'VC': '24.92.144.0/20',
3569 'VE': '186.88.0.0/13',
3570 'VG': '172.103.64.0/18',
3571 'VI': '146.226.0.0/16',
3572 'VN': '14.160.0.0/11',
3573 'VU': '202.80.32.0/20',
3574 'WF': '117.20.32.0/21',
3575 'WS': '202.4.32.0/19',
3576 'YE': '134.35.0.0/16',
3577 'YT': '41.242.116.0/22',
3578 'ZA': '41.0.0.0/11',
3579 'ZM': '165.56.0.0/13',
3580 'ZW': '41.85.192.0/19',
3581 }
3582
3583 @classmethod
5f95927a
S
3584 def random_ipv4(cls, code_or_block):
3585 if len(code_or_block) == 2:
3586 block = cls._country_ip_map.get(code_or_block.upper())
3587 if not block:
3588 return None
3589 else:
3590 block = code_or_block
773f291d
S
3591 addr, preflen = block.split('/')
3592 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
3593 addr_max = addr_min | (0xffffffff >> int(preflen))
18a0defa 3594 return compat_str(socket.inet_ntoa(
4248dad9 3595 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
3596
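# Hedged illustration (added; not part of the original module). Assuming the
# enclosing class is GeoUtils, as defined earlier in this file, random_ipv4()
# accepts either a two-letter country code or an explicit CIDR block:
_example_geo_ip = GeoUtils.random_ipv4('198.51.100.0/24')
assert _example_geo_ip.startswith('198.51.100.')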
3597
91410c9b 3598class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2461f79d
PH
3599 def __init__(self, proxies=None):
3600 # Set default handlers
3601 for type in ('http', 'https'):
3602 setattr(self, '%s_open' % type,
3603 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
3604 meth(r, proxy, type))
38e87f6c 3605 compat_urllib_request.ProxyHandler.__init__(self, proxies)
2461f79d 3606
91410c9b 3607 def proxy_open(self, req, proxy, type):
2461f79d 3608 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
3609 if req_proxy is not None:
3610 proxy = req_proxy
2461f79d
PH
3611 del req.headers['Ytdl-request-proxy']
3612
3613 if proxy == '__noproxy__':
3614 return None # No Proxy
51fb4995 3615 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188
YCH
3616 req.add_header('Ytdl-socks-proxy', proxy)
3617            # youtube-dl's http/https handlers take care of wrapping the socket with SOCKS
3618 return None
91410c9b
PH
3619 return compat_urllib_request.ProxyHandler.proxy_open(
3620 self, req, proxy, type)
5bc880b9
YCH
3621
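# Hedged usage sketch (added for illustration): a request selects its own proxy
# by carrying the Ytdl-request-proxy header, which proxy_open() pops off before
# dispatching; the special value '__noproxy__' disables proxying for that request.
_example_req = compat_urllib_request.Request('http://example.com')
_example_req.add_header('Ytdl-request-proxy', 'socks5://127.0.0.1:1080')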
3622
0a5445dd
YCH
3623# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3624# released into the public domain
3625# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3626
3627def long_to_bytes(n, blocksize=0):
3628 """long_to_bytes(n:long, blocksize:int) : string
3629 Convert a long integer to a byte string.
3630
3631 If optional blocksize is given and greater than zero, pad the front of the
3632 byte string with binary zeros so that the length is a multiple of
3633 blocksize.
3634 """
3635 # after much testing, this algorithm was deemed to be the fastest
3636 s = b''
3637 n = int(n)
3638 while n > 0:
3639 s = compat_struct_pack('>I', n & 0xffffffff) + s
3640 n = n >> 32
3641 # strip off leading zeros
3642 for i in range(len(s)):
3643 if s[i] != b'\000'[0]:
3644 break
3645 else:
3646 # only happens when n == 0
3647 s = b'\000'
3648 i = 0
3649 s = s[i:]
3650 # add back some pad bytes. this could be done more efficiently w.r.t. the
3651 # de-padding being done above, but sigh...
3652 if blocksize > 0 and len(s) % blocksize:
3653 s = (blocksize - len(s) % blocksize) * b'\000' + s
3654 return s
3655
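# Hedged sanity check (added for illustration): 65537 == 0x10001 fits in one
# 4-byte block and is front-padded with a zero byte.
assert long_to_bytes(65537, 4) == b'\x00\x01\x00\x01'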
3656
3657def bytes_to_long(s):
3658 """bytes_to_long(string) : long
3659 Convert a byte string to a long integer.
3660
3661 This is (essentially) the inverse of long_to_bytes().
3662 """
3663 acc = 0
3664 length = len(s)
3665 if length % 4:
3666 extra = (4 - length % 4)
3667 s = b'\000' * extra + s
3668 length = length + extra
3669 for i in range(0, length, 4):
3670 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
3671 return acc
3672
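# Hedged sanity checks (added for illustration): bytes_to_long() inverts
# long_to_bytes().
assert bytes_to_long(b'\x01\x00\x01') == 65537
assert bytes_to_long(long_to_bytes(123456789)) == 123456789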
3673
5bc880b9
YCH
3674def ohdave_rsa_encrypt(data, exponent, modulus):
3675 '''
3676 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
3677
3678 Input:
3679 data: data to encrypt, bytes-like object
3680        exponent, modulus: parameters e and N of the RSA algorithm, both integers
3681 Output: hex string of encrypted data
3682
3683 Limitation: supports one block encryption only
3684 '''
3685
3686 payload = int(binascii.hexlify(data[::-1]), 16)
3687 encrypted = pow(payload, exponent, modulus)
3688 return '%x' % encrypted
81bdc8fd
YCH
3689
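# Hedged toy example (added for illustration; the numbers are not a real key):
# with e=3 and N=33, the single byte 0x02 encrypts to pow(2, 3, 33) == 8, i.e. '8'.
assert ohdave_rsa_encrypt(b'\x02', 3, 33) == '8'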
3690
f48409c7
YCH
3691def pkcs1pad(data, length):
3692 """
3693    Pad input data with the PKCS#1 v1.5 scheme
3694
3695 @param {int[]} data input data
3696 @param {int} length target length
3697 @returns {int[]} padded data
3698 """
3699 if len(data) > length - 11:
3700 raise ValueError('Input data too long for PKCS#1 padding')
3701
3702    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # padding octets must be non-zero (RFC 8017)
3703 return [0, 2] + pseudo_random + [0] + data
3704
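# Hedged sanity check (added for illustration): padding 3 bytes of data to a
# 16-byte block yields 0x00 0x02, ten non-zero padding bytes, a 0x00 delimiter
# and then the data itself.
_example_padded = pkcs1pad([1, 2, 3], 16)
assert len(_example_padded) == 16
assert _example_padded[:2] == [0, 2] and _example_padded[-4:] == [0, 1, 2, 3]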
3705
5eb6bdce 3706def encode_base_n(num, n, table=None):
59f898b7 3707 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
59f898b7
YCH
3708 if not table:
3709 table = FULL_TABLE[:n]
3710
5eb6bdce
YCH
3711 if n > len(table):
3712 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
3713
3714 if num == 0:
3715 return table[0]
3716
81bdc8fd
YCH
3717 ret = ''
3718 while num:
3719 ret = table[num % n] + ret
3720 num = num // n
3721 return ret
f52354a8
YCH
3722
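# Hedged sanity checks (added for illustration): with the default table this is
# plain base-N positional notation over the digits 0-9a-zA-Z.
assert encode_base_n(0, 30) == '0'
assert encode_base_n(255, 16) == 'ff'
assert encode_base_n(61, 62) == 'Z'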
3723
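# decode_packed_codes() below reverses the common JavaScript
# "eval(function(p,a,c,k,e,d){...})" obfuscation (Dean Edwards' packer and its
# clones): every identifier in the packed source is a base-N index that is
# substituted back from the '|'-separated symbol table.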
3724def decode_packed_codes(code):
06b3fe29 3725 mobj = re.search(PACKED_CODES_RE, code)
f52354a8
YCH
3726    obfuscated_code, base, count, symbols = mobj.groups()
3727 base = int(base)
3728 count = int(count)
3729 symbols = symbols.split('|')
3730 symbol_table = {}
3731
3732 while count:
3733 count -= 1
5eb6bdce 3734 base_n_count = encode_base_n(count, base)
f52354a8
YCH
3735 symbol_table[base_n_count] = symbols[count] or base_n_count
3736
3737 return re.sub(
3738 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
3739        obfuscated_code)
e154c651 3740
3741
3742def parse_m3u8_attributes(attrib):
3743 info = {}
3744 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
3745 if val.startswith('"'):
3746 val = val[1:-1]
3747 info[key] = val
3748 return info
1143535d
YCH
3749
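# Hedged sanity check (added for illustration): quoted attribute values may
# contain commas and are returned without the surrounding quotes.
_example_m3u8 = parse_m3u8_attributes('BANDWIDTH=630000,CODECS="mp4a.40.2,avc1.4D401E"')
assert _example_m3u8 == {'BANDWIDTH': '630000', 'CODECS': 'mp4a.40.2,avc1.4D401E'}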
3750
3751def urshift(val, n):
3752 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
3753
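# Hedged sanity checks (added for illustration): urshift() mimics JavaScript's
# unsigned right shift (>>>) for 32-bit values, so -1 >>> 8 gives 0x00ffffff.
assert urshift(-1, 8) == 0x00ffffff
assert urshift(16, 2) == 4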
3754
3755# Based on png2str() written by @gdkchan and improved by @yokrysty
3756# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
3757def decode_png(png_data):
3758 # Reference: https://www.w3.org/TR/PNG/
3759 header = png_data[8:]
3760
3761 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
3762 raise IOError('Not a valid PNG file.')
3763
3764 int_map = {1: '>B', 2: '>H', 4: '>I'}
3765 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
3766
3767 chunks = []
3768
3769 while header:
3770 length = unpack_integer(header[:4])
3771 header = header[4:]
3772
3773 chunk_type = header[:4]
3774 header = header[4:]
3775
3776 chunk_data = header[:length]
3777 header = header[length:]
3778
3779 header = header[4:] # Skip CRC
3780
3781 chunks.append({
3782 'type': chunk_type,
3783 'length': length,
3784 'data': chunk_data
3785 })
3786
3787 ihdr = chunks[0]['data']
3788
3789 width = unpack_integer(ihdr[:4])
3790 height = unpack_integer(ihdr[4:8])
3791
3792 idat = b''
3793
3794 for chunk in chunks:
3795 if chunk['type'] == b'IDAT':
3796 idat += chunk['data']
3797
3798 if not idat:
3799 raise IOError('Unable to read PNG data.')
3800
3801 decompressed_data = bytearray(zlib.decompress(idat))
3802
3803 stride = width * 3
3804 pixels = []
3805
3806 def _get_pixel(idx):
3807 x = idx % stride
3808 y = idx // stride
3809 return pixels[y][x]
3810
3811 for y in range(height):
3812 basePos = y * (1 + stride)
3813 filter_type = decompressed_data[basePos]
3814
3815 current_row = []
3816
3817 pixels.append(current_row)
3818
3819 for x in range(stride):
3820 color = decompressed_data[1 + basePos + x]
3821 basex = y * stride + x
3822 left = 0
3823 up = 0
3824
3825 if x > 2:
3826 left = _get_pixel(basex - 3)
3827 if y > 0:
3828 up = _get_pixel(basex - stride)
3829
3830 if filter_type == 1: # Sub
3831 color = (color + left) & 0xff
3832 elif filter_type == 2: # Up
3833 color = (color + up) & 0xff
3834 elif filter_type == 3: # Average
3835 color = (color + ((left + up) >> 1)) & 0xff
3836 elif filter_type == 4: # Paeth
3837 a = left
3838 b = up
3839 c = 0
3840
3841 if x > 2 and y > 0:
3842 c = _get_pixel(basex - stride - 3)
3843
3844 p = a + b - c
3845
3846 pa = abs(p - a)
3847 pb = abs(p - b)
3848 pc = abs(p - c)
3849
3850 if pa <= pb and pa <= pc:
3851 color = (color + a) & 0xff
3852 elif pb <= pc:
3853 color = (color + b) & 0xff
3854 else:
3855 color = (color + c) & 0xff
3856
3857 current_row.append(color)
3858
3859 return width, height, pixels
efa97bdc
YCH
3860
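# Hedged sanity check (added for illustration): a synthetic 1x1 red RGB PNG.
# decode_png() skips CRC fields, so they are simply left zeroed here.
_example_ihdr = compat_struct_pack('>IIBBBBB', 1, 1, 8, 2, 0, 0, 0)
_example_idat = zlib.compress(b'\x00\xff\x00\x00')  # filter byte 0 + one RGB pixel
_example_png = (b'\x89PNG\x0d\x0a\x1a\x0a' +
                compat_struct_pack('>I', len(_example_ihdr)) + b'IHDR' + _example_ihdr + b'\x00' * 4 +
                compat_struct_pack('>I', len(_example_idat)) + b'IDAT' + _example_idat + b'\x00' * 4)
assert decode_png(_example_png) == (1, 1, [[0xff, 0x00, 0x00]])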
3861
3862def write_xattr(path, key, value):
3863 # This mess below finds the best xattr tool for the job
3864 try:
3865 # try the pyxattr module...
3866 import xattr
3867
53a7e3d2
YCH
3868 if hasattr(xattr, 'set'): # pyxattr
3869 # Unicode arguments are not supported in python-pyxattr until
3870 # version 0.5.0
3871 # See https://github.com/rg3/youtube-dl/issues/5498
3872 pyxattr_required_version = '0.5.0'
3873 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
3874 # TODO: fallback to CLI tools
3875 raise XAttrUnavailableError(
3876 'python-pyxattr is detected but is too old. '
3877 'youtube-dl requires %s or above while your version is %s. '
3878 'Falling back to other xattr implementations' % (
3879 pyxattr_required_version, xattr.__version__))
3880
3881 setxattr = xattr.set
3882 else: # xattr
3883 setxattr = xattr.setxattr
efa97bdc
YCH
3884
3885 try:
53a7e3d2 3886 setxattr(path, key, value)
efa97bdc
YCH
3887 except EnvironmentError as e:
3888 raise XAttrMetadataError(e.errno, e.strerror)
3889
3890 except ImportError:
3891 if compat_os_name == 'nt':
3892 # Write xattrs to NTFS Alternate Data Streams:
3893 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
3894 assert ':' not in key
3895 assert os.path.exists(path)
3896
3897 ads_fn = path + ':' + key
3898 try:
3899 with open(ads_fn, 'wb') as f:
3900 f.write(value)
3901 except EnvironmentError as e:
3902 raise XAttrMetadataError(e.errno, e.strerror)
3903 else:
3904 user_has_setfattr = check_executable('setfattr', ['--version'])
3905 user_has_xattr = check_executable('xattr', ['-h'])
3906
3907 if user_has_setfattr or user_has_xattr:
3908
3909 value = value.decode('utf-8')
3910 if user_has_setfattr:
3911 executable = 'setfattr'
3912 opts = ['-n', key, '-v', value]
3913 elif user_has_xattr:
3914 executable = 'xattr'
3915 opts = ['-w', key, value]
3916
3917 cmd = ([encodeFilename(executable, True)] +
3918 [encodeArgument(o) for o in opts] +
3919 [encodeFilename(path, True)])
3920
3921 try:
3922 p = subprocess.Popen(
3923 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
3924 except EnvironmentError as e:
3925 raise XAttrMetadataError(e.errno, e.strerror)
3926 stdout, stderr = p.communicate()
3927 stderr = stderr.decode('utf-8', 'replace')
3928 if p.returncode != 0:
3929 raise XAttrMetadataError(p.returncode, stderr)
3930
3931 else:
3932                # On Unix, but we could not find pyxattr, setfattr, or xattr.
3933 if sys.platform.startswith('linux'):
3934 raise XAttrUnavailableError(
3935 "Couldn't find a tool to set the xattrs. "
3936 "Install either the python 'pyxattr' or 'xattr' "
3937 "modules, or the GNU 'attr' package "
3938 "(which contains the 'setfattr' tool).")
3939 else:
3940 raise XAttrUnavailableError(
3941 "Couldn't find a tool to set the xattrs. "
3942 "Install either the python 'xattr' module, "
3943 "or the 'xattr' binary.")
0c265486
YCH
3944
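# Hedged usage sketch (illustration only, not executed): a call such as
# write_xattr('video.mp4', 'user.xdg.referrer.url', b'https://example.com/')
# would store the value as an extended attribute (or as an NTFS Alternate Data
# Stream on Windows), raising XAttrUnavailableError or XAttrMetadataError on failure.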
3945
3946def random_birthday(year_field, month_field, day_field):
3947 return {
3948 year_field: str(random.randint(1950, 1995)),
3949 month_field: str(random.randint(1, 12)),
3950 day_field: str(random.randint(1, 31)),
3951 }
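
# Hedged sanity check (added for illustration; the field names are arbitrary):
_example_birthday = random_birthday('birth_year', 'birth_month', 'birth_day')
assert 1950 <= int(_example_birthday['birth_year']) <= 1995
assert 1 <= int(_example_birthday['birth_day']) <= 31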