youtube_dl/utils.py (release 2018.08.28)
d77c3dfd 1#!/usr/bin/env python
dcdb292f 2# coding: utf-8
d77c3dfd 3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
1e399778 6import base64
5bc880b9 7import binascii
912b38b4 8import calendar
676eb3f2 9import codecs
62e609ab 10import contextlib
e3946f98 11import ctypes
c496ca96
PH
12import datetime
13import email.utils
0c265486 14import email.header
f45c185f 15import errno
be4a824d 16import functools
d77c3dfd 17import gzip
03f9daab 18import io
79a2e94e 19import itertools
f4bfd65f 20import json
d77c3dfd 21import locale
02dbf93f 22import math
347de493 23import operator
d77c3dfd 24import os
c496ca96 25import platform
773f291d 26import random
d77c3dfd 27import re
c496ca96 28import socket
79a2e94e 29import ssl
1c088fa8 30import subprocess
d77c3dfd 31import sys
181c8655 32import tempfile
01951dda 33import traceback
bcf89ce6 34import xml.etree.ElementTree
d77c3dfd 35import zlib
d77c3dfd 36
8c25f81b 37from .compat import (
b4a3d461 38 compat_HTMLParseError,
8bb56eee 39 compat_HTMLParser,
8f9312c3 40 compat_basestring,
8c25f81b 41 compat_chr,
d7cd9a9e 42 compat_ctypes_WINFUNCTYPE,
36e6f62c 43 compat_etree_fromstring,
51098426 44 compat_expanduser,
8c25f81b 45 compat_html_entities,
55b2f099 46 compat_html_entities_html5,
be4a824d 47 compat_http_client,
c86b6142 48 compat_kwargs,
efa97bdc 49 compat_os_name,
8c25f81b 50 compat_parse_qs,
702ccf2d 51 compat_shlex_quote,
be4a824d 52 compat_socket_create_connection,
8c25f81b 53 compat_str,
edaa23f8 54 compat_struct_pack,
d3f8e038 55 compat_struct_unpack,
8c25f81b
PH
56 compat_urllib_error,
57 compat_urllib_parse,
15707c7e 58 compat_urllib_parse_urlencode,
8c25f81b 59 compat_urllib_parse_urlparse,
7581bfc9 60 compat_urllib_parse_unquote_plus,
8c25f81b
PH
61 compat_urllib_request,
62 compat_urlparse,
810c10ba 63 compat_xpath,
8c25f81b 64)
4644ac55 65
71aff188
YCH
66from .socks import (
67 ProxyType,
68 sockssocket,
69)
70
4644ac55 71
51fb4995
YCH
72def register_socks_protocols():
73 # "Register" SOCKS protocols
d5ae6bb5
YCH
74 # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
75 # URLs with protocols not in urlparse.uses_netloc are not handled correctly
51fb4995
YCH
76 for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
77 if scheme not in compat_urlparse.uses_netloc:
78 compat_urlparse.uses_netloc.append(scheme)
79
80
468e2e92
FV
81# This is not clearly defined otherwise
82compiled_regex_type = type(re.compile(''))
83
3e669f36 84std_headers = {
60c08562 85 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0',
59ae15a5
PH
86 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
87 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
88 'Accept-Encoding': 'gzip, deflate',
89 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 90}
f427df17 91
5f6a1245 92
fb37eb25
S
93USER_AGENTS = {
94 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
95}
96
97
bf42a990
S
98NO_DEFAULT = object()
99
7105440c
YCH
100ENGLISH_MONTH_NAMES = [
101 'January', 'February', 'March', 'April', 'May', 'June',
102 'July', 'August', 'September', 'October', 'November', 'December']
103
f6717dec
S
104MONTH_NAMES = {
105 'en': ENGLISH_MONTH_NAMES,
106 'fr': [
3e4185c3
S
107 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
108 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
f6717dec 109}
a942d6cb 110
a7aaa398
S
111KNOWN_EXTENSIONS = (
112 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
113 'flv', 'f4v', 'f4a', 'f4b',
114 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
115 'mkv', 'mka', 'mk3d',
116 'avi', 'divx',
117 'mov',
118 'asf', 'wmv', 'wma',
119 '3gp', '3g2',
120 'mp3',
121 'flac',
122 'ape',
123 'wav',
124 'f4f', 'f4m', 'm3u8', 'smil')
125
c587cbb7 126# needed for sanitizing filenames in restricted mode
c8827027 127ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
128 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
129 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
c587cbb7 130
46f59e89
S
131DATE_FORMATS = (
132 '%d %B %Y',
133 '%d %b %Y',
134 '%B %d %Y',
cb655f34
S
135 '%B %dst %Y',
136 '%B %dnd %Y',
137 '%B %dth %Y',
46f59e89 138 '%b %d %Y',
cb655f34
S
139 '%b %dst %Y',
140 '%b %dnd %Y',
141 '%b %dth %Y',
46f59e89
S
142 '%b %dst %Y %I:%M',
143 '%b %dnd %Y %I:%M',
144 '%b %dth %Y %I:%M',
145 '%Y %m %d',
146 '%Y-%m-%d',
147 '%Y/%m/%d',
81c13222 148 '%Y/%m/%d %H:%M',
46f59e89 149 '%Y/%m/%d %H:%M:%S',
0c1c6f4b 150 '%Y-%m-%d %H:%M',
46f59e89
S
151 '%Y-%m-%d %H:%M:%S',
152 '%Y-%m-%d %H:%M:%S.%f',
153 '%d.%m.%Y %H:%M',
154 '%d.%m.%Y %H.%M',
155 '%Y-%m-%dT%H:%M:%SZ',
156 '%Y-%m-%dT%H:%M:%S.%fZ',
157 '%Y-%m-%dT%H:%M:%S.%f0Z',
158 '%Y-%m-%dT%H:%M:%S',
159 '%Y-%m-%dT%H:%M:%S.%f',
160 '%Y-%m-%dT%H:%M',
c6eed6b8
S
161 '%b %d %Y at %H:%M',
162 '%b %d %Y at %H:%M:%S',
b555ae9b
S
163 '%B %d %Y at %H:%M',
164 '%B %d %Y at %H:%M:%S',
46f59e89
S
165)
166
167DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
168DATE_FORMATS_DAY_FIRST.extend([
169 '%d-%m-%Y',
170 '%d.%m.%Y',
171 '%d.%m.%y',
172 '%d/%m/%Y',
173 '%d/%m/%y',
174 '%d/%m/%Y %H:%M:%S',
175])
176
177DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
178DATE_FORMATS_MONTH_FIRST.extend([
179 '%m-%d-%Y',
180 '%m.%d.%Y',
181 '%m/%d/%Y',
182 '%m/%d/%y',
183 '%m/%d/%Y %H:%M:%S',
184])
185
06b3fe29 186PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
0685d972 187JSON_LD_RE = r'(?is)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
06b3fe29 188
7105440c 189
d77c3dfd 190def preferredencoding():
59ae15a5 191 """Get preferred encoding.
d77c3dfd 192
59ae15a5
PH
193 Returns the best encoding scheme for the system, based on
194 locale.getpreferredencoding() and some further tweaks.
195 """
196 try:
197 pref = locale.getpreferredencoding()
28e614de 198 'TEST'.encode(pref)
70a1165b 199 except Exception:
59ae15a5 200 pref = 'UTF-8'
bae611f2 201
59ae15a5 202 return pref
d77c3dfd 203
f4bfd65f 204
181c8655 205def write_json_file(obj, fn):
1394646a 206 """ Encode obj as JSON and write it to fn, atomically if possible """
181c8655 207
92120217 208 fn = encodeFilename(fn)
61ee5aeb 209 if sys.version_info < (3, 0) and sys.platform != 'win32':
ec5f6016
JMF
210 encoding = get_filesystem_encoding()
211 # os.path.basename returns a bytes object, but NamedTemporaryFile
 212 # will fail if the filename contains non-ASCII characters unless we
213 # use a unicode object
214 path_basename = lambda f: os.path.basename(fn).decode(encoding)
215 # the same for os.path.dirname
216 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
217 else:
218 path_basename = os.path.basename
219 path_dirname = os.path.dirname
220
73159f99
S
221 args = {
222 'suffix': '.tmp',
ec5f6016
JMF
223 'prefix': path_basename(fn) + '.',
224 'dir': path_dirname(fn),
73159f99
S
225 'delete': False,
226 }
227
181c8655
PH
228 # In Python 2.x, json.dump expects a bytestream.
229 # In Python 3.x, it writes to a character stream
230 if sys.version_info < (3, 0):
73159f99 231 args['mode'] = 'wb'
181c8655 232 else:
73159f99
S
233 args.update({
234 'mode': 'w',
235 'encoding': 'utf-8',
236 })
237
c86b6142 238 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
181c8655
PH
239
240 try:
241 with tf:
242 json.dump(obj, tf)
1394646a
IK
243 if sys.platform == 'win32':
244 # Need to remove existing file on Windows, else os.rename raises
245 # WindowsError or FileExistsError.
246 try:
247 os.unlink(fn)
248 except OSError:
249 pass
181c8655 250 os.rename(tf.name, fn)
70a1165b 251 except Exception:
181c8655
PH
252 try:
253 os.remove(tf.name)
254 except OSError:
255 pass
256 raise
257
258
259if sys.version_info >= (2, 7):
ee114368 260 def find_xpath_attr(node, xpath, key, val=None):
59ae56fa 261 """ Find the xpath xpath[@key=val] """
5d2354f1 262 assert re.match(r'^[a-zA-Z_-]+$', key)
ee114368 263 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
59ae56fa
PH
264 return node.find(expr)
265else:
ee114368 266 def find_xpath_attr(node, xpath, key, val=None):
810c10ba 267 for f in node.findall(compat_xpath(xpath)):
ee114368
S
268 if key not in f.attrib:
269 continue
270 if val is None or f.attrib.get(key) == val:
59ae56fa
PH
271 return f
272 return None
273
d7e66d39
JMF
274# On Python 2.6 the xml.etree.ElementTree.Element methods don't support
275# the namespace parameter
5f6a1245
JW
276
277
d7e66d39
JMF
278def xpath_with_ns(path, ns_map):
279 components = [c.split(':') for c in path.split('/')]
280 replaced = []
281 for c in components:
282 if len(c) == 1:
283 replaced.append(c[0])
284 else:
285 ns, tag = c
286 replaced.append('{%s}%s' % (ns_map[ns], tag))
287 return '/'.join(replaced)
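# Example usage (illustrative; the expected value follows from the logic above):
#   xpath_with_ns('media:song/media:url', {'media': 'http://example.com/'})
#   == '{http://example.com/}song/{http://example.com/}url'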
288
d77c3dfd 289
a41fb80c 290def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
578c0745 291 def _find_xpath(xpath):
810c10ba 292 return node.find(compat_xpath(xpath))
578c0745
S
293
294 if isinstance(xpath, (str, compat_str)):
295 n = _find_xpath(xpath)
296 else:
297 for xp in xpath:
298 n = _find_xpath(xp)
299 if n is not None:
300 break
d74bebd5 301
8e636da4 302 if n is None:
bf42a990
S
303 if default is not NO_DEFAULT:
304 return default
305 elif fatal:
bf0ff932
PH
306 name = xpath if name is None else name
307 raise ExtractorError('Could not find XML element %s' % name)
308 else:
309 return None
a41fb80c
S
310 return n
311
312
313def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
8e636da4
S
314 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
315 if n is None or n == default:
316 return n
317 if n.text is None:
318 if default is not NO_DEFAULT:
319 return default
320 elif fatal:
321 name = xpath if name is None else name
322 raise ExtractorError('Could not find XML element\'s text %s' % name)
323 else:
324 return None
325 return n.text
a41fb80c
S
326
327
328def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
329 n = find_xpath_attr(node, xpath, key)
330 if n is None:
331 if default is not NO_DEFAULT:
332 return default
333 elif fatal:
334 name = '%s[@%s]' % (xpath, key) if name is None else name
335 raise ExtractorError('Could not find XML attribute %s' % name)
336 else:
337 return None
338 return n.attrib[key]
bf0ff932
PH
339
340
9e6dd238 341def get_element_by_id(id, html):
43e8fafd 342 """Return the content of the tag with the specified ID in the passed HTML document"""
611c1dd9 343 return get_element_by_attribute('id', id, html)
43e8fafd 344
12ea2f30 345
84c237fb 346def get_element_by_class(class_name, html):
2af12ad9
TC
347 """Return the content of the first tag with the specified class in the passed HTML document"""
348 retval = get_elements_by_class(class_name, html)
349 return retval[0] if retval else None
350
351
352def get_element_by_attribute(attribute, value, html, escape_value=True):
353 retval = get_elements_by_attribute(attribute, value, html, escape_value)
354 return retval[0] if retval else None
355
356
357def get_elements_by_class(class_name, html):
358 """Return the content of all tags with the specified class in the passed HTML document as a list"""
359 return get_elements_by_attribute(
84c237fb
YCH
360 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
361 html, escape_value=False)
362
363
2af12ad9 364def get_elements_by_attribute(attribute, value, html, escape_value=True):
43e8fafd 365 """Return the content of all tags with the specified attribute in the passed HTML document as a list"""
9e6dd238 366
84c237fb
YCH
367 value = re.escape(value) if escape_value else value
368
2af12ad9
TC
369 retlist = []
370 for m in re.finditer(r'''(?xs)
38285056 371 <([a-zA-Z0-9:._-]+)
609ff8ca 372 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
38285056 373 \s+%s=['"]?%s['"]?
609ff8ca 374 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
38285056
PH
375 \s*>
376 (?P<content>.*?)
377 </\1>
2af12ad9
TC
378 ''' % (re.escape(attribute), value), html):
379 res = m.group('content')
38285056 380
2af12ad9
TC
381 if res.startswith('"') or res.startswith("'"):
382 res = res[1:-1]
38285056 383
2af12ad9 384 retlist.append(unescapeHTML(res))
a921f407 385
2af12ad9 386 return retlist
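# Example usage (illustrative; expected values follow from the regex above):
#   get_element_by_class('foo', '<div class="foo bar">nice</div>') == 'nice'
#   get_element_by_attribute('data-id', '123', '<a data-id="123">x</a>') == 'x'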
a921f407 387
c5229f39 388
8bb56eee
BF
389class HTMLAttributeParser(compat_HTMLParser):
390 """Trivial HTML parser to gather the attributes for a single element"""
391 def __init__(self):
c5229f39 392 self.attrs = {}
8bb56eee
BF
393 compat_HTMLParser.__init__(self)
394
395 def handle_starttag(self, tag, attrs):
396 self.attrs = dict(attrs)
397
c5229f39 398
8bb56eee
BF
399def extract_attributes(html_element):
400 """Given a string for an HTML element such as
401 <el
402 a="foo" B="bar" c="&98;az" d=boz
403 empty= noval entity="&amp;"
404 sq='"' dq="'"
405 >
406 Decode and return a dictionary of attributes.
407 {
408 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
409 'empty': '', 'noval': None, 'entity': '&',
410 'sq': '"', 'dq': '\''
411 }.
412 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
413 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
414 """
415 parser = HTMLAttributeParser()
b4a3d461
S
416 try:
417 parser.feed(html_element)
418 parser.close()
419 # Older Python may throw HTMLParseError in case of malformed HTML
420 except compat_HTMLParseError:
421 pass
8bb56eee 422 return parser.attrs
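# Example usage (illustrative; attribute names are lower-cased by HTMLParser,
# as shown in the docstring above):
#   extract_attributes('<e x="y">') == {'x': 'y'}
#   extract_attributes('<el a="foo" B="bar" noval>') == {'a': 'foo', 'b': 'bar', 'noval': None}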
9e6dd238 423
c5229f39 424
9e6dd238 425def clean_html(html):
59ae15a5 426 """Clean an HTML snippet into a readable string"""
dd622d7c
PH
427
428 if html is None: # Convenience for sanitizing descriptions etc.
429 return html
430
59ae15a5
PH
431 # Newline vs <br />
432 html = html.replace('\n', ' ')
edd9221c
TF
433 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
434 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
59ae15a5
PH
435 # Strip html tags
436 html = re.sub('<.*?>', '', html)
437 # Replace html entities
438 html = unescapeHTML(html)
7decf895 439 return html.strip()
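# Example usage (illustrative):
#   clean_html('Some <b>bold</b> text<br/>next line')
#   == 'Some bold text\nnext line'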
9e6dd238
FV
440
441
d77c3dfd 442def sanitize_open(filename, open_mode):
59ae15a5
PH
443 """Try to open the given filename, and slightly tweak it if this fails.
444
445 Attempts to open the given filename. If this fails, it tries to change
446 the filename slightly, step by step, until it's either able to open it
447 or it fails and raises a final exception, like the standard open()
448 function.
449
450 It returns the tuple (stream, definitive_file_name).
451 """
452 try:
28e614de 453 if filename == '-':
59ae15a5
PH
454 if sys.platform == 'win32':
455 import msvcrt
456 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
898280a0 457 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
59ae15a5
PH
458 stream = open(encodeFilename(filename), open_mode)
459 return (stream, filename)
460 except (IOError, OSError) as err:
f45c185f
PH
461 if err.errno in (errno.EACCES,):
462 raise
59ae15a5 463
f45c185f 464 # In case of error, try to remove win32 forbidden chars
d55de57b 465 alt_filename = sanitize_path(filename)
f45c185f
PH
466 if alt_filename == filename:
467 raise
468 else:
469 # An exception here should be caught in the caller
d55de57b 470 stream = open(encodeFilename(alt_filename), open_mode)
f45c185f 471 return (stream, alt_filename)
d77c3dfd
FV
472
473
474def timeconvert(timestr):
59ae15a5
PH
475 """Convert RFC 2822 defined time string into system timestamp"""
476 timestamp = None
477 timetuple = email.utils.parsedate_tz(timestr)
478 if timetuple is not None:
479 timestamp = email.utils.mktime_tz(timetuple)
480 return timestamp
1c469a94 481
5f6a1245 482
796173d0 483def sanitize_filename(s, restricted=False, is_id=False):
59ae15a5
PH
484 """Sanitizes a string so it could be used as part of a filename.
485 If restricted is set, use a stricter subset of allowed characters.
158af524
S
486 Set is_id if this is not an arbitrary string, but an ID that should be kept
487 if possible.
59ae15a5
PH
488 """
489 def replace_insane(char):
c587cbb7
AT
490 if restricted and char in ACCENT_CHARS:
491 return ACCENT_CHARS[char]
59ae15a5
PH
492 if char == '?' or ord(char) < 32 or ord(char) == 127:
493 return ''
494 elif char == '"':
495 return '' if restricted else '\''
496 elif char == ':':
497 return '_-' if restricted else ' -'
498 elif char in '\\/|*<>':
499 return '_'
627dcfff 500 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
59ae15a5
PH
501 return '_'
502 if restricted and ord(char) > 127:
503 return '_'
504 return char
505
2aeb06d6
PH
506 # Handle timestamps
507 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
28e614de 508 result = ''.join(map(replace_insane, s))
796173d0
PH
509 if not is_id:
510 while '__' in result:
511 result = result.replace('__', '_')
512 result = result.strip('_')
513 # Common case of "Foreign band name - English song title"
514 if restricted and result.startswith('-_'):
515 result = result[2:]
5a42414b
PH
516 if result.startswith('-'):
517 result = '_' + result[len('-'):]
a7440261 518 result = result.lstrip('.')
796173d0
PH
519 if not result:
520 result = '_'
59ae15a5 521 return result
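# Example usage (illustrative; expected values follow from replace_insane above):
#   sanitize_filename('AC/DC: Back in Black') == 'AC_DC - Back in Black'
#   sanitize_filename('AC/DC: Back in Black', restricted=True) == 'AC_DC_-_Back_in_Black'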
d77c3dfd 522
5f6a1245 523
a2aaf4db
S
524def sanitize_path(s):
525 """Sanitizes and normalizes path on Windows"""
526 if sys.platform != 'win32':
527 return s
be531ef1
S
528 drive_or_unc, _ = os.path.splitdrive(s)
529 if sys.version_info < (2, 7) and not drive_or_unc:
530 drive_or_unc, _ = os.path.splitunc(s)
531 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
532 if drive_or_unc:
a2aaf4db
S
533 norm_path.pop(0)
534 sanitized_path = [
ec85ded8 535 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
a2aaf4db 536 for path_part in norm_path]
be531ef1
S
537 if drive_or_unc:
538 sanitized_path.insert(0, drive_or_unc + os.path.sep)
a2aaf4db
S
539 return os.path.join(*sanitized_path)
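# Example usage (illustrative; only effective on Windows, other platforms get
# the input back unchanged):
#   sanitize_path('abc|def') == 'abc#def'   # on win32
#   sanitize_path('abc|def') == 'abc|def'   # elsewhere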
540
541
17bcc626 542def sanitize_url(url):
befa4708
S
543 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
544 # the number of unwanted failures due to missing protocol
545 if url.startswith('//'):
546 return 'http:%s' % url
547 # Fix some common typos seen so far
548 COMMON_TYPOS = (
549 # https://github.com/rg3/youtube-dl/issues/15649
550 (r'^httpss://', r'https://'),
551 # https://bx1.be/lives/direct-tv/
552 (r'^rmtp([es]?)://', r'rtmp\1://'),
553 )
554 for mistake, fixup in COMMON_TYPOS:
555 if re.match(mistake, url):
556 return re.sub(mistake, fixup, url)
557 return url
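# Example usage (illustrative; example.com URLs are arbitrary sample values):
#   sanitize_url('//example.com/video') == 'http://example.com/video'
#   sanitize_url('httpss://example.com/video') == 'https://example.com/video'
#   sanitize_url('rmtps://host/stream') == 'rtmps://host/stream'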
17bcc626
S
558
559
67dda517 560def sanitized_Request(url, *args, **kwargs):
17bcc626 561 return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
67dda517
S
562
563
51098426
S
564def expand_path(s):
565 """Expand shell variables and ~"""
566 return os.path.expandvars(compat_expanduser(s))
567
568
d77c3dfd 569def orderedSet(iterable):
59ae15a5
PH
570 """ Remove all duplicates from the input iterable """
571 res = []
572 for el in iterable:
573 if el not in res:
574 res.append(el)
575 return res
d77c3dfd 576
912b38b4 577
55b2f099 578def _htmlentity_transform(entity_with_semicolon):
4e408e47 579 """Transforms an HTML entity to a character."""
55b2f099
YCH
580 entity = entity_with_semicolon[:-1]
581
4e408e47
PH
582 # Known non-numeric HTML entity
583 if entity in compat_html_entities.name2codepoint:
584 return compat_chr(compat_html_entities.name2codepoint[entity])
585
55b2f099
YCH
586 # TODO: HTML5 allows entities without a semicolon. For example,
587 # '&Eacuteric' should be decoded as 'Éric'.
588 if entity_with_semicolon in compat_html_entities_html5:
589 return compat_html_entities_html5[entity_with_semicolon]
590
91757b0f 591 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
4e408e47
PH
592 if mobj is not None:
593 numstr = mobj.group(1)
28e614de 594 if numstr.startswith('x'):
4e408e47 595 base = 16
28e614de 596 numstr = '0%s' % numstr
4e408e47
PH
597 else:
598 base = 10
7aefc49c
S
599 # See https://github.com/rg3/youtube-dl/issues/7518
600 try:
601 return compat_chr(int(numstr, base))
602 except ValueError:
603 pass
4e408e47
PH
604
605 # Unknown entity in name, return its literal representation
7a3f0c00 606 return '&%s;' % entity
4e408e47
PH
607
608
d77c3dfd 609def unescapeHTML(s):
912b38b4
PH
610 if s is None:
611 return None
612 assert type(s) == compat_str
d77c3dfd 613
4e408e47 614 return re.sub(
95f3f7c2 615 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
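# Example usage (illustrative):
#   unescapeHTML('&amp;&eacute;') == '&é'
#   unescapeHTML('&#47;') == '/' and unescapeHTML('&#x2F;') == '/'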
d77c3dfd 616
8bf48f23 617
aa49acd1
S
618def get_subprocess_encoding():
619 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
620 # For subprocess calls, encode with locale encoding
621 # Refer to http://stackoverflow.com/a/9951851/35070
622 encoding = preferredencoding()
623 else:
624 encoding = sys.getfilesystemencoding()
625 if encoding is None:
626 encoding = 'utf-8'
627 return encoding
628
629
8bf48f23 630def encodeFilename(s, for_subprocess=False):
59ae15a5
PH
631 """
632 @param s The name of the file
633 """
d77c3dfd 634
8bf48f23 635 assert type(s) == compat_str
d77c3dfd 636
59ae15a5
PH
637 # Python 3 has a Unicode API
638 if sys.version_info >= (3, 0):
639 return s
0f00efed 640
aa49acd1
S
641 # Pass '' directly to use Unicode APIs on Windows 2000 and up
642 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
643 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
644 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
645 return s
646
8ee239e9
YCH
 647 # Jython treats filenames as Unicode strings even though it reports itself as Python 2.x compatible
648 if sys.platform.startswith('java'):
649 return s
650
aa49acd1
S
651 return s.encode(get_subprocess_encoding(), 'ignore')
652
653
654def decodeFilename(b, for_subprocess=False):
655
656 if sys.version_info >= (3, 0):
657 return b
658
659 if not isinstance(b, bytes):
660 return b
661
662 return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 663
f07b74fc
PH
664
665def encodeArgument(s):
666 if not isinstance(s, compat_str):
667 # Legacy code that uses byte strings
668 # Uncomment the following line after fixing all post processors
7af808a5 669 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
f07b74fc
PH
670 s = s.decode('ascii')
671 return encodeFilename(s, True)
672
673
aa49acd1
S
674def decodeArgument(b):
675 return decodeFilename(b, True)
676
677
8271226a
PH
678def decodeOption(optval):
679 if optval is None:
680 return optval
681 if isinstance(optval, bytes):
682 optval = optval.decode(preferredencoding())
683
684 assert isinstance(optval, compat_str)
685 return optval
1c256f70 686
5f6a1245 687
4539dd30
PH
688def formatSeconds(secs):
689 if secs > 3600:
690 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
691 elif secs > 60:
692 return '%d:%02d' % (secs // 60, secs % 60)
693 else:
694 return '%d' % secs
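# Example usage (illustrative):
#   formatSeconds(3700) == '1:01:40'
#   formatSeconds(90) == '1:30'
#   formatSeconds(45) == '45'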
695
a0ddb8a2 696
be4a824d
PH
697def make_HTTPS_handler(params, **kwargs):
698 opts_no_check_certificate = params.get('nocheckcertificate', False)
0db261ba 699 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
be5f2c19 700 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
0db261ba 701 if opts_no_check_certificate:
be5f2c19 702 context.check_hostname = False
0db261ba 703 context.verify_mode = ssl.CERT_NONE
a2366922 704 try:
be4a824d 705 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
a2366922
PH
706 except TypeError:
707 # Python 2.7.8
708 # (create_default_context present but HTTPSHandler has no context=)
709 pass
710
711 if sys.version_info < (3, 2):
d7932313 712 return YoutubeDLHTTPSHandler(params, **kwargs)
aa37e3d4 713 else: # Python 3.2 - 3.3
d7932313 714 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
ea6d901e 715 context.verify_mode = (ssl.CERT_NONE
dca08720 716 if opts_no_check_certificate
ea6d901e 717 else ssl.CERT_REQUIRED)
303b479e 718 context.set_default_verify_paths()
be4a824d 719 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 720
732ea2f0 721
08f2a92c
JMF
722def bug_reports_message():
723 if ytdl_is_updateable():
724 update_cmd = 'type youtube-dl -U to update'
725 else:
726 update_cmd = 'see https://yt-dl.org/update on how to update'
727 msg = '; please report this issue on https://yt-dl.org/bug .'
728 msg += ' Make sure you are using the latest version; %s.' % update_cmd
729 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
730 return msg
731
732
bf5b9d85
PM
733class YoutubeDLError(Exception):
734 """Base exception for YoutubeDL errors."""
735 pass
736
737
738class ExtractorError(YoutubeDLError):
1c256f70 739 """Error during info extraction."""
5f6a1245 740
d11271dd 741 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
9a82b238
PH
742 """ tb, if given, is the original traceback (so that it can be printed out).
743 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
744 """
745
746 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
747 expected = True
d11271dd
PH
748 if video_id is not None:
749 msg = video_id + ': ' + msg
410f3e73 750 if cause:
28e614de 751 msg += ' (caused by %r)' % cause
9a82b238 752 if not expected:
08f2a92c 753 msg += bug_reports_message()
1c256f70 754 super(ExtractorError, self).__init__(msg)
d5979c5d 755
1c256f70 756 self.traceback = tb
8cc83b8d 757 self.exc_info = sys.exc_info() # preserve original exception
2eabb802 758 self.cause = cause
d11271dd 759 self.video_id = video_id
1c256f70 760
01951dda
PH
761 def format_traceback(self):
762 if self.traceback is None:
763 return None
28e614de 764 return ''.join(traceback.format_tb(self.traceback))
01951dda 765
1c256f70 766
416c7fcb
PH
767class UnsupportedError(ExtractorError):
768 def __init__(self, url):
769 super(UnsupportedError, self).__init__(
770 'Unsupported URL: %s' % url, expected=True)
771 self.url = url
772
773
55b3e45b
JMF
774class RegexNotFoundError(ExtractorError):
775 """Error when a regex didn't match"""
776 pass
777
778
773f291d
S
779class GeoRestrictedError(ExtractorError):
780 """Geographic restriction Error exception.
781
782 This exception may be thrown when a video is not available from your
783 geographic location due to geographic restrictions imposed by a website.
784 """
785 def __init__(self, msg, countries=None):
786 super(GeoRestrictedError, self).__init__(msg, expected=True)
787 self.msg = msg
788 self.countries = countries
789
790
bf5b9d85 791class DownloadError(YoutubeDLError):
59ae15a5 792 """Download Error exception.
d77c3dfd 793
59ae15a5
PH
794 This exception may be thrown by FileDownloader objects if they are not
795 configured to continue on errors. They will contain the appropriate
796 error message.
797 """
5f6a1245 798
8cc83b8d
FV
799 def __init__(self, msg, exc_info=None):
800 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
801 super(DownloadError, self).__init__(msg)
802 self.exc_info = exc_info
d77c3dfd
FV
803
804
bf5b9d85 805class SameFileError(YoutubeDLError):
59ae15a5 806 """Same File exception.
d77c3dfd 807
59ae15a5
PH
808 This exception will be thrown by FileDownloader objects if they detect
809 multiple files would have to be downloaded to the same file on disk.
810 """
811 pass
d77c3dfd
FV
812
813
bf5b9d85 814class PostProcessingError(YoutubeDLError):
59ae15a5 815 """Post Processing exception.
d77c3dfd 816
59ae15a5
PH
817 This exception may be raised by PostProcessor's .run() method to
818 indicate an error in the postprocessing task.
819 """
5f6a1245 820
7851b379 821 def __init__(self, msg):
bf5b9d85 822 super(PostProcessingError, self).__init__(msg)
7851b379 823 self.msg = msg
d77c3dfd 824
5f6a1245 825
bf5b9d85 826class MaxDownloadsReached(YoutubeDLError):
59ae15a5
PH
827 """ --max-downloads limit has been reached. """
828 pass
d77c3dfd
FV
829
830
bf5b9d85 831class UnavailableVideoError(YoutubeDLError):
59ae15a5 832 """Unavailable Format exception.
d77c3dfd 833
59ae15a5
PH
834 This exception will be thrown when a video is requested
835 in a format that is not available for that video.
836 """
837 pass
d77c3dfd
FV
838
839
bf5b9d85 840class ContentTooShortError(YoutubeDLError):
59ae15a5 841 """Content Too Short exception.
d77c3dfd 842
59ae15a5
PH
843 This exception may be raised by FileDownloader objects when a file they
844 download is too small for what the server announced first, indicating
845 the connection was probably interrupted.
846 """
d77c3dfd 847
59ae15a5 848 def __init__(self, downloaded, expected):
bf5b9d85
PM
849 super(ContentTooShortError, self).__init__(
850 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
851 )
2c7ed247 852 # Both in bytes
59ae15a5
PH
853 self.downloaded = downloaded
854 self.expected = expected
d77c3dfd 855
5f6a1245 856
bf5b9d85 857class XAttrMetadataError(YoutubeDLError):
efa97bdc
YCH
858 def __init__(self, code=None, msg='Unknown error'):
859 super(XAttrMetadataError, self).__init__(msg)
860 self.code = code
bd264412 861 self.msg = msg
efa97bdc
YCH
862
863 # Parsing code and msg
864 if (self.code in (errno.ENOSPC, errno.EDQUOT) or
 865 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
866 self.reason = 'NO_SPACE'
867 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
868 self.reason = 'VALUE_TOO_LONG'
869 else:
870 self.reason = 'NOT_SUPPORTED'
871
872
bf5b9d85 873class XAttrUnavailableError(YoutubeDLError):
efa97bdc
YCH
874 pass
875
876
c5a59d93 877def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
e5e78797
S
878 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
879 # expected HTTP responses to meet HTTP/1.0 or later (see also
880 # https://github.com/rg3/youtube-dl/issues/6727)
881 if sys.version_info < (3, 0):
65220c3b
S
882 kwargs['strict'] = True
883 hc = http_class(*args, **compat_kwargs(kwargs))
be4a824d
PH
884 source_address = ydl_handler._params.get('source_address')
885 if source_address is not None:
886 sa = (source_address, 0)
887 if hasattr(hc, 'source_address'): # Python 2.7+
888 hc.source_address = sa
889 else: # Python 2.6
890 def _hc_connect(self, *args, **kwargs):
891 sock = compat_socket_create_connection(
892 (self.host, self.port), self.timeout, sa)
893 if is_https:
d7932313
PH
894 self.sock = ssl.wrap_socket(
895 sock, self.key_file, self.cert_file,
896 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
897 else:
898 self.sock = sock
899 hc.connect = functools.partial(_hc_connect, hc)
900
901 return hc
902
903
87f0e62d 904def handle_youtubedl_headers(headers):
992fc9d6
YCH
905 filtered_headers = headers
906
907 if 'Youtubedl-no-compression' in filtered_headers:
908 filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
87f0e62d 909 del filtered_headers['Youtubedl-no-compression']
87f0e62d 910
992fc9d6 911 return filtered_headers
87f0e62d
YCH
912
913
acebc9cd 914class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
59ae15a5
PH
915 """Handler for HTTP requests and responses.
916
917 This class, when installed with an OpenerDirector, automatically adds
918 the standard headers to every HTTP request and handles gzipped and
919 deflated responses from web servers. If compression is to be avoided in
920 a particular request, the original request in the program code only has
0424ec30 921 to include the HTTP header "Youtubedl-no-compression", which will be
59ae15a5
PH
922 removed before making the real request.
923
924 Part of this code was copied from:
925
926 http://techknack.net/python-urllib2-handlers/
927
928 Andrew Rowls, the author of that code, agreed to release it to the
929 public domain.
930 """
931
be4a824d
PH
932 def __init__(self, params, *args, **kwargs):
933 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
934 self._params = params
935
936 def http_open(self, req):
71aff188
YCH
937 conn_class = compat_http_client.HTTPConnection
938
939 socks_proxy = req.headers.get('Ytdl-socks-proxy')
940 if socks_proxy:
941 conn_class = make_socks_conn_class(conn_class, socks_proxy)
942 del req.headers['Ytdl-socks-proxy']
943
be4a824d 944 return self.do_open(functools.partial(
71aff188 945 _create_http_connection, self, conn_class, False),
be4a824d
PH
946 req)
947
59ae15a5
PH
948 @staticmethod
949 def deflate(data):
950 try:
951 return zlib.decompress(data, -zlib.MAX_WBITS)
952 except zlib.error:
953 return zlib.decompress(data)
954
acebc9cd 955 def http_request(self, req):
51f267d9
S
 956 # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
 957 # always respected by websites: some tend to give out URLs with non-percent-encoded
958 # non-ASCII characters (see telemb.py, ard.py [#3412])
959 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
960 # To work around aforementioned issue we will replace request's original URL with
961 # percent-encoded one
962 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
963 # the code of this workaround has been moved here from YoutubeDL.urlopen()
964 url = req.get_full_url()
965 url_escaped = escape_url(url)
966
967 # Substitute URL if any change after escaping
968 if url != url_escaped:
15d260eb 969 req = update_Request(req, url=url_escaped)
51f267d9 970
33ac271b 971 for h, v in std_headers.items():
3d5f7a39
JK
972 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
 973 # urllib capitalizes the dict keys because of this bug
974 if h.capitalize() not in req.headers:
33ac271b 975 req.add_header(h, v)
87f0e62d
YCH
976
977 req.headers = handle_youtubedl_headers(req.headers)
989b4b2b
PH
978
979 if sys.version_info < (2, 7) and '#' in req.get_full_url():
980 # Python 2.6 is brain-dead when it comes to fragments
981 req._Request__original = req._Request__original.partition('#')[0]
982 req._Request__r_type = req._Request__r_type.partition('#')[0]
983
59ae15a5
PH
984 return req
985
acebc9cd 986 def http_response(self, req, resp):
59ae15a5
PH
987 old_resp = resp
988 # gzip
989 if resp.headers.get('Content-encoding', '') == 'gzip':
aa3e9507
PH
990 content = resp.read()
991 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
992 try:
993 uncompressed = io.BytesIO(gz.read())
994 except IOError as original_ioerror:
 995 # There may be junk at the end of the file
996 # See http://stackoverflow.com/q/4928560/35070 for details
997 for i in range(1, 1024):
998 try:
999 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1000 uncompressed = io.BytesIO(gz.read())
1001 except IOError:
1002 continue
1003 break
1004 else:
1005 raise original_ioerror
b407d853 1006 resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 1007 resp.msg = old_resp.msg
c047270c 1008 del resp.headers['Content-encoding']
59ae15a5
PH
1009 # deflate
1010 if resp.headers.get('Content-encoding', '') == 'deflate':
1011 gz = io.BytesIO(self.deflate(resp.read()))
b407d853 1012 resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 1013 resp.msg = old_resp.msg
c047270c 1014 del resp.headers['Content-encoding']
ad729172
S
1015 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1016 # https://github.com/rg3/youtube-dl/issues/6457).
5a4d9ddb
S
1017 if 300 <= resp.code < 400:
1018 location = resp.headers.get('Location')
1019 if location:
 1020 # Per RFC 2616, the default charset is iso-8859-1, which Python 3 respects
1021 if sys.version_info >= (3, 0):
1022 location = location.encode('iso-8859-1').decode('utf-8')
0ea59007
YCH
1023 else:
1024 location = location.decode('utf-8')
5a4d9ddb
S
1025 location_escaped = escape_url(location)
1026 if location != location_escaped:
1027 del resp.headers['Location']
9a4aec8b
YCH
1028 if sys.version_info < (3, 0):
1029 location_escaped = location_escaped.encode('utf-8')
5a4d9ddb 1030 resp.headers['Location'] = location_escaped
59ae15a5 1031 return resp
0f8d03f8 1032
acebc9cd
PH
1033 https_request = http_request
1034 https_response = http_response
bf50b038 1035
5de90176 1036
71aff188
YCH
1037def make_socks_conn_class(base_class, socks_proxy):
1038 assert issubclass(base_class, (
1039 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1040
1041 url_components = compat_urlparse.urlparse(socks_proxy)
1042 if url_components.scheme.lower() == 'socks5':
1043 socks_type = ProxyType.SOCKS5
1044 elif url_components.scheme.lower() in ('socks', 'socks4'):
1045 socks_type = ProxyType.SOCKS4
51fb4995
YCH
1046 elif url_components.scheme.lower() == 'socks4a':
1047 socks_type = ProxyType.SOCKS4A
71aff188 1048
cdd94c2e
YCH
1049 def unquote_if_non_empty(s):
1050 if not s:
1051 return s
1052 return compat_urllib_parse_unquote_plus(s)
1053
71aff188
YCH
1054 proxy_args = (
1055 socks_type,
1056 url_components.hostname, url_components.port or 1080,
1057 True, # Remote DNS
cdd94c2e
YCH
1058 unquote_if_non_empty(url_components.username),
1059 unquote_if_non_empty(url_components.password),
71aff188
YCH
1060 )
1061
1062 class SocksConnection(base_class):
1063 def connect(self):
1064 self.sock = sockssocket()
1065 self.sock.setproxy(*proxy_args)
1066 if type(self.timeout) in (int, float):
1067 self.sock.settimeout(self.timeout)
1068 self.sock.connect((self.host, self.port))
1069
1070 if isinstance(self, compat_http_client.HTTPSConnection):
1071 if hasattr(self, '_context'): # Python > 2.6
1072 self.sock = self._context.wrap_socket(
1073 self.sock, server_hostname=self.host)
1074 else:
1075 self.sock = ssl.wrap_socket(self.sock)
1076
1077 return SocksConnection
1078
1079
be4a824d
PH
1080class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1081 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1082 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1083 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1084 self._params = params
1085
1086 def https_open(self, req):
4f264c02 1087 kwargs = {}
71aff188
YCH
1088 conn_class = self._https_conn_class
1089
4f264c02
JMF
1090 if hasattr(self, '_context'): # python > 2.6
1091 kwargs['context'] = self._context
1092 if hasattr(self, '_check_hostname'): # python 3.x
1093 kwargs['check_hostname'] = self._check_hostname
71aff188
YCH
1094
1095 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1096 if socks_proxy:
1097 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1098 del req.headers['Ytdl-socks-proxy']
1099
be4a824d 1100 return self.do_open(functools.partial(
71aff188 1101 _create_http_connection, self, conn_class, True),
4f264c02 1102 req, **kwargs)
be4a824d
PH
1103
1104
a6420bf5
S
1105class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1106 def __init__(self, cookiejar=None):
1107 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1108
1109 def http_response(self, request, response):
 1110 # Python 2 will choke on the next HTTP request in a row if there are non-ASCII
1111 # characters in Set-Cookie HTTP header of last response (see
1112 # https://github.com/rg3/youtube-dl/issues/6769).
1113 # In order to at least prevent crashing we will percent encode Set-Cookie
1114 # header before HTTPCookieProcessor starts processing it.
e28034c5
S
1115 # if sys.version_info < (3, 0) and response.headers:
1116 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1117 # set_cookie = response.headers.get(set_cookie_header)
1118 # if set_cookie:
1119 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1120 # if set_cookie != set_cookie_escaped:
1121 # del response.headers[set_cookie_header]
1122 # response.headers[set_cookie_header] = set_cookie_escaped
a6420bf5
S
1123 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1124
1125 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1126 https_response = http_response
1127
1128
46f59e89
S
1129def extract_timezone(date_str):
1130 m = re.search(
1131 r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
1132 date_str)
1133 if not m:
1134 timezone = datetime.timedelta()
1135 else:
1136 date_str = date_str[:-len(m.group('tz'))]
1137 if not m.group('sign'):
1138 timezone = datetime.timedelta()
1139 else:
1140 sign = 1 if m.group('sign') == '+' else -1
1141 timezone = datetime.timedelta(
1142 hours=sign * int(m.group('hours')),
1143 minutes=sign * int(m.group('minutes')))
1144 return timezone, date_str
1145
1146
08b38d54 1147def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1148 """ Return a UNIX timestamp from the given date """
1149
1150 if date_str is None:
1151 return None
1152
52c3a6e4
S
1153 date_str = re.sub(r'\.[0-9]+', '', date_str)
1154
08b38d54 1155 if timezone is None:
46f59e89
S
1156 timezone, date_str = extract_timezone(date_str)
1157
52c3a6e4
S
1158 try:
1159 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1160 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1161 return calendar.timegm(dt.timetuple())
1162 except ValueError:
1163 pass
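# Example usage (illustrative; all three denote the same instant):
#   parse_iso8601('2014-03-23T23:04:26+0100') == 1395612266
#   parse_iso8601('2014-03-23T22:04:26Z') == 1395612266
#   parse_iso8601('2014-03-23T22:04:26') == 1395612266   # no timezone -> assumed UTC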
912b38b4
PH
1164
1165
46f59e89
S
1166def date_formats(day_first=True):
1167 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1168
1169
42bdd9d0 1170def unified_strdate(date_str, day_first=True):
bf50b038 1171 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1172
1173 if date_str is None:
1174 return None
bf50b038 1175 upload_date = None
5f6a1245 1176 # Replace commas
026fcc04 1177 date_str = date_str.replace(',', ' ')
42bdd9d0 1178 # Remove AM/PM + timezone
9bb8e0a3 1179 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1180 _, date_str = extract_timezone(date_str)
42bdd9d0 1181
46f59e89 1182 for expression in date_formats(day_first):
bf50b038
JMF
1183 try:
1184 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
5de90176 1185 except ValueError:
bf50b038 1186 pass
42393ce2
PH
1187 if upload_date is None:
1188 timetuple = email.utils.parsedate_tz(date_str)
1189 if timetuple:
c6b9cf05
S
1190 try:
1191 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1192 except ValueError:
1193 pass
6a750402
JMF
1194 if upload_date is not None:
1195 return compat_str(upload_date)
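# Example usage (illustrative; note that day_first=True by default):
#   unified_strdate('December 21, 2010') == '20101221'
#   unified_strdate('8/7/2009') == '20090708'
#   unified_strdate('8/7/2009', day_first=False) == '20090807'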
bf50b038 1196
5f6a1245 1197
46f59e89
S
1198def unified_timestamp(date_str, day_first=True):
1199 if date_str is None:
1200 return None
1201
2ae2ffda 1202 date_str = re.sub(r'[,|]', '', date_str)
46f59e89 1203
7dc2a74e 1204 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1205 timezone, date_str = extract_timezone(date_str)
1206
1207 # Remove AM/PM + timezone
1208 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1209
deef3195
S
 1210 # Remove unrecognized timezones from ISO 8601-like timestamps
1211 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1212 if m:
1213 date_str = date_str[:-len(m.group('tz'))]
1214
f226880c
PH
1215 # Python only supports microseconds, so remove nanoseconds
1216 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1217 if m:
1218 date_str = m.group(1)
1219
46f59e89
S
1220 for expression in date_formats(day_first):
1221 try:
7dc2a74e 1222 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89
S
1223 return calendar.timegm(dt.timetuple())
1224 except ValueError:
1225 pass
1226 timetuple = email.utils.parsedate_tz(date_str)
1227 if timetuple:
7dc2a74e 1228 return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1229
1230
28e614de 1231def determine_ext(url, default_ext='unknown_video'):
85750f89 1232 if url is None or '.' not in url:
f4776371 1233 return default_ext
9cb9a5df 1234 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1235 if re.match(r'^[A-Za-z0-9]+$', guess):
1236 return guess
a7aaa398
S
1237 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1238 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1239 return guess.rstrip('/')
73e79f2a 1240 else:
cbdbb766 1241 return default_ext
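# Example usage (illustrative):
#   determine_ext('http://example.com/video.mp4?dl=1') == 'mp4'
#   determine_ext('http://example.com/foo/bar.m3u8/?download') == 'm3u8'
#   determine_ext('no-extension-here') == 'unknown_video'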
73e79f2a 1242
5f6a1245 1243
d4051a8e 1244def subtitles_filename(filename, sub_lang, sub_format):
28e614de 1245 return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
d4051a8e 1246
5f6a1245 1247
bd558525 1248def date_from_str(date_str):
37254abc
JMF
1249 """
1250 Return a datetime object from a string in the format YYYYMMDD or
1251 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1252 today = datetime.date.today()
f8795e10 1253 if date_str in ('now', 'today'):
37254abc 1254 return today
f8795e10
PH
1255 if date_str == 'yesterday':
1256 return today - datetime.timedelta(days=1)
ec85ded8 1257 match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
37254abc
JMF
1258 if match is not None:
1259 sign = match.group('sign')
1260 time = int(match.group('time'))
1261 if sign == '-':
1262 time = -time
1263 unit = match.group('unit')
dfb1b146 1264 # A bad approximation?
37254abc
JMF
1265 if unit == 'month':
1266 unit = 'day'
1267 time *= 30
1268 elif unit == 'year':
1269 unit = 'day'
1270 time *= 365
1271 unit += 's'
1272 delta = datetime.timedelta(**{unit: time})
1273 return today + delta
611c1dd9 1274 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
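# Example usage (illustrative):
#   date_from_str('20180828') == datetime.date(2018, 8, 28)
#   date_from_str('now-1week')   # the date seven days before today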
5f6a1245
JW
1275
1276
e63fc1be 1277def hyphenate_date(date_str):
1278 """
1279 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1280 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1281 if match is not None:
1282 return '-'.join(match.groups())
1283 else:
1284 return date_str
1285
5f6a1245 1286
bd558525
JMF
1287class DateRange(object):
1288 """Represents a time interval between two dates"""
5f6a1245 1289
bd558525
JMF
1290 def __init__(self, start=None, end=None):
1291 """start and end must be strings in the format accepted by date"""
1292 if start is not None:
1293 self.start = date_from_str(start)
1294 else:
1295 self.start = datetime.datetime.min.date()
1296 if end is not None:
1297 self.end = date_from_str(end)
1298 else:
1299 self.end = datetime.datetime.max.date()
37254abc 1300 if self.start > self.end:
bd558525 1301 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
5f6a1245 1302
bd558525
JMF
1303 @classmethod
1304 def day(cls, day):
1305 """Returns a range that only contains the given day"""
5f6a1245
JW
1306 return cls(day, day)
1307
bd558525
JMF
1308 def __contains__(self, date):
1309 """Check if the date is in the range"""
37254abc
JMF
1310 if not isinstance(date, datetime.date):
1311 date = date_from_str(date)
1312 return self.start <= date <= self.end
5f6a1245 1313
bd558525 1314 def __str__(self):
5f6a1245 1315 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1316
1317
1318def platform_name():
1319 """ Returns the platform name as a compat_str """
1320 res = platform.platform()
1321 if isinstance(res, bytes):
1322 res = res.decode(preferredencoding())
1323
1324 assert isinstance(res, compat_str)
1325 return res
c257baff
PH
1326
1327
b58ddb32
PH
1328def _windows_write_string(s, out):
1329 """ Returns True if the string was written using special methods,
1330 False if it has yet to be written out."""
1331 # Adapted from http://stackoverflow.com/a/3259271/35070
1332
1333 import ctypes
1334 import ctypes.wintypes
1335
1336 WIN_OUTPUT_IDS = {
1337 1: -11,
1338 2: -12,
1339 }
1340
a383a98a
PH
1341 try:
1342 fileno = out.fileno()
1343 except AttributeError:
1344 # If the output stream doesn't have a fileno, it's virtual
1345 return False
aa42e873
PH
1346 except io.UnsupportedOperation:
1347 # Some strange Windows pseudo files?
1348 return False
b58ddb32
PH
1349 if fileno not in WIN_OUTPUT_IDS:
1350 return False
1351
d7cd9a9e 1352 GetStdHandle = compat_ctypes_WINFUNCTYPE(
b58ddb32 1353 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
d7cd9a9e 1354 ('GetStdHandle', ctypes.windll.kernel32))
b58ddb32
PH
1355 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1356
d7cd9a9e 1357 WriteConsoleW = compat_ctypes_WINFUNCTYPE(
b58ddb32
PH
1358 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1359 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
d7cd9a9e 1360 ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
b58ddb32
PH
1361 written = ctypes.wintypes.DWORD(0)
1362
d7cd9a9e 1363 GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
b58ddb32
PH
1364 FILE_TYPE_CHAR = 0x0002
1365 FILE_TYPE_REMOTE = 0x8000
d7cd9a9e 1366 GetConsoleMode = compat_ctypes_WINFUNCTYPE(
b58ddb32
PH
1367 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1368 ctypes.POINTER(ctypes.wintypes.DWORD))(
d7cd9a9e 1369 ('GetConsoleMode', ctypes.windll.kernel32))
b58ddb32
PH
1370 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1371
1372 def not_a_console(handle):
1373 if handle == INVALID_HANDLE_VALUE or handle is None:
1374 return True
8fb3ac36
PH
1375 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1376 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
b58ddb32
PH
1377
1378 if not_a_console(h):
1379 return False
1380
d1b9c912
PH
1381 def next_nonbmp_pos(s):
1382 try:
1383 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1384 except StopIteration:
1385 return len(s)
1386
1387 while s:
1388 count = min(next_nonbmp_pos(s), 1024)
1389
b58ddb32 1390 ret = WriteConsoleW(
d1b9c912 1391 h, s, count if count else 2, ctypes.byref(written), None)
b58ddb32
PH
1392 if ret == 0:
1393 raise OSError('Failed to write string')
d1b9c912
PH
1394 if not count: # We just wrote a non-BMP character
1395 assert written.value == 2
1396 s = s[1:]
1397 else:
1398 assert written.value > 0
1399 s = s[written.value:]
b58ddb32
PH
1400 return True
1401
1402
734f90bb 1403def write_string(s, out=None, encoding=None):
7459e3a2
PH
1404 if out is None:
1405 out = sys.stderr
8bf48f23 1406 assert type(s) == compat_str
7459e3a2 1407
b58ddb32
PH
1408 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1409 if _windows_write_string(s, out):
1410 return
1411
7459e3a2
PH
1412 if ('b' in getattr(out, 'mode', '') or
1413 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
104aa738
PH
1414 byt = s.encode(encoding or preferredencoding(), 'ignore')
1415 out.write(byt)
1416 elif hasattr(out, 'buffer'):
1417 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1418 byt = s.encode(enc, 'ignore')
1419 out.buffer.write(byt)
1420 else:
8bf48f23 1421 out.write(s)
7459e3a2
PH
1422 out.flush()
1423
1424
48ea9cea
PH
1425def bytes_to_intlist(bs):
1426 if not bs:
1427 return []
1428 if isinstance(bs[0], int): # Python 3
1429 return list(bs)
1430 else:
1431 return [ord(c) for c in bs]
1432
c257baff 1433
cba892fa 1434def intlist_to_bytes(xs):
1435 if not xs:
1436 return b''
edaa23f8 1437 return compat_struct_pack('%dB' % len(xs), *xs)
c38b1e77
PH
1438
1439
c1c9a79c
PH
1440# Cross-platform file locking
1441if sys.platform == 'win32':
1442 import ctypes.wintypes
1443 import msvcrt
1444
1445 class OVERLAPPED(ctypes.Structure):
1446 _fields_ = [
1447 ('Internal', ctypes.wintypes.LPVOID),
1448 ('InternalHigh', ctypes.wintypes.LPVOID),
1449 ('Offset', ctypes.wintypes.DWORD),
1450 ('OffsetHigh', ctypes.wintypes.DWORD),
1451 ('hEvent', ctypes.wintypes.HANDLE),
1452 ]
1453
1454 kernel32 = ctypes.windll.kernel32
1455 LockFileEx = kernel32.LockFileEx
1456 LockFileEx.argtypes = [
1457 ctypes.wintypes.HANDLE, # hFile
1458 ctypes.wintypes.DWORD, # dwFlags
1459 ctypes.wintypes.DWORD, # dwReserved
1460 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1461 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1462 ctypes.POINTER(OVERLAPPED) # Overlapped
1463 ]
1464 LockFileEx.restype = ctypes.wintypes.BOOL
1465 UnlockFileEx = kernel32.UnlockFileEx
1466 UnlockFileEx.argtypes = [
1467 ctypes.wintypes.HANDLE, # hFile
1468 ctypes.wintypes.DWORD, # dwReserved
1469 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1470 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1471 ctypes.POINTER(OVERLAPPED) # Overlapped
1472 ]
1473 UnlockFileEx.restype = ctypes.wintypes.BOOL
1474 whole_low = 0xffffffff
1475 whole_high = 0x7fffffff
1476
1477 def _lock_file(f, exclusive):
1478 overlapped = OVERLAPPED()
1479 overlapped.Offset = 0
1480 overlapped.OffsetHigh = 0
1481 overlapped.hEvent = 0
1482 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1483 handle = msvcrt.get_osfhandle(f.fileno())
1484 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1485 whole_low, whole_high, f._lock_file_overlapped_p):
1486 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1487
1488 def _unlock_file(f):
1489 assert f._lock_file_overlapped_p
1490 handle = msvcrt.get_osfhandle(f.fileno())
1491 if not UnlockFileEx(handle, 0,
1492 whole_low, whole_high, f._lock_file_overlapped_p):
1493 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1494
1495else:
399a76e6
YCH
 1496 # Some platforms, such as Jython, are missing fcntl
1497 try:
1498 import fcntl
c1c9a79c 1499
399a76e6
YCH
1500 def _lock_file(f, exclusive):
1501 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
c1c9a79c 1502
399a76e6
YCH
1503 def _unlock_file(f):
1504 fcntl.flock(f, fcntl.LOCK_UN)
1505 except ImportError:
1506 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1507
1508 def _lock_file(f, exclusive):
1509 raise IOError(UNSUPPORTED_MSG)
1510
1511 def _unlock_file(f):
1512 raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
1513
1514
1515class locked_file(object):
1516 def __init__(self, filename, mode, encoding=None):
1517 assert mode in ['r', 'a', 'w']
1518 self.f = io.open(filename, mode, encoding=encoding)
1519 self.mode = mode
1520
1521 def __enter__(self):
1522 exclusive = self.mode != 'r'
1523 try:
1524 _lock_file(self.f, exclusive)
1525 except IOError:
1526 self.f.close()
1527 raise
1528 return self
1529
1530 def __exit__(self, etype, value, traceback):
1531 try:
1532 _unlock_file(self.f)
1533 finally:
1534 self.f.close()
1535
1536 def __iter__(self):
1537 return iter(self.f)
1538
1539 def write(self, *args):
1540 return self.f.write(*args)
1541
1542 def read(self, *args):
1543 return self.f.read(*args)
4eb7f1d1
JMF
1544
1545
4644ac55
S
1546def get_filesystem_encoding():
1547 encoding = sys.getfilesystemencoding()
1548 return encoding if encoding is not None else 'utf-8'
1549
1550
4eb7f1d1 1551def shell_quote(args):
a6a173c2 1552 quoted_args = []
4644ac55 1553 encoding = get_filesystem_encoding()
a6a173c2
JMF
1554 for a in args:
1555 if isinstance(a, bytes):
1556 # We may get a filename encoded with 'encodeFilename'
1557 a = a.decode(encoding)
aefce8e6 1558 quoted_args.append(compat_shlex_quote(a))
28e614de 1559 return ' '.join(quoted_args)
9d4660ca
PH
1560
1561
1562def smuggle_url(url, data):
1563 """ Pass additional data in a URL for internal use. """
1564
81953d1a
RA
1565 url, idata = unsmuggle_url(url, {})
1566 data.update(idata)
15707c7e 1567 sdata = compat_urllib_parse_urlencode(
28e614de
PH
1568 {'__youtubedl_smuggle': json.dumps(data)})
1569 return url + '#' + sdata
9d4660ca
PH
1570
1571
79f82953 1572def unsmuggle_url(smug_url, default=None):
83e865a3 1573 if '#__youtubedl_smuggle' not in smug_url:
79f82953 1574 return smug_url, default
28e614de
PH
1575 url, _, sdata = smug_url.rpartition('#')
1576 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
1577 data = json.loads(jsond)
1578 return url, data
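# Example round trip (illustrative; the URL and payload dict are arbitrary sample
# values, and the smuggled data travels in the URL fragment):
#   url = smuggle_url('http://example.com/video', {'referer': 'http://example.com/'})
#   unsmuggle_url(url) == ('http://example.com/video', {'referer': 'http://example.com/'})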
02dbf93f
PH
1579
1580
02dbf93f
PH
1581def format_bytes(bytes):
1582 if bytes is None:
28e614de 1583 return 'N/A'
02dbf93f
PH
1584 if type(bytes) is str:
1585 bytes = float(bytes)
1586 if bytes == 0.0:
1587 exponent = 0
1588 else:
1589 exponent = int(math.log(bytes, 1024.0))
28e614de 1590 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
02dbf93f 1591 converted = float(bytes) / float(1024 ** exponent)
28e614de 1592 return '%.2f%s' % (converted, suffix)
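# Example usage (illustrative; binary, i.e. 1024-based, suffixes are used):
#   format_bytes(1024) == '1.00KiB'
#   format_bytes(5242880) == '5.00MiB'
#   format_bytes(None) == 'N/A'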
f53c966a 1593
1c088fa8 1594
fb47597b
S
1595def lookup_unit_table(unit_table, s):
1596 units_re = '|'.join(re.escape(u) for u in unit_table)
1597 m = re.match(
782b1b5b 1598 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
fb47597b
S
1599 if not m:
1600 return None
1601 num_str = m.group('num').replace(',', '.')
1602 mult = unit_table[m.group('unit')]
1603 return int(float(num_str) * mult)
1604
1605
be64b5b0
PH
1606def parse_filesize(s):
1607 if s is None:
1608 return None
1609
dfb1b146 1610 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
1611 # but we support those too
1612 _UNIT_TABLE = {
1613 'B': 1,
1614 'b': 1,
70852b47 1615 'bytes': 1,
be64b5b0
PH
1616 'KiB': 1024,
1617 'KB': 1000,
1618 'kB': 1024,
1619 'Kb': 1000,
13585d76 1620 'kb': 1000,
70852b47
YCH
1621 'kilobytes': 1000,
1622 'kibibytes': 1024,
be64b5b0
PH
1623 'MiB': 1024 ** 2,
1624 'MB': 1000 ** 2,
1625 'mB': 1024 ** 2,
1626 'Mb': 1000 ** 2,
13585d76 1627 'mb': 1000 ** 2,
70852b47
YCH
1628 'megabytes': 1000 ** 2,
1629 'mebibytes': 1024 ** 2,
be64b5b0
PH
1630 'GiB': 1024 ** 3,
1631 'GB': 1000 ** 3,
1632 'gB': 1024 ** 3,
1633 'Gb': 1000 ** 3,
13585d76 1634 'gb': 1000 ** 3,
70852b47
YCH
1635 'gigabytes': 1000 ** 3,
1636 'gibibytes': 1024 ** 3,
be64b5b0
PH
1637 'TiB': 1024 ** 4,
1638 'TB': 1000 ** 4,
1639 'tB': 1024 ** 4,
1640 'Tb': 1000 ** 4,
13585d76 1641 'tb': 1000 ** 4,
70852b47
YCH
1642 'terabytes': 1000 ** 4,
1643 'tebibytes': 1024 ** 4,
be64b5b0
PH
1644 'PiB': 1024 ** 5,
1645 'PB': 1000 ** 5,
1646 'pB': 1024 ** 5,
1647 'Pb': 1000 ** 5,
13585d76 1648 'pb': 1000 ** 5,
70852b47
YCH
1649 'petabytes': 1000 ** 5,
1650 'pebibytes': 1024 ** 5,
be64b5b0
PH
1651 'EiB': 1024 ** 6,
1652 'EB': 1000 ** 6,
1653 'eB': 1024 ** 6,
1654 'Eb': 1000 ** 6,
13585d76 1655 'eb': 1000 ** 6,
70852b47
YCH
1656 'exabytes': 1000 ** 6,
1657 'exbibytes': 1024 ** 6,
be64b5b0
PH
1658 'ZiB': 1024 ** 7,
1659 'ZB': 1000 ** 7,
1660 'zB': 1024 ** 7,
1661 'Zb': 1000 ** 7,
13585d76 1662 'zb': 1000 ** 7,
70852b47
YCH
1663 'zettabytes': 1000 ** 7,
1664 'zebibytes': 1024 ** 7,
be64b5b0
PH
1665 'YiB': 1024 ** 8,
1666 'YB': 1000 ** 8,
1667 'yB': 1024 ** 8,
1668 'Yb': 1000 ** 8,
13585d76 1669 'yb': 1000 ** 8,
70852b47
YCH
1670 'yottabytes': 1000 ** 8,
1671 'yobibytes': 1024 ** 8,
be64b5b0
PH
1672 }
1673
fb47597b
S
1674 return lookup_unit_table(_UNIT_TABLE, s)
1675
1676
1677def parse_count(s):
1678 if s is None:
be64b5b0
PH
1679 return None
1680
fb47597b
S
1681 s = s.strip()
1682
1683 if re.match(r'^[\d,.]+$', s):
1684 return str_to_int(s)
1685
1686 _UNIT_TABLE = {
1687 'k': 1000,
1688 'K': 1000,
1689 'm': 1000 ** 2,
1690 'M': 1000 ** 2,
1691 'kk': 1000 ** 2,
1692 'KK': 1000 ** 2,
1693 }
be64b5b0 1694
fb47597b 1695 return lookup_unit_table(_UNIT_TABLE, s)
be64b5b0 1696
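# Examples (illustrative): parse_filesize() understands both decimal and binary
# units (note the table's quirks, e.g. lower-case 'kB' is treated as 1024),
# while parse_count() handles abbreviated view/like counts.
#
#     parse_filesize('1.5 GiB') == 1610612736
#     parse_filesize('500 KB') == 500000
#     parse_count('1.2M') == 1200000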
2f7ae819 1697
b871d7e9
S
1698def parse_resolution(s):
1699 if s is None:
1700 return {}
1701
1702 mobj = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
1703 if mobj:
1704 return {
1705 'width': int(mobj.group('w')),
1706 'height': int(mobj.group('h')),
1707 }
1708
1709 mobj = re.search(r'\b(\d+)[pPiI]\b', s)
1710 if mobj:
1711 return {'height': int(mobj.group(1))}
1712
1713 mobj = re.search(r'\b([48])[kK]\b', s)
1714 if mobj:
1715 return {'height': int(mobj.group(1)) * 540}
1716
1717 return {}
1718
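# Examples (illustrative):
#
#     parse_resolution('1920x1080') == {'width': 1920, 'height': 1080}
#     parse_resolution('720p') == {'height': 720}
#     parse_resolution('4k') == {'height': 2160}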
1719
a942d6cb 1720def month_by_name(name, lang='en'):
caefb1de
PH
1721 """ Return the number of a month by (locale-independently) English name """
1722
f6717dec 1723 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 1724
caefb1de 1725 try:
f6717dec 1726 return month_names.index(name) + 1
7105440c
YCH
1727 except ValueError:
1728 return None
1729
1730
1731def month_by_abbreviation(abbrev):
1732 """ Return the number of a month by (locale-independently) English
1733 abbreviations """
1734
1735 try:
1736 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
1737 except ValueError:
1738 return None
18258362
JMF
1739
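# Examples (illustrative):
#
#     month_by_name('May') == 5
#     month_by_name('août', lang='fr') == 8
#     month_by_abbreviation('Sep') == 9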
1740
5aafe895 1741def fix_xml_ampersands(xml_str):
18258362 1742 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
1743 return re.sub(
1744 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 1745 '&amp;',
5aafe895 1746 xml_str)
e3946f98
PH
1747
1748
1749def setproctitle(title):
8bf48f23 1750 assert isinstance(title, compat_str)
c1c05c67
YCH
1751
1752 # ctypes in Jython is not complete
1753 # http://bugs.jython.org/issue2148
1754 if sys.platform.startswith('java'):
1755 return
1756
e3946f98 1757 try:
611c1dd9 1758 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
1759 except OSError:
1760 return
2f49bcd6
RC
1761 except TypeError:
1762 # LoadLibrary in Windows Python 2.7.13 only expects
1763 # a bytestring, but since unicode_literals turns
1764 # every string into a unicode string, it fails.
1765 return
6eefe533
PH
1766 title_bytes = title.encode('utf-8')
1767 buf = ctypes.create_string_buffer(len(title_bytes))
1768 buf.value = title_bytes
e3946f98 1769 try:
6eefe533 1770 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
1771 except AttributeError:
1772 return # Strange libc, just skip this
d7dda168
PH
1773
1774
1775def remove_start(s, start):
46bc9b7d 1776 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
1777
1778
2b9faf55 1779def remove_end(s, end):
46bc9b7d 1780 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
1781
1782
31b2051e
S
1783def remove_quotes(s):
1784 if s is None or len(s) < 2:
1785 return s
1786 for quote in ('"', "'", ):
1787 if s[0] == quote and s[-1] == quote:
1788 return s[1:-1]
1789 return s
1790
1791
29eb5174 1792def url_basename(url):
9b8aaeed 1793 path = compat_urlparse.urlparse(url).path
28e614de 1794 return path.strip('/').split('/')[-1]
aa94a6d3
PH
1795
1796
02dc0a36
S
1797def base_url(url):
1798 return re.match(r'https?://[^?#&]+/', url).group()
1799
1800
e34c3361 1801def urljoin(base, path):
4b5de77b
S
1802 if isinstance(path, bytes):
1803 path = path.decode('utf-8')
e34c3361
S
1804 if not isinstance(path, compat_str) or not path:
1805 return None
b0c65c67 1806 if re.match(r'^(?:https?:)?//', path):
e34c3361 1807 return path
4b5de77b
S
1808 if isinstance(base, bytes):
1809 base = base.decode('utf-8')
1810 if not isinstance(base, compat_str) or not re.match(
1811 r'^(?:https?:)?//', base):
e34c3361
S
1812 return None
1813 return compat_urlparse.urljoin(base, path)
1814
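# Examples (illustrative): url_basename() and base_url() are purely textual
# helpers, while urljoin() returns None for unusable inputs and keeps
# protocol-relative paths as they are.
#
#     url_basename('http://example.com/a/b.mp4?x=1') == 'b.mp4'
#     base_url('http://example.com/a/b.mp4?x=1') == 'http://example.com/a/'
#     urljoin('http://example.com/a/', 'b.mp4') == 'http://example.com/a/b.mp4'
#     urljoin('http://example.com/a/', '//cdn.example.com/b.mp4') == '//cdn.example.com/b.mp4'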
1815
aa94a6d3
PH
1816class HEADRequest(compat_urllib_request.Request):
1817 def get_method(self):
611c1dd9 1818 return 'HEAD'
7217e148
PH
1819
1820
95cf60e8
S
1821class PUTRequest(compat_urllib_request.Request):
1822 def get_method(self):
1823 return 'PUT'
1824
1825
9732d77e 1826def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
28746fbd
PH
1827 if get_attr:
1828 if v is not None:
1829 v = getattr(v, get_attr, None)
9572013d
PH
1830 if v == '':
1831 v = None
1812afb7
S
1832 if v is None:
1833 return default
1834 try:
1835 return int(v) * invscale // scale
1836 except ValueError:
af98f8ff 1837 return default
9732d77e 1838
9572013d 1839
40a90862
JMF
1840def str_or_none(v, default=None):
1841 return default if v is None else compat_str(v)
1842
9732d77e
PH
1843
1844def str_to_int(int_str):
48d4681e 1845 """ A more relaxed version of int_or_none """
9732d77e
PH
1846 if int_str is None:
1847 return None
28e614de 1848 int_str = re.sub(r'[,\.\+]', '', int_str)
9732d77e 1849 return int(int_str)
608d11f5
PH
1850
1851
9732d77e 1852def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
1853 if v is None:
1854 return default
1855 try:
1856 return float(v) * invscale / scale
1857 except ValueError:
1858 return default
43f775e4
PH
1859
1860
c7e327c4
S
1861def bool_or_none(v, default=None):
1862 return v if isinstance(v, bool) else default
1863
1864
b72b4431
S
1865def strip_or_none(v):
1866 return None if v is None else v.strip()
1867
1868
af03000a
S
1869def url_or_none(url):
1870 if not url or not isinstance(url, compat_str):
1871 return None
1872 url = url.strip()
1873 return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None
1874
1875
608d11f5 1876def parse_duration(s):
8f9312c3 1877 if not isinstance(s, compat_basestring):
608d11f5
PH
1878 return None
1879
ca7b3246
S
1880 s = s.strip()
1881
acaff495 1882 days, hours, mins, secs, ms = [None] * 5
15846398 1883 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
acaff495 1884 if m:
1885 days, hours, mins, secs, ms = m.groups()
1886 else:
1887 m = re.match(
056653bb
S
1888 r'''(?ix)(?:P?
1889 (?:
1890 [0-9]+\s*y(?:ears?)?\s*
1891 )?
1892 (?:
1893 [0-9]+\s*m(?:onths?)?\s*
1894 )?
1895 (?:
1896 [0-9]+\s*w(?:eeks?)?\s*
1897 )?
8f4b58d7 1898 (?:
acaff495 1899 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
8f4b58d7 1900 )?
056653bb 1901 T)?
acaff495 1902 (?:
1903 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1904 )?
1905 (?:
1906 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1907 )?
1908 (?:
1909 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 1910 )?Z?$''', s)
acaff495 1911 if m:
1912 days, hours, mins, secs, ms = m.groups()
1913 else:
15846398 1914 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 1915 if m:
1916 hours, mins = m.groups()
1917 else:
1918 return None
1919
1920 duration = 0
1921 if secs:
1922 duration += float(secs)
1923 if mins:
1924 duration += float(mins) * 60
1925 if hours:
1926 duration += float(hours) * 60 * 60
1927 if days:
1928 duration += float(days) * 24 * 60 * 60
1929 if ms:
1930 duration += float(ms)
1931 return duration
91d7d0b3
JMF
1932
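# Examples (illustrative): colon-separated, ISO-8601-like and verbose forms are
# all accepted and converted to seconds (as a float).
#
#     parse_duration('1:02:03') == 3723.0
#     parse_duration('PT1H2M3S') == 3723.0
#     parse_duration('2.5 minutes') == 150.0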
1933
e65e4c88 1934def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 1935 name, real_ext = os.path.splitext(filename)
e65e4c88
S
1936 return (
1937 '{0}.{1}{2}'.format(name, ext, real_ext)
1938 if not expected_real_ext or real_ext[1:] == expected_real_ext
1939 else '{0}.{1}'.format(filename, ext))
d70ad093
PH
1940
1941
b3ed15b7
S
1942def replace_extension(filename, ext, expected_real_ext=None):
1943 name, real_ext = os.path.splitext(filename)
1944 return '{0}.{1}'.format(
1945 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1946 ext)
1947
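# Examples (illustrative):
#
#     prepend_extension('video.mp4', 'temp') == 'video.temp.mp4'
#     replace_extension('video.mp4', 'mkv') == 'video.mkv'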
1948
d70ad093
PH
1949def check_executable(exe, args=[]):
1950 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1951 args can be a list of arguments for a short output (like -version) """
1952 try:
1953 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1954 except OSError:
1955 return False
1956 return exe
b7ab0590
PH
1957
1958
95807118 1959def get_exe_version(exe, args=['--version'],
cae97f65 1960 version_re=None, unrecognized='present'):
95807118
PH
1961 """ Returns the version of the specified executable,
1962 or False if the executable is not present """
1963 try:
b64d04c1
YCH
1964 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1965 # SIGTTOU if youtube-dl is run in the background.
1966 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
cae97f65 1967 out, _ = subprocess.Popen(
54116803 1968 [encodeArgument(exe)] + args,
00ca7552 1969 stdin=subprocess.PIPE,
95807118
PH
1970 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1971 except OSError:
1972 return False
cae97f65
PH
1973 if isinstance(out, bytes): # Python 2.x
1974 out = out.decode('ascii', 'ignore')
1975 return detect_exe_version(out, version_re, unrecognized)
1976
1977
1978def detect_exe_version(output, version_re=None, unrecognized='present'):
1979 assert isinstance(output, compat_str)
1980 if version_re is None:
1981 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1982 m = re.search(version_re, output)
95807118
PH
1983 if m:
1984 return m.group(1)
1985 else:
1986 return unrecognized
1987
1988
b7ab0590 1989class PagedList(object):
dd26ced1
PH
1990 def __len__(self):
1991 # This is only useful for tests
1992 return len(self.getslice())
1993
9c44d242
PH
1994
1995class OnDemandPagedList(PagedList):
6be08ce6 1996 def __init__(self, pagefunc, pagesize, use_cache=True):
9c44d242
PH
1997 self._pagefunc = pagefunc
1998 self._pagesize = pagesize
b95dc034
YCH
1999 self._use_cache = use_cache
2000 if use_cache:
2001 self._cache = {}
9c44d242 2002
b7ab0590
PH
2003 def getslice(self, start=0, end=None):
2004 res = []
2005 for pagenum in itertools.count(start // self._pagesize):
2006 firstid = pagenum * self._pagesize
2007 nextfirstid = pagenum * self._pagesize + self._pagesize
2008 if start >= nextfirstid:
2009 continue
2010
b95dc034
YCH
2011 page_results = None
2012 if self._use_cache:
2013 page_results = self._cache.get(pagenum)
2014 if page_results is None:
2015 page_results = list(self._pagefunc(pagenum))
2016 if self._use_cache:
2017 self._cache[pagenum] = page_results
b7ab0590
PH
2018
2019 startv = (
2020 start % self._pagesize
2021 if firstid <= start < nextfirstid
2022 else 0)
2023
2024 endv = (
2025 ((end - 1) % self._pagesize) + 1
2026 if (end is not None and firstid <= end <= nextfirstid)
2027 else None)
2028
2029 if startv != 0 or endv is not None:
2030 page_results = page_results[startv:endv]
2031 res.extend(page_results)
2032
2033 # A little optimization - if the current page is not "full", i.e. does
2034 # not contain page_size videos, then we can assume that this page
2035 # is the last one - there are no more ids on further pages -
2036 # so there is no need to query again.
2037 if len(page_results) + startv < self._pagesize:
2038 break
2039
2040 # If we got the whole page, but the next page is not interesting,
2041 # break out early as well
2042 if end == nextfirstid:
2043 break
2044 return res
81c2f20b
PH
2045
2046
9c44d242
PH
2047class InAdvancePagedList(PagedList):
2048 def __init__(self, pagefunc, pagecount, pagesize):
2049 self._pagefunc = pagefunc
2050 self._pagecount = pagecount
2051 self._pagesize = pagesize
2052
2053 def getslice(self, start=0, end=None):
2054 res = []
2055 start_page = start // self._pagesize
2056 end_page = (
2057 self._pagecount if end is None else (end // self._pagesize + 1))
2058 skip_elems = start - start_page * self._pagesize
2059 only_more = None if end is None else end - start
2060 for pagenum in range(start_page, end_page):
2061 page = list(self._pagefunc(pagenum))
2062 if skip_elems:
2063 page = page[skip_elems:]
2064 skip_elems = None
2065 if only_more is not None:
2066 if len(page) < only_more:
2067 only_more -= len(page)
2068 else:
2069 page = page[:only_more]
2070 res.extend(page)
2071 break
2072 res.extend(page)
2073 return res
2074
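# Minimal sketch (illustrative, not part of the original module): page_func is
# a hypothetical callable returning the items of page n; only the pages needed
# for the requested slice are fetched (and cached by default).
#
#     page_func = lambda n: list(range(n * 10, (n + 1) * 10))
#     pl = OnDemandPagedList(page_func, 10)
#     pl.getslice(5, 25) == list(range(5, 25))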
2075
81c2f20b 2076def uppercase_escape(s):
676eb3f2 2077 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2078 return re.sub(
a612753d 2079 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2080 lambda m: unicode_escape(m.group(0))[0],
2081 s)
0fe2ff78
YCH
2082
2083
2084def lowercase_escape(s):
2085 unicode_escape = codecs.getdecoder('unicode_escape')
2086 return re.sub(
2087 r'\\u[0-9a-fA-F]{4}',
2088 lambda m: unicode_escape(m.group(0))[0],
2089 s)
b53466e1 2090
d05cfe06
S
2091
2092def escape_rfc3986(s):
2093 """Escape non-ASCII characters as suggested by RFC 3986"""
8f9312c3 2094 if sys.version_info < (3, 0) and isinstance(s, compat_str):
d05cfe06 2095 s = s.encode('utf-8')
ecc0c5ee 2096 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2097
2098
2099def escape_url(url):
2100 """Escape URL as suggested by RFC 3986"""
2101 url_parsed = compat_urllib_parse_urlparse(url)
2102 return url_parsed._replace(
efbed08d 2103 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
2104 path=escape_rfc3986(url_parsed.path),
2105 params=escape_rfc3986(url_parsed.params),
2106 query=escape_rfc3986(url_parsed.query),
2107 fragment=escape_rfc3986(url_parsed.fragment)
2108 ).geturl()
2109
62e609ab
PH
2110
2111def read_batch_urls(batch_fd):
2112 def fixup(url):
2113 if not isinstance(url, compat_str):
2114 url = url.decode('utf-8', 'replace')
28e614de 2115 BOM_UTF8 = '\xef\xbb\xbf'
62e609ab
PH
2116 if url.startswith(BOM_UTF8):
2117 url = url[len(BOM_UTF8):]
2118 url = url.strip()
2119 if url.startswith(('#', ';', ']')):
2120 return False
2121 return url
2122
2123 with contextlib.closing(batch_fd) as fd:
2124 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2125
2126
2127def urlencode_postdata(*args, **kargs):
15707c7e 2128 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2129
2130
38f9ef31 2131def update_url_query(url, query):
cacd9966
YCH
2132 if not query:
2133 return url
38f9ef31 2134 parsed_url = compat_urlparse.urlparse(url)
2135 qs = compat_parse_qs(parsed_url.query)
2136 qs.update(query)
2137 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 2138 query=compat_urllib_parse_urlencode(qs, True)))
16392824 2139
8e60dc75 2140
ed0291d1
S
2141def update_Request(req, url=None, data=None, headers={}, query={}):
2142 req_headers = req.headers.copy()
2143 req_headers.update(headers)
2144 req_data = data or req.data
2145 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
2146 req_get_method = req.get_method()
2147 if req_get_method == 'HEAD':
2148 req_type = HEADRequest
2149 elif req_get_method == 'PUT':
2150 req_type = PUTRequest
2151 else:
2152 req_type = compat_urllib_request.Request
ed0291d1
S
2153 new_req = req_type(
2154 req_url, data=req_data, headers=req_headers,
2155 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2156 if hasattr(req, 'timeout'):
2157 new_req.timeout = req.timeout
2158 return new_req
2159
2160
10c87c15 2161def _multipart_encode_impl(data, boundary):
0c265486
YCH
2162 content_type = 'multipart/form-data; boundary=%s' % boundary
2163
2164 out = b''
2165 for k, v in data.items():
2166 out += b'--' + boundary.encode('ascii') + b'\r\n'
2167 if isinstance(k, compat_str):
2168 k = k.encode('utf-8')
2169 if isinstance(v, compat_str):
2170 v = v.encode('utf-8')
2171 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2172 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 2173 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
2174 if boundary.encode('ascii') in content:
2175 raise ValueError('Boundary overlaps with data')
2176 out += content
2177
2178 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2179
2180 return out, content_type
2181
2182
2183def multipart_encode(data, boundary=None):
2184 '''
2185 Encode a dict to RFC 7578-compliant form-data
2186
2187 data:
2188 A dict where keys and values can be either Unicode or bytes-like
2189 objects.
2190 boundary:
2191 If specified, must be a Unicode object used as the boundary. Otherwise
2192 a random boundary is generated.
2193
2194 Reference: https://tools.ietf.org/html/rfc7578
2195 '''
2196 has_specified_boundary = boundary is not None
2197
2198 while True:
2199 if boundary is None:
2200 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2201
2202 try:
10c87c15 2203 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
2204 break
2205 except ValueError:
2206 if has_specified_boundary:
2207 raise
2208 boundary = None
2209
2210 return out, content_type
2211
2212
86296ad2 2213def dict_get(d, key_or_keys, default=None, skip_false_values=True):
cbecc9b9
S
2214 if isinstance(key_or_keys, (list, tuple)):
2215 for key in key_or_keys:
86296ad2
S
2216 if key not in d or d[key] is None or skip_false_values and not d[key]:
2217 continue
2218 return d[key]
cbecc9b9
S
2219 return default
2220 return d.get(key_or_keys, default)
2221
2222
329ca3be 2223def try_get(src, getter, expected_type=None):
a32a9a7e
S
2224 if not isinstance(getter, (list, tuple)):
2225 getter = [getter]
2226 for get in getter:
2227 try:
2228 v = get(src)
2229 except (AttributeError, KeyError, TypeError, IndexError):
2230 pass
2231 else:
2232 if expected_type is None or isinstance(v, expected_type):
2233 return v
329ca3be
S
2234
2235
6cc62232
S
2236def merge_dicts(*dicts):
2237 merged = {}
2238 for a_dict in dicts:
2239 for k, v in a_dict.items():
2240 if v is None:
2241 continue
2242 if (k not in merged or
2243 (isinstance(v, compat_str) and v and
2244 isinstance(merged[k], compat_str) and
2245 not merged[k])):
2246 merged[k] = v
2247 return merged
2248
2249
8e60dc75
S
2250def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2251 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2252
16392824 2253
a1a530b0
PH
2254US_RATINGS = {
2255 'G': 0,
2256 'PG': 10,
2257 'PG-13': 13,
2258 'R': 16,
2259 'NC': 18,
2260}
fac55558
PH
2261
2262
a8795327 2263TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
2264 'TV-Y': 0,
2265 'TV-Y7': 7,
2266 'TV-G': 0,
2267 'TV-PG': 0,
2268 'TV-14': 14,
2269 'TV-MA': 17,
a8795327
S
2270}
2271
2272
146c80e2 2273def parse_age_limit(s):
a8795327
S
2274 if type(s) == int:
2275 return s if 0 <= s <= 21 else None
2276 if not isinstance(s, compat_basestring):
d838b1bd 2277 return None
146c80e2 2278 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
2279 if m:
2280 return int(m.group('age'))
2281 if s in US_RATINGS:
2282 return US_RATINGS[s]
5a16c9d9 2283 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 2284 if m:
5a16c9d9 2285 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 2286 return None
146c80e2
S
2287
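# Examples (illustrative):
#
#     parse_age_limit('18+') == 18
#     parse_age_limit('PG-13') == 13
#     parse_age_limit('TV-MA') == 17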
2288
fac55558 2289def strip_jsonp(code):
609a61e3 2290 return re.sub(
5552c9eb 2291 r'''(?sx)^
e9c671d5 2292 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
2293 (?:\s*&&\s*(?P=func_name))?
2294 \s*\(\s*(?P<callback_data>.*)\);?
2295 \s*?(?://[^\n]*)*$''',
2296 r'\g<callback_data>', code)
478c2c61
PH
2297
2298
e05f6939 2299def js_to_json(code):
4195096e
S
2300 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
2301 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2302 INTEGER_TABLE = (
2303 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2304 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2305 )
2306
e05f6939 2307 def fix_kv(m):
e7b6d122
PH
2308 v = m.group(0)
2309 if v in ('true', 'false', 'null'):
2310 return v
b3ee552e 2311 elif v.startswith('/*') or v.startswith('//') or v == ',':
bd1e4844 2312 return ""
2313
2314 if v[0] in ("'", '"'):
2315 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 2316 '"': '\\"',
bd1e4844 2317 "\\'": "'",
2318 '\\\n': '',
2319 '\\x': '\\u00',
2320 }.get(m.group(0), m.group(0)), v[1:-1])
2321
89ac4a19
S
2322 for regex, base in INTEGER_TABLE:
2323 im = re.match(regex, v)
2324 if im:
e4659b45 2325 i = int(im.group(1), base)
89ac4a19
S
2326 return '"%d":' % i if v.endswith(':') else '%d' % i
2327
e7b6d122 2328 return '"%s"' % v
e05f6939 2329
bd1e4844 2330 return re.sub(r'''(?sx)
2331 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2332 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4195096e 2333 {comment}|,(?={skip}[\]}}])|
c384d537 2334 (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
4195096e
S
2335 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
2336 [0-9]+(?={skip}:)
2337 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
e05f6939
PH
2338
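# Example (illustrative): relaxed JavaScript object literals (unquoted keys,
# single quotes, hex/octal integers) are turned into strict JSON that
# json.loads() accepts.
#
#     js_to_json("{foo: 'bar', baz: 0x10}") == '{"foo": "bar", "baz": 16}'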
2339
478c2c61
PH
2340def qualities(quality_ids):
2341 """ Get a numeric quality value out of a list of possible values """
2342 def q(qid):
2343 try:
2344 return quality_ids.index(qid)
2345 except ValueError:
2346 return -1
2347 return q
2348
acd69589
PH
2349
2350DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2351
a020a0dc
PH
2352
2353def limit_length(s, length):
2354 """ Add ellipses to overly long strings """
2355 if s is None:
2356 return None
2357 ELLIPSES = '...'
2358 if len(s) > length:
2359 return s[:length - len(ELLIPSES)] + ELLIPSES
2360 return s
48844745
PH
2361
2362
2363def version_tuple(v):
5f9b8394 2364 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
2365
2366
2367def is_outdated_version(version, limit, assume_new=True):
2368 if not version:
2369 return not assume_new
2370 try:
2371 return version_tuple(version) < version_tuple(limit)
2372 except ValueError:
2373 return not assume_new
732ea2f0
PH
2374
2375
2376def ytdl_is_updateable():
2377 """ Returns if youtube-dl can be updated with -U """
2378 from zipimport import zipimporter
2379
2380 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
7d4111ed
PH
2381
2382
2383def args_to_str(args):
2384 # Get a short string representation for a subprocess command
702ccf2d 2385 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
2386
2387
9b9c5355 2388def error_to_compat_str(err):
fdae2358
S
2389 err_str = str(err)
2390 # On Python 2, the error byte string must be decoded with the proper
2391 # encoding rather than ASCII
2392 if sys.version_info[0] < 3:
2393 err_str = err_str.decode(preferredencoding())
2394 return err_str
2395
2396
c460bdd5 2397def mimetype2ext(mt):
eb9ee194
S
2398 if mt is None:
2399 return None
2400
765ac263
JMF
2401 ext = {
2402 'audio/mp4': 'm4a',
6c33d24b
YCH
2403 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here we use .mp3 as
2404 # it's the most popular one
2405 'audio/mpeg': 'mp3',
765ac263
JMF
2406 }.get(mt)
2407 if ext is not None:
2408 return ext
2409
c460bdd5 2410 _, _, res = mt.rpartition('/')
6562d34a 2411 res = res.split(';')[0].strip().lower()
c460bdd5
PH
2412
2413 return {
f6861ec9 2414 '3gpp': '3gp',
cafcf657 2415 'smptett+xml': 'tt',
cafcf657 2416 'ttaf+xml': 'dfxp',
a0d8d704 2417 'ttml+xml': 'ttml',
f6861ec9 2418 'x-flv': 'flv',
a0d8d704 2419 'x-mp4-fragmented': 'mp4',
d4f05d47 2420 'x-ms-sami': 'sami',
a0d8d704 2421 'x-ms-wmv': 'wmv',
b4173f15
RA
2422 'mpegurl': 'm3u8',
2423 'x-mpegurl': 'm3u8',
2424 'vnd.apple.mpegurl': 'm3u8',
2425 'dash+xml': 'mpd',
b4173f15 2426 'f4m+xml': 'f4m',
f164b971 2427 'hds+xml': 'f4m',
e910fe2f 2428 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 2429 'quicktime': 'mov',
98ce1a3f 2430 'mp2t': 'ts',
c460bdd5
PH
2431 }.get(res, res)
2432
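# Examples (illustrative):
#
#     mimetype2ext('video/mp4') == 'mp4'
#     mimetype2ext('audio/mpeg') == 'mp3'
#     mimetype2ext('application/x-mpegURL') == 'm3u8'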
2433
4f3c5e06 2434def parse_codecs(codecs_str):
2435 # http://tools.ietf.org/html/rfc6381
2436 if not codecs_str:
2437 return {}
2438 splited_codecs = list(filter(None, map(
2439 lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
2440 vcodec, acodec = None, None
2441 for full_codec in splited_codecs:
2442 codec = full_codec.split('.')[0]
ffe6979e 2443 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1'):
4f3c5e06 2444 if not vcodec:
2445 vcodec = full_codec
60f5c9fb 2446 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
4f3c5e06 2447 if not acodec:
2448 acodec = full_codec
2449 else:
60f5c9fb 2450 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
4f3c5e06 2451 if not vcodec and not acodec:
2452 if len(splited_codecs) == 2:
2453 return {
2454 'vcodec': vcodec,
2455 'acodec': acodec,
2456 }
2457 elif len(splited_codecs) == 1:
2458 return {
2459 'vcodec': 'none',
2460 'acodec': vcodec,
2461 }
2462 else:
2463 return {
2464 'vcodec': vcodec or 'none',
2465 'acodec': acodec or 'none',
2466 }
2467 return {}
2468
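# Example (illustrative): an RFC 6381 codecs string is split into video and
# audio codec fields.
#
#     parse_codecs('avc1.64001f, mp4a.40.2') == {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2'}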
2469
2ccd1b10 2470def urlhandle_detect_ext(url_handle):
79298173 2471 getheader = url_handle.headers.get
2ccd1b10 2472
b55ee18f
PH
2473 cd = getheader('Content-Disposition')
2474 if cd:
2475 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2476 if m:
2477 e = determine_ext(m.group('filename'), default_ext=None)
2478 if e:
2479 return e
2480
c460bdd5 2481 return mimetype2ext(getheader('Content-Type'))
05900629
PH
2482
2483
1e399778
YCH
2484def encode_data_uri(data, mime_type):
2485 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
2486
2487
05900629 2488def age_restricted(content_limit, age_limit):
6ec6cb4e 2489 """ Returns True iff the content should be blocked """
05900629
PH
2490
2491 if age_limit is None: # No limit set
2492 return False
2493 if content_limit is None:
2494 return False # Content available for everyone
2495 return age_limit < content_limit
61ca9a80
PH
2496
2497
2498def is_html(first_bytes):
2499 """ Detect whether a file contains HTML by examining its first bytes. """
2500
2501 BOMS = [
2502 (b'\xef\xbb\xbf', 'utf-8'),
2503 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2504 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2505 (b'\xff\xfe', 'utf-16-le'),
2506 (b'\xfe\xff', 'utf-16-be'),
2507 ]
2508 for bom, enc in BOMS:
2509 if first_bytes.startswith(bom):
2510 s = first_bytes[len(bom):].decode(enc, 'replace')
2511 break
2512 else:
2513 s = first_bytes.decode('utf-8', 'replace')
2514
2515 return re.match(r'^\s*<', s)
a055469f
PH
2516
2517
2518def determine_protocol(info_dict):
2519 protocol = info_dict.get('protocol')
2520 if protocol is not None:
2521 return protocol
2522
2523 url = info_dict['url']
2524 if url.startswith('rtmp'):
2525 return 'rtmp'
2526 elif url.startswith('mms'):
2527 return 'mms'
2528 elif url.startswith('rtsp'):
2529 return 'rtsp'
2530
2531 ext = determine_ext(url)
2532 if ext == 'm3u8':
2533 return 'm3u8'
2534 elif ext == 'f4m':
2535 return 'f4m'
2536
2537 return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2538
2539
2540def render_table(header_row, data):
2541 """ Render a list of rows, each as a list of values """
2542 table = [header_row] + data
2543 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2544 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2545 return '\n'.join(format_str % tuple(row) for row in table)
347de493
PH
2546
2547
2548def _match_one(filter_part, dct):
2549 COMPARISON_OPERATORS = {
2550 '<': operator.lt,
2551 '<=': operator.le,
2552 '>': operator.gt,
2553 '>=': operator.ge,
2554 '=': operator.eq,
2555 '!=': operator.ne,
2556 }
2557 operator_rex = re.compile(r'''(?x)\s*
2558 (?P<key>[a-z_]+)
2559 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2560 (?:
2561 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
db13c16e 2562 (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
347de493
PH
2563 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2564 )
2565 \s*$
2566 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2567 m = operator_rex.search(filter_part)
2568 if m:
2569 op = COMPARISON_OPERATORS[m.group('op')]
e5a088dc 2570 actual_value = dct.get(m.group('key'))
db13c16e
S
2571 if (m.group('quotedstrval') is not None or
2572 m.group('strval') is not None or
e5a088dc
S
2573 # If the original field is a string and matching comparisonvalue is
2574 # a number we should respect the origin of the original field
2575 # and process comparison value as a string (see
2576 # https://github.com/rg3/youtube-dl/issues/11082).
2577 actual_value is not None and m.group('intval') is not None and
2578 isinstance(actual_value, compat_str)):
347de493
PH
2579 if m.group('op') not in ('=', '!='):
2580 raise ValueError(
2581 'Operator %s does not support string values!' % m.group('op'))
db13c16e
S
2582 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
2583 quote = m.group('quote')
2584 if quote is not None:
2585 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
347de493
PH
2586 else:
2587 try:
2588 comparison_value = int(m.group('intval'))
2589 except ValueError:
2590 comparison_value = parse_filesize(m.group('intval'))
2591 if comparison_value is None:
2592 comparison_value = parse_filesize(m.group('intval') + 'B')
2593 if comparison_value is None:
2594 raise ValueError(
2595 'Invalid integer value %r in filter part %r' % (
2596 m.group('intval'), filter_part))
347de493
PH
2597 if actual_value is None:
2598 return m.group('none_inclusive')
2599 return op(actual_value, comparison_value)
2600
2601 UNARY_OPERATORS = {
1cc47c66
S
2602 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
2603 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493
PH
2604 }
2605 operator_rex = re.compile(r'''(?x)\s*
2606 (?P<op>%s)\s*(?P<key>[a-z_]+)
2607 \s*$
2608 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2609 m = operator_rex.search(filter_part)
2610 if m:
2611 op = UNARY_OPERATORS[m.group('op')]
2612 actual_value = dct.get(m.group('key'))
2613 return op(actual_value)
2614
2615 raise ValueError('Invalid filter part %r' % filter_part)
2616
2617
2618def match_str(filter_str, dct):
2619 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2620
2621 return all(
2622 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2623
2624
2625def match_filter_func(filter_str):
2626 def _match_func(info_dict):
2627 if match_str(filter_str, info_dict):
2628 return None
2629 else:
2630 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2631 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2632 return _match_func
91410c9b
PH
2633
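# Examples (illustrative): simple '&'-joined comparisons against an info dict;
# a trailing '?' on the operator makes a comparison pass when the field is
# missing.
#
#     match_str('like_count > 100 & dislike_count <? 50', {'like_count': 190}) == True
#     match_str('!is_live', {'is_live': False}) == True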
2634
bf6427d2
YCH
2635def parse_dfxp_time_expr(time_expr):
2636 if not time_expr:
d631d5f9 2637 return
bf6427d2
YCH
2638
2639 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2640 if mobj:
2641 return float(mobj.group('time_offset'))
2642
db2fe38b 2643 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 2644 if mobj:
db2fe38b 2645 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
2646
2647
c1c924ab
YCH
2648def srt_subtitles_timecode(seconds):
2649 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
bf6427d2
YCH
2650
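# Examples (illustrative):
#
#     parse_dfxp_time_expr('00:01:30.5') == 90.5
#     parse_dfxp_time_expr('5.2s') == 5.2
#     srt_subtitles_timecode(3661.5) == '01:01:01,500'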
2651
2652def dfxp2srt(dfxp_data):
3869028f
YCH
2653 '''
2654 @param dfxp_data A bytes-like object containing DFXP data
2655 @returns A unicode object containing converted SRT data
2656 '''
5b995f71 2657 LEGACY_NAMESPACES = (
3869028f
YCH
2658 (b'http://www.w3.org/ns/ttml', [
2659 b'http://www.w3.org/2004/11/ttaf1',
2660 b'http://www.w3.org/2006/04/ttaf1',
2661 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 2662 ]),
3869028f
YCH
2663 (b'http://www.w3.org/ns/ttml#styling', [
2664 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
2665 ]),
2666 )
2667
2668 SUPPORTED_STYLING = [
2669 'color',
2670 'fontFamily',
2671 'fontSize',
2672 'fontStyle',
2673 'fontWeight',
2674 'textDecoration'
2675 ]
2676
4e335771 2677 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 2678 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 2679 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 2680 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 2681 })
bf6427d2 2682
5b995f71
RA
2683 styles = {}
2684 default_style = {}
2685
87de7069 2686 class TTMLPElementParser(object):
5b995f71
RA
2687 _out = ''
2688 _unclosed_elements = []
2689 _applied_styles = []
bf6427d2 2690
2b14cb56 2691 def start(self, tag, attrib):
5b995f71
RA
2692 if tag in (_x('ttml:br'), 'br'):
2693 self._out += '\n'
2694 else:
2695 unclosed_elements = []
2696 style = {}
2697 element_style_id = attrib.get('style')
2698 if default_style:
2699 style.update(default_style)
2700 if element_style_id:
2701 style.update(styles.get(element_style_id, {}))
2702 for prop in SUPPORTED_STYLING:
2703 prop_val = attrib.get(_x('tts:' + prop))
2704 if prop_val:
2705 style[prop] = prop_val
2706 if style:
2707 font = ''
2708 for k, v in sorted(style.items()):
2709 if self._applied_styles and self._applied_styles[-1].get(k) == v:
2710 continue
2711 if k == 'color':
2712 font += ' color="%s"' % v
2713 elif k == 'fontSize':
2714 font += ' size="%s"' % v
2715 elif k == 'fontFamily':
2716 font += ' face="%s"' % v
2717 elif k == 'fontWeight' and v == 'bold':
2718 self._out += '<b>'
2719 unclosed_elements.append('b')
2720 elif k == 'fontStyle' and v == 'italic':
2721 self._out += '<i>'
2722 unclosed_elements.append('i')
2723 elif k == 'textDecoration' and v == 'underline':
2724 self._out += '<u>'
2725 unclosed_elements.append('u')
2726 if font:
2727 self._out += '<font' + font + '>'
2728 unclosed_elements.append('font')
2729 applied_style = {}
2730 if self._applied_styles:
2731 applied_style.update(self._applied_styles[-1])
2732 applied_style.update(style)
2733 self._applied_styles.append(applied_style)
2734 self._unclosed_elements.append(unclosed_elements)
bf6427d2 2735
2b14cb56 2736 def end(self, tag):
5b995f71
RA
2737 if tag not in (_x('ttml:br'), 'br'):
2738 unclosed_elements = self._unclosed_elements.pop()
2739 for element in reversed(unclosed_elements):
2740 self._out += '</%s>' % element
2741 if unclosed_elements and self._applied_styles:
2742 self._applied_styles.pop()
bf6427d2 2743
2b14cb56 2744 def data(self, data):
5b995f71 2745 self._out += data
2b14cb56 2746
2747 def close(self):
5b995f71 2748 return self._out.strip()
2b14cb56 2749
2750 def parse_node(node):
2751 target = TTMLPElementParser()
2752 parser = xml.etree.ElementTree.XMLParser(target=target)
2753 parser.feed(xml.etree.ElementTree.tostring(node))
2754 return parser.close()
bf6427d2 2755
5b995f71
RA
2756 for k, v in LEGACY_NAMESPACES:
2757 for ns in v:
2758 dfxp_data = dfxp_data.replace(ns, k)
2759
3869028f 2760 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 2761 out = []
5b995f71 2762 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
2763
2764 if not paras:
2765 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 2766
5b995f71
RA
2767 repeat = False
2768 while True:
2769 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
2770 style_id = style.get('id') or style.get(_x('xml:id'))
2771 if not style_id:
2772 continue
5b995f71
RA
2773 parent_style_id = style.get('style')
2774 if parent_style_id:
2775 if parent_style_id not in styles:
2776 repeat = True
2777 continue
2778 styles[style_id] = styles[parent_style_id].copy()
2779 for prop in SUPPORTED_STYLING:
2780 prop_val = style.get(_x('tts:' + prop))
2781 if prop_val:
2782 styles.setdefault(style_id, {})[prop] = prop_val
2783 if repeat:
2784 repeat = False
2785 else:
2786 break
2787
2788 for p in ('body', 'div'):
2789 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
2790 if ele is None:
2791 continue
2792 style = styles.get(ele.get('style'))
2793 if not style:
2794 continue
2795 default_style.update(style)
2796
bf6427d2 2797 for para, index in zip(paras, itertools.count(1)):
d631d5f9 2798 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 2799 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
2800 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2801 if begin_time is None:
2802 continue
7dff0363 2803 if not end_time:
d631d5f9
YCH
2804 if not dur:
2805 continue
2806 end_time = begin_time + dur
bf6427d2
YCH
2807 out.append('%d\n%s --> %s\n%s\n\n' % (
2808 index,
c1c924ab
YCH
2809 srt_subtitles_timecode(begin_time),
2810 srt_subtitles_timecode(end_time),
bf6427d2
YCH
2811 parse_node(para)))
2812
2813 return ''.join(out)
2814
2815
66e289ba
S
2816def cli_option(params, command_option, param):
2817 param = params.get(param)
98e698f1
RA
2818 if param:
2819 param = compat_str(param)
66e289ba
S
2820 return [command_option, param] if param is not None else []
2821
2822
2823def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2824 param = params.get(param)
5b232f46
S
2825 if param is None:
2826 return []
66e289ba
S
2827 assert isinstance(param, bool)
2828 if separator:
2829 return [command_option + separator + (true_value if param else false_value)]
2830 return [command_option, true_value if param else false_value]
2831
2832
2833def cli_valueless_option(params, command_option, param, expected_value=True):
2834 param = params.get(param)
2835 return [command_option] if param == expected_value else []
2836
2837
2838def cli_configuration_args(params, param, default=[]):
2839 ex_args = params.get(param)
2840 if ex_args is None:
2841 return default
2842 assert isinstance(ex_args, list)
2843 return ex_args
2844
2845
39672624
YCH
2846class ISO639Utils(object):
2847 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2848 _lang_map = {
2849 'aa': 'aar',
2850 'ab': 'abk',
2851 'ae': 'ave',
2852 'af': 'afr',
2853 'ak': 'aka',
2854 'am': 'amh',
2855 'an': 'arg',
2856 'ar': 'ara',
2857 'as': 'asm',
2858 'av': 'ava',
2859 'ay': 'aym',
2860 'az': 'aze',
2861 'ba': 'bak',
2862 'be': 'bel',
2863 'bg': 'bul',
2864 'bh': 'bih',
2865 'bi': 'bis',
2866 'bm': 'bam',
2867 'bn': 'ben',
2868 'bo': 'bod',
2869 'br': 'bre',
2870 'bs': 'bos',
2871 'ca': 'cat',
2872 'ce': 'che',
2873 'ch': 'cha',
2874 'co': 'cos',
2875 'cr': 'cre',
2876 'cs': 'ces',
2877 'cu': 'chu',
2878 'cv': 'chv',
2879 'cy': 'cym',
2880 'da': 'dan',
2881 'de': 'deu',
2882 'dv': 'div',
2883 'dz': 'dzo',
2884 'ee': 'ewe',
2885 'el': 'ell',
2886 'en': 'eng',
2887 'eo': 'epo',
2888 'es': 'spa',
2889 'et': 'est',
2890 'eu': 'eus',
2891 'fa': 'fas',
2892 'ff': 'ful',
2893 'fi': 'fin',
2894 'fj': 'fij',
2895 'fo': 'fao',
2896 'fr': 'fra',
2897 'fy': 'fry',
2898 'ga': 'gle',
2899 'gd': 'gla',
2900 'gl': 'glg',
2901 'gn': 'grn',
2902 'gu': 'guj',
2903 'gv': 'glv',
2904 'ha': 'hau',
2905 'he': 'heb',
2906 'hi': 'hin',
2907 'ho': 'hmo',
2908 'hr': 'hrv',
2909 'ht': 'hat',
2910 'hu': 'hun',
2911 'hy': 'hye',
2912 'hz': 'her',
2913 'ia': 'ina',
2914 'id': 'ind',
2915 'ie': 'ile',
2916 'ig': 'ibo',
2917 'ii': 'iii',
2918 'ik': 'ipk',
2919 'io': 'ido',
2920 'is': 'isl',
2921 'it': 'ita',
2922 'iu': 'iku',
2923 'ja': 'jpn',
2924 'jv': 'jav',
2925 'ka': 'kat',
2926 'kg': 'kon',
2927 'ki': 'kik',
2928 'kj': 'kua',
2929 'kk': 'kaz',
2930 'kl': 'kal',
2931 'km': 'khm',
2932 'kn': 'kan',
2933 'ko': 'kor',
2934 'kr': 'kau',
2935 'ks': 'kas',
2936 'ku': 'kur',
2937 'kv': 'kom',
2938 'kw': 'cor',
2939 'ky': 'kir',
2940 'la': 'lat',
2941 'lb': 'ltz',
2942 'lg': 'lug',
2943 'li': 'lim',
2944 'ln': 'lin',
2945 'lo': 'lao',
2946 'lt': 'lit',
2947 'lu': 'lub',
2948 'lv': 'lav',
2949 'mg': 'mlg',
2950 'mh': 'mah',
2951 'mi': 'mri',
2952 'mk': 'mkd',
2953 'ml': 'mal',
2954 'mn': 'mon',
2955 'mr': 'mar',
2956 'ms': 'msa',
2957 'mt': 'mlt',
2958 'my': 'mya',
2959 'na': 'nau',
2960 'nb': 'nob',
2961 'nd': 'nde',
2962 'ne': 'nep',
2963 'ng': 'ndo',
2964 'nl': 'nld',
2965 'nn': 'nno',
2966 'no': 'nor',
2967 'nr': 'nbl',
2968 'nv': 'nav',
2969 'ny': 'nya',
2970 'oc': 'oci',
2971 'oj': 'oji',
2972 'om': 'orm',
2973 'or': 'ori',
2974 'os': 'oss',
2975 'pa': 'pan',
2976 'pi': 'pli',
2977 'pl': 'pol',
2978 'ps': 'pus',
2979 'pt': 'por',
2980 'qu': 'que',
2981 'rm': 'roh',
2982 'rn': 'run',
2983 'ro': 'ron',
2984 'ru': 'rus',
2985 'rw': 'kin',
2986 'sa': 'san',
2987 'sc': 'srd',
2988 'sd': 'snd',
2989 'se': 'sme',
2990 'sg': 'sag',
2991 'si': 'sin',
2992 'sk': 'slk',
2993 'sl': 'slv',
2994 'sm': 'smo',
2995 'sn': 'sna',
2996 'so': 'som',
2997 'sq': 'sqi',
2998 'sr': 'srp',
2999 'ss': 'ssw',
3000 'st': 'sot',
3001 'su': 'sun',
3002 'sv': 'swe',
3003 'sw': 'swa',
3004 'ta': 'tam',
3005 'te': 'tel',
3006 'tg': 'tgk',
3007 'th': 'tha',
3008 'ti': 'tir',
3009 'tk': 'tuk',
3010 'tl': 'tgl',
3011 'tn': 'tsn',
3012 'to': 'ton',
3013 'tr': 'tur',
3014 'ts': 'tso',
3015 'tt': 'tat',
3016 'tw': 'twi',
3017 'ty': 'tah',
3018 'ug': 'uig',
3019 'uk': 'ukr',
3020 'ur': 'urd',
3021 'uz': 'uzb',
3022 've': 'ven',
3023 'vi': 'vie',
3024 'vo': 'vol',
3025 'wa': 'wln',
3026 'wo': 'wol',
3027 'xh': 'xho',
3028 'yi': 'yid',
3029 'yo': 'yor',
3030 'za': 'zha',
3031 'zh': 'zho',
3032 'zu': 'zul',
3033 }
3034
3035 @classmethod
3036 def short2long(cls, code):
3037 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3038 return cls._lang_map.get(code[:2])
3039
3040 @classmethod
3041 def long2short(cls, code):
3042 """Convert language code from ISO 639-2/T to ISO 639-1"""
3043 for short_name, long_name in cls._lang_map.items():
3044 if long_name == code:
3045 return short_name
3046
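# Examples (illustrative):
#
#     ISO639Utils.short2long('en') == 'eng'
#     ISO639Utils.long2short('deu') == 'de'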
3047
4eb10f66
YCH
3048class ISO3166Utils(object):
3049 # From http://data.okfn.org/data/core/country-list
3050 _country_map = {
3051 'AF': 'Afghanistan',
3052 'AX': 'Åland Islands',
3053 'AL': 'Albania',
3054 'DZ': 'Algeria',
3055 'AS': 'American Samoa',
3056 'AD': 'Andorra',
3057 'AO': 'Angola',
3058 'AI': 'Anguilla',
3059 'AQ': 'Antarctica',
3060 'AG': 'Antigua and Barbuda',
3061 'AR': 'Argentina',
3062 'AM': 'Armenia',
3063 'AW': 'Aruba',
3064 'AU': 'Australia',
3065 'AT': 'Austria',
3066 'AZ': 'Azerbaijan',
3067 'BS': 'Bahamas',
3068 'BH': 'Bahrain',
3069 'BD': 'Bangladesh',
3070 'BB': 'Barbados',
3071 'BY': 'Belarus',
3072 'BE': 'Belgium',
3073 'BZ': 'Belize',
3074 'BJ': 'Benin',
3075 'BM': 'Bermuda',
3076 'BT': 'Bhutan',
3077 'BO': 'Bolivia, Plurinational State of',
3078 'BQ': 'Bonaire, Sint Eustatius and Saba',
3079 'BA': 'Bosnia and Herzegovina',
3080 'BW': 'Botswana',
3081 'BV': 'Bouvet Island',
3082 'BR': 'Brazil',
3083 'IO': 'British Indian Ocean Territory',
3084 'BN': 'Brunei Darussalam',
3085 'BG': 'Bulgaria',
3086 'BF': 'Burkina Faso',
3087 'BI': 'Burundi',
3088 'KH': 'Cambodia',
3089 'CM': 'Cameroon',
3090 'CA': 'Canada',
3091 'CV': 'Cape Verde',
3092 'KY': 'Cayman Islands',
3093 'CF': 'Central African Republic',
3094 'TD': 'Chad',
3095 'CL': 'Chile',
3096 'CN': 'China',
3097 'CX': 'Christmas Island',
3098 'CC': 'Cocos (Keeling) Islands',
3099 'CO': 'Colombia',
3100 'KM': 'Comoros',
3101 'CG': 'Congo',
3102 'CD': 'Congo, the Democratic Republic of the',
3103 'CK': 'Cook Islands',
3104 'CR': 'Costa Rica',
3105 'CI': 'Côte d\'Ivoire',
3106 'HR': 'Croatia',
3107 'CU': 'Cuba',
3108 'CW': 'Curaçao',
3109 'CY': 'Cyprus',
3110 'CZ': 'Czech Republic',
3111 'DK': 'Denmark',
3112 'DJ': 'Djibouti',
3113 'DM': 'Dominica',
3114 'DO': 'Dominican Republic',
3115 'EC': 'Ecuador',
3116 'EG': 'Egypt',
3117 'SV': 'El Salvador',
3118 'GQ': 'Equatorial Guinea',
3119 'ER': 'Eritrea',
3120 'EE': 'Estonia',
3121 'ET': 'Ethiopia',
3122 'FK': 'Falkland Islands (Malvinas)',
3123 'FO': 'Faroe Islands',
3124 'FJ': 'Fiji',
3125 'FI': 'Finland',
3126 'FR': 'France',
3127 'GF': 'French Guiana',
3128 'PF': 'French Polynesia',
3129 'TF': 'French Southern Territories',
3130 'GA': 'Gabon',
3131 'GM': 'Gambia',
3132 'GE': 'Georgia',
3133 'DE': 'Germany',
3134 'GH': 'Ghana',
3135 'GI': 'Gibraltar',
3136 'GR': 'Greece',
3137 'GL': 'Greenland',
3138 'GD': 'Grenada',
3139 'GP': 'Guadeloupe',
3140 'GU': 'Guam',
3141 'GT': 'Guatemala',
3142 'GG': 'Guernsey',
3143 'GN': 'Guinea',
3144 'GW': 'Guinea-Bissau',
3145 'GY': 'Guyana',
3146 'HT': 'Haiti',
3147 'HM': 'Heard Island and McDonald Islands',
3148 'VA': 'Holy See (Vatican City State)',
3149 'HN': 'Honduras',
3150 'HK': 'Hong Kong',
3151 'HU': 'Hungary',
3152 'IS': 'Iceland',
3153 'IN': 'India',
3154 'ID': 'Indonesia',
3155 'IR': 'Iran, Islamic Republic of',
3156 'IQ': 'Iraq',
3157 'IE': 'Ireland',
3158 'IM': 'Isle of Man',
3159 'IL': 'Israel',
3160 'IT': 'Italy',
3161 'JM': 'Jamaica',
3162 'JP': 'Japan',
3163 'JE': 'Jersey',
3164 'JO': 'Jordan',
3165 'KZ': 'Kazakhstan',
3166 'KE': 'Kenya',
3167 'KI': 'Kiribati',
3168 'KP': 'Korea, Democratic People\'s Republic of',
3169 'KR': 'Korea, Republic of',
3170 'KW': 'Kuwait',
3171 'KG': 'Kyrgyzstan',
3172 'LA': 'Lao People\'s Democratic Republic',
3173 'LV': 'Latvia',
3174 'LB': 'Lebanon',
3175 'LS': 'Lesotho',
3176 'LR': 'Liberia',
3177 'LY': 'Libya',
3178 'LI': 'Liechtenstein',
3179 'LT': 'Lithuania',
3180 'LU': 'Luxembourg',
3181 'MO': 'Macao',
3182 'MK': 'Macedonia, the Former Yugoslav Republic of',
3183 'MG': 'Madagascar',
3184 'MW': 'Malawi',
3185 'MY': 'Malaysia',
3186 'MV': 'Maldives',
3187 'ML': 'Mali',
3188 'MT': 'Malta',
3189 'MH': 'Marshall Islands',
3190 'MQ': 'Martinique',
3191 'MR': 'Mauritania',
3192 'MU': 'Mauritius',
3193 'YT': 'Mayotte',
3194 'MX': 'Mexico',
3195 'FM': 'Micronesia, Federated States of',
3196 'MD': 'Moldova, Republic of',
3197 'MC': 'Monaco',
3198 'MN': 'Mongolia',
3199 'ME': 'Montenegro',
3200 'MS': 'Montserrat',
3201 'MA': 'Morocco',
3202 'MZ': 'Mozambique',
3203 'MM': 'Myanmar',
3204 'NA': 'Namibia',
3205 'NR': 'Nauru',
3206 'NP': 'Nepal',
3207 'NL': 'Netherlands',
3208 'NC': 'New Caledonia',
3209 'NZ': 'New Zealand',
3210 'NI': 'Nicaragua',
3211 'NE': 'Niger',
3212 'NG': 'Nigeria',
3213 'NU': 'Niue',
3214 'NF': 'Norfolk Island',
3215 'MP': 'Northern Mariana Islands',
3216 'NO': 'Norway',
3217 'OM': 'Oman',
3218 'PK': 'Pakistan',
3219 'PW': 'Palau',
3220 'PS': 'Palestine, State of',
3221 'PA': 'Panama',
3222 'PG': 'Papua New Guinea',
3223 'PY': 'Paraguay',
3224 'PE': 'Peru',
3225 'PH': 'Philippines',
3226 'PN': 'Pitcairn',
3227 'PL': 'Poland',
3228 'PT': 'Portugal',
3229 'PR': 'Puerto Rico',
3230 'QA': 'Qatar',
3231 'RE': 'Réunion',
3232 'RO': 'Romania',
3233 'RU': 'Russian Federation',
3234 'RW': 'Rwanda',
3235 'BL': 'Saint Barthélemy',
3236 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3237 'KN': 'Saint Kitts and Nevis',
3238 'LC': 'Saint Lucia',
3239 'MF': 'Saint Martin (French part)',
3240 'PM': 'Saint Pierre and Miquelon',
3241 'VC': 'Saint Vincent and the Grenadines',
3242 'WS': 'Samoa',
3243 'SM': 'San Marino',
3244 'ST': 'Sao Tome and Principe',
3245 'SA': 'Saudi Arabia',
3246 'SN': 'Senegal',
3247 'RS': 'Serbia',
3248 'SC': 'Seychelles',
3249 'SL': 'Sierra Leone',
3250 'SG': 'Singapore',
3251 'SX': 'Sint Maarten (Dutch part)',
3252 'SK': 'Slovakia',
3253 'SI': 'Slovenia',
3254 'SB': 'Solomon Islands',
3255 'SO': 'Somalia',
3256 'ZA': 'South Africa',
3257 'GS': 'South Georgia and the South Sandwich Islands',
3258 'SS': 'South Sudan',
3259 'ES': 'Spain',
3260 'LK': 'Sri Lanka',
3261 'SD': 'Sudan',
3262 'SR': 'Suriname',
3263 'SJ': 'Svalbard and Jan Mayen',
3264 'SZ': 'Swaziland',
3265 'SE': 'Sweden',
3266 'CH': 'Switzerland',
3267 'SY': 'Syrian Arab Republic',
3268 'TW': 'Taiwan, Province of China',
3269 'TJ': 'Tajikistan',
3270 'TZ': 'Tanzania, United Republic of',
3271 'TH': 'Thailand',
3272 'TL': 'Timor-Leste',
3273 'TG': 'Togo',
3274 'TK': 'Tokelau',
3275 'TO': 'Tonga',
3276 'TT': 'Trinidad and Tobago',
3277 'TN': 'Tunisia',
3278 'TR': 'Turkey',
3279 'TM': 'Turkmenistan',
3280 'TC': 'Turks and Caicos Islands',
3281 'TV': 'Tuvalu',
3282 'UG': 'Uganda',
3283 'UA': 'Ukraine',
3284 'AE': 'United Arab Emirates',
3285 'GB': 'United Kingdom',
3286 'US': 'United States',
3287 'UM': 'United States Minor Outlying Islands',
3288 'UY': 'Uruguay',
3289 'UZ': 'Uzbekistan',
3290 'VU': 'Vanuatu',
3291 'VE': 'Venezuela, Bolivarian Republic of',
3292 'VN': 'Viet Nam',
3293 'VG': 'Virgin Islands, British',
3294 'VI': 'Virgin Islands, U.S.',
3295 'WF': 'Wallis and Futuna',
3296 'EH': 'Western Sahara',
3297 'YE': 'Yemen',
3298 'ZM': 'Zambia',
3299 'ZW': 'Zimbabwe',
3300 }
3301
3302 @classmethod
3303 def short2full(cls, code):
3304 """Convert an ISO 3166-2 country code to the corresponding full name"""
3305 return cls._country_map.get(code.upper())
3306
3307
773f291d
S
3308class GeoUtils(object):
3309 # Major IPv4 address blocks per country
3310 _country_ip_map = {
3311 'AD': '85.94.160.0/19',
3312 'AE': '94.200.0.0/13',
3313 'AF': '149.54.0.0/17',
3314 'AG': '209.59.64.0/18',
3315 'AI': '204.14.248.0/21',
3316 'AL': '46.99.0.0/16',
3317 'AM': '46.70.0.0/15',
3318 'AO': '105.168.0.0/13',
3319 'AP': '159.117.192.0/21',
3320 'AR': '181.0.0.0/12',
3321 'AS': '202.70.112.0/20',
3322 'AT': '84.112.0.0/13',
3323 'AU': '1.128.0.0/11',
3324 'AW': '181.41.0.0/18',
3325 'AZ': '5.191.0.0/16',
3326 'BA': '31.176.128.0/17',
3327 'BB': '65.48.128.0/17',
3328 'BD': '114.130.0.0/16',
3329 'BE': '57.0.0.0/8',
3330 'BF': '129.45.128.0/17',
3331 'BG': '95.42.0.0/15',
3332 'BH': '37.131.0.0/17',
3333 'BI': '154.117.192.0/18',
3334 'BJ': '137.255.0.0/16',
3335 'BL': '192.131.134.0/24',
3336 'BM': '196.12.64.0/18',
3337 'BN': '156.31.0.0/16',
3338 'BO': '161.56.0.0/16',
3339 'BQ': '161.0.80.0/20',
3340 'BR': '152.240.0.0/12',
3341 'BS': '24.51.64.0/18',
3342 'BT': '119.2.96.0/19',
3343 'BW': '168.167.0.0/16',
3344 'BY': '178.120.0.0/13',
3345 'BZ': '179.42.192.0/18',
3346 'CA': '99.224.0.0/11',
3347 'CD': '41.243.0.0/16',
3348 'CF': '196.32.200.0/21',
3349 'CG': '197.214.128.0/17',
3350 'CH': '85.0.0.0/13',
3351 'CI': '154.232.0.0/14',
3352 'CK': '202.65.32.0/19',
3353 'CL': '152.172.0.0/14',
3354 'CM': '165.210.0.0/15',
3355 'CN': '36.128.0.0/10',
3356 'CO': '181.240.0.0/12',
3357 'CR': '201.192.0.0/12',
3358 'CU': '152.206.0.0/15',
3359 'CV': '165.90.96.0/19',
3360 'CW': '190.88.128.0/17',
3361 'CY': '46.198.0.0/15',
3362 'CZ': '88.100.0.0/14',
3363 'DE': '53.0.0.0/8',
3364 'DJ': '197.241.0.0/17',
3365 'DK': '87.48.0.0/12',
3366 'DM': '192.243.48.0/20',
3367 'DO': '152.166.0.0/15',
3368 'DZ': '41.96.0.0/12',
3369 'EC': '186.68.0.0/15',
3370 'EE': '90.190.0.0/15',
3371 'EG': '156.160.0.0/11',
3372 'ER': '196.200.96.0/20',
3373 'ES': '88.0.0.0/11',
3374 'ET': '196.188.0.0/14',
3375 'EU': '2.16.0.0/13',
3376 'FI': '91.152.0.0/13',
3377 'FJ': '144.120.0.0/16',
3378 'FM': '119.252.112.0/20',
3379 'FO': '88.85.32.0/19',
3380 'FR': '90.0.0.0/9',
3381 'GA': '41.158.0.0/15',
3382 'GB': '25.0.0.0/8',
3383 'GD': '74.122.88.0/21',
3384 'GE': '31.146.0.0/16',
3385 'GF': '161.22.64.0/18',
3386 'GG': '62.68.160.0/19',
3387 'GH': '45.208.0.0/14',
3388 'GI': '85.115.128.0/19',
3389 'GL': '88.83.0.0/19',
3390 'GM': '160.182.0.0/15',
3391 'GN': '197.149.192.0/18',
3392 'GP': '104.250.0.0/19',
3393 'GQ': '105.235.224.0/20',
3394 'GR': '94.64.0.0/13',
3395 'GT': '168.234.0.0/16',
3396 'GU': '168.123.0.0/16',
3397 'GW': '197.214.80.0/20',
3398 'GY': '181.41.64.0/18',
3399 'HK': '113.252.0.0/14',
3400 'HN': '181.210.0.0/16',
3401 'HR': '93.136.0.0/13',
3402 'HT': '148.102.128.0/17',
3403 'HU': '84.0.0.0/14',
3404 'ID': '39.192.0.0/10',
3405 'IE': '87.32.0.0/12',
3406 'IL': '79.176.0.0/13',
3407 'IM': '5.62.80.0/20',
3408 'IN': '117.192.0.0/10',
3409 'IO': '203.83.48.0/21',
3410 'IQ': '37.236.0.0/14',
3411 'IR': '2.176.0.0/12',
3412 'IS': '82.221.0.0/16',
3413 'IT': '79.0.0.0/10',
3414 'JE': '87.244.64.0/18',
3415 'JM': '72.27.0.0/17',
3416 'JO': '176.29.0.0/16',
3417 'JP': '126.0.0.0/8',
3418 'KE': '105.48.0.0/12',
3419 'KG': '158.181.128.0/17',
3420 'KH': '36.37.128.0/17',
3421 'KI': '103.25.140.0/22',
3422 'KM': '197.255.224.0/20',
3423 'KN': '198.32.32.0/19',
3424 'KP': '175.45.176.0/22',
3425 'KR': '175.192.0.0/10',
3426 'KW': '37.36.0.0/14',
3427 'KY': '64.96.0.0/15',
3428 'KZ': '2.72.0.0/13',
3429 'LA': '115.84.64.0/18',
3430 'LB': '178.135.0.0/16',
3431 'LC': '192.147.231.0/24',
3432 'LI': '82.117.0.0/19',
3433 'LK': '112.134.0.0/15',
3434 'LR': '41.86.0.0/19',
3435 'LS': '129.232.0.0/17',
3436 'LT': '78.56.0.0/13',
3437 'LU': '188.42.0.0/16',
3438 'LV': '46.109.0.0/16',
3439 'LY': '41.252.0.0/14',
3440 'MA': '105.128.0.0/11',
3441 'MC': '88.209.64.0/18',
3442 'MD': '37.246.0.0/16',
3443 'ME': '178.175.0.0/17',
3444 'MF': '74.112.232.0/21',
3445 'MG': '154.126.0.0/17',
3446 'MH': '117.103.88.0/21',
3447 'MK': '77.28.0.0/15',
3448 'ML': '154.118.128.0/18',
3449 'MM': '37.111.0.0/17',
3450 'MN': '49.0.128.0/17',
3451 'MO': '60.246.0.0/16',
3452 'MP': '202.88.64.0/20',
3453 'MQ': '109.203.224.0/19',
3454 'MR': '41.188.64.0/18',
3455 'MS': '208.90.112.0/22',
3456 'MT': '46.11.0.0/16',
3457 'MU': '105.16.0.0/12',
3458 'MV': '27.114.128.0/18',
3459 'MW': '105.234.0.0/16',
3460 'MX': '187.192.0.0/11',
3461 'MY': '175.136.0.0/13',
3462 'MZ': '197.218.0.0/15',
3463 'NA': '41.182.0.0/16',
3464 'NC': '101.101.0.0/18',
3465 'NE': '197.214.0.0/18',
3466 'NF': '203.17.240.0/22',
3467 'NG': '105.112.0.0/12',
3468 'NI': '186.76.0.0/15',
3469 'NL': '145.96.0.0/11',
3470 'NO': '84.208.0.0/13',
3471 'NP': '36.252.0.0/15',
3472 'NR': '203.98.224.0/19',
3473 'NU': '49.156.48.0/22',
3474 'NZ': '49.224.0.0/14',
3475 'OM': '5.36.0.0/15',
3476 'PA': '186.72.0.0/15',
3477 'PE': '186.160.0.0/14',
3478 'PF': '123.50.64.0/18',
3479 'PG': '124.240.192.0/19',
3480 'PH': '49.144.0.0/13',
3481 'PK': '39.32.0.0/11',
3482 'PL': '83.0.0.0/11',
3483 'PM': '70.36.0.0/20',
3484 'PR': '66.50.0.0/16',
3485 'PS': '188.161.0.0/16',
3486 'PT': '85.240.0.0/13',
3487 'PW': '202.124.224.0/20',
3488 'PY': '181.120.0.0/14',
3489 'QA': '37.210.0.0/15',
3490 'RE': '139.26.0.0/16',
3491 'RO': '79.112.0.0/13',
3492 'RS': '178.220.0.0/14',
3493 'RU': '5.136.0.0/13',
3494 'RW': '105.178.0.0/15',
3495 'SA': '188.48.0.0/13',
3496 'SB': '202.1.160.0/19',
3497 'SC': '154.192.0.0/11',
3498 'SD': '154.96.0.0/13',
3499 'SE': '78.64.0.0/12',
3500 'SG': '152.56.0.0/14',
3501 'SI': '188.196.0.0/14',
3502 'SK': '78.98.0.0/15',
3503 'SL': '197.215.0.0/17',
3504 'SM': '89.186.32.0/19',
3505 'SN': '41.82.0.0/15',
3506 'SO': '197.220.64.0/19',
3507 'SR': '186.179.128.0/17',
3508 'SS': '105.235.208.0/21',
3509 'ST': '197.159.160.0/19',
3510 'SV': '168.243.0.0/16',
3511 'SX': '190.102.0.0/20',
3512 'SY': '5.0.0.0/16',
3513 'SZ': '41.84.224.0/19',
3514 'TC': '65.255.48.0/20',
3515 'TD': '154.68.128.0/19',
3516 'TG': '196.168.0.0/14',
3517 'TH': '171.96.0.0/13',
3518 'TJ': '85.9.128.0/18',
3519 'TK': '27.96.24.0/21',
3520 'TL': '180.189.160.0/20',
3521 'TM': '95.85.96.0/19',
3522 'TN': '197.0.0.0/11',
3523 'TO': '175.176.144.0/21',
3524 'TR': '78.160.0.0/11',
3525 'TT': '186.44.0.0/15',
3526 'TV': '202.2.96.0/19',
3527 'TW': '120.96.0.0/11',
3528 'TZ': '156.156.0.0/14',
3529 'UA': '93.72.0.0/13',
3530 'UG': '154.224.0.0/13',
3531 'US': '3.0.0.0/8',
3532 'UY': '167.56.0.0/13',
3533 'UZ': '82.215.64.0/18',
3534 'VA': '212.77.0.0/19',
3535 'VC': '24.92.144.0/20',
3536 'VE': '186.88.0.0/13',
3537 'VG': '172.103.64.0/18',
3538 'VI': '146.226.0.0/16',
3539 'VN': '14.160.0.0/11',
3540 'VU': '202.80.32.0/20',
3541 'WF': '117.20.32.0/21',
3542 'WS': '202.4.32.0/19',
3543 'YE': '134.35.0.0/16',
3544 'YT': '41.242.116.0/22',
3545 'ZA': '41.0.0.0/11',
3546 'ZM': '165.56.0.0/13',
3547 'ZW': '41.85.192.0/19',
3548 }
3549
3550 @classmethod
5f95927a
S
3551 def random_ipv4(cls, code_or_block):
3552 if len(code_or_block) == 2:
3553 block = cls._country_ip_map.get(code_or_block.upper())
3554 if not block:
3555 return None
3556 else:
3557 block = code_or_block
3558 addr, preflen = block.split('/')
3559 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
3560 addr_max = addr_min | (0xffffffff >> int(preflen))
18a0defa 3561 return compat_str(socket.inet_ntoa(
4248dad9 3562 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
3563
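# Hedged usage sketch (not part of the original module). It assumes the
# classmethod above lives on the GeoUtils helper class (the class header is
# outside this excerpt); both a two-letter country code and an explicit CIDR
# block are accepted.
def _example_random_geo_ip():
    fake_de_ip = GeoUtils.random_ipv4('DE')           # random address from the DE block
    fake_cidr_ip = GeoUtils.random_ipv4('5.0.0.0/16')  # or pick from an explicit block
    return fake_de_ip, fake_cidr_ip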
3564
91410c9b 3565class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
3566 def __init__(self, proxies=None):
3567 # Set default handlers
3568 for type in ('http', 'https'):
3569 setattr(self, '%s_open' % type,
3570 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
3571 meth(r, proxy, type))
38e87f6c 3572 compat_urllib_request.ProxyHandler.__init__(self, proxies)
2461f79d 3573
91410c9b 3574 def proxy_open(self, req, proxy, type):
2461f79d 3575 req_proxy = req.headers.get('Ytdl-request-proxy')
3576 if req_proxy is not None:
3577 proxy = req_proxy
3578 del req.headers['Ytdl-request-proxy']
3579
3580 if proxy == '__noproxy__':
3581 return None # No Proxy
51fb4995 3582 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
3583 req.add_header('Ytdl-socks-proxy', proxy)
 3584 # youtube-dl's http/https handlers take care of wrapping the socket with SOCKS
3585 return None
3586 return compat_urllib_request.ProxyHandler.proxy_open(
3587 self, req, proxy, type)
3588
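# Hedged usage sketch (not part of the original module): route a single
# request through a specific HTTP proxy via the Ytdl-request-proxy header,
# while other requests keep the opener's defaults. The URL and proxy address
# are placeholders.
def _example_per_request_proxy():
    opener = compat_urllib_request.build_opener(PerRequestProxyHandler())
    req = compat_urllib_request.Request('http://example.com/')
    req.add_header('Ytdl-request-proxy', 'http://127.0.0.1:8118')
    return opener.open(req)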
3589
3590# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3591# released into Public Domain
3592# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3593
3594def long_to_bytes(n, blocksize=0):
3595 """long_to_bytes(n:long, blocksize:int) : string
3596 Convert a long integer to a byte string.
3597
3598 If optional blocksize is given and greater than zero, pad the front of the
3599 byte string with binary zeros so that the length is a multiple of
3600 blocksize.
3601 """
3602 # after much testing, this algorithm was deemed to be the fastest
3603 s = b''
3604 n = int(n)
3605 while n > 0:
3606 s = compat_struct_pack('>I', n & 0xffffffff) + s
3607 n = n >> 32
3608 # strip off leading zeros
3609 for i in range(len(s)):
3610 if s[i] != b'\000'[0]:
3611 break
3612 else:
3613 # only happens when n == 0
3614 s = b'\000'
3615 i = 0
3616 s = s[i:]
3617 # add back some pad bytes. this could be done more efficiently w.r.t. the
3618 # de-padding being done above, but sigh...
3619 if blocksize > 0 and len(s) % blocksize:
3620 s = (blocksize - len(s) % blocksize) * b'\000' + s
3621 return s
3622
3623
3624def bytes_to_long(s):
3625 """bytes_to_long(string) : long
3626 Convert a byte string to a long integer.
3627
3628 This is (essentially) the inverse of long_to_bytes().
3629 """
3630 acc = 0
3631 length = len(s)
3632 if length % 4:
3633 extra = (4 - length % 4)
3634 s = b'\000' * extra + s
3635 length = length + extra
3636 for i in range(0, length, 4):
3637 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
3638 return acc
3639
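# Hedged sketch (not part of the original module): the two helpers above are
# inverses of each other; blocksize only left-pads the byte string with zeros.
def _example_long_bytes_roundtrip():
    assert bytes_to_long(b'\x01\x00') == 256
    assert long_to_bytes(256) == b'\x01\x00'
    assert long_to_bytes(256, blocksize=4) == b'\x00\x00\x01\x00'
    assert bytes_to_long(long_to_bytes(0xdeadbeef)) == 0xdeadbeef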
3640
3641def ohdave_rsa_encrypt(data, exponent, modulus):
3642 '''
3643 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
3644
3645 Input:
3646 data: data to encrypt, bytes-like object
3647 exponent, modulus: parameter e and N of RSA algorithm, both integer
3648 Output: hex string of encrypted data
3649
3650 Limitation: supports one block encryption only
3651 '''
3652
3653 payload = int(binascii.hexlify(data[::-1]), 16)
3654 encrypted = pow(payload, exponent, modulus)
3655 return '%x' % encrypted
3656
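# Hedged usage sketch (not part of the original module): encrypt a short
# password with site-provided RSA parameters. The exponent/modulus below are
# small toy placeholders, not real key material.
def _example_ohdave_rsa_encrypt():
    e, N = 0x10001, 0xab641a9c6e5bb786a1  # placeholder key parameters
    return ohdave_rsa_encrypt(b'secret', e, N)  # hex string, single block only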
3657
3658def pkcs1pad(data, length):
3659 """
3660 Padding input data with PKCS#1 scheme
3661
3662 @param {int[]} data input data
3663 @param {int} length target length
3664 @returns {int[]} padded data
3665 """
3666 if len(data) > length - 11:
3667 raise ValueError('Input data too long for PKCS#1 padding')
3668
 3669 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # PKCS#1 v1.5 padding octets must be nonzero
3670 return [0, 2] + pseudo_random + [0] + data
3671
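# Hedged usage sketch (not part of the original module): pad a 3-byte message
# (given as a list of ints) out to a 16-byte block before RSA encryption.
def _example_pkcs1pad():
    padded = pkcs1pad([0x01, 0x02, 0x03], 16)
    assert len(padded) == 16
    assert padded[:2] == [0, 2] and padded[-4:] == [0, 0x01, 0x02, 0x03]
    return padded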
3672
5eb6bdce 3673def encode_base_n(num, n, table=None):
59f898b7 3674 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
3675 if not table:
3676 table = FULL_TABLE[:n]
3677
3678 if n > len(table):
3679 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
3680
3681 if num == 0:
3682 return table[0]
3683
3684 ret = ''
3685 while num:
3686 ret = table[num % n] + ret
3687 num = num // n
3688 return ret
3689
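# Hedged usage sketch (not part of the original module): the default table is
# 0-9a-zA-Z, so bases up to 62 work without supplying a custom table.
def _example_encode_base_n():
    assert encode_base_n(255, 16) == 'ff'
    assert encode_base_n(35, 36) == 'z'
    assert encode_base_n(5, 2, table='01') == '101'
    return encode_base_n(123456, 62)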
3690
3691def decode_packed_codes(code):
06b3fe29 3692 mobj = re.search(PACKED_CODES_RE, code)
 3693 obfuscated_code, base, count, symbols = mobj.groups()
3694 base = int(base)
3695 count = int(count)
3696 symbols = symbols.split('|')
3697 symbol_table = {}
3698
3699 while count:
3700 count -= 1
5eb6bdce 3701 base_n_count = encode_base_n(count, base)
3702 symbol_table[base_n_count] = symbols[count] or base_n_count
3703
3704 return re.sub(
3705 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
 3706 obfuscated_code)
e154c651 3707
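# Hedged usage sketch (not part of the original module): unpack a Dean Edwards
# style "p.a.c.k.e.r" blob. The sample string is hand-made and assumes that
# PACKED_CODES_RE (defined elsewhere in this file) matches the usual
# `}('...',base,count,'symbols'.split('|')` trailer of such blobs.
def _example_decode_packed_codes():
    packed = "eval(function(p,a,c,k,e,d){}('0 1',2,2,'var|x'.split('|'),0,{}))"
    return decode_packed_codes(packed)  # expected to yield 'var x'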
3708
3709def parse_m3u8_attributes(attrib):
3710 info = {}
3711 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
3712 if val.startswith('"'):
3713 val = val[1:-1]
3714 info[key] = val
3715 return info
3716
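# Hedged usage sketch (not part of the original module): split an
# EXT-X-STREAM-INF attribute list; quoted values keep their embedded commas.
def _example_parse_m3u8_attributes():
    attrs = parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401f"')
    assert attrs == {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401f'}
    return attrs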
3717
3718def urshift(val, n):
3719 return val >> n if val >= 0 else (val + 0x100000000) >> n
3720
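# Hedged sketch (not part of the original module): urshift() mimics
# JavaScript's unsigned >>> operator on 32-bit values, which Python's >> does
# not do for negative numbers.
def _example_urshift():
    assert urshift(-1, 28) == 0xf         # JS: -1 >>> 28 === 15
    assert urshift(0x80000000, 31) == 1
    assert -1 >> 28 == -1                 # plain Python shift keeps the sign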
3721
3722# Based on png2str() written by @gdkchan and improved by @yokrysty
3723# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
3724def decode_png(png_data):
3725 # Reference: https://www.w3.org/TR/PNG/
3726 header = png_data[8:]
3727
3728 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
3729 raise IOError('Not a valid PNG file.')
3730
3731 int_map = {1: '>B', 2: '>H', 4: '>I'}
3732 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
3733
3734 chunks = []
3735
3736 while header:
3737 length = unpack_integer(header[:4])
3738 header = header[4:]
3739
3740 chunk_type = header[:4]
3741 header = header[4:]
3742
3743 chunk_data = header[:length]
3744 header = header[length:]
3745
3746 header = header[4:] # Skip CRC
3747
3748 chunks.append({
3749 'type': chunk_type,
3750 'length': length,
3751 'data': chunk_data
3752 })
3753
3754 ihdr = chunks[0]['data']
3755
3756 width = unpack_integer(ihdr[:4])
3757 height = unpack_integer(ihdr[4:8])
3758
3759 idat = b''
3760
3761 for chunk in chunks:
3762 if chunk['type'] == b'IDAT':
3763 idat += chunk['data']
3764
3765 if not idat:
3766 raise IOError('Unable to read PNG data.')
3767
3768 decompressed_data = bytearray(zlib.decompress(idat))
3769
3770 stride = width * 3
3771 pixels = []
3772
3773 def _get_pixel(idx):
3774 x = idx % stride
3775 y = idx // stride
3776 return pixels[y][x]
3777
3778 for y in range(height):
3779 basePos = y * (1 + stride)
3780 filter_type = decompressed_data[basePos]
3781
3782 current_row = []
3783
3784 pixels.append(current_row)
3785
3786 for x in range(stride):
3787 color = decompressed_data[1 + basePos + x]
3788 basex = y * stride + x
3789 left = 0
3790 up = 0
3791
3792 if x > 2:
3793 left = _get_pixel(basex - 3)
3794 if y > 0:
3795 up = _get_pixel(basex - stride)
3796
3797 if filter_type == 1: # Sub
3798 color = (color + left) & 0xff
3799 elif filter_type == 2: # Up
3800 color = (color + up) & 0xff
3801 elif filter_type == 3: # Average
3802 color = (color + ((left + up) >> 1)) & 0xff
3803 elif filter_type == 4: # Paeth
3804 a = left
3805 b = up
3806 c = 0
3807
3808 if x > 2 and y > 0:
3809 c = _get_pixel(basex - stride - 3)
3810
3811 p = a + b - c
3812
3813 pa = abs(p - a)
3814 pb = abs(p - b)
3815 pc = abs(p - c)
3816
3817 if pa <= pb and pa <= pc:
3818 color = (color + a) & 0xff
3819 elif pb <= pc:
3820 color = (color + b) & 0xff
3821 else:
3822 color = (color + c) & 0xff
3823
3824 current_row.append(color)
3825
3826 return width, height, pixels
3827
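# Hedged usage sketch (not part of the original module): build a minimal 1x1
# red RGB PNG in memory and decode it. Chunk CRCs are included for form's sake
# even though decode_png() skips them.
def _example_decode_png():
    def chunk(ctype, data):
        return (compat_struct_pack('>I', len(data)) + ctype + data
                + compat_struct_pack('>I', zlib.crc32(ctype + data) & 0xffffffff))

    ihdr = compat_struct_pack('>IIBBBBB', 1, 1, 8, 2, 0, 0, 0)  # 1x1, 8 bit, RGB
    idat = zlib.compress(b'\x00\xff\x00\x00')  # filter byte 0 + one red pixel
    png = (b'\x89PNG\x0d\x0a\x1a\x0a' + chunk(b'IHDR', ihdr)
           + chunk(b'IDAT', idat) + chunk(b'IEND', b''))
    width, height, pixels = decode_png(png)
    assert (width, height, pixels[0]) == (1, 1, [0xff, 0x00, 0x00])
    return pixels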
3828
3829def write_xattr(path, key, value):
3830 # This mess below finds the best xattr tool for the job
3831 try:
3832 # try the pyxattr module...
3833 import xattr
3834
3835 if hasattr(xattr, 'set'): # pyxattr
3836 # Unicode arguments are not supported in python-pyxattr until
3837 # version 0.5.0
3838 # See https://github.com/rg3/youtube-dl/issues/5498
3839 pyxattr_required_version = '0.5.0'
3840 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
3841 # TODO: fallback to CLI tools
3842 raise XAttrUnavailableError(
3843 'python-pyxattr is detected but is too old. '
3844 'youtube-dl requires %s or above while your version is %s. '
3845 'Falling back to other xattr implementations' % (
3846 pyxattr_required_version, xattr.__version__))
3847
3848 setxattr = xattr.set
3849 else: # xattr
3850 setxattr = xattr.setxattr
3851
3852 try:
53a7e3d2 3853 setxattr(path, key, value)
3854 except EnvironmentError as e:
3855 raise XAttrMetadataError(e.errno, e.strerror)
3856
3857 except ImportError:
3858 if compat_os_name == 'nt':
3859 # Write xattrs to NTFS Alternate Data Streams:
3860 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
3861 assert ':' not in key
3862 assert os.path.exists(path)
3863
3864 ads_fn = path + ':' + key
3865 try:
3866 with open(ads_fn, 'wb') as f:
3867 f.write(value)
3868 except EnvironmentError as e:
3869 raise XAttrMetadataError(e.errno, e.strerror)
3870 else:
3871 user_has_setfattr = check_executable('setfattr', ['--version'])
3872 user_has_xattr = check_executable('xattr', ['-h'])
3873
3874 if user_has_setfattr or user_has_xattr:
3875
3876 value = value.decode('utf-8')
3877 if user_has_setfattr:
3878 executable = 'setfattr'
3879 opts = ['-n', key, '-v', value]
3880 elif user_has_xattr:
3881 executable = 'xattr'
3882 opts = ['-w', key, value]
3883
3884 cmd = ([encodeFilename(executable, True)] +
3885 [encodeArgument(o) for o in opts] +
3886 [encodeFilename(path, True)])
3887
3888 try:
3889 p = subprocess.Popen(
3890 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
3891 except EnvironmentError as e:
3892 raise XAttrMetadataError(e.errno, e.strerror)
3893 stdout, stderr = p.communicate()
3894 stderr = stderr.decode('utf-8', 'replace')
3895 if p.returncode != 0:
3896 raise XAttrMetadataError(p.returncode, stderr)
3897
3898 else:
 3899 # On Unix, but we can't find pyxattr, setfattr, or xattr.
3900 if sys.platform.startswith('linux'):
3901 raise XAttrUnavailableError(
3902 "Couldn't find a tool to set the xattrs. "
3903 "Install either the python 'pyxattr' or 'xattr' "
3904 "modules, or the GNU 'attr' package "
3905 "(which contains the 'setfattr' tool).")
3906 else:
3907 raise XAttrUnavailableError(
3908 "Couldn't find a tool to set the xattrs. "
3909 "Install either the python 'xattr' module, "
3910 "or the 'xattr' binary.")
3911
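# Hedged usage sketch (not part of the original module): store the originating
# URL of a download as an extended attribute. The key and filename are
# placeholders; the value must be a bytes object.
def _example_write_xattr():
    write_xattr('video.mp4', 'user.xdg.referrer.url',
                'https://example.com/watch?v=abc'.encode('utf-8'))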
3912
3913def random_birthday(year_field, month_field, day_field):
3914 return {
3915 year_field: str(random.randint(1950, 1995)),
3916 month_field: str(random.randint(1, 12)),
3917 day_field: str(random.randint(1, 31)),
3918 }
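
# Hedged usage sketch (not part of the original module): generate plausible
# form values for an age gate; the field names are whatever the target form
# expects. Note the day is drawn from 1-31 regardless of the month.
def _example_random_birthday():
    return random_birthday('birth_year', 'birth_month', 'birth_day')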