]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[bbc] add support for BBC Radio Play pages(closes #17022)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd 1#!/usr/bin/env python
dcdb292f 2# coding: utf-8
d77c3dfd 3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
1e399778 6import base64
5bc880b9 7import binascii
912b38b4 8import calendar
676eb3f2 9import codecs
62e609ab 10import contextlib
e3946f98 11import ctypes
c496ca96
PH
12import datetime
13import email.utils
0c265486 14import email.header
f45c185f 15import errno
be4a824d 16import functools
d77c3dfd 17import gzip
03f9daab 18import io
79a2e94e 19import itertools
f4bfd65f 20import json
d77c3dfd 21import locale
02dbf93f 22import math
347de493 23import operator
d77c3dfd 24import os
c496ca96 25import platform
773f291d 26import random
d77c3dfd 27import re
c496ca96 28import socket
79a2e94e 29import ssl
1c088fa8 30import subprocess
d77c3dfd 31import sys
181c8655 32import tempfile
01951dda 33import traceback
bcf89ce6 34import xml.etree.ElementTree
d77c3dfd 35import zlib
d77c3dfd 36
8c25f81b 37from .compat import (
b4a3d461 38 compat_HTMLParseError,
8bb56eee 39 compat_HTMLParser,
8f9312c3 40 compat_basestring,
8c25f81b 41 compat_chr,
d7cd9a9e 42 compat_ctypes_WINFUNCTYPE,
36e6f62c 43 compat_etree_fromstring,
51098426 44 compat_expanduser,
8c25f81b 45 compat_html_entities,
55b2f099 46 compat_html_entities_html5,
be4a824d 47 compat_http_client,
c86b6142 48 compat_kwargs,
efa97bdc 49 compat_os_name,
8c25f81b 50 compat_parse_qs,
702ccf2d 51 compat_shlex_quote,
be4a824d 52 compat_socket_create_connection,
8c25f81b 53 compat_str,
edaa23f8 54 compat_struct_pack,
d3f8e038 55 compat_struct_unpack,
8c25f81b
PH
56 compat_urllib_error,
57 compat_urllib_parse,
15707c7e 58 compat_urllib_parse_urlencode,
8c25f81b 59 compat_urllib_parse_urlparse,
7581bfc9 60 compat_urllib_parse_unquote_plus,
8c25f81b
PH
61 compat_urllib_request,
62 compat_urlparse,
810c10ba 63 compat_xpath,
8c25f81b 64)
4644ac55 65
71aff188
YCH
66from .socks import (
67 ProxyType,
68 sockssocket,
69)
70
4644ac55 71
51fb4995
YCH
72def register_socks_protocols():
73 # "Register" SOCKS protocols
d5ae6bb5
YCH
74 # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
75 # URLs with protocols not in urlparse.uses_netloc are not handled correctly
51fb4995
YCH
76 for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
77 if scheme not in compat_urlparse.uses_netloc:
78 compat_urlparse.uses_netloc.append(scheme)
79
80
468e2e92
FV
81# This is not clearly defined otherwise
82compiled_regex_type = type(re.compile(''))
83
3e669f36 84std_headers = {
b12cf31b 85 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0 (Chrome)',
59ae15a5
PH
86 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
87 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
88 'Accept-Encoding': 'gzip, deflate',
89 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 90}
f427df17 91
5f6a1245 92
fb37eb25
S
93USER_AGENTS = {
94 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
95}
96
97
bf42a990
S
98NO_DEFAULT = object()
99
7105440c
YCH
100ENGLISH_MONTH_NAMES = [
101 'January', 'February', 'March', 'April', 'May', 'June',
102 'July', 'August', 'September', 'October', 'November', 'December']
103
f6717dec
S
104MONTH_NAMES = {
105 'en': ENGLISH_MONTH_NAMES,
106 'fr': [
3e4185c3
S
107 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
108 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
f6717dec 109}
a942d6cb 110
a7aaa398
S
111KNOWN_EXTENSIONS = (
112 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
113 'flv', 'f4v', 'f4a', 'f4b',
114 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
115 'mkv', 'mka', 'mk3d',
116 'avi', 'divx',
117 'mov',
118 'asf', 'wmv', 'wma',
119 '3gp', '3g2',
120 'mp3',
121 'flac',
122 'ape',
123 'wav',
124 'f4f', 'f4m', 'm3u8', 'smil')
125
c587cbb7 126# needed for sanitizing filenames in restricted mode
c8827027 127ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
128 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
129 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
c587cbb7 130
46f59e89
S
131DATE_FORMATS = (
132 '%d %B %Y',
133 '%d %b %Y',
134 '%B %d %Y',
cb655f34
S
135 '%B %dst %Y',
136 '%B %dnd %Y',
137 '%B %dth %Y',
46f59e89 138 '%b %d %Y',
cb655f34
S
139 '%b %dst %Y',
140 '%b %dnd %Y',
141 '%b %dth %Y',
46f59e89
S
142 '%b %dst %Y %I:%M',
143 '%b %dnd %Y %I:%M',
144 '%b %dth %Y %I:%M',
145 '%Y %m %d',
146 '%Y-%m-%d',
147 '%Y/%m/%d',
81c13222 148 '%Y/%m/%d %H:%M',
46f59e89 149 '%Y/%m/%d %H:%M:%S',
0c1c6f4b 150 '%Y-%m-%d %H:%M',
46f59e89
S
151 '%Y-%m-%d %H:%M:%S',
152 '%Y-%m-%d %H:%M:%S.%f',
153 '%d.%m.%Y %H:%M',
154 '%d.%m.%Y %H.%M',
155 '%Y-%m-%dT%H:%M:%SZ',
156 '%Y-%m-%dT%H:%M:%S.%fZ',
157 '%Y-%m-%dT%H:%M:%S.%f0Z',
158 '%Y-%m-%dT%H:%M:%S',
159 '%Y-%m-%dT%H:%M:%S.%f',
160 '%Y-%m-%dT%H:%M',
c6eed6b8
S
161 '%b %d %Y at %H:%M',
162 '%b %d %Y at %H:%M:%S',
b555ae9b
S
163 '%B %d %Y at %H:%M',
164 '%B %d %Y at %H:%M:%S',
46f59e89
S
165)
166
167DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
168DATE_FORMATS_DAY_FIRST.extend([
169 '%d-%m-%Y',
170 '%d.%m.%Y',
171 '%d.%m.%y',
172 '%d/%m/%Y',
173 '%d/%m/%y',
174 '%d/%m/%Y %H:%M:%S',
175])
176
177DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
178DATE_FORMATS_MONTH_FIRST.extend([
179 '%m-%d-%Y',
180 '%m.%d.%Y',
181 '%m/%d/%Y',
182 '%m/%d/%y',
183 '%m/%d/%Y %H:%M:%S',
184])
185
06b3fe29 186PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
0685d972 187JSON_LD_RE = r'(?is)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
06b3fe29 188
7105440c 189
d77c3dfd 190def preferredencoding():
59ae15a5 191 """Get preferred encoding.
d77c3dfd 192
59ae15a5
PH
193 Returns the best encoding scheme for the system, based on
194 locale.getpreferredencoding() and some further tweaks.
195 """
196 try:
197 pref = locale.getpreferredencoding()
28e614de 198 'TEST'.encode(pref)
70a1165b 199 except Exception:
59ae15a5 200 pref = 'UTF-8'
bae611f2 201
59ae15a5 202 return pref
d77c3dfd 203
f4bfd65f 204
181c8655 205def write_json_file(obj, fn):
1394646a 206 """ Encode obj as JSON and write it to fn, atomically if possible """
181c8655 207
92120217 208 fn = encodeFilename(fn)
61ee5aeb 209 if sys.version_info < (3, 0) and sys.platform != 'win32':
ec5f6016
JMF
210 encoding = get_filesystem_encoding()
211 # os.path.basename returns a bytes object, but NamedTemporaryFile
212 # will fail if the filename contains non ascii characters unless we
213 # use a unicode object
214 path_basename = lambda f: os.path.basename(fn).decode(encoding)
215 # the same for os.path.dirname
216 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
217 else:
218 path_basename = os.path.basename
219 path_dirname = os.path.dirname
220
73159f99
S
221 args = {
222 'suffix': '.tmp',
ec5f6016
JMF
223 'prefix': path_basename(fn) + '.',
224 'dir': path_dirname(fn),
73159f99
S
225 'delete': False,
226 }
227
181c8655
PH
228 # In Python 2.x, json.dump expects a bytestream.
229 # In Python 3.x, it writes to a character stream
230 if sys.version_info < (3, 0):
73159f99 231 args['mode'] = 'wb'
181c8655 232 else:
73159f99
S
233 args.update({
234 'mode': 'w',
235 'encoding': 'utf-8',
236 })
237
c86b6142 238 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
181c8655
PH
239
240 try:
241 with tf:
242 json.dump(obj, tf)
1394646a
IK
243 if sys.platform == 'win32':
244 # Need to remove existing file on Windows, else os.rename raises
245 # WindowsError or FileExistsError.
246 try:
247 os.unlink(fn)
248 except OSError:
249 pass
181c8655 250 os.rename(tf.name, fn)
70a1165b 251 except Exception:
181c8655
PH
252 try:
253 os.remove(tf.name)
254 except OSError:
255 pass
256 raise
257
258
259if sys.version_info >= (2, 7):
ee114368 260 def find_xpath_attr(node, xpath, key, val=None):
59ae56fa 261 """ Find the xpath xpath[@key=val] """
5d2354f1 262 assert re.match(r'^[a-zA-Z_-]+$', key)
ee114368 263 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
59ae56fa
PH
264 return node.find(expr)
265else:
ee114368 266 def find_xpath_attr(node, xpath, key, val=None):
810c10ba 267 for f in node.findall(compat_xpath(xpath)):
ee114368
S
268 if key not in f.attrib:
269 continue
270 if val is None or f.attrib.get(key) == val:
59ae56fa
PH
271 return f
272 return None
273
d7e66d39
JMF
274# On python2.6 the xml.etree.ElementTree.Element methods don't support
275# the namespace parameter
5f6a1245
JW
276
277
d7e66d39
JMF
278def xpath_with_ns(path, ns_map):
279 components = [c.split(':') for c in path.split('/')]
280 replaced = []
281 for c in components:
282 if len(c) == 1:
283 replaced.append(c[0])
284 else:
285 ns, tag = c
286 replaced.append('{%s}%s' % (ns_map[ns], tag))
287 return '/'.join(replaced)
288
d77c3dfd 289
a41fb80c 290def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
578c0745 291 def _find_xpath(xpath):
810c10ba 292 return node.find(compat_xpath(xpath))
578c0745
S
293
294 if isinstance(xpath, (str, compat_str)):
295 n = _find_xpath(xpath)
296 else:
297 for xp in xpath:
298 n = _find_xpath(xp)
299 if n is not None:
300 break
d74bebd5 301
8e636da4 302 if n is None:
bf42a990
S
303 if default is not NO_DEFAULT:
304 return default
305 elif fatal:
bf0ff932
PH
306 name = xpath if name is None else name
307 raise ExtractorError('Could not find XML element %s' % name)
308 else:
309 return None
a41fb80c
S
310 return n
311
312
313def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
8e636da4
S
314 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
315 if n is None or n == default:
316 return n
317 if n.text is None:
318 if default is not NO_DEFAULT:
319 return default
320 elif fatal:
321 name = xpath if name is None else name
322 raise ExtractorError('Could not find XML element\'s text %s' % name)
323 else:
324 return None
325 return n.text
a41fb80c
S
326
327
328def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
329 n = find_xpath_attr(node, xpath, key)
330 if n is None:
331 if default is not NO_DEFAULT:
332 return default
333 elif fatal:
334 name = '%s[@%s]' % (xpath, key) if name is None else name
335 raise ExtractorError('Could not find XML attribute %s' % name)
336 else:
337 return None
338 return n.attrib[key]
bf0ff932
PH
339
340
9e6dd238 341def get_element_by_id(id, html):
43e8fafd 342 """Return the content of the tag with the specified ID in the passed HTML document"""
611c1dd9 343 return get_element_by_attribute('id', id, html)
43e8fafd 344
12ea2f30 345
84c237fb 346def get_element_by_class(class_name, html):
2af12ad9
TC
347 """Return the content of the first tag with the specified class in the passed HTML document"""
348 retval = get_elements_by_class(class_name, html)
349 return retval[0] if retval else None
350
351
352def get_element_by_attribute(attribute, value, html, escape_value=True):
353 retval = get_elements_by_attribute(attribute, value, html, escape_value)
354 return retval[0] if retval else None
355
356
357def get_elements_by_class(class_name, html):
358 """Return the content of all tags with the specified class in the passed HTML document as a list"""
359 return get_elements_by_attribute(
84c237fb
YCH
360 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
361 html, escape_value=False)
362
363
2af12ad9 364def get_elements_by_attribute(attribute, value, html, escape_value=True):
43e8fafd 365 """Return the content of the tag with the specified attribute in the passed HTML document"""
9e6dd238 366
84c237fb
YCH
367 value = re.escape(value) if escape_value else value
368
2af12ad9
TC
369 retlist = []
370 for m in re.finditer(r'''(?xs)
38285056 371 <([a-zA-Z0-9:._-]+)
609ff8ca 372 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
38285056 373 \s+%s=['"]?%s['"]?
609ff8ca 374 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
38285056
PH
375 \s*>
376 (?P<content>.*?)
377 </\1>
2af12ad9
TC
378 ''' % (re.escape(attribute), value), html):
379 res = m.group('content')
38285056 380
2af12ad9
TC
381 if res.startswith('"') or res.startswith("'"):
382 res = res[1:-1]
38285056 383
2af12ad9 384 retlist.append(unescapeHTML(res))
a921f407 385
2af12ad9 386 return retlist
a921f407 387
c5229f39 388
8bb56eee
BF
389class HTMLAttributeParser(compat_HTMLParser):
390 """Trivial HTML parser to gather the attributes for a single element"""
391 def __init__(self):
c5229f39 392 self.attrs = {}
8bb56eee
BF
393 compat_HTMLParser.__init__(self)
394
395 def handle_starttag(self, tag, attrs):
396 self.attrs = dict(attrs)
397
c5229f39 398
8bb56eee
BF
399def extract_attributes(html_element):
400 """Given a string for an HTML element such as
401 <el
402 a="foo" B="bar" c="&98;az" d=boz
403 empty= noval entity="&amp;"
404 sq='"' dq="'"
405 >
406 Decode and return a dictionary of attributes.
407 {
408 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
409 'empty': '', 'noval': None, 'entity': '&',
410 'sq': '"', 'dq': '\''
411 }.
412 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
413 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
414 """
415 parser = HTMLAttributeParser()
b4a3d461
S
416 try:
417 parser.feed(html_element)
418 parser.close()
419 # Older Python may throw HTMLParseError in case of malformed HTML
420 except compat_HTMLParseError:
421 pass
8bb56eee 422 return parser.attrs
9e6dd238 423
c5229f39 424
9e6dd238 425def clean_html(html):
59ae15a5 426 """Clean an HTML snippet into a readable string"""
dd622d7c
PH
427
428 if html is None: # Convenience for sanitizing descriptions etc.
429 return html
430
59ae15a5
PH
431 # Newline vs <br />
432 html = html.replace('\n', ' ')
edd9221c
TF
433 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
434 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
59ae15a5
PH
435 # Strip html tags
436 html = re.sub('<.*?>', '', html)
437 # Replace html entities
438 html = unescapeHTML(html)
7decf895 439 return html.strip()
9e6dd238
FV
440
441
d77c3dfd 442def sanitize_open(filename, open_mode):
59ae15a5
PH
443 """Try to open the given filename, and slightly tweak it if this fails.
444
445 Attempts to open the given filename. If this fails, it tries to change
446 the filename slightly, step by step, until it's either able to open it
447 or it fails and raises a final exception, like the standard open()
448 function.
449
450 It returns the tuple (stream, definitive_file_name).
451 """
452 try:
28e614de 453 if filename == '-':
59ae15a5
PH
454 if sys.platform == 'win32':
455 import msvcrt
456 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
898280a0 457 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
59ae15a5
PH
458 stream = open(encodeFilename(filename), open_mode)
459 return (stream, filename)
460 except (IOError, OSError) as err:
f45c185f
PH
461 if err.errno in (errno.EACCES,):
462 raise
59ae15a5 463
f45c185f 464 # In case of error, try to remove win32 forbidden chars
d55de57b 465 alt_filename = sanitize_path(filename)
f45c185f
PH
466 if alt_filename == filename:
467 raise
468 else:
469 # An exception here should be caught in the caller
d55de57b 470 stream = open(encodeFilename(alt_filename), open_mode)
f45c185f 471 return (stream, alt_filename)
d77c3dfd
FV
472
473
474def timeconvert(timestr):
59ae15a5
PH
475 """Convert RFC 2822 defined time string into system timestamp"""
476 timestamp = None
477 timetuple = email.utils.parsedate_tz(timestr)
478 if timetuple is not None:
479 timestamp = email.utils.mktime_tz(timetuple)
480 return timestamp
1c469a94 481
5f6a1245 482
796173d0 483def sanitize_filename(s, restricted=False, is_id=False):
59ae15a5
PH
484 """Sanitizes a string so it could be used as part of a filename.
485 If restricted is set, use a stricter subset of allowed characters.
158af524
S
486 Set is_id if this is not an arbitrary string, but an ID that should be kept
487 if possible.
59ae15a5
PH
488 """
489 def replace_insane(char):
c587cbb7
AT
490 if restricted and char in ACCENT_CHARS:
491 return ACCENT_CHARS[char]
59ae15a5
PH
492 if char == '?' or ord(char) < 32 or ord(char) == 127:
493 return ''
494 elif char == '"':
495 return '' if restricted else '\''
496 elif char == ':':
497 return '_-' if restricted else ' -'
498 elif char in '\\/|*<>':
499 return '_'
627dcfff 500 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
59ae15a5
PH
501 return '_'
502 if restricted and ord(char) > 127:
503 return '_'
504 return char
505
2aeb06d6
PH
506 # Handle timestamps
507 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
28e614de 508 result = ''.join(map(replace_insane, s))
796173d0
PH
509 if not is_id:
510 while '__' in result:
511 result = result.replace('__', '_')
512 result = result.strip('_')
513 # Common case of "Foreign band name - English song title"
514 if restricted and result.startswith('-_'):
515 result = result[2:]
5a42414b
PH
516 if result.startswith('-'):
517 result = '_' + result[len('-'):]
a7440261 518 result = result.lstrip('.')
796173d0
PH
519 if not result:
520 result = '_'
59ae15a5 521 return result
d77c3dfd 522
5f6a1245 523
a2aaf4db
S
524def sanitize_path(s):
525 """Sanitizes and normalizes path on Windows"""
526 if sys.platform != 'win32':
527 return s
be531ef1
S
528 drive_or_unc, _ = os.path.splitdrive(s)
529 if sys.version_info < (2, 7) and not drive_or_unc:
530 drive_or_unc, _ = os.path.splitunc(s)
531 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
532 if drive_or_unc:
a2aaf4db
S
533 norm_path.pop(0)
534 sanitized_path = [
ec85ded8 535 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
a2aaf4db 536 for path_part in norm_path]
be531ef1
S
537 if drive_or_unc:
538 sanitized_path.insert(0, drive_or_unc + os.path.sep)
a2aaf4db
S
539 return os.path.join(*sanitized_path)
540
541
17bcc626 542def sanitize_url(url):
befa4708
S
543 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
544 # the number of unwanted failures due to missing protocol
545 if url.startswith('//'):
546 return 'http:%s' % url
547 # Fix some common typos seen so far
548 COMMON_TYPOS = (
549 # https://github.com/rg3/youtube-dl/issues/15649
550 (r'^httpss://', r'https://'),
551 # https://bx1.be/lives/direct-tv/
552 (r'^rmtp([es]?)://', r'rtmp\1://'),
553 )
554 for mistake, fixup in COMMON_TYPOS:
555 if re.match(mistake, url):
556 return re.sub(mistake, fixup, url)
557 return url
17bcc626
S
558
559
67dda517 560def sanitized_Request(url, *args, **kwargs):
17bcc626 561 return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
67dda517
S
562
563
51098426
S
564def expand_path(s):
565 """Expand shell variables and ~"""
566 return os.path.expandvars(compat_expanduser(s))
567
568
d77c3dfd 569def orderedSet(iterable):
59ae15a5
PH
570 """ Remove all duplicates from the input iterable """
571 res = []
572 for el in iterable:
573 if el not in res:
574 res.append(el)
575 return res
d77c3dfd 576
912b38b4 577
55b2f099 578def _htmlentity_transform(entity_with_semicolon):
4e408e47 579 """Transforms an HTML entity to a character."""
55b2f099
YCH
580 entity = entity_with_semicolon[:-1]
581
4e408e47
PH
582 # Known non-numeric HTML entity
583 if entity in compat_html_entities.name2codepoint:
584 return compat_chr(compat_html_entities.name2codepoint[entity])
585
55b2f099
YCH
586 # TODO: HTML5 allows entities without a semicolon. For example,
587 # '&Eacuteric' should be decoded as 'Éric'.
588 if entity_with_semicolon in compat_html_entities_html5:
589 return compat_html_entities_html5[entity_with_semicolon]
590
91757b0f 591 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
4e408e47
PH
592 if mobj is not None:
593 numstr = mobj.group(1)
28e614de 594 if numstr.startswith('x'):
4e408e47 595 base = 16
28e614de 596 numstr = '0%s' % numstr
4e408e47
PH
597 else:
598 base = 10
7aefc49c
S
599 # See https://github.com/rg3/youtube-dl/issues/7518
600 try:
601 return compat_chr(int(numstr, base))
602 except ValueError:
603 pass
4e408e47
PH
604
605 # Unknown entity in name, return its literal representation
7a3f0c00 606 return '&%s;' % entity
4e408e47
PH
607
608
d77c3dfd 609def unescapeHTML(s):
912b38b4
PH
610 if s is None:
611 return None
612 assert type(s) == compat_str
d77c3dfd 613
4e408e47 614 return re.sub(
95f3f7c2 615 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 616
8bf48f23 617
aa49acd1
S
618def get_subprocess_encoding():
619 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
620 # For subprocess calls, encode with locale encoding
621 # Refer to http://stackoverflow.com/a/9951851/35070
622 encoding = preferredencoding()
623 else:
624 encoding = sys.getfilesystemencoding()
625 if encoding is None:
626 encoding = 'utf-8'
627 return encoding
628
629
8bf48f23 630def encodeFilename(s, for_subprocess=False):
59ae15a5
PH
631 """
632 @param s The name of the file
633 """
d77c3dfd 634
8bf48f23 635 assert type(s) == compat_str
d77c3dfd 636
59ae15a5
PH
637 # Python 3 has a Unicode API
638 if sys.version_info >= (3, 0):
639 return s
0f00efed 640
aa49acd1
S
641 # Pass '' directly to use Unicode APIs on Windows 2000 and up
642 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
643 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
644 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
645 return s
646
8ee239e9
YCH
647 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
648 if sys.platform.startswith('java'):
649 return s
650
aa49acd1
S
651 return s.encode(get_subprocess_encoding(), 'ignore')
652
653
654def decodeFilename(b, for_subprocess=False):
655
656 if sys.version_info >= (3, 0):
657 return b
658
659 if not isinstance(b, bytes):
660 return b
661
662 return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 663
f07b74fc
PH
664
665def encodeArgument(s):
666 if not isinstance(s, compat_str):
667 # Legacy code that uses byte strings
668 # Uncomment the following line after fixing all post processors
7af808a5 669 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
f07b74fc
PH
670 s = s.decode('ascii')
671 return encodeFilename(s, True)
672
673
aa49acd1
S
674def decodeArgument(b):
675 return decodeFilename(b, True)
676
677
8271226a
PH
678def decodeOption(optval):
679 if optval is None:
680 return optval
681 if isinstance(optval, bytes):
682 optval = optval.decode(preferredencoding())
683
684 assert isinstance(optval, compat_str)
685 return optval
1c256f70 686
5f6a1245 687
4539dd30
PH
688def formatSeconds(secs):
689 if secs > 3600:
690 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
691 elif secs > 60:
692 return '%d:%02d' % (secs // 60, secs % 60)
693 else:
694 return '%d' % secs
695
a0ddb8a2 696
be4a824d
PH
697def make_HTTPS_handler(params, **kwargs):
698 opts_no_check_certificate = params.get('nocheckcertificate', False)
0db261ba 699 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
be5f2c19 700 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
0db261ba 701 if opts_no_check_certificate:
be5f2c19 702 context.check_hostname = False
0db261ba 703 context.verify_mode = ssl.CERT_NONE
a2366922 704 try:
be4a824d 705 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
a2366922
PH
706 except TypeError:
707 # Python 2.7.8
708 # (create_default_context present but HTTPSHandler has no context=)
709 pass
710
711 if sys.version_info < (3, 2):
d7932313 712 return YoutubeDLHTTPSHandler(params, **kwargs)
aa37e3d4 713 else: # Python < 3.4
d7932313 714 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
ea6d901e 715 context.verify_mode = (ssl.CERT_NONE
dca08720 716 if opts_no_check_certificate
ea6d901e 717 else ssl.CERT_REQUIRED)
303b479e 718 context.set_default_verify_paths()
be4a824d 719 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 720
732ea2f0 721
08f2a92c
JMF
722def bug_reports_message():
723 if ytdl_is_updateable():
724 update_cmd = 'type youtube-dl -U to update'
725 else:
726 update_cmd = 'see https://yt-dl.org/update on how to update'
727 msg = '; please report this issue on https://yt-dl.org/bug .'
728 msg += ' Make sure you are using the latest version; %s.' % update_cmd
729 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
730 return msg
731
732
bf5b9d85
PM
733class YoutubeDLError(Exception):
734 """Base exception for YoutubeDL errors."""
735 pass
736
737
738class ExtractorError(YoutubeDLError):
1c256f70 739 """Error during info extraction."""
5f6a1245 740
d11271dd 741 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
9a82b238
PH
742 """ tb, if given, is the original traceback (so that it can be printed out).
743 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
744 """
745
746 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
747 expected = True
d11271dd
PH
748 if video_id is not None:
749 msg = video_id + ': ' + msg
410f3e73 750 if cause:
28e614de 751 msg += ' (caused by %r)' % cause
9a82b238 752 if not expected:
08f2a92c 753 msg += bug_reports_message()
1c256f70 754 super(ExtractorError, self).__init__(msg)
d5979c5d 755
1c256f70 756 self.traceback = tb
8cc83b8d 757 self.exc_info = sys.exc_info() # preserve original exception
2eabb802 758 self.cause = cause
d11271dd 759 self.video_id = video_id
1c256f70 760
01951dda
PH
761 def format_traceback(self):
762 if self.traceback is None:
763 return None
28e614de 764 return ''.join(traceback.format_tb(self.traceback))
01951dda 765
1c256f70 766
416c7fcb
PH
767class UnsupportedError(ExtractorError):
768 def __init__(self, url):
769 super(UnsupportedError, self).__init__(
770 'Unsupported URL: %s' % url, expected=True)
771 self.url = url
772
773
55b3e45b
JMF
774class RegexNotFoundError(ExtractorError):
775 """Error when a regex didn't match"""
776 pass
777
778
773f291d
S
779class GeoRestrictedError(ExtractorError):
780 """Geographic restriction Error exception.
781
782 This exception may be thrown when a video is not available from your
783 geographic location due to geographic restrictions imposed by a website.
784 """
785 def __init__(self, msg, countries=None):
786 super(GeoRestrictedError, self).__init__(msg, expected=True)
787 self.msg = msg
788 self.countries = countries
789
790
bf5b9d85 791class DownloadError(YoutubeDLError):
59ae15a5 792 """Download Error exception.
d77c3dfd 793
59ae15a5
PH
794 This exception may be thrown by FileDownloader objects if they are not
795 configured to continue on errors. They will contain the appropriate
796 error message.
797 """
5f6a1245 798
8cc83b8d
FV
799 def __init__(self, msg, exc_info=None):
800 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
801 super(DownloadError, self).__init__(msg)
802 self.exc_info = exc_info
d77c3dfd
FV
803
804
bf5b9d85 805class SameFileError(YoutubeDLError):
59ae15a5 806 """Same File exception.
d77c3dfd 807
59ae15a5
PH
808 This exception will be thrown by FileDownloader objects if they detect
809 multiple files would have to be downloaded to the same file on disk.
810 """
811 pass
d77c3dfd
FV
812
813
bf5b9d85 814class PostProcessingError(YoutubeDLError):
59ae15a5 815 """Post Processing exception.
d77c3dfd 816
59ae15a5
PH
817 This exception may be raised by PostProcessor's .run() method to
818 indicate an error in the postprocessing task.
819 """
5f6a1245 820
7851b379 821 def __init__(self, msg):
bf5b9d85 822 super(PostProcessingError, self).__init__(msg)
7851b379 823 self.msg = msg
d77c3dfd 824
5f6a1245 825
bf5b9d85 826class MaxDownloadsReached(YoutubeDLError):
59ae15a5
PH
827 """ --max-downloads limit has been reached. """
828 pass
d77c3dfd
FV
829
830
bf5b9d85 831class UnavailableVideoError(YoutubeDLError):
59ae15a5 832 """Unavailable Format exception.
d77c3dfd 833
59ae15a5
PH
834 This exception will be thrown when a video is requested
835 in a format that is not available for that video.
836 """
837 pass
d77c3dfd
FV
838
839
bf5b9d85 840class ContentTooShortError(YoutubeDLError):
59ae15a5 841 """Content Too Short exception.
d77c3dfd 842
59ae15a5
PH
843 This exception may be raised by FileDownloader objects when a file they
844 download is too small for what the server announced first, indicating
845 the connection was probably interrupted.
846 """
d77c3dfd 847
59ae15a5 848 def __init__(self, downloaded, expected):
bf5b9d85
PM
849 super(ContentTooShortError, self).__init__(
850 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
851 )
2c7ed247 852 # Both in bytes
59ae15a5
PH
853 self.downloaded = downloaded
854 self.expected = expected
d77c3dfd 855
5f6a1245 856
bf5b9d85 857class XAttrMetadataError(YoutubeDLError):
efa97bdc
YCH
858 def __init__(self, code=None, msg='Unknown error'):
859 super(XAttrMetadataError, self).__init__(msg)
860 self.code = code
bd264412 861 self.msg = msg
efa97bdc
YCH
862
863 # Parsing code and msg
864 if (self.code in (errno.ENOSPC, errno.EDQUOT) or
865 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
866 self.reason = 'NO_SPACE'
867 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
868 self.reason = 'VALUE_TOO_LONG'
869 else:
870 self.reason = 'NOT_SUPPORTED'
871
872
bf5b9d85 873class XAttrUnavailableError(YoutubeDLError):
efa97bdc
YCH
874 pass
875
876
c5a59d93 877def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
e5e78797
S
878 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
879 # expected HTTP responses to meet HTTP/1.0 or later (see also
880 # https://github.com/rg3/youtube-dl/issues/6727)
881 if sys.version_info < (3, 0):
65220c3b
S
882 kwargs['strict'] = True
883 hc = http_class(*args, **compat_kwargs(kwargs))
be4a824d
PH
884 source_address = ydl_handler._params.get('source_address')
885 if source_address is not None:
886 sa = (source_address, 0)
887 if hasattr(hc, 'source_address'): # Python 2.7+
888 hc.source_address = sa
889 else: # Python 2.6
890 def _hc_connect(self, *args, **kwargs):
891 sock = compat_socket_create_connection(
892 (self.host, self.port), self.timeout, sa)
893 if is_https:
d7932313
PH
894 self.sock = ssl.wrap_socket(
895 sock, self.key_file, self.cert_file,
896 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
897 else:
898 self.sock = sock
899 hc.connect = functools.partial(_hc_connect, hc)
900
901 return hc
902
903
87f0e62d 904def handle_youtubedl_headers(headers):
992fc9d6
YCH
905 filtered_headers = headers
906
907 if 'Youtubedl-no-compression' in filtered_headers:
908 filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
87f0e62d 909 del filtered_headers['Youtubedl-no-compression']
87f0e62d 910
992fc9d6 911 return filtered_headers
87f0e62d
YCH
912
913
acebc9cd 914class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
59ae15a5
PH
915 """Handler for HTTP requests and responses.
916
917 This class, when installed with an OpenerDirector, automatically adds
918 the standard headers to every HTTP request and handles gzipped and
919 deflated responses from web servers. If compression is to be avoided in
920 a particular request, the original request in the program code only has
0424ec30 921 to include the HTTP header "Youtubedl-no-compression", which will be
59ae15a5
PH
922 removed before making the real request.
923
924 Part of this code was copied from:
925
926 http://techknack.net/python-urllib2-handlers/
927
928 Andrew Rowls, the author of that code, agreed to release it to the
929 public domain.
930 """
931
be4a824d
PH
932 def __init__(self, params, *args, **kwargs):
933 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
934 self._params = params
935
936 def http_open(self, req):
71aff188
YCH
937 conn_class = compat_http_client.HTTPConnection
938
939 socks_proxy = req.headers.get('Ytdl-socks-proxy')
940 if socks_proxy:
941 conn_class = make_socks_conn_class(conn_class, socks_proxy)
942 del req.headers['Ytdl-socks-proxy']
943
be4a824d 944 return self.do_open(functools.partial(
71aff188 945 _create_http_connection, self, conn_class, False),
be4a824d
PH
946 req)
947
59ae15a5
PH
948 @staticmethod
949 def deflate(data):
950 try:
951 return zlib.decompress(data, -zlib.MAX_WBITS)
952 except zlib.error:
953 return zlib.decompress(data)
954
acebc9cd 955 def http_request(self, req):
51f267d9
S
956 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
957 # always respected by websites, some tend to give out URLs with non percent-encoded
958 # non-ASCII characters (see telemb.py, ard.py [#3412])
959 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
960 # To work around aforementioned issue we will replace request's original URL with
961 # percent-encoded one
962 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
963 # the code of this workaround has been moved here from YoutubeDL.urlopen()
964 url = req.get_full_url()
965 url_escaped = escape_url(url)
966
967 # Substitute URL if any change after escaping
968 if url != url_escaped:
15d260eb 969 req = update_Request(req, url=url_escaped)
51f267d9 970
33ac271b 971 for h, v in std_headers.items():
3d5f7a39
JK
972 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
973 # The dict keys are capitalized because of this bug by urllib
974 if h.capitalize() not in req.headers:
33ac271b 975 req.add_header(h, v)
87f0e62d
YCH
976
977 req.headers = handle_youtubedl_headers(req.headers)
989b4b2b
PH
978
979 if sys.version_info < (2, 7) and '#' in req.get_full_url():
980 # Python 2.6 is brain-dead when it comes to fragments
981 req._Request__original = req._Request__original.partition('#')[0]
982 req._Request__r_type = req._Request__r_type.partition('#')[0]
983
59ae15a5
PH
984 return req
985
acebc9cd 986 def http_response(self, req, resp):
59ae15a5
PH
987 old_resp = resp
988 # gzip
989 if resp.headers.get('Content-encoding', '') == 'gzip':
aa3e9507
PH
990 content = resp.read()
991 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
992 try:
993 uncompressed = io.BytesIO(gz.read())
994 except IOError as original_ioerror:
995 # There may be junk add the end of the file
996 # See http://stackoverflow.com/q/4928560/35070 for details
997 for i in range(1, 1024):
998 try:
999 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1000 uncompressed = io.BytesIO(gz.read())
1001 except IOError:
1002 continue
1003 break
1004 else:
1005 raise original_ioerror
b407d853 1006 resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 1007 resp.msg = old_resp.msg
c047270c 1008 del resp.headers['Content-encoding']
59ae15a5
PH
1009 # deflate
1010 if resp.headers.get('Content-encoding', '') == 'deflate':
1011 gz = io.BytesIO(self.deflate(resp.read()))
b407d853 1012 resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 1013 resp.msg = old_resp.msg
c047270c 1014 del resp.headers['Content-encoding']
ad729172
S
1015 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1016 # https://github.com/rg3/youtube-dl/issues/6457).
5a4d9ddb
S
1017 if 300 <= resp.code < 400:
1018 location = resp.headers.get('Location')
1019 if location:
1020 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1021 if sys.version_info >= (3, 0):
1022 location = location.encode('iso-8859-1').decode('utf-8')
0ea59007
YCH
1023 else:
1024 location = location.decode('utf-8')
5a4d9ddb
S
1025 location_escaped = escape_url(location)
1026 if location != location_escaped:
1027 del resp.headers['Location']
9a4aec8b
YCH
1028 if sys.version_info < (3, 0):
1029 location_escaped = location_escaped.encode('utf-8')
5a4d9ddb 1030 resp.headers['Location'] = location_escaped
59ae15a5 1031 return resp
0f8d03f8 1032
acebc9cd
PH
1033 https_request = http_request
1034 https_response = http_response
bf50b038 1035
5de90176 1036
71aff188
YCH
1037def make_socks_conn_class(base_class, socks_proxy):
1038 assert issubclass(base_class, (
1039 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1040
1041 url_components = compat_urlparse.urlparse(socks_proxy)
1042 if url_components.scheme.lower() == 'socks5':
1043 socks_type = ProxyType.SOCKS5
1044 elif url_components.scheme.lower() in ('socks', 'socks4'):
1045 socks_type = ProxyType.SOCKS4
51fb4995
YCH
1046 elif url_components.scheme.lower() == 'socks4a':
1047 socks_type = ProxyType.SOCKS4A
71aff188 1048
cdd94c2e
YCH
1049 def unquote_if_non_empty(s):
1050 if not s:
1051 return s
1052 return compat_urllib_parse_unquote_plus(s)
1053
71aff188
YCH
1054 proxy_args = (
1055 socks_type,
1056 url_components.hostname, url_components.port or 1080,
1057 True, # Remote DNS
cdd94c2e
YCH
1058 unquote_if_non_empty(url_components.username),
1059 unquote_if_non_empty(url_components.password),
71aff188
YCH
1060 )
1061
1062 class SocksConnection(base_class):
1063 def connect(self):
1064 self.sock = sockssocket()
1065 self.sock.setproxy(*proxy_args)
1066 if type(self.timeout) in (int, float):
1067 self.sock.settimeout(self.timeout)
1068 self.sock.connect((self.host, self.port))
1069
1070 if isinstance(self, compat_http_client.HTTPSConnection):
1071 if hasattr(self, '_context'): # Python > 2.6
1072 self.sock = self._context.wrap_socket(
1073 self.sock, server_hostname=self.host)
1074 else:
1075 self.sock = ssl.wrap_socket(self.sock)
1076
1077 return SocksConnection
1078
1079
be4a824d
PH
1080class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1081 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1082 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1083 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1084 self._params = params
1085
1086 def https_open(self, req):
4f264c02 1087 kwargs = {}
71aff188
YCH
1088 conn_class = self._https_conn_class
1089
4f264c02
JMF
1090 if hasattr(self, '_context'): # python > 2.6
1091 kwargs['context'] = self._context
1092 if hasattr(self, '_check_hostname'): # python 3.x
1093 kwargs['check_hostname'] = self._check_hostname
71aff188
YCH
1094
1095 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1096 if socks_proxy:
1097 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1098 del req.headers['Ytdl-socks-proxy']
1099
be4a824d 1100 return self.do_open(functools.partial(
71aff188 1101 _create_http_connection, self, conn_class, True),
4f264c02 1102 req, **kwargs)
be4a824d
PH
1103
1104
a6420bf5
S
1105class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1106 def __init__(self, cookiejar=None):
1107 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1108
1109 def http_response(self, request, response):
1110 # Python 2 will choke on next HTTP request in row if there are non-ASCII
1111 # characters in Set-Cookie HTTP header of last response (see
1112 # https://github.com/rg3/youtube-dl/issues/6769).
1113 # In order to at least prevent crashing we will percent encode Set-Cookie
1114 # header before HTTPCookieProcessor starts processing it.
e28034c5
S
1115 # if sys.version_info < (3, 0) and response.headers:
1116 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1117 # set_cookie = response.headers.get(set_cookie_header)
1118 # if set_cookie:
1119 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1120 # if set_cookie != set_cookie_escaped:
1121 # del response.headers[set_cookie_header]
1122 # response.headers[set_cookie_header] = set_cookie_escaped
a6420bf5
S
1123 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1124
1125 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1126 https_response = http_response
1127
1128
46f59e89
S
1129def extract_timezone(date_str):
1130 m = re.search(
1131 r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
1132 date_str)
1133 if not m:
1134 timezone = datetime.timedelta()
1135 else:
1136 date_str = date_str[:-len(m.group('tz'))]
1137 if not m.group('sign'):
1138 timezone = datetime.timedelta()
1139 else:
1140 sign = 1 if m.group('sign') == '+' else -1
1141 timezone = datetime.timedelta(
1142 hours=sign * int(m.group('hours')),
1143 minutes=sign * int(m.group('minutes')))
1144 return timezone, date_str
1145
1146
08b38d54 1147def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1148 """ Return a UNIX timestamp from the given date """
1149
1150 if date_str is None:
1151 return None
1152
52c3a6e4
S
1153 date_str = re.sub(r'\.[0-9]+', '', date_str)
1154
08b38d54 1155 if timezone is None:
46f59e89
S
1156 timezone, date_str = extract_timezone(date_str)
1157
52c3a6e4
S
1158 try:
1159 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1160 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1161 return calendar.timegm(dt.timetuple())
1162 except ValueError:
1163 pass
912b38b4
PH
1164
1165
46f59e89
S
1166def date_formats(day_first=True):
1167 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1168
1169
42bdd9d0 1170def unified_strdate(date_str, day_first=True):
bf50b038 1171 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1172
1173 if date_str is None:
1174 return None
bf50b038 1175 upload_date = None
5f6a1245 1176 # Replace commas
026fcc04 1177 date_str = date_str.replace(',', ' ')
42bdd9d0 1178 # Remove AM/PM + timezone
9bb8e0a3 1179 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1180 _, date_str = extract_timezone(date_str)
42bdd9d0 1181
46f59e89 1182 for expression in date_formats(day_first):
bf50b038
JMF
1183 try:
1184 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
5de90176 1185 except ValueError:
bf50b038 1186 pass
42393ce2
PH
1187 if upload_date is None:
1188 timetuple = email.utils.parsedate_tz(date_str)
1189 if timetuple:
c6b9cf05
S
1190 try:
1191 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1192 except ValueError:
1193 pass
6a750402
JMF
1194 if upload_date is not None:
1195 return compat_str(upload_date)
bf50b038 1196
5f6a1245 1197
46f59e89
S
1198def unified_timestamp(date_str, day_first=True):
1199 if date_str is None:
1200 return None
1201
2ae2ffda 1202 date_str = re.sub(r'[,|]', '', date_str)
46f59e89 1203
7dc2a74e 1204 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1205 timezone, date_str = extract_timezone(date_str)
1206
1207 # Remove AM/PM + timezone
1208 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1209
deef3195
S
1210 # Remove unrecognized timezones from ISO 8601 alike timestamps
1211 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1212 if m:
1213 date_str = date_str[:-len(m.group('tz'))]
1214
f226880c
PH
1215 # Python only supports microseconds, so remove nanoseconds
1216 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1217 if m:
1218 date_str = m.group(1)
1219
46f59e89
S
1220 for expression in date_formats(day_first):
1221 try:
7dc2a74e 1222 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89
S
1223 return calendar.timegm(dt.timetuple())
1224 except ValueError:
1225 pass
1226 timetuple = email.utils.parsedate_tz(date_str)
1227 if timetuple:
7dc2a74e 1228 return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1229
1230
28e614de 1231def determine_ext(url, default_ext='unknown_video'):
85750f89 1232 if url is None or '.' not in url:
f4776371 1233 return default_ext
9cb9a5df 1234 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1235 if re.match(r'^[A-Za-z0-9]+$', guess):
1236 return guess
a7aaa398
S
1237 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1238 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1239 return guess.rstrip('/')
73e79f2a 1240 else:
cbdbb766 1241 return default_ext
73e79f2a 1242
5f6a1245 1243
d4051a8e 1244def subtitles_filename(filename, sub_lang, sub_format):
28e614de 1245 return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
d4051a8e 1246
5f6a1245 1247
bd558525 1248def date_from_str(date_str):
37254abc
JMF
1249 """
1250 Return a datetime object from a string in the format YYYYMMDD or
1251 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1252 today = datetime.date.today()
f8795e10 1253 if date_str in ('now', 'today'):
37254abc 1254 return today
f8795e10
PH
1255 if date_str == 'yesterday':
1256 return today - datetime.timedelta(days=1)
ec85ded8 1257 match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
37254abc
JMF
1258 if match is not None:
1259 sign = match.group('sign')
1260 time = int(match.group('time'))
1261 if sign == '-':
1262 time = -time
1263 unit = match.group('unit')
dfb1b146 1264 # A bad approximation?
37254abc
JMF
1265 if unit == 'month':
1266 unit = 'day'
1267 time *= 30
1268 elif unit == 'year':
1269 unit = 'day'
1270 time *= 365
1271 unit += 's'
1272 delta = datetime.timedelta(**{unit: time})
1273 return today + delta
611c1dd9 1274 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
5f6a1245
JW
1275
1276
e63fc1be 1277def hyphenate_date(date_str):
1278 """
1279 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1280 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1281 if match is not None:
1282 return '-'.join(match.groups())
1283 else:
1284 return date_str
1285
5f6a1245 1286
bd558525
JMF
1287class DateRange(object):
1288 """Represents a time interval between two dates"""
5f6a1245 1289
bd558525
JMF
1290 def __init__(self, start=None, end=None):
1291 """start and end must be strings in the format accepted by date"""
1292 if start is not None:
1293 self.start = date_from_str(start)
1294 else:
1295 self.start = datetime.datetime.min.date()
1296 if end is not None:
1297 self.end = date_from_str(end)
1298 else:
1299 self.end = datetime.datetime.max.date()
37254abc 1300 if self.start > self.end:
bd558525 1301 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
5f6a1245 1302
bd558525
JMF
1303 @classmethod
1304 def day(cls, day):
1305 """Returns a range that only contains the given day"""
5f6a1245
JW
1306 return cls(day, day)
1307
bd558525
JMF
1308 def __contains__(self, date):
1309 """Check if the date is in the range"""
37254abc
JMF
1310 if not isinstance(date, datetime.date):
1311 date = date_from_str(date)
1312 return self.start <= date <= self.end
5f6a1245 1313
bd558525 1314 def __str__(self):
5f6a1245 1315 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1316
1317
1318def platform_name():
1319 """ Returns the platform name as a compat_str """
1320 res = platform.platform()
1321 if isinstance(res, bytes):
1322 res = res.decode(preferredencoding())
1323
1324 assert isinstance(res, compat_str)
1325 return res
c257baff
PH
1326
1327
b58ddb32
PH
1328def _windows_write_string(s, out):
1329 """ Returns True if the string was written using special methods,
1330 False if it has yet to be written out."""
1331 # Adapted from http://stackoverflow.com/a/3259271/35070
1332
1333 import ctypes
1334 import ctypes.wintypes
1335
1336 WIN_OUTPUT_IDS = {
1337 1: -11,
1338 2: -12,
1339 }
1340
a383a98a
PH
1341 try:
1342 fileno = out.fileno()
1343 except AttributeError:
1344 # If the output stream doesn't have a fileno, it's virtual
1345 return False
aa42e873
PH
1346 except io.UnsupportedOperation:
1347 # Some strange Windows pseudo files?
1348 return False
b58ddb32
PH
1349 if fileno not in WIN_OUTPUT_IDS:
1350 return False
1351
d7cd9a9e 1352 GetStdHandle = compat_ctypes_WINFUNCTYPE(
b58ddb32 1353 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
d7cd9a9e 1354 ('GetStdHandle', ctypes.windll.kernel32))
b58ddb32
PH
1355 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1356
d7cd9a9e 1357 WriteConsoleW = compat_ctypes_WINFUNCTYPE(
b58ddb32
PH
1358 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1359 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
d7cd9a9e 1360 ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
b58ddb32
PH
1361 written = ctypes.wintypes.DWORD(0)
1362
d7cd9a9e 1363 GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
b58ddb32
PH
1364 FILE_TYPE_CHAR = 0x0002
1365 FILE_TYPE_REMOTE = 0x8000
d7cd9a9e 1366 GetConsoleMode = compat_ctypes_WINFUNCTYPE(
b58ddb32
PH
1367 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1368 ctypes.POINTER(ctypes.wintypes.DWORD))(
d7cd9a9e 1369 ('GetConsoleMode', ctypes.windll.kernel32))
b58ddb32
PH
1370 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1371
1372 def not_a_console(handle):
1373 if handle == INVALID_HANDLE_VALUE or handle is None:
1374 return True
8fb3ac36
PH
1375 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1376 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
b58ddb32
PH
1377
1378 if not_a_console(h):
1379 return False
1380
d1b9c912
PH
1381 def next_nonbmp_pos(s):
1382 try:
1383 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1384 except StopIteration:
1385 return len(s)
1386
1387 while s:
1388 count = min(next_nonbmp_pos(s), 1024)
1389
b58ddb32 1390 ret = WriteConsoleW(
d1b9c912 1391 h, s, count if count else 2, ctypes.byref(written), None)
b58ddb32
PH
1392 if ret == 0:
1393 raise OSError('Failed to write string')
d1b9c912
PH
1394 if not count: # We just wrote a non-BMP character
1395 assert written.value == 2
1396 s = s[1:]
1397 else:
1398 assert written.value > 0
1399 s = s[written.value:]
b58ddb32
PH
1400 return True
1401
1402
734f90bb 1403def write_string(s, out=None, encoding=None):
7459e3a2
PH
1404 if out is None:
1405 out = sys.stderr
8bf48f23 1406 assert type(s) == compat_str
7459e3a2 1407
b58ddb32
PH
1408 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1409 if _windows_write_string(s, out):
1410 return
1411
7459e3a2
PH
1412 if ('b' in getattr(out, 'mode', '') or
1413 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
104aa738
PH
1414 byt = s.encode(encoding or preferredencoding(), 'ignore')
1415 out.write(byt)
1416 elif hasattr(out, 'buffer'):
1417 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1418 byt = s.encode(enc, 'ignore')
1419 out.buffer.write(byt)
1420 else:
8bf48f23 1421 out.write(s)
7459e3a2
PH
1422 out.flush()
1423
1424
48ea9cea
PH
1425def bytes_to_intlist(bs):
1426 if not bs:
1427 return []
1428 if isinstance(bs[0], int): # Python 3
1429 return list(bs)
1430 else:
1431 return [ord(c) for c in bs]
1432
c257baff 1433
cba892fa 1434def intlist_to_bytes(xs):
1435 if not xs:
1436 return b''
edaa23f8 1437 return compat_struct_pack('%dB' % len(xs), *xs)
c38b1e77
PH
1438
1439
c1c9a79c
PH
1440# Cross-platform file locking
1441if sys.platform == 'win32':
1442 import ctypes.wintypes
1443 import msvcrt
1444
1445 class OVERLAPPED(ctypes.Structure):
1446 _fields_ = [
1447 ('Internal', ctypes.wintypes.LPVOID),
1448 ('InternalHigh', ctypes.wintypes.LPVOID),
1449 ('Offset', ctypes.wintypes.DWORD),
1450 ('OffsetHigh', ctypes.wintypes.DWORD),
1451 ('hEvent', ctypes.wintypes.HANDLE),
1452 ]
1453
1454 kernel32 = ctypes.windll.kernel32
1455 LockFileEx = kernel32.LockFileEx
1456 LockFileEx.argtypes = [
1457 ctypes.wintypes.HANDLE, # hFile
1458 ctypes.wintypes.DWORD, # dwFlags
1459 ctypes.wintypes.DWORD, # dwReserved
1460 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1461 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1462 ctypes.POINTER(OVERLAPPED) # Overlapped
1463 ]
1464 LockFileEx.restype = ctypes.wintypes.BOOL
1465 UnlockFileEx = kernel32.UnlockFileEx
1466 UnlockFileEx.argtypes = [
1467 ctypes.wintypes.HANDLE, # hFile
1468 ctypes.wintypes.DWORD, # dwReserved
1469 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1470 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1471 ctypes.POINTER(OVERLAPPED) # Overlapped
1472 ]
1473 UnlockFileEx.restype = ctypes.wintypes.BOOL
1474 whole_low = 0xffffffff
1475 whole_high = 0x7fffffff
1476
1477 def _lock_file(f, exclusive):
1478 overlapped = OVERLAPPED()
1479 overlapped.Offset = 0
1480 overlapped.OffsetHigh = 0
1481 overlapped.hEvent = 0
1482 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1483 handle = msvcrt.get_osfhandle(f.fileno())
1484 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1485 whole_low, whole_high, f._lock_file_overlapped_p):
1486 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1487
1488 def _unlock_file(f):
1489 assert f._lock_file_overlapped_p
1490 handle = msvcrt.get_osfhandle(f.fileno())
1491 if not UnlockFileEx(handle, 0,
1492 whole_low, whole_high, f._lock_file_overlapped_p):
1493 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1494
1495else:
399a76e6
YCH
1496 # Some platforms, such as Jython, is missing fcntl
1497 try:
1498 import fcntl
c1c9a79c 1499
399a76e6
YCH
1500 def _lock_file(f, exclusive):
1501 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
c1c9a79c 1502
399a76e6
YCH
1503 def _unlock_file(f):
1504 fcntl.flock(f, fcntl.LOCK_UN)
1505 except ImportError:
1506 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1507
1508 def _lock_file(f, exclusive):
1509 raise IOError(UNSUPPORTED_MSG)
1510
1511 def _unlock_file(f):
1512 raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
1513
1514
1515class locked_file(object):
1516 def __init__(self, filename, mode, encoding=None):
1517 assert mode in ['r', 'a', 'w']
1518 self.f = io.open(filename, mode, encoding=encoding)
1519 self.mode = mode
1520
1521 def __enter__(self):
1522 exclusive = self.mode != 'r'
1523 try:
1524 _lock_file(self.f, exclusive)
1525 except IOError:
1526 self.f.close()
1527 raise
1528 return self
1529
1530 def __exit__(self, etype, value, traceback):
1531 try:
1532 _unlock_file(self.f)
1533 finally:
1534 self.f.close()
1535
1536 def __iter__(self):
1537 return iter(self.f)
1538
1539 def write(self, *args):
1540 return self.f.write(*args)
1541
1542 def read(self, *args):
1543 return self.f.read(*args)
4eb7f1d1
JMF
1544
1545
4644ac55
S
1546def get_filesystem_encoding():
1547 encoding = sys.getfilesystemencoding()
1548 return encoding if encoding is not None else 'utf-8'
1549
1550
4eb7f1d1 1551def shell_quote(args):
a6a173c2 1552 quoted_args = []
4644ac55 1553 encoding = get_filesystem_encoding()
a6a173c2
JMF
1554 for a in args:
1555 if isinstance(a, bytes):
1556 # We may get a filename encoded with 'encodeFilename'
1557 a = a.decode(encoding)
aefce8e6 1558 quoted_args.append(compat_shlex_quote(a))
28e614de 1559 return ' '.join(quoted_args)
9d4660ca
PH
1560
1561
1562def smuggle_url(url, data):
1563 """ Pass additional data in a URL for internal use. """
1564
81953d1a
RA
1565 url, idata = unsmuggle_url(url, {})
1566 data.update(idata)
15707c7e 1567 sdata = compat_urllib_parse_urlencode(
28e614de
PH
1568 {'__youtubedl_smuggle': json.dumps(data)})
1569 return url + '#' + sdata
9d4660ca
PH
1570
1571
79f82953 1572def unsmuggle_url(smug_url, default=None):
83e865a3 1573 if '#__youtubedl_smuggle' not in smug_url:
79f82953 1574 return smug_url, default
28e614de
PH
1575 url, _, sdata = smug_url.rpartition('#')
1576 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
1577 data = json.loads(jsond)
1578 return url, data
02dbf93f
PH
1579
1580
02dbf93f
PH
1581def format_bytes(bytes):
1582 if bytes is None:
28e614de 1583 return 'N/A'
02dbf93f
PH
1584 if type(bytes) is str:
1585 bytes = float(bytes)
1586 if bytes == 0.0:
1587 exponent = 0
1588 else:
1589 exponent = int(math.log(bytes, 1024.0))
28e614de 1590 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
02dbf93f 1591 converted = float(bytes) / float(1024 ** exponent)
28e614de 1592 return '%.2f%s' % (converted, suffix)
f53c966a 1593
1c088fa8 1594
fb47597b
S
1595def lookup_unit_table(unit_table, s):
1596 units_re = '|'.join(re.escape(u) for u in unit_table)
1597 m = re.match(
782b1b5b 1598 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
fb47597b
S
1599 if not m:
1600 return None
1601 num_str = m.group('num').replace(',', '.')
1602 mult = unit_table[m.group('unit')]
1603 return int(float(num_str) * mult)
1604
1605
be64b5b0
PH
1606def parse_filesize(s):
1607 if s is None:
1608 return None
1609
dfb1b146 1610 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
1611 # but we support those too
1612 _UNIT_TABLE = {
1613 'B': 1,
1614 'b': 1,
70852b47 1615 'bytes': 1,
be64b5b0
PH
1616 'KiB': 1024,
1617 'KB': 1000,
1618 'kB': 1024,
1619 'Kb': 1000,
13585d76 1620 'kb': 1000,
70852b47
YCH
1621 'kilobytes': 1000,
1622 'kibibytes': 1024,
be64b5b0
PH
1623 'MiB': 1024 ** 2,
1624 'MB': 1000 ** 2,
1625 'mB': 1024 ** 2,
1626 'Mb': 1000 ** 2,
13585d76 1627 'mb': 1000 ** 2,
70852b47
YCH
1628 'megabytes': 1000 ** 2,
1629 'mebibytes': 1024 ** 2,
be64b5b0
PH
1630 'GiB': 1024 ** 3,
1631 'GB': 1000 ** 3,
1632 'gB': 1024 ** 3,
1633 'Gb': 1000 ** 3,
13585d76 1634 'gb': 1000 ** 3,
70852b47
YCH
1635 'gigabytes': 1000 ** 3,
1636 'gibibytes': 1024 ** 3,
be64b5b0
PH
1637 'TiB': 1024 ** 4,
1638 'TB': 1000 ** 4,
1639 'tB': 1024 ** 4,
1640 'Tb': 1000 ** 4,
13585d76 1641 'tb': 1000 ** 4,
70852b47
YCH
1642 'terabytes': 1000 ** 4,
1643 'tebibytes': 1024 ** 4,
be64b5b0
PH
1644 'PiB': 1024 ** 5,
1645 'PB': 1000 ** 5,
1646 'pB': 1024 ** 5,
1647 'Pb': 1000 ** 5,
13585d76 1648 'pb': 1000 ** 5,
70852b47
YCH
1649 'petabytes': 1000 ** 5,
1650 'pebibytes': 1024 ** 5,
be64b5b0
PH
1651 'EiB': 1024 ** 6,
1652 'EB': 1000 ** 6,
1653 'eB': 1024 ** 6,
1654 'Eb': 1000 ** 6,
13585d76 1655 'eb': 1000 ** 6,
70852b47
YCH
1656 'exabytes': 1000 ** 6,
1657 'exbibytes': 1024 ** 6,
be64b5b0
PH
1658 'ZiB': 1024 ** 7,
1659 'ZB': 1000 ** 7,
1660 'zB': 1024 ** 7,
1661 'Zb': 1000 ** 7,
13585d76 1662 'zb': 1000 ** 7,
70852b47
YCH
1663 'zettabytes': 1000 ** 7,
1664 'zebibytes': 1024 ** 7,
be64b5b0
PH
1665 'YiB': 1024 ** 8,
1666 'YB': 1000 ** 8,
1667 'yB': 1024 ** 8,
1668 'Yb': 1000 ** 8,
13585d76 1669 'yb': 1000 ** 8,
70852b47
YCH
1670 'yottabytes': 1000 ** 8,
1671 'yobibytes': 1024 ** 8,
be64b5b0
PH
1672 }
1673
fb47597b
S
1674 return lookup_unit_table(_UNIT_TABLE, s)
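# Illustrative examples (not part of the original module): both decimal and
# binary unit names are recognized, and ',' is accepted as a decimal mark:
#   parse_filesize('1.5 MiB') == 1572864
#   parse_filesize('500 KB') == 500000
#   parse_filesize('1,5GB') == 1500000000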
1675
1676
1677def parse_count(s):
1678 if s is None:
be64b5b0
PH
1679 return None
1680
fb47597b
S
1681 s = s.strip()
1682
1683 if re.match(r'^[\d,.]+$', s):
1684 return str_to_int(s)
1685
1686 _UNIT_TABLE = {
1687 'k': 1000,
1688 'K': 1000,
1689 'm': 1000 ** 2,
1690 'M': 1000 ** 2,
1691 'kk': 1000 ** 2,
1692 'KK': 1000 ** 2,
1693 }
be64b5b0 1694
fb47597b 1695 return lookup_unit_table(_UNIT_TABLE, s)
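# Illustrative examples (not part of the original module):
#   parse_count('1,480') == 1480
#   parse_count('2.5M') == 2500000
#   parse_count('1.5k') == 1500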
be64b5b0 1696
2f7ae819 1697
b871d7e9
S
1698def parse_resolution(s):
1699 if s is None:
1700 return {}
1701
1702 mobj = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
1703 if mobj:
1704 return {
1705 'width': int(mobj.group('w')),
1706 'height': int(mobj.group('h')),
1707 }
1708
1709 mobj = re.search(r'\b(\d+)[pPiI]\b', s)
1710 if mobj:
1711 return {'height': int(mobj.group(1))}
1712
1713 mobj = re.search(r'\b([48])[kK]\b', s)
1714 if mobj:
1715 return {'height': int(mobj.group(1)) * 540}
1716
1717 return {}
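# Illustrative examples (not part of the original module):
#   parse_resolution('1920x1080') == {'width': 1920, 'height': 1080}
#   parse_resolution('720p') == {'height': 720}
#   parse_resolution('4K') == {'height': 2160}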
1718
1719
a942d6cb 1720def month_by_name(name, lang='en'):
caefb1de
PH
1721 """ Return the number of a month by (locale-independently) English name """
1722
f6717dec 1723 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 1724
caefb1de 1725 try:
f6717dec 1726 return month_names.index(name) + 1
7105440c
YCH
1727 except ValueError:
1728 return None
1729
1730
1731def month_by_abbreviation(abbrev):
1732 """ Return the number of a month by (locale-independently) English
1733 abbreviations """
1734
1735 try:
1736 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
1737 except ValueError:
1738 return None
18258362
JMF
1739
1740
5aafe895 1741def fix_xml_ampersands(xml_str):
18258362 1742 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
1743 return re.sub(
1744 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 1745 '&amp;',
5aafe895 1746 xml_str)
e3946f98
PH
1747
1748
1749def setproctitle(title):
8bf48f23 1750 assert isinstance(title, compat_str)
c1c05c67
YCH
1751
1752 # ctypes in Jython is not complete
1753 # http://bugs.jython.org/issue2148
1754 if sys.platform.startswith('java'):
1755 return
1756
e3946f98 1757 try:
611c1dd9 1758 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
1759 except OSError:
1760 return
2f49bcd6
RC
1761 except TypeError:
1762 # LoadLibrary in Windows Python 2.7.13 only expects
1763 # a bytestring, but since unicode_literals turns
1764 # every string into a unicode string, it fails.
1765 return
6eefe533
PH
1766 title_bytes = title.encode('utf-8')
1767 buf = ctypes.create_string_buffer(len(title_bytes))
1768 buf.value = title_bytes
e3946f98 1769 try:
6eefe533 1770 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
1771 except AttributeError:
1772 return # Strange libc, just skip this
d7dda168
PH
1773
1774
1775def remove_start(s, start):
46bc9b7d 1776 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
1777
1778
2b9faf55 1779def remove_end(s, end):
46bc9b7d 1780 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
1781
1782
31b2051e
S
1783def remove_quotes(s):
1784 if s is None or len(s) < 2:
1785 return s
1786 for quote in ('"', "'", ):
1787 if s[0] == quote and s[-1] == quote:
1788 return s[1:-1]
1789 return s
1790
1791
29eb5174 1792def url_basename(url):
9b8aaeed 1793 path = compat_urlparse.urlparse(url).path
28e614de 1794 return path.strip('/').split('/')[-1]
aa94a6d3
PH
1795
1796
02dc0a36
S
1797def base_url(url):
1798 return re.match(r'https?://[^?#&]+/', url).group()
1799
1800
e34c3361 1801def urljoin(base, path):
4b5de77b
S
1802 if isinstance(path, bytes):
1803 path = path.decode('utf-8')
e34c3361
S
1804 if not isinstance(path, compat_str) or not path:
1805 return None
b0c65c67 1806 if re.match(r'^(?:https?:)?//', path):
e34c3361 1807 return path
4b5de77b
S
1808 if isinstance(base, bytes):
1809 base = base.decode('utf-8')
1810 if not isinstance(base, compat_str) or not re.match(
1811 r'^(?:https?:)?//', base):
e34c3361
S
1812 return None
1813 return compat_urlparse.urljoin(base, path)
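# Illustrative examples (not part of the original module) for the URL helpers
# above; all URLs are made up:
#   url_basename('http://example.com/dir/file.mp4?x=1') == 'file.mp4'
#   base_url('http://example.com/dir/file.mp4') == 'http://example.com/dir/'
#   urljoin('http://example.com/a/', 'b/c') == 'http://example.com/a/b/c'
#   urljoin('not a valid base', '/b/c') is None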
1814
1815
aa94a6d3
PH
1816class HEADRequest(compat_urllib_request.Request):
1817 def get_method(self):
611c1dd9 1818 return 'HEAD'
7217e148
PH
1819
1820
95cf60e8
S
1821class PUTRequest(compat_urllib_request.Request):
1822 def get_method(self):
1823 return 'PUT'
1824
1825
9732d77e 1826def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
28746fbd
PH
1827 if get_attr:
1828 if v is not None:
1829 v = getattr(v, get_attr, None)
9572013d
PH
1830 if v == '':
1831 v = None
1812afb7
S
1832 if v is None:
1833 return default
1834 try:
1835 return int(v) * invscale // scale
1836 except ValueError:
af98f8ff 1837 return default
9732d77e 1838
9572013d 1839
40a90862
JMF
1840def str_or_none(v, default=None):
1841 return default if v is None else compat_str(v)
1842
9732d77e
PH
1843
1844def str_to_int(int_str):
48d4681e 1845 """ A more relaxed version of int_or_none """
9732d77e
PH
1846 if int_str is None:
1847 return None
28e614de 1848 int_str = re.sub(r'[,\.\+]', '', int_str)
9732d77e 1849 return int(int_str)
608d11f5
PH
1850
1851
9732d77e 1852def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
1853 if v is None:
1854 return default
1855 try:
1856 return float(v) * invscale / scale
1857 except ValueError:
1858 return default
43f775e4
PH
1859
1860
c7e327c4
S
1861def bool_or_none(v, default=None):
1862 return v if isinstance(v, bool) else default
1863
1864
b72b4431
S
1865def strip_or_none(v):
1866 return None if v is None else v.strip()
1867
1868
608d11f5 1869def parse_duration(s):
8f9312c3 1870 if not isinstance(s, compat_basestring):
608d11f5
PH
1871 return None
1872
ca7b3246
S
1873 s = s.strip()
1874
acaff495 1875 days, hours, mins, secs, ms = [None] * 5
15846398 1876 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
acaff495 1877 if m:
1878 days, hours, mins, secs, ms = m.groups()
1879 else:
1880 m = re.match(
056653bb
S
1881 r'''(?ix)(?:P?
1882 (?:
1883 [0-9]+\s*y(?:ears?)?\s*
1884 )?
1885 (?:
1886 [0-9]+\s*m(?:onths?)?\s*
1887 )?
1888 (?:
1889 [0-9]+\s*w(?:eeks?)?\s*
1890 )?
8f4b58d7 1891 (?:
acaff495 1892 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
8f4b58d7 1893 )?
056653bb 1894 T)?
acaff495 1895 (?:
1896 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1897 )?
1898 (?:
1899 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1900 )?
1901 (?:
1902 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 1903 )?Z?$''', s)
acaff495 1904 if m:
1905 days, hours, mins, secs, ms = m.groups()
1906 else:
15846398 1907 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 1908 if m:
1909 hours, mins = m.groups()
1910 else:
1911 return None
1912
1913 duration = 0
1914 if secs:
1915 duration += float(secs)
1916 if mins:
1917 duration += float(mins) * 60
1918 if hours:
1919 duration += float(hours) * 60 * 60
1920 if days:
1921 duration += float(days) * 24 * 60 * 60
1922 if ms:
1923 duration += float(ms)
1924 return duration
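# Illustrative examples (not part of the original module): colon-separated,
# ISO 8601 style and verbose durations are all understood:
#   parse_duration('1:23:45') == 5025
#   parse_duration('PT1H30M') == 5400
#   parse_duration('3 min') == 180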
91d7d0b3
JMF
1925
1926
e65e4c88 1927def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 1928 name, real_ext = os.path.splitext(filename)
e65e4c88
S
1929 return (
1930 '{0}.{1}{2}'.format(name, ext, real_ext)
1931 if not expected_real_ext or real_ext[1:] == expected_real_ext
1932 else '{0}.{1}'.format(filename, ext))
d70ad093
PH
1933
1934
b3ed15b7
S
1935def replace_extension(filename, ext, expected_real_ext=None):
1936 name, real_ext = os.path.splitext(filename)
1937 return '{0}.{1}'.format(
1938 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1939 ext)
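# Illustrative examples (not part of the original module):
#   prepend_extension('video.mp4', 'temp') == 'video.temp.mp4'
#   replace_extension('video.mp4', 'mkv') == 'video.mkv'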
1940
1941
d70ad093
PH
1942def check_executable(exe, args=[]):
1943 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1944 args can be a list of arguments that make the executable produce a short output (like -version) """
1945 try:
1946 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1947 except OSError:
1948 return False
1949 return exe
b7ab0590
PH
1950
1951
95807118 1952def get_exe_version(exe, args=['--version'],
cae97f65 1953 version_re=None, unrecognized='present'):
95807118
PH
1954 """ Returns the version of the specified executable,
1955 or False if the executable is not present """
1956 try:
b64d04c1
YCH
1957 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1958 # SIGTTOU if youtube-dl is run in the background.
1959 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
cae97f65 1960 out, _ = subprocess.Popen(
54116803 1961 [encodeArgument(exe)] + args,
00ca7552 1962 stdin=subprocess.PIPE,
95807118
PH
1963 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1964 except OSError:
1965 return False
cae97f65
PH
1966 if isinstance(out, bytes): # Python 2.x
1967 out = out.decode('ascii', 'ignore')
1968 return detect_exe_version(out, version_re, unrecognized)
1969
1970
1971def detect_exe_version(output, version_re=None, unrecognized='present'):
1972 assert isinstance(output, compat_str)
1973 if version_re is None:
1974 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1975 m = re.search(version_re, output)
95807118
PH
1976 if m:
1977 return m.group(1)
1978 else:
1979 return unrecognized
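# Illustrative example (not part of the original module): the default
# version_re picks the token following the word "version" in the tool's
# output; the sample output below is made up.
#   detect_exe_version('ffmpeg version 3.4.2 Copyright (c) 2000-2018') == '3.4.2'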
1980
1981
b7ab0590 1982class PagedList(object):
dd26ced1
PH
1983 def __len__(self):
1984 # This is only useful for tests
1985 return len(self.getslice())
1986
9c44d242
PH
1987
1988class OnDemandPagedList(PagedList):
6be08ce6 1989 def __init__(self, pagefunc, pagesize, use_cache=True):
9c44d242
PH
1990 self._pagefunc = pagefunc
1991 self._pagesize = pagesize
b95dc034
YCH
1992 self._use_cache = use_cache
1993 if use_cache:
1994 self._cache = {}
9c44d242 1995
b7ab0590
PH
1996 def getslice(self, start=0, end=None):
1997 res = []
1998 for pagenum in itertools.count(start // self._pagesize):
1999 firstid = pagenum * self._pagesize
2000 nextfirstid = pagenum * self._pagesize + self._pagesize
2001 if start >= nextfirstid:
2002 continue
2003
b95dc034
YCH
2004 page_results = None
2005 if self._use_cache:
2006 page_results = self._cache.get(pagenum)
2007 if page_results is None:
2008 page_results = list(self._pagefunc(pagenum))
2009 if self._use_cache:
2010 self._cache[pagenum] = page_results
b7ab0590
PH
2011
2012 startv = (
2013 start % self._pagesize
2014 if firstid <= start < nextfirstid
2015 else 0)
2016
2017 endv = (
2018 ((end - 1) % self._pagesize) + 1
2019 if (end is not None and firstid <= end <= nextfirstid)
2020 else None)
2021
2022 if startv != 0 or endv is not None:
2023 page_results = page_results[startv:endv]
2024 res.extend(page_results)
2025
2026 # A little optimization - if the current page is not "full", i.e. does
2027 # not contain page_size videos, then we can assume that this page
2028 # is the last one - there are no more ids on further pages -
2029 # so there is no need to query again.
2030 if len(page_results) + startv < self._pagesize:
2031 break
2032
2033 # If we got the whole page, but the next page is not interesting,
2034 # break out early as well
2035 if end == nextfirstid:
2036 break
2037 return res
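# Illustrative sketch (not part of the original module): an OnDemandPagedList
# over a fake 95-entry result set fetched 30 entries per page. The page
# function below is hypothetical.
def _on_demand_paged_list_example():
    def fetch_page(pagenum):
        first = pagenum * 30
        return list(range(first, min(first + 30, 95)))
    entries = OnDemandPagedList(fetch_page, 30)
    # Only pages 0 and 1 are actually fetched for this slice.
    return entries.getslice(25, 35)  # == [25, 26, ..., 34]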
81c2f20b
PH
2038
2039
9c44d242
PH
2040class InAdvancePagedList(PagedList):
2041 def __init__(self, pagefunc, pagecount, pagesize):
2042 self._pagefunc = pagefunc
2043 self._pagecount = pagecount
2044 self._pagesize = pagesize
2045
2046 def getslice(self, start=0, end=None):
2047 res = []
2048 start_page = start // self._pagesize
2049 end_page = (
2050 self._pagecount if end is None else (end // self._pagesize + 1))
2051 skip_elems = start - start_page * self._pagesize
2052 only_more = None if end is None else end - start
2053 for pagenum in range(start_page, end_page):
2054 page = list(self._pagefunc(pagenum))
2055 if skip_elems:
2056 page = page[skip_elems:]
2057 skip_elems = None
2058 if only_more is not None:
2059 if len(page) < only_more:
2060 only_more -= len(page)
2061 else:
2062 page = page[:only_more]
2063 res.extend(page)
2064 break
2065 res.extend(page)
2066 return res
2067
2068
81c2f20b 2069def uppercase_escape(s):
676eb3f2 2070 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2071 return re.sub(
a612753d 2072 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2073 lambda m: unicode_escape(m.group(0))[0],
2074 s)
0fe2ff78
YCH
2075
2076
2077def lowercase_escape(s):
2078 unicode_escape = codecs.getdecoder('unicode_escape')
2079 return re.sub(
2080 r'\\u[0-9a-fA-F]{4}',
2081 lambda m: unicode_escape(m.group(0))[0],
2082 s)
b53466e1 2083
d05cfe06
S
2084
2085def escape_rfc3986(s):
2086 """Escape non-ASCII characters as suggested by RFC 3986"""
8f9312c3 2087 if sys.version_info < (3, 0) and isinstance(s, compat_str):
d05cfe06 2088 s = s.encode('utf-8')
ecc0c5ee 2089 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2090
2091
2092def escape_url(url):
2093 """Escape URL as suggested by RFC 3986"""
2094 url_parsed = compat_urllib_parse_urlparse(url)
2095 return url_parsed._replace(
efbed08d 2096 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
2097 path=escape_rfc3986(url_parsed.path),
2098 params=escape_rfc3986(url_parsed.params),
2099 query=escape_rfc3986(url_parsed.query),
2100 fragment=escape_rfc3986(url_parsed.fragment)
2101 ).geturl()
2102
62e609ab
PH
2103
2104def read_batch_urls(batch_fd):
2105 def fixup(url):
2106 if not isinstance(url, compat_str):
2107 url = url.decode('utf-8', 'replace')
28e614de 2108 BOM_UTF8 = '\xef\xbb\xbf'
62e609ab
PH
2109 if url.startswith(BOM_UTF8):
2110 url = url[len(BOM_UTF8):]
2111 url = url.strip()
2112 if url.startswith(('#', ';', ']')):
2113 return False
2114 return url
2115
2116 with contextlib.closing(batch_fd) as fd:
2117 return [url for url in map(fixup, fd) if url]
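# Illustrative sketch (not part of the original module): read_batch_urls drops
# comment and empty lines and strips a leading BOM. The URLs below are made up.
def _read_batch_urls_example():
    batch = io.StringIO('# a comment\nhttp://example.com/a\n\nhttp://example.com/b\n')
    return read_batch_urls(batch)  # == ['http://example.com/a', 'http://example.com/b']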
b74fa8cd
JMF
2118
2119
2120def urlencode_postdata(*args, **kargs):
15707c7e 2121 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2122
2123
38f9ef31 2124def update_url_query(url, query):
cacd9966
YCH
2125 if not query:
2126 return url
38f9ef31 2127 parsed_url = compat_urlparse.urlparse(url)
2128 qs = compat_parse_qs(parsed_url.query)
2129 qs.update(query)
2130 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 2131 query=compat_urllib_parse_urlencode(qs, True)))
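# Illustrative example (not part of the original module); parameter order in
# the result may vary because the query is rebuilt from a dict:
#   update_url_query('http://example.com/path?a=1', {'b': '2'})
#   == 'http://example.com/path?a=1&b=2'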
16392824 2132
8e60dc75 2133
ed0291d1
S
2134def update_Request(req, url=None, data=None, headers={}, query={}):
2135 req_headers = req.headers.copy()
2136 req_headers.update(headers)
2137 req_data = data or req.data
2138 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
2139 req_get_method = req.get_method()
2140 if req_get_method == 'HEAD':
2141 req_type = HEADRequest
2142 elif req_get_method == 'PUT':
2143 req_type = PUTRequest
2144 else:
2145 req_type = compat_urllib_request.Request
ed0291d1
S
2146 new_req = req_type(
2147 req_url, data=req_data, headers=req_headers,
2148 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2149 if hasattr(req, 'timeout'):
2150 new_req.timeout = req.timeout
2151 return new_req
2152
2153
10c87c15 2154def _multipart_encode_impl(data, boundary):
0c265486
YCH
2155 content_type = 'multipart/form-data; boundary=%s' % boundary
2156
2157 out = b''
2158 for k, v in data.items():
2159 out += b'--' + boundary.encode('ascii') + b'\r\n'
2160 if isinstance(k, compat_str):
2161 k = k.encode('utf-8')
2162 if isinstance(v, compat_str):
2163 v = v.encode('utf-8')
2164 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2165 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 2166 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
2167 if boundary.encode('ascii') in content:
2168 raise ValueError('Boundary overlaps with data')
2169 out += content
2170
2171 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2172
2173 return out, content_type
2174
2175
2176def multipart_encode(data, boundary=None):
2177 '''
2178 Encode a dict to RFC 7578-compliant form-data
2179
2180 data:
2181 A dict where keys and values can be either Unicode or bytes-like
2182 objects.
2183 boundary:
2184 If specified, it must be a Unicode object and is used as the boundary.
2185 Otherwise a random boundary is generated.
2186
2187 Reference: https://tools.ietf.org/html/rfc7578
2188 '''
2189 has_specified_boundary = boundary is not None
2190
2191 while True:
2192 if boundary is None:
2193 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2194
2195 try:
10c87c15 2196 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
2197 break
2198 except ValueError:
2199 if has_specified_boundary:
2200 raise
2201 boundary = None
2202
2203 return out, content_type
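# Illustrative sketch (not part of the original module): encoding a one-field
# form with a fixed boundary; the field name and value are made up.
def _multipart_encode_example():
    body, content_type = multipart_encode({'field': 'value'}, boundary='xxx')
    # body == (b'--xxx\r\nContent-Disposition: form-data; name="field"\r\n\r\n'
    #          b'value\r\n--xxx--\r\n')
    # content_type == 'multipart/form-data; boundary=xxx'
    return body, content_type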
2204
2205
86296ad2 2206def dict_get(d, key_or_keys, default=None, skip_false_values=True):
cbecc9b9
S
2207 if isinstance(key_or_keys, (list, tuple)):
2208 for key in key_or_keys:
86296ad2
S
2209 if key not in d or d[key] is None or skip_false_values and not d[key]:
2210 continue
2211 return d[key]
cbecc9b9
S
2212 return default
2213 return d.get(key_or_keys, default)
2214
2215
329ca3be 2216def try_get(src, getter, expected_type=None):
a32a9a7e
S
2217 if not isinstance(getter, (list, tuple)):
2218 getter = [getter]
2219 for get in getter:
2220 try:
2221 v = get(src)
2222 except (AttributeError, KeyError, TypeError, IndexError):
2223 pass
2224 else:
2225 if expected_type is None or isinstance(v, expected_type):
2226 return v
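# Illustrative sketch (not part of the original module): try_get digs into
# nested metadata without raising; the dict below is made up.
def _try_get_example():
    meta = {'video': {'title': 'Some title'}}
    title = try_get(meta, lambda x: x['video']['title'], compat_str)  # == 'Some title'
    views = try_get(meta, lambda x: x['video']['views']['total'], int)  # == None
    return title, views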
329ca3be
S
2227
2228
6cc62232
S
2229def merge_dicts(*dicts):
2230 merged = {}
2231 for a_dict in dicts:
2232 for k, v in a_dict.items():
2233 if v is None:
2234 continue
2235 if (k not in merged or
2236 (isinstance(v, compat_str) and v and
2237 isinstance(merged[k], compat_str) and
2238 not merged[k])):
2239 merged[k] = v
2240 return merged
2241
2242
8e60dc75
S
2243def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2244 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2245
16392824 2246
a1a530b0
PH
2247US_RATINGS = {
2248 'G': 0,
2249 'PG': 10,
2250 'PG-13': 13,
2251 'R': 16,
2252 'NC': 18,
2253}
fac55558
PH
2254
2255
a8795327 2256TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
2257 'TV-Y': 0,
2258 'TV-Y7': 7,
2259 'TV-G': 0,
2260 'TV-PG': 0,
2261 'TV-14': 14,
2262 'TV-MA': 17,
a8795327
S
2263}
2264
2265
146c80e2 2266def parse_age_limit(s):
a8795327
S
2267 if type(s) == int:
2268 return s if 0 <= s <= 21 else None
2269 if not isinstance(s, compat_basestring):
d838b1bd 2270 return None
146c80e2 2271 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
2272 if m:
2273 return int(m.group('age'))
2274 if s in US_RATINGS:
2275 return US_RATINGS[s]
5a16c9d9 2276 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 2277 if m:
5a16c9d9 2278 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 2279 return None
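# Illustrative examples (not part of the original module):
#   parse_age_limit(18) == 18
#   parse_age_limit('18+') == 18
#   parse_age_limit('PG-13') == 13
#   parse_age_limit('TV-MA') == 17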
146c80e2
S
2280
2281
fac55558 2282def strip_jsonp(code):
609a61e3 2283 return re.sub(
5552c9eb 2284 r'''(?sx)^
e9c671d5 2285 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
2286 (?:\s*&&\s*(?P=func_name))?
2287 \s*\(\s*(?P<callback_data>.*)\);?
2288 \s*?(?://[^\n]*)*$''',
2289 r'\g<callback_data>', code)
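# Illustrative example (not part of the original module): unwrapping a JSONP
# response so that it can be fed to json.loads; the callback name is made up.
#   strip_jsonp('parseMetadata({"status": "ok"});') == '{"status": "ok"}'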
478c2c61
PH
2290
2291
e05f6939 2292def js_to_json(code):
4195096e
S
2293 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
2294 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2295 INTEGER_TABLE = (
2296 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2297 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2298 )
2299
e05f6939 2300 def fix_kv(m):
e7b6d122
PH
2301 v = m.group(0)
2302 if v in ('true', 'false', 'null'):
2303 return v
b3ee552e 2304 elif v.startswith('/*') or v.startswith('//') or v == ',':
bd1e4844 2305 return ""
2306
2307 if v[0] in ("'", '"'):
2308 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 2309 '"': '\\"',
bd1e4844 2310 "\\'": "'",
2311 '\\\n': '',
2312 '\\x': '\\u00',
2313 }.get(m.group(0), m.group(0)), v[1:-1])
2314
89ac4a19
S
2315 for regex, base in INTEGER_TABLE:
2316 im = re.match(regex, v)
2317 if im:
e4659b45 2318 i = int(im.group(1), base)
89ac4a19
S
2319 return '"%d":' % i if v.endswith(':') else '%d' % i
2320
e7b6d122 2321 return '"%s"' % v
e05f6939 2322
bd1e4844 2323 return re.sub(r'''(?sx)
2324 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2325 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4195096e 2326 {comment}|,(?={skip}[\]}}])|
c384d537 2327 (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
4195096e
S
2328 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
2329 [0-9]+(?={skip}:)
2330 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
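# Illustrative example (not part of the original module): js_to_json turns a
# JavaScript object literal into strict JSON that json.loads accepts:
#   js_to_json("{foo: 'bar', baz: 0x10}") == '{"foo": "bar", "baz": 16}'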
e05f6939
PH
2331
2332
478c2c61
PH
2333def qualities(quality_ids):
2334 """ Get a numeric quality value out of a list of possible values """
2335 def q(qid):
2336 try:
2337 return quality_ids.index(qid)
2338 except ValueError:
2339 return -1
2340 return q
2341
acd69589
PH
2342
2343DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2344
a020a0dc
PH
2345
2346def limit_length(s, length):
2347 """ Add ellipses to overly long strings """
2348 if s is None:
2349 return None
2350 ELLIPSES = '...'
2351 if len(s) > length:
2352 return s[:length - len(ELLIPSES)] + ELLIPSES
2353 return s
48844745
PH
2354
2355
2356def version_tuple(v):
5f9b8394 2357 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
2358
2359
2360def is_outdated_version(version, limit, assume_new=True):
2361 if not version:
2362 return not assume_new
2363 try:
2364 return version_tuple(version) < version_tuple(limit)
2365 except ValueError:
2366 return not assume_new
732ea2f0
PH
2367
2368
2369def ytdl_is_updateable():
2370 """ Returns if youtube-dl can be updated with -U """
2371 from zipimport import zipimporter
2372
2373 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
7d4111ed
PH
2374
2375
2376def args_to_str(args):
2377 # Get a short string representation for a subprocess command
702ccf2d 2378 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
2379
2380
9b9c5355 2381def error_to_compat_str(err):
fdae2358
S
2382 err_str = str(err)
2383 # On Python 2 the error's byte string must be decoded with the proper
2384 # encoding rather than ascii
2385 if sys.version_info[0] < 3:
2386 err_str = err_str.decode(preferredencoding())
2387 return err_str
2388
2389
c460bdd5 2390def mimetype2ext(mt):
eb9ee194
S
2391 if mt is None:
2392 return None
2393
765ac263
JMF
2394 ext = {
2395 'audio/mp4': 'm4a',
6c33d24b
YCH
2396 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2397 # it's the most popular one
2398 'audio/mpeg': 'mp3',
765ac263
JMF
2399 }.get(mt)
2400 if ext is not None:
2401 return ext
2402
c460bdd5 2403 _, _, res = mt.rpartition('/')
6562d34a 2404 res = res.split(';')[0].strip().lower()
c460bdd5
PH
2405
2406 return {
f6861ec9 2407 '3gpp': '3gp',
cafcf657 2408 'smptett+xml': 'tt',
cafcf657 2409 'ttaf+xml': 'dfxp',
a0d8d704 2410 'ttml+xml': 'ttml',
f6861ec9 2411 'x-flv': 'flv',
a0d8d704 2412 'x-mp4-fragmented': 'mp4',
d4f05d47 2413 'x-ms-sami': 'sami',
a0d8d704 2414 'x-ms-wmv': 'wmv',
b4173f15
RA
2415 'mpegurl': 'm3u8',
2416 'x-mpegurl': 'm3u8',
2417 'vnd.apple.mpegurl': 'm3u8',
2418 'dash+xml': 'mpd',
b4173f15 2419 'f4m+xml': 'f4m',
f164b971 2420 'hds+xml': 'f4m',
e910fe2f 2421 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 2422 'quicktime': 'mov',
98ce1a3f 2423 'mp2t': 'ts',
c460bdd5
PH
2424 }.get(res, res)
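# Illustrative examples (not part of the original module): the subtype is
# lower-cased and any parameters are stripped before the lookup:
#   mimetype2ext('audio/mp4') == 'm4a'
#   mimetype2ext('application/x-mpegURL; charset=UTF-8') == 'm3u8'
#   mimetype2ext('video/unknown-thing') == 'unknown-thing'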
2425
2426
4f3c5e06 2427def parse_codecs(codecs_str):
2428 # http://tools.ietf.org/html/rfc6381
2429 if not codecs_str:
2430 return {}
2431 splited_codecs = list(filter(None, map(
2432 lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
2433 vcodec, acodec = None, None
2434 for full_codec in splited_codecs:
2435 codec = full_codec.split('.')[0]
ffe6979e 2436 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1'):
4f3c5e06 2437 if not vcodec:
2438 vcodec = full_codec
60f5c9fb 2439 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
4f3c5e06 2440 if not acodec:
2441 acodec = full_codec
2442 else:
60f5c9fb 2443 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
4f3c5e06 2444 if not vcodec and not acodec:
2445 if len(splited_codecs) == 2:
2446 return {
2447 'vcodec': vcodec,
2448 'acodec': acodec,
2449 }
2450 elif len(splited_codecs) == 1:
2451 return {
2452 'vcodec': 'none',
2453 'acodec': vcodec,
2454 }
2455 else:
2456 return {
2457 'vcodec': vcodec or 'none',
2458 'acodec': acodec or 'none',
2459 }
2460 return {}
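# Illustrative example (not part of the original module), using a typical
# DASH/HLS CODECS attribute value:
#   parse_codecs('avc1.64001f, mp4a.40.2')
#   == {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2'}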
2461
2462
2ccd1b10 2463def urlhandle_detect_ext(url_handle):
79298173 2464 getheader = url_handle.headers.get
2ccd1b10 2465
b55ee18f
PH
2466 cd = getheader('Content-Disposition')
2467 if cd:
2468 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2469 if m:
2470 e = determine_ext(m.group('filename'), default_ext=None)
2471 if e:
2472 return e
2473
c460bdd5 2474 return mimetype2ext(getheader('Content-Type'))
05900629
PH
2475
2476
1e399778
YCH
2477def encode_data_uri(data, mime_type):
2478 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
2479
2480
05900629 2481def age_restricted(content_limit, age_limit):
6ec6cb4e 2482 """ Returns True iff the content should be blocked """
05900629
PH
2483
2484 if age_limit is None: # No limit set
2485 return False
2486 if content_limit is None:
2487 return False # Content available for everyone
2488 return age_limit < content_limit
61ca9a80
PH
2489
2490
2491def is_html(first_bytes):
2492 """ Detect whether a file contains HTML by examining its first bytes. """
2493
2494 BOMS = [
2495 (b'\xef\xbb\xbf', 'utf-8'),
2496 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2497 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2498 (b'\xff\xfe', 'utf-16-le'),
2499 (b'\xfe\xff', 'utf-16-be'),
2500 ]
2501 for bom, enc in BOMS:
2502 if first_bytes.startswith(bom):
2503 s = first_bytes[len(bom):].decode(enc, 'replace')
2504 break
2505 else:
2506 s = first_bytes.decode('utf-8', 'replace')
2507
2508 return re.match(r'^\s*<', s)
a055469f
PH
2509
2510
2511def determine_protocol(info_dict):
2512 protocol = info_dict.get('protocol')
2513 if protocol is not None:
2514 return protocol
2515
2516 url = info_dict['url']
2517 if url.startswith('rtmp'):
2518 return 'rtmp'
2519 elif url.startswith('mms'):
2520 return 'mms'
2521 elif url.startswith('rtsp'):
2522 return 'rtsp'
2523
2524 ext = determine_ext(url)
2525 if ext == 'm3u8':
2526 return 'm3u8'
2527 elif ext == 'f4m':
2528 return 'f4m'
2529
2530 return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2531
2532
2533def render_table(header_row, data):
2534 """ Render a list of rows, each as a list of values """
2535 table = [header_row] + data
2536 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2537 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2538 return '\n'.join(format_str % tuple(row) for row in table)
347de493
PH
2539
2540
2541def _match_one(filter_part, dct):
2542 COMPARISON_OPERATORS = {
2543 '<': operator.lt,
2544 '<=': operator.le,
2545 '>': operator.gt,
2546 '>=': operator.ge,
2547 '=': operator.eq,
2548 '!=': operator.ne,
2549 }
2550 operator_rex = re.compile(r'''(?x)\s*
2551 (?P<key>[a-z_]+)
2552 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2553 (?:
2554 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
db13c16e 2555 (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
347de493
PH
2556 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2557 )
2558 \s*$
2559 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2560 m = operator_rex.search(filter_part)
2561 if m:
2562 op = COMPARISON_OPERATORS[m.group('op')]
e5a088dc 2563 actual_value = dct.get(m.group('key'))
db13c16e
S
2564 if (m.group('quotedstrval') is not None or
2565 m.group('strval') is not None or
e5a088dc
S
2566 # If the original field is a string and the matching comparison value is
2567 # a number, we should respect the origin of the original field
2568 # and process comparison value as a string (see
2569 # https://github.com/rg3/youtube-dl/issues/11082).
2570 actual_value is not None and m.group('intval') is not None and
2571 isinstance(actual_value, compat_str)):
347de493
PH
2572 if m.group('op') not in ('=', '!='):
2573 raise ValueError(
2574 'Operator %s does not support string values!' % m.group('op'))
db13c16e
S
2575 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
2576 quote = m.group('quote')
2577 if quote is not None:
2578 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
347de493
PH
2579 else:
2580 try:
2581 comparison_value = int(m.group('intval'))
2582 except ValueError:
2583 comparison_value = parse_filesize(m.group('intval'))
2584 if comparison_value is None:
2585 comparison_value = parse_filesize(m.group('intval') + 'B')
2586 if comparison_value is None:
2587 raise ValueError(
2588 'Invalid integer value %r in filter part %r' % (
2589 m.group('intval'), filter_part))
347de493
PH
2590 if actual_value is None:
2591 return m.group('none_inclusive')
2592 return op(actual_value, comparison_value)
2593
2594 UNARY_OPERATORS = {
1cc47c66
S
2595 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
2596 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493
PH
2597 }
2598 operator_rex = re.compile(r'''(?x)\s*
2599 (?P<op>%s)\s*(?P<key>[a-z_]+)
2600 \s*$
2601 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2602 m = operator_rex.search(filter_part)
2603 if m:
2604 op = UNARY_OPERATORS[m.group('op')]
2605 actual_value = dct.get(m.group('key'))
2606 return op(actual_value)
2607
2608 raise ValueError('Invalid filter part %r' % filter_part)
2609
2610
2611def match_str(filter_str, dct):
2612 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2613
2614 return all(
2615 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2616
2617
2618def match_filter_func(filter_str):
2619 def _match_func(info_dict):
2620 if match_str(filter_str, info_dict):
2621 return None
2622 else:
2623 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2624 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2625 return _match_func
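# Illustrative sketch (not part of the original module): the filter syntax
# understood by match_str/_match_one; the info dict below is made up.
def _match_str_example():
    info = {'duration': 250, 'like_count': 1500, 'dislike_count': None}
    short_and_liked = match_str('duration < 300 & like_count > 1000', info)  # True
    few_dislikes = match_str('dislike_count <? 100', info)  # True: '?' lets None pass
    return short_and_liked, few_dislikes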
91410c9b
PH
2626
2627
bf6427d2
YCH
2628def parse_dfxp_time_expr(time_expr):
2629 if not time_expr:
d631d5f9 2630 return
bf6427d2
YCH
2631
2632 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2633 if mobj:
2634 return float(mobj.group('time_offset'))
2635
db2fe38b 2636 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 2637 if mobj:
db2fe38b 2638 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
2639
2640
c1c924ab
YCH
2641def srt_subtitles_timecode(seconds):
2642 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
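# Illustrative examples (not part of the original module):
#   parse_dfxp_time_expr('7.2s') == 7.2
#   parse_dfxp_time_expr('00:01:02.5') == 62.5
#   srt_subtitles_timecode(3661.5) == '01:01:01,500'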
bf6427d2
YCH
2643
2644
2645def dfxp2srt(dfxp_data):
3869028f
YCH
2646 '''
2647 @param dfxp_data A bytes-like object containing DFXP data
2648 @returns A unicode object containing converted SRT data
2649 '''
5b995f71 2650 LEGACY_NAMESPACES = (
3869028f
YCH
2651 (b'http://www.w3.org/ns/ttml', [
2652 b'http://www.w3.org/2004/11/ttaf1',
2653 b'http://www.w3.org/2006/04/ttaf1',
2654 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 2655 ]),
3869028f
YCH
2656 (b'http://www.w3.org/ns/ttml#styling', [
2657 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
2658 ]),
2659 )
2660
2661 SUPPORTED_STYLING = [
2662 'color',
2663 'fontFamily',
2664 'fontSize',
2665 'fontStyle',
2666 'fontWeight',
2667 'textDecoration'
2668 ]
2669
4e335771 2670 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 2671 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 2672 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 2673 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 2674 })
bf6427d2 2675
5b995f71
RA
2676 styles = {}
2677 default_style = {}
2678
87de7069 2679 class TTMLPElementParser(object):
5b995f71
RA
2680 _out = ''
2681 _unclosed_elements = []
2682 _applied_styles = []
bf6427d2 2683
2b14cb56 2684 def start(self, tag, attrib):
5b995f71
RA
2685 if tag in (_x('ttml:br'), 'br'):
2686 self._out += '\n'
2687 else:
2688 unclosed_elements = []
2689 style = {}
2690 element_style_id = attrib.get('style')
2691 if default_style:
2692 style.update(default_style)
2693 if element_style_id:
2694 style.update(styles.get(element_style_id, {}))
2695 for prop in SUPPORTED_STYLING:
2696 prop_val = attrib.get(_x('tts:' + prop))
2697 if prop_val:
2698 style[prop] = prop_val
2699 if style:
2700 font = ''
2701 for k, v in sorted(style.items()):
2702 if self._applied_styles and self._applied_styles[-1].get(k) == v:
2703 continue
2704 if k == 'color':
2705 font += ' color="%s"' % v
2706 elif k == 'fontSize':
2707 font += ' size="%s"' % v
2708 elif k == 'fontFamily':
2709 font += ' face="%s"' % v
2710 elif k == 'fontWeight' and v == 'bold':
2711 self._out += '<b>'
2712 unclosed_elements.append('b')
2713 elif k == 'fontStyle' and v == 'italic':
2714 self._out += '<i>'
2715 unclosed_elements.append('i')
2716 elif k == 'textDecoration' and v == 'underline':
2717 self._out += '<u>'
2718 unclosed_elements.append('u')
2719 if font:
2720 self._out += '<font' + font + '>'
2721 unclosed_elements.append('font')
2722 applied_style = {}
2723 if self._applied_styles:
2724 applied_style.update(self._applied_styles[-1])
2725 applied_style.update(style)
2726 self._applied_styles.append(applied_style)
2727 self._unclosed_elements.append(unclosed_elements)
bf6427d2 2728
2b14cb56 2729 def end(self, tag):
5b995f71
RA
2730 if tag not in (_x('ttml:br'), 'br'):
2731 unclosed_elements = self._unclosed_elements.pop()
2732 for element in reversed(unclosed_elements):
2733 self._out += '</%s>' % element
2734 if unclosed_elements and self._applied_styles:
2735 self._applied_styles.pop()
bf6427d2 2736
2b14cb56 2737 def data(self, data):
5b995f71 2738 self._out += data
2b14cb56 2739
2740 def close(self):
5b995f71 2741 return self._out.strip()
2b14cb56 2742
2743 def parse_node(node):
2744 target = TTMLPElementParser()
2745 parser = xml.etree.ElementTree.XMLParser(target=target)
2746 parser.feed(xml.etree.ElementTree.tostring(node))
2747 return parser.close()
bf6427d2 2748
5b995f71
RA
2749 for k, v in LEGACY_NAMESPACES:
2750 for ns in v:
2751 dfxp_data = dfxp_data.replace(ns, k)
2752
3869028f 2753 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 2754 out = []
5b995f71 2755 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
2756
2757 if not paras:
2758 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 2759
5b995f71
RA
2760 repeat = False
2761 while True:
2762 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
2763 style_id = style.get('id') or style.get(_x('xml:id'))
2764 if not style_id:
2765 continue
5b995f71
RA
2766 parent_style_id = style.get('style')
2767 if parent_style_id:
2768 if parent_style_id not in styles:
2769 repeat = True
2770 continue
2771 styles[style_id] = styles[parent_style_id].copy()
2772 for prop in SUPPORTED_STYLING:
2773 prop_val = style.get(_x('tts:' + prop))
2774 if prop_val:
2775 styles.setdefault(style_id, {})[prop] = prop_val
2776 if repeat:
2777 repeat = False
2778 else:
2779 break
2780
2781 for p in ('body', 'div'):
2782 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
2783 if ele is None:
2784 continue
2785 style = styles.get(ele.get('style'))
2786 if not style:
2787 continue
2788 default_style.update(style)
2789
bf6427d2 2790 for para, index in zip(paras, itertools.count(1)):
d631d5f9 2791 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 2792 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
2793 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2794 if begin_time is None:
2795 continue
7dff0363 2796 if not end_time:
d631d5f9
YCH
2797 if not dur:
2798 continue
2799 end_time = begin_time + dur
bf6427d2
YCH
2800 out.append('%d\n%s --> %s\n%s\n\n' % (
2801 index,
c1c924ab
YCH
2802 srt_subtitles_timecode(begin_time),
2803 srt_subtitles_timecode(end_time),
bf6427d2
YCH
2804 parse_node(para)))
2805
2806 return ''.join(out)
2807
2808
66e289ba
S
2809def cli_option(params, command_option, param):
2810 param = params.get(param)
98e698f1
RA
2811 if param:
2812 param = compat_str(param)
66e289ba
S
2813 return [command_option, param] if param is not None else []
2814
2815
2816def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2817 param = params.get(param)
5b232f46
S
2818 if param is None:
2819 return []
66e289ba
S
2820 assert isinstance(param, bool)
2821 if separator:
2822 return [command_option + separator + (true_value if param else false_value)]
2823 return [command_option, true_value if param else false_value]
2824
2825
2826def cli_valueless_option(params, command_option, param, expected_value=True):
2827 param = params.get(param)
2828 return [command_option] if param == expected_value else []
2829
2830
2831def cli_configuration_args(params, param, default=[]):
2832 ex_args = params.get(param)
2833 if ex_args is None:
2834 return default
2835 assert isinstance(ex_args, list)
2836 return ex_args
2837
2838
39672624
YCH
2839class ISO639Utils(object):
2840 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2841 _lang_map = {
2842 'aa': 'aar',
2843 'ab': 'abk',
2844 'ae': 'ave',
2845 'af': 'afr',
2846 'ak': 'aka',
2847 'am': 'amh',
2848 'an': 'arg',
2849 'ar': 'ara',
2850 'as': 'asm',
2851 'av': 'ava',
2852 'ay': 'aym',
2853 'az': 'aze',
2854 'ba': 'bak',
2855 'be': 'bel',
2856 'bg': 'bul',
2857 'bh': 'bih',
2858 'bi': 'bis',
2859 'bm': 'bam',
2860 'bn': 'ben',
2861 'bo': 'bod',
2862 'br': 'bre',
2863 'bs': 'bos',
2864 'ca': 'cat',
2865 'ce': 'che',
2866 'ch': 'cha',
2867 'co': 'cos',
2868 'cr': 'cre',
2869 'cs': 'ces',
2870 'cu': 'chu',
2871 'cv': 'chv',
2872 'cy': 'cym',
2873 'da': 'dan',
2874 'de': 'deu',
2875 'dv': 'div',
2876 'dz': 'dzo',
2877 'ee': 'ewe',
2878 'el': 'ell',
2879 'en': 'eng',
2880 'eo': 'epo',
2881 'es': 'spa',
2882 'et': 'est',
2883 'eu': 'eus',
2884 'fa': 'fas',
2885 'ff': 'ful',
2886 'fi': 'fin',
2887 'fj': 'fij',
2888 'fo': 'fao',
2889 'fr': 'fra',
2890 'fy': 'fry',
2891 'ga': 'gle',
2892 'gd': 'gla',
2893 'gl': 'glg',
2894 'gn': 'grn',
2895 'gu': 'guj',
2896 'gv': 'glv',
2897 'ha': 'hau',
2898 'he': 'heb',
2899 'hi': 'hin',
2900 'ho': 'hmo',
2901 'hr': 'hrv',
2902 'ht': 'hat',
2903 'hu': 'hun',
2904 'hy': 'hye',
2905 'hz': 'her',
2906 'ia': 'ina',
2907 'id': 'ind',
2908 'ie': 'ile',
2909 'ig': 'ibo',
2910 'ii': 'iii',
2911 'ik': 'ipk',
2912 'io': 'ido',
2913 'is': 'isl',
2914 'it': 'ita',
2915 'iu': 'iku',
2916 'ja': 'jpn',
2917 'jv': 'jav',
2918 'ka': 'kat',
2919 'kg': 'kon',
2920 'ki': 'kik',
2921 'kj': 'kua',
2922 'kk': 'kaz',
2923 'kl': 'kal',
2924 'km': 'khm',
2925 'kn': 'kan',
2926 'ko': 'kor',
2927 'kr': 'kau',
2928 'ks': 'kas',
2929 'ku': 'kur',
2930 'kv': 'kom',
2931 'kw': 'cor',
2932 'ky': 'kir',
2933 'la': 'lat',
2934 'lb': 'ltz',
2935 'lg': 'lug',
2936 'li': 'lim',
2937 'ln': 'lin',
2938 'lo': 'lao',
2939 'lt': 'lit',
2940 'lu': 'lub',
2941 'lv': 'lav',
2942 'mg': 'mlg',
2943 'mh': 'mah',
2944 'mi': 'mri',
2945 'mk': 'mkd',
2946 'ml': 'mal',
2947 'mn': 'mon',
2948 'mr': 'mar',
2949 'ms': 'msa',
2950 'mt': 'mlt',
2951 'my': 'mya',
2952 'na': 'nau',
2953 'nb': 'nob',
2954 'nd': 'nde',
2955 'ne': 'nep',
2956 'ng': 'ndo',
2957 'nl': 'nld',
2958 'nn': 'nno',
2959 'no': 'nor',
2960 'nr': 'nbl',
2961 'nv': 'nav',
2962 'ny': 'nya',
2963 'oc': 'oci',
2964 'oj': 'oji',
2965 'om': 'orm',
2966 'or': 'ori',
2967 'os': 'oss',
2968 'pa': 'pan',
2969 'pi': 'pli',
2970 'pl': 'pol',
2971 'ps': 'pus',
2972 'pt': 'por',
2973 'qu': 'que',
2974 'rm': 'roh',
2975 'rn': 'run',
2976 'ro': 'ron',
2977 'ru': 'rus',
2978 'rw': 'kin',
2979 'sa': 'san',
2980 'sc': 'srd',
2981 'sd': 'snd',
2982 'se': 'sme',
2983 'sg': 'sag',
2984 'si': 'sin',
2985 'sk': 'slk',
2986 'sl': 'slv',
2987 'sm': 'smo',
2988 'sn': 'sna',
2989 'so': 'som',
2990 'sq': 'sqi',
2991 'sr': 'srp',
2992 'ss': 'ssw',
2993 'st': 'sot',
2994 'su': 'sun',
2995 'sv': 'swe',
2996 'sw': 'swa',
2997 'ta': 'tam',
2998 'te': 'tel',
2999 'tg': 'tgk',
3000 'th': 'tha',
3001 'ti': 'tir',
3002 'tk': 'tuk',
3003 'tl': 'tgl',
3004 'tn': 'tsn',
3005 'to': 'ton',
3006 'tr': 'tur',
3007 'ts': 'tso',
3008 'tt': 'tat',
3009 'tw': 'twi',
3010 'ty': 'tah',
3011 'ug': 'uig',
3012 'uk': 'ukr',
3013 'ur': 'urd',
3014 'uz': 'uzb',
3015 've': 'ven',
3016 'vi': 'vie',
3017 'vo': 'vol',
3018 'wa': 'wln',
3019 'wo': 'wol',
3020 'xh': 'xho',
3021 'yi': 'yid',
3022 'yo': 'yor',
3023 'za': 'zha',
3024 'zh': 'zho',
3025 'zu': 'zul',
3026 }
3027
3028 @classmethod
3029 def short2long(cls, code):
3030 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3031 return cls._lang_map.get(code[:2])
3032
3033 @classmethod
3034 def long2short(cls, code):
3035 """Convert language code from ISO 639-2/T to ISO 639-1"""
3036 for short_name, long_name in cls._lang_map.items():
3037 if long_name == code:
3038 return short_name
3039
3040
4eb10f66
YCH
3041class ISO3166Utils(object):
3042 # From http://data.okfn.org/data/core/country-list
3043 _country_map = {
3044 'AF': 'Afghanistan',
3045 'AX': 'Åland Islands',
3046 'AL': 'Albania',
3047 'DZ': 'Algeria',
3048 'AS': 'American Samoa',
3049 'AD': 'Andorra',
3050 'AO': 'Angola',
3051 'AI': 'Anguilla',
3052 'AQ': 'Antarctica',
3053 'AG': 'Antigua and Barbuda',
3054 'AR': 'Argentina',
3055 'AM': 'Armenia',
3056 'AW': 'Aruba',
3057 'AU': 'Australia',
3058 'AT': 'Austria',
3059 'AZ': 'Azerbaijan',
3060 'BS': 'Bahamas',
3061 'BH': 'Bahrain',
3062 'BD': 'Bangladesh',
3063 'BB': 'Barbados',
3064 'BY': 'Belarus',
3065 'BE': 'Belgium',
3066 'BZ': 'Belize',
3067 'BJ': 'Benin',
3068 'BM': 'Bermuda',
3069 'BT': 'Bhutan',
3070 'BO': 'Bolivia, Plurinational State of',
3071 'BQ': 'Bonaire, Sint Eustatius and Saba',
3072 'BA': 'Bosnia and Herzegovina',
3073 'BW': 'Botswana',
3074 'BV': 'Bouvet Island',
3075 'BR': 'Brazil',
3076 'IO': 'British Indian Ocean Territory',
3077 'BN': 'Brunei Darussalam',
3078 'BG': 'Bulgaria',
3079 'BF': 'Burkina Faso',
3080 'BI': 'Burundi',
3081 'KH': 'Cambodia',
3082 'CM': 'Cameroon',
3083 'CA': 'Canada',
3084 'CV': 'Cape Verde',
3085 'KY': 'Cayman Islands',
3086 'CF': 'Central African Republic',
3087 'TD': 'Chad',
3088 'CL': 'Chile',
3089 'CN': 'China',
3090 'CX': 'Christmas Island',
3091 'CC': 'Cocos (Keeling) Islands',
3092 'CO': 'Colombia',
3093 'KM': 'Comoros',
3094 'CG': 'Congo',
3095 'CD': 'Congo, the Democratic Republic of the',
3096 'CK': 'Cook Islands',
3097 'CR': 'Costa Rica',
3098 'CI': 'Côte d\'Ivoire',
3099 'HR': 'Croatia',
3100 'CU': 'Cuba',
3101 'CW': 'Curaçao',
3102 'CY': 'Cyprus',
3103 'CZ': 'Czech Republic',
3104 'DK': 'Denmark',
3105 'DJ': 'Djibouti',
3106 'DM': 'Dominica',
3107 'DO': 'Dominican Republic',
3108 'EC': 'Ecuador',
3109 'EG': 'Egypt',
3110 'SV': 'El Salvador',
3111 'GQ': 'Equatorial Guinea',
3112 'ER': 'Eritrea',
3113 'EE': 'Estonia',
3114 'ET': 'Ethiopia',
3115 'FK': 'Falkland Islands (Malvinas)',
3116 'FO': 'Faroe Islands',
3117 'FJ': 'Fiji',
3118 'FI': 'Finland',
3119 'FR': 'France',
3120 'GF': 'French Guiana',
3121 'PF': 'French Polynesia',
3122 'TF': 'French Southern Territories',
3123 'GA': 'Gabon',
3124 'GM': 'Gambia',
3125 'GE': 'Georgia',
3126 'DE': 'Germany',
3127 'GH': 'Ghana',
3128 'GI': 'Gibraltar',
3129 'GR': 'Greece',
3130 'GL': 'Greenland',
3131 'GD': 'Grenada',
3132 'GP': 'Guadeloupe',
3133 'GU': 'Guam',
3134 'GT': 'Guatemala',
3135 'GG': 'Guernsey',
3136 'GN': 'Guinea',
3137 'GW': 'Guinea-Bissau',
3138 'GY': 'Guyana',
3139 'HT': 'Haiti',
3140 'HM': 'Heard Island and McDonald Islands',
3141 'VA': 'Holy See (Vatican City State)',
3142 'HN': 'Honduras',
3143 'HK': 'Hong Kong',
3144 'HU': 'Hungary',
3145 'IS': 'Iceland',
3146 'IN': 'India',
3147 'ID': 'Indonesia',
3148 'IR': 'Iran, Islamic Republic of',
3149 'IQ': 'Iraq',
3150 'IE': 'Ireland',
3151 'IM': 'Isle of Man',
3152 'IL': 'Israel',
3153 'IT': 'Italy',
3154 'JM': 'Jamaica',
3155 'JP': 'Japan',
3156 'JE': 'Jersey',
3157 'JO': 'Jordan',
3158 'KZ': 'Kazakhstan',
3159 'KE': 'Kenya',
3160 'KI': 'Kiribati',
3161 'KP': 'Korea, Democratic People\'s Republic of',
3162 'KR': 'Korea, Republic of',
3163 'KW': 'Kuwait',
3164 'KG': 'Kyrgyzstan',
3165 'LA': 'Lao People\'s Democratic Republic',
3166 'LV': 'Latvia',
3167 'LB': 'Lebanon',
3168 'LS': 'Lesotho',
3169 'LR': 'Liberia',
3170 'LY': 'Libya',
3171 'LI': 'Liechtenstein',
3172 'LT': 'Lithuania',
3173 'LU': 'Luxembourg',
3174 'MO': 'Macao',
3175 'MK': 'Macedonia, the Former Yugoslav Republic of',
3176 'MG': 'Madagascar',
3177 'MW': 'Malawi',
3178 'MY': 'Malaysia',
3179 'MV': 'Maldives',
3180 'ML': 'Mali',
3181 'MT': 'Malta',
3182 'MH': 'Marshall Islands',
3183 'MQ': 'Martinique',
3184 'MR': 'Mauritania',
3185 'MU': 'Mauritius',
3186 'YT': 'Mayotte',
3187 'MX': 'Mexico',
3188 'FM': 'Micronesia, Federated States of',
3189 'MD': 'Moldova, Republic of',
3190 'MC': 'Monaco',
3191 'MN': 'Mongolia',
3192 'ME': 'Montenegro',
3193 'MS': 'Montserrat',
3194 'MA': 'Morocco',
3195 'MZ': 'Mozambique',
3196 'MM': 'Myanmar',
3197 'NA': 'Namibia',
3198 'NR': 'Nauru',
3199 'NP': 'Nepal',
3200 'NL': 'Netherlands',
3201 'NC': 'New Caledonia',
3202 'NZ': 'New Zealand',
3203 'NI': 'Nicaragua',
3204 'NE': 'Niger',
3205 'NG': 'Nigeria',
3206 'NU': 'Niue',
3207 'NF': 'Norfolk Island',
3208 'MP': 'Northern Mariana Islands',
3209 'NO': 'Norway',
3210 'OM': 'Oman',
3211 'PK': 'Pakistan',
3212 'PW': 'Palau',
3213 'PS': 'Palestine, State of',
3214 'PA': 'Panama',
3215 'PG': 'Papua New Guinea',
3216 'PY': 'Paraguay',
3217 'PE': 'Peru',
3218 'PH': 'Philippines',
3219 'PN': 'Pitcairn',
3220 'PL': 'Poland',
3221 'PT': 'Portugal',
3222 'PR': 'Puerto Rico',
3223 'QA': 'Qatar',
3224 'RE': 'Réunion',
3225 'RO': 'Romania',
3226 'RU': 'Russian Federation',
3227 'RW': 'Rwanda',
3228 'BL': 'Saint Barthélemy',
3229 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3230 'KN': 'Saint Kitts and Nevis',
3231 'LC': 'Saint Lucia',
3232 'MF': 'Saint Martin (French part)',
3233 'PM': 'Saint Pierre and Miquelon',
3234 'VC': 'Saint Vincent and the Grenadines',
3235 'WS': 'Samoa',
3236 'SM': 'San Marino',
3237 'ST': 'Sao Tome and Principe',
3238 'SA': 'Saudi Arabia',
3239 'SN': 'Senegal',
3240 'RS': 'Serbia',
3241 'SC': 'Seychelles',
3242 'SL': 'Sierra Leone',
3243 'SG': 'Singapore',
3244 'SX': 'Sint Maarten (Dutch part)',
3245 'SK': 'Slovakia',
3246 'SI': 'Slovenia',
3247 'SB': 'Solomon Islands',
3248 'SO': 'Somalia',
3249 'ZA': 'South Africa',
3250 'GS': 'South Georgia and the South Sandwich Islands',
3251 'SS': 'South Sudan',
3252 'ES': 'Spain',
3253 'LK': 'Sri Lanka',
3254 'SD': 'Sudan',
3255 'SR': 'Suriname',
3256 'SJ': 'Svalbard and Jan Mayen',
3257 'SZ': 'Swaziland',
3258 'SE': 'Sweden',
3259 'CH': 'Switzerland',
3260 'SY': 'Syrian Arab Republic',
3261 'TW': 'Taiwan, Province of China',
3262 'TJ': 'Tajikistan',
3263 'TZ': 'Tanzania, United Republic of',
3264 'TH': 'Thailand',
3265 'TL': 'Timor-Leste',
3266 'TG': 'Togo',
3267 'TK': 'Tokelau',
3268 'TO': 'Tonga',
3269 'TT': 'Trinidad and Tobago',
3270 'TN': 'Tunisia',
3271 'TR': 'Turkey',
3272 'TM': 'Turkmenistan',
3273 'TC': 'Turks and Caicos Islands',
3274 'TV': 'Tuvalu',
3275 'UG': 'Uganda',
3276 'UA': 'Ukraine',
3277 'AE': 'United Arab Emirates',
3278 'GB': 'United Kingdom',
3279 'US': 'United States',
3280 'UM': 'United States Minor Outlying Islands',
3281 'UY': 'Uruguay',
3282 'UZ': 'Uzbekistan',
3283 'VU': 'Vanuatu',
3284 'VE': 'Venezuela, Bolivarian Republic of',
3285 'VN': 'Viet Nam',
3286 'VG': 'Virgin Islands, British',
3287 'VI': 'Virgin Islands, U.S.',
3288 'WF': 'Wallis and Futuna',
3289 'EH': 'Western Sahara',
3290 'YE': 'Yemen',
3291 'ZM': 'Zambia',
3292 'ZW': 'Zimbabwe',
3293 }
3294
3295 @classmethod
3296 def short2full(cls, code):
3297 """Convert an ISO 3166-2 country code to the corresponding full name"""
3298 return cls._country_map.get(code.upper())
3299
3300
773f291d
S
3301class GeoUtils(object):
3302 # Major IPv4 address blocks per country
3303 _country_ip_map = {
3304 'AD': '85.94.160.0/19',
3305 'AE': '94.200.0.0/13',
3306 'AF': '149.54.0.0/17',
3307 'AG': '209.59.64.0/18',
3308 'AI': '204.14.248.0/21',
3309 'AL': '46.99.0.0/16',
3310 'AM': '46.70.0.0/15',
3311 'AO': '105.168.0.0/13',
3312 'AP': '159.117.192.0/21',
3313 'AR': '181.0.0.0/12',
3314 'AS': '202.70.112.0/20',
3315 'AT': '84.112.0.0/13',
3316 'AU': '1.128.0.0/11',
3317 'AW': '181.41.0.0/18',
3318 'AZ': '5.191.0.0/16',
3319 'BA': '31.176.128.0/17',
3320 'BB': '65.48.128.0/17',
3321 'BD': '114.130.0.0/16',
3322 'BE': '57.0.0.0/8',
3323 'BF': '129.45.128.0/17',
3324 'BG': '95.42.0.0/15',
3325 'BH': '37.131.0.0/17',
3326 'BI': '154.117.192.0/18',
3327 'BJ': '137.255.0.0/16',
3328 'BL': '192.131.134.0/24',
3329 'BM': '196.12.64.0/18',
3330 'BN': '156.31.0.0/16',
3331 'BO': '161.56.0.0/16',
3332 'BQ': '161.0.80.0/20',
3333 'BR': '152.240.0.0/12',
3334 'BS': '24.51.64.0/18',
3335 'BT': '119.2.96.0/19',
3336 'BW': '168.167.0.0/16',
3337 'BY': '178.120.0.0/13',
3338 'BZ': '179.42.192.0/18',
3339 'CA': '99.224.0.0/11',
3340 'CD': '41.243.0.0/16',
3341 'CF': '196.32.200.0/21',
3342 'CG': '197.214.128.0/17',
3343 'CH': '85.0.0.0/13',
3344 'CI': '154.232.0.0/14',
3345 'CK': '202.65.32.0/19',
3346 'CL': '152.172.0.0/14',
3347 'CM': '165.210.0.0/15',
3348 'CN': '36.128.0.0/10',
3349 'CO': '181.240.0.0/12',
3350 'CR': '201.192.0.0/12',
3351 'CU': '152.206.0.0/15',
3352 'CV': '165.90.96.0/19',
3353 'CW': '190.88.128.0/17',
3354 'CY': '46.198.0.0/15',
3355 'CZ': '88.100.0.0/14',
3356 'DE': '53.0.0.0/8',
3357 'DJ': '197.241.0.0/17',
3358 'DK': '87.48.0.0/12',
3359 'DM': '192.243.48.0/20',
3360 'DO': '152.166.0.0/15',
3361 'DZ': '41.96.0.0/12',
3362 'EC': '186.68.0.0/15',
3363 'EE': '90.190.0.0/15',
3364 'EG': '156.160.0.0/11',
3365 'ER': '196.200.96.0/20',
3366 'ES': '88.0.0.0/11',
3367 'ET': '196.188.0.0/14',
3368 'EU': '2.16.0.0/13',
3369 'FI': '91.152.0.0/13',
3370 'FJ': '144.120.0.0/16',
3371 'FM': '119.252.112.0/20',
3372 'FO': '88.85.32.0/19',
3373 'FR': '90.0.0.0/9',
3374 'GA': '41.158.0.0/15',
3375 'GB': '25.0.0.0/8',
3376 'GD': '74.122.88.0/21',
3377 'GE': '31.146.0.0/16',
3378 'GF': '161.22.64.0/18',
3379 'GG': '62.68.160.0/19',
3380 'GH': '45.208.0.0/14',
3381 'GI': '85.115.128.0/19',
3382 'GL': '88.83.0.0/19',
3383 'GM': '160.182.0.0/15',
3384 'GN': '197.149.192.0/18',
3385 'GP': '104.250.0.0/19',
3386 'GQ': '105.235.224.0/20',
3387 'GR': '94.64.0.0/13',
3388 'GT': '168.234.0.0/16',
3389 'GU': '168.123.0.0/16',
3390 'GW': '197.214.80.0/20',
3391 'GY': '181.41.64.0/18',
3392 'HK': '113.252.0.0/14',
3393 'HN': '181.210.0.0/16',
3394 'HR': '93.136.0.0/13',
3395 'HT': '148.102.128.0/17',
3396 'HU': '84.0.0.0/14',
3397 'ID': '39.192.0.0/10',
3398 'IE': '87.32.0.0/12',
3399 'IL': '79.176.0.0/13',
3400 'IM': '5.62.80.0/20',
3401 'IN': '117.192.0.0/10',
3402 'IO': '203.83.48.0/21',
3403 'IQ': '37.236.0.0/14',
3404 'IR': '2.176.0.0/12',
3405 'IS': '82.221.0.0/16',
3406 'IT': '79.0.0.0/10',
3407 'JE': '87.244.64.0/18',
3408 'JM': '72.27.0.0/17',
3409 'JO': '176.29.0.0/16',
3410 'JP': '126.0.0.0/8',
3411 'KE': '105.48.0.0/12',
3412 'KG': '158.181.128.0/17',
3413 'KH': '36.37.128.0/17',
3414 'KI': '103.25.140.0/22',
3415 'KM': '197.255.224.0/20',
3416 'KN': '198.32.32.0/19',
3417 'KP': '175.45.176.0/22',
3418 'KR': '175.192.0.0/10',
3419 'KW': '37.36.0.0/14',
3420 'KY': '64.96.0.0/15',
3421 'KZ': '2.72.0.0/13',
3422 'LA': '115.84.64.0/18',
3423 'LB': '178.135.0.0/16',
3424 'LC': '192.147.231.0/24',
3425 'LI': '82.117.0.0/19',
3426 'LK': '112.134.0.0/15',
3427 'LR': '41.86.0.0/19',
3428 'LS': '129.232.0.0/17',
3429 'LT': '78.56.0.0/13',
3430 'LU': '188.42.0.0/16',
3431 'LV': '46.109.0.0/16',
3432 'LY': '41.252.0.0/14',
3433 'MA': '105.128.0.0/11',
3434 'MC': '88.209.64.0/18',
3435 'MD': '37.246.0.0/16',
3436 'ME': '178.175.0.0/17',
3437 'MF': '74.112.232.0/21',
3438 'MG': '154.126.0.0/17',
3439 'MH': '117.103.88.0/21',
3440 'MK': '77.28.0.0/15',
3441 'ML': '154.118.128.0/18',
3442 'MM': '37.111.0.0/17',
3443 'MN': '49.0.128.0/17',
3444 'MO': '60.246.0.0/16',
3445 'MP': '202.88.64.0/20',
3446 'MQ': '109.203.224.0/19',
3447 'MR': '41.188.64.0/18',
3448 'MS': '208.90.112.0/22',
3449 'MT': '46.11.0.0/16',
3450 'MU': '105.16.0.0/12',
3451 'MV': '27.114.128.0/18',
3452 'MW': '105.234.0.0/16',
3453 'MX': '187.192.0.0/11',
3454 'MY': '175.136.0.0/13',
3455 'MZ': '197.218.0.0/15',
3456 'NA': '41.182.0.0/16',
3457 'NC': '101.101.0.0/18',
3458 'NE': '197.214.0.0/18',
3459 'NF': '203.17.240.0/22',
3460 'NG': '105.112.0.0/12',
3461 'NI': '186.76.0.0/15',
3462 'NL': '145.96.0.0/11',
3463 'NO': '84.208.0.0/13',
3464 'NP': '36.252.0.0/15',
3465 'NR': '203.98.224.0/19',
3466 'NU': '49.156.48.0/22',
3467 'NZ': '49.224.0.0/14',
3468 'OM': '5.36.0.0/15',
3469 'PA': '186.72.0.0/15',
3470 'PE': '186.160.0.0/14',
3471 'PF': '123.50.64.0/18',
3472 'PG': '124.240.192.0/19',
3473 'PH': '49.144.0.0/13',
3474 'PK': '39.32.0.0/11',
3475 'PL': '83.0.0.0/11',
3476 'PM': '70.36.0.0/20',
3477 'PR': '66.50.0.0/16',
3478 'PS': '188.161.0.0/16',
3479 'PT': '85.240.0.0/13',
3480 'PW': '202.124.224.0/20',
3481 'PY': '181.120.0.0/14',
3482 'QA': '37.210.0.0/15',
3483 'RE': '139.26.0.0/16',
3484 'RO': '79.112.0.0/13',
3485 'RS': '178.220.0.0/14',
3486 'RU': '5.136.0.0/13',
3487 'RW': '105.178.0.0/15',
3488 'SA': '188.48.0.0/13',
3489 'SB': '202.1.160.0/19',
3490 'SC': '154.192.0.0/11',
3491 'SD': '154.96.0.0/13',
3492 'SE': '78.64.0.0/12',
3493 'SG': '152.56.0.0/14',
3494 'SI': '188.196.0.0/14',
3495 'SK': '78.98.0.0/15',
3496 'SL': '197.215.0.0/17',
3497 'SM': '89.186.32.0/19',
3498 'SN': '41.82.0.0/15',
3499 'SO': '197.220.64.0/19',
3500 'SR': '186.179.128.0/17',
3501 'SS': '105.235.208.0/21',
3502 'ST': '197.159.160.0/19',
3503 'SV': '168.243.0.0/16',
3504 'SX': '190.102.0.0/20',
3505 'SY': '5.0.0.0/16',
3506 'SZ': '41.84.224.0/19',
3507 'TC': '65.255.48.0/20',
3508 'TD': '154.68.128.0/19',
3509 'TG': '196.168.0.0/14',
3510 'TH': '171.96.0.0/13',
3511 'TJ': '85.9.128.0/18',
3512 'TK': '27.96.24.0/21',
3513 'TL': '180.189.160.0/20',
3514 'TM': '95.85.96.0/19',
3515 'TN': '197.0.0.0/11',
3516 'TO': '175.176.144.0/21',
3517 'TR': '78.160.0.0/11',
3518 'TT': '186.44.0.0/15',
3519 'TV': '202.2.96.0/19',
3520 'TW': '120.96.0.0/11',
3521 'TZ': '156.156.0.0/14',
3522 'UA': '93.72.0.0/13',
3523 'UG': '154.224.0.0/13',
3524 'US': '3.0.0.0/8',
3525 'UY': '167.56.0.0/13',
3526 'UZ': '82.215.64.0/18',
3527 'VA': '212.77.0.0/19',
3528 'VC': '24.92.144.0/20',
3529 'VE': '186.88.0.0/13',
3530 'VG': '172.103.64.0/18',
3531 'VI': '146.226.0.0/16',
3532 'VN': '14.160.0.0/11',
3533 'VU': '202.80.32.0/20',
3534 'WF': '117.20.32.0/21',
3535 'WS': '202.4.32.0/19',
3536 'YE': '134.35.0.0/16',
3537 'YT': '41.242.116.0/22',
3538 'ZA': '41.0.0.0/11',
3539 'ZM': '165.56.0.0/13',
3540 'ZW': '41.85.192.0/19',
3541 }
3542
3543 @classmethod
5f95927a
S
3544 def random_ipv4(cls, code_or_block):
3545 if len(code_or_block) == 2:
3546 block = cls._country_ip_map.get(code_or_block.upper())
3547 if not block:
3548 return None
3549 else:
3550 block = code_or_block
773f291d
S
3551 addr, preflen = block.split('/')
3552 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
3553 addr_max = addr_min | (0xffffffff >> int(preflen))
18a0defa 3554 return compat_str(socket.inet_ntoa(
4248dad9 3555 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
3556
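# Usage sketch (added for illustration; _geo_example is hypothetical and the
# enclosing class name, GeoUtils, is assumed from context). A two-letter code
# is looked up in _country_ip_map above; anything else is taken as a literal
# CIDR block.
def _geo_example():
    inside_nl = GeoUtils.random_ipv4('NL')               # somewhere in 145.96.0.0/11
    inside_block = GeoUtils.random_ipv4('192.0.2.0/24')  # somewhere in 192.0.2.0-192.0.2.255
    unknown = GeoUtils.random_ipv4('ZZ')                 # None: code not in the map
    return inside_nl, inside_block, unknown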
3557
91410c9b 3558class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
3559 def __init__(self, proxies=None):
3560 # Set default handlers
3561 for type in ('http', 'https'):
3562 setattr(self, '%s_open' % type,
3563 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
3564 meth(r, proxy, type))
3565 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
3566
91410c9b 3567 def proxy_open(self, req, proxy, type):
2461f79d 3568 req_proxy = req.headers.get('Ytdl-request-proxy')
3569 if req_proxy is not None:
3570 proxy = req_proxy
3571 del req.headers['Ytdl-request-proxy']
3572
3573 if proxy == '__noproxy__':
3574 return None # No Proxy
51fb4995 3575 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
3576 req.add_header('Ytdl-socks-proxy', proxy)
 3577 # youtube-dl's http/https handlers wrap the socket with socks
3578 return None
3579 return compat_urllib_request.ProxyHandler.proxy_open(
3580 self, req, proxy, type)
3581
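# Usage sketch (hypothetical helper, added for illustration; youtube-dl builds
# its real opener elsewhere, together with its own HTTP/HTTPS handlers, which
# act on the Ytdl-socks-proxy header set above).
def _per_request_proxy_example():
    opener = compat_urllib_request.build_opener(
        PerRequestProxyHandler({'http': 'http://proxy.example:3128'}))
    req = compat_urllib_request.Request('http://example.com/')
    # Override the opener-wide proxy for this single request only:
    req.add_header('Ytdl-request-proxy', 'http://127.0.0.1:8118')
    # or disable proxying entirely for it:
    # req.add_header('Ytdl-request-proxy', '__noproxy__')
    return opener.open(req)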
3582
3583# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3584# released into Public Domain
3585# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3586
3587def long_to_bytes(n, blocksize=0):
3588 """long_to_bytes(n:long, blocksize:int) : string
3589 Convert a long integer to a byte string.
3590
3591 If optional blocksize is given and greater than zero, pad the front of the
3592 byte string with binary zeros so that the length is a multiple of
3593 blocksize.
3594 """
3595 # after much testing, this algorithm was deemed to be the fastest
3596 s = b''
3597 n = int(n)
3598 while n > 0:
3599 s = compat_struct_pack('>I', n & 0xffffffff) + s
3600 n = n >> 32
3601 # strip off leading zeros
3602 for i in range(len(s)):
3603 if s[i] != b'\000'[0]:
3604 break
3605 else:
3606 # only happens when n == 0
3607 s = b'\000'
3608 i = 0
3609 s = s[i:]
3610 # add back some pad bytes. this could be done more efficiently w.r.t. the
3611 # de-padding being done above, but sigh...
3612 if blocksize > 0 and len(s) % blocksize:
3613 s = (blocksize - len(s) % blocksize) * b'\000' + s
3614 return s
3615
3616
3617def bytes_to_long(s):
3618 """bytes_to_long(string) : long
3619 Convert a byte string to a long integer.
3620
3621 This is (essentially) the inverse of long_to_bytes().
3622 """
3623 acc = 0
3624 length = len(s)
3625 if length % 4:
3626 extra = (4 - length % 4)
3627 s = b'\000' * extra + s
3628 length = length + extra
3629 for i in range(0, length, 4):
3630 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
3631 return acc
3632
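# Example (hypothetical self-check, added for illustration): the two helpers
# above are inverses, and blocksize only left-pads the result with NUL bytes.
def _long_bytes_roundtrip_example():
    assert long_to_bytes(65537) == b'\x01\x00\x01'
    assert long_to_bytes(65537, blocksize=4) == b'\x00\x01\x00\x01'
    assert bytes_to_long(b'\x01\x00\x01') == 65537
    assert bytes_to_long(long_to_bytes(0)) == 0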
3633
3634def ohdave_rsa_encrypt(data, exponent, modulus):
3635 '''
3636 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
3637
3638 Input:
3639 data: data to encrypt, bytes-like object
 3640 exponent, modulus: parameters e and N of the RSA algorithm, both integers
3641 Output: hex string of encrypted data
3642
3643 Limitation: supports one block encryption only
3644 '''
3645
3646 payload = int(binascii.hexlify(data[::-1]), 16)
3647 encrypted = pow(payload, exponent, modulus)
3648 return '%x' % encrypted
3649
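# Toy example (hypothetical, added for illustration; real callers pass the
# site's published RSA exponent e and modulus N, which are far larger).
def _ohdave_rsa_example():
    # payload is 2, and 2 ** 3 mod 101 == 8, so the hex string is '8'
    assert ohdave_rsa_encrypt(b'\x02', 3, 101) == '8'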
3650
3651def pkcs1pad(data, length):
3652 """
 3653 Pad input data using the PKCS#1 scheme
3654
3655 @param {int[]} data input data
3656 @param {int} length target length
3657 @returns {int[]} padded data
3658 """
3659 if len(data) > length - 11:
3660 raise ValueError('Input data too long for PKCS#1 padding')
3661
3662 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
3663 return [0, 2] + pseudo_random + [0] + data
3664
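# Example (hypothetical self-check, added for illustration): pkcs1pad() works
# on lists of byte values and yields 0x00 0x02 <random filler> 0x00 <data>.
def _pkcs1pad_example():
    padded = pkcs1pad([1, 2, 3], 16)
    assert len(padded) == 16
    assert padded[:2] == [0, 2]
    assert padded[-4:] == [0, 1, 2, 3]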
3665
5eb6bdce 3666def encode_base_n(num, n, table=None):
59f898b7 3667 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
3668 if not table:
3669 table = FULL_TABLE[:n]
3670
3671 if n > len(table):
3672 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
3673
3674 if num == 0:
3675 return table[0]
3676
3677 ret = ''
3678 while num:
3679 ret = table[num % n] + ret
3680 num = num // n
3681 return ret
3682
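# Example (hypothetical self-check, added for illustration): with the default
# table this is a generic base-N positional encoder, e.g. base 16 gives
# lowercase hex.
def _encode_base_n_example():
    assert encode_base_n(255, 16) == 'ff'
    assert encode_base_n(255, 2) == '11111111'
    assert encode_base_n(0, 36) == '0'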
3683
3684def decode_packed_codes(code):
06b3fe29 3685 mobj = re.search(PACKED_CODES_RE, code)
 3686 obfuscated_code, base, count, symbols = mobj.groups()
3687 base = int(base)
3688 count = int(count)
3689 symbols = symbols.split('|')
3690 symbol_table = {}
3691
3692 while count:
3693 count -= 1
5eb6bdce 3694 base_n_count = encode_base_n(count, base)
3695 symbol_table[base_n_count] = symbols[count] or base_n_count
3696
3697 return re.sub(
3698 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
 3699 obfuscated_code)
e154c651 3700
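# Example (hypothetical, added for illustration): decode_packed_codes() undoes
# Dean Edwards style "packer" obfuscation. Assuming PACKED_CODES_RE (defined
# earlier in this file) matches the usual eval(function(p,a,c,k,e,d){...})
# wrapper, a minimal packed payload decodes like this:
def _decode_packed_codes_example():
    packed = "}('0 1',2,2,'zero|one'.split('|'),0,{}))"
    assert decode_packed_codes(packed) == 'zero one'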
3701
3702def parse_m3u8_attributes(attrib):
3703 info = {}
3704 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
3705 if val.startswith('"'):
3706 val = val[1:-1]
3707 info[key] = val
3708 return info
3709
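# Example (hypothetical self-check, added for illustration): attribute lists
# from EXT-X-STREAM-INF and similar tags become a flat dict; quoted values
# keep their inner commas.
def _parse_m3u8_attributes_example():
    info = parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401e"')
    assert info == {
        'BANDWIDTH': '1280000',
        'CODECS': 'mp4a.40.2,avc1.4d401e',
    }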
3710
3711def urshift(val, n):
3712 return val >> n if val >= 0 else (val + 0x100000000) >> n
3713
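# Example (hypothetical self-check, added for illustration): urshift() emulates
# JavaScript's unsigned right shift (>>>) on 32-bit values, which Python's >>
# does not do for negative numbers.
def _urshift_example():
    assert -1 >> 4 == -1                  # arithmetic shift keeps the sign
    assert urshift(-1, 4) == 0x0fffffff   # logical shift on the 32-bit pattern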
3714
3715# Based on png2str() written by @gdkchan and improved by @yokrysty
3716# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
3717def decode_png(png_data):
3718 # Reference: https://www.w3.org/TR/PNG/
3719 header = png_data[8:]
3720
3721 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
3722 raise IOError('Not a valid PNG file.')
3723
3724 int_map = {1: '>B', 2: '>H', 4: '>I'}
3725 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
3726
3727 chunks = []
3728
3729 while header:
3730 length = unpack_integer(header[:4])
3731 header = header[4:]
3732
3733 chunk_type = header[:4]
3734 header = header[4:]
3735
3736 chunk_data = header[:length]
3737 header = header[length:]
3738
3739 header = header[4:] # Skip CRC
3740
3741 chunks.append({
3742 'type': chunk_type,
3743 'length': length,
3744 'data': chunk_data
3745 })
3746
3747 ihdr = chunks[0]['data']
3748
3749 width = unpack_integer(ihdr[:4])
3750 height = unpack_integer(ihdr[4:8])
3751
3752 idat = b''
3753
3754 for chunk in chunks:
3755 if chunk['type'] == b'IDAT':
3756 idat += chunk['data']
3757
3758 if not idat:
3759 raise IOError('Unable to read PNG data.')
3760
3761 decompressed_data = bytearray(zlib.decompress(idat))
3762
3763 stride = width * 3
3764 pixels = []
3765
3766 def _get_pixel(idx):
3767 x = idx % stride
3768 y = idx // stride
3769 return pixels[y][x]
3770
3771 for y in range(height):
3772 basePos = y * (1 + stride)
3773 filter_type = decompressed_data[basePos]
3774
3775 current_row = []
3776
3777 pixels.append(current_row)
3778
3779 for x in range(stride):
3780 color = decompressed_data[1 + basePos + x]
3781 basex = y * stride + x
3782 left = 0
3783 up = 0
3784
3785 if x > 2:
3786 left = _get_pixel(basex - 3)
3787 if y > 0:
3788 up = _get_pixel(basex - stride)
3789
3790 if filter_type == 1: # Sub
3791 color = (color + left) & 0xff
3792 elif filter_type == 2: # Up
3793 color = (color + up) & 0xff
3794 elif filter_type == 3: # Average
3795 color = (color + ((left + up) >> 1)) & 0xff
3796 elif filter_type == 4: # Paeth
3797 a = left
3798 b = up
3799 c = 0
3800
3801 if x > 2 and y > 0:
3802 c = _get_pixel(basex - stride - 3)
3803
3804 p = a + b - c
3805
3806 pa = abs(p - a)
3807 pb = abs(p - b)
3808 pc = abs(p - c)
3809
3810 if pa <= pb and pa <= pc:
3811 color = (color + a) & 0xff
3812 elif pb <= pc:
3813 color = (color + b) & 0xff
3814 else:
3815 color = (color + c) & 0xff
3816
3817 current_row.append(color)
3818
3819 return width, height, pixels
3820
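# Usage sketch (hypothetical helper, added for illustration): decode_png()
# returns the image dimensions plus one list of raw byte values per row,
# three bytes per pixel, as the stride computation above assumes 24-bit RGB.
def _decode_png_example(path):
    with open(path, 'rb') as f:
        width, height, rows = decode_png(f.read())
    # pixel (x, y) starts at rows[y][x * 3]
    r, g, b = rows[0][0:3]
    return width, height, (r, g, b)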
3821
3822def write_xattr(path, key, value):
3823 # This mess below finds the best xattr tool for the job
3824 try:
3825 # try the pyxattr module...
3826 import xattr
3827
3828 if hasattr(xattr, 'set'): # pyxattr
3829 # Unicode arguments are not supported in python-pyxattr until
3830 # version 0.5.0
3831 # See https://github.com/rg3/youtube-dl/issues/5498
3832 pyxattr_required_version = '0.5.0'
3833 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
3834 # TODO: fallback to CLI tools
3835 raise XAttrUnavailableError(
3836 'python-pyxattr is detected but is too old. '
3837 'youtube-dl requires %s or above while your version is %s. '
3838 'Falling back to other xattr implementations' % (
3839 pyxattr_required_version, xattr.__version__))
3840
3841 setxattr = xattr.set
3842 else: # xattr
3843 setxattr = xattr.setxattr
3844
3845 try:
53a7e3d2 3846 setxattr(path, key, value)
3847 except EnvironmentError as e:
3848 raise XAttrMetadataError(e.errno, e.strerror)
3849
3850 except ImportError:
3851 if compat_os_name == 'nt':
3852 # Write xattrs to NTFS Alternate Data Streams:
3853 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
3854 assert ':' not in key
3855 assert os.path.exists(path)
3856
3857 ads_fn = path + ':' + key
3858 try:
3859 with open(ads_fn, 'wb') as f:
3860 f.write(value)
3861 except EnvironmentError as e:
3862 raise XAttrMetadataError(e.errno, e.strerror)
3863 else:
3864 user_has_setfattr = check_executable('setfattr', ['--version'])
3865 user_has_xattr = check_executable('xattr', ['-h'])
3866
3867 if user_has_setfattr or user_has_xattr:
3868
3869 value = value.decode('utf-8')
3870 if user_has_setfattr:
3871 executable = 'setfattr'
3872 opts = ['-n', key, '-v', value]
3873 elif user_has_xattr:
3874 executable = 'xattr'
3875 opts = ['-w', key, value]
3876
3877 cmd = ([encodeFilename(executable, True)] +
3878 [encodeArgument(o) for o in opts] +
3879 [encodeFilename(path, True)])
3880
3881 try:
3882 p = subprocess.Popen(
3883 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
3884 except EnvironmentError as e:
3885 raise XAttrMetadataError(e.errno, e.strerror)
3886 stdout, stderr = p.communicate()
3887 stderr = stderr.decode('utf-8', 'replace')
3888 if p.returncode != 0:
3889 raise XAttrMetadataError(p.returncode, stderr)
3890
3891 else:
 3892 # On Unix, but couldn't find pyxattr, setfattr, or xattr.
3893 if sys.platform.startswith('linux'):
3894 raise XAttrUnavailableError(
3895 "Couldn't find a tool to set the xattrs. "
3896 "Install either the python 'pyxattr' or 'xattr' "
3897 "modules, or the GNU 'attr' package "
3898 "(which contains the 'setfattr' tool).")
3899 else:
3900 raise XAttrUnavailableError(
3901 "Couldn't find a tool to set the xattrs. "
3902 "Install either the python 'xattr' module, "
3903 "or the 'xattr' binary.")
3904
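# Usage sketch (hypothetical helper, added for illustration; the key name is
# an assumption, not taken from this file): keys live in the usual "user."
# xattr namespace and values must be bytes.
def _write_xattr_example(filename, url):
    try:
        write_xattr(filename, 'user.xdg.referrer.url', url.encode('utf-8'))
    except (XAttrUnavailableError, XAttrMetadataError) as e:
        # no usable xattr backend, or the filesystem rejected the attribute
        return e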
3905
3906def random_birthday(year_field, month_field, day_field):
3907 return {
3908 year_field: str(random.randint(1950, 1995)),
3909 month_field: str(random.randint(1, 12)),
3910 day_field: str(random.randint(1, 31)),
3911 }
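
# Example (hypothetical self-check, added for illustration): the field names
# are caller-supplied, so the helper fills whatever form parameters an age
# gate expects.
def _random_birthday_example():
    data = random_birthday('birth_year', 'birth_month', 'birth_day')
    assert sorted(data) == ['birth_day', 'birth_month', 'birth_year']
    assert 1950 <= int(data['birth_year']) <= 1995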