d77c3dfd 1#!/usr/bin/env python
dcdb292f 2# coding: utf-8
d77c3dfd 3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
1e399778 6import base64
5bc880b9 7import binascii
912b38b4 8import calendar
676eb3f2 9import codecs
62e609ab 10import contextlib
e3946f98 11import ctypes
c496ca96
PH
12import datetime
13import email.utils
0c265486 14import email.header
f45c185f 15import errno
be4a824d 16import functools
d77c3dfd 17import gzip
03f9daab 18import io
79a2e94e 19import itertools
f4bfd65f 20import json
d77c3dfd 21import locale
02dbf93f 22import math
347de493 23import operator
d77c3dfd 24import os
c496ca96 25import platform
773f291d 26import random
d77c3dfd 27import re
c496ca96 28import socket
79a2e94e 29import ssl
1c088fa8 30import subprocess
d77c3dfd 31import sys
181c8655 32import tempfile
01951dda 33import traceback
bcf89ce6 34import xml.etree.ElementTree
d77c3dfd 35import zlib
d77c3dfd 36
8c25f81b 37from .compat import (
b4a3d461 38 compat_HTMLParseError,
8bb56eee 39 compat_HTMLParser,
8f9312c3 40 compat_basestring,
8c25f81b 41 compat_chr,
d7cd9a9e 42 compat_ctypes_WINFUNCTYPE,
36e6f62c 43 compat_etree_fromstring,
51098426 44 compat_expanduser,
8c25f81b 45 compat_html_entities,
55b2f099 46 compat_html_entities_html5,
be4a824d 47 compat_http_client,
c86b6142 48 compat_kwargs,
efa97bdc 49 compat_os_name,
8c25f81b 50 compat_parse_qs,
702ccf2d 51 compat_shlex_quote,
be4a824d 52 compat_socket_create_connection,
8c25f81b 53 compat_str,
edaa23f8 54 compat_struct_pack,
d3f8e038 55 compat_struct_unpack,
8c25f81b
PH
56 compat_urllib_error,
57 compat_urllib_parse,
15707c7e 58 compat_urllib_parse_urlencode,
8c25f81b 59 compat_urllib_parse_urlparse,
7581bfc9 60 compat_urllib_parse_unquote_plus,
8c25f81b
PH
61 compat_urllib_request,
62 compat_urlparse,
810c10ba 63 compat_xpath,
8c25f81b 64)
4644ac55 65
71aff188
YCH
66from .socks import (
67 ProxyType,
68 sockssocket,
69)
70
4644ac55 71
51fb4995
YCH
72def register_socks_protocols():
73 # "Register" SOCKS protocols
d5ae6bb5
YCH
74 # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
75 # URLs with protocols not in urlparse.uses_netloc are not handled correctly
51fb4995
YCH
76 for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
77 if scheme not in compat_urlparse.uses_netloc:
78 compat_urlparse.uses_netloc.append(scheme)
79
80
468e2e92
FV
81# This is not clearly defined otherwise
82compiled_regex_type = type(re.compile(''))
83
3e669f36 84std_headers = {
b12cf31b 85 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0 (Chrome)',
59ae15a5
PH
86 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
87 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
88 'Accept-Encoding': 'gzip, deflate',
89 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 90}
f427df17 91
5f6a1245 92
fb37eb25
S
93USER_AGENTS = {
94 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
95}
96
97
bf42a990
S
98NO_DEFAULT = object()
99
7105440c
YCH
100ENGLISH_MONTH_NAMES = [
101 'January', 'February', 'March', 'April', 'May', 'June',
102 'July', 'August', 'September', 'October', 'November', 'December']
103
f6717dec
S
104MONTH_NAMES = {
105 'en': ENGLISH_MONTH_NAMES,
106 'fr': [
3e4185c3
S
107 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
108 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
f6717dec 109}
a942d6cb 110
a7aaa398
S
111KNOWN_EXTENSIONS = (
112 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
113 'flv', 'f4v', 'f4a', 'f4b',
114 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
115 'mkv', 'mka', 'mk3d',
116 'avi', 'divx',
117 'mov',
118 'asf', 'wmv', 'wma',
119 '3gp', '3g2',
120 'mp3',
121 'flac',
122 'ape',
123 'wav',
124 'f4f', 'f4m', 'm3u8', 'smil')
125
c587cbb7 126# needed for sanitizing filenames in restricted mode
c8827027 127ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
128 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
129 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
c587cbb7 130
46f59e89
S
131DATE_FORMATS = (
132 '%d %B %Y',
133 '%d %b %Y',
134 '%B %d %Y',
cb655f34
S
135 '%B %dst %Y',
136 '%B %dnd %Y',
137 '%B %dth %Y',
46f59e89 138 '%b %d %Y',
cb655f34
S
139 '%b %dst %Y',
140 '%b %dnd %Y',
141 '%b %dth %Y',
46f59e89
S
142 '%b %dst %Y %I:%M',
143 '%b %dnd %Y %I:%M',
144 '%b %dth %Y %I:%M',
145 '%Y %m %d',
146 '%Y-%m-%d',
147 '%Y/%m/%d',
81c13222 148 '%Y/%m/%d %H:%M',
46f59e89 149 '%Y/%m/%d %H:%M:%S',
0c1c6f4b 150 '%Y-%m-%d %H:%M',
46f59e89
S
151 '%Y-%m-%d %H:%M:%S',
152 '%Y-%m-%d %H:%M:%S.%f',
153 '%d.%m.%Y %H:%M',
154 '%d.%m.%Y %H.%M',
155 '%Y-%m-%dT%H:%M:%SZ',
156 '%Y-%m-%dT%H:%M:%S.%fZ',
157 '%Y-%m-%dT%H:%M:%S.%f0Z',
158 '%Y-%m-%dT%H:%M:%S',
159 '%Y-%m-%dT%H:%M:%S.%f',
160 '%Y-%m-%dT%H:%M',
c6eed6b8
S
161 '%b %d %Y at %H:%M',
162 '%b %d %Y at %H:%M:%S',
b555ae9b
S
163 '%B %d %Y at %H:%M',
164 '%B %d %Y at %H:%M:%S',
46f59e89
S
165)
166
167DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
168DATE_FORMATS_DAY_FIRST.extend([
169 '%d-%m-%Y',
170 '%d.%m.%Y',
171 '%d.%m.%y',
172 '%d/%m/%Y',
173 '%d/%m/%y',
174 '%d/%m/%Y %H:%M:%S',
175])
176
177DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
178DATE_FORMATS_MONTH_FIRST.extend([
179 '%m-%d-%Y',
180 '%m.%d.%Y',
181 '%m/%d/%Y',
182 '%m/%d/%y',
183 '%m/%d/%Y %H:%M:%S',
184])
185
06b3fe29
S
186PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
187
7105440c 188
d77c3dfd 189def preferredencoding():
59ae15a5 190 """Get preferred encoding.
d77c3dfd 191
59ae15a5
PH
192 Returns the best encoding scheme for the system, based on
193 locale.getpreferredencoding() and some further tweaks.
194 """
195 try:
196 pref = locale.getpreferredencoding()
28e614de 197 'TEST'.encode(pref)
70a1165b 198 except Exception:
59ae15a5 199 pref = 'UTF-8'
bae611f2 200
59ae15a5 201 return pref
d77c3dfd 202
f4bfd65f 203
181c8655 204def write_json_file(obj, fn):
1394646a 205 """ Encode obj as JSON and write it to fn, atomically if possible """
181c8655 206
92120217 207 fn = encodeFilename(fn)
61ee5aeb 208 if sys.version_info < (3, 0) and sys.platform != 'win32':
ec5f6016
JMF
209 encoding = get_filesystem_encoding()
210 # os.path.basename returns a bytes object, but NamedTemporaryFile
211 # will fail if the filename contains non-ASCII characters unless we
212 # use a unicode object
213 path_basename = lambda f: os.path.basename(fn).decode(encoding)
214 # the same for os.path.dirname
215 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
216 else:
217 path_basename = os.path.basename
218 path_dirname = os.path.dirname
219
73159f99
S
220 args = {
221 'suffix': '.tmp',
ec5f6016
JMF
222 'prefix': path_basename(fn) + '.',
223 'dir': path_dirname(fn),
73159f99
S
224 'delete': False,
225 }
226
181c8655
PH
227 # In Python 2.x, json.dump expects a bytestream.
228 # In Python 3.x, it writes to a character stream
229 if sys.version_info < (3, 0):
73159f99 230 args['mode'] = 'wb'
181c8655 231 else:
73159f99
S
232 args.update({
233 'mode': 'w',
234 'encoding': 'utf-8',
235 })
236
c86b6142 237 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
181c8655
PH
238
239 try:
240 with tf:
241 json.dump(obj, tf)
1394646a
IK
242 if sys.platform == 'win32':
243 # Need to remove existing file on Windows, else os.rename raises
244 # WindowsError or FileExistsError.
245 try:
246 os.unlink(fn)
247 except OSError:
248 pass
181c8655 249 os.rename(tf.name, fn)
70a1165b 250 except Exception:
181c8655
PH
251 try:
252 os.remove(tf.name)
253 except OSError:
254 pass
255 raise
256
257
258if sys.version_info >= (2, 7):
ee114368 259 def find_xpath_attr(node, xpath, key, val=None):
59ae56fa 260 """ Find the xpath xpath[@key=val] """
5d2354f1 261 assert re.match(r'^[a-zA-Z_-]+$', key)
ee114368 262 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
59ae56fa
PH
263 return node.find(expr)
264else:
ee114368 265 def find_xpath_attr(node, xpath, key, val=None):
810c10ba 266 for f in node.findall(compat_xpath(xpath)):
ee114368
S
267 if key not in f.attrib:
268 continue
269 if val is None or f.attrib.get(key) == val:
59ae56fa
PH
270 return f
271 return None
272
d7e66d39
JMF
273# On python2.6 the xml.etree.ElementTree.Element methods don't support
274# the namespace parameter
5f6a1245
JW
275
276
d7e66d39
JMF
277def xpath_with_ns(path, ns_map):
278 components = [c.split(':') for c in path.split('/')]
279 replaced = []
280 for c in components:
281 if len(c) == 1:
282 replaced.append(c[0])
283 else:
284 ns, tag = c
285 replaced.append('{%s}%s' % (ns_map[ns], tag))
286 return '/'.join(replaced)
287
d77c3dfd 288
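# Illustrative usage (hypothetical namespace URI), assuming xpath_with_ns()
# behaves as defined above:
#     >>> xpath_with_ns('media:song/media:author', {'media': 'http://example.com/ns'})
#     '{http://example.com/ns}song/{http://example.com/ns}author'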
a41fb80c 289def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
578c0745 290 def _find_xpath(xpath):
810c10ba 291 return node.find(compat_xpath(xpath))
578c0745
S
292
293 if isinstance(xpath, (str, compat_str)):
294 n = _find_xpath(xpath)
295 else:
296 for xp in xpath:
297 n = _find_xpath(xp)
298 if n is not None:
299 break
d74bebd5 300
8e636da4 301 if n is None:
bf42a990
S
302 if default is not NO_DEFAULT:
303 return default
304 elif fatal:
bf0ff932
PH
305 name = xpath if name is None else name
306 raise ExtractorError('Could not find XML element %s' % name)
307 else:
308 return None
a41fb80c
S
309 return n
310
311
312def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
8e636da4
S
313 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
314 if n is None or n == default:
315 return n
316 if n.text is None:
317 if default is not NO_DEFAULT:
318 return default
319 elif fatal:
320 name = xpath if name is None else name
321 raise ExtractorError('Could not find XML element\'s text %s' % name)
322 else:
323 return None
324 return n.text
a41fb80c
S
325
326
327def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
328 n = find_xpath_attr(node, xpath, key)
329 if n is None:
330 if default is not NO_DEFAULT:
331 return default
332 elif fatal:
333 name = '%s[@%s]' % (xpath, key) if name is None else name
334 raise ExtractorError('Could not find XML attribute %s' % name)
335 else:
336 return None
337 return n.attrib[key]
bf0ff932
PH
338
339
9e6dd238 340def get_element_by_id(id, html):
43e8fafd 341 """Return the content of the tag with the specified ID in the passed HTML document"""
611c1dd9 342 return get_element_by_attribute('id', id, html)
43e8fafd 343
12ea2f30 344
84c237fb 345def get_element_by_class(class_name, html):
2af12ad9
TC
346 """Return the content of the first tag with the specified class in the passed HTML document"""
347 retval = get_elements_by_class(class_name, html)
348 return retval[0] if retval else None
349
350
351def get_element_by_attribute(attribute, value, html, escape_value=True):
352 retval = get_elements_by_attribute(attribute, value, html, escape_value)
353 return retval[0] if retval else None
354
355
356def get_elements_by_class(class_name, html):
357 """Return the content of all tags with the specified class in the passed HTML document as a list"""
358 return get_elements_by_attribute(
84c237fb
YCH
359 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
360 html, escape_value=False)
361
362
2af12ad9 363def get_elements_by_attribute(attribute, value, html, escape_value=True):
43e8fafd 364 """Return the content of all tags with the specified attribute in the passed HTML document, as a list"""
9e6dd238 365
84c237fb
YCH
366 value = re.escape(value) if escape_value else value
367
2af12ad9
TC
368 retlist = []
369 for m in re.finditer(r'''(?xs)
38285056 370 <([a-zA-Z0-9:._-]+)
609ff8ca 371 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
38285056 372 \s+%s=['"]?%s['"]?
609ff8ca 373 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
38285056
PH
374 \s*>
375 (?P<content>.*?)
376 </\1>
2af12ad9
TC
377 ''' % (re.escape(attribute), value), html):
378 res = m.group('content')
38285056 379
2af12ad9
TC
380 if res.startswith('"') or res.startswith("'"):
381 res = res[1:-1]
38285056 382
2af12ad9 383 retlist.append(unescapeHTML(res))
a921f407 384
2af12ad9 385 return retlist
a921f407 386
c5229f39 387
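# Illustrative usage of the element helpers above (hypothetical HTML snippets):
#     >>> get_element_by_class('foo', '<div class="foo bar">some text</div>')
#     'some text'
#     >>> get_elements_by_attribute('data-id', '1', '<p data-id="1">a</p><p data-id="1">b</p>')
#     ['a', 'b']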
8bb56eee
BF
388class HTMLAttributeParser(compat_HTMLParser):
389 """Trivial HTML parser to gather the attributes for a single element"""
390 def __init__(self):
c5229f39 391 self.attrs = {}
8bb56eee
BF
392 compat_HTMLParser.__init__(self)
393
394 def handle_starttag(self, tag, attrs):
395 self.attrs = dict(attrs)
396
c5229f39 397
8bb56eee
BF
398def extract_attributes(html_element):
399 """Given a string for an HTML element such as
400 <el
401 a="foo" B="bar" c="&98;az" d=boz
402 empty= noval entity="&amp;"
403 sq='"' dq="'"
404 >
405 Decode and return a dictionary of attributes.
406 {
407 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
408 'empty': '', 'noval': None, 'entity': '&',
409 'sq': '"', 'dq': '\''
410 }.
411 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
412 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
413 """
414 parser = HTMLAttributeParser()
b4a3d461
S
415 try:
416 parser.feed(html_element)
417 parser.close()
418 # Older Python may throw HTMLParseError in case of malformed HTML
419 except compat_HTMLParseError:
420 pass
8bb56eee 421 return parser.attrs
9e6dd238 422
c5229f39 423
9e6dd238 424def clean_html(html):
59ae15a5 425 """Clean an HTML snippet into a readable string"""
dd622d7c
PH
426
427 if html is None: # Convenience for sanitizing descriptions etc.
428 return html
429
59ae15a5
PH
430 # Newline vs <br />
431 html = html.replace('\n', ' ')
edd9221c
TF
432 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
433 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
59ae15a5
PH
434 # Strip html tags
435 html = re.sub('<.*?>', '', html)
436 # Replace html entities
437 html = unescapeHTML(html)
7decf895 438 return html.strip()
9e6dd238
FV
439
440
d77c3dfd 441def sanitize_open(filename, open_mode):
59ae15a5
PH
442 """Try to open the given filename, and slightly tweak it if this fails.
443
444 Attempts to open the given filename. If this fails, it tries to change
445 the filename slightly, step by step, until it's either able to open it
446 or it fails and raises a final exception, like the standard open()
447 function.
448
449 It returns the tuple (stream, definitive_file_name).
450 """
451 try:
28e614de 452 if filename == '-':
59ae15a5
PH
453 if sys.platform == 'win32':
454 import msvcrt
455 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
898280a0 456 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
59ae15a5
PH
457 stream = open(encodeFilename(filename), open_mode)
458 return (stream, filename)
459 except (IOError, OSError) as err:
f45c185f
PH
460 if err.errno in (errno.EACCES,):
461 raise
59ae15a5 462
f45c185f 463 # In case of error, try to remove win32 forbidden chars
d55de57b 464 alt_filename = sanitize_path(filename)
f45c185f
PH
465 if alt_filename == filename:
466 raise
467 else:
468 # An exception here should be caught in the caller
d55de57b 469 stream = open(encodeFilename(alt_filename), open_mode)
f45c185f 470 return (stream, alt_filename)
d77c3dfd
FV
471
472
473def timeconvert(timestr):
59ae15a5
PH
474 """Convert RFC 2822 defined time string into system timestamp"""
475 timestamp = None
476 timetuple = email.utils.parsedate_tz(timestr)
477 if timetuple is not None:
478 timestamp = email.utils.mktime_tz(timetuple)
479 return timestamp
1c469a94 480
5f6a1245 481
796173d0 482def sanitize_filename(s, restricted=False, is_id=False):
59ae15a5
PH
483 """Sanitizes a string so it could be used as part of a filename.
484 If restricted is set, use a stricter subset of allowed characters.
158af524
S
485 Set is_id if this is not an arbitrary string, but an ID that should be kept
486 if possible.
59ae15a5
PH
487 """
488 def replace_insane(char):
c587cbb7
AT
489 if restricted and char in ACCENT_CHARS:
490 return ACCENT_CHARS[char]
59ae15a5
PH
491 if char == '?' or ord(char) < 32 or ord(char) == 127:
492 return ''
493 elif char == '"':
494 return '' if restricted else '\''
495 elif char == ':':
496 return '_-' if restricted else ' -'
497 elif char in '\\/|*<>':
498 return '_'
627dcfff 499 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
59ae15a5
PH
500 return '_'
501 if restricted and ord(char) > 127:
502 return '_'
503 return char
504
2aeb06d6
PH
505 # Handle timestamps
506 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
28e614de 507 result = ''.join(map(replace_insane, s))
796173d0
PH
508 if not is_id:
509 while '__' in result:
510 result = result.replace('__', '_')
511 result = result.strip('_')
512 # Common case of "Foreign band name - English song title"
513 if restricted and result.startswith('-_'):
514 result = result[2:]
5a42414b
PH
515 if result.startswith('-'):
516 result = '_' + result[len('-'):]
a7440261 517 result = result.lstrip('.')
796173d0
PH
518 if not result:
519 result = '_'
59ae15a5 520 return result
d77c3dfd 521
5f6a1245 522
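# Illustrative usage (hypothetical title); in the default non-restricted mode
# the replacements above give, for instance:
#     >>> sanitize_filename('Episode 1/2: "Pilot"?')
#     "Episode 1_2 - 'Pilot'"
# Restricted mode additionally folds accents, whitespace and most punctuation
# into underscores.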
a2aaf4db
S
523def sanitize_path(s):
524 """Sanitizes and normalizes path on Windows"""
525 if sys.platform != 'win32':
526 return s
be531ef1
S
527 drive_or_unc, _ = os.path.splitdrive(s)
528 if sys.version_info < (2, 7) and not drive_or_unc:
529 drive_or_unc, _ = os.path.splitunc(s)
530 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
531 if drive_or_unc:
a2aaf4db
S
532 norm_path.pop(0)
533 sanitized_path = [
ec85ded8 534 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
a2aaf4db 535 for path_part in norm_path]
be531ef1
S
536 if drive_or_unc:
537 sanitized_path.insert(0, drive_or_unc + os.path.sep)
a2aaf4db
S
538 return os.path.join(*sanitized_path)
539
540
17bcc626 541def sanitize_url(url):
befa4708
S
542 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
543 # the number of unwanted failures due to missing protocol
544 if url.startswith('//'):
545 return 'http:%s' % url
546 # Fix some common typos seen so far
547 COMMON_TYPOS = (
548 # https://github.com/rg3/youtube-dl/issues/15649
549 (r'^httpss://', r'https://'),
550 # https://bx1.be/lives/direct-tv/
551 (r'^rmtp([es]?)://', r'rtmp\1://'),
552 )
553 for mistake, fixup in COMMON_TYPOS:
554 if re.match(mistake, url):
555 return re.sub(mistake, fixup, url)
556 return url
17bcc626
S
557
558
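# Illustrative usage (hypothetical URLs) of the fixups above:
#     >>> sanitize_url('rmtp://media.example.com/live')
#     'rtmp://media.example.com/live'
#     >>> sanitize_url('//example.com/playlist')
#     'http://example.com/playlist'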
67dda517 559def sanitized_Request(url, *args, **kwargs):
17bcc626 560 return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
67dda517
S
561
562
51098426
S
563def expand_path(s):
564 """Expand shell variables and ~"""
565 return os.path.expandvars(compat_expanduser(s))
566
567
d77c3dfd 568def orderedSet(iterable):
59ae15a5
PH
569 """ Remove all duplicates from the input iterable """
570 res = []
571 for el in iterable:
572 if el not in res:
573 res.append(el)
574 return res
d77c3dfd 575
912b38b4 576
55b2f099 577def _htmlentity_transform(entity_with_semicolon):
4e408e47 578 """Transforms an HTML entity to a character."""
55b2f099
YCH
579 entity = entity_with_semicolon[:-1]
580
4e408e47
PH
581 # Known non-numeric HTML entity
582 if entity in compat_html_entities.name2codepoint:
583 return compat_chr(compat_html_entities.name2codepoint[entity])
584
55b2f099
YCH
585 # TODO: HTML5 allows entities without a semicolon. For example,
586 # '&Eacuteric' should be decoded as 'Éric'.
587 if entity_with_semicolon in compat_html_entities_html5:
588 return compat_html_entities_html5[entity_with_semicolon]
589
91757b0f 590 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
4e408e47
PH
591 if mobj is not None:
592 numstr = mobj.group(1)
28e614de 593 if numstr.startswith('x'):
4e408e47 594 base = 16
28e614de 595 numstr = '0%s' % numstr
4e408e47
PH
596 else:
597 base = 10
7aefc49c
S
598 # See https://github.com/rg3/youtube-dl/issues/7518
599 try:
600 return compat_chr(int(numstr, base))
601 except ValueError:
602 pass
4e408e47
PH
603
604 # Unknown entity in name, return its literal representation
7a3f0c00 605 return '&%s;' % entity
4e408e47
PH
606
607
d77c3dfd 608def unescapeHTML(s):
912b38b4
PH
609 if s is None:
610 return None
611 assert type(s) == compat_str
d77c3dfd 612
4e408e47 613 return re.sub(
95f3f7c2 614 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 615
8bf48f23 616
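# Illustrative usage: named and numeric character references are both resolved
# by _htmlentity_transform() above, e.g.
#     >>> unescapeHTML('&amp; &#38; &eacute;')
#     '& & é'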
aa49acd1
S
617def get_subprocess_encoding():
618 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
619 # For subprocess calls, encode with locale encoding
620 # Refer to http://stackoverflow.com/a/9951851/35070
621 encoding = preferredencoding()
622 else:
623 encoding = sys.getfilesystemencoding()
624 if encoding is None:
625 encoding = 'utf-8'
626 return encoding
627
628
8bf48f23 629def encodeFilename(s, for_subprocess=False):
59ae15a5
PH
630 """
631 @param s The name of the file
632 """
d77c3dfd 633
8bf48f23 634 assert type(s) == compat_str
d77c3dfd 635
59ae15a5
PH
636 # Python 3 has a Unicode API
637 if sys.version_info >= (3, 0):
638 return s
0f00efed 639
aa49acd1
S
640 # Pass '' directly to use Unicode APIs on Windows 2000 and up
641 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
642 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
643 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
644 return s
645
8ee239e9
YCH
646 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
647 if sys.platform.startswith('java'):
648 return s
649
aa49acd1
S
650 return s.encode(get_subprocess_encoding(), 'ignore')
651
652
653def decodeFilename(b, for_subprocess=False):
654
655 if sys.version_info >= (3, 0):
656 return b
657
658 if not isinstance(b, bytes):
659 return b
660
661 return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 662
f07b74fc
PH
663
664def encodeArgument(s):
665 if not isinstance(s, compat_str):
666 # Legacy code that uses byte strings
667 # Uncomment the following line after fixing all post processors
7af808a5 668 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
f07b74fc
PH
669 s = s.decode('ascii')
670 return encodeFilename(s, True)
671
672
aa49acd1
S
673def decodeArgument(b):
674 return decodeFilename(b, True)
675
676
8271226a
PH
677def decodeOption(optval):
678 if optval is None:
679 return optval
680 if isinstance(optval, bytes):
681 optval = optval.decode(preferredencoding())
682
683 assert isinstance(optval, compat_str)
684 return optval
1c256f70 685
5f6a1245 686
4539dd30
PH
687def formatSeconds(secs):
688 if secs > 3600:
689 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
690 elif secs > 60:
691 return '%d:%02d' % (secs // 60, secs % 60)
692 else:
693 return '%d' % secs
694
a0ddb8a2 695
be4a824d
PH
696def make_HTTPS_handler(params, **kwargs):
697 opts_no_check_certificate = params.get('nocheckcertificate', False)
0db261ba 698 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
be5f2c19 699 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
0db261ba 700 if opts_no_check_certificate:
be5f2c19 701 context.check_hostname = False
0db261ba 702 context.verify_mode = ssl.CERT_NONE
a2366922 703 try:
be4a824d 704 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
a2366922
PH
705 except TypeError:
706 # Python 2.7.8
707 # (create_default_context present but HTTPSHandler has no context=)
708 pass
709
710 if sys.version_info < (3, 2):
d7932313 711 return YoutubeDLHTTPSHandler(params, **kwargs)
aa37e3d4 712 else: # Python < 3.4
d7932313 713 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
ea6d901e 714 context.verify_mode = (ssl.CERT_NONE
dca08720 715 if opts_no_check_certificate
ea6d901e 716 else ssl.CERT_REQUIRED)
303b479e 717 context.set_default_verify_paths()
be4a824d 718 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 719
732ea2f0 720
08f2a92c
JMF
721def bug_reports_message():
722 if ytdl_is_updateable():
723 update_cmd = 'type youtube-dl -U to update'
724 else:
725 update_cmd = 'see https://yt-dl.org/update on how to update'
726 msg = '; please report this issue on https://yt-dl.org/bug .'
727 msg += ' Make sure you are using the latest version; %s.' % update_cmd
728 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
729 return msg
730
731
bf5b9d85
PM
732class YoutubeDLError(Exception):
733 """Base exception for YoutubeDL errors."""
734 pass
735
736
737class ExtractorError(YoutubeDLError):
1c256f70 738 """Error during info extraction."""
5f6a1245 739
d11271dd 740 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
9a82b238
PH
741 """ tb, if given, is the original traceback (so that it can be printed out).
742 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
743 """
744
745 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
746 expected = True
d11271dd
PH
747 if video_id is not None:
748 msg = video_id + ': ' + msg
410f3e73 749 if cause:
28e614de 750 msg += ' (caused by %r)' % cause
9a82b238 751 if not expected:
08f2a92c 752 msg += bug_reports_message()
1c256f70 753 super(ExtractorError, self).__init__(msg)
d5979c5d 754
1c256f70 755 self.traceback = tb
8cc83b8d 756 self.exc_info = sys.exc_info() # preserve original exception
2eabb802 757 self.cause = cause
d11271dd 758 self.video_id = video_id
1c256f70 759
01951dda
PH
760 def format_traceback(self):
761 if self.traceback is None:
762 return None
28e614de 763 return ''.join(traceback.format_tb(self.traceback))
01951dda 764
1c256f70 765
416c7fcb
PH
766class UnsupportedError(ExtractorError):
767 def __init__(self, url):
768 super(UnsupportedError, self).__init__(
769 'Unsupported URL: %s' % url, expected=True)
770 self.url = url
771
772
55b3e45b
JMF
773class RegexNotFoundError(ExtractorError):
774 """Error when a regex didn't match"""
775 pass
776
777
773f291d
S
778class GeoRestrictedError(ExtractorError):
779 """Geographic restriction Error exception.
780
781 This exception may be thrown when a video is not available from your
782 geographic location due to geographic restrictions imposed by a website.
783 """
784 def __init__(self, msg, countries=None):
785 super(GeoRestrictedError, self).__init__(msg, expected=True)
786 self.msg = msg
787 self.countries = countries
788
789
bf5b9d85 790class DownloadError(YoutubeDLError):
59ae15a5 791 """Download Error exception.
d77c3dfd 792
59ae15a5
PH
793 This exception may be thrown by FileDownloader objects if they are not
794 configured to continue on errors. They will contain the appropriate
795 error message.
796 """
5f6a1245 797
8cc83b8d
FV
798 def __init__(self, msg, exc_info=None):
799 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
800 super(DownloadError, self).__init__(msg)
801 self.exc_info = exc_info
d77c3dfd
FV
802
803
bf5b9d85 804class SameFileError(YoutubeDLError):
59ae15a5 805 """Same File exception.
d77c3dfd 806
59ae15a5
PH
807 This exception will be thrown by FileDownloader objects if they detect
808 multiple files would have to be downloaded to the same file on disk.
809 """
810 pass
d77c3dfd
FV
811
812
bf5b9d85 813class PostProcessingError(YoutubeDLError):
59ae15a5 814 """Post Processing exception.
d77c3dfd 815
59ae15a5
PH
816 This exception may be raised by PostProcessor's .run() method to
817 indicate an error in the postprocessing task.
818 """
5f6a1245 819
7851b379 820 def __init__(self, msg):
bf5b9d85 821 super(PostProcessingError, self).__init__(msg)
7851b379 822 self.msg = msg
d77c3dfd 823
5f6a1245 824
bf5b9d85 825class MaxDownloadsReached(YoutubeDLError):
59ae15a5
PH
826 """ --max-downloads limit has been reached. """
827 pass
d77c3dfd
FV
828
829
bf5b9d85 830class UnavailableVideoError(YoutubeDLError):
59ae15a5 831 """Unavailable Format exception.
d77c3dfd 832
59ae15a5
PH
833 This exception will be thrown when a video is requested
834 in a format that is not available for that video.
835 """
836 pass
d77c3dfd
FV
837
838
bf5b9d85 839class ContentTooShortError(YoutubeDLError):
59ae15a5 840 """Content Too Short exception.
d77c3dfd 841
59ae15a5
PH
842 This exception may be raised by FileDownloader objects when a file they
843 download is too small for what the server announced first, indicating
844 the connection was probably interrupted.
845 """
d77c3dfd 846
59ae15a5 847 def __init__(self, downloaded, expected):
bf5b9d85
PM
848 super(ContentTooShortError, self).__init__(
849 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
850 )
2c7ed247 851 # Both in bytes
59ae15a5
PH
852 self.downloaded = downloaded
853 self.expected = expected
d77c3dfd 854
5f6a1245 855
bf5b9d85 856class XAttrMetadataError(YoutubeDLError):
efa97bdc
YCH
857 def __init__(self, code=None, msg='Unknown error'):
858 super(XAttrMetadataError, self).__init__(msg)
859 self.code = code
bd264412 860 self.msg = msg
efa97bdc
YCH
861
862 # Parsing code and msg
863 if (self.code in (errno.ENOSPC, errno.EDQUOT) or
864 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
865 self.reason = 'NO_SPACE'
866 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
867 self.reason = 'VALUE_TOO_LONG'
868 else:
869 self.reason = 'NOT_SUPPORTED'
870
871
bf5b9d85 872class XAttrUnavailableError(YoutubeDLError):
efa97bdc
YCH
873 pass
874
875
c5a59d93 876def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
e5e78797
S
877 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
878 # expected HTTP responses to meet HTTP/1.0 or later (see also
879 # https://github.com/rg3/youtube-dl/issues/6727)
880 if sys.version_info < (3, 0):
65220c3b
S
881 kwargs['strict'] = True
882 hc = http_class(*args, **compat_kwargs(kwargs))
be4a824d
PH
883 source_address = ydl_handler._params.get('source_address')
884 if source_address is not None:
885 sa = (source_address, 0)
886 if hasattr(hc, 'source_address'): # Python 2.7+
887 hc.source_address = sa
888 else: # Python 2.6
889 def _hc_connect(self, *args, **kwargs):
890 sock = compat_socket_create_connection(
891 (self.host, self.port), self.timeout, sa)
892 if is_https:
d7932313
PH
893 self.sock = ssl.wrap_socket(
894 sock, self.key_file, self.cert_file,
895 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
896 else:
897 self.sock = sock
898 hc.connect = functools.partial(_hc_connect, hc)
899
900 return hc
901
902
87f0e62d 903def handle_youtubedl_headers(headers):
992fc9d6
YCH
904 filtered_headers = headers
905
906 if 'Youtubedl-no-compression' in filtered_headers:
907 filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
87f0e62d 908 del filtered_headers['Youtubedl-no-compression']
87f0e62d 909
992fc9d6 910 return filtered_headers
87f0e62d
YCH
911
912
acebc9cd 913class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
59ae15a5
PH
914 """Handler for HTTP requests and responses.
915
916 This class, when installed with an OpenerDirector, automatically adds
917 the standard headers to every HTTP request and handles gzipped and
918 deflated responses from web servers. If compression is to be avoided in
919 a particular request, the original request in the program code only has
0424ec30 920 to include the HTTP header "Youtubedl-no-compression", which will be
59ae15a5
PH
921 removed before making the real request.
922
923 Part of this code was copied from:
924
925 http://techknack.net/python-urllib2-handlers/
926
927 Andrew Rowls, the author of that code, agreed to release it to the
928 public domain.
929 """
930
be4a824d
PH
931 def __init__(self, params, *args, **kwargs):
932 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
933 self._params = params
934
935 def http_open(self, req):
71aff188
YCH
936 conn_class = compat_http_client.HTTPConnection
937
938 socks_proxy = req.headers.get('Ytdl-socks-proxy')
939 if socks_proxy:
940 conn_class = make_socks_conn_class(conn_class, socks_proxy)
941 del req.headers['Ytdl-socks-proxy']
942
be4a824d 943 return self.do_open(functools.partial(
71aff188 944 _create_http_connection, self, conn_class, False),
be4a824d
PH
945 req)
946
59ae15a5
PH
947 @staticmethod
948 def deflate(data):
949 try:
950 return zlib.decompress(data, -zlib.MAX_WBITS)
951 except zlib.error:
952 return zlib.decompress(data)
953
acebc9cd 954 def http_request(self, req):
51f267d9
S
955 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
956 # always respected by websites, some tend to give out URLs with non percent-encoded
957 # non-ASCII characters (see telemb.py, ard.py [#3412])
958 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
959 # To work around aforementioned issue we will replace request's original URL with
960 # percent-encoded one
961 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
962 # the code of this workaround has been moved here from YoutubeDL.urlopen()
963 url = req.get_full_url()
964 url_escaped = escape_url(url)
965
966 # Substitute URL if any change after escaping
967 if url != url_escaped:
15d260eb 968 req = update_Request(req, url=url_escaped)
51f267d9 969
33ac271b 970 for h, v in std_headers.items():
3d5f7a39
JK
971 # Capitalization is needed because of Python bug 2275: http://bugs.python.org/issue2275
972 # (urllib capitalizes the dict keys because of this bug)
973 if h.capitalize() not in req.headers:
33ac271b 974 req.add_header(h, v)
87f0e62d
YCH
975
976 req.headers = handle_youtubedl_headers(req.headers)
989b4b2b
PH
977
978 if sys.version_info < (2, 7) and '#' in req.get_full_url():
979 # Python 2.6 is brain-dead when it comes to fragments
980 req._Request__original = req._Request__original.partition('#')[0]
981 req._Request__r_type = req._Request__r_type.partition('#')[0]
982
59ae15a5
PH
983 return req
984
acebc9cd 985 def http_response(self, req, resp):
59ae15a5
PH
986 old_resp = resp
987 # gzip
988 if resp.headers.get('Content-encoding', '') == 'gzip':
aa3e9507
PH
989 content = resp.read()
990 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
991 try:
992 uncompressed = io.BytesIO(gz.read())
993 except IOError as original_ioerror:
994 # There may be junk at the end of the file
995 # See http://stackoverflow.com/q/4928560/35070 for details
996 for i in range(1, 1024):
997 try:
998 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
999 uncompressed = io.BytesIO(gz.read())
1000 except IOError:
1001 continue
1002 break
1003 else:
1004 raise original_ioerror
b407d853 1005 resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 1006 resp.msg = old_resp.msg
c047270c 1007 del resp.headers['Content-encoding']
59ae15a5
PH
1008 # deflate
1009 if resp.headers.get('Content-encoding', '') == 'deflate':
1010 gz = io.BytesIO(self.deflate(resp.read()))
b407d853 1011 resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 1012 resp.msg = old_resp.msg
c047270c 1013 del resp.headers['Content-encoding']
ad729172
S
1014 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1015 # https://github.com/rg3/youtube-dl/issues/6457).
5a4d9ddb
S
1016 if 300 <= resp.code < 400:
1017 location = resp.headers.get('Location')
1018 if location:
1019 # As per RFC 2616, the default charset is iso-8859-1, which Python 3 respects
1020 if sys.version_info >= (3, 0):
1021 location = location.encode('iso-8859-1').decode('utf-8')
0ea59007
YCH
1022 else:
1023 location = location.decode('utf-8')
5a4d9ddb
S
1024 location_escaped = escape_url(location)
1025 if location != location_escaped:
1026 del resp.headers['Location']
9a4aec8b
YCH
1027 if sys.version_info < (3, 0):
1028 location_escaped = location_escaped.encode('utf-8')
5a4d9ddb 1029 resp.headers['Location'] = location_escaped
59ae15a5 1030 return resp
0f8d03f8 1031
acebc9cd
PH
1032 https_request = http_request
1033 https_response = http_response
bf50b038 1034
5de90176 1035
71aff188
YCH
1036def make_socks_conn_class(base_class, socks_proxy):
1037 assert issubclass(base_class, (
1038 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1039
1040 url_components = compat_urlparse.urlparse(socks_proxy)
1041 if url_components.scheme.lower() == 'socks5':
1042 socks_type = ProxyType.SOCKS5
1043 elif url_components.scheme.lower() in ('socks', 'socks4'):
1044 socks_type = ProxyType.SOCKS4
51fb4995
YCH
1045 elif url_components.scheme.lower() == 'socks4a':
1046 socks_type = ProxyType.SOCKS4A
71aff188 1047
cdd94c2e
YCH
1048 def unquote_if_non_empty(s):
1049 if not s:
1050 return s
1051 return compat_urllib_parse_unquote_plus(s)
1052
71aff188
YCH
1053 proxy_args = (
1054 socks_type,
1055 url_components.hostname, url_components.port or 1080,
1056 True, # Remote DNS
cdd94c2e
YCH
1057 unquote_if_non_empty(url_components.username),
1058 unquote_if_non_empty(url_components.password),
71aff188
YCH
1059 )
1060
1061 class SocksConnection(base_class):
1062 def connect(self):
1063 self.sock = sockssocket()
1064 self.sock.setproxy(*proxy_args)
1065 if type(self.timeout) in (int, float):
1066 self.sock.settimeout(self.timeout)
1067 self.sock.connect((self.host, self.port))
1068
1069 if isinstance(self, compat_http_client.HTTPSConnection):
1070 if hasattr(self, '_context'): # Python > 2.6
1071 self.sock = self._context.wrap_socket(
1072 self.sock, server_hostname=self.host)
1073 else:
1074 self.sock = ssl.wrap_socket(self.sock)
1075
1076 return SocksConnection
1077
1078
be4a824d
PH
1079class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1080 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1081 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1082 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1083 self._params = params
1084
1085 def https_open(self, req):
4f264c02 1086 kwargs = {}
71aff188
YCH
1087 conn_class = self._https_conn_class
1088
4f264c02
JMF
1089 if hasattr(self, '_context'): # python > 2.6
1090 kwargs['context'] = self._context
1091 if hasattr(self, '_check_hostname'): # python 3.x
1092 kwargs['check_hostname'] = self._check_hostname
71aff188
YCH
1093
1094 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1095 if socks_proxy:
1096 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1097 del req.headers['Ytdl-socks-proxy']
1098
be4a824d 1099 return self.do_open(functools.partial(
71aff188 1100 _create_http_connection, self, conn_class, True),
4f264c02 1101 req, **kwargs)
be4a824d
PH
1102
1103
a6420bf5
S
1104class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1105 def __init__(self, cookiejar=None):
1106 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1107
1108 def http_response(self, request, response):
1109 # Python 2 will choke on the next HTTP request in a row if there are non-ASCII
1110 # characters in Set-Cookie HTTP header of last response (see
1111 # https://github.com/rg3/youtube-dl/issues/6769).
1112 # In order to at least prevent crashing we will percent encode Set-Cookie
1113 # header before HTTPCookieProcessor starts processing it.
e28034c5
S
1114 # if sys.version_info < (3, 0) and response.headers:
1115 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1116 # set_cookie = response.headers.get(set_cookie_header)
1117 # if set_cookie:
1118 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1119 # if set_cookie != set_cookie_escaped:
1120 # del response.headers[set_cookie_header]
1121 # response.headers[set_cookie_header] = set_cookie_escaped
a6420bf5
S
1122 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1123
1124 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1125 https_response = http_response
1126
1127
46f59e89
S
1128def extract_timezone(date_str):
1129 m = re.search(
1130 r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
1131 date_str)
1132 if not m:
1133 timezone = datetime.timedelta()
1134 else:
1135 date_str = date_str[:-len(m.group('tz'))]
1136 if not m.group('sign'):
1137 timezone = datetime.timedelta()
1138 else:
1139 sign = 1 if m.group('sign') == '+' else -1
1140 timezone = datetime.timedelta(
1141 hours=sign * int(m.group('hours')),
1142 minutes=sign * int(m.group('minutes')))
1143 return timezone, date_str
1144
1145
08b38d54 1146def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1147 """ Return a UNIX timestamp from the given date """
1148
1149 if date_str is None:
1150 return None
1151
52c3a6e4
S
1152 date_str = re.sub(r'\.[0-9]+', '', date_str)
1153
08b38d54 1154 if timezone is None:
46f59e89
S
1155 timezone, date_str = extract_timezone(date_str)
1156
52c3a6e4
S
1157 try:
1158 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1159 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1160 return calendar.timegm(dt.timetuple())
1161 except ValueError:
1162 pass
912b38b4
PH
1163
1164
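# Illustrative usage: the extracted timezone offset is subtracted before the
# timestamp is computed, e.g.
#     >>> parse_iso8601('1970-01-01T01:00:00+0100')
#     0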
46f59e89
S
1165def date_formats(day_first=True):
1166 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1167
1168
42bdd9d0 1169def unified_strdate(date_str, day_first=True):
bf50b038 1170 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1171
1172 if date_str is None:
1173 return None
bf50b038 1174 upload_date = None
5f6a1245 1175 # Replace commas
026fcc04 1176 date_str = date_str.replace(',', ' ')
42bdd9d0 1177 # Remove AM/PM + timezone
9bb8e0a3 1178 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1179 _, date_str = extract_timezone(date_str)
42bdd9d0 1180
46f59e89 1181 for expression in date_formats(day_first):
bf50b038
JMF
1182 try:
1183 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
5de90176 1184 except ValueError:
bf50b038 1185 pass
42393ce2
PH
1186 if upload_date is None:
1187 timetuple = email.utils.parsedate_tz(date_str)
1188 if timetuple:
c6b9cf05
S
1189 try:
1190 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1191 except ValueError:
1192 pass
6a750402
JMF
1193 if upload_date is not None:
1194 return compat_str(upload_date)
bf50b038 1195
5f6a1245 1196
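# Illustrative usage: several of the DATE_FORMATS above normalize to the same
# YYYYMMDD string, e.g.
#     >>> unified_strdate('December 21, 2016')
#     '20161221'
#     >>> unified_strdate('1968-12-10')
#     '19681210'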
46f59e89
S
1197def unified_timestamp(date_str, day_first=True):
1198 if date_str is None:
1199 return None
1200
2ae2ffda 1201 date_str = re.sub(r'[,|]', '', date_str)
46f59e89 1202
7dc2a74e 1203 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1204 timezone, date_str = extract_timezone(date_str)
1205
1206 # Remove AM/PM + timezone
1207 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1208
deef3195
S
1209 # Remove unrecognized timezones from ISO 8601-like timestamps
1210 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1211 if m:
1212 date_str = date_str[:-len(m.group('tz'))]
1213
f226880c
PH
1214 # Python only supports microseconds, so remove nanoseconds
1215 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1216 if m:
1217 date_str = m.group(1)
1218
46f59e89
S
1219 for expression in date_formats(day_first):
1220 try:
7dc2a74e 1221 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89
S
1222 return calendar.timegm(dt.timetuple())
1223 except ValueError:
1224 pass
1225 timetuple = email.utils.parsedate_tz(date_str)
1226 if timetuple:
7dc2a74e 1227 return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1228
1229
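# Illustrative usage: unlike unified_strdate(), this returns a UNIX timestamp,
# with any recognized timezone offset applied, e.g.
#     >>> unified_timestamp('1970-01-02 00:00:00')
#     86400
#     >>> unified_timestamp('1970-01-01T01:00:00+0100')
#     0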
28e614de 1230def determine_ext(url, default_ext='unknown_video'):
f4776371
S
1231 if url is None:
1232 return default_ext
9cb9a5df 1233 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1234 if re.match(r'^[A-Za-z0-9]+$', guess):
1235 return guess
a7aaa398
S
1236 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1237 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1238 return guess.rstrip('/')
73e79f2a 1239 else:
cbdbb766 1240 return default_ext
73e79f2a 1241
5f6a1245 1242
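# Illustrative usage (hypothetical URLs): query strings and trailing slashes
# are tolerated when guessing the extension, e.g.
#     >>> determine_ext('http://example.com/video.mp4?dl=1')
#     'mp4'
#     >>> determine_ext('http://example.com/foo/bar.m3u8/?download')
#     'm3u8'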
d4051a8e 1243def subtitles_filename(filename, sub_lang, sub_format):
28e614de 1244 return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
d4051a8e 1245
5f6a1245 1246
bd558525 1247def date_from_str(date_str):
37254abc
JMF
1248 """
1249 Return a datetime object from a string in the format YYYYMMDD or
1250 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1251 today = datetime.date.today()
f8795e10 1252 if date_str in ('now', 'today'):
37254abc 1253 return today
f8795e10
PH
1254 if date_str == 'yesterday':
1255 return today - datetime.timedelta(days=1)
ec85ded8 1256 match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
37254abc
JMF
1257 if match is not None:
1258 sign = match.group('sign')
1259 time = int(match.group('time'))
1260 if sign == '-':
1261 time = -time
1262 unit = match.group('unit')
dfb1b146 1263 # A bad approximation?
37254abc
JMF
1264 if unit == 'month':
1265 unit = 'day'
1266 time *= 30
1267 elif unit == 'year':
1268 unit = 'day'
1269 time *= 365
1270 unit += 's'
1271 delta = datetime.timedelta(**{unit: time})
1272 return today + delta
611c1dd9 1273 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
5f6a1245
JW
1274
1275
e63fc1be 1276def hyphenate_date(date_str):
1277 """
1278 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1279 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1280 if match is not None:
1281 return '-'.join(match.groups())
1282 else:
1283 return date_str
1284
5f6a1245 1285
bd558525
JMF
1286class DateRange(object):
1287 """Represents a time interval between two dates"""
5f6a1245 1288
bd558525
JMF
1289 def __init__(self, start=None, end=None):
1290 """start and end must be strings in the format accepted by date"""
1291 if start is not None:
1292 self.start = date_from_str(start)
1293 else:
1294 self.start = datetime.datetime.min.date()
1295 if end is not None:
1296 self.end = date_from_str(end)
1297 else:
1298 self.end = datetime.datetime.max.date()
37254abc 1299 if self.start > self.end:
bd558525 1300 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
5f6a1245 1301
bd558525
JMF
1302 @classmethod
1303 def day(cls, day):
1304 """Returns a range that only contains the given day"""
5f6a1245
JW
1305 return cls(day, day)
1306
bd558525
JMF
1307 def __contains__(self, date):
1308 """Check if the date is in the range"""
37254abc
JMF
1309 if not isinstance(date, datetime.date):
1310 date = date_from_str(date)
1311 return self.start <= date <= self.end
5f6a1245 1312
bd558525 1313 def __str__(self):
5f6a1245 1314 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1315
1316
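# Illustrative usage: DateRange accepts the same strings as date_from_str(), e.g.
#     >>> '20180115' in DateRange('20180101', '20180131')
#     True
#     >>> '20180201' in DateRange('20180101', '20180131')
#     False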
1317def platform_name():
1318 """ Returns the platform name as a compat_str """
1319 res = platform.platform()
1320 if isinstance(res, bytes):
1321 res = res.decode(preferredencoding())
1322
1323 assert isinstance(res, compat_str)
1324 return res
c257baff
PH
1325
1326
b58ddb32
PH
1327def _windows_write_string(s, out):
1328 """ Returns True if the string was written using special methods,
1329 False if it has yet to be written out."""
1330 # Adapted from http://stackoverflow.com/a/3259271/35070
1331
1332 import ctypes
1333 import ctypes.wintypes
1334
1335 WIN_OUTPUT_IDS = {
1336 1: -11,
1337 2: -12,
1338 }
1339
a383a98a
PH
1340 try:
1341 fileno = out.fileno()
1342 except AttributeError:
1343 # If the output stream doesn't have a fileno, it's virtual
1344 return False
aa42e873
PH
1345 except io.UnsupportedOperation:
1346 # Some strange Windows pseudo files?
1347 return False
b58ddb32
PH
1348 if fileno not in WIN_OUTPUT_IDS:
1349 return False
1350
d7cd9a9e 1351 GetStdHandle = compat_ctypes_WINFUNCTYPE(
b58ddb32 1352 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
d7cd9a9e 1353 ('GetStdHandle', ctypes.windll.kernel32))
b58ddb32
PH
1354 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1355
d7cd9a9e 1356 WriteConsoleW = compat_ctypes_WINFUNCTYPE(
b58ddb32
PH
1357 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1358 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
d7cd9a9e 1359 ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
b58ddb32
PH
1360 written = ctypes.wintypes.DWORD(0)
1361
d7cd9a9e 1362 GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
b58ddb32
PH
1363 FILE_TYPE_CHAR = 0x0002
1364 FILE_TYPE_REMOTE = 0x8000
d7cd9a9e 1365 GetConsoleMode = compat_ctypes_WINFUNCTYPE(
b58ddb32
PH
1366 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1367 ctypes.POINTER(ctypes.wintypes.DWORD))(
d7cd9a9e 1368 ('GetConsoleMode', ctypes.windll.kernel32))
b58ddb32
PH
1369 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1370
1371 def not_a_console(handle):
1372 if handle == INVALID_HANDLE_VALUE or handle is None:
1373 return True
8fb3ac36
PH
1374 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1375 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
b58ddb32
PH
1376
1377 if not_a_console(h):
1378 return False
1379
d1b9c912
PH
1380 def next_nonbmp_pos(s):
1381 try:
1382 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1383 except StopIteration:
1384 return len(s)
1385
1386 while s:
1387 count = min(next_nonbmp_pos(s), 1024)
1388
b58ddb32 1389 ret = WriteConsoleW(
d1b9c912 1390 h, s, count if count else 2, ctypes.byref(written), None)
b58ddb32
PH
1391 if ret == 0:
1392 raise OSError('Failed to write string')
d1b9c912
PH
1393 if not count: # We just wrote a non-BMP character
1394 assert written.value == 2
1395 s = s[1:]
1396 else:
1397 assert written.value > 0
1398 s = s[written.value:]
b58ddb32
PH
1399 return True
1400
1401
734f90bb 1402def write_string(s, out=None, encoding=None):
7459e3a2
PH
1403 if out is None:
1404 out = sys.stderr
8bf48f23 1405 assert type(s) == compat_str
7459e3a2 1406
b58ddb32
PH
1407 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1408 if _windows_write_string(s, out):
1409 return
1410
7459e3a2
PH
1411 if ('b' in getattr(out, 'mode', '') or
1412 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
104aa738
PH
1413 byt = s.encode(encoding or preferredencoding(), 'ignore')
1414 out.write(byt)
1415 elif hasattr(out, 'buffer'):
1416 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1417 byt = s.encode(enc, 'ignore')
1418 out.buffer.write(byt)
1419 else:
8bf48f23 1420 out.write(s)
7459e3a2
PH
1421 out.flush()
1422
1423
48ea9cea
PH
1424def bytes_to_intlist(bs):
1425 if not bs:
1426 return []
1427 if isinstance(bs[0], int): # Python 3
1428 return list(bs)
1429 else:
1430 return [ord(c) for c in bs]
1431
c257baff 1432
cba892fa 1433def intlist_to_bytes(xs):
1434 if not xs:
1435 return b''
edaa23f8 1436 return compat_struct_pack('%dB' % len(xs), *xs)
c38b1e77
PH
1437
1438
c1c9a79c
PH
1439# Cross-platform file locking
1440if sys.platform == 'win32':
1441 import ctypes.wintypes
1442 import msvcrt
1443
1444 class OVERLAPPED(ctypes.Structure):
1445 _fields_ = [
1446 ('Internal', ctypes.wintypes.LPVOID),
1447 ('InternalHigh', ctypes.wintypes.LPVOID),
1448 ('Offset', ctypes.wintypes.DWORD),
1449 ('OffsetHigh', ctypes.wintypes.DWORD),
1450 ('hEvent', ctypes.wintypes.HANDLE),
1451 ]
1452
1453 kernel32 = ctypes.windll.kernel32
1454 LockFileEx = kernel32.LockFileEx
1455 LockFileEx.argtypes = [
1456 ctypes.wintypes.HANDLE, # hFile
1457 ctypes.wintypes.DWORD, # dwFlags
1458 ctypes.wintypes.DWORD, # dwReserved
1459 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1460 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1461 ctypes.POINTER(OVERLAPPED) # Overlapped
1462 ]
1463 LockFileEx.restype = ctypes.wintypes.BOOL
1464 UnlockFileEx = kernel32.UnlockFileEx
1465 UnlockFileEx.argtypes = [
1466 ctypes.wintypes.HANDLE, # hFile
1467 ctypes.wintypes.DWORD, # dwReserved
1468 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1469 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1470 ctypes.POINTER(OVERLAPPED) # Overlapped
1471 ]
1472 UnlockFileEx.restype = ctypes.wintypes.BOOL
1473 whole_low = 0xffffffff
1474 whole_high = 0x7fffffff
1475
1476 def _lock_file(f, exclusive):
1477 overlapped = OVERLAPPED()
1478 overlapped.Offset = 0
1479 overlapped.OffsetHigh = 0
1480 overlapped.hEvent = 0
1481 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1482 handle = msvcrt.get_osfhandle(f.fileno())
1483 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1484 whole_low, whole_high, f._lock_file_overlapped_p):
1485 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1486
1487 def _unlock_file(f):
1488 assert f._lock_file_overlapped_p
1489 handle = msvcrt.get_osfhandle(f.fileno())
1490 if not UnlockFileEx(handle, 0,
1491 whole_low, whole_high, f._lock_file_overlapped_p):
1492 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1493
1494else:
399a76e6
YCH
1495 # Some platforms, such as Jython, are missing fcntl
1496 try:
1497 import fcntl
c1c9a79c 1498
399a76e6
YCH
1499 def _lock_file(f, exclusive):
1500 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
c1c9a79c 1501
399a76e6
YCH
1502 def _unlock_file(f):
1503 fcntl.flock(f, fcntl.LOCK_UN)
1504 except ImportError:
1505 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1506
1507 def _lock_file(f, exclusive):
1508 raise IOError(UNSUPPORTED_MSG)
1509
1510 def _unlock_file(f):
1511 raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
1512
1513
1514class locked_file(object):
1515 def __init__(self, filename, mode, encoding=None):
1516 assert mode in ['r', 'a', 'w']
1517 self.f = io.open(filename, mode, encoding=encoding)
1518 self.mode = mode
1519
1520 def __enter__(self):
1521 exclusive = self.mode != 'r'
1522 try:
1523 _lock_file(self.f, exclusive)
1524 except IOError:
1525 self.f.close()
1526 raise
1527 return self
1528
1529 def __exit__(self, etype, value, traceback):
1530 try:
1531 _unlock_file(self.f)
1532 finally:
1533 self.f.close()
1534
1535 def __iter__(self):
1536 return iter(self.f)
1537
1538 def write(self, *args):
1539 return self.f.write(*args)
1540
1541 def read(self, *args):
1542 return self.f.read(*args)
4eb7f1d1
JMF
1543
1544
4644ac55
S
1545def get_filesystem_encoding():
1546 encoding = sys.getfilesystemencoding()
1547 return encoding if encoding is not None else 'utf-8'
1548
1549
4eb7f1d1 1550def shell_quote(args):
a6a173c2 1551 quoted_args = []
4644ac55 1552 encoding = get_filesystem_encoding()
a6a173c2
JMF
1553 for a in args:
1554 if isinstance(a, bytes):
1555 # We may get a filename encoded with 'encodeFilename'
1556 a = a.decode(encoding)
aefce8e6 1557 quoted_args.append(compat_shlex_quote(a))
28e614de 1558 return ' '.join(quoted_args)
9d4660ca
PH
1559
1560
1561def smuggle_url(url, data):
1562 """ Pass additional data in a URL for internal use. """
1563
81953d1a
RA
1564 url, idata = unsmuggle_url(url, {})
1565 data.update(idata)
15707c7e 1566 sdata = compat_urllib_parse_urlencode(
28e614de
PH
1567 {'__youtubedl_smuggle': json.dumps(data)})
1568 return url + '#' + sdata
9d4660ca
PH
1569
1570
79f82953 1571def unsmuggle_url(smug_url, default=None):
83e865a3 1572 if '#__youtubedl_smuggle' not in smug_url:
79f82953 1573 return smug_url, default
28e614de
PH
1574 url, _, sdata = smug_url.rpartition('#')
1575 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
1576 data = json.loads(jsond)
1577 return url, data
02dbf93f
PH
1578
1579
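# Illustrative usage (hypothetical URL): data smuggled into the fragment
# round-trips through unsmuggle_url():
#     >>> url = smuggle_url('http://example.com/video', {'referrer': 'http://example.com/'})
#     >>> unsmuggle_url(url)
#     ('http://example.com/video', {'referrer': 'http://example.com/'})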
02dbf93f
PH
1580def format_bytes(bytes):
1581 if bytes is None:
28e614de 1582 return 'N/A'
02dbf93f
PH
1583 if type(bytes) is str:
1584 bytes = float(bytes)
1585 if bytes == 0.0:
1586 exponent = 0
1587 else:
1588 exponent = int(math.log(bytes, 1024.0))
28e614de 1589 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
02dbf93f 1590 converted = float(bytes) / float(1024 ** exponent)
28e614de 1591 return '%.2f%s' % (converted, suffix)
f53c966a 1592
1c088fa8 1593
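# Illustrative usage: sizes are rendered with binary (1024-based) suffixes, e.g.
# format_bytes(1536) -> '1.50KiB' and format_bytes(3 * 1024 ** 2) -> '3.00MiB'.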
fb47597b
S
1594def lookup_unit_table(unit_table, s):
1595 units_re = '|'.join(re.escape(u) for u in unit_table)
1596 m = re.match(
782b1b5b 1597 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
fb47597b
S
1598 if not m:
1599 return None
1600 num_str = m.group('num').replace(',', '.')
1601 mult = unit_table[m.group('unit')]
1602 return int(float(num_str) * mult)
1603
1604
be64b5b0
PH
1605def parse_filesize(s):
1606 if s is None:
1607 return None
1608
dfb1b146 1609 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
1610 # but we support those too
1611 _UNIT_TABLE = {
1612 'B': 1,
1613 'b': 1,
70852b47 1614 'bytes': 1,
be64b5b0
PH
1615 'KiB': 1024,
1616 'KB': 1000,
1617 'kB': 1024,
1618 'Kb': 1000,
13585d76 1619 'kb': 1000,
70852b47
YCH
1620 'kilobytes': 1000,
1621 'kibibytes': 1024,
be64b5b0
PH
1622 'MiB': 1024 ** 2,
1623 'MB': 1000 ** 2,
1624 'mB': 1024 ** 2,
1625 'Mb': 1000 ** 2,
13585d76 1626 'mb': 1000 ** 2,
70852b47
YCH
1627 'megabytes': 1000 ** 2,
1628 'mebibytes': 1024 ** 2,
be64b5b0
PH
1629 'GiB': 1024 ** 3,
1630 'GB': 1000 ** 3,
1631 'gB': 1024 ** 3,
1632 'Gb': 1000 ** 3,
13585d76 1633 'gb': 1000 ** 3,
70852b47
YCH
1634 'gigabytes': 1000 ** 3,
1635 'gibibytes': 1024 ** 3,
be64b5b0
PH
1636 'TiB': 1024 ** 4,
1637 'TB': 1000 ** 4,
1638 'tB': 1024 ** 4,
1639 'Tb': 1000 ** 4,
13585d76 1640 'tb': 1000 ** 4,
70852b47
YCH
1641 'terabytes': 1000 ** 4,
1642 'tebibytes': 1024 ** 4,
be64b5b0
PH
1643 'PiB': 1024 ** 5,
1644 'PB': 1000 ** 5,
1645 'pB': 1024 ** 5,
1646 'Pb': 1000 ** 5,
13585d76 1647 'pb': 1000 ** 5,
70852b47
YCH
1648 'petabytes': 1000 ** 5,
1649 'pebibytes': 1024 ** 5,
be64b5b0
PH
1650 'EiB': 1024 ** 6,
1651 'EB': 1000 ** 6,
1652 'eB': 1024 ** 6,
1653 'Eb': 1000 ** 6,
13585d76 1654 'eb': 1000 ** 6,
70852b47
YCH
1655 'exabytes': 1000 ** 6,
1656 'exbibytes': 1024 ** 6,
be64b5b0
PH
1657 'ZiB': 1024 ** 7,
1658 'ZB': 1000 ** 7,
1659 'zB': 1024 ** 7,
1660 'Zb': 1000 ** 7,
13585d76 1661 'zb': 1000 ** 7,
70852b47
YCH
1662 'zettabytes': 1000 ** 7,
1663 'zebibytes': 1024 ** 7,
be64b5b0
PH
1664 'YiB': 1024 ** 8,
1665 'YB': 1000 ** 8,
1666 'yB': 1024 ** 8,
1667 'Yb': 1000 ** 8,
13585d76 1668 'yb': 1000 ** 8,
70852b47
YCH
1669 'yottabytes': 1000 ** 8,
1670 'yobibytes': 1024 ** 8,
be64b5b0
PH
1671 }
1672
fb47597b
S
1673 return lookup_unit_table(_UNIT_TABLE, s)
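# Illustrative values for parse_filesize() above (added examples, not original):
#
#   parse_filesize('1.5 GiB') == 1610612736   # binary prefix: 1.5 * 1024 ** 3
#   parse_filesize('500 MB')  == 500000000    # decimal prefix
#   parse_filesize('bogus')   is None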
1674
1675
1676def parse_count(s):
1677 if s is None:
be64b5b0
PH
1678 return None
1679
fb47597b
S
1680 s = s.strip()
1681
1682 if re.match(r'^[\d,.]+$', s):
1683 return str_to_int(s)
1684
1685 _UNIT_TABLE = {
1686 'k': 1000,
1687 'K': 1000,
1688 'm': 1000 ** 2,
1689 'M': 1000 ** 2,
1690 'kk': 1000 ** 2,
1691 'KK': 1000 ** 2,
1692 }
be64b5b0 1693
fb47597b 1694 return lookup_unit_table(_UNIT_TABLE, s)
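# Illustrative values for parse_count() above (added examples, not original):
#
#   parse_count('1.2M')   == 1200000
#   parse_count('15,346') == 15346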
be64b5b0 1695
2f7ae819 1696
b871d7e9
S
1697def parse_resolution(s):
1698 if s is None:
1699 return {}
1700
1701 mobj = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
1702 if mobj:
1703 return {
1704 'width': int(mobj.group('w')),
1705 'height': int(mobj.group('h')),
1706 }
1707
1708 mobj = re.search(r'\b(\d+)[pPiI]\b', s)
1709 if mobj:
1710 return {'height': int(mobj.group(1))}
1711
1712 mobj = re.search(r'\b([48])[kK]\b', s)
1713 if mobj:
1714 return {'height': int(mobj.group(1)) * 540}
1715
1716 return {}
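# Illustrative values for parse_resolution() above (added examples, not original):
#
#   parse_resolution('1920x1080') == {'width': 1920, 'height': 1080}
#   parse_resolution('720p')      == {'height': 720}
#   parse_resolution('4K')        == {'height': 2160}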
1717
1718
a942d6cb 1719def month_by_name(name, lang='en'):
caefb1de
PH
 1720 """ Return the number of a month by (locale-independently) name in the given language """
1721
f6717dec 1722 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 1723
caefb1de 1724 try:
f6717dec 1725 return month_names.index(name) + 1
7105440c
YCH
1726 except ValueError:
1727 return None
1728
1729
1730def month_by_abbreviation(abbrev):
1731 """ Return the number of a month by (locale-independently) English
 1732 abbreviation """
1733
1734 try:
1735 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
1736 except ValueError:
1737 return None
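# Illustrative values for the month helpers above (added examples, not original):
#
#   month_by_name('May')            == 5
#   month_by_name('décembre', 'fr') == 12
#   month_by_abbreviation('Dec')    == 12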
18258362
JMF
1738
1739
5aafe895 1740def fix_xml_ampersands(xml_str):
18258362 1741 """Replace all stray '&' with '&amp;' in XML (existing entities are left intact)"""
5aafe895
PH
1742 return re.sub(
1743 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 1744 '&amp;',
5aafe895 1745 xml_str)
e3946f98
PH
1746
1747
1748def setproctitle(title):
8bf48f23 1749 assert isinstance(title, compat_str)
c1c05c67
YCH
1750
1751 # ctypes in Jython is not complete
1752 # http://bugs.jython.org/issue2148
1753 if sys.platform.startswith('java'):
1754 return
1755
e3946f98 1756 try:
611c1dd9 1757 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
1758 except OSError:
1759 return
2f49bcd6
RC
1760 except TypeError:
1761 # LoadLibrary in Windows Python 2.7.13 only expects
1762 # a bytestring, but since unicode_literals turns
1763 # every string into a unicode string, it fails.
1764 return
6eefe533
PH
1765 title_bytes = title.encode('utf-8')
1766 buf = ctypes.create_string_buffer(len(title_bytes))
1767 buf.value = title_bytes
e3946f98 1768 try:
6eefe533 1769 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
1770 except AttributeError:
1771 return # Strange libc, just skip this
d7dda168
PH
1772
1773
1774def remove_start(s, start):
46bc9b7d 1775 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
1776
1777
2b9faf55 1778def remove_end(s, end):
46bc9b7d 1779 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
1780
1781
31b2051e
S
1782def remove_quotes(s):
1783 if s is None or len(s) < 2:
1784 return s
1785 for quote in ('"', "'", ):
1786 if s[0] == quote and s[-1] == quote:
1787 return s[1:-1]
1788 return s
1789
1790
29eb5174 1791def url_basename(url):
9b8aaeed 1792 path = compat_urlparse.urlparse(url).path
28e614de 1793 return path.strip('/').split('/')[-1]
aa94a6d3
PH
1794
1795
02dc0a36
S
1796def base_url(url):
1797 return re.match(r'https?://[^?#&]+/', url).group()
1798
1799
e34c3361 1800def urljoin(base, path):
4b5de77b
S
1801 if isinstance(path, bytes):
1802 path = path.decode('utf-8')
e34c3361
S
1803 if not isinstance(path, compat_str) or not path:
1804 return None
b0c65c67 1805 if re.match(r'^(?:https?:)?//', path):
e34c3361 1806 return path
4b5de77b
S
1807 if isinstance(base, bytes):
1808 base = base.decode('utf-8')
1809 if not isinstance(base, compat_str) or not re.match(
1810 r'^(?:https?:)?//', base):
e34c3361
S
1811 return None
1812 return compat_urlparse.urljoin(base, path)
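# Illustrative values for the URL helpers above (added examples, not original):
#
#   url_basename('https://example.com/path/video.mp4?t=10')      == 'video.mp4'
#   base_url('https://example.com/a/b/c.html')                   == 'https://example.com/a/b/'
#   urljoin('https://example.com/a/', '//cdn.example.com/x.mp4') == '//cdn.example.com/x.mp4'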
1813
1814
aa94a6d3
PH
1815class HEADRequest(compat_urllib_request.Request):
1816 def get_method(self):
611c1dd9 1817 return 'HEAD'
7217e148
PH
1818
1819
95cf60e8
S
1820class PUTRequest(compat_urllib_request.Request):
1821 def get_method(self):
1822 return 'PUT'
1823
1824
9732d77e 1825def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
28746fbd
PH
1826 if get_attr:
1827 if v is not None:
1828 v = getattr(v, get_attr, None)
9572013d
PH
1829 if v == '':
1830 v = None
1812afb7
S
1831 if v is None:
1832 return default
1833 try:
1834 return int(v) * invscale // scale
1835 except ValueError:
af98f8ff 1836 return default
9732d77e 1837
9572013d 1838
40a90862
JMF
1839def str_or_none(v, default=None):
1840 return default if v is None else compat_str(v)
1841
9732d77e
PH
1842
1843def str_to_int(int_str):
48d4681e 1844 """ A more relaxed version of int_or_none """
9732d77e
PH
1845 if int_str is None:
1846 return None
28e614de 1847 int_str = re.sub(r'[,\.\+]', '', int_str)
9732d77e 1848 return int(int_str)
608d11f5
PH
1849
1850
9732d77e 1851def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
1852 if v is None:
1853 return default
1854 try:
1855 return float(v) * invscale / scale
1856 except ValueError:
1857 return default
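# Illustrative values for the numeric coercion helpers above (added examples,
# not original):
#
#   int_or_none('42')                   == 42
#   int_or_none('', default=0)          == 0
#   str_to_int('1,234,567')             == 1234567
#   float_or_none('2.5', invscale=1000) == 2500.0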
43f775e4
PH
1858
1859
c7e327c4
S
1860def bool_or_none(v, default=None):
1861 return v if isinstance(v, bool) else default
1862
1863
b72b4431
S
1864def strip_or_none(v):
1865 return None if v is None else v.strip()
1866
1867
608d11f5 1868def parse_duration(s):
8f9312c3 1869 if not isinstance(s, compat_basestring):
608d11f5
PH
1870 return None
1871
ca7b3246
S
1872 s = s.strip()
1873
acaff495 1874 days, hours, mins, secs, ms = [None] * 5
15846398 1875 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
acaff495 1876 if m:
1877 days, hours, mins, secs, ms = m.groups()
1878 else:
1879 m = re.match(
056653bb
S
1880 r'''(?ix)(?:P?
1881 (?:
1882 [0-9]+\s*y(?:ears?)?\s*
1883 )?
1884 (?:
1885 [0-9]+\s*m(?:onths?)?\s*
1886 )?
1887 (?:
1888 [0-9]+\s*w(?:eeks?)?\s*
1889 )?
8f4b58d7 1890 (?:
acaff495 1891 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
8f4b58d7 1892 )?
056653bb 1893 T)?
acaff495 1894 (?:
1895 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1896 )?
1897 (?:
1898 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1899 )?
1900 (?:
1901 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 1902 )?Z?$''', s)
acaff495 1903 if m:
1904 days, hours, mins, secs, ms = m.groups()
1905 else:
15846398 1906 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 1907 if m:
1908 hours, mins = m.groups()
1909 else:
1910 return None
1911
1912 duration = 0
1913 if secs:
1914 duration += float(secs)
1915 if mins:
1916 duration += float(mins) * 60
1917 if hours:
1918 duration += float(hours) * 60 * 60
1919 if days:
1920 duration += float(days) * 24 * 60 * 60
1921 if ms:
1922 duration += float(ms)
1923 return duration
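# Illustrative values for parse_duration() above (added examples, not original):
#
#   parse_duration('1:30')       == 90.0
#   parse_duration('01:02:03.5') == 3723.5
#   parse_duration('3 min')      == 180.0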
91d7d0b3
JMF
1924
1925
e65e4c88 1926def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 1927 name, real_ext = os.path.splitext(filename)
e65e4c88
S
1928 return (
1929 '{0}.{1}{2}'.format(name, ext, real_ext)
1930 if not expected_real_ext or real_ext[1:] == expected_real_ext
1931 else '{0}.{1}'.format(filename, ext))
d70ad093
PH
1932
1933
b3ed15b7
S
1934def replace_extension(filename, ext, expected_real_ext=None):
1935 name, real_ext = os.path.splitext(filename)
1936 return '{0}.{1}'.format(
1937 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1938 ext)
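# Illustrative values for the extension helpers above (added examples, not original):
#
#   prepend_extension('video.mp4', 'temp') == 'video.temp.mp4'
#   replace_extension('video.mp4', 'mkv')  == 'video.mkv'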
1939
1940
d70ad093
PH
1941def check_executable(exe, args=[]):
1942 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1943 args can be a list of arguments for a short output (like -version) """
1944 try:
1945 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1946 except OSError:
1947 return False
1948 return exe
b7ab0590
PH
1949
1950
95807118 1951def get_exe_version(exe, args=['--version'],
cae97f65 1952 version_re=None, unrecognized='present'):
95807118
PH
1953 """ Returns the version of the specified executable,
1954 or False if the executable is not present """
1955 try:
b64d04c1
YCH
1956 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1957 # SIGTTOU if youtube-dl is run in the background.
1958 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
cae97f65 1959 out, _ = subprocess.Popen(
54116803 1960 [encodeArgument(exe)] + args,
00ca7552 1961 stdin=subprocess.PIPE,
95807118
PH
1962 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1963 except OSError:
1964 return False
cae97f65
PH
1965 if isinstance(out, bytes): # Python 2.x
1966 out = out.decode('ascii', 'ignore')
1967 return detect_exe_version(out, version_re, unrecognized)
1968
1969
1970def detect_exe_version(output, version_re=None, unrecognized='present'):
1971 assert isinstance(output, compat_str)
1972 if version_re is None:
1973 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1974 m = re.search(version_re, output)
95807118
PH
1975 if m:
1976 return m.group(1)
1977 else:
1978 return unrecognized
1979
1980
b7ab0590 1981class PagedList(object):
dd26ced1
PH
1982 def __len__(self):
1983 # This is only useful for tests
1984 return len(self.getslice())
1985
9c44d242
PH
1986
1987class OnDemandPagedList(PagedList):
6be08ce6 1988 def __init__(self, pagefunc, pagesize, use_cache=True):
9c44d242
PH
1989 self._pagefunc = pagefunc
1990 self._pagesize = pagesize
b95dc034
YCH
1991 self._use_cache = use_cache
1992 if use_cache:
1993 self._cache = {}
9c44d242 1994
b7ab0590
PH
1995 def getslice(self, start=0, end=None):
1996 res = []
1997 for pagenum in itertools.count(start // self._pagesize):
1998 firstid = pagenum * self._pagesize
1999 nextfirstid = pagenum * self._pagesize + self._pagesize
2000 if start >= nextfirstid:
2001 continue
2002
b95dc034
YCH
2003 page_results = None
2004 if self._use_cache:
2005 page_results = self._cache.get(pagenum)
2006 if page_results is None:
2007 page_results = list(self._pagefunc(pagenum))
2008 if self._use_cache:
2009 self._cache[pagenum] = page_results
b7ab0590
PH
2010
2011 startv = (
2012 start % self._pagesize
2013 if firstid <= start < nextfirstid
2014 else 0)
2015
2016 endv = (
2017 ((end - 1) % self._pagesize) + 1
2018 if (end is not None and firstid <= end <= nextfirstid)
2019 else None)
2020
2021 if startv != 0 or endv is not None:
2022 page_results = page_results[startv:endv]
2023 res.extend(page_results)
2024
 2025 # A little optimization - if the current page is not "full", i.e. does
 2026 # not contain page_size videos, then we can assume that this page
 2027 # is the last one - there are no more ids on further pages -
 2028 # i.e. no need to query again.
2029 if len(page_results) + startv < self._pagesize:
2030 break
2031
2032 # If we got the whole page, but the next page is not interesting,
2033 # break out early as well
2034 if end == nextfirstid:
2035 break
2036 return res
81c2f20b
PH
2037
2038
9c44d242
PH
2039class InAdvancePagedList(PagedList):
2040 def __init__(self, pagefunc, pagecount, pagesize):
2041 self._pagefunc = pagefunc
2042 self._pagecount = pagecount
2043 self._pagesize = pagesize
2044
2045 def getslice(self, start=0, end=None):
2046 res = []
2047 start_page = start // self._pagesize
2048 end_page = (
2049 self._pagecount if end is None else (end // self._pagesize + 1))
2050 skip_elems = start - start_page * self._pagesize
2051 only_more = None if end is None else end - start
2052 for pagenum in range(start_page, end_page):
2053 page = list(self._pagefunc(pagenum))
2054 if skip_elems:
2055 page = page[skip_elems:]
2056 skip_elems = None
2057 if only_more is not None:
2058 if len(page) < only_more:
2059 only_more -= len(page)
2060 else:
2061 page = page[:only_more]
2062 res.extend(page)
2063 break
2064 res.extend(page)
2065 return res
2066
2067
81c2f20b 2068def uppercase_escape(s):
676eb3f2 2069 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2070 return re.sub(
a612753d 2071 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2072 lambda m: unicode_escape(m.group(0))[0],
2073 s)
0fe2ff78
YCH
2074
2075
2076def lowercase_escape(s):
2077 unicode_escape = codecs.getdecoder('unicode_escape')
2078 return re.sub(
2079 r'\\u[0-9a-fA-F]{4}',
2080 lambda m: unicode_escape(m.group(0))[0],
2081 s)
b53466e1 2082
d05cfe06
S
2083
2084def escape_rfc3986(s):
2085 """Escape non-ASCII characters as suggested by RFC 3986"""
8f9312c3 2086 if sys.version_info < (3, 0) and isinstance(s, compat_str):
d05cfe06 2087 s = s.encode('utf-8')
ecc0c5ee 2088 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2089
2090
2091def escape_url(url):
2092 """Escape URL as suggested by RFC 3986"""
2093 url_parsed = compat_urllib_parse_urlparse(url)
2094 return url_parsed._replace(
efbed08d 2095 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
2096 path=escape_rfc3986(url_parsed.path),
2097 params=escape_rfc3986(url_parsed.params),
2098 query=escape_rfc3986(url_parsed.query),
2099 fragment=escape_rfc3986(url_parsed.fragment)
2100 ).geturl()
2101
62e609ab
PH
2102
2103def read_batch_urls(batch_fd):
2104 def fixup(url):
2105 if not isinstance(url, compat_str):
2106 url = url.decode('utf-8', 'replace')
28e614de 2107 BOM_UTF8 = '\xef\xbb\xbf'
62e609ab
PH
2108 if url.startswith(BOM_UTF8):
2109 url = url[len(BOM_UTF8):]
2110 url = url.strip()
2111 if url.startswith(('#', ';', ']')):
2112 return False
2113 return url
2114
2115 with contextlib.closing(batch_fd) as fd:
2116 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2117
2118
2119def urlencode_postdata(*args, **kargs):
15707c7e 2120 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2121
2122
38f9ef31 2123def update_url_query(url, query):
cacd9966
YCH
2124 if not query:
2125 return url
38f9ef31 2126 parsed_url = compat_urlparse.urlparse(url)
2127 qs = compat_parse_qs(parsed_url.query)
2128 qs.update(query)
2129 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 2130 query=compat_urllib_parse_urlencode(qs, True)))
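# Illustrative usage of update_url_query() above (added example, not original):
#
#   update_url_query('http://example.com/path?a=1', {'b': '2'})
#   # -> 'http://example.com/path?a=1&b=2' (parameter order may vary)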
16392824 2131
8e60dc75 2132
ed0291d1
S
2133def update_Request(req, url=None, data=None, headers={}, query={}):
2134 req_headers = req.headers.copy()
2135 req_headers.update(headers)
2136 req_data = data or req.data
2137 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
2138 req_get_method = req.get_method()
2139 if req_get_method == 'HEAD':
2140 req_type = HEADRequest
2141 elif req_get_method == 'PUT':
2142 req_type = PUTRequest
2143 else:
2144 req_type = compat_urllib_request.Request
ed0291d1
S
2145 new_req = req_type(
2146 req_url, data=req_data, headers=req_headers,
2147 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2148 if hasattr(req, 'timeout'):
2149 new_req.timeout = req.timeout
2150 return new_req
2151
2152
10c87c15 2153def _multipart_encode_impl(data, boundary):
0c265486
YCH
2154 content_type = 'multipart/form-data; boundary=%s' % boundary
2155
2156 out = b''
2157 for k, v in data.items():
2158 out += b'--' + boundary.encode('ascii') + b'\r\n'
2159 if isinstance(k, compat_str):
2160 k = k.encode('utf-8')
2161 if isinstance(v, compat_str):
2162 v = v.encode('utf-8')
2163 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2164 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 2165 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
2166 if boundary.encode('ascii') in content:
2167 raise ValueError('Boundary overlaps with data')
2168 out += content
2169
2170 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2171
2172 return out, content_type
2173
2174
2175def multipart_encode(data, boundary=None):
2176 '''
2177 Encode a dict to RFC 7578-compliant form-data
2178
2179 data:
2180 A dict where keys and values can be either Unicode or bytes-like
2181 objects.
2182 boundary:
 2183 If specified, it must be a Unicode object and is used as the boundary. Otherwise
2184 a random boundary is generated.
2185
2186 Reference: https://tools.ietf.org/html/rfc7578
2187 '''
2188 has_specified_boundary = boundary is not None
2189
2190 while True:
2191 if boundary is None:
2192 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2193
2194 try:
10c87c15 2195 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
2196 break
2197 except ValueError:
2198 if has_specified_boundary:
2199 raise
2200 boundary = None
2201
2202 return out, content_type
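# Illustrative usage of multipart_encode() above (added example, not original):
#
#   body, content_type = multipart_encode({'field': 'value'}, boundary='----xyz')
#   # content_type == 'multipart/form-data; boundary=----xyz'
#   # body is the RFC 7578 form-data payload as bytes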
2203
2204
86296ad2 2205def dict_get(d, key_or_keys, default=None, skip_false_values=True):
cbecc9b9
S
2206 if isinstance(key_or_keys, (list, tuple)):
2207 for key in key_or_keys:
86296ad2
S
2208 if key not in d or d[key] is None or skip_false_values and not d[key]:
2209 continue
2210 return d[key]
cbecc9b9
S
2211 return default
2212 return d.get(key_or_keys, default)
2213
2214
329ca3be 2215def try_get(src, getter, expected_type=None):
a32a9a7e
S
2216 if not isinstance(getter, (list, tuple)):
2217 getter = [getter]
2218 for get in getter:
2219 try:
2220 v = get(src)
2221 except (AttributeError, KeyError, TypeError, IndexError):
2222 pass
2223 else:
2224 if expected_type is None or isinstance(v, expected_type):
2225 return v
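# Illustrative values for dict_get()/try_get() above (added examples, not original):
#
#   dict_get({'a': None, 'b': ''}, ('a', 'b', 'c'), default='x')                    == 'x'
#   dict_get({'views': 0, 'likes': 3}, ('views', 'likes'), skip_false_values=False) == 0
#   try_get({'items': [{'id': 42}]}, lambda x: x['items'][0]['id'], int)            == 42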
329ca3be
S
2226
2227
6cc62232
S
2228def merge_dicts(*dicts):
2229 merged = {}
2230 for a_dict in dicts:
2231 for k, v in a_dict.items():
2232 if v is None:
2233 continue
2234 if (k not in merged or
2235 (isinstance(v, compat_str) and v and
2236 isinstance(merged[k], compat_str) and
2237 not merged[k])):
2238 merged[k] = v
2239 return merged
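# Illustrative usage of merge_dicts() above (added example, not original):
#
#   merge_dicts({'title': '', 'id': 'x1'}, {'title': 'Example', 'id': 'x2'})
#   # == {'title': 'Example', 'id': 'x1'} - later dicts only fill missing or empty values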
2240
2241
8e60dc75
S
2242def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2243 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2244
16392824 2245
a1a530b0
PH
2246US_RATINGS = {
2247 'G': 0,
2248 'PG': 10,
2249 'PG-13': 13,
2250 'R': 16,
2251 'NC': 18,
2252}
fac55558
PH
2253
2254
a8795327 2255TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
2256 'TV-Y': 0,
2257 'TV-Y7': 7,
2258 'TV-G': 0,
2259 'TV-PG': 0,
2260 'TV-14': 14,
2261 'TV-MA': 17,
a8795327
S
2262}
2263
2264
146c80e2 2265def parse_age_limit(s):
a8795327
S
2266 if type(s) == int:
2267 return s if 0 <= s <= 21 else None
2268 if not isinstance(s, compat_basestring):
d838b1bd 2269 return None
146c80e2 2270 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
2271 if m:
2272 return int(m.group('age'))
2273 if s in US_RATINGS:
2274 return US_RATINGS[s]
5a16c9d9 2275 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 2276 if m:
5a16c9d9 2277 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 2278 return None
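# Illustrative values for parse_age_limit() above (added examples, not original):
#
#   parse_age_limit(18)      == 18
#   parse_age_limit('18+')   == 18
#   parse_age_limit('PG-13') == 13
#   parse_age_limit('TV-MA') == 17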
146c80e2
S
2279
2280
fac55558 2281def strip_jsonp(code):
609a61e3 2282 return re.sub(
5552c9eb
YCH
2283 r'''(?sx)^
2284 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
2285 (?:\s*&&\s*(?P=func_name))?
2286 \s*\(\s*(?P<callback_data>.*)\);?
2287 \s*?(?://[^\n]*)*$''',
2288 r'\g<callback_data>', code)
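# Illustrative usage of strip_jsonp() above (added example, not original):
#
#   strip_jsonp('callback({"status": "ok"});') == '{"status": "ok"}'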
478c2c61
PH
2289
2290
e05f6939 2291def js_to_json(code):
4195096e
S
2292 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
2293 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2294 INTEGER_TABLE = (
2295 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2296 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2297 )
2298
e05f6939 2299 def fix_kv(m):
e7b6d122
PH
2300 v = m.group(0)
2301 if v in ('true', 'false', 'null'):
2302 return v
b3ee552e 2303 elif v.startswith('/*') or v.startswith('//') or v == ',':
bd1e4844 2304 return ""
2305
2306 if v[0] in ("'", '"'):
2307 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 2308 '"': '\\"',
bd1e4844 2309 "\\'": "'",
2310 '\\\n': '',
2311 '\\x': '\\u00',
2312 }.get(m.group(0), m.group(0)), v[1:-1])
2313
89ac4a19
S
2314 for regex, base in INTEGER_TABLE:
2315 im = re.match(regex, v)
2316 if im:
e4659b45 2317 i = int(im.group(1), base)
89ac4a19
S
2318 return '"%d":' % i if v.endswith(':') else '%d' % i
2319
e7b6d122 2320 return '"%s"' % v
e05f6939 2321
bd1e4844 2322 return re.sub(r'''(?sx)
2323 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2324 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4195096e 2325 {comment}|,(?={skip}[\]}}])|
c384d537 2326 (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
4195096e
S
2327 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
2328 [0-9]+(?={skip}:)
2329 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
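# Illustrative usage of js_to_json() above (added example, not original):
#
#   json.loads(js_to_json("{foo: 1, 'bar': 'baz', }")) == {'foo': 1, 'bar': 'baz'}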
e05f6939
PH
2330
2331
478c2c61
PH
2332def qualities(quality_ids):
2333 """ Get a numeric quality value out of a list of possible values """
2334 def q(qid):
2335 try:
2336 return quality_ids.index(qid)
2337 except ValueError:
2338 return -1
2339 return q
2340
acd69589
PH
2341
2342DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2343
a020a0dc
PH
2344
2345def limit_length(s, length):
2346 """ Add ellipses to overly long strings """
2347 if s is None:
2348 return None
2349 ELLIPSES = '...'
2350 if len(s) > length:
2351 return s[:length - len(ELLIPSES)] + ELLIPSES
2352 return s
48844745
PH
2353
2354
2355def version_tuple(v):
5f9b8394 2356 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
2357
2358
2359def is_outdated_version(version, limit, assume_new=True):
2360 if not version:
2361 return not assume_new
2362 try:
2363 return version_tuple(version) < version_tuple(limit)
2364 except ValueError:
2365 return not assume_new
732ea2f0
PH
2366
2367
2368def ytdl_is_updateable():
 2369 """ Returns whether youtube-dl can be updated with -U """
2370 from zipimport import zipimporter
2371
2372 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
7d4111ed
PH
2373
2374
2375def args_to_str(args):
2376 # Get a short string representation for a subprocess command
702ccf2d 2377 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
2378
2379
9b9c5355 2380def error_to_compat_str(err):
fdae2358
S
2381 err_str = str(err)
 2382 # On Python 2, the error byte string must be decoded with the proper
 2383 # encoding rather than ASCII
2384 if sys.version_info[0] < 3:
2385 err_str = err_str.decode(preferredencoding())
2386 return err_str
2387
2388
c460bdd5 2389def mimetype2ext(mt):
eb9ee194
S
2390 if mt is None:
2391 return None
2392
765ac263
JMF
2393 ext = {
2394 'audio/mp4': 'm4a',
6c33d24b
YCH
 2395 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 here
 2396 # as it's the most popular one
2397 'audio/mpeg': 'mp3',
765ac263
JMF
2398 }.get(mt)
2399 if ext is not None:
2400 return ext
2401
c460bdd5 2402 _, _, res = mt.rpartition('/')
6562d34a 2403 res = res.split(';')[0].strip().lower()
c460bdd5
PH
2404
2405 return {
f6861ec9 2406 '3gpp': '3gp',
cafcf657 2407 'smptett+xml': 'tt',
cafcf657 2408 'ttaf+xml': 'dfxp',
a0d8d704 2409 'ttml+xml': 'ttml',
f6861ec9 2410 'x-flv': 'flv',
a0d8d704 2411 'x-mp4-fragmented': 'mp4',
d4f05d47 2412 'x-ms-sami': 'sami',
a0d8d704 2413 'x-ms-wmv': 'wmv',
b4173f15
RA
2414 'mpegurl': 'm3u8',
2415 'x-mpegurl': 'm3u8',
2416 'vnd.apple.mpegurl': 'm3u8',
2417 'dash+xml': 'mpd',
b4173f15 2418 'f4m+xml': 'f4m',
f164b971 2419 'hds+xml': 'f4m',
e910fe2f 2420 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 2421 'quicktime': 'mov',
98ce1a3f 2422 'mp2t': 'ts',
c460bdd5
PH
2423 }.get(res, res)
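# Illustrative values for mimetype2ext() above (added examples, not original):
#
#   mimetype2ext('audio/mp4')                == 'm4a'
#   mimetype2ext('application/x-mpegURL')    == 'm3u8'
#   mimetype2ext('video/mp4; codecs="avc1"') == 'mp4'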
2424
2425
4f3c5e06 2426def parse_codecs(codecs_str):
2427 # http://tools.ietf.org/html/rfc6381
2428 if not codecs_str:
2429 return {}
2430 splited_codecs = list(filter(None, map(
2431 lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
2432 vcodec, acodec = None, None
2433 for full_codec in splited_codecs:
2434 codec = full_codec.split('.')[0]
ffe6979e 2435 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1'):
4f3c5e06 2436 if not vcodec:
2437 vcodec = full_codec
60f5c9fb 2438 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
4f3c5e06 2439 if not acodec:
2440 acodec = full_codec
2441 else:
60f5c9fb 2442 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
4f3c5e06 2443 if not vcodec and not acodec:
2444 if len(splited_codecs) == 2:
2445 return {
2446 'vcodec': vcodec,
2447 'acodec': acodec,
2448 }
2449 elif len(splited_codecs) == 1:
2450 return {
2451 'vcodec': 'none',
2452 'acodec': vcodec,
2453 }
2454 else:
2455 return {
2456 'vcodec': vcodec or 'none',
2457 'acodec': acodec or 'none',
2458 }
2459 return {}
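# Illustrative values for parse_codecs() above (added examples, not original):
#
#   parse_codecs('avc1.64001f, mp4a.40.2') == {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2'}
#   parse_codecs('opus')                   == {'vcodec': 'none', 'acodec': 'opus'}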
2460
2461
2ccd1b10 2462def urlhandle_detect_ext(url_handle):
79298173 2463 getheader = url_handle.headers.get
2ccd1b10 2464
b55ee18f
PH
2465 cd = getheader('Content-Disposition')
2466 if cd:
2467 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2468 if m:
2469 e = determine_ext(m.group('filename'), default_ext=None)
2470 if e:
2471 return e
2472
c460bdd5 2473 return mimetype2ext(getheader('Content-Type'))
05900629
PH
2474
2475
1e399778
YCH
2476def encode_data_uri(data, mime_type):
2477 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
2478
2479
05900629 2480def age_restricted(content_limit, age_limit):
6ec6cb4e 2481 """ Returns True iff the content should be blocked """
05900629
PH
2482
2483 if age_limit is None: # No limit set
2484 return False
2485 if content_limit is None:
2486 return False # Content available for everyone
2487 return age_limit < content_limit
61ca9a80
PH
2488
2489
2490def is_html(first_bytes):
2491 """ Detect whether a file contains HTML by examining its first bytes. """
2492
2493 BOMS = [
2494 (b'\xef\xbb\xbf', 'utf-8'),
2495 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2496 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2497 (b'\xff\xfe', 'utf-16-le'),
2498 (b'\xfe\xff', 'utf-16-be'),
2499 ]
2500 for bom, enc in BOMS:
2501 if first_bytes.startswith(bom):
2502 s = first_bytes[len(bom):].decode(enc, 'replace')
2503 break
2504 else:
2505 s = first_bytes.decode('utf-8', 'replace')
2506
2507 return re.match(r'^\s*<', s)
a055469f
PH
2508
2509
2510def determine_protocol(info_dict):
2511 protocol = info_dict.get('protocol')
2512 if protocol is not None:
2513 return protocol
2514
2515 url = info_dict['url']
2516 if url.startswith('rtmp'):
2517 return 'rtmp'
2518 elif url.startswith('mms'):
2519 return 'mms'
2520 elif url.startswith('rtsp'):
2521 return 'rtsp'
2522
2523 ext = determine_ext(url)
2524 if ext == 'm3u8':
2525 return 'm3u8'
2526 elif ext == 'f4m':
2527 return 'f4m'
2528
2529 return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2530
2531
2532def render_table(header_row, data):
2533 """ Render a list of rows, each as a list of values """
2534 table = [header_row] + data
2535 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2536 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2537 return '\n'.join(format_str % tuple(row) for row in table)
347de493
PH
2538
2539
2540def _match_one(filter_part, dct):
2541 COMPARISON_OPERATORS = {
2542 '<': operator.lt,
2543 '<=': operator.le,
2544 '>': operator.gt,
2545 '>=': operator.ge,
2546 '=': operator.eq,
2547 '!=': operator.ne,
2548 }
2549 operator_rex = re.compile(r'''(?x)\s*
2550 (?P<key>[a-z_]+)
2551 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2552 (?:
2553 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
db13c16e 2554 (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
347de493
PH
2555 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2556 )
2557 \s*$
2558 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2559 m = operator_rex.search(filter_part)
2560 if m:
2561 op = COMPARISON_OPERATORS[m.group('op')]
e5a088dc 2562 actual_value = dct.get(m.group('key'))
db13c16e
S
2563 if (m.group('quotedstrval') is not None or
2564 m.group('strval') is not None or
e5a088dc
S
 2565 # If the original field is a string and the matching comparison value is
 2566 # a number, we should respect the origin of the original field
 2567 # and process the comparison value as a string (see
2568 # https://github.com/rg3/youtube-dl/issues/11082).
2569 actual_value is not None and m.group('intval') is not None and
2570 isinstance(actual_value, compat_str)):
347de493
PH
2571 if m.group('op') not in ('=', '!='):
2572 raise ValueError(
2573 'Operator %s does not support string values!' % m.group('op'))
db13c16e
S
2574 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
2575 quote = m.group('quote')
2576 if quote is not None:
2577 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
347de493
PH
2578 else:
2579 try:
2580 comparison_value = int(m.group('intval'))
2581 except ValueError:
2582 comparison_value = parse_filesize(m.group('intval'))
2583 if comparison_value is None:
2584 comparison_value = parse_filesize(m.group('intval') + 'B')
2585 if comparison_value is None:
2586 raise ValueError(
2587 'Invalid integer value %r in filter part %r' % (
2588 m.group('intval'), filter_part))
347de493
PH
2589 if actual_value is None:
2590 return m.group('none_inclusive')
2591 return op(actual_value, comparison_value)
2592
2593 UNARY_OPERATORS = {
1cc47c66
S
2594 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
2595 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493
PH
2596 }
2597 operator_rex = re.compile(r'''(?x)\s*
2598 (?P<op>%s)\s*(?P<key>[a-z_]+)
2599 \s*$
2600 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2601 m = operator_rex.search(filter_part)
2602 if m:
2603 op = UNARY_OPERATORS[m.group('op')]
2604 actual_value = dct.get(m.group('key'))
2605 return op(actual_value)
2606
2607 raise ValueError('Invalid filter part %r' % filter_part)
2608
2609
2610def match_str(filter_str, dct):
 2611 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or False """
2612
2613 return all(
2614 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
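# Illustrative values for match_str() above (added examples, not original):
#
#   match_str('duration > 60 & uploader = someone',
#             {'duration': 90, 'uploader': 'someone'}) == True
#   match_str('!is_live', {'is_live': False})          == True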
2615
2616
2617def match_filter_func(filter_str):
2618 def _match_func(info_dict):
2619 if match_str(filter_str, info_dict):
2620 return None
2621 else:
2622 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2623 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2624 return _match_func
91410c9b
PH
2625
2626
bf6427d2
YCH
2627def parse_dfxp_time_expr(time_expr):
2628 if not time_expr:
d631d5f9 2629 return
bf6427d2
YCH
2630
2631 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2632 if mobj:
2633 return float(mobj.group('time_offset'))
2634
db2fe38b 2635 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 2636 if mobj:
db2fe38b 2637 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
2638
2639
c1c924ab
YCH
2640def srt_subtitles_timecode(seconds):
2641 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
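# Illustrative values for the subtitle time helpers above (added examples, not original):
#
#   parse_dfxp_time_expr('00:01:30.5') == 90.5
#   parse_dfxp_time_expr('12.25s')     == 12.25
#   srt_subtitles_timecode(90.5)       == '00:01:30,500'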
bf6427d2
YCH
2642
2643
2644def dfxp2srt(dfxp_data):
3869028f
YCH
2645 '''
2646 @param dfxp_data A bytes-like object containing DFXP data
2647 @returns A unicode object containing converted SRT data
2648 '''
5b995f71 2649 LEGACY_NAMESPACES = (
3869028f
YCH
2650 (b'http://www.w3.org/ns/ttml', [
2651 b'http://www.w3.org/2004/11/ttaf1',
2652 b'http://www.w3.org/2006/04/ttaf1',
2653 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 2654 ]),
3869028f
YCH
2655 (b'http://www.w3.org/ns/ttml#styling', [
2656 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
2657 ]),
2658 )
2659
2660 SUPPORTED_STYLING = [
2661 'color',
2662 'fontFamily',
2663 'fontSize',
2664 'fontStyle',
2665 'fontWeight',
2666 'textDecoration'
2667 ]
2668
4e335771
YCH
2669 _x = functools.partial(xpath_with_ns, ns_map={
2670 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 2671 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 2672 })
bf6427d2 2673
5b995f71
RA
2674 styles = {}
2675 default_style = {}
2676
87de7069 2677 class TTMLPElementParser(object):
5b995f71
RA
2678 _out = ''
2679 _unclosed_elements = []
2680 _applied_styles = []
bf6427d2 2681
2b14cb56 2682 def start(self, tag, attrib):
5b995f71
RA
2683 if tag in (_x('ttml:br'), 'br'):
2684 self._out += '\n'
2685 else:
2686 unclosed_elements = []
2687 style = {}
2688 element_style_id = attrib.get('style')
2689 if default_style:
2690 style.update(default_style)
2691 if element_style_id:
2692 style.update(styles.get(element_style_id, {}))
2693 for prop in SUPPORTED_STYLING:
2694 prop_val = attrib.get(_x('tts:' + prop))
2695 if prop_val:
2696 style[prop] = prop_val
2697 if style:
2698 font = ''
2699 for k, v in sorted(style.items()):
2700 if self._applied_styles and self._applied_styles[-1].get(k) == v:
2701 continue
2702 if k == 'color':
2703 font += ' color="%s"' % v
2704 elif k == 'fontSize':
2705 font += ' size="%s"' % v
2706 elif k == 'fontFamily':
2707 font += ' face="%s"' % v
2708 elif k == 'fontWeight' and v == 'bold':
2709 self._out += '<b>'
2710 unclosed_elements.append('b')
2711 elif k == 'fontStyle' and v == 'italic':
2712 self._out += '<i>'
2713 unclosed_elements.append('i')
2714 elif k == 'textDecoration' and v == 'underline':
2715 self._out += '<u>'
2716 unclosed_elements.append('u')
2717 if font:
2718 self._out += '<font' + font + '>'
2719 unclosed_elements.append('font')
2720 applied_style = {}
2721 if self._applied_styles:
2722 applied_style.update(self._applied_styles[-1])
2723 applied_style.update(style)
2724 self._applied_styles.append(applied_style)
2725 self._unclosed_elements.append(unclosed_elements)
bf6427d2 2726
2b14cb56 2727 def end(self, tag):
5b995f71
RA
2728 if tag not in (_x('ttml:br'), 'br'):
2729 unclosed_elements = self._unclosed_elements.pop()
2730 for element in reversed(unclosed_elements):
2731 self._out += '</%s>' % element
2732 if unclosed_elements and self._applied_styles:
2733 self._applied_styles.pop()
bf6427d2 2734
2b14cb56 2735 def data(self, data):
5b995f71 2736 self._out += data
2b14cb56 2737
2738 def close(self):
5b995f71 2739 return self._out.strip()
2b14cb56 2740
2741 def parse_node(node):
2742 target = TTMLPElementParser()
2743 parser = xml.etree.ElementTree.XMLParser(target=target)
2744 parser.feed(xml.etree.ElementTree.tostring(node))
2745 return parser.close()
bf6427d2 2746
5b995f71
RA
2747 for k, v in LEGACY_NAMESPACES:
2748 for ns in v:
2749 dfxp_data = dfxp_data.replace(ns, k)
2750
3869028f 2751 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 2752 out = []
5b995f71 2753 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
2754
2755 if not paras:
2756 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 2757
5b995f71
RA
2758 repeat = False
2759 while True:
2760 for style in dfxp.findall(_x('.//ttml:style')):
2761 style_id = style.get('id')
2762 parent_style_id = style.get('style')
2763 if parent_style_id:
2764 if parent_style_id not in styles:
2765 repeat = True
2766 continue
2767 styles[style_id] = styles[parent_style_id].copy()
2768 for prop in SUPPORTED_STYLING:
2769 prop_val = style.get(_x('tts:' + prop))
2770 if prop_val:
2771 styles.setdefault(style_id, {})[prop] = prop_val
2772 if repeat:
2773 repeat = False
2774 else:
2775 break
2776
2777 for p in ('body', 'div'):
2778 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
2779 if ele is None:
2780 continue
2781 style = styles.get(ele.get('style'))
2782 if not style:
2783 continue
2784 default_style.update(style)
2785
bf6427d2 2786 for para, index in zip(paras, itertools.count(1)):
d631d5f9 2787 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 2788 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
2789 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2790 if begin_time is None:
2791 continue
7dff0363 2792 if not end_time:
d631d5f9
YCH
2793 if not dur:
2794 continue
2795 end_time = begin_time + dur
bf6427d2
YCH
2796 out.append('%d\n%s --> %s\n%s\n\n' % (
2797 index,
c1c924ab
YCH
2798 srt_subtitles_timecode(begin_time),
2799 srt_subtitles_timecode(end_time),
bf6427d2
YCH
2800 parse_node(para)))
2801
2802 return ''.join(out)
2803
2804
66e289ba
S
2805def cli_option(params, command_option, param):
2806 param = params.get(param)
98e698f1
RA
2807 if param:
2808 param = compat_str(param)
66e289ba
S
2809 return [command_option, param] if param is not None else []
2810
2811
2812def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2813 param = params.get(param)
5b232f46
S
2814 if param is None:
2815 return []
66e289ba
S
2816 assert isinstance(param, bool)
2817 if separator:
2818 return [command_option + separator + (true_value if param else false_value)]
2819 return [command_option, true_value if param else false_value]
2820
2821
2822def cli_valueless_option(params, command_option, param, expected_value=True):
2823 param = params.get(param)
2824 return [command_option] if param == expected_value else []
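# Illustrative values for the cli_* helpers above (added examples, not original;
# the option names are made up for illustration):
#
#   cli_option({'proxy': 'http://127.0.0.1:3128'}, '--proxy', 'proxy')
#   # == ['--proxy', 'http://127.0.0.1:3128']
#   cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#   # == ['--no-check-certificate', 'true']
#   cli_valueless_option({'downloader': 'native'}, '--native', 'downloader', 'native')
#   # == ['--native']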
2825
2826
2827def cli_configuration_args(params, param, default=[]):
2828 ex_args = params.get(param)
2829 if ex_args is None:
2830 return default
2831 assert isinstance(ex_args, list)
2832 return ex_args
2833
2834
39672624
YCH
2835class ISO639Utils(object):
2836 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2837 _lang_map = {
2838 'aa': 'aar',
2839 'ab': 'abk',
2840 'ae': 'ave',
2841 'af': 'afr',
2842 'ak': 'aka',
2843 'am': 'amh',
2844 'an': 'arg',
2845 'ar': 'ara',
2846 'as': 'asm',
2847 'av': 'ava',
2848 'ay': 'aym',
2849 'az': 'aze',
2850 'ba': 'bak',
2851 'be': 'bel',
2852 'bg': 'bul',
2853 'bh': 'bih',
2854 'bi': 'bis',
2855 'bm': 'bam',
2856 'bn': 'ben',
2857 'bo': 'bod',
2858 'br': 'bre',
2859 'bs': 'bos',
2860 'ca': 'cat',
2861 'ce': 'che',
2862 'ch': 'cha',
2863 'co': 'cos',
2864 'cr': 'cre',
2865 'cs': 'ces',
2866 'cu': 'chu',
2867 'cv': 'chv',
2868 'cy': 'cym',
2869 'da': 'dan',
2870 'de': 'deu',
2871 'dv': 'div',
2872 'dz': 'dzo',
2873 'ee': 'ewe',
2874 'el': 'ell',
2875 'en': 'eng',
2876 'eo': 'epo',
2877 'es': 'spa',
2878 'et': 'est',
2879 'eu': 'eus',
2880 'fa': 'fas',
2881 'ff': 'ful',
2882 'fi': 'fin',
2883 'fj': 'fij',
2884 'fo': 'fao',
2885 'fr': 'fra',
2886 'fy': 'fry',
2887 'ga': 'gle',
2888 'gd': 'gla',
2889 'gl': 'glg',
2890 'gn': 'grn',
2891 'gu': 'guj',
2892 'gv': 'glv',
2893 'ha': 'hau',
2894 'he': 'heb',
2895 'hi': 'hin',
2896 'ho': 'hmo',
2897 'hr': 'hrv',
2898 'ht': 'hat',
2899 'hu': 'hun',
2900 'hy': 'hye',
2901 'hz': 'her',
2902 'ia': 'ina',
2903 'id': 'ind',
2904 'ie': 'ile',
2905 'ig': 'ibo',
2906 'ii': 'iii',
2907 'ik': 'ipk',
2908 'io': 'ido',
2909 'is': 'isl',
2910 'it': 'ita',
2911 'iu': 'iku',
2912 'ja': 'jpn',
2913 'jv': 'jav',
2914 'ka': 'kat',
2915 'kg': 'kon',
2916 'ki': 'kik',
2917 'kj': 'kua',
2918 'kk': 'kaz',
2919 'kl': 'kal',
2920 'km': 'khm',
2921 'kn': 'kan',
2922 'ko': 'kor',
2923 'kr': 'kau',
2924 'ks': 'kas',
2925 'ku': 'kur',
2926 'kv': 'kom',
2927 'kw': 'cor',
2928 'ky': 'kir',
2929 'la': 'lat',
2930 'lb': 'ltz',
2931 'lg': 'lug',
2932 'li': 'lim',
2933 'ln': 'lin',
2934 'lo': 'lao',
2935 'lt': 'lit',
2936 'lu': 'lub',
2937 'lv': 'lav',
2938 'mg': 'mlg',
2939 'mh': 'mah',
2940 'mi': 'mri',
2941 'mk': 'mkd',
2942 'ml': 'mal',
2943 'mn': 'mon',
2944 'mr': 'mar',
2945 'ms': 'msa',
2946 'mt': 'mlt',
2947 'my': 'mya',
2948 'na': 'nau',
2949 'nb': 'nob',
2950 'nd': 'nde',
2951 'ne': 'nep',
2952 'ng': 'ndo',
2953 'nl': 'nld',
2954 'nn': 'nno',
2955 'no': 'nor',
2956 'nr': 'nbl',
2957 'nv': 'nav',
2958 'ny': 'nya',
2959 'oc': 'oci',
2960 'oj': 'oji',
2961 'om': 'orm',
2962 'or': 'ori',
2963 'os': 'oss',
2964 'pa': 'pan',
2965 'pi': 'pli',
2966 'pl': 'pol',
2967 'ps': 'pus',
2968 'pt': 'por',
2969 'qu': 'que',
2970 'rm': 'roh',
2971 'rn': 'run',
2972 'ro': 'ron',
2973 'ru': 'rus',
2974 'rw': 'kin',
2975 'sa': 'san',
2976 'sc': 'srd',
2977 'sd': 'snd',
2978 'se': 'sme',
2979 'sg': 'sag',
2980 'si': 'sin',
2981 'sk': 'slk',
2982 'sl': 'slv',
2983 'sm': 'smo',
2984 'sn': 'sna',
2985 'so': 'som',
2986 'sq': 'sqi',
2987 'sr': 'srp',
2988 'ss': 'ssw',
2989 'st': 'sot',
2990 'su': 'sun',
2991 'sv': 'swe',
2992 'sw': 'swa',
2993 'ta': 'tam',
2994 'te': 'tel',
2995 'tg': 'tgk',
2996 'th': 'tha',
2997 'ti': 'tir',
2998 'tk': 'tuk',
2999 'tl': 'tgl',
3000 'tn': 'tsn',
3001 'to': 'ton',
3002 'tr': 'tur',
3003 'ts': 'tso',
3004 'tt': 'tat',
3005 'tw': 'twi',
3006 'ty': 'tah',
3007 'ug': 'uig',
3008 'uk': 'ukr',
3009 'ur': 'urd',
3010 'uz': 'uzb',
3011 've': 'ven',
3012 'vi': 'vie',
3013 'vo': 'vol',
3014 'wa': 'wln',
3015 'wo': 'wol',
3016 'xh': 'xho',
3017 'yi': 'yid',
3018 'yo': 'yor',
3019 'za': 'zha',
3020 'zh': 'zho',
3021 'zu': 'zul',
3022 }
3023
3024 @classmethod
3025 def short2long(cls, code):
3026 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3027 return cls._lang_map.get(code[:2])
3028
3029 @classmethod
3030 def long2short(cls, code):
3031 """Convert language code from ISO 639-2/T to ISO 639-1"""
3032 for short_name, long_name in cls._lang_map.items():
3033 if long_name == code:
3034 return short_name
3035
3036
4eb10f66
YCH
3037class ISO3166Utils(object):
3038 # From http://data.okfn.org/data/core/country-list
3039 _country_map = {
3040 'AF': 'Afghanistan',
3041 'AX': 'Åland Islands',
3042 'AL': 'Albania',
3043 'DZ': 'Algeria',
3044 'AS': 'American Samoa',
3045 'AD': 'Andorra',
3046 'AO': 'Angola',
3047 'AI': 'Anguilla',
3048 'AQ': 'Antarctica',
3049 'AG': 'Antigua and Barbuda',
3050 'AR': 'Argentina',
3051 'AM': 'Armenia',
3052 'AW': 'Aruba',
3053 'AU': 'Australia',
3054 'AT': 'Austria',
3055 'AZ': 'Azerbaijan',
3056 'BS': 'Bahamas',
3057 'BH': 'Bahrain',
3058 'BD': 'Bangladesh',
3059 'BB': 'Barbados',
3060 'BY': 'Belarus',
3061 'BE': 'Belgium',
3062 'BZ': 'Belize',
3063 'BJ': 'Benin',
3064 'BM': 'Bermuda',
3065 'BT': 'Bhutan',
3066 'BO': 'Bolivia, Plurinational State of',
3067 'BQ': 'Bonaire, Sint Eustatius and Saba',
3068 'BA': 'Bosnia and Herzegovina',
3069 'BW': 'Botswana',
3070 'BV': 'Bouvet Island',
3071 'BR': 'Brazil',
3072 'IO': 'British Indian Ocean Territory',
3073 'BN': 'Brunei Darussalam',
3074 'BG': 'Bulgaria',
3075 'BF': 'Burkina Faso',
3076 'BI': 'Burundi',
3077 'KH': 'Cambodia',
3078 'CM': 'Cameroon',
3079 'CA': 'Canada',
3080 'CV': 'Cape Verde',
3081 'KY': 'Cayman Islands',
3082 'CF': 'Central African Republic',
3083 'TD': 'Chad',
3084 'CL': 'Chile',
3085 'CN': 'China',
3086 'CX': 'Christmas Island',
3087 'CC': 'Cocos (Keeling) Islands',
3088 'CO': 'Colombia',
3089 'KM': 'Comoros',
3090 'CG': 'Congo',
3091 'CD': 'Congo, the Democratic Republic of the',
3092 'CK': 'Cook Islands',
3093 'CR': 'Costa Rica',
3094 'CI': 'Côte d\'Ivoire',
3095 'HR': 'Croatia',
3096 'CU': 'Cuba',
3097 'CW': 'Curaçao',
3098 'CY': 'Cyprus',
3099 'CZ': 'Czech Republic',
3100 'DK': 'Denmark',
3101 'DJ': 'Djibouti',
3102 'DM': 'Dominica',
3103 'DO': 'Dominican Republic',
3104 'EC': 'Ecuador',
3105 'EG': 'Egypt',
3106 'SV': 'El Salvador',
3107 'GQ': 'Equatorial Guinea',
3108 'ER': 'Eritrea',
3109 'EE': 'Estonia',
3110 'ET': 'Ethiopia',
3111 'FK': 'Falkland Islands (Malvinas)',
3112 'FO': 'Faroe Islands',
3113 'FJ': 'Fiji',
3114 'FI': 'Finland',
3115 'FR': 'France',
3116 'GF': 'French Guiana',
3117 'PF': 'French Polynesia',
3118 'TF': 'French Southern Territories',
3119 'GA': 'Gabon',
3120 'GM': 'Gambia',
3121 'GE': 'Georgia',
3122 'DE': 'Germany',
3123 'GH': 'Ghana',
3124 'GI': 'Gibraltar',
3125 'GR': 'Greece',
3126 'GL': 'Greenland',
3127 'GD': 'Grenada',
3128 'GP': 'Guadeloupe',
3129 'GU': 'Guam',
3130 'GT': 'Guatemala',
3131 'GG': 'Guernsey',
3132 'GN': 'Guinea',
3133 'GW': 'Guinea-Bissau',
3134 'GY': 'Guyana',
3135 'HT': 'Haiti',
3136 'HM': 'Heard Island and McDonald Islands',
3137 'VA': 'Holy See (Vatican City State)',
3138 'HN': 'Honduras',
3139 'HK': 'Hong Kong',
3140 'HU': 'Hungary',
3141 'IS': 'Iceland',
3142 'IN': 'India',
3143 'ID': 'Indonesia',
3144 'IR': 'Iran, Islamic Republic of',
3145 'IQ': 'Iraq',
3146 'IE': 'Ireland',
3147 'IM': 'Isle of Man',
3148 'IL': 'Israel',
3149 'IT': 'Italy',
3150 'JM': 'Jamaica',
3151 'JP': 'Japan',
3152 'JE': 'Jersey',
3153 'JO': 'Jordan',
3154 'KZ': 'Kazakhstan',
3155 'KE': 'Kenya',
3156 'KI': 'Kiribati',
3157 'KP': 'Korea, Democratic People\'s Republic of',
3158 'KR': 'Korea, Republic of',
3159 'KW': 'Kuwait',
3160 'KG': 'Kyrgyzstan',
3161 'LA': 'Lao People\'s Democratic Republic',
3162 'LV': 'Latvia',
3163 'LB': 'Lebanon',
3164 'LS': 'Lesotho',
3165 'LR': 'Liberia',
3166 'LY': 'Libya',
3167 'LI': 'Liechtenstein',
3168 'LT': 'Lithuania',
3169 'LU': 'Luxembourg',
3170 'MO': 'Macao',
3171 'MK': 'Macedonia, the Former Yugoslav Republic of',
3172 'MG': 'Madagascar',
3173 'MW': 'Malawi',
3174 'MY': 'Malaysia',
3175 'MV': 'Maldives',
3176 'ML': 'Mali',
3177 'MT': 'Malta',
3178 'MH': 'Marshall Islands',
3179 'MQ': 'Martinique',
3180 'MR': 'Mauritania',
3181 'MU': 'Mauritius',
3182 'YT': 'Mayotte',
3183 'MX': 'Mexico',
3184 'FM': 'Micronesia, Federated States of',
3185 'MD': 'Moldova, Republic of',
3186 'MC': 'Monaco',
3187 'MN': 'Mongolia',
3188 'ME': 'Montenegro',
3189 'MS': 'Montserrat',
3190 'MA': 'Morocco',
3191 'MZ': 'Mozambique',
3192 'MM': 'Myanmar',
3193 'NA': 'Namibia',
3194 'NR': 'Nauru',
3195 'NP': 'Nepal',
3196 'NL': 'Netherlands',
3197 'NC': 'New Caledonia',
3198 'NZ': 'New Zealand',
3199 'NI': 'Nicaragua',
3200 'NE': 'Niger',
3201 'NG': 'Nigeria',
3202 'NU': 'Niue',
3203 'NF': 'Norfolk Island',
3204 'MP': 'Northern Mariana Islands',
3205 'NO': 'Norway',
3206 'OM': 'Oman',
3207 'PK': 'Pakistan',
3208 'PW': 'Palau',
3209 'PS': 'Palestine, State of',
3210 'PA': 'Panama',
3211 'PG': 'Papua New Guinea',
3212 'PY': 'Paraguay',
3213 'PE': 'Peru',
3214 'PH': 'Philippines',
3215 'PN': 'Pitcairn',
3216 'PL': 'Poland',
3217 'PT': 'Portugal',
3218 'PR': 'Puerto Rico',
3219 'QA': 'Qatar',
3220 'RE': 'Réunion',
3221 'RO': 'Romania',
3222 'RU': 'Russian Federation',
3223 'RW': 'Rwanda',
3224 'BL': 'Saint Barthélemy',
3225 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3226 'KN': 'Saint Kitts and Nevis',
3227 'LC': 'Saint Lucia',
3228 'MF': 'Saint Martin (French part)',
3229 'PM': 'Saint Pierre and Miquelon',
3230 'VC': 'Saint Vincent and the Grenadines',
3231 'WS': 'Samoa',
3232 'SM': 'San Marino',
3233 'ST': 'Sao Tome and Principe',
3234 'SA': 'Saudi Arabia',
3235 'SN': 'Senegal',
3236 'RS': 'Serbia',
3237 'SC': 'Seychelles',
3238 'SL': 'Sierra Leone',
3239 'SG': 'Singapore',
3240 'SX': 'Sint Maarten (Dutch part)',
3241 'SK': 'Slovakia',
3242 'SI': 'Slovenia',
3243 'SB': 'Solomon Islands',
3244 'SO': 'Somalia',
3245 'ZA': 'South Africa',
3246 'GS': 'South Georgia and the South Sandwich Islands',
3247 'SS': 'South Sudan',
3248 'ES': 'Spain',
3249 'LK': 'Sri Lanka',
3250 'SD': 'Sudan',
3251 'SR': 'Suriname',
3252 'SJ': 'Svalbard and Jan Mayen',
3253 'SZ': 'Swaziland',
3254 'SE': 'Sweden',
3255 'CH': 'Switzerland',
3256 'SY': 'Syrian Arab Republic',
3257 'TW': 'Taiwan, Province of China',
3258 'TJ': 'Tajikistan',
3259 'TZ': 'Tanzania, United Republic of',
3260 'TH': 'Thailand',
3261 'TL': 'Timor-Leste',
3262 'TG': 'Togo',
3263 'TK': 'Tokelau',
3264 'TO': 'Tonga',
3265 'TT': 'Trinidad and Tobago',
3266 'TN': 'Tunisia',
3267 'TR': 'Turkey',
3268 'TM': 'Turkmenistan',
3269 'TC': 'Turks and Caicos Islands',
3270 'TV': 'Tuvalu',
3271 'UG': 'Uganda',
3272 'UA': 'Ukraine',
3273 'AE': 'United Arab Emirates',
3274 'GB': 'United Kingdom',
3275 'US': 'United States',
3276 'UM': 'United States Minor Outlying Islands',
3277 'UY': 'Uruguay',
3278 'UZ': 'Uzbekistan',
3279 'VU': 'Vanuatu',
3280 'VE': 'Venezuela, Bolivarian Republic of',
3281 'VN': 'Viet Nam',
3282 'VG': 'Virgin Islands, British',
3283 'VI': 'Virgin Islands, U.S.',
3284 'WF': 'Wallis and Futuna',
3285 'EH': 'Western Sahara',
3286 'YE': 'Yemen',
3287 'ZM': 'Zambia',
3288 'ZW': 'Zimbabwe',
3289 }
3290
3291 @classmethod
3292 def short2full(cls, code):
 3293 """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name"""
3294 return cls._country_map.get(code.upper())
3295
3296
773f291d
S
3297class GeoUtils(object):
3298 # Major IPv4 address blocks per country
3299 _country_ip_map = {
3300 'AD': '85.94.160.0/19',
3301 'AE': '94.200.0.0/13',
3302 'AF': '149.54.0.0/17',
3303 'AG': '209.59.64.0/18',
3304 'AI': '204.14.248.0/21',
3305 'AL': '46.99.0.0/16',
3306 'AM': '46.70.0.0/15',
3307 'AO': '105.168.0.0/13',
3308 'AP': '159.117.192.0/21',
3309 'AR': '181.0.0.0/12',
3310 'AS': '202.70.112.0/20',
3311 'AT': '84.112.0.0/13',
3312 'AU': '1.128.0.0/11',
3313 'AW': '181.41.0.0/18',
3314 'AZ': '5.191.0.0/16',
3315 'BA': '31.176.128.0/17',
3316 'BB': '65.48.128.0/17',
3317 'BD': '114.130.0.0/16',
3318 'BE': '57.0.0.0/8',
3319 'BF': '129.45.128.0/17',
3320 'BG': '95.42.0.0/15',
3321 'BH': '37.131.0.0/17',
3322 'BI': '154.117.192.0/18',
3323 'BJ': '137.255.0.0/16',
3324 'BL': '192.131.134.0/24',
3325 'BM': '196.12.64.0/18',
3326 'BN': '156.31.0.0/16',
3327 'BO': '161.56.0.0/16',
3328 'BQ': '161.0.80.0/20',
3329 'BR': '152.240.0.0/12',
3330 'BS': '24.51.64.0/18',
3331 'BT': '119.2.96.0/19',
3332 'BW': '168.167.0.0/16',
3333 'BY': '178.120.0.0/13',
3334 'BZ': '179.42.192.0/18',
3335 'CA': '99.224.0.0/11',
3336 'CD': '41.243.0.0/16',
3337 'CF': '196.32.200.0/21',
3338 'CG': '197.214.128.0/17',
3339 'CH': '85.0.0.0/13',
3340 'CI': '154.232.0.0/14',
3341 'CK': '202.65.32.0/19',
3342 'CL': '152.172.0.0/14',
3343 'CM': '165.210.0.0/15',
3344 'CN': '36.128.0.0/10',
3345 'CO': '181.240.0.0/12',
3346 'CR': '201.192.0.0/12',
3347 'CU': '152.206.0.0/15',
3348 'CV': '165.90.96.0/19',
3349 'CW': '190.88.128.0/17',
3350 'CY': '46.198.0.0/15',
3351 'CZ': '88.100.0.0/14',
3352 'DE': '53.0.0.0/8',
3353 'DJ': '197.241.0.0/17',
3354 'DK': '87.48.0.0/12',
3355 'DM': '192.243.48.0/20',
3356 'DO': '152.166.0.0/15',
3357 'DZ': '41.96.0.0/12',
3358 'EC': '186.68.0.0/15',
3359 'EE': '90.190.0.0/15',
3360 'EG': '156.160.0.0/11',
3361 'ER': '196.200.96.0/20',
3362 'ES': '88.0.0.0/11',
3363 'ET': '196.188.0.0/14',
3364 'EU': '2.16.0.0/13',
3365 'FI': '91.152.0.0/13',
3366 'FJ': '144.120.0.0/16',
3367 'FM': '119.252.112.0/20',
3368 'FO': '88.85.32.0/19',
3369 'FR': '90.0.0.0/9',
3370 'GA': '41.158.0.0/15',
3371 'GB': '25.0.0.0/8',
3372 'GD': '74.122.88.0/21',
3373 'GE': '31.146.0.0/16',
3374 'GF': '161.22.64.0/18',
3375 'GG': '62.68.160.0/19',
3376 'GH': '45.208.0.0/14',
3377 'GI': '85.115.128.0/19',
3378 'GL': '88.83.0.0/19',
3379 'GM': '160.182.0.0/15',
3380 'GN': '197.149.192.0/18',
3381 'GP': '104.250.0.0/19',
3382 'GQ': '105.235.224.0/20',
3383 'GR': '94.64.0.0/13',
3384 'GT': '168.234.0.0/16',
3385 'GU': '168.123.0.0/16',
3386 'GW': '197.214.80.0/20',
3387 'GY': '181.41.64.0/18',
3388 'HK': '113.252.0.0/14',
3389 'HN': '181.210.0.0/16',
3390 'HR': '93.136.0.0/13',
3391 'HT': '148.102.128.0/17',
3392 'HU': '84.0.0.0/14',
3393 'ID': '39.192.0.0/10',
3394 'IE': '87.32.0.0/12',
3395 'IL': '79.176.0.0/13',
3396 'IM': '5.62.80.0/20',
3397 'IN': '117.192.0.0/10',
3398 'IO': '203.83.48.0/21',
3399 'IQ': '37.236.0.0/14',
3400 'IR': '2.176.0.0/12',
3401 'IS': '82.221.0.0/16',
3402 'IT': '79.0.0.0/10',
3403 'JE': '87.244.64.0/18',
3404 'JM': '72.27.0.0/17',
3405 'JO': '176.29.0.0/16',
3406 'JP': '126.0.0.0/8',
3407 'KE': '105.48.0.0/12',
3408 'KG': '158.181.128.0/17',
3409 'KH': '36.37.128.0/17',
3410 'KI': '103.25.140.0/22',
3411 'KM': '197.255.224.0/20',
3412 'KN': '198.32.32.0/19',
3413 'KP': '175.45.176.0/22',
3414 'KR': '175.192.0.0/10',
3415 'KW': '37.36.0.0/14',
3416 'KY': '64.96.0.0/15',
3417 'KZ': '2.72.0.0/13',
3418 'LA': '115.84.64.0/18',
3419 'LB': '178.135.0.0/16',
3420 'LC': '192.147.231.0/24',
3421 'LI': '82.117.0.0/19',
3422 'LK': '112.134.0.0/15',
3423 'LR': '41.86.0.0/19',
3424 'LS': '129.232.0.0/17',
3425 'LT': '78.56.0.0/13',
3426 'LU': '188.42.0.0/16',
3427 'LV': '46.109.0.0/16',
3428 'LY': '41.252.0.0/14',
3429 'MA': '105.128.0.0/11',
3430 'MC': '88.209.64.0/18',
3431 'MD': '37.246.0.0/16',
3432 'ME': '178.175.0.0/17',
3433 'MF': '74.112.232.0/21',
3434 'MG': '154.126.0.0/17',
3435 'MH': '117.103.88.0/21',
3436 'MK': '77.28.0.0/15',
3437 'ML': '154.118.128.0/18',
3438 'MM': '37.111.0.0/17',
3439 'MN': '49.0.128.0/17',
3440 'MO': '60.246.0.0/16',
3441 'MP': '202.88.64.0/20',
3442 'MQ': '109.203.224.0/19',
3443 'MR': '41.188.64.0/18',
3444 'MS': '208.90.112.0/22',
3445 'MT': '46.11.0.0/16',
3446 'MU': '105.16.0.0/12',
3447 'MV': '27.114.128.0/18',
3448 'MW': '105.234.0.0/16',
3449 'MX': '187.192.0.0/11',
3450 'MY': '175.136.0.0/13',
3451 'MZ': '197.218.0.0/15',
3452 'NA': '41.182.0.0/16',
3453 'NC': '101.101.0.0/18',
3454 'NE': '197.214.0.0/18',
3455 'NF': '203.17.240.0/22',
3456 'NG': '105.112.0.0/12',
3457 'NI': '186.76.0.0/15',
3458 'NL': '145.96.0.0/11',
3459 'NO': '84.208.0.0/13',
3460 'NP': '36.252.0.0/15',
3461 'NR': '203.98.224.0/19',
3462 'NU': '49.156.48.0/22',
3463 'NZ': '49.224.0.0/14',
3464 'OM': '5.36.0.0/15',
3465 'PA': '186.72.0.0/15',
3466 'PE': '186.160.0.0/14',
3467 'PF': '123.50.64.0/18',
3468 'PG': '124.240.192.0/19',
3469 'PH': '49.144.0.0/13',
3470 'PK': '39.32.0.0/11',
3471 'PL': '83.0.0.0/11',
3472 'PM': '70.36.0.0/20',
3473 'PR': '66.50.0.0/16',
3474 'PS': '188.161.0.0/16',
3475 'PT': '85.240.0.0/13',
3476 'PW': '202.124.224.0/20',
3477 'PY': '181.120.0.0/14',
3478 'QA': '37.210.0.0/15',
3479 'RE': '139.26.0.0/16',
3480 'RO': '79.112.0.0/13',
3481 'RS': '178.220.0.0/14',
3482 'RU': '5.136.0.0/13',
3483 'RW': '105.178.0.0/15',
3484 'SA': '188.48.0.0/13',
3485 'SB': '202.1.160.0/19',
3486 'SC': '154.192.0.0/11',
3487 'SD': '154.96.0.0/13',
3488 'SE': '78.64.0.0/12',
3489 'SG': '152.56.0.0/14',
3490 'SI': '188.196.0.0/14',
3491 'SK': '78.98.0.0/15',
3492 'SL': '197.215.0.0/17',
3493 'SM': '89.186.32.0/19',
3494 'SN': '41.82.0.0/15',
3495 'SO': '197.220.64.0/19',
3496 'SR': '186.179.128.0/17',
3497 'SS': '105.235.208.0/21',
3498 'ST': '197.159.160.0/19',
3499 'SV': '168.243.0.0/16',
3500 'SX': '190.102.0.0/20',
3501 'SY': '5.0.0.0/16',
3502 'SZ': '41.84.224.0/19',
3503 'TC': '65.255.48.0/20',
3504 'TD': '154.68.128.0/19',
3505 'TG': '196.168.0.0/14',
3506 'TH': '171.96.0.0/13',
3507 'TJ': '85.9.128.0/18',
3508 'TK': '27.96.24.0/21',
3509 'TL': '180.189.160.0/20',
3510 'TM': '95.85.96.0/19',
3511 'TN': '197.0.0.0/11',
3512 'TO': '175.176.144.0/21',
3513 'TR': '78.160.0.0/11',
3514 'TT': '186.44.0.0/15',
3515 'TV': '202.2.96.0/19',
3516 'TW': '120.96.0.0/11',
3517 'TZ': '156.156.0.0/14',
3518 'UA': '93.72.0.0/13',
3519 'UG': '154.224.0.0/13',
3520 'US': '3.0.0.0/8',
3521 'UY': '167.56.0.0/13',
3522 'UZ': '82.215.64.0/18',
3523 'VA': '212.77.0.0/19',
3524 'VC': '24.92.144.0/20',
3525 'VE': '186.88.0.0/13',
3526 'VG': '172.103.64.0/18',
3527 'VI': '146.226.0.0/16',
3528 'VN': '14.160.0.0/11',
3529 'VU': '202.80.32.0/20',
3530 'WF': '117.20.32.0/21',
3531 'WS': '202.4.32.0/19',
3532 'YE': '134.35.0.0/16',
3533 'YT': '41.242.116.0/22',
3534 'ZA': '41.0.0.0/11',
3535 'ZM': '165.56.0.0/13',
3536 'ZW': '41.85.192.0/19',
3537 }
3538
3539 @classmethod
5f95927a
S
3540 def random_ipv4(cls, code_or_block):
3541 if len(code_or_block) == 2:
3542 block = cls._country_ip_map.get(code_or_block.upper())
3543 if not block:
3544 return None
3545 else:
3546 block = code_or_block
773f291d
S
3547 addr, preflen = block.split('/')
3548 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
3549 addr_max = addr_min | (0xffffffff >> int(preflen))
18a0defa 3550 return compat_str(socket.inet_ntoa(
4248dad9 3551 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
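# Illustrative usage of GeoUtils.random_ipv4() above (added example, not original):
#
#   GeoUtils.random_ipv4('DE')          # some address within 53.0.0.0/8
#   GeoUtils.random_ipv4('10.0.0.0/8')  # an address drawn from the given block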
773f291d
S


class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do the actual wrapping of the
            # socket with SOCKS; here we only tag the request and step aside
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
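# Illustrative usage sketch (editor's addition): youtube-dl installs this
# handler into its opener; the same can be done manually, and a single request
# can override the proxy via the internal 'Ytdl-request-proxy' header.
#
#   opener = compat_urllib_request.build_opener(
#       PerRequestProxyHandler({'http': 'http://127.0.0.1:3128'}))
#   req = compat_urllib_request.Request('http://example.com/')
#   req.add_header('Ytdl-request-proxy', '__noproxy__')  # bypass for this request
#   opener.open(req)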


# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
# released into Public Domain
# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387

def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # after much testing, this algorithm was deemed to be the fastest
    s = b''
    n = int(n)
    while n > 0:
        s = compat_struct_pack('>I', n & 0xffffffff) + s
        n = n >> 32
    # strip off leading zeros
    for i in range(len(s)):
        if s[i] != b'\000'[0]:
            break
    else:
        # only happens when n == 0
        s = b'\000'
        i = 0
    s = s[i:]
    # add back some pad bytes. this could be done more efficiently w.r.t. the
    # de-padding being done above, but sigh...
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
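# Illustrative sketch (editor's addition): output is big-endian, and blocksize
# left-pads with NUL bytes up to a multiple of blocksize.
#
#   long_to_bytes(65537)     == b'\x01\x00\x01'
#   long_to_bytes(65537, 8)  == b'\x00\x00\x00\x00\x00\x01\x00\x01'
#   long_to_bytes(0)         == b'\x00'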


def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    acc = 0
    length = len(s)
    if length % 4:
        extra = (4 - length % 4)
        s = b'\000' * extra + s
        length = length + extra
    for i in range(0, length, 4):
        acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
    return acc
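# Illustrative sketch (editor's addition): bytes_to_long() inverts
# long_to_bytes(), so round trips are lossless for non-negative integers.
#
#   bytes_to_long(b'\x01\x00\x01')           == 65537
#   bytes_to_long(long_to_bytes(123456789))  == 123456789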


def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameters e and N of the RSA algorithm, both integers
    Output: hex string of encrypted data

    Limitation: supports one-block encryption only
    '''

    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
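# Illustrative sketch (editor's addition, toy parameters only): the payload is
# read little-endian (note data[::-1]) before the modular exponentiation; real
# sites supply much larger exponent/modulus values.
#
#   ohdave_rsa_encrypt(b'\x02', 3, 101)  == '8'   # pow(2, 3, 101) == 8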


def pkcs1pad(data, length):
    """
    Pad input data with the PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # PKCS#1 v1.5 requires the padding string to consist of non-zero bytes
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
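# Illustrative sketch (editor's addition): pad a 4-byte message into a 16-byte
# block. Only the shape is predictable: [0, 2], random non-zero filler, 0,
# then the message itself.
#
#   padded = pkcs1pad([1, 2, 3, 4], 16)
#   assert len(padded) == 16
#   assert padded[:2] == [0, 2] and padded[-5:] == [0, 1, 2, 3, 4]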


def encode_base_n(num, n, table=None):
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret
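# Illustrative sketch (editor's addition): without an explicit table the
# digits come from 0-9a-zA-Z, so base 16 yields lowercase hex and base 62
# uses the full alphabet.
#
#   encode_base_n(255, 16)  == 'ff'
#   encode_base_n(0, 36)    == '0'
#   encode_base_n(61, 62)   == 'Z'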


def decode_packed_codes(code):
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
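# Illustrative sketch (editor's addition, assuming the module-level
# PACKED_CODES_RE matches the usual eval(function(p,a,c,k,e,d){...}) wrapper):
# every word token in the packed source is replaced by its base-N entry in
# the symbol list.
#
#   packed = "eval(function(p,a,c,k,e,d){}('0 1',2,2,'alert|hi'.split('|'),0,{}))"
#   decode_packed_codes(packed)  == 'alert hi'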


def parse_m3u8_attributes(attrib):
    info = {}
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]
        info[key] = val
    return info
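# Illustrative sketch (editor's addition): attribute strings from EXT-X-* tags
# become a plain dict; surrounding quotes are stripped from quoted values.
#
#   parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2"')
#   == {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}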


def urshift(val, n):
    return val >> n if val >= 0 else (val + 0x100000000) >> n
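# Illustrative sketch (editor's addition): emulates JavaScript's unsigned
# right shift (>>>) on 32-bit values, where negative inputs wrap around.
#
#   urshift(16, 2)   == 4
#   urshift(-16, 2)  == 1073741820  # (-16 + 2 ** 32) >> 2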


# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
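# Illustrative sketch (editor's addition): the decoder expects a plain 8-bit
# RGB, non-interlaced PNG and returns rows of raw byte values (3 per pixel).
# 'frame.png' below is a placeholder filename.
#
#   with open('frame.png', 'rb') as f:
#       width, height, pixels = decode_png(f.read())
#   r, g, b = pixels[0][0], pixels[0][1], pixels[0][2]  # top-left pixel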


def write_xattr(path, key, value):
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, but we can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")


def random_birthday(year_field, month_field, day_field):
    return {
        year_field: str(random.randint(1950, 1995)),
        month_field: str(random.randint(1, 12)),
        day_field: str(random.randint(1, 31)),
    }
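# Illustrative sketch (editor's addition): returns form-ready string values
# keyed by whatever field names the caller's form expects.
#
#   random_birthday('birth_year', 'birth_month', 'birth_day')
#   # e.g. {'birth_year': '1987', 'birth_month': '4', 'birth_day': '21'}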