#!/usr/bin/env python
# coding: utf-8

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import contextlib
import ctypes
import datetime
import email.utils
import email.header
import errno
import functools
import gzip
import io
import itertools
import json
import locale
import math
import operator
import os
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import traceback
import xml.etree.ElementTree
import zlib

from .compat import (
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_basestring,
    compat_chr,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_quote,
    compat_socket_create_connection,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


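# Usage sketch (editorial illustration, not part of the original module; the
# XML snippet below is made up):
#
#     doc = compat_etree_fromstring('<root><media url="http://a.example/v.mp4">Title</media></root>')
#     xpath_text(doc, './media')                   # -> 'Title'
#     xpath_attr(doc, './media', 'url')            # -> 'http://a.example/v.mp4'
#     xpath_text(doc, './missing', default=None)   # -> None (no exception)
#     xpath_text(doc, './missing', fatal=True)     # raises ExtractorError

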
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    value = re.escape(value) if escape_value else value

    retlist = []
    for m in re.finditer(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html):
        res = m.group('content')

        if res.startswith('"') or res.startswith("'"):
            res = res[1:-1]

        retlist.append(unescapeHTML(res))

    return retlist


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs


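# Usage sketch (editorial illustration; the markup below is hypothetical):
#
#     get_element_by_class('title', '<span class="video title">Foo</span>')
#     # -> 'Foo'
#     extract_attributes('<a href="/watch?v=x" data-id=42>')
#     # -> {'href': '/watch?v=x', 'data-id': '42'}

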
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result


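# Usage sketch (editorial illustration, not from the original source):
#
#     sanitize_filename('A/B: C?')                   # -> 'A_B - C'
#     sanitize_filename('A/B: C?', restricted=True)  # -> 'A_B_-_C'

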
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/rg3/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url


def sanitized_Request(url, *args, **kwargs):
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


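# Usage sketch (editorial illustration; the URLs are made up):
#
#     sanitize_url('//cdn.example.com/v.mp4')     # -> 'http://cdn.example.com/v.mp4'
#     sanitize_url('httpss://example.com/v.mp4')  # -> 'https://example.com/v.mp4'
#     sanitize_url('rmtpe://example.com/live')    # -> 'rtmpe://example.com/live'

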
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)


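# Usage sketch (editorial illustration):
#
#     unescapeHTML('Ben &amp; Jerry&#39;s')  # -> "Ben & Jerry's"
#     unescapeHTML('&eacute;')               # -> 'é'

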
def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


def formatSeconds(secs):
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs


def make_HTTPS_handler(params, **kwargs):
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message():
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    pass


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """
    def __init__(self, msg, countries=None):
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        self.msg = msg
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg


class MaxDownloadsReached(YoutubeDLError):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


def extract_timezone(date_str):
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass


def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)


def unified_timestamp(date_str, day_first=True):
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600


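# Usage sketch (editorial illustration):
#
#     unified_strdate('December 21, 2016')       # -> '20161221'
#     unified_strdate('2016-12-21T10:00:00Z')    # -> '20161221'
#     unified_timestamp('2016-12-21T10:00:00Z')  # -> 1482314400 (UTC epoch seconds)

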
def determine_ext(url, default_ext='unknown_video'):
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext


def subtitles_filename(filename, sub_lang, sub_format):
    return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format


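# Usage sketch (editorial illustration; the URLs are made up):
#
#     determine_ext('http://example.com/foo/bar.mp4?download=1')  # -> 'mp4'
#     determine_ext('http://example.com/foo/bar.mp4/?download')   # -> 'mp4'
#     determine_ext('http://example.com/play')                    # -> 'unknown_video'

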
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str


class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())


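# Usage sketch (editorial illustration):
#
#     date_from_str('now-1week')                    # -> date 7 days before today
#     'yesterday' in DateRange('now-1week', 'now')  # -> True
#     hyphenate_date('20161221')                    # -> '2016-12-21'

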
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res


def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True


def write_string(s, out=None, encoding=None):
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()


def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)


# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)


class locked_file(object):
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)


def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'


def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)


def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data


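# Usage sketch (editorial illustration; the URL and payload are made up):
#
#     url = smuggle_url('http://example.com/video', {'referer': 'http://example.com/'})
#     unsmuggle_url(url)
#     # -> ('http://example.com/video', {'referer': 'http://example.com/'})

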
def format_bytes(bytes):
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)


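# Usage sketch (editorial illustration):
#
#     format_bytes(None)  # -> 'N/A'
#     format_bytes(1024)  # -> '1.00KiB'
#     format_bytes(1536)  # -> '1.50KiB'

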
fb47597b
S
1589def lookup_unit_table(unit_table, s):
1590 units_re = '|'.join(re.escape(u) for u in unit_table)
1591 m = re.match(
782b1b5b 1592 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
fb47597b
S
1593 if not m:
1594 return None
1595 num_str = m.group('num').replace(',', '.')
1596 mult = unit_table[m.group('unit')]
1597 return int(float(num_str) * mult)
1598
1599
be64b5b0
PH
1600def parse_filesize(s):
1601 if s is None:
1602 return None
1603
dfb1b146 1604 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
1605 # but we support those too
1606 _UNIT_TABLE = {
1607 'B': 1,
1608 'b': 1,
70852b47 1609 'bytes': 1,
be64b5b0
PH
1610 'KiB': 1024,
1611 'KB': 1000,
1612 'kB': 1024,
1613 'Kb': 1000,
13585d76 1614 'kb': 1000,
70852b47
YCH
1615 'kilobytes': 1000,
1616 'kibibytes': 1024,
be64b5b0
PH
1617 'MiB': 1024 ** 2,
1618 'MB': 1000 ** 2,
1619 'mB': 1024 ** 2,
1620 'Mb': 1000 ** 2,
13585d76 1621 'mb': 1000 ** 2,
70852b47
YCH
1622 'megabytes': 1000 ** 2,
1623 'mebibytes': 1024 ** 2,
be64b5b0
PH
1624 'GiB': 1024 ** 3,
1625 'GB': 1000 ** 3,
1626 'gB': 1024 ** 3,
1627 'Gb': 1000 ** 3,
13585d76 1628 'gb': 1000 ** 3,
70852b47
YCH
1629 'gigabytes': 1000 ** 3,
1630 'gibibytes': 1024 ** 3,
be64b5b0
PH
1631 'TiB': 1024 ** 4,
1632 'TB': 1000 ** 4,
1633 'tB': 1024 ** 4,
1634 'Tb': 1000 ** 4,
13585d76 1635 'tb': 1000 ** 4,
70852b47
YCH
1636 'terabytes': 1000 ** 4,
1637 'tebibytes': 1024 ** 4,
be64b5b0
PH
1638 'PiB': 1024 ** 5,
1639 'PB': 1000 ** 5,
1640 'pB': 1024 ** 5,
1641 'Pb': 1000 ** 5,
13585d76 1642 'pb': 1000 ** 5,
70852b47
YCH
1643 'petabytes': 1000 ** 5,
1644 'pebibytes': 1024 ** 5,
be64b5b0
PH
1645 'EiB': 1024 ** 6,
1646 'EB': 1000 ** 6,
1647 'eB': 1024 ** 6,
1648 'Eb': 1000 ** 6,
13585d76 1649 'eb': 1000 ** 6,
70852b47
YCH
1650 'exabytes': 1000 ** 6,
1651 'exbibytes': 1024 ** 6,
be64b5b0
PH
1652 'ZiB': 1024 ** 7,
1653 'ZB': 1000 ** 7,
1654 'zB': 1024 ** 7,
1655 'Zb': 1000 ** 7,
13585d76 1656 'zb': 1000 ** 7,
70852b47
YCH
1657 'zettabytes': 1000 ** 7,
1658 'zebibytes': 1024 ** 7,
be64b5b0
PH
1659 'YiB': 1024 ** 8,
1660 'YB': 1000 ** 8,
1661 'yB': 1024 ** 8,
1662 'Yb': 1000 ** 8,
13585d76 1663 'yb': 1000 ** 8,
70852b47
YCH
1664 'yottabytes': 1000 ** 8,
1665 'yobibytes': 1024 ** 8,
be64b5b0
PH
1666 }
1667
fb47597b
S
1668 return lookup_unit_table(_UNIT_TABLE, s)
1669
1670
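# --- Editorial usage sketch (not part of the original module) ---
# parse_filesize accepts both decimal (KB/MB/...) and binary (KiB/MiB/...)
# suffixes, plus the informal lower-case variants listed above, and returns
# the size in bytes as an int. Runnable standalone.
from youtube_dl.utils import parse_filesize

assert parse_filesize('1.5MiB') == int(1.5 * 1024 ** 2)
assert parse_filesize('500 KB') == 500 * 1000
assert parse_filesize(None) is None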
1671def parse_count(s):
1672 if s is None:
be64b5b0
PH
1673 return None
1674
fb47597b
S
1675 s = s.strip()
1676
1677 if re.match(r'^[\d,.]+$', s):
1678 return str_to_int(s)
1679
1680 _UNIT_TABLE = {
1681 'k': 1000,
1682 'K': 1000,
1683 'm': 1000 ** 2,
1684 'M': 1000 ** 2,
1685 'kk': 1000 ** 2,
1686 'KK': 1000 ** 2,
1687 }
be64b5b0 1688
fb47597b 1689 return lookup_unit_table(_UNIT_TABLE, s)
be64b5b0 1690
2f7ae819 1691
a942d6cb 1692def month_by_name(name, lang='en'):
caefb1de
PH
1693 """ Return the number of a month by (locale-independently) English name """
1694
f6717dec 1695 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 1696
caefb1de 1697 try:
f6717dec 1698 return month_names.index(name) + 1
7105440c
YCH
1699 except ValueError:
1700 return None
1701
1702
1703def month_by_abbreviation(abbrev):
1704 """ Return the number of a month by (locale-independently) English
1705 abbreviations """
1706
1707 try:
1708 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
1709 except ValueError:
1710 return None
18258362
JMF
1711
1712
5aafe895 1713def fix_xml_ampersands(xml_str):
18258362 1714 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
1715 return re.sub(
1716 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 1717 '&amp;',
5aafe895 1718 xml_str)
e3946f98
PH
1719
1720
1721def setproctitle(title):
8bf48f23 1722 assert isinstance(title, compat_str)
c1c05c67
YCH
1723
1724 # ctypes in Jython is not complete
1725 # http://bugs.jython.org/issue2148
1726 if sys.platform.startswith('java'):
1727 return
1728
e3946f98 1729 try:
611c1dd9 1730 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
1731 except OSError:
1732 return
2f49bcd6
RC
1733 except TypeError:
1734 # LoadLibrary in Windows Python 2.7.13 only expects
1735 # a bytestring, but since unicode_literals turns
1736 # every string into a unicode string, it fails.
1737 return
6eefe533
PH
1738 title_bytes = title.encode('utf-8')
1739 buf = ctypes.create_string_buffer(len(title_bytes))
1740 buf.value = title_bytes
e3946f98 1741 try:
6eefe533 1742 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
1743 except AttributeError:
1744 return # Strange libc, just skip this
d7dda168
PH
1745
1746
1747def remove_start(s, start):
46bc9b7d 1748 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
1749
1750
2b9faf55 1751def remove_end(s, end):
46bc9b7d 1752 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
1753
1754
31b2051e
S
1755def remove_quotes(s):
1756 if s is None or len(s) < 2:
1757 return s
1758 for quote in ('"', "'", ):
1759 if s[0] == quote and s[-1] == quote:
1760 return s[1:-1]
1761 return s
1762
1763
29eb5174 1764def url_basename(url):
9b8aaeed 1765 path = compat_urlparse.urlparse(url).path
28e614de 1766 return path.strip('/').split('/')[-1]
aa94a6d3
PH
1767
1768
02dc0a36
S
1769def base_url(url):
1770 return re.match(r'https?://[^?#&]+/', url).group()
1771
1772
e34c3361 1773def urljoin(base, path):
4b5de77b
S
1774 if isinstance(path, bytes):
1775 path = path.decode('utf-8')
e34c3361
S
1776 if not isinstance(path, compat_str) or not path:
1777 return None
b0c65c67 1778 if re.match(r'^(?:https?:)?//', path):
e34c3361 1779 return path
4b5de77b
S
1780 if isinstance(base, bytes):
1781 base = base.decode('utf-8')
1782 if not isinstance(base, compat_str) or not re.match(
1783 r'^(?:https?:)?//', base):
e34c3361
S
1784 return None
1785 return compat_urlparse.urljoin(base, path)
1786
1787
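# --- Editorial usage sketch (not part of the original module) ---
# urljoin is a defensive wrapper around compat_urlparse.urljoin: it returns
# None instead of raising when either part is unusable, and passes
# protocol-relative paths through untouched. Runnable standalone.
from youtube_dl.utils import urljoin

assert urljoin('https://example.com/a/', 'b/c.mp4') == 'https://example.com/a/b/c.mp4'
assert urljoin('https://example.com/a/', '//cdn.example.com/c.mp4') == '//cdn.example.com/c.mp4'
assert urljoin(None, 'b/c.mp4') is None
assert urljoin('https://example.com/', '') is None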
aa94a6d3
PH
1788class HEADRequest(compat_urllib_request.Request):
1789 def get_method(self):
611c1dd9 1790 return 'HEAD'
7217e148
PH
1791
1792
95cf60e8
S
1793class PUTRequest(compat_urllib_request.Request):
1794 def get_method(self):
1795 return 'PUT'
1796
1797
9732d77e 1798def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
28746fbd
PH
1799 if get_attr:
1800 if v is not None:
1801 v = getattr(v, get_attr, None)
9572013d
PH
1802 if v == '':
1803 v = None
1812afb7
S
1804 if v is None:
1805 return default
1806 try:
1807 return int(v) * invscale // scale
1808 except ValueError:
af98f8ff 1809 return default
9732d77e 1810
9572013d 1811
40a90862
JMF
1812def str_or_none(v, default=None):
1813 return default if v is None else compat_str(v)
1814
9732d77e
PH
1815
1816def str_to_int(int_str):
48d4681e 1817 """ A more relaxed version of int_or_none """
9732d77e
PH
1818 if int_str is None:
1819 return None
28e614de 1820 int_str = re.sub(r'[,\.\+]', '', int_str)
9732d77e 1821 return int(int_str)
608d11f5
PH
1822
1823
9732d77e 1824def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
1825 if v is None:
1826 return default
1827 try:
1828 return float(v) * invscale / scale
1829 except ValueError:
1830 return default
43f775e4
PH
1831
1832
c7e327c4
S
1833def bool_or_none(v, default=None):
1834 return v if isinstance(v, bool) else default
1835
1836
b72b4431
S
1837def strip_or_none(v):
1838 return None if v is None else v.strip()
1839
1840
608d11f5 1841def parse_duration(s):
8f9312c3 1842 if not isinstance(s, compat_basestring):
608d11f5
PH
1843 return None
1844
ca7b3246
S
1845 s = s.strip()
1846
acaff495 1847 days, hours, mins, secs, ms = [None] * 5
15846398 1848 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
acaff495 1849 if m:
1850 days, hours, mins, secs, ms = m.groups()
1851 else:
1852 m = re.match(
056653bb
S
1853 r'''(?ix)(?:P?
1854 (?:
1855 [0-9]+\s*y(?:ears?)?\s*
1856 )?
1857 (?:
1858 [0-9]+\s*m(?:onths?)?\s*
1859 )?
1860 (?:
1861 [0-9]+\s*w(?:eeks?)?\s*
1862 )?
8f4b58d7 1863 (?:
acaff495 1864 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
8f4b58d7 1865 )?
056653bb 1866 T)?
acaff495 1867 (?:
1868 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1869 )?
1870 (?:
1871 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1872 )?
1873 (?:
1874 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 1875 )?Z?$''', s)
acaff495 1876 if m:
1877 days, hours, mins, secs, ms = m.groups()
1878 else:
15846398 1879 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 1880 if m:
1881 hours, mins = m.groups()
1882 else:
1883 return None
1884
1885 duration = 0
1886 if secs:
1887 duration += float(secs)
1888 if mins:
1889 duration += float(mins) * 60
1890 if hours:
1891 duration += float(hours) * 60 * 60
1892 if days:
1893 duration += float(days) * 24 * 60 * 60
1894 if ms:
1895 duration += float(ms)
1896 return duration
91d7d0b3
JMF
1897
1898
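# --- Editorial usage sketch (not part of the original module) ---
# parse_duration understands clock-style strings, free-form "Xh Ym Zs" text
# and ISO 8601 durations, returning seconds as a float (or None). Runnable
# standalone.
from youtube_dl.utils import parse_duration

assert parse_duration('1:02:03') == 3723.0
assert parse_duration('PT1H30M') == 5400.0
assert parse_duration('3 min') == 180.0
assert parse_duration('bogus') is None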
e65e4c88 1899def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 1900 name, real_ext = os.path.splitext(filename)
e65e4c88
S
1901 return (
1902 '{0}.{1}{2}'.format(name, ext, real_ext)
1903 if not expected_real_ext or real_ext[1:] == expected_real_ext
1904 else '{0}.{1}'.format(filename, ext))
d70ad093
PH
1905
1906
b3ed15b7
S
1907def replace_extension(filename, ext, expected_real_ext=None):
1908 name, real_ext = os.path.splitext(filename)
1909 return '{0}.{1}'.format(
1910 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1911 ext)
1912
1913
d70ad093
PH
1914def check_executable(exe, args=[]):
1915 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1916 args can be a list of arguments for a short output (like -version) """
1917 try:
1918 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1919 except OSError:
1920 return False
1921 return exe
b7ab0590
PH
1922
1923
95807118 1924def get_exe_version(exe, args=['--version'],
cae97f65 1925 version_re=None, unrecognized='present'):
95807118
PH
1926 """ Returns the version of the specified executable,
1927 or False if the executable is not present """
1928 try:
b64d04c1
YCH
1929 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1930 # SIGTTOU if youtube-dl is run in the background.
1931 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
cae97f65 1932 out, _ = subprocess.Popen(
54116803 1933 [encodeArgument(exe)] + args,
00ca7552 1934 stdin=subprocess.PIPE,
95807118
PH
1935 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1936 except OSError:
1937 return False
cae97f65
PH
1938 if isinstance(out, bytes): # Python 2.x
1939 out = out.decode('ascii', 'ignore')
1940 return detect_exe_version(out, version_re, unrecognized)
1941
1942
1943def detect_exe_version(output, version_re=None, unrecognized='present'):
1944 assert isinstance(output, compat_str)
1945 if version_re is None:
1946 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1947 m = re.search(version_re, output)
95807118
PH
1948 if m:
1949 return m.group(1)
1950 else:
1951 return unrecognized
1952
1953
b7ab0590 1954class PagedList(object):
dd26ced1
PH
1955 def __len__(self):
1956 # This is only useful for tests
1957 return len(self.getslice())
1958
9c44d242
PH
1959
1960class OnDemandPagedList(PagedList):
6be08ce6 1961 def __init__(self, pagefunc, pagesize, use_cache=True):
9c44d242
PH
1962 self._pagefunc = pagefunc
1963 self._pagesize = pagesize
b95dc034
YCH
1964 self._use_cache = use_cache
1965 if use_cache:
1966 self._cache = {}
9c44d242 1967
b7ab0590
PH
1968 def getslice(self, start=0, end=None):
1969 res = []
1970 for pagenum in itertools.count(start // self._pagesize):
1971 firstid = pagenum * self._pagesize
1972 nextfirstid = pagenum * self._pagesize + self._pagesize
1973 if start >= nextfirstid:
1974 continue
1975
b95dc034
YCH
1976 page_results = None
1977 if self._use_cache:
1978 page_results = self._cache.get(pagenum)
1979 if page_results is None:
1980 page_results = list(self._pagefunc(pagenum))
1981 if self._use_cache:
1982 self._cache[pagenum] = page_results
b7ab0590
PH
1983
1984 startv = (
1985 start % self._pagesize
1986 if firstid <= start < nextfirstid
1987 else 0)
1988
1989 endv = (
1990 ((end - 1) % self._pagesize) + 1
1991 if (end is not None and firstid <= end <= nextfirstid)
1992 else None)
1993
1994 if startv != 0 or endv is not None:
1995 page_results = page_results[startv:endv]
1996 res.extend(page_results)
1997
1998 # A small optimization: if the current page is not "full", i.e. it
1999 # contains fewer than page_size videos, then we can assume that this
2000 # page is the last one - there are no more ids on further pages -
2001 # so there is no need to query again.
2002 if len(page_results) + startv < self._pagesize:
2003 break
2004
2005 # If we got the whole page, but the next page is not interesting,
2006 # break out early as well
2007 if end == nextfirstid:
2008 break
2009 return res
81c2f20b
PH
2010
2011
9c44d242
PH
2012class InAdvancePagedList(PagedList):
2013 def __init__(self, pagefunc, pagecount, pagesize):
2014 self._pagefunc = pagefunc
2015 self._pagecount = pagecount
2016 self._pagesize = pagesize
2017
2018 def getslice(self, start=0, end=None):
2019 res = []
2020 start_page = start // self._pagesize
2021 end_page = (
2022 self._pagecount if end is None else (end // self._pagesize + 1))
2023 skip_elems = start - start_page * self._pagesize
2024 only_more = None if end is None else end - start
2025 for pagenum in range(start_page, end_page):
2026 page = list(self._pagefunc(pagenum))
2027 if skip_elems:
2028 page = page[skip_elems:]
2029 skip_elems = None
2030 if only_more is not None:
2031 if len(page) < only_more:
2032 only_more -= len(page)
2033 else:
2034 page = page[:only_more]
2035 res.extend(page)
2036 break
2037 res.extend(page)
2038 return res
2039
2040
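# --- Editorial usage sketch (not part of the original module) ---
# OnDemandPagedList fetches pages lazily (and caches them) while getslice()
# maps an item range onto page numbers, so only the pages covering the
# requested slice are requested. The page function below is a stand-in for a
# real paginated API call. Runnable standalone.
from youtube_dl.utils import OnDemandPagedList

def fetch_page(pagenum):
    # pretend every API page holds exactly 3 entries
    return ['video-%d' % (pagenum * 3 + i) for i in range(3)]

pl = OnDemandPagedList(fetch_page, 3)
assert pl.getslice(2, 5) == ['video-2', 'video-3', 'video-4']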
81c2f20b 2041def uppercase_escape(s):
676eb3f2 2042 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2043 return re.sub(
a612753d 2044 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2045 lambda m: unicode_escape(m.group(0))[0],
2046 s)
0fe2ff78
YCH
2047
2048
2049def lowercase_escape(s):
2050 unicode_escape = codecs.getdecoder('unicode_escape')
2051 return re.sub(
2052 r'\\u[0-9a-fA-F]{4}',
2053 lambda m: unicode_escape(m.group(0))[0],
2054 s)
b53466e1 2055
d05cfe06
S
2056
2057def escape_rfc3986(s):
2058 """Escape non-ASCII characters as suggested by RFC 3986"""
8f9312c3 2059 if sys.version_info < (3, 0) and isinstance(s, compat_str):
d05cfe06 2060 s = s.encode('utf-8')
ecc0c5ee 2061 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2062
2063
2064def escape_url(url):
2065 """Escape URL as suggested by RFC 3986"""
2066 url_parsed = compat_urllib_parse_urlparse(url)
2067 return url_parsed._replace(
efbed08d 2068 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
2069 path=escape_rfc3986(url_parsed.path),
2070 params=escape_rfc3986(url_parsed.params),
2071 query=escape_rfc3986(url_parsed.query),
2072 fragment=escape_rfc3986(url_parsed.fragment)
2073 ).geturl()
2074
62e609ab
PH
2075
2076def read_batch_urls(batch_fd):
2077 def fixup(url):
2078 if not isinstance(url, compat_str):
2079 url = url.decode('utf-8', 'replace')
28e614de 2080 BOM_UTF8 = '\xef\xbb\xbf'
62e609ab
PH
2081 if url.startswith(BOM_UTF8):
2082 url = url[len(BOM_UTF8):]
2083 url = url.strip()
2084 if url.startswith(('#', ';', ']')):
2085 return False
2086 return url
2087
2088 with contextlib.closing(batch_fd) as fd:
2089 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2090
2091
2092def urlencode_postdata(*args, **kargs):
15707c7e 2093 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2094
2095
38f9ef31 2096def update_url_query(url, query):
cacd9966
YCH
2097 if not query:
2098 return url
38f9ef31 2099 parsed_url = compat_urlparse.urlparse(url)
2100 qs = compat_parse_qs(parsed_url.query)
2101 qs.update(query)
2102 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 2103 query=compat_urllib_parse_urlencode(qs, True)))
16392824 2104
8e60dc75 2105
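# --- Editorial usage sketch (not part of the original module) ---
# update_url_query merges extra parameters into an existing query string,
# overwriting keys that are already present. Runnable standalone.
from youtube_dl.utils import update_url_query

new_url = update_url_query('http://example.com/path?quality=low',
                           {'quality': 'hd', 'token': 'abc'})
# -> 'http://example.com/path?quality=hd&token=abc' (parameter order may vary)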
ed0291d1
S
2106def update_Request(req, url=None, data=None, headers={}, query={}):
2107 req_headers = req.headers.copy()
2108 req_headers.update(headers)
2109 req_data = data or req.data
2110 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
2111 req_get_method = req.get_method()
2112 if req_get_method == 'HEAD':
2113 req_type = HEADRequest
2114 elif req_get_method == 'PUT':
2115 req_type = PUTRequest
2116 else:
2117 req_type = compat_urllib_request.Request
ed0291d1
S
2118 new_req = req_type(
2119 req_url, data=req_data, headers=req_headers,
2120 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2121 if hasattr(req, 'timeout'):
2122 new_req.timeout = req.timeout
2123 return new_req
2124
2125
10c87c15 2126def _multipart_encode_impl(data, boundary):
0c265486
YCH
2127 content_type = 'multipart/form-data; boundary=%s' % boundary
2128
2129 out = b''
2130 for k, v in data.items():
2131 out += b'--' + boundary.encode('ascii') + b'\r\n'
2132 if isinstance(k, compat_str):
2133 k = k.encode('utf-8')
2134 if isinstance(v, compat_str):
2135 v = v.encode('utf-8')
2136 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2137 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 2138 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
2139 if boundary.encode('ascii') in content:
2140 raise ValueError('Boundary overlaps with data')
2141 out += content
2142
2143 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2144
2145 return out, content_type
2146
2147
2148def multipart_encode(data, boundary=None):
2149 '''
2150 Encode a dict to RFC 7578-compliant form-data
2151
2152 data:
2153 A dict where keys and values can be either Unicode or bytes-like
2154 objects.
2155 boundary:
2156 If specified a Unicode object, it's used as the boundary. Otherwise
2157 a random boundary is generated.
2158
2159 Reference: https://tools.ietf.org/html/rfc7578
2160 '''
2161 has_specified_boundary = boundary is not None
2162
2163 while True:
2164 if boundary is None:
2165 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2166
2167 try:
10c87c15 2168 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
2169 break
2170 except ValueError:
2171 if has_specified_boundary:
2172 raise
2173 boundary = None
2174
2175 return out, content_type
2176
2177
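# --- Editorial usage sketch (not part of the original module) ---
# multipart_encode builds an RFC 7578 multipart/form-data body; the returned
# content type carries the (possibly auto-generated) boundary and is meant to
# be sent as the Content-Type header of the POST request. Runnable standalone.
from youtube_dl.utils import multipart_encode

body, content_type = multipart_encode(
    {'username': 'user', 'password': 'hunter2'}, boundary='----ytdl-example')
assert content_type == 'multipart/form-data; boundary=----ytdl-example'
assert b'name="username"' in body
assert body.endswith(b'------ytdl-example--\r\n')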
86296ad2 2178def dict_get(d, key_or_keys, default=None, skip_false_values=True):
cbecc9b9
S
2179 if isinstance(key_or_keys, (list, tuple)):
2180 for key in key_or_keys:
86296ad2
S
2181 if key not in d or d[key] is None or skip_false_values and not d[key]:
2182 continue
2183 return d[key]
cbecc9b9
S
2184 return default
2185 return d.get(key_or_keys, default)
2186
2187
329ca3be 2188def try_get(src, getter, expected_type=None):
a32a9a7e
S
2189 if not isinstance(getter, (list, tuple)):
2190 getter = [getter]
2191 for get in getter:
2192 try:
2193 v = get(src)
2194 except (AttributeError, KeyError, TypeError, IndexError):
2195 pass
2196 else:
2197 if expected_type is None or isinstance(v, expected_type):
2198 return v
329ca3be
S
2199
2200
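# --- Editorial usage sketch (not part of the original module) ---
# try_get digs into nested, possibly missing structures without try/except
# noise at the call site, while dict_get returns the first "truthy" value
# among several candidate keys. Runnable standalone.
from youtube_dl.utils import dict_get, try_get
from youtube_dl.compat import compat_str

meta = {'video': {'title': 'Example', 'tags': ['news', 'tech']}, 'rating': None, 'views': 0}
assert try_get(meta, lambda x: x['video']['tags'][1], compat_str) == 'tech'
assert try_get(meta, lambda x: x['video']['missing']['deep']) is None
assert dict_get(meta, ('rating', 'views', 'other'), default='n/a') == 'n/a'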
8e60dc75
S
2201def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2202 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2203
16392824 2204
a1a530b0
PH
2205US_RATINGS = {
2206 'G': 0,
2207 'PG': 10,
2208 'PG-13': 13,
2209 'R': 16,
2210 'NC': 18,
2211}
fac55558
PH
2212
2213
a8795327
S
2214TV_PARENTAL_GUIDELINES = {
2215 'TV-Y': 0,
2216 'TV-Y7': 7,
2217 'TV-G': 0,
2218 'TV-PG': 0,
2219 'TV-14': 14,
2220 'TV-MA': 17,
2221}
2222
2223
146c80e2 2224def parse_age_limit(s):
a8795327
S
2225 if type(s) == int:
2226 return s if 0 <= s <= 21 else None
2227 if not isinstance(s, compat_basestring):
d838b1bd 2228 return None
146c80e2 2229 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
2230 if m:
2231 return int(m.group('age'))
2232 if s in US_RATINGS:
2233 return US_RATINGS[s]
2234 return TV_PARENTAL_GUIDELINES.get(s)
146c80e2
S
2235
2236
fac55558 2237def strip_jsonp(code):
609a61e3 2238 return re.sub(
5552c9eb
YCH
2239 r'''(?sx)^
2240 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
2241 (?:\s*&&\s*(?P=func_name))?
2242 \s*\(\s*(?P<callback_data>.*)\);?
2243 \s*?(?://[^\n]*)*$''',
2244 r'\g<callback_data>', code)
478c2c61
PH
2245
2246
e05f6939 2247def js_to_json(code):
4195096e
S
2248 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
2249 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2250 INTEGER_TABLE = (
2251 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2252 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2253 )
2254
e05f6939 2255 def fix_kv(m):
e7b6d122
PH
2256 v = m.group(0)
2257 if v in ('true', 'false', 'null'):
2258 return v
b3ee552e 2259 elif v.startswith('/*') or v.startswith('//') or v == ',':
bd1e4844 2260 return ""
2261
2262 if v[0] in ("'", '"'):
2263 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 2264 '"': '\\"',
bd1e4844 2265 "\\'": "'",
2266 '\\\n': '',
2267 '\\x': '\\u00',
2268 }.get(m.group(0), m.group(0)), v[1:-1])
2269
89ac4a19
S
2270 for regex, base in INTEGER_TABLE:
2271 im = re.match(regex, v)
2272 if im:
e4659b45 2273 i = int(im.group(1), base)
89ac4a19
S
2274 return '"%d":' % i if v.endswith(':') else '%d' % i
2275
e7b6d122 2276 return '"%s"' % v
e05f6939 2277
bd1e4844 2278 return re.sub(r'''(?sx)
2279 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2280 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4195096e 2281 {comment}|,(?={skip}[\]}}])|
c384d537 2282 (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
4195096e
S
2283 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
2284 [0-9]+(?={skip}:)
2285 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
e05f6939
PH
2286
2287
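# --- Editorial usage sketch (not part of the original module) ---
# js_to_json turns the JavaScript object literals found in web pages into
# strict JSON: it quotes bare keys, converts single-quoted strings, expands
# hex/octal integers and drops comments and trailing commas. Runnable
# standalone.
import json
from youtube_dl.utils import js_to_json

js = "{abc: true, 'def': 'ghi', count: 0x1F,}"
assert json.loads(js_to_json(js)) == {'abc': True, 'def': 'ghi', 'count': 31}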
478c2c61
PH
2288def qualities(quality_ids):
2289 """ Get a numeric quality value out of a list of possible values """
2290 def q(qid):
2291 try:
2292 return quality_ids.index(qid)
2293 except ValueError:
2294 return -1
2295 return q
2296
acd69589
PH
2297
2298DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2299
a020a0dc
PH
2300
2301def limit_length(s, length):
2302 """ Add ellipses to overly long strings """
2303 if s is None:
2304 return None
2305 ELLIPSES = '...'
2306 if len(s) > length:
2307 return s[:length - len(ELLIPSES)] + ELLIPSES
2308 return s
48844745
PH
2309
2310
2311def version_tuple(v):
5f9b8394 2312 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
2313
2314
2315def is_outdated_version(version, limit, assume_new=True):
2316 if not version:
2317 return not assume_new
2318 try:
2319 return version_tuple(version) < version_tuple(limit)
2320 except ValueError:
2321 return not assume_new
732ea2f0
PH
2322
2323
2324def ytdl_is_updateable():
2325 """ Returns if youtube-dl can be updated with -U """
2326 from zipimport import zipimporter
2327
2328 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
7d4111ed
PH
2329
2330
2331def args_to_str(args):
2332 # Get a short string representation for a subprocess command
702ccf2d 2333 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
2334
2335
9b9c5355 2336def error_to_compat_str(err):
fdae2358
S
2337 err_str = str(err)
2338 # On Python 2, error byte strings must be decoded with the proper
2339 # encoding rather than ASCII
2340 if sys.version_info[0] < 3:
2341 err_str = err_str.decode(preferredencoding())
2342 return err_str
2343
2344
c460bdd5 2345def mimetype2ext(mt):
eb9ee194
S
2346 if mt is None:
2347 return None
2348
765ac263
JMF
2349 ext = {
2350 'audio/mp4': 'm4a',
6c33d24b
YCH
2351 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3
2352 # here since it is the most common one
2353 'audio/mpeg': 'mp3',
765ac263
JMF
2354 }.get(mt)
2355 if ext is not None:
2356 return ext
2357
c460bdd5 2358 _, _, res = mt.rpartition('/')
6562d34a 2359 res = res.split(';')[0].strip().lower()
c460bdd5
PH
2360
2361 return {
f6861ec9 2362 '3gpp': '3gp',
cafcf657 2363 'smptett+xml': 'tt',
cafcf657 2364 'ttaf+xml': 'dfxp',
a0d8d704 2365 'ttml+xml': 'ttml',
f6861ec9 2366 'x-flv': 'flv',
a0d8d704 2367 'x-mp4-fragmented': 'mp4',
d4f05d47 2368 'x-ms-sami': 'sami',
a0d8d704 2369 'x-ms-wmv': 'wmv',
b4173f15
RA
2370 'mpegurl': 'm3u8',
2371 'x-mpegurl': 'm3u8',
2372 'vnd.apple.mpegurl': 'm3u8',
2373 'dash+xml': 'mpd',
b4173f15 2374 'f4m+xml': 'f4m',
f164b971 2375 'hds+xml': 'f4m',
e910fe2f 2376 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 2377 'quicktime': 'mov',
98ce1a3f 2378 'mp2t': 'ts',
c460bdd5
PH
2379 }.get(res, res)
2380
2381
4f3c5e06 2382def parse_codecs(codecs_str):
2383 # http://tools.ietf.org/html/rfc6381
2384 if not codecs_str:
2385 return {}
2386 split_codecs = list(filter(None, map(
2387 lambda s: s.strip(), codecs_str.strip().strip(',').split(','))))
2388 vcodec, acodec = None, None
2389 for full_codec in split_codecs:
2390 codec = full_codec.split('.')[0]
ffe6979e 2391 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1'):
4f3c5e06 2392 if not vcodec:
2393 vcodec = full_codec
60f5c9fb 2394 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
4f3c5e06 2395 if not acodec:
2396 acodec = full_codec
2397 else:
60f5c9fb 2398 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
4f3c5e06 2399 if not vcodec and not acodec:
2400 if len(split_codecs) == 2:
2401 return {
2402 'vcodec': vcodec,
2403 'acodec': acodec,
2404 }
2405 elif len(split_codecs) == 1:
2406 return {
2407 'vcodec': 'none',
2408 'acodec': vcodec,
2409 }
2410 else:
2411 return {
2412 'vcodec': vcodec or 'none',
2413 'acodec': acodec or 'none',
2414 }
2415 return {}
2416
2417
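# --- Editorial usage sketch (not part of the original module) ---
# parse_codecs splits an RFC 6381 CODECS string into the vcodec/acodec fields
# used in info_dict formats; audio-only streams get vcodec 'none'. Runnable
# standalone.
from youtube_dl.utils import parse_codecs

assert parse_codecs('avc1.64001f, mp4a.40.2') == {
    'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2'}
assert parse_codecs('opus') == {'vcodec': 'none', 'acodec': 'opus'}
assert parse_codecs('') == {}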
2ccd1b10 2418def urlhandle_detect_ext(url_handle):
79298173 2419 getheader = url_handle.headers.get
2ccd1b10 2420
b55ee18f
PH
2421 cd = getheader('Content-Disposition')
2422 if cd:
2423 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2424 if m:
2425 e = determine_ext(m.group('filename'), default_ext=None)
2426 if e:
2427 return e
2428
c460bdd5 2429 return mimetype2ext(getheader('Content-Type'))
05900629
PH
2430
2431
1e399778
YCH
2432def encode_data_uri(data, mime_type):
2433 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
2434
2435
05900629 2436def age_restricted(content_limit, age_limit):
6ec6cb4e 2437 """ Returns True iff the content should be blocked """
05900629
PH
2438
2439 if age_limit is None: # No limit set
2440 return False
2441 if content_limit is None:
2442 return False # Content available for everyone
2443 return age_limit < content_limit
61ca9a80
PH
2444
2445
2446def is_html(first_bytes):
2447 """ Detect whether a file contains HTML by examining its first bytes. """
2448
2449 BOMS = [
2450 (b'\xef\xbb\xbf', 'utf-8'),
2451 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2452 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2453 (b'\xff\xfe', 'utf-16-le'),
2454 (b'\xfe\xff', 'utf-16-be'),
2455 ]
2456 for bom, enc in BOMS:
2457 if first_bytes.startswith(bom):
2458 s = first_bytes[len(bom):].decode(enc, 'replace')
2459 break
2460 else:
2461 s = first_bytes.decode('utf-8', 'replace')
2462
2463 return re.match(r'^\s*<', s)
a055469f
PH
2464
2465
2466def determine_protocol(info_dict):
2467 protocol = info_dict.get('protocol')
2468 if protocol is not None:
2469 return protocol
2470
2471 url = info_dict['url']
2472 if url.startswith('rtmp'):
2473 return 'rtmp'
2474 elif url.startswith('mms'):
2475 return 'mms'
2476 elif url.startswith('rtsp'):
2477 return 'rtsp'
2478
2479 ext = determine_ext(url)
2480 if ext == 'm3u8':
2481 return 'm3u8'
2482 elif ext == 'f4m':
2483 return 'f4m'
2484
2485 return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2486
2487
2488def render_table(header_row, data):
2489 """ Render a list of rows, each as a list of values """
2490 table = [header_row] + data
2491 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2492 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2493 return '\n'.join(format_str % tuple(row) for row in table)
347de493
PH
2494
2495
2496def _match_one(filter_part, dct):
2497 COMPARISON_OPERATORS = {
2498 '<': operator.lt,
2499 '<=': operator.le,
2500 '>': operator.gt,
2501 '>=': operator.ge,
2502 '=': operator.eq,
2503 '!=': operator.ne,
2504 }
2505 operator_rex = re.compile(r'''(?x)\s*
2506 (?P<key>[a-z_]+)
2507 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2508 (?:
2509 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
db13c16e 2510 (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
347de493
PH
2511 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2512 )
2513 \s*$
2514 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2515 m = operator_rex.search(filter_part)
2516 if m:
2517 op = COMPARISON_OPERATORS[m.group('op')]
e5a088dc 2518 actual_value = dct.get(m.group('key'))
db13c16e
S
2519 if (m.group('quotedstrval') is not None or
2520 m.group('strval') is not None or
e5a088dc
S
2521 # If the original field is a string and the matching comparison value is
2522 # a number, we should respect the type of the original field
2523 # and process the comparison value as a string (see
2524 # https://github.com/rg3/youtube-dl/issues/11082).
2525 actual_value is not None and m.group('intval') is not None and
2526 isinstance(actual_value, compat_str)):
347de493
PH
2527 if m.group('op') not in ('=', '!='):
2528 raise ValueError(
2529 'Operator %s does not support string values!' % m.group('op'))
db13c16e
S
2530 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
2531 quote = m.group('quote')
2532 if quote is not None:
2533 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
347de493
PH
2534 else:
2535 try:
2536 comparison_value = int(m.group('intval'))
2537 except ValueError:
2538 comparison_value = parse_filesize(m.group('intval'))
2539 if comparison_value is None:
2540 comparison_value = parse_filesize(m.group('intval') + 'B')
2541 if comparison_value is None:
2542 raise ValueError(
2543 'Invalid integer value %r in filter part %r' % (
2544 m.group('intval'), filter_part))
347de493
PH
2545 if actual_value is None:
2546 return m.group('none_inclusive')
2547 return op(actual_value, comparison_value)
2548
2549 UNARY_OPERATORS = {
2550 '': lambda v: v is not None,
2551 '!': lambda v: v is None,
2552 }
2553 operator_rex = re.compile(r'''(?x)\s*
2554 (?P<op>%s)\s*(?P<key>[a-z_]+)
2555 \s*$
2556 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2557 m = operator_rex.search(filter_part)
2558 if m:
2559 op = UNARY_OPERATORS[m.group('op')]
2560 actual_value = dct.get(m.group('key'))
2561 return op(actual_value)
2562
2563 raise ValueError('Invalid filter part %r' % filter_part)
2564
2565
2566def match_str(filter_str, dct):
2567 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2568
2569 return all(
2570 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2571
2572
2573def match_filter_func(filter_str):
2574 def _match_func(info_dict):
2575 if match_str(filter_str, info_dict):
2576 return None
2577 else:
2578 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2579 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2580 return _match_func
91410c9b
PH
2581
2582
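# --- Editorial usage sketch (not part of the original module) ---
# match_str implements the --match-filter mini-language: '&'-separated clauses
# with numeric comparisons, string comparisons and unary presence checks.
# match_filter_func wraps it into the callable expected by YoutubeDL. Runnable
# standalone.
from youtube_dl.utils import match_filter_func, match_str

video = {'title': 'Example', 'duration': 420, 'like_count': 8000, 'is_live': None}
assert match_str('duration > 60 & like_count >= 1000 & !is_live', video)
assert not match_str('duration < 60', video)
assert match_filter_func('!is_live')(video) is None  # None means "keep this video"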
bf6427d2
YCH
2583def parse_dfxp_time_expr(time_expr):
2584 if not time_expr:
d631d5f9 2585 return
bf6427d2
YCH
2586
2587 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2588 if mobj:
2589 return float(mobj.group('time_offset'))
2590
db2fe38b 2591 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 2592 if mobj:
db2fe38b 2593 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
2594
2595
c1c924ab
YCH
2596def srt_subtitles_timecode(seconds):
2597 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
bf6427d2
YCH
2598
2599
2600def dfxp2srt(dfxp_data):
3869028f
YCH
2601 '''
2602 @param dfxp_data A bytes-like object containing DFXP data
2603 @returns A unicode object containing converted SRT data
2604 '''
5b995f71 2605 LEGACY_NAMESPACES = (
3869028f
YCH
2606 (b'http://www.w3.org/ns/ttml', [
2607 b'http://www.w3.org/2004/11/ttaf1',
2608 b'http://www.w3.org/2006/04/ttaf1',
2609 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 2610 ]),
3869028f
YCH
2611 (b'http://www.w3.org/ns/ttml#styling', [
2612 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
2613 ]),
2614 )
2615
2616 SUPPORTED_STYLING = [
2617 'color',
2618 'fontFamily',
2619 'fontSize',
2620 'fontStyle',
2621 'fontWeight',
2622 'textDecoration'
2623 ]
2624
4e335771
YCH
2625 _x = functools.partial(xpath_with_ns, ns_map={
2626 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 2627 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 2628 })
bf6427d2 2629
5b995f71
RA
2630 styles = {}
2631 default_style = {}
2632
87de7069 2633 class TTMLPElementParser(object):
5b995f71
RA
2634 _out = ''
2635 _unclosed_elements = []
2636 _applied_styles = []
bf6427d2 2637
2b14cb56 2638 def start(self, tag, attrib):
5b995f71
RA
2639 if tag in (_x('ttml:br'), 'br'):
2640 self._out += '\n'
2641 else:
2642 unclosed_elements = []
2643 style = {}
2644 element_style_id = attrib.get('style')
2645 if default_style:
2646 style.update(default_style)
2647 if element_style_id:
2648 style.update(styles.get(element_style_id, {}))
2649 for prop in SUPPORTED_STYLING:
2650 prop_val = attrib.get(_x('tts:' + prop))
2651 if prop_val:
2652 style[prop] = prop_val
2653 if style:
2654 font = ''
2655 for k, v in sorted(style.items()):
2656 if self._applied_styles and self._applied_styles[-1].get(k) == v:
2657 continue
2658 if k == 'color':
2659 font += ' color="%s"' % v
2660 elif k == 'fontSize':
2661 font += ' size="%s"' % v
2662 elif k == 'fontFamily':
2663 font += ' face="%s"' % v
2664 elif k == 'fontWeight' and v == 'bold':
2665 self._out += '<b>'
2666 unclosed_elements.append('b')
2667 elif k == 'fontStyle' and v == 'italic':
2668 self._out += '<i>'
2669 unclosed_elements.append('i')
2670 elif k == 'textDecoration' and v == 'underline':
2671 self._out += '<u>'
2672 unclosed_elements.append('u')
2673 if font:
2674 self._out += '<font' + font + '>'
2675 unclosed_elements.append('font')
2676 applied_style = {}
2677 if self._applied_styles:
2678 applied_style.update(self._applied_styles[-1])
2679 applied_style.update(style)
2680 self._applied_styles.append(applied_style)
2681 self._unclosed_elements.append(unclosed_elements)
bf6427d2 2682
2b14cb56 2683 def end(self, tag):
5b995f71
RA
2684 if tag not in (_x('ttml:br'), 'br'):
2685 unclosed_elements = self._unclosed_elements.pop()
2686 for element in reversed(unclosed_elements):
2687 self._out += '</%s>' % element
2688 if unclosed_elements and self._applied_styles:
2689 self._applied_styles.pop()
bf6427d2 2690
2b14cb56 2691 def data(self, data):
5b995f71 2692 self._out += data
2b14cb56 2693
2694 def close(self):
5b995f71 2695 return self._out.strip()
2b14cb56 2696
2697 def parse_node(node):
2698 target = TTMLPElementParser()
2699 parser = xml.etree.ElementTree.XMLParser(target=target)
2700 parser.feed(xml.etree.ElementTree.tostring(node))
2701 return parser.close()
bf6427d2 2702
5b995f71
RA
2703 for k, v in LEGACY_NAMESPACES:
2704 for ns in v:
2705 dfxp_data = dfxp_data.replace(ns, k)
2706
3869028f 2707 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 2708 out = []
5b995f71 2709 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
2710
2711 if not paras:
2712 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 2713
5b995f71
RA
2714 repeat = False
2715 while True:
2716 for style in dfxp.findall(_x('.//ttml:style')):
2717 style_id = style.get('id')
2718 parent_style_id = style.get('style')
2719 if parent_style_id:
2720 if parent_style_id not in styles:
2721 repeat = True
2722 continue
2723 styles[style_id] = styles[parent_style_id].copy()
2724 for prop in SUPPORTED_STYLING:
2725 prop_val = style.get(_x('tts:' + prop))
2726 if prop_val:
2727 styles.setdefault(style_id, {})[prop] = prop_val
2728 if repeat:
2729 repeat = False
2730 else:
2731 break
2732
2733 for p in ('body', 'div'):
2734 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
2735 if ele is None:
2736 continue
2737 style = styles.get(ele.get('style'))
2738 if not style:
2739 continue
2740 default_style.update(style)
2741
bf6427d2 2742 for para, index in zip(paras, itertools.count(1)):
d631d5f9 2743 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 2744 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
2745 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2746 if begin_time is None:
2747 continue
7dff0363 2748 if not end_time:
d631d5f9
YCH
2749 if not dur:
2750 continue
2751 end_time = begin_time + dur
bf6427d2
YCH
2752 out.append('%d\n%s --> %s\n%s\n\n' % (
2753 index,
c1c924ab
YCH
2754 srt_subtitles_timecode(begin_time),
2755 srt_subtitles_timecode(end_time),
bf6427d2
YCH
2756 parse_node(para)))
2757
2758 return ''.join(out)
2759
2760
66e289ba
S
2761def cli_option(params, command_option, param):
2762 param = params.get(param)
98e698f1
RA
2763 if param:
2764 param = compat_str(param)
66e289ba
S
2765 return [command_option, param] if param is not None else []
2766
2767
2768def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2769 param = params.get(param)
5b232f46
S
2770 if param is None:
2771 return []
66e289ba
S
2772 assert isinstance(param, bool)
2773 if separator:
2774 return [command_option + separator + (true_value if param else false_value)]
2775 return [command_option, true_value if param else false_value]
2776
2777
2778def cli_valueless_option(params, command_option, param, expected_value=True):
2779 param = params.get(param)
2780 return [command_option] if param == expected_value else []
2781
2782
2783def cli_configuration_args(params, param, default=[]):
2784 ex_args = params.get(param)
2785 if ex_args is None:
2786 return default
2787 assert isinstance(ex_args, list)
2788 return ex_args
2789
2790
39672624
YCH
2791class ISO639Utils(object):
2792 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2793 _lang_map = {
2794 'aa': 'aar',
2795 'ab': 'abk',
2796 'ae': 'ave',
2797 'af': 'afr',
2798 'ak': 'aka',
2799 'am': 'amh',
2800 'an': 'arg',
2801 'ar': 'ara',
2802 'as': 'asm',
2803 'av': 'ava',
2804 'ay': 'aym',
2805 'az': 'aze',
2806 'ba': 'bak',
2807 'be': 'bel',
2808 'bg': 'bul',
2809 'bh': 'bih',
2810 'bi': 'bis',
2811 'bm': 'bam',
2812 'bn': 'ben',
2813 'bo': 'bod',
2814 'br': 'bre',
2815 'bs': 'bos',
2816 'ca': 'cat',
2817 'ce': 'che',
2818 'ch': 'cha',
2819 'co': 'cos',
2820 'cr': 'cre',
2821 'cs': 'ces',
2822 'cu': 'chu',
2823 'cv': 'chv',
2824 'cy': 'cym',
2825 'da': 'dan',
2826 'de': 'deu',
2827 'dv': 'div',
2828 'dz': 'dzo',
2829 'ee': 'ewe',
2830 'el': 'ell',
2831 'en': 'eng',
2832 'eo': 'epo',
2833 'es': 'spa',
2834 'et': 'est',
2835 'eu': 'eus',
2836 'fa': 'fas',
2837 'ff': 'ful',
2838 'fi': 'fin',
2839 'fj': 'fij',
2840 'fo': 'fao',
2841 'fr': 'fra',
2842 'fy': 'fry',
2843 'ga': 'gle',
2844 'gd': 'gla',
2845 'gl': 'glg',
2846 'gn': 'grn',
2847 'gu': 'guj',
2848 'gv': 'glv',
2849 'ha': 'hau',
2850 'he': 'heb',
2851 'hi': 'hin',
2852 'ho': 'hmo',
2853 'hr': 'hrv',
2854 'ht': 'hat',
2855 'hu': 'hun',
2856 'hy': 'hye',
2857 'hz': 'her',
2858 'ia': 'ina',
2859 'id': 'ind',
2860 'ie': 'ile',
2861 'ig': 'ibo',
2862 'ii': 'iii',
2863 'ik': 'ipk',
2864 'io': 'ido',
2865 'is': 'isl',
2866 'it': 'ita',
2867 'iu': 'iku',
2868 'ja': 'jpn',
2869 'jv': 'jav',
2870 'ka': 'kat',
2871 'kg': 'kon',
2872 'ki': 'kik',
2873 'kj': 'kua',
2874 'kk': 'kaz',
2875 'kl': 'kal',
2876 'km': 'khm',
2877 'kn': 'kan',
2878 'ko': 'kor',
2879 'kr': 'kau',
2880 'ks': 'kas',
2881 'ku': 'kur',
2882 'kv': 'kom',
2883 'kw': 'cor',
2884 'ky': 'kir',
2885 'la': 'lat',
2886 'lb': 'ltz',
2887 'lg': 'lug',
2888 'li': 'lim',
2889 'ln': 'lin',
2890 'lo': 'lao',
2891 'lt': 'lit',
2892 'lu': 'lub',
2893 'lv': 'lav',
2894 'mg': 'mlg',
2895 'mh': 'mah',
2896 'mi': 'mri',
2897 'mk': 'mkd',
2898 'ml': 'mal',
2899 'mn': 'mon',
2900 'mr': 'mar',
2901 'ms': 'msa',
2902 'mt': 'mlt',
2903 'my': 'mya',
2904 'na': 'nau',
2905 'nb': 'nob',
2906 'nd': 'nde',
2907 'ne': 'nep',
2908 'ng': 'ndo',
2909 'nl': 'nld',
2910 'nn': 'nno',
2911 'no': 'nor',
2912 'nr': 'nbl',
2913 'nv': 'nav',
2914 'ny': 'nya',
2915 'oc': 'oci',
2916 'oj': 'oji',
2917 'om': 'orm',
2918 'or': 'ori',
2919 'os': 'oss',
2920 'pa': 'pan',
2921 'pi': 'pli',
2922 'pl': 'pol',
2923 'ps': 'pus',
2924 'pt': 'por',
2925 'qu': 'que',
2926 'rm': 'roh',
2927 'rn': 'run',
2928 'ro': 'ron',
2929 'ru': 'rus',
2930 'rw': 'kin',
2931 'sa': 'san',
2932 'sc': 'srd',
2933 'sd': 'snd',
2934 'se': 'sme',
2935 'sg': 'sag',
2936 'si': 'sin',
2937 'sk': 'slk',
2938 'sl': 'slv',
2939 'sm': 'smo',
2940 'sn': 'sna',
2941 'so': 'som',
2942 'sq': 'sqi',
2943 'sr': 'srp',
2944 'ss': 'ssw',
2945 'st': 'sot',
2946 'su': 'sun',
2947 'sv': 'swe',
2948 'sw': 'swa',
2949 'ta': 'tam',
2950 'te': 'tel',
2951 'tg': 'tgk',
2952 'th': 'tha',
2953 'ti': 'tir',
2954 'tk': 'tuk',
2955 'tl': 'tgl',
2956 'tn': 'tsn',
2957 'to': 'ton',
2958 'tr': 'tur',
2959 'ts': 'tso',
2960 'tt': 'tat',
2961 'tw': 'twi',
2962 'ty': 'tah',
2963 'ug': 'uig',
2964 'uk': 'ukr',
2965 'ur': 'urd',
2966 'uz': 'uzb',
2967 've': 'ven',
2968 'vi': 'vie',
2969 'vo': 'vol',
2970 'wa': 'wln',
2971 'wo': 'wol',
2972 'xh': 'xho',
2973 'yi': 'yid',
2974 'yo': 'yor',
2975 'za': 'zha',
2976 'zh': 'zho',
2977 'zu': 'zul',
2978 }
2979
2980 @classmethod
2981 def short2long(cls, code):
2982 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2983 return cls._lang_map.get(code[:2])
2984
2985 @classmethod
2986 def long2short(cls, code):
2987 """Convert language code from ISO 639-2/T to ISO 639-1"""
2988 for short_name, long_name in cls._lang_map.items():
2989 if long_name == code:
2990 return short_name
2991
2992
4eb10f66
YCH
2993class ISO3166Utils(object):
2994 # From http://data.okfn.org/data/core/country-list
2995 _country_map = {
2996 'AF': 'Afghanistan',
2997 'AX': 'Åland Islands',
2998 'AL': 'Albania',
2999 'DZ': 'Algeria',
3000 'AS': 'American Samoa',
3001 'AD': 'Andorra',
3002 'AO': 'Angola',
3003 'AI': 'Anguilla',
3004 'AQ': 'Antarctica',
3005 'AG': 'Antigua and Barbuda',
3006 'AR': 'Argentina',
3007 'AM': 'Armenia',
3008 'AW': 'Aruba',
3009 'AU': 'Australia',
3010 'AT': 'Austria',
3011 'AZ': 'Azerbaijan',
3012 'BS': 'Bahamas',
3013 'BH': 'Bahrain',
3014 'BD': 'Bangladesh',
3015 'BB': 'Barbados',
3016 'BY': 'Belarus',
3017 'BE': 'Belgium',
3018 'BZ': 'Belize',
3019 'BJ': 'Benin',
3020 'BM': 'Bermuda',
3021 'BT': 'Bhutan',
3022 'BO': 'Bolivia, Plurinational State of',
3023 'BQ': 'Bonaire, Sint Eustatius and Saba',
3024 'BA': 'Bosnia and Herzegovina',
3025 'BW': 'Botswana',
3026 'BV': 'Bouvet Island',
3027 'BR': 'Brazil',
3028 'IO': 'British Indian Ocean Territory',
3029 'BN': 'Brunei Darussalam',
3030 'BG': 'Bulgaria',
3031 'BF': 'Burkina Faso',
3032 'BI': 'Burundi',
3033 'KH': 'Cambodia',
3034 'CM': 'Cameroon',
3035 'CA': 'Canada',
3036 'CV': 'Cape Verde',
3037 'KY': 'Cayman Islands',
3038 'CF': 'Central African Republic',
3039 'TD': 'Chad',
3040 'CL': 'Chile',
3041 'CN': 'China',
3042 'CX': 'Christmas Island',
3043 'CC': 'Cocos (Keeling) Islands',
3044 'CO': 'Colombia',
3045 'KM': 'Comoros',
3046 'CG': 'Congo',
3047 'CD': 'Congo, the Democratic Republic of the',
3048 'CK': 'Cook Islands',
3049 'CR': 'Costa Rica',
3050 'CI': 'Côte d\'Ivoire',
3051 'HR': 'Croatia',
3052 'CU': 'Cuba',
3053 'CW': 'Curaçao',
3054 'CY': 'Cyprus',
3055 'CZ': 'Czech Republic',
3056 'DK': 'Denmark',
3057 'DJ': 'Djibouti',
3058 'DM': 'Dominica',
3059 'DO': 'Dominican Republic',
3060 'EC': 'Ecuador',
3061 'EG': 'Egypt',
3062 'SV': 'El Salvador',
3063 'GQ': 'Equatorial Guinea',
3064 'ER': 'Eritrea',
3065 'EE': 'Estonia',
3066 'ET': 'Ethiopia',
3067 'FK': 'Falkland Islands (Malvinas)',
3068 'FO': 'Faroe Islands',
3069 'FJ': 'Fiji',
3070 'FI': 'Finland',
3071 'FR': 'France',
3072 'GF': 'French Guiana',
3073 'PF': 'French Polynesia',
3074 'TF': 'French Southern Territories',
3075 'GA': 'Gabon',
3076 'GM': 'Gambia',
3077 'GE': 'Georgia',
3078 'DE': 'Germany',
3079 'GH': 'Ghana',
3080 'GI': 'Gibraltar',
3081 'GR': 'Greece',
3082 'GL': 'Greenland',
3083 'GD': 'Grenada',
3084 'GP': 'Guadeloupe',
3085 'GU': 'Guam',
3086 'GT': 'Guatemala',
3087 'GG': 'Guernsey',
3088 'GN': 'Guinea',
3089 'GW': 'Guinea-Bissau',
3090 'GY': 'Guyana',
3091 'HT': 'Haiti',
3092 'HM': 'Heard Island and McDonald Islands',
3093 'VA': 'Holy See (Vatican City State)',
3094 'HN': 'Honduras',
3095 'HK': 'Hong Kong',
3096 'HU': 'Hungary',
3097 'IS': 'Iceland',
3098 'IN': 'India',
3099 'ID': 'Indonesia',
3100 'IR': 'Iran, Islamic Republic of',
3101 'IQ': 'Iraq',
3102 'IE': 'Ireland',
3103 'IM': 'Isle of Man',
3104 'IL': 'Israel',
3105 'IT': 'Italy',
3106 'JM': 'Jamaica',
3107 'JP': 'Japan',
3108 'JE': 'Jersey',
3109 'JO': 'Jordan',
3110 'KZ': 'Kazakhstan',
3111 'KE': 'Kenya',
3112 'KI': 'Kiribati',
3113 'KP': 'Korea, Democratic People\'s Republic of',
3114 'KR': 'Korea, Republic of',
3115 'KW': 'Kuwait',
3116 'KG': 'Kyrgyzstan',
3117 'LA': 'Lao People\'s Democratic Republic',
3118 'LV': 'Latvia',
3119 'LB': 'Lebanon',
3120 'LS': 'Lesotho',
3121 'LR': 'Liberia',
3122 'LY': 'Libya',
3123 'LI': 'Liechtenstein',
3124 'LT': 'Lithuania',
3125 'LU': 'Luxembourg',
3126 'MO': 'Macao',
3127 'MK': 'Macedonia, the Former Yugoslav Republic of',
3128 'MG': 'Madagascar',
3129 'MW': 'Malawi',
3130 'MY': 'Malaysia',
3131 'MV': 'Maldives',
3132 'ML': 'Mali',
3133 'MT': 'Malta',
3134 'MH': 'Marshall Islands',
3135 'MQ': 'Martinique',
3136 'MR': 'Mauritania',
3137 'MU': 'Mauritius',
3138 'YT': 'Mayotte',
3139 'MX': 'Mexico',
3140 'FM': 'Micronesia, Federated States of',
3141 'MD': 'Moldova, Republic of',
3142 'MC': 'Monaco',
3143 'MN': 'Mongolia',
3144 'ME': 'Montenegro',
3145 'MS': 'Montserrat',
3146 'MA': 'Morocco',
3147 'MZ': 'Mozambique',
3148 'MM': 'Myanmar',
3149 'NA': 'Namibia',
3150 'NR': 'Nauru',
3151 'NP': 'Nepal',
3152 'NL': 'Netherlands',
3153 'NC': 'New Caledonia',
3154 'NZ': 'New Zealand',
3155 'NI': 'Nicaragua',
3156 'NE': 'Niger',
3157 'NG': 'Nigeria',
3158 'NU': 'Niue',
3159 'NF': 'Norfolk Island',
3160 'MP': 'Northern Mariana Islands',
3161 'NO': 'Norway',
3162 'OM': 'Oman',
3163 'PK': 'Pakistan',
3164 'PW': 'Palau',
3165 'PS': 'Palestine, State of',
3166 'PA': 'Panama',
3167 'PG': 'Papua New Guinea',
3168 'PY': 'Paraguay',
3169 'PE': 'Peru',
3170 'PH': 'Philippines',
3171 'PN': 'Pitcairn',
3172 'PL': 'Poland',
3173 'PT': 'Portugal',
3174 'PR': 'Puerto Rico',
3175 'QA': 'Qatar',
3176 'RE': 'Réunion',
3177 'RO': 'Romania',
3178 'RU': 'Russian Federation',
3179 'RW': 'Rwanda',
3180 'BL': 'Saint Barthélemy',
3181 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3182 'KN': 'Saint Kitts and Nevis',
3183 'LC': 'Saint Lucia',
3184 'MF': 'Saint Martin (French part)',
3185 'PM': 'Saint Pierre and Miquelon',
3186 'VC': 'Saint Vincent and the Grenadines',
3187 'WS': 'Samoa',
3188 'SM': 'San Marino',
3189 'ST': 'Sao Tome and Principe',
3190 'SA': 'Saudi Arabia',
3191 'SN': 'Senegal',
3192 'RS': 'Serbia',
3193 'SC': 'Seychelles',
3194 'SL': 'Sierra Leone',
3195 'SG': 'Singapore',
3196 'SX': 'Sint Maarten (Dutch part)',
3197 'SK': 'Slovakia',
3198 'SI': 'Slovenia',
3199 'SB': 'Solomon Islands',
3200 'SO': 'Somalia',
3201 'ZA': 'South Africa',
3202 'GS': 'South Georgia and the South Sandwich Islands',
3203 'SS': 'South Sudan',
3204 'ES': 'Spain',
3205 'LK': 'Sri Lanka',
3206 'SD': 'Sudan',
3207 'SR': 'Suriname',
3208 'SJ': 'Svalbard and Jan Mayen',
3209 'SZ': 'Swaziland',
3210 'SE': 'Sweden',
3211 'CH': 'Switzerland',
3212 'SY': 'Syrian Arab Republic',
3213 'TW': 'Taiwan, Province of China',
3214 'TJ': 'Tajikistan',
3215 'TZ': 'Tanzania, United Republic of',
3216 'TH': 'Thailand',
3217 'TL': 'Timor-Leste',
3218 'TG': 'Togo',
3219 'TK': 'Tokelau',
3220 'TO': 'Tonga',
3221 'TT': 'Trinidad and Tobago',
3222 'TN': 'Tunisia',
3223 'TR': 'Turkey',
3224 'TM': 'Turkmenistan',
3225 'TC': 'Turks and Caicos Islands',
3226 'TV': 'Tuvalu',
3227 'UG': 'Uganda',
3228 'UA': 'Ukraine',
3229 'AE': 'United Arab Emirates',
3230 'GB': 'United Kingdom',
3231 'US': 'United States',
3232 'UM': 'United States Minor Outlying Islands',
3233 'UY': 'Uruguay',
3234 'UZ': 'Uzbekistan',
3235 'VU': 'Vanuatu',
3236 'VE': 'Venezuela, Bolivarian Republic of',
3237 'VN': 'Viet Nam',
3238 'VG': 'Virgin Islands, British',
3239 'VI': 'Virgin Islands, U.S.',
3240 'WF': 'Wallis and Futuna',
3241 'EH': 'Western Sahara',
3242 'YE': 'Yemen',
3243 'ZM': 'Zambia',
3244 'ZW': 'Zimbabwe',
3245 }
3246
3247 @classmethod
3248 def short2full(cls, code):
3249 """Convert an ISO 3166-2 country code to the corresponding full name"""
3250 return cls._country_map.get(code.upper())
3251
3252
773f291d
S
3253class GeoUtils(object):
3254 # Major IPv4 address blocks per country
3255 _country_ip_map = {
3256 'AD': '85.94.160.0/19',
3257 'AE': '94.200.0.0/13',
3258 'AF': '149.54.0.0/17',
3259 'AG': '209.59.64.0/18',
3260 'AI': '204.14.248.0/21',
3261 'AL': '46.99.0.0/16',
3262 'AM': '46.70.0.0/15',
3263 'AO': '105.168.0.0/13',
3264 'AP': '159.117.192.0/21',
3265 'AR': '181.0.0.0/12',
3266 'AS': '202.70.112.0/20',
3267 'AT': '84.112.0.0/13',
3268 'AU': '1.128.0.0/11',
3269 'AW': '181.41.0.0/18',
3270 'AZ': '5.191.0.0/16',
3271 'BA': '31.176.128.0/17',
3272 'BB': '65.48.128.0/17',
3273 'BD': '114.130.0.0/16',
3274 'BE': '57.0.0.0/8',
3275 'BF': '129.45.128.0/17',
3276 'BG': '95.42.0.0/15',
3277 'BH': '37.131.0.0/17',
3278 'BI': '154.117.192.0/18',
3279 'BJ': '137.255.0.0/16',
3280 'BL': '192.131.134.0/24',
3281 'BM': '196.12.64.0/18',
3282 'BN': '156.31.0.0/16',
3283 'BO': '161.56.0.0/16',
3284 'BQ': '161.0.80.0/20',
3285 'BR': '152.240.0.0/12',
3286 'BS': '24.51.64.0/18',
3287 'BT': '119.2.96.0/19',
3288 'BW': '168.167.0.0/16',
3289 'BY': '178.120.0.0/13',
3290 'BZ': '179.42.192.0/18',
3291 'CA': '99.224.0.0/11',
3292 'CD': '41.243.0.0/16',
3293 'CF': '196.32.200.0/21',
3294 'CG': '197.214.128.0/17',
3295 'CH': '85.0.0.0/13',
3296 'CI': '154.232.0.0/14',
3297 'CK': '202.65.32.0/19',
3298 'CL': '152.172.0.0/14',
3299 'CM': '165.210.0.0/15',
3300 'CN': '36.128.0.0/10',
3301 'CO': '181.240.0.0/12',
3302 'CR': '201.192.0.0/12',
3303 'CU': '152.206.0.0/15',
3304 'CV': '165.90.96.0/19',
3305 'CW': '190.88.128.0/17',
3306 'CY': '46.198.0.0/15',
3307 'CZ': '88.100.0.0/14',
3308 'DE': '53.0.0.0/8',
3309 'DJ': '197.241.0.0/17',
3310 'DK': '87.48.0.0/12',
3311 'DM': '192.243.48.0/20',
3312 'DO': '152.166.0.0/15',
3313 'DZ': '41.96.0.0/12',
3314 'EC': '186.68.0.0/15',
3315 'EE': '90.190.0.0/15',
3316 'EG': '156.160.0.0/11',
3317 'ER': '196.200.96.0/20',
3318 'ES': '88.0.0.0/11',
3319 'ET': '196.188.0.0/14',
3320 'EU': '2.16.0.0/13',
3321 'FI': '91.152.0.0/13',
3322 'FJ': '144.120.0.0/16',
3323 'FM': '119.252.112.0/20',
3324 'FO': '88.85.32.0/19',
3325 'FR': '90.0.0.0/9',
3326 'GA': '41.158.0.0/15',
3327 'GB': '25.0.0.0/8',
3328 'GD': '74.122.88.0/21',
3329 'GE': '31.146.0.0/16',
3330 'GF': '161.22.64.0/18',
3331 'GG': '62.68.160.0/19',
3332 'GH': '45.208.0.0/14',
3333 'GI': '85.115.128.0/19',
3334 'GL': '88.83.0.0/19',
3335 'GM': '160.182.0.0/15',
3336 'GN': '197.149.192.0/18',
3337 'GP': '104.250.0.0/19',
3338 'GQ': '105.235.224.0/20',
3339 'GR': '94.64.0.0/13',
3340 'GT': '168.234.0.0/16',
3341 'GU': '168.123.0.0/16',
3342 'GW': '197.214.80.0/20',
3343 'GY': '181.41.64.0/18',
3344 'HK': '113.252.0.0/14',
3345 'HN': '181.210.0.0/16',
3346 'HR': '93.136.0.0/13',
3347 'HT': '148.102.128.0/17',
3348 'HU': '84.0.0.0/14',
3349 'ID': '39.192.0.0/10',
3350 'IE': '87.32.0.0/12',
3351 'IL': '79.176.0.0/13',
3352 'IM': '5.62.80.0/20',
3353 'IN': '117.192.0.0/10',
3354 'IO': '203.83.48.0/21',
3355 'IQ': '37.236.0.0/14',
3356 'IR': '2.176.0.0/12',
3357 'IS': '82.221.0.0/16',
3358 'IT': '79.0.0.0/10',
3359 'JE': '87.244.64.0/18',
3360 'JM': '72.27.0.0/17',
3361 'JO': '176.29.0.0/16',
3362 'JP': '126.0.0.0/8',
3363 'KE': '105.48.0.0/12',
3364 'KG': '158.181.128.0/17',
3365 'KH': '36.37.128.0/17',
3366 'KI': '103.25.140.0/22',
3367 'KM': '197.255.224.0/20',
3368 'KN': '198.32.32.0/19',
3369 'KP': '175.45.176.0/22',
3370 'KR': '175.192.0.0/10',
3371 'KW': '37.36.0.0/14',
3372 'KY': '64.96.0.0/15',
3373 'KZ': '2.72.0.0/13',
3374 'LA': '115.84.64.0/18',
3375 'LB': '178.135.0.0/16',
3376 'LC': '192.147.231.0/24',
3377 'LI': '82.117.0.0/19',
3378 'LK': '112.134.0.0/15',
3379 'LR': '41.86.0.0/19',
3380 'LS': '129.232.0.0/17',
3381 'LT': '78.56.0.0/13',
3382 'LU': '188.42.0.0/16',
3383 'LV': '46.109.0.0/16',
3384 'LY': '41.252.0.0/14',
3385 'MA': '105.128.0.0/11',
3386 'MC': '88.209.64.0/18',
3387 'MD': '37.246.0.0/16',
3388 'ME': '178.175.0.0/17',
3389 'MF': '74.112.232.0/21',
3390 'MG': '154.126.0.0/17',
3391 'MH': '117.103.88.0/21',
3392 'MK': '77.28.0.0/15',
3393 'ML': '154.118.128.0/18',
3394 'MM': '37.111.0.0/17',
3395 'MN': '49.0.128.0/17',
3396 'MO': '60.246.0.0/16',
3397 'MP': '202.88.64.0/20',
3398 'MQ': '109.203.224.0/19',
3399 'MR': '41.188.64.0/18',
3400 'MS': '208.90.112.0/22',
3401 'MT': '46.11.0.0/16',
3402 'MU': '105.16.0.0/12',
3403 'MV': '27.114.128.0/18',
3404 'MW': '105.234.0.0/16',
3405 'MX': '187.192.0.0/11',
3406 'MY': '175.136.0.0/13',
3407 'MZ': '197.218.0.0/15',
3408 'NA': '41.182.0.0/16',
3409 'NC': '101.101.0.0/18',
3410 'NE': '197.214.0.0/18',
3411 'NF': '203.17.240.0/22',
3412 'NG': '105.112.0.0/12',
3413 'NI': '186.76.0.0/15',
3414 'NL': '145.96.0.0/11',
3415 'NO': '84.208.0.0/13',
3416 'NP': '36.252.0.0/15',
3417 'NR': '203.98.224.0/19',
3418 'NU': '49.156.48.0/22',
3419 'NZ': '49.224.0.0/14',
3420 'OM': '5.36.0.0/15',
3421 'PA': '186.72.0.0/15',
3422 'PE': '186.160.0.0/14',
3423 'PF': '123.50.64.0/18',
3424 'PG': '124.240.192.0/19',
3425 'PH': '49.144.0.0/13',
3426 'PK': '39.32.0.0/11',
3427 'PL': '83.0.0.0/11',
3428 'PM': '70.36.0.0/20',
3429 'PR': '66.50.0.0/16',
3430 'PS': '188.161.0.0/16',
3431 'PT': '85.240.0.0/13',
3432 'PW': '202.124.224.0/20',
3433 'PY': '181.120.0.0/14',
3434 'QA': '37.210.0.0/15',
3435 'RE': '139.26.0.0/16',
3436 'RO': '79.112.0.0/13',
3437 'RS': '178.220.0.0/14',
3438 'RU': '5.136.0.0/13',
3439 'RW': '105.178.0.0/15',
3440 'SA': '188.48.0.0/13',
3441 'SB': '202.1.160.0/19',
3442 'SC': '154.192.0.0/11',
3443 'SD': '154.96.0.0/13',
3444 'SE': '78.64.0.0/12',
3445 'SG': '152.56.0.0/14',
3446 'SI': '188.196.0.0/14',
3447 'SK': '78.98.0.0/15',
3448 'SL': '197.215.0.0/17',
3449 'SM': '89.186.32.0/19',
3450 'SN': '41.82.0.0/15',
3451 'SO': '197.220.64.0/19',
3452 'SR': '186.179.128.0/17',
3453 'SS': '105.235.208.0/21',
3454 'ST': '197.159.160.0/19',
3455 'SV': '168.243.0.0/16',
3456 'SX': '190.102.0.0/20',
3457 'SY': '5.0.0.0/16',
3458 'SZ': '41.84.224.0/19',
3459 'TC': '65.255.48.0/20',
3460 'TD': '154.68.128.0/19',
3461 'TG': '196.168.0.0/14',
3462 'TH': '171.96.0.0/13',
3463 'TJ': '85.9.128.0/18',
3464 'TK': '27.96.24.0/21',
3465 'TL': '180.189.160.0/20',
3466 'TM': '95.85.96.0/19',
3467 'TN': '197.0.0.0/11',
3468 'TO': '175.176.144.0/21',
3469 'TR': '78.160.0.0/11',
3470 'TT': '186.44.0.0/15',
3471 'TV': '202.2.96.0/19',
3472 'TW': '120.96.0.0/11',
3473 'TZ': '156.156.0.0/14',
3474 'UA': '93.72.0.0/13',
3475 'UG': '154.224.0.0/13',
3476 'US': '3.0.0.0/8',
3477 'UY': '167.56.0.0/13',
3478 'UZ': '82.215.64.0/18',
3479 'VA': '212.77.0.0/19',
3480 'VC': '24.92.144.0/20',
3481 'VE': '186.88.0.0/13',
3482 'VG': '172.103.64.0/18',
3483 'VI': '146.226.0.0/16',
3484 'VN': '14.160.0.0/11',
3485 'VU': '202.80.32.0/20',
3486 'WF': '117.20.32.0/21',
3487 'WS': '202.4.32.0/19',
3488 'YE': '134.35.0.0/16',
3489 'YT': '41.242.116.0/22',
3490 'ZA': '41.0.0.0/11',
3491 'ZM': '165.56.0.0/13',
3492 'ZW': '41.85.192.0/19',
3493 }
3494
3495 @classmethod
3496 def random_ipv4(cls, code):
3497 block = cls._country_ip_map.get(code.upper())
3498 if not block:
3499 return None
3500 addr, preflen = block.split('/')
3501 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
3502 addr_max = addr_min | (0xffffffff >> int(preflen))
18a0defa 3503 return compat_str(socket.inet_ntoa(
4248dad9 3504 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
3505
3506
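# --- Editorial usage sketch (not part of the original module) ---
# GeoUtils.random_ipv4 picks a random address inside the country's major IPv4
# block; the geo-bypass code uses such addresses (e.g. in an X-Forwarded-For
# header) to fake the client location. Runnable standalone, though the exact
# address returned is random.
from youtube_dl.utils import GeoUtils

ip = GeoUtils.random_ipv4('de')
assert ip is not None and ip.startswith('53.')   # 'DE' maps to 53.0.0.0/8
assert GeoUtils.random_ipv4('XX') is None        # unknown country code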
91410c9b 3507class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2461f79d
PH
3508 def __init__(self, proxies=None):
3509 # Set default handlers
3510 for type in ('http', 'https'):
3511 setattr(self, '%s_open' % type,
3512 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
3513 meth(r, proxy, type))
3514 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
3515
91410c9b 3516 def proxy_open(self, req, proxy, type):
2461f79d 3517 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
3518 if req_proxy is not None:
3519 proxy = req_proxy
2461f79d
PH
3520 del req.headers['Ytdl-request-proxy']
3521
3522 if proxy == '__noproxy__':
3523 return None # No Proxy
51fb4995 3524 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188
YCH
3525 req.add_header('Ytdl-socks-proxy', proxy)
3526 # youtube-dl's http/https handlers take care of wrapping the socket with SOCKS
3527 return None
91410c9b
PH
3528 return compat_urllib_request.ProxyHandler.proxy_open(
3529 self, req, proxy, type)
5bc880b9
YCH
3530
3531
0a5445dd
YCH
3532# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3533# released into Public Domain
3534# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3535
3536def long_to_bytes(n, blocksize=0):
3537 """long_to_bytes(n:long, blocksize:int) : string
3538 Convert a long integer to a byte string.
3539
3540 If optional blocksize is given and greater than zero, pad the front of the
3541 byte string with binary zeros so that the length is a multiple of
3542 blocksize.
3543 """
3544 # after much testing, this algorithm was deemed to be the fastest
3545 s = b''
3546 n = int(n)
3547 while n > 0:
3548 s = compat_struct_pack('>I', n & 0xffffffff) + s
3549 n = n >> 32
3550 # strip off leading zeros
3551 for i in range(len(s)):
3552 if s[i] != b'\000'[0]:
3553 break
3554 else:
3555 # only happens when n == 0
3556 s = b'\000'
3557 i = 0
3558 s = s[i:]
3559 # add back some pad bytes. this could be done more efficiently w.r.t. the
3560 # de-padding being done above, but sigh...
3561 if blocksize > 0 and len(s) % blocksize:
3562 s = (blocksize - len(s) % blocksize) * b'\000' + s
3563 return s
3564
3565
3566def bytes_to_long(s):
3567 """bytes_to_long(string) : long
3568 Convert a byte string to a long integer.
3569
3570 This is (essentially) the inverse of long_to_bytes().
3571 """
3572 acc = 0
3573 length = len(s)
3574 if length % 4:
3575 extra = (4 - length % 4)
3576 s = b'\000' * extra + s
3577 length = length + extra
3578 for i in range(0, length, 4):
3579 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
3580 return acc
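# Round-trip sketch (Python 3 reprs shown):
#
#   >>> long_to_bytes(65537)
#   b'\x01\x00\x01'
#   >>> long_to_bytes(65537, blocksize=4)
#   b'\x00\x01\x00\x01'
#   >>> bytes_to_long(b'\x01\x00\x01')
#   65537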
3581
3582
5bc880b9
YCH
3583def ohdave_rsa_encrypt(data, exponent, modulus):
3584 '''
3585 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
3586
3587 Input:
3588 data: data to encrypt, bytes-like object
3589 exponent, modulus: parameter e and N of RSA algorithm, both integer
3590 Output: hex string of encrypted data
3591
3592 Limitation: supports one block encryption only
3593 '''
3594
3595 payload = int(binascii.hexlify(data[::-1]), 16)
3596 encrypted = pow(payload, exponent, modulus)
3597 return '%x' % encrypted
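# Sketch with the textbook toy key n=3233, e=17 (for illustration only,
# real moduli are far larger):
#
#   >>> ohdave_rsa_encrypt(b'A', 17, 3233)   # payload 0x41 == 65; 65 ** 17 % 3233 == 2790
#   'ae6'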
81bdc8fd
YCH
3598
3599
f48409c7
YCH
3600def pkcs1pad(data, length):
3601 """
3602 Padding input data with PKCS#1 scheme
3603
3604 @param {int[]} data input data
3605 @param {int} length target length
3606 @returns {int[]} padded data
3607 """
3608 if len(data) > length - 11:
3609 raise ValueError('Input data too long for PKCS#1 padding')
3610
 3611 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # PKCS#1 requires non-zero padding octets
3612 return [0, 2] + pseudo_random + [0] + data
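# Shape sketch: pkcs1pad([0x41, 0x42], 16) yields 16 ints laid out as
#   [0, 2, r1, ..., r11, 0, 0x41, 0x42]
# where r1..r11 are the pseudo-random padding bytes generated above.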
3613
3614
5eb6bdce 3615def encode_base_n(num, n, table=None):
59f898b7 3616 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
59f898b7
YCH
3617 if not table:
3618 table = FULL_TABLE[:n]
3619
5eb6bdce
YCH
3620 if n > len(table):
3621 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
3622
3623 if num == 0:
3624 return table[0]
3625
81bdc8fd
YCH
3626 ret = ''
3627 while num:
3628 ret = table[num % n] + ret
3629 num = num // n
3630 return ret
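# Examples (default table, so base 16 uses '0123456789abcdef'):
#
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> encode_base_n(0, 36)
#   '0'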
f52354a8
YCH
3631
3632
3633def decode_packed_codes(code):
06b3fe29 3634 mobj = re.search(PACKED_CODES_RE, code)
f52354a8
YCH
 3635 obfuscated_code, base, count, symbols = mobj.groups()
3636 base = int(base)
3637 count = int(count)
3638 symbols = symbols.split('|')
3639 symbol_table = {}
3640
3641 while count:
3642 count -= 1
5eb6bdce 3643 base_n_count = encode_base_n(count, base)
f52354a8
YCH
3644 symbol_table[base_n_count] = symbols[count] or base_n_count
3645
3646 return re.sub(
3647 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
 3648 obfuscated_code)
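# Substitution sketch: PACKED_CODES_RE is defined earlier in this module and
# is assumed to match the usual eval(function(p,a,c,k,e,d){...}) wrapper.
# For a payload '0 1' with base=2, count=2 and symbols ['hello', 'world'],
# the table built above becomes {'0': 'hello', '1': 'world'} and the payload
# expands to 'hello world'.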
e154c651 3649
3650
3651def parse_m3u8_attributes(attrib):
3652 info = {}
3653 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
3654 if val.startswith('"'):
3655 val = val[1:-1]
3656 info[key] = val
3657 return info
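# Example, using an attribute list typical of an EXT-X-STREAM-INF tag
# (Python 3.7+ dict ordering shown):
#
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,RESOLUTION=1280x720,CODECS="avc1.4d401f,mp4a.40.2"')
#   {'BANDWIDTH': '1280000', 'RESOLUTION': '1280x720', 'CODECS': 'avc1.4d401f,mp4a.40.2'}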
1143535d
YCH
3658
3659
3660def urshift(val, n):
3661 return val >> n if val >= 0 else (val + 0x100000000) >> n
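# Emulates JavaScript's unsigned right shift (>>>) on 32-bit values, e.g.:
#
#   >>> urshift(-1, 28)   # -1 >>> 28 in JavaScript
#   15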
d3f8e038
YCH
3662
3663
3664# Based on png2str() written by @gdkchan and improved by @yokrysty
3665# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
3666def decode_png(png_data):
3667 # Reference: https://www.w3.org/TR/PNG/
3668 header = png_data[8:]
3669
3670 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
3671 raise IOError('Not a valid PNG file.')
3672
3673 int_map = {1: '>B', 2: '>H', 4: '>I'}
3674 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
3675
3676 chunks = []
3677
3678 while header:
3679 length = unpack_integer(header[:4])
3680 header = header[4:]
3681
3682 chunk_type = header[:4]
3683 header = header[4:]
3684
3685 chunk_data = header[:length]
3686 header = header[length:]
3687
3688 header = header[4:] # Skip CRC
3689
3690 chunks.append({
3691 'type': chunk_type,
3692 'length': length,
3693 'data': chunk_data
3694 })
3695
3696 ihdr = chunks[0]['data']
3697
3698 width = unpack_integer(ihdr[:4])
3699 height = unpack_integer(ihdr[4:8])
3700
3701 idat = b''
3702
3703 for chunk in chunks:
3704 if chunk['type'] == b'IDAT':
3705 idat += chunk['data']
3706
3707 if not idat:
3708 raise IOError('Unable to read PNG data.')
3709
3710 decompressed_data = bytearray(zlib.decompress(idat))
3711
3712 stride = width * 3
3713 pixels = []
3714
3715 def _get_pixel(idx):
3716 x = idx % stride
3717 y = idx // stride
3718 return pixels[y][x]
3719
3720 for y in range(height):
3721 basePos = y * (1 + stride)
3722 filter_type = decompressed_data[basePos]
3723
3724 current_row = []
3725
3726 pixels.append(current_row)
3727
3728 for x in range(stride):
3729 color = decompressed_data[1 + basePos + x]
3730 basex = y * stride + x
3731 left = 0
3732 up = 0
3733
3734 if x > 2:
3735 left = _get_pixel(basex - 3)
3736 if y > 0:
3737 up = _get_pixel(basex - stride)
3738
3739 if filter_type == 1: # Sub
3740 color = (color + left) & 0xff
3741 elif filter_type == 2: # Up
3742 color = (color + up) & 0xff
3743 elif filter_type == 3: # Average
3744 color = (color + ((left + up) >> 1)) & 0xff
3745 elif filter_type == 4: # Paeth
3746 a = left
3747 b = up
3748 c = 0
3749
3750 if x > 2 and y > 0:
3751 c = _get_pixel(basex - stride - 3)
3752
3753 p = a + b - c
3754
3755 pa = abs(p - a)
3756 pb = abs(p - b)
3757 pc = abs(p - c)
3758
3759 if pa <= pb and pa <= pc:
3760 color = (color + a) & 0xff
3761 elif pb <= pc:
3762 color = (color + b) & 0xff
3763 else:
3764 color = (color + c) & 0xff
3765
3766 current_row.append(color)
3767
3768 return width, height, pixels
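# Usage sketch (png_data would be the raw bytes of a .png file; the decoder
# above assumes 8-bit, non-interlaced RGB scanlines):
#
#   width, height, rows = decode_png(png_data)
#   r, g, b = rows[0][0:3]    # each row holds width * 3 raw byte values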
efa97bdc
YCH
3769
3770
3771def write_xattr(path, key, value):
3772 # This mess below finds the best xattr tool for the job
3773 try:
3774 # try the pyxattr module...
3775 import xattr
3776
53a7e3d2
YCH
3777 if hasattr(xattr, 'set'): # pyxattr
3778 # Unicode arguments are not supported in python-pyxattr until
3779 # version 0.5.0
3780 # See https://github.com/rg3/youtube-dl/issues/5498
3781 pyxattr_required_version = '0.5.0'
3782 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
3783 # TODO: fallback to CLI tools
3784 raise XAttrUnavailableError(
3785 'python-pyxattr is detected but is too old. '
3786 'youtube-dl requires %s or above while your version is %s. '
3787 'Falling back to other xattr implementations' % (
3788 pyxattr_required_version, xattr.__version__))
3789
3790 setxattr = xattr.set
3791 else: # xattr
3792 setxattr = xattr.setxattr
efa97bdc
YCH
3793
3794 try:
53a7e3d2 3795 setxattr(path, key, value)
efa97bdc
YCH
3796 except EnvironmentError as e:
3797 raise XAttrMetadataError(e.errno, e.strerror)
3798
3799 except ImportError:
3800 if compat_os_name == 'nt':
3801 # Write xattrs to NTFS Alternate Data Streams:
3802 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
3803 assert ':' not in key
3804 assert os.path.exists(path)
3805
3806 ads_fn = path + ':' + key
3807 try:
3808 with open(ads_fn, 'wb') as f:
3809 f.write(value)
3810 except EnvironmentError as e:
3811 raise XAttrMetadataError(e.errno, e.strerror)
3812 else:
3813 user_has_setfattr = check_executable('setfattr', ['--version'])
3814 user_has_xattr = check_executable('xattr', ['-h'])
3815
3816 if user_has_setfattr or user_has_xattr:
3817
3818 value = value.decode('utf-8')
3819 if user_has_setfattr:
3820 executable = 'setfattr'
3821 opts = ['-n', key, '-v', value]
3822 elif user_has_xattr:
3823 executable = 'xattr'
3824 opts = ['-w', key, value]
3825
3826 cmd = ([encodeFilename(executable, True)] +
3827 [encodeArgument(o) for o in opts] +
3828 [encodeFilename(path, True)])
3829
3830 try:
3831 p = subprocess.Popen(
3832 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
3833 except EnvironmentError as e:
3834 raise XAttrMetadataError(e.errno, e.strerror)
3835 stdout, stderr = p.communicate()
3836 stderr = stderr.decode('utf-8', 'replace')
3837 if p.returncode != 0:
3838 raise XAttrMetadataError(p.returncode, stderr)
3839
3840 else:
 3841 # On Unix, but we can't find pyxattr, setfattr, or xattr.
3842 if sys.platform.startswith('linux'):
3843 raise XAttrUnavailableError(
3844 "Couldn't find a tool to set the xattrs. "
3845 "Install either the python 'pyxattr' or 'xattr' "
3846 "modules, or the GNU 'attr' package "
3847 "(which contains the 'setfattr' tool).")
3848 else:
3849 raise XAttrUnavailableError(
3850 "Couldn't find a tool to set the xattrs. "
3851 "Install either the python 'xattr' module, "
3852 "or the 'xattr' binary.")
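# Usage sketch (the path and URL are placeholders; the key follows the
# freedesktop.org convention used elsewhere in youtube-dl):
#
#   write_xattr('video.mp4', 'user.xdg.referrer.url',
#               'https://example.com/watch'.encode('utf-8'))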
0c265486
YCH
3853
3854
3855def random_birthday(year_field, month_field, day_field):
3856 return {
3857 year_field: str(random.randint(1950, 1995)),
3858 month_field: str(random.randint(1, 12)),
3859 day_field: str(random.randint(1, 31)),
3860 }
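# Example (the field names are caller-supplied; the values are freshly
# random on every call):
#
#   >>> random_birthday('birth_year', 'birth_month', 'birth_day')
#   {'birth_year': '1987', 'birth_month': '5', 'birth_day': '17'}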