]> jfr.im git - yt-dlp.git/blame - yt_dlp/utils.py
Add brotli content-encoding support (#2433)
[yt-dlp.git] / yt_dlp / utils.py
CommitLineData
cc52de43 1#!/usr/bin/env python3
dcdb292f 2# coding: utf-8
d77c3dfd 3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
da42679b 6import asyncio
15dfb392 7import atexit
1e399778 8import base64
5bc880b9 9import binascii
912b38b4 10import calendar
676eb3f2 11import codecs
c380cc28 12import collections
62e609ab 13import contextlib
e3946f98 14import ctypes
c496ca96
PH
15import datetime
16import email.utils
0c265486 17import email.header
f45c185f 18import errno
be4a824d 19import functools
d77c3dfd 20import gzip
49fa4d9a
N
21import hashlib
22import hmac
019a94f7 23import importlib.util
03f9daab 24import io
79a2e94e 25import itertools
f4bfd65f 26import json
d77c3dfd 27import locale
02dbf93f 28import math
347de493 29import operator
d77c3dfd 30import os
c496ca96 31import platform
773f291d 32import random
d77c3dfd 33import re
c496ca96 34import socket
79a2e94e 35import ssl
1c088fa8 36import subprocess
d77c3dfd 37import sys
181c8655 38import tempfile
c380cc28 39import time
01951dda 40import traceback
bcf89ce6 41import xml.etree.ElementTree
d77c3dfd 42import zlib
2814f12b 43import mimetypes
d77c3dfd 44
8c25f81b 45from .compat import (
b4a3d461 46 compat_HTMLParseError,
8bb56eee 47 compat_HTMLParser,
201c1459 48 compat_HTTPError,
8f9312c3 49 compat_basestring,
4390d5ec 50 compat_brotli,
8c25f81b 51 compat_chr,
1bab3437 52 compat_cookiejar,
d7cd9a9e 53 compat_ctypes_WINFUNCTYPE,
36e6f62c 54 compat_etree_fromstring,
51098426 55 compat_expanduser,
8c25f81b 56 compat_html_entities,
55b2f099 57 compat_html_entities_html5,
be4a824d 58 compat_http_client,
42db58ec 59 compat_integer_types,
e29663c6 60 compat_numeric_types,
c86b6142 61 compat_kwargs,
efa97bdc 62 compat_os_name,
8c25f81b 63 compat_parse_qs,
06e57990 64 compat_shlex_split,
702ccf2d 65 compat_shlex_quote,
8c25f81b 66 compat_str,
edaa23f8 67 compat_struct_pack,
d3f8e038 68 compat_struct_unpack,
8c25f81b
PH
69 compat_urllib_error,
70 compat_urllib_parse,
15707c7e 71 compat_urllib_parse_urlencode,
8c25f81b 72 compat_urllib_parse_urlparse,
732044af 73 compat_urllib_parse_urlunparse,
74 compat_urllib_parse_quote,
75 compat_urllib_parse_quote_plus,
7581bfc9 76 compat_urllib_parse_unquote_plus,
8c25f81b
PH
77 compat_urllib_request,
78 compat_urlparse,
da42679b 79 compat_websockets,
810c10ba 80 compat_xpath,
8c25f81b 81)
4644ac55 82
71aff188
YCH
83from .socks import (
84 ProxyType,
85 sockssocket,
86)
87
4644ac55 88
51fb4995
YCH
def register_socks_protocols():
    """Teach urlparse about the SOCKS URL schemes.

    In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904:
    URLs with protocols not in urlparse.uses_netloc are not handled correctly.
    """
    for proto in ('socks', 'socks4', 'socks4a', 'socks5'):
        if proto not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(proto)
97
468e2e92
FV
98# This is not clearly defined otherwise
99compiled_regex_type = type(re.compile(''))
100
f7a147e3
S
101
def random_user_agent():
    """Return a Chrome-on-Windows User-Agent string with a randomly chosen Chrome version."""
    template = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    chrome_versions = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return template % random.choice(chrome_versions)
145
146
# Content-Encodings we can decode; advertised in the Accept-Encoding header.
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
# Offer brotli only when the compat layer found a brotli decoder
# (presumably brotli/brotlicffi — see compat_brotli; TODO confirm in compat.py)
if compat_brotli:
    SUPPORTED_ENCODINGS.append('br')

# Default headers sent with every HTTP request unless overridden.
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': ', '.join(SUPPORTED_ENCODINGS),
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}
f427df17 160
5f6a1245 161
fb37eb25
S
162USER_AGENTS = {
163 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
164}
165
166
bf42a990
S
167NO_DEFAULT = object()
168
7105440c
YCH
169ENGLISH_MONTH_NAMES = [
170 'January', 'February', 'March', 'April', 'May', 'June',
171 'July', 'August', 'September', 'October', 'November', 'December']
172
f6717dec
S
173MONTH_NAMES = {
174 'en': ENGLISH_MONTH_NAMES,
175 'fr': [
3e4185c3
S
176 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
177 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
f6717dec 178}
a942d6cb 179
a7aaa398
S
180KNOWN_EXTENSIONS = (
181 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
182 'flv', 'f4v', 'f4a', 'f4b',
183 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
184 'mkv', 'mka', 'mk3d',
185 'avi', 'divx',
186 'mov',
187 'asf', 'wmv', 'wma',
188 '3gp', '3g2',
189 'mp3',
190 'flac',
191 'ape',
192 'wav',
193 'f4f', 'f4m', 'm3u8', 'smil')
194
c587cbb7 195# needed for sanitizing filenames in restricted mode
c8827027 196ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
fd35d8cd
JW
197 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
198 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 199
46f59e89
S
200DATE_FORMATS = (
201 '%d %B %Y',
202 '%d %b %Y',
203 '%B %d %Y',
cb655f34
S
204 '%B %dst %Y',
205 '%B %dnd %Y',
9d30c213 206 '%B %drd %Y',
cb655f34 207 '%B %dth %Y',
46f59e89 208 '%b %d %Y',
cb655f34
S
209 '%b %dst %Y',
210 '%b %dnd %Y',
9d30c213 211 '%b %drd %Y',
cb655f34 212 '%b %dth %Y',
46f59e89
S
213 '%b %dst %Y %I:%M',
214 '%b %dnd %Y %I:%M',
9d30c213 215 '%b %drd %Y %I:%M',
46f59e89
S
216 '%b %dth %Y %I:%M',
217 '%Y %m %d',
218 '%Y-%m-%d',
bccdbd22 219 '%Y.%m.%d.',
46f59e89 220 '%Y/%m/%d',
81c13222 221 '%Y/%m/%d %H:%M',
46f59e89 222 '%Y/%m/%d %H:%M:%S',
1931a55e
THD
223 '%Y%m%d%H%M',
224 '%Y%m%d%H%M%S',
4f3fa23e 225 '%Y%m%d',
0c1c6f4b 226 '%Y-%m-%d %H:%M',
46f59e89
S
227 '%Y-%m-%d %H:%M:%S',
228 '%Y-%m-%d %H:%M:%S.%f',
5014558a 229 '%Y-%m-%d %H:%M:%S:%f',
46f59e89
S
230 '%d.%m.%Y %H:%M',
231 '%d.%m.%Y %H.%M',
232 '%Y-%m-%dT%H:%M:%SZ',
233 '%Y-%m-%dT%H:%M:%S.%fZ',
234 '%Y-%m-%dT%H:%M:%S.%f0Z',
235 '%Y-%m-%dT%H:%M:%S',
236 '%Y-%m-%dT%H:%M:%S.%f',
237 '%Y-%m-%dT%H:%M',
c6eed6b8
S
238 '%b %d %Y at %H:%M',
239 '%b %d %Y at %H:%M:%S',
b555ae9b
S
240 '%B %d %Y at %H:%M',
241 '%B %d %Y at %H:%M:%S',
a63d9bd0 242 '%H:%M %d-%b-%Y',
46f59e89
S
243)
244
245DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
246DATE_FORMATS_DAY_FIRST.extend([
247 '%d-%m-%Y',
248 '%d.%m.%Y',
249 '%d.%m.%y',
250 '%d/%m/%Y',
251 '%d/%m/%y',
252 '%d/%m/%Y %H:%M:%S',
253])
254
255DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
256DATE_FORMATS_MONTH_FIRST.extend([
257 '%m-%d-%Y',
258 '%m.%d.%Y',
259 '%m/%d/%Y',
260 '%m/%d/%y',
261 '%m/%d/%Y %H:%M:%S',
262])
263
06b3fe29 264PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
22f5f5c6 265JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
06b3fe29 266
7105440c 267
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        enc = locale.getpreferredencoding()
        # Verify the reported encoding is actually usable before trusting it
        'TEST'.encode(enc)
    except Exception:
        enc = 'UTF-8'
    return enc
d77c3dfd 281
f4bfd65f 282
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    # Write to a temp file in the destination directory so the final
    # os.rename stays on the same filesystem (atomic on POSIX)
    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            # NOTE: this unlink+rename window means the write is not fully
            # atomic on Windows.
            try:
                os.unlink(fn)
            except OSError:
                pass
        # NamedTemporaryFile creates files with mode 0600; widen to the
        # process umask default so the result looks like a normal file
        try:
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temp file; re-raise the original error
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
341
342
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # Attribute name is interpolated into the expression; restrict it to
        # safe characters so the predicate cannot be malformed
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    # Fallback: this ElementTree apparently lacks attribute predicates in
    # find(), so emulate them by scanning all matches manually.
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
357
d7e66d39
JMF
358# On python2.6 the xml.etree.ElementTree.Element methods don't support
359# the namespace parameter
5f6a1245
JW
360
361
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand each 'prefix:tag' step of *path* into '{uri}tag' using *ns_map*."""
    expanded = []
    for component in path.split('/'):
        parts = component.split(':')
        if len(parts) == 1:
            # No namespace prefix on this step
            expanded.append(parts[0])
        else:
            ns, tag = parts
            expanded.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(expanded)
372
d77c3dfd 373
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching *xpath* (a single expression or an
    iterable of fallback expressions) under *node*.

    Returns *default* if given, raises ExtractorError if *fatal*, else None.
    """
    def _search(xp):
        return node.find(compat_xpath(xp))

    if isinstance(xpath, (str, compat_str)):
        found = _search(xpath)
    else:
        # Try each expression in turn, keeping the first hit
        for xp in xpath:
            found = _search(xp)
            if found is not None:
                break

    if found is not None:
        return found
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML element %s' % (xpath if name is None else name))
    return None
395
396
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's text content."""
    elem = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if elem is None or elem == default:
        # No match (or the default itself was returned) — pass it through
        return elem
    if elem.text is not None:
        return elem.text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML element\'s text %s' % (xpath if name is None else name))
    return None
a41fb80c
S
410
411
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Find an element via find_xpath_attr() and return its *key* attribute."""
    elem = find_xpath_attr(node, xpath, key)
    if elem is not None:
        return elem.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML attribute %s' % ('%s[@%s]' % (xpath, key) if name is None else name))
    return None
bf0ff932
PH
423
424
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper over the generic attribute matcher
    return get_element_by_attribute('id', id, html)
43e8fafd 428
12ea2f30 429
6f32a0b5
ZM
def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper over the generic attribute matcher
    return get_element_html_by_attribute('id', id, html)
433
434
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    # None when no tag carries the class
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None
439
440
6f32a0b5
ZM
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    # None when no tag carries the class
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None
445
446
2af12ad9
TC
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag with the specified attribute=value
    in the passed HTML document, or None if there is no match."""
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None
450
451
6f32a0b5
ZM
def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    """Return the html of the first tag with the specified attribute=value
    in the passed HTML document, or None if there is no match."""
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None
455
456
2af12ad9
TC
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # class attributes may list several space-separated names, so match the
    # requested one on word boundaries rather than requiring exact equality
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
462
463
6f32a0b5
ZM
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    # class attributes may list several space-separated names, so match the
    # requested one on word boundaries rather than requiring exact equality
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
469
470
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document, as a list"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
474
475
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document, as a list"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
479
480
def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    # Quotes around the attribute value are required (not optional) when the
    # value contains characters that cannot legally appear unquoted
    value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches only the opening tag up to attribute=value; the element's full
    # extent is then resolved by get_element_text_and_html_by_tag below
    partial_element_re = r'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
        ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        # Strip one level of surrounding quotes from the content, then decode
        # HTML entities before yielding
        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
a921f407 504
c5229f39 505
6f32a0b5
ZM
class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        # Control-flow exception: signals "outermost tag just closed"
        pass

    def __init__(self):
        # Stack of currently-open tag names, in document order
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop until the matching opening tag is found (tolerates unclosed
        # inner tags); error out if we empty the stack without a match
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()
546
547
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its' content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        # str.index that raises the supplied parse error instead of ValueError
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Prime the parser with just the opening tag, then feed up to each
        # candidate closing tag until the parser signals the outermost close
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                # This closing tag closes the element we started at
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
581
582
8bb56eee
BF
583class HTMLAttributeParser(compat_HTMLParser):
584 """Trivial HTML parser to gather the attributes for a single element"""
b6e0c7d2 585
8bb56eee 586 def __init__(self):
c5229f39 587 self.attrs = {}
8bb56eee
BF
588 compat_HTMLParser.__init__(self)
589
590 def handle_starttag(self, tag, attrs):
591 self.attrs = dict(attrs)
592
c5229f39 593
73673ccf
FF
594class HTMLListAttrsParser(compat_HTMLParser):
595 """HTML parser to gather the attributes for the elements of a list"""
596
597 def __init__(self):
598 compat_HTMLParser.__init__(self)
599 self.items = []
600 self._level = 0
601
602 def handle_starttag(self, tag, attrs):
603 if tag == 'li' and self._level == 0:
604 self.items.append(dict(attrs))
605 self._level += 1
606
607 def handle_endtag(self, tag):
608 self._level -= 1
609
610
8bb56eee
BF
611def extract_attributes(html_element):
612 """Given a string for an HTML element such as
613 <el
614 a="foo" B="bar" c="&98;az" d=boz
615 empty= noval entity="&amp;"
616 sq='"' dq="'"
617 >
618 Decode and return a dictionary of attributes.
619 {
620 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
621 'empty': '', 'noval': None, 'entity': '&',
622 'sq': '"', 'dq': '\''
623 }.
624 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
625 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
626 """
627 parser = HTMLAttributeParser()
b4a3d461
S
628 try:
629 parser.feed(html_element)
630 parser.close()
631 # Older Python may throw HTMLParseError in case of malformed HTML
632 except compat_HTMLParseError:
633 pass
8bb56eee 634 return parser.attrs
9e6dd238 635
c5229f39 636
73673ccf
FF
637def parse_list(webpage):
638 """Given a string for an series of HTML <li> elements,
639 return a dictionary of their attributes"""
640 parser = HTMLListAttrsParser()
641 parser.feed(webpage)
642 parser.close()
643 return parser.items
644
645
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Collapse whitespace, then turn <br> and paragraph breaks into newlines
    # and strip any remaining tags
    for pattern, replacement in (
        (r'\s+', ' '),
        (r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n'),
        (r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n'),
        ('<.*?>', ''),
    ):
        html = re.sub(pattern, replacement, html)

    # Replace html entities and trim surrounding whitespace
    return unescapeHTML(html).strip()
9e6dd238
FV
660
661
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            # '-' means stdout; on Windows, switch stdout to binary mode first
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = locked_file(filename, open_mode, block=False).open()
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission problems won't be fixed by renaming — give up immediately
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = locked_file(filename, open_mode, block=False).open()
            return (stream, alt_filename)
d77c3dfd
FV
692
693
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparseable input yields None, mirroring parsedate_tz
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 701
5f6a1245 702
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def _sanitize_char(char):
        # Transliterate accented characters in restricted mode
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if not restricted and char == '\n':
            return ' '
        # Drop '?', control characters and DEL outright
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    if s == '':
        return ''
    # Handle timestamps: keep 12:34:56 readable as 12_34_56
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(_sanitize_char(char) for char in s)
    if not is_id:
        # Collapse runs of underscores and trim leading/trailing ones
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
d77c3dfd 746
5f6a1245 747
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    # On non-Windows platforms this is a no-op unless force is set
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
        if sys.version_info < (2, 7) and not drive_or_unc:
            drive_or_unc, _ = os.path.splitunc(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace win32-forbidden characters, and trailing dots/spaces, in each
    # path component ('.' and '..' are kept as-is)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s[0] == os.path.sep:
        # Preserve absoluteness when forced on non-Windows
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
771
772
def sanitize_url(url):
    """Fix common URL mistakes: missing scheme and known scheme typos."""
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    typo_fixes = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for bad, good in typo_fixes:
        if re.match(bad, url):
            return re.sub(bad, good, url)
    return url
17bcc626
S
789
790
5435dcf9
HH
def extract_basic_auth(url):
    """Split userinfo out of *url*.

    Returns (url_without_credentials, basic_auth_header_value_or_None).
    """
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    # Rebuild the netloc without the user:password@ prefix
    netloc = parts.hostname if parts.port is None else '%s:%d' % (parts.hostname, parts.port)
    stripped_url = compat_urlparse.urlunsplit(parts._replace(netloc=netloc))
    credentials = '%s:%s' % (parts.username, parts.password or '')
    token = base64.b64encode(credentials.encode('utf-8')).decode('utf-8')
    return stripped_url, 'Basic ' + token
801
802
def sanitized_Request(url, *args, **kwargs):
    # Clean the URL (typos, escaping), then split any userinfo credentials
    # out of it and pass them along as an Authorization header instead
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        # NOTE: mutates a caller-supplied headers dict when passed positionally
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)
67dda517
S
809
810
51098426
S
def expand_path(s):
    """Expand shell variables and ~"""
    # $VAR/%VAR% expansion happens after ~user expansion
    return os.path.expandvars(compat_expanduser(s))
814
815
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, preserving order.

    Returns a list. Membership of hashable elements is tracked with a set
    (O(1) per element instead of the previous O(n) list scan); unhashable
    elements fall back to a linear scan of the result.
    """
    res = []
    seen = set()
    for el in iterable:
        try:
            if el in seen:
                continue
            seen.add(el)
        except TypeError:
            # Unhashable element — fall back to (slow) equality scan
            if el in res:
                continue
        res.append(el)
    return res
d77c3dfd 823
912b38b4 824
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric character reference: decimal (#123) or hexadecimal (#x7B)
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        # (out-of-range codepoints fall through to the literal form below)
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
4e408e47
PH
854
855
def unescapeHTML(s):
    """Replace HTML entities in *s* with the characters they represent."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _replace(mobj):
        return _htmlentity_transform(mobj.group(1))

    return re.sub(r'&([^&;]+;)', _replace, s)
d77c3dfd 863
8bf48f23 864
def escapeHTML(text):
    """Escape &, <, >, double and single quotes for safe HTML embedding."""
    # Single pass over the string; each original character is mapped at most
    # once, which is equivalent to replacing '&' first and the rest after
    return text.translate(str.maketrans({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;',
    }))
874
875
def process_communicate_or_kill(p, *args, **kwargs):
    """communicate() with subprocess p, killing it if the wait is interrupted.

    Prevents orphaned child processes on e.g. Ctrl-C; the original exception
    is re-raised after the child has been reaped.
    """
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise
883
884
class Popen(subprocess.Popen):
    # On Windows, suppress the console window that would otherwise flash up
    # for every spawned child process
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        # communicate(), but kill the child if interrupted — see
        # process_communicate_or_kill
        return process_communicate_or_kill(self, *args, **kwargs)
897
898
aa49acd1
S
def get_subprocess_encoding():
    """Return the encoding used for subprocess arguments and output."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
909
910
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the result will be passed to a subprocess
           (skips the Windows Unicode-API shortcut below)
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
933
934
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename(); a no-op on Python 3."""

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        # Already text — nothing to decode
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 944
f07b74fc
PH
945
def encodeArgument(s):
    """Encode a subprocess argument (like a filename, with for_subprocess=True)."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
953
954
aa49acd1
S
def decodeArgument(b):
    # Mirror of encodeArgument for values coming back from subprocesses
    return decodeFilename(b, True)
957
958
8271226a
PH
def decodeOption(optval):
    """Decode a command-line option value to text if it arrived as bytes."""
    if optval is None:
        return None
    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval
    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 967
5f6a1245 968
# Named result type for timetuple_from_msec
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a millisecond count into (hours, minutes, seconds, milliseconds)."""
    total_seconds, milliseconds = divmod(msec, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
977
978
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as [h<delim>m<delim>]s, optionally appending .mmm."""
    t = timetuple_from_msec(secs * 1000)
    if t.hours:
        base = '%d%s%02d%s%02d' % (t.hours, delim, t.minutes, delim, t.seconds)
    elif t.minutes:
        base = '%d%s%02d' % (t.minutes, delim, t.seconds)
    else:
        base = '%d' % t.seconds
    return '%s.%03d' % (base, t.milliseconds) if msec else base
4539dd30 988
a0ddb8a2 989
def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    # Loads server-authentication x509 certificates from the named Windows
    # certificate store into ssl_context. Gives up silently if the store
    # cannot be enumerated (PermissionError).
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        try:
            ssl_context.load_verify_locations(cadata=cert)
        except ssl.SSLError:
            # Skip individual certificates the ssl module cannot parse
            pass
1003
77562778 1004
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler whose SSLContext honours the given params.

    Respects the 'nocheckcertificate' and 'legacyserverconnect' options.
    """
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                # Create a new context to discard any certificates that were already loaded
                context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 1028
732ea2f0 1029
def bug_reports_message(before=';'):
    """Return the standard bug-report blurb, appended after ``before``."""
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp , '
           'filling out the "Broken site" issue template properly. '
           'Confirm you are on the latest version using -U')

    before = before.rstrip()
    # Capitalize when starting a new sentence (empty prefix or sentence-ending punctuation)
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    if before:
        return before + ' ' + msg
    return msg
08f2a92c
JMF
1040
1041
bf5b9d85
PM
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None  # subclasses may set a class-level default message

    def __init__(self, msg=None):
        # Priority: explicit argument > class default > class name
        if msg is None:
            msg = self.msg if self.msg is not None else type(self).__name__
        self.msg = msg
        super().__init__(msg)
bf5b9d85
PM
1052
1053
# Exception classes representing network-level failures; ExtractorError
# automatically marks errors raised from these as "expected" (not a bug).
network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)
1058
1059
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        # Network-level failures are never treated as bugs in yt-dlp
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)  # message without the [ie]/video_id/cause decoration
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super(ExtractorError, self).__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        # Combined traceback of this error and of its cause, or None if neither exists.
        # NOTE(review): the single-argument form of traceback.format_exception
        # requires Python 3.10+ — confirm against supported interpreter versions.
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(self.cause)[1:]),
            delim='\n') or None
01951dda 1090
1c256f70 1091
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Raised for URLs that no extractor supports; always an expected error."""

    def __init__(self, url):
        super().__init__(f'Unsupported URL: {url}', expected=True)
        self.url = url
1097
1098
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    # No extra state; inherits full behavior from ExtractorError
    pass
1102
1103
773f291d
S
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Geo-restriction is never a bug, so force expected=True
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries
1115
1116
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info
d77c3dfd
FV
1129
1130
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    # Used as the default message by YoutubeDLError.__init__
    msg = 'Entry not found in info'
498f5606 1138
1139
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            # Append the conflicting filename to the message (the previous
            # f-string had no placeholder and always printed '(unknown)')
            self.msg += f': {filename}'
        super().__init__(self.msg)
d77c3dfd
FV
1152
1153
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    # No extra state; message handling comes from YoutubeDLError
5f6a1245 1160
5f6a1245 1161
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Base class for ExistingVideoReached / RejectedVideoReached / MaxDownloadsReached
    msg = 'The download was cancelled'
8b0d7497 1165
8b0d7497 1166
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    # Aborts the download queue via the DownloadCancelled machinery
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1170
48f79687 1171
class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    # Aborts the download queue via the DownloadCancelled machinery
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
51d9739f 1175
1176
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Aborts the download queue via the DownloadCancelled machinery
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1180
1181
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        # Whether this is a normal condition rather than a bug
        self.expected = expected
1188
1189
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Always uses the class-level message and is marked unexpected
        super().__init__(self.msg, expected=False)
f2ebc5c7 1196
d77c3dfd 1197
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        # Append the underlying error, if any, to the default message
        if err is not None:
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
d77c3dfd
FV
1210
1211
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 1227
5f6a1245 1228
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing an extended attribute fails; classifies the cause in .reason."""

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Classify the failure from the errno and/or the message text
        if code in (errno.ENOSPC, errno.EDQUOT) or 'No space left' in msg or 'Disk quota exceeded' in msg:
            self.reason = 'NO_SPACE'
        elif code == errno.E2BIG or 'Argument list too long' in msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1243
1244
class XAttrUnavailableError(YoutubeDLError):
    # Distinct from XAttrMetadataError — per the name this signals that xattr
    # support itself is unavailable; verify against callers
    pass
1247
1248
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class, applying source_address and py2 workarounds.

    Used by the yt-dlp urllib handlers via functools.partial from do_open().
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # Pick the address family matching the source address ('.' implies IPv4)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    err = _
                    if sock is not None:
                        sock.close()
            # All candidate addresses failed (or there were none)
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
1312
1313
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, it is removed together with any
    'Accept-Encoding' header (case-insensitive) so the request is made
    without compression. Otherwise the mapping is returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    return {
        key: value for key, value in headers.items()
        if key != 'Youtubedl-no-compression' and key.lower() != 'accept-encoding'
    }
87f0e62d
YCH
1322
1323
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params  # YoutubeDL options dict (e.g. 'http_headers', 'source_address')

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        # Route through a SOCKS proxy when the internal marker header is present
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            # Try a raw deflate stream first (no zlib header)
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to a regular zlib-wrapped stream
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return compat_brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        # Strip the internal 'Youtubedl-no-compression' marker (and Accept-Encoding with it)
        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = compat_urllib_request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
bf50b038 1459
5de90176 1460
71aff188
YCH
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class from base_class that tunnels through socks_proxy.

    socks_proxy is a URL such as 'socks5://user:pass@host:port'.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    # NOTE(review): an unrecognized scheme leaves socks_type unbound and raises
    # NameError below — presumably callers pre-validate the scheme; confirm

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # Wrap with TLS when the base class is an HTTPS connection
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1502
1503
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS counterpart of YoutubeDLHandler; supports a custom connection
    class and SOCKS proxying via the 'Ytdl-socks-proxy' marker header."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # Route through a SOCKS proxy when the internal marker header is present
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)
be4a824d
PH
1527
1528
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    # One tab-separated line of a cookies.txt file
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                # cookies.txt uses literal TRUE/FALSE flag columns
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Validate one line; raises LoadError on malformed entries
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    # Skip bad entries instead of failing the whole load
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1645
1646
a6420bf5
S
1647class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1648 def __init__(self, cookiejar=None):
1649 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1650
1651 def http_response(self, request, response):
1652 # Python 2 will choke on next HTTP request in row if there are non-ASCII
1653 # characters in Set-Cookie HTTP header of last response (see
067aa17e 1654 # https://github.com/ytdl-org/youtube-dl/issues/6769).
a6420bf5
S
1655 # In order to at least prevent crashing we will percent encode Set-Cookie
1656 # header before HTTPCookieProcessor starts processing it.
e28034c5
S
1657 # if sys.version_info < (3, 0) and response.headers:
1658 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1659 # set_cookie = response.headers.get(set_cookie_header)
1660 # if set_cookie:
1661 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1662 # if set_cookie != set_cookie_escaped:
1663 # del response.headers[set_cookie_header]
1664 # response.headers[set_cookie_header] = set_cookie_escaped
a6420bf5
S
1665 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1666
f5fa042c 1667 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
a6420bf5
S
1668 https_response = http_response
1669
1670
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    # 308 is routed through the stock 302 handler, which calls redirect_request below
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # On python 2 urlh.geturl() may sometimes return redirect URL
        # as byte string instead of unicode. This workaround allows
        # to force it always return unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)
fca6dba8
S
1726
1727
46f59e89
S
def extract_timezone(date_str):
    """Return (utc_offset_timedelta, date_str_with_timezone_suffix_removed)."""
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if m is None:
        # No recognizable timezone suffix: UTC offset of zero, string untouched
        return datetime.timedelta(), date_str
    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # A bare 'Z' suffix means UTC
        return datetime.timedelta(), date_str
    direction = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(m.group('hours')),
        minutes=direction * int(m.group('minutes')))
    return offset, date_str
1752
1753
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    if date_str is None:
        return None

    # Fractional seconds are not representable in the strptime pattern below
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        dt = datetime.datetime.strptime(
            date_str, f'%Y-%m-%d{delimiter}%H:%M:%S') - timezone
    except ValueError:
        # Unparseable date: fall through to None like the other date helpers
        return None
    return calendar.timegm(dt.timetuple())
912b38b4
PH
1771
1772
46f59e89
S
def date_formats(day_first=True):
    # Select the strptime format list matching the expected day/month ordering
    # (DATE_FORMATS_* are module-level constants defined elsewhere in this file)
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1775
1776
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    # Offset itself is discarded; only the stripped date string is kept
    _, date_str = extract_timezone(date_str)

    # NOTE: the loop deliberately does not break on success, so when several
    # formats match, the *last* matching format wins
    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Fall back to RFC 2822 style parsing (e.g. 'Tue, 01 Jan 2019 ...')
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
    # Implicitly returns None when nothing matched
bf50b038 1803
5f6a1245 1804
46f59e89
S
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string, or None."""
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    # Detect 12-hour clock PM marker before it is stripped below
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    # Fall back to RFC 2822 parsing; parsedate_tz already applies the offset
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1836
1837
def determine_ext(url, default_ext='unknown_video'):
    """Guess the media file extension from *url*; *default_ext* when unknown."""
    if url is None or '.' not in url:
        return default_ext
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
73e79f2a 1849
5f6a1245 1850
824fa511
S
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    # Build a subtitle filename by swapping the media extension for
    # '<lang>.<format>', e.g. 'video.mp4' -> 'video.en.vtt'
    return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
d4051a8e 1853
5f6a1245 1854
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: string date format used to return datetime object from
    precision: round the time portion of a datetime object.
                auto|microsecond|second|minute|hour|day.
                auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
        date_str)
    if match is not None:
        # Recurse to resolve the base ('now', 'today', or a literal date)
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            # timedelta has no month/year; use the calendar-aware helper
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            # 'auto': round to the unit that was used in the expression
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1895
1896
def date_from_str(date_str, format='%Y%m%d', strict=False):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    If "strict", only (now|today)[+-][0-9](day|week|month|year)(s)? is allowed

    format: string date format used to return datetime object from
    """
    if strict:
        allowed = r'\d{8}|(now|today)[+-]\d+(day|week|month|year)(s)?'
        if re.fullmatch(allowed, date_str) is None:
            raise ValueError(f'Invalid date format {date_str}')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1909
1910
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months."""
    total_months = dt.month - 1 + months
    new_year = dt.year + total_months // 12
    new_month = total_months % 12 + 1
    # Clamp the day so e.g. Jan 31 + 1 month lands on the last day of Feb
    new_day = min(dt.day, calendar.monthrange(new_year, new_month)[1])
    return dt.replace(new_year, new_month, new_day)
1918
1919
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision
    """
    if precision == 'microsecond':
        return dt

    step = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    stamp = calendar.timegm(dt.timetuple())
    # Round half up: add half a step, then truncate to the step boundary
    rounded = ((stamp + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
5f6a1245
JW
1936
1937
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    # Anything that is not exactly 8 digits is passed through untouched
    return '-'.join(match.groups()) if match is not None else date_str
1946
5f6a1245 1947
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the widest representable interval
        if start is not None:
            self.start = date_from_str(start, strict=True)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end, strict=True)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Accept both datetime.date objects and date strings
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1977
1978
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    # On some interpreters platform.platform() may yield bytes; decode them
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res
c257baff
PH
1987
1988
49fa4d9a
N
def get_windows_version():
    ''' Get Windows version. None if it's not running on Windows '''
    if compat_os_name == 'nt':
        # win32_ver()[1] is the full dotted version string, e.g. '10.0.19041'
        return version_tuple(platform.win32_ver()[1])
    else:
        return None
1995
1996
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070
    # Writes Unicode directly to a Windows console via WriteConsoleW,
    # bypassing the (historically lossy) codepage-based stream encoding.

    import ctypes.wintypes

    # Map Python-level fd numbers to the Win32 std handle constants
    # (1 -> STD_OUTPUT_HANDLE, 2 -> STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A redirected handle (file/pipe) must not use WriteConsoleW
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        # (those need to be written as a UTF-16 surrogate pair of length 2)
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 BMP chars at a time; a count of 0 means the next
        # char is non-BMP and is written as its 2 surrogate code units
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
2069
2070
def write_string(s, out=None, encoding=None):
    """Write the text *s* to *out* (default: sys.stderr) and flush.

    Tries the Windows console fast path first, then falls back to writing
    encoded bytes (binary streams / Python 2) or text as appropriate.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        # Direct console write avoids codepage mojibake on Windows
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '')
            or sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Text streams with an underlying binary buffer: encode explicitly so
        # we control the error handling ('ignore') instead of the stream
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
2091
2092
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a bytes/str sequence into a list of integer byte values."""
    if not bs:
        return []
    first = bs[0]
    if isinstance(first, int):  # Python 3: indexing bytes already yields ints
        return list(bs)
    return [ord(ch) for ch in bs]  # Python 2 / str: items are 1-char strings
2100
c257baff 2101
def intlist_to_bytes(xs):
    # Pack a sequence of integer byte values (0-255) back into a bytes object;
    # inverse of bytes_to_intlist
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)
c38b1e77
PH
2106
2107
c1c9a79c
PH
# Cross-platform file locking
# Defines _lock_file(f, exclusive, block) and _unlock_file(f) with one of
# three implementations: Win32 LockFileEx, POSIX fcntl, or an unsupported stub.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: byte range 0 .. 0x7fffffff_ffffffff
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED pointer alive on the file object for unlock
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            try:
                fcntl.flock(f,
                            fcntl.LOCK_SH if not exclusive
                            else fcntl.LOCK_EX if block
                            else fcntl.LOCK_EX | fcntl.LOCK_NB)
            except BlockingIOError:
                # Non-blocking lock is busy: propagate to the caller
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f,
                            fcntl.LOCK_SH if not exclusive
                            else fcntl.LOCK_EX if block
                            else fcntl.LOCK_EX | fcntl.LOCK_NB)

        def _unlock_file(f):
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive, block):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
2195
2196
class locked_file(object):
    """File wrapper that holds an advisory lock for the life of the handle.

    Opens *filename*, acquires a shared lock for read modes and an exclusive
    lock for write/append modes on __enter__, and releases the lock and
    closes the file on __exit__/close().
    """
    _closed = False  # guards against double-unlock from close() + __exit__

    def __init__(self, filename, mode, block=True, encoding=None):
        assert mode in ['r', 'rb', 'a', 'ab', 'w', 'wb']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode
        self.block = block  # False -> raise instead of waiting for the lock

    def __enter__(self):
        # Readers share the lock; any writing mode needs exclusivity
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
        except IOError:
            # Don't leak the open file handle when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            if not self._closed:
                _unlock_file(self.f)
        finally:
            self.f.close()
            self._closed = True

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)

    def flush(self):
        self.f.flush()

    def open(self):
        # Alias for use outside a `with` statement (pair with close())
        return self.__enter__()

    def close(self, *args):
        # BUG FIX: previously this called
        #   self.__exit__(self, *args, value=False, traceback=False)
        # which passed the instance itself as `etype` and raised TypeError
        # whenever a positional argument was supplied. __exit__ ignores its
        # arguments anyway, so pass explicit Nones.
        self.__exit__(None, None, None)
2240
4eb7f1d1 2241
4644ac55
S
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), defaulting to 'utf-8' when unset."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
2245
2246
def shell_quote(args):
    """Quote each argument for safe use on a shell command line and join them."""
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
2256
2257
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Merge any data already smuggled into the URL; NOTE: mutates the
    # caller-supplied `data` dict in place via update()
    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    # The payload travels in the URL fragment, so servers never see it
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata
9d4660ca
PH
2266
2267
def unsmuggle_url(smug_url, default=None):
    """Extract data hidden by smuggle_url(); returns (clean_url, data)."""
    if '#__youtubedl_smuggle' not in smug_url:
        # Nothing smuggled: hand back the URL with the caller's default
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
02dbf93f
PH
2275
2276
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = ['', *'kMGTPEZY']
    # BUG FIX: clamp the exponent so values beyond the largest suffix (or
    # float log() imprecision) can no longer raise IndexError
    exponent = 0 if num == 0 else min(
        int(math.log(num, factor)), len(POSSIBLE_SUFFIXES) - 1)
    suffix = POSSIBLE_SUFFIXES[exponent]
    if factor == 1024:
        # Binary multiples use IEC names: k -> Ki, M -> Mi, ...
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
e0fd9573 2288
2289
def format_bytes(bytes):
    # Human-readable byte count in binary multiples, e.g. '1.21MiB'; 'N/A'
    # when the input cannot be interpreted as a non-negative number
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
f53c966a 2292
1c088fa8 2293
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' from *s* using *unit_table* multipliers.

    Returns the scaled integer value, or None when *s* does not match.
    """
    alternatives = '|'.join(re.escape(unit) for unit in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives, s)
    if m is None:
        return None
    # Accept ',' as a decimal separator too
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
2303
2304
be64b5b0
PH
def parse_filesize(s):
    """Parse a human-readable file size ('5.5MiB', '2 gb', ...) into bytes."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
2374
2375
def parse_count(s):
    """Parse a human-readable count like '1.2M views' into an integer, or None."""
    if s is None:
        return None

    # Drop a leading non-numeric label (e.g. 'Views: 1.2M' -> '1.2M')
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    ret = lookup_unit_table(_UNIT_TABLE, s)
    if ret is not None:
        return ret

    # Last resort: a bare number followed by trailing text ('123 views')
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    if mobj:
        return str_to_int(mobj.group(1))
be64b5b0 2403
2f7ae819 2404
b871d7e9
S
def parse_resolution(s):
    """Extract width/height from strings like '1920x1080', '720p' or '4k'."""
    if s is None:
        return {}

    # WxH style, also tolerating '×' and ',' as separators
    match = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if match:
        return {
            'width': int(match.group('w')),
            'height': int(match.group('h')),
        }

    # '<N>p' / '<N>i' style gives only the height
    match = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if match:
        return {'height': int(match.group(1))}

    # '4k' / '8k' shorthand: height is N * 540 (4k -> 2160, 8k -> 4320)
    match = re.search(r'\b([48])[kK]\b', s)
    if match:
        return {'height': int(match.group(1)) * 540}

    return {}
2425
2426
0dc41787
S
def parse_bitrate(s):
    """Extract an integer kbps value from strings like '1500 kbps'; None otherwise."""
    if not isinstance(s, compat_str):
        return
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    if mobj:
        return int(mobj.group(1))
2433
2434
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    # Unknown languages fall back to the English month-name list
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        # list.index is 0-based; months are 1-based
        return month_names.index(name) + 1
    except ValueError:
        return None
2444
2445
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """

    try:
        # Compare against the first three letters of each full month name
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None
18258362
JMF
2454
2455
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Leave already-escaped entities and numeric character references alone
    bare_amp = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_amp, '&amp;', xml_str)
e3946f98
PH
2462
2463
def setproctitle(title):
    """Best-effort: set the process name shown by ps/top via libc prctl."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        # Not a glibc system (e.g. Windows, macOS): silently do nothing
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 is PR_SET_NAME (see prctl(2))
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
2488
2489
def remove_start(s, start):
    """Strip *start* from the beginning of *s* if present; None-safe."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
29eb5174
PH
2492
2493
def remove_end(s, end):
    """Strip *end* from the end of *s* if present; None-safe.

    BUG FIX: the old `s[:-len(end)]` returned '' for an empty *end*
    (because s[:-0] == s[:0]); slicing with an explicit stop index keeps
    *s* unchanged in that case.
    """
    if s is None or not s.endswith(end):
        return s
    return s[:len(s) - len(end)]
2b9faf55
PH
2496
2497
31b2051e
S
def remove_quotes(s):
    """Strip one matching pair of surrounding quotes (single or double) from s."""
    if s is None or len(s) < 2:
        return s
    for q in ('"', "'"):
        if s[0] == q == s[-1]:
            return s[1:-1]
    return s
2505
2506
b6e0c7d2
U
def get_domain(url):
    """Return the 'host.tld' part of *url* (scheme and 'www.' stripped), or None."""
    m = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if m is None:
        return None
    return m.group('domain')
2510
2511
def url_basename(url):
    # Last non-empty path segment of the URL, e.g. 'http://a/b/c/' -> 'c'
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').split('/')[-1]
aa94a6d3
PH
2515
2516
02dc0a36
S
def base_url(url):
    """Return everything up to (and including) the last '/' of the URL path.

    NOTE(review): raises AttributeError when *url* lacks an http(s)://.../ 
    prefix — callers are expected to pass absolute http(s) URLs.
    """
    m = re.match(r'https?://[^?#&]+/', url)
    return m.group()
2519
2520
def urljoin(base, path):
    """Join *base* and *path* into an absolute URL; None for unusable input."""
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not isinstance(path, compat_str) or not path:
        return None
    # Already absolute (scheme:// or protocol-relative //): use as-is
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode('utf-8')
    # Only http(s) or protocol-relative bases are acceptable
    if not isinstance(base, compat_str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
2534
2535
aa94a6d3
PH
class HEADRequest(compat_urllib_request.Request):
    # Request subclass that issues HTTP HEAD instead of the default GET
    def get_method(self):
        return 'HEAD'
7217e148
PH
2539
2540
95cf60e8
S
class PUTRequest(compat_urllib_request.Request):
    # Request subclass that issues HTTP PUT instead of the default GET/POST
    def get_method(self):
        return 'PUT'
2544
2545
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce *v* to an int scaled by invscale/scale; *default* on failure.

    When *get_attr* is given, the value is first read from that attribute of v.
    """
    value = getattr(v, get_attr, None) if get_attr and v is not None else v
    try:
        return int(value) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
9732d77e 2553
9572013d 2554
40a90862
JMF
def str_or_none(v, default=None):
    # Coerce v to str, mapping None to `default` instead of 'None'
    return default if v is None else compat_str(v)
2557
9732d77e
PH
2558
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, compat_integer_types):
        return int_str
    elif isinstance(int_str, compat_str):
        # Drop thousands separators and stray '+' signs ('1,234' -> '1234')
        int_str = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(int_str)
608d11f5
PH
2566
2567
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to a float scaled by invscale/scale; *default* on failure."""
    if v is None:
        return default
    try:
        return invscale * float(v) / scale
    except (ValueError, TypeError):
        return default
43f775e4
PH
2575
2576
c7e327c4
S
def bool_or_none(v, default=None):
    """Return v only when it is an actual bool; otherwise *default*."""
    if isinstance(v, bool):
        return v
    return default
2579
2580
53cd37ba
S
def strip_or_none(v, default=None):
    # Strip whitespace when v is a string; non-strings map to `default`
    return v.strip() if isinstance(v, compat_str) else default
b72b4431
S
2583
2584
af03000a
S
def url_or_none(url):
    # Return the stripped URL when it looks like a supported scheme
    # (http(s), rtmp variants, rtsp, mms, ftp(s)) or is protocol-relative;
    # otherwise None
    if not url or not isinstance(url, compat_str):
        return None
    url = url.strip()
    return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2590
2591
3e9b66d7
LNO
def request_to_url(req):
    # Accept either a Request object or a plain URL string and return the URL
    if isinstance(req, compat_urllib_request.Request):
        return req.get_full_url()
    else:
        return req
2597
2598
def strftime_or_none(timestamp, date_format, default=None):
    """Format a unix timestamp (number) or 'YYYYMMDD' string with date_format;
    return *default* when the input cannot be interpreted."""
    datetime_object = None
    try:
        if isinstance(timestamp, compat_numeric_types):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        # AttributeError covers unsupported types (datetime_object stays None)
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2609
2610
def parse_duration(s):
    """Parse a duration string into seconds (float), or None.

    Accepts clock notation ('1:02:03.5'), ISO-8601-ish ('PT1H2M3S') and
    spelled-out forms ('1 hour 2 mins'); the three patterns are tried in
    that order.
    """
    if not isinstance(s, compat_basestring):
        return None
    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    # 1) [[dd:]hh:]mm:ss[.ms] clock notation (optional trailing 'Z')
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # 2) ISO-8601-like / spelled-out units; years/months/weeks are
        #    matched but intentionally discarded (no capture groups)
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # 3) Fractional single-unit forms like '1.5 hours' / '90 min.'
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        # The clock form may capture ':' as the fraction separator
        duration += float(ms.replace(':', '.'))
    return duration
91d7d0b3
JMF
2673
2674
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension: 'a.mp4' + 'temp' -> 'a.temp.mp4'.

    When *expected_real_ext* is given and the actual extension differs,
    *ext* is appended instead ('a.unk' + 'temp' -> 'a.unk.temp').
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
d70ad093
PH
2681
2682
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap filename's extension for *ext*: 'a.mp4' + 'webm' -> 'a.webm'.

    When *expected_real_ext* is given and the actual extension differs,
    *ext* is appended to the full filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    keep_full = expected_real_ext and real_ext[1:] != expected_real_ext
    return '{0}.{1}'.format(filename if keep_full else name, ext)
2688
2689
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        # Output is discarded; only a successful spawn matters here
        Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
    except OSError:
        return False
    return exe
b7ab0590
PH
2698
2699
def _get_exe_version_output(exe, args):
    """Run `exe` with `args` and return its combined stdout/stderr text,
    or False when the executable cannot be spawned."""
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        out, _ = Popen(
            [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return out
cae97f65
PH
2713
2714
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's output.

    Falls back to *unrecognized* when the pattern does not match.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    match = re.search(version_re, output)
    return match.group(1) if match else unrecognized
2724
2725
def get_exe_version(exe, args=None,
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # None-sentinel instead of a mutable list default (args=['--version'])
    if args is None:
        args = ['--version']
    out = _get_exe_version_output(exe, args)
    return detect_exe_version(out, version_re, unrecognized) if out else False
2732
2733
class LazyList(collections.abc.Sequence):
    ''' Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList'''

    # Raised instead of the builtin IndexError so callers can distinguish
    # LazyList index failures (still a subclass of builtin IndexError)
    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache may be shared with another LazyList (see __reversed__/__copy__)
        self.__iterable = iter(iterable)
        self.__cache = [] if _cache is None else _cache
        self.__reversed = reverse

    def __iter__(self):
        if self.__reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # Replay already-cached items first, then keep consuming lazily
        yield from self.__cache
        for item in self.__iterable:
            self.__cache.append(item)
            yield item

    def __exhaust(self):
        # Pull everything that remains into the cache
        self.__cache.extend(self.__iterable)
        # Discard the emptied iterable to make it pickle-able
        self.__iterable = []
        return self.__cache

    def exhaust(self):
        ''' Evaluate the entire iterable '''
        return self.__exhaust()[::-1 if self.__reversed else 1]

    @staticmethod
    def __reverse_index(x):
        # Map a forward index to the equivalent index from the end (None-safe)
        return None if x is None else -(x + 1)

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self.__reversed:
                idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self.__reversed:
                idx = self.__reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        # Negative or open-ended indices require knowing the full length
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self.__exhaust()
            try:
                return self.__cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Otherwise only consume as many items as the request needs
        n = max(start or 0, stop or 0) - len(self.__cache) + 1
        if n > 0:
            self.__cache.extend(itertools.islice(self.__iterable, n))
        try:
            return self.__cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            # Probing one element is enough to know whether we are non-empty
            self[-1] if self.__reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self.__exhaust()
        return len(self.__cache)

    def __reversed__(self):
        # Shares the iterable and cache with self; only the direction flips
        return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)

    def __copy__(self):
        return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2822
483336e7 2823
class PagedList:
    """Base class for paged result lists; subclasses implement _getslice()."""

    # Separate subclass so callers can catch PagedList-specific misses
    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        # pagefunc: callable(pagenum) returning the entries of that page
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        # Upper bound on the number of pages; may be lowered by subclasses
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        # Serve from cache when possible; pages past _pagecount are empty
        page_results = self._cache.get(pagenum)
        if page_results is None:
            page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # Single-item access re-runs getslice, so caching must be enabled
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]
55575225 2862
9c44d242
PH
2863
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages sequentially, on demand."""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offset of the requested range within the current page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember that pages from here on are unreachable
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
81c2f20b
PH
2901
2902
9c44d242
PH
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known in advance."""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        # Never request pages beyond the known page count
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        # Items to drop from the first page; items still wanted overall
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break
            yield from page_results
9c44d242
PH
2925
2926
def uppercase_escape(s):
    """Decode ``\\UXXXXXXXX`` escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')

    def replace(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', replace, s)
0fe2ff78
YCH
2933
2934
def lowercase_escape(s):
    """Decode ``\\uXXXX`` escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')

    def replace(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', replace, s)
b53466e1 2941
d05cfe06
S
2942
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2 percent-quoting needs a byte string
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    safe = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe)
d05cfe06
S
2948
2949
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Hostname goes through IDNA; all other components get percent-escaped
    return parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment),
    ).geturl()
2960
62e609ab 2961
def parse_qs(url):
    # Parse only the query part of *url* into a dict of value lists
    query = compat_urllib_parse_urlparse(url).query
    return compat_parse_qs(query)
2964
2965
62e609ab
PH
def read_batch_urls(batch_fd):
    """Read URLs from an open batch file, dropping BOMs, blanks and comments."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # Strip a UTF-8 BOM, whether still raw or already decoded
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', url, 1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2983
2984
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return ASCII bytes suitable as POST data."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
2987
2988
def update_url_query(url, query):
    """Return *url* with the items of *query* merged into its query string."""
    if not query:
        return url
    parsed = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parsed.query)
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parsed._replace(query=new_query))
16392824 2997
8e60dc75 2998
ed0291d1
S
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Clone *req*, optionally overriding its URL, data, headers and query."""
    new_headers = req.headers.copy()
    new_headers.update(headers)
    method = req.get_method()
    # Plain Request cannot express HEAD/PUT, so pick the matching subclass
    if method == 'HEAD':
        req_type = HEADRequest
    elif method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        update_url_query(url or req.get_full_url(), query),
        data=data or req.data, headers=new_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # timeout is not part of the Request constructor - carry it over manually
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
3017
3018
10c87c15 3019def _multipart_encode_impl(data, boundary):
0c265486
YCH
3020 content_type = 'multipart/form-data; boundary=%s' % boundary
3021
3022 out = b''
3023 for k, v in data.items():
3024 out += b'--' + boundary.encode('ascii') + b'\r\n'
3025 if isinstance(k, compat_str):
3026 k = k.encode('utf-8')
3027 if isinstance(v, compat_str):
3028 v = v.encode('utf-8')
3029 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3030 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 3031 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
3032 if boundary.encode('ascii') in content:
3033 raise ValueError('Boundary overlaps with data')
3034 out += content
3035
3036 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3037
3038 return out, content_type
3039
3040
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    user_supplied_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            if user_supplied_boundary:
                raise
            # The random boundary collided with the payload - try another one
            boundary = None
3069
3070
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Return the first usable value among *key_or_keys* in dict *d*.

    None values are always skipped; falsy values are skipped too unless
    *skip_false_values* is disabled.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
3079
3080
def try_get(src, getter, expected_type=None):
    """Apply each getter to *src* and return the first result that does not
    raise and matches *expected_type* (if given)."""
    for get in variadic(getter):
        try:
            result = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
329ca3be
S
3090
3091
6cc62232
S
def merge_dicts(*dicts):
    """Merge dicts left to right: earlier values win, except that a
    non-empty string may replace a previously stored empty string.
    None values are ignored entirely."""
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if v is None:
                continue
            if k not in merged:
                merged[k] = v
                continue
            old = merged[k]
            # Let a non-empty string override an earlier empty one
            if isinstance(v, compat_str) and v and isinstance(old, compat_str) and not old:
                merged[k] = v
    return merged
3104
3105
8e60dc75
S
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    # Coerce *string* to compat_str; NOTE: the default *encoding* is
    # evaluated once, at module import time
    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3108
16392824 3109
a1a530b0
PH
# US movie ratings mapped to age limits (used by parse_age_limit)
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
3117
3118
# US TV Parental Guidelines mapped to age limits (used by parse_age_limit)
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3127
3128
def parse_age_limit(s):
    """Parse an age limit given as an int, 'NN+', a US movie rating or a
    TV parental guideline; return an int age or None."""
    # exact type check: bool (a subclass of int) does not take this path
    if type(s) == int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    mobj = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if mobj:
        return TV_PARENTAL_GUIDELINES['TV-' + mobj.group(1)]
    return None
146c80e2
S
3144
3145
def strip_jsonp(code):
    """Remove a JSONP callback wrapper and return the bare payload."""
    jsonp_wrapper = re.compile(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''')
    return jsonp_wrapper.sub(r'\g<callback_data>', code)
478c2c61
PH
3154
3155
def js_to_json(code, vars={}):
    """Convert a JavaScript object/value literal into valid JSON text."""
    # vars is a dict of var, val pairs to substitute
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    # Hex and octal integers (optionally used as object keys)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        # Rewrite a single token matched by the big regex below
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        # Comments, '!' coercions and trailing commas are dropped
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ""

        if v[0] in ("'", '"'):
            # Normalize string escapes to JSON-compatible ones
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            # Convert hex/octal literals to decimal (quoted when used as keys)
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    return '"%d":' % i if v.endswith(':') else '%d' % i

        # Substitute known variable names with their given values
        if v in vars:
            return vars[v]

        return '"%s"' % v

    # new Date("...") -> just the quoted date string
    code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
e05f6939
PH
3204
3205
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the rank; unknown ids rank below all
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
3214
acd69589 3215
# Recognized postprocessor stage names — NOTE(review): semantics are
# defined where these are consumed; confirm against callers
POSTPROCESS_WHEN = {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}


# Built-in output templates, keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types; NOTE(review): the string values look like
# default filename suffixes, None meaning no fixed suffix — confirm
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
0a871f68 3236
# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Template regex for %-style format specifiers; {0} is filled with the
# allowed key pattern and {1} with the allowed conversion types via .format()
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# All conversion-type characters accepted by %-style formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3255
7d1eb38a 3256
a020a0dc
PH
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ellipses = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ellipses)] + ellipses
48844745
PH
3265
3266
def version_tuple(v):
    # Split on both '.' and '-' (e.g. '2021.11.10-1' -> (2021, 11, 10, 1))
    return tuple(map(int, re.split(r'[-.]', v)))
48844745
PH
3269
3270
def is_outdated_version(version, limit, assume_new=True):
    """Return whether *version* is older than *limit*; unparseable or
    missing versions fall back to ``not assume_new``."""
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
732ea2f0
PH
3278
3279
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # local import — presumably avoids a circular import with .update;
    # confirm before moving to module level
    from .update import is_non_updateable

    return not is_non_updateable()
7d4111ed
PH
3286
3287
def args_to_str(args):
    # Get a short, shell-quoted string representation of a command
    quoted = (compat_shlex_quote(a) for a in args)
    return ' '.join(quoted)
2ccd1b10
PH
3291
3292
def error_to_compat_str(err):
    """Stringify an exception, decoding byte messages on Python 2."""
    message = str(err)
    if sys.version_info[0] < 3:
        # On python 2 error byte string must be decoded with proper
        # encoding rather than ascii
        message = message.decode(preferredencoding())
    return message
3300
3301
def mimetype2ext(mt):
    """Guess a file extension from a MIME type string (None-safe)."""
    if mt is None:
        return None

    # Drop any ';'-separated parameters (e.g. '; charset=utf-8')
    mime = mt.partition(';')[0].strip()

    # Exact full-type matches take priority
    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }
    if mime in FULL_MAP:
        return FULL_MAP[mime]

    # Then try the subtype alone (case-insensitive)
    subtype = mime.rpartition('/')[2]
    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }
    lowered = subtype.lower()
    if lowered in SUBTYPE_MAP:
        return SUBTYPE_MAP[lowered]

    # Then the '+'-suffix (e.g. 'foo+json' -> 'json')
    suffix = subtype.partition('+')[2]
    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }
    if suffix in SUFFIX_MAP:
        return SUFFIX_MAP[suffix]

    # Fall back to the subtype itself, with '+' turned into '.'
    return subtype.replace('+', '.')
c460bdd5
PH
3364
3365
2814f12b
THD
def ext2mimetype(ext_or_url):
    """Guess a MIME type from a bare extension or a URL/path (None if unknown)."""
    if not ext_or_url:
        return None
    # guess_type needs a filename, so turn a bare extension into one
    name = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    return mimetypes.guess_type(name)[0]
3372
3373
def parse_codecs(codecs_str):
    """Split an RFC 6381 'codecs' string into vcodec/acodec/tcodec/HDR info."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, tcodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        parts = full_codec.split('.')
        # Strip zeroes so e.g. 'av01' matches 'av1' — NOTE(review): this also
        # mangles other identifiers containing '0'; kept as-is
        codec = parts[0].replace('0', '')
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                     'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if not vcodec:
                # For vp9/av1/hvc1, keep only the first 4 dotted fields
                vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
            # Derive the dynamic range from Dolby Vision / 10-bit profiles
            if codec in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
                hdr = 'HDR10'
            elif full_codec.replace('0', '').startswith('vp9.2'):
                hdr = 'HDR10'
        elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        elif codec in ('stpp', 'wvtt',):
            # Text/subtitle codecs
            if not tcodec:
                tcodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if vcodec or acodec or tcodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'tcodec': tcodec} if tcodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        # Two unrecognized codecs: assume video + audio, in that order
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3415
3416
def urlhandle_detect_ext(url_handle):
    """Guess the file extension of a response from its HTTP headers."""
    headers = url_handle.headers

    # A filename in Content-Disposition takes priority over the MIME type
    cd = headers.get('Content-Disposition')
    if cd:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
05900629
PH
3429
3430
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 'data:' URI from raw bytes and a MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3433
3434
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No --age-limit set, or content is available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
3443
3444
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = (
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    )
    # Decode according to a leading BOM if present, otherwise assume UTF-8
    decoded = None
    for bom, encoding in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(encoding, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', decoded)
a055469f
PH
3463
3464
def determine_protocol(info_dict):
    """Work out the download protocol of a format/info dict."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    # Streaming schemes are recognizable from the URL prefix
    for scheme in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(scheme):
            return scheme

    # Manifest-based protocols are recognizable from the extension
    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
3485
3486
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned """
    def width(string):
        # Visible width: ignore terminal escape sequences and the \t marker
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        # Widest cell of each column
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Keep only columns whose filter entry is truthy (missing -> keep)
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # When hide_empty is set, drop columns whose data cells are all empty
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        # Insert a delimiter row below the header
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    # Pad every cell in place; a \t right-aligns the text within the column
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
347de493
PH
3517
3518
8f18aca8 3519def _match_one(filter_part, dct, incomplete):
77b87f05 3520 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3521 STRING_OPERATORS = {
3522 '*=': operator.contains,
3523 '^=': lambda attr, value: attr.startswith(value),
3524 '$=': lambda attr, value: attr.endswith(value),
3525 '~=': lambda attr, value: re.search(value, attr),
3526 }
347de493 3527 COMPARISON_OPERATORS = {
a047eeb6 3528 **STRING_OPERATORS,
3529 '<=': operator.le, # "<=" must be defined above "<"
347de493 3530 '<': operator.lt,
347de493 3531 '>=': operator.ge,
a047eeb6 3532 '>': operator.gt,
347de493 3533 '=': operator.eq,
347de493 3534 }
a047eeb6 3535
347de493
PH
3536 operator_rex = re.compile(r'''(?x)\s*
3537 (?P<key>[a-z_]+)
77b87f05 3538 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3539 (?:
a047eeb6 3540 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3541 (?P<strval>.+?)
347de493
PH
3542 )
3543 \s*$
3544 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3545 m = operator_rex.search(filter_part)
3546 if m:
18f96d12 3547 m = m.groupdict()
3548 unnegated_op = COMPARISON_OPERATORS[m['op']]
3549 if m['negation']:
77b87f05
MT
3550 op = lambda attr, value: not unnegated_op(attr, value)
3551 else:
3552 op = unnegated_op
18f96d12 3553 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3554 if m['quote']:
3555 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3556 actual_value = dct.get(m['key'])
3557 numeric_comparison = None
3558 if isinstance(actual_value, compat_numeric_types):
e5a088dc
S
3559 # If the original field is a string and matching comparisonvalue is
3560 # a number we should respect the origin of the original field
3561 # and process comparison value as a string (see
18f96d12 3562 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3563 try:
18f96d12 3564 numeric_comparison = int(comparison_value)
347de493 3565 except ValueError:
18f96d12 3566 numeric_comparison = parse_filesize(comparison_value)
3567 if numeric_comparison is None:
3568 numeric_comparison = parse_filesize(f'{comparison_value}B')
3569 if numeric_comparison is None:
3570 numeric_comparison = parse_duration(comparison_value)
3571 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3572 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3573 if actual_value is None:
18f96d12 3574 return incomplete or m['none_inclusive']
3575 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3576
3577 UNARY_OPERATORS = {
1cc47c66
S
3578 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3579 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493
PH
3580 }
3581 operator_rex = re.compile(r'''(?x)\s*
3582 (?P<op>%s)\s*(?P<key>[a-z_]+)
3583 \s*$
3584 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3585 m = operator_rex.search(filter_part)
3586 if m:
3587 op = UNARY_OPERATORS[m.group('op')]
3588 actual_value = dct.get(m.group('key'))
8f18aca8 3589 if incomplete and actual_value is None:
3590 return True
347de493
PH
3591 return op(actual_value)
3592
3593 raise ValueError('Invalid filter part %r' % filter_part)
3594
3595
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
    When incomplete, all conditions passes on missing fields
    """
    # Split on unescaped '&'; every sub-condition must hold
    parts = re.split(r'(?<!\\)&', filter_str)
    return all(
        _match_one(part.replace(r'\&', '&'), dct, incomplete)
        for part in parts)
347de493
PH
3603
3604
def match_filter_func(filter_str):
    """Build a match-filter callable: returns None on pass, else a skip message."""
    def _match_func(info_dict, *args, **kwargs):
        if match_str(filter_str, info_dict, *args, **kwargs):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
91410c9b
PH
3613
3614
bf6427d2
YCH
3615def parse_dfxp_time_expr(time_expr):
3616 if not time_expr:
d631d5f9 3617 return
bf6427d2
YCH
3618
3619 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3620 if mobj:
3621 return float(mobj.group('time_offset'))
3622
db2fe38b 3623 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3624 if mobj:
db2fe38b 3625 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
3626
3627
def srt_subtitles_timecode(seconds):
    # SRT timecodes use a comma as decimal separator: HH:MM:SS,mmm
    return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3630
3631
def ass_subtitles_timecode(seconds):
    # ASS timecodes use centiseconds: H:MM:SS.cc
    time = timetuple_from_msec(seconds * 1000)
    return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
bf6427d2
YCH
3635
3636
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitle data to SRT format.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    # Legacy TTAF namespaces are rewritten (bytewise, before parsing) to
    # their modern TTML equivalents so a single set of XPath lookups works.
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # TTML style properties that are translated into <font>/<b>/<i>/<u> markup
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    # Expand 'prefix:tag' into Clark notation for ElementTree lookups
    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> dict of supported style properties
    default_style = {}  # styling inherited from <body>/<div>, applied to every <p>

    class TTMLPElementParser(object):
        """Streaming element handler that renders one <p> subtree to
        SRT-flavoured HTML-ish markup (<font>, <b>, <i>, <u>, newlines).

        NOTE(review): these are class-level mutable attributes.  '_out' is
        rebound per instance by '+=', but the two lists are mutated in place
        and therefore shared across instances; parsing well-formed XML
        balances every append with a pop, yet state could leak between
        parses on malformed input — confirm if that ever matters here.
        """
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            # <br> becomes a newline; any other element opens styling tags
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Effective style: document default, then referenced style id,
                # then inline tts:* attributes (highest precedence)
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect on the parent
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            # Close tags opened by the matching start() in reverse order
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Re-serialize the node and feed it through the streaming handler above
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    # Normalize legacy namespaces before parsing (see LEGACY_NAMESPACES)
    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve <style> elements; repeat while a style references a parent
    # (via its 'style' attribute) that has not been resolved yet
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                # Inherit the parent's properties before applying our own
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # Styles referenced by <body>/<div> become the document-wide default
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # Emit one SRT cue per paragraph; 'dur' substitutes for a missing 'end'
    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3799
3800
66e289ba
S
def cli_option(params, command_option, param):
    """Render a single-valued option as CLI arguments.

    Looks up *param* in *params*; returns [command_option, str(value)]
    when set, or [] when absent or None.
    """
    value = params.get(param)
    if value:
        value = compat_str(value)
    if value is None:
        return []
    return [command_option, value]
3806
3807
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean option as CLI arguments.

    Returns [] when *param* is unset; otherwise either
    [command_option, rendered] or, when *separator* is given, the single
    joined token [command_option + separator + rendered].  The stored
    value must be a bool.
    """
    value = params.get(param)
    if value is None:
        return []
    assert isinstance(value, bool)
    rendered = true_value if value else false_value
    if separator:
        return ['%s%s%s' % (command_option, separator, rendered)]
    return [command_option, rendered]
3816
3817
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] when params[param] equals *expected_value*, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3821
3822
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Look up CLI argument lists in *argdict* by the first matching key group.

    *argdict* may also be a plain list/tuple (legacy format), returned
    as-is when *use_compat* is set.  *keys* is a sequence of keys or key
    groups; the first group with any hit wins and all its argument lists
    are concatenated.  Falls back to *default* when nothing matches.
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)
    assert isinstance(keys, (list, tuple))

    for key_group in keys:
        # Lookups are case-insensitive on the key side
        hits = [argdict.get(key.lower()) for key in variadic(key_group)]
        hits = [args for args in hits if args is not None]
        if hits:
            # Flatten all matched argument lists into one
            return [arg for args in hits for arg in args]
    return default
66e289ba 3841
6251555f 3842
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve configuration args for executable *exe* under *main_key*.

    Builds the ordered candidate key list ('<root><suffix>' for each
    suffix, plus the (main_key, exe) pair and 'default' when the bare
    root key itself is a candidate) and delegates the lookup to
    cli_configuration_args().
    """
    main_key = main_key.lower()
    exe = exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in keys:
        # No bare root key among the candidates: disable legacy fallback
        use_compat = False
    else:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    return cli_configuration_args(argdict, keys, default, use_compat)
3854
66e289ba 3855
39672624
YCH
class ISO639Utils(object):
    """Bidirectional lookup between ISO 639-1 and ISO 639-2/T language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # ISO 639-1 (two-letter) -> ISO 639-2/T (three-letter); a few retired
    # two-letter codes are kept as aliases (see inline comments).
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are considered, so tagged values
        # like 'en-US' resolve via their primary subtag
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup by linear scan; returns None implicitly when the
        # three-letter code is unknown
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
4059
4060
4eb10f66
YCH
class ISO3166Utils(object):
    """Lookup of full country names from ISO 3166-1 alpha-2 codes."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive lookup; returns None for unknown codes
        return cls._country_map.get(code.upper())
4319
4320
773f291d
S
class GeoUtils(object):
    """Helpers for faking a source IP from a given country (geo bypass)."""
    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        # Accepts either a two-letter country code (looked up in the table
        # above; returns None when unknown) or an explicit CIDR block such
        # as '1.2.3.0/24'.
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        addr, preflen = block.split('/')
        addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
        # Randomize the host bits; note this may also yield the block's
        # network or broadcast address
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return compat_str(socket.inet_ntoa(
            compat_struct_pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4579
4580
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler variant that honours a per-request 'Ytdl-request-proxy'
    header in addition to the handler-level proxy mapping."""

    def __init__(self, proxies=None):
        # Set default handlers
        # Route every http/https request through proxy_open() with the
        # '__noproxy__' sentinel.  The lambda's default arguments bind the
        # current loop values of `type` and the bound method on purpose
        # (guarding against the late-binding closure pitfall).
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy (internal marker header) overrides the
        # handler-level proxy; the header is stripped before sending.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        # SOCKS proxies are not handled here; flag them via a marker header
        # for the http/https handlers instead
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5bc880b9
YCH
4604
4605
0a5445dd
YCH
4606# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4607# released into Public Domain
4608# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4609
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    # Collect 32-bit big-endian words from least to most significant,
    # then join them most-significant first.
    words = []
    while n > 0:
        words.append(compat_struct_pack('>I', n & 0xffffffff))
        n >>= 32
    s = b''.join(reversed(words))
    # Strip leading zero bytes; n == 0 still encodes as a single zero byte
    s = s.lstrip(b'\000') or b'\000'
    # Front-pad with zeros up to the next multiple of blocksize
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4638
4639
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # Left-pad to a whole number of 32-bit words, then fold the words in
    # big-endian order.
    if len(s) % 4:
        s = b'\000' * (4 - len(s) % 4) + s
    acc = 0
    for offset in range(0, len(s), 4):
        acc = (acc << 32) | compat_struct_unpack('>I', s[offset:offset + 4])[0]
    return acc
4655
4656
5bc880b9
YCH
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The data is interpreted as a little-endian integer (hence the
    # reversal before the big-endian hex parse), then encrypted by
    # modular exponentiation and rendered as lowercase hex.
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
81bdc8fd
YCH
4672
4673
f48409c7
YCH
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    @raises ValueError when the data cannot fit in the block
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # RFC 2313 §8.1 (block type 02): the padding string PS must consist of
    # *nonzero* octets, because the single zero octet below delimits the
    # padding during decryption.  randint(0, 254) could emit zeros and
    # corrupt the block, so draw from 1..254 instead.
    pseudo_random = [random.randint(1, 254) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4687
4688
def encode_base_n(num, n, table=None):
    """Encode non-negative integer *num* in base *n* using *table* as the
    digit alphabet.

    Defaults to 0-9a-zA-Z (bases up to 62); raises ValueError when the
    table has fewer than *n* symbols.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]
    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Collect digits least-significant first, then reverse
    digits = []
    while num:
        num, rem = divmod(num, n)
        digits.append(table[rem])
    return ''.join(reversed(digits))
f52354a8
YCH
4705
4706
def decode_packed_codes(code):
    """Unpack Dean Edwards-style p.a.c.k.e.d JavaScript.

    Extracts the obfuscated source, base, symbol count and symbol list
    via PACKED_CODES_RE and substitutes every word token back to its
    original symbol.
    """
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    symbols = symbols.split('|')

    # Map each base-N index string to its symbol; an empty symbol means
    # the token stands for itself
    symbol_table = {}
    for idx in range(int(count) - 1, -1, -1):
        key = encode_base_n(idx, base)
        symbol_table[key] = symbols[idx] or key

    return re.sub(
        r'\b(\w+)\b', lambda m: symbol_table[m.group(0)],
        obfuscated_code)
e154c651 4723
4724
1ced2221
S
4725def caesar(s, alphabet, shift):
4726 if shift == 0:
4727 return s
4728 l = len(alphabet)
4729 return ''.join(
4730 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4731 for c in s)
4732
4733
def rot47(s):
    # ROT47: Caesar-shift the 94 printable ASCII characters ('!'..'~') by 47,
    # which makes the transformation its own inverse.
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4736
4737
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=value,KEY="quoted",...') into a
    dict, stripping the surrounding double quotes from quoted values."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
1143535d
YCH
4745
4746
def urshift(val, n):
    """Unsigned 32-bit right shift, like JavaScript's '>>>' operator."""
    if val < 0:
        # Reinterpret the negative value as its unsigned 32-bit equivalent
        val += 0x100000000
    return val >> n
d3f8e038
YCH
4749
4750
4751# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 4752# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
def decode_png(png_data):
    """Decode a PNG byte string into (width, height, pixels).

    pixels is a list of rows, each a flat list of reconstructed channel
    bytes (3 bytes per pixel — the filter logic below assumes 8-bit RGB
    without alpha; TODO confirm callers only feed such PNGs).
    Raises IOError for invalid PNG data or a missing IDAT chunk.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # All PNG integers are big-endian; width of 1, 2 or 4 bytes
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Split stream into chunks: 4-byte length, 4-byte type, data, 4-byte CRC
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:] # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # IHDR is the first chunk (validated by the magic check above)
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Pixel data may be split across several IDAT chunks; join before inflating
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # 3 bytes per pixel; each scanline is prefixed with one filter-type byte
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        # Fetch an already-reconstructed byte by flat index into the image
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # 'left' is the same channel of the previous pixel, 3 bytes back
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            # Undo the per-scanline filter (PNG spec, section 9 "Filtering")
            if filter_type == 1: # Sub
                color = (color + left) & 0xff
            elif filter_type == 2: # Up
                color = (color + up) & 0xff
            elif filter_type == 3: # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4: # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Pick the predictor closest to p; ties prefer a, then b
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
efa97bdc
YCH
4856
4857
def write_xattr(path, key, value):
    """Set extended attribute *key* to the bytes *value* on the file at *path*.

    Tries, in order: the pyxattr/xattr Python modules; NTFS Alternate Data
    Streams on Windows; the setfattr/xattr command-line tools.
    Raises XAttrMetadataError when setting fails and XAttrUnavailableError
    when no usable implementation is found.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'): # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'yt-dlp requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else: # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            # Fall back to the CLI tools shipped with GNU attr / macOS
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # CLI tools take the value as a text argument
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)]
                       + [encodeArgument(o) for o in opts]
                       + [encodeFilename(path, True)])

                try:
                    p = Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate_or_kill()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
0c265486
YCH
4940
4941
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to a random birth date
    (between 1950-01-01 and 1995-12-31) as strings."""
    range_start = datetime.date(1950, 1, 1)
    range_end = datetime.date(1995, 12, 31)
    span_days = (range_end - range_start).days
    birthday = range_start + datetime.timedelta(random.randint(0, span_days))
    return {
        year_field: str(birthday.year),
        month_field: str(birthday.month),
        day_field: str(birthday.day),
    }
732044af 4952
c76eb41b 4953
# Templates for internet shortcut files, which are plain text files.

# Windows/KDE-style .url shortcut (INI format)
DOT_URL_LINK_TEMPLATE = '''
[InternetShortcut]
URL=%(url)s
'''.lstrip()

# macOS Finder .webloc shortcut (XML property list)
DOT_WEBLOC_LINK_TEMPLATE = '''
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''.lstrip()

# freedesktop.org .desktop link entry
DOT_DESKTOP_LINK_TEMPLATE = '''
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''.lstrip()

# Shortcut-format name -> template text
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4985
732044af 4986
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    # NOTE(review): an explicit port 80 is dropped regardless of scheme — looks
    # http-centric; confirm behavior is acceptable for https (443) callers.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
         net_location,

         compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

         # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
         compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

         # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
         compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

         compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5029
5030
def to_high_limit_path(path):
    r"""On Windows, return *path* absolutized with a \\?\ prefix to bypass the
    MAX_PATH limit; on other platforms return it unchanged."""
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # Individual path segments may still be length-limited even with the prefix
    return r'\\?\ '.rstrip() + os.path.abspath(path)
76d321f6 5037
c76eb41b 5038
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    """Look up *field* in *obj* and render it through *template*;
    values listed in *ignore* yield *default*; *func* post-processes the value."""
    val = traverse_obj(obj, *variadic(field))
    if val in ignore:
        return default
    if func:
        val = func(val)
    return template % val
00dd0cd5 5044
5045
def clean_podcast_url(url):
    """Strip known podcast tracking/analytics redirect prefixes from *url*."""
    trackers = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(trackers, '', url)
ffcb8191
THD
5061
5062
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Generate a random UUIDv4-shaped string (version nibble fixed to '4').

    NOTE(review): the 'y' position gets any hex digit rather than the
    RFC 4122 variant nibble — presumably close enough for its callers.
    """
    pick_hex = lambda _match: _HEX_TABLE[random.randint(0, 15)]
    return re.sub(r'[xy]', pick_hex, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 5068
5069
def make_dir(path, to_screen=None):
    """Create the parent directory of *path* if it does not exist.

    Returns True on success (or when nothing needed doing), False on error.
    If *to_screen* is callable it receives an error message on failure.
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except (OSError, IOError) as err:
        # BUG FIX: was `if callable(to_screen) is not None:` — callable()
        # returns a bool so that test was always True, and a None to_screen
        # would then be called and crash
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
f74980cb 5080
5081
def get_executable_path():
    """Return the absolute directory yt-dlp is running from, whether that is
    a PyInstaller bundle, a zipped package, or a source checkout."""
    from zipimport import zipimporter
    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        base = os.path.dirname(sys.executable)
    elif isinstance(globals().get('__loader__'), zipimporter):  # Running from ZIP
        base = os.path.join(os.path.dirname(__file__), '../..')
    else:  # Plain source tree: utils.py lives one level below the root
        base = os.path.join(os.path.dirname(__file__), '..')
    return os.path.abspath(base)
5091
5092
def load_plugins(name, suffix, namespace):
    """Load classes whose names end in *suffix* from ytdlp_plugins/<name>
    into *namespace*, skipping names already present.

    Returns a dict of the newly loaded classes; a missing plugin package
    is silently ignored.
    """
    classes = {}
    try:
        plugins_spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        plugins = importlib.util.module_from_spec(plugins_spec)
        sys.modules[plugins_spec.name] = plugins
        plugins_spec.loader.exec_module(plugins)
        # FIX: use a distinct loop variable — the original reused (and
        # shadowed) the `name` parameter here
        for attr in dir(plugins):
            if attr in namespace:
                continue
            if not attr.endswith(suffix):
                continue
            klass = getattr(plugins, attr)
            classes[attr] = namespace[attr] = klass
    except FileNotFoundError:
        pass
    return classes
06167fbb 5111
5112
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a string,
                            a function, a tuple of strings/None or "...".
                            When a function is given, it takes the key as argument and
                            returns whether the key matches or not. When a tuple is given,
                            all the keys given in the tuple are traversed, and
                            "..." traverses all the keys in the object
                            "None" returns the object without traversal
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    # TODO: Write tests
    '''
    if not casesense:
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    # Resolves one path against obj. Branching keys ("...", tuples, callables)
    # return nested result lists; `depth` (nonlocal, per-path) counts how many
    # branching levels occurred so the caller can flatten accordingly.
    def _traverse_obj(obj, path, _current_depth=0):
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            # A None key (or running off the data) stops traversal here
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                # Tuple/list key: traverse every alternative, then fan out like "..."
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                # Callable key: filter by key/index predicate
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    obj = str(obj)
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                # Plain dict lookup; falls back to case-insensitive scan if requested
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    # User-supplied strings become int indices or slices
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        # "[:]" is equivalent to "..."
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Flatten the nested branch results down to a single list
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
324ad820 5210
5211
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated alias: delegates to traverse_obj() with user-input key handling."""
    write_string(
        'DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
        'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    return traverse_obj(
        dictn, keys, casesense=casesense,
        is_user_input=True, traverse_string=True)
6606817a 5216
5217
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return *x* unchanged if it is a non-atomic iterable, else wrap it in a tuple.

    Types in *allowed_types* (strings, bytes, dicts by default) are treated
    as atomic values even though they are iterable.
    """
    if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types):
        return x
    return (x,)
bd50a52b
THD
5220
5221
3e9b66d7
LNO
def decode_base(value, digits):
    """Convert the base-len(digits) string *value* to an int.

    *digits* lists the digit characters from lowest value upward.
    Raises KeyError if *value* contains a character not in *digits*.
    """
    # This will convert given base-x string to scalar (long or int)
    table = {char: index for index, char in enumerate(digits)}
    result = 0
    base = len(digits)
    # FIX: loop variable renamed — the original shadowed the builtin `chr`
    for c in value:
        result *= base
        result += table[c]
    return result
5231
5232
def time_seconds(**kwargs):
    """Current Unix timestamp; kwargs (hours=..., minutes=..., etc.) describe
    the timezone offset to attach to the wall-clock reading."""
    tz = datetime.timezone(datetime.timedelta(**kwargs))
    return datetime.datetime.now(tz).timestamp()
5236
5237
49fa4d9a
N
5238# create a JSON Web Signature (jws) with HS256 algorithm
5239# the resulting format is in JWS Compact Serialization
5240# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5241# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create a JWS Compact Serialization token signed with HMAC-SHA256.

    *headers* entries are merged over the default {'alg': 'HS256', 'typ': 'JWT'}.
    Returns the token as bytes (header.payload.signature).
    FIX: the default for *headers* was a mutable dict ({}); changed to None
    (behavior unchanged — an empty/None headers skips the update either way).
    NOTE(review): segments use padded, non-urlsafe base64, whereas RFC 7515
    specifies unpadded base64url — kept as-is since callers may depend on
    the current output.
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
    h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
    return token
819e0531 5255
5256
16b0d7e6 5257# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Decode and return the payload dict of a JWT token.

    The signature is NOT verified.
    FIX: JWT base64url segments are transmitted without '=' padding, which
    makes urlsafe_b64decode raise binascii.Error; appending '===' restores
    enough padding for any length (superfluous '=' are ignored).
    """
    header_b64, payload_b64, signature_b64 = jwt.split('.')
    payload_data = json.loads(base64.urlsafe_b64decode(payload_b64 + '==='))
    return payload_data
5262
5263
def supports_terminal_sequences(stream):
    """Whether *stream* is a terminal able to render ANSI escape sequences."""
    if compat_os_name == 'nt':
        from .compat import WINDOWS_VT_MODE  # Must be imported locally
        # VT sequence support arrived in Windows 10 build 10586
        vt_available = WINDOWS_VT_MODE and get_windows_version() >= (10, 0, 10586)
        if not vt_available:
            return False
    elif not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
5275
5276
# Matches ANSI SGR sequences such as '\033[31m' / '\033[0m'
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Strip ANSI SGR (color/style) escape sequences from *string*."""
    return re.sub(_terminal_sequences_re, '', string)
5282
5283
def number_of_digits(number):
    """Length of the '%d' rendering of *number* (a '-' sign counts as a character)."""
    rendered = '%d' % number
    return len(rendered)
34921b43 5286
5287
def join_nonempty(*values, delim='-', from_dict=None):
    """Stringify and join the truthy *values* with *delim*.

    When *from_dict* is given, each value is first looked up as a key in it
    (missing keys yield None and are therefore dropped).
    """
    if from_dict is not None:
        values = (from_dict.get(value) for value in values)
    return delim.join(str(value) for value in values if value)
06e57990 5292
5293
27231526
ZM
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    # FIX: renamed the comprehension variable (it shadowed the builtin
    # `format`) and feed max() a generator instead of a throwaway list
    max_dimensions = max(
        (tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        return thumbnails
    return [
        merge_dicts(
            {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
            dict(zip(_keys, max_dimensions)), thumbnail)
        for thumbnail in thumbnails
    ]
5314
5315
93c8410d
LNO
def parse_http_range(range):
    """Parse a "Range" or "Content-Range" HTTP header value into (start, end, total);
    missing or unparsable parts come back as None."""
    if not range:
        return None, None, None
    crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if not crg:
        return None, None, None
    # Optional groups are digit strings or None
    as_int = lambda group: int(group) if group is not None else None
    return int(crg.group(1)), as_int(crg.group(2)), as_int(crg.group(3))
5324
5325
class Config:
    """A tree of command-line argument sources.

    Each node holds its own args plus child Configs created for every
    --config-locations entry found in those args; `all_args` flattens
    the tree back into one argument list.
    """
    own_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self._parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Adopt *args* (read from *filename*, if any) and recursively load
        referenced config locations. Returns False when the file was already
        loaded (cycle protection), True otherwise."""
        assert not self.__initialized
        directory = ''
        if filename:
            location = os.path.realpath(filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        self.own_args, self.filename = args, filename
        for location in self._parser.parse_args(args)[0].config_locations or []:
            # Relative locations resolve against the parent config's directory
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self._parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        # Render this config and its children as an indented tree,
        # with credential values masked
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read *filename* and split it into an argument list (shlex rules,
        '#' comments honored). Returns *default* when the file can't be opened.
        NOTE(review): the mutable default is never modified here, but callers
        receive the shared list — they must not mutate it."""
        try:
            optionf = open(filename)
        except IOError:
            return default # silently skip if file is not present
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read()
            if sys.version_info < (3,):
                contents = contents.decode(preferredencoding())
            res = compat_shlex_split(contents, comments=True)
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of *opts* with credential option values replaced by 'PRIVATE'."""
        PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            # Handles the single-token "--option=value" form
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # Handles the two-token "--option value" form
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        # Child configs share this node's loaded-path set so cycles are
        # detected across the whole tree
        config = type(self)(self._parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        # Children's args first, this config's own args last
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.own_args or []

    def parse_args(self):
        """Parse the flattened argument list with the stored parser."""
        return self._parser.parse_args(list(self.all_args))
da42679b
LNO
5413
5414
class WebSocketsWrapper():
    """Wraps websockets module to use in non-async scopes"""

    def __init__(self, url, headers=None):
        # A dedicated event loop lets the synchronous wrappers below drive
        # the async websockets connection
        self.loop = asyncio.events.new_event_loop()
        self.conn = compat_websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        # Make sure the connection is torn down even if __exit__ never runs
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        # Synchronous wrapper around the async send()
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        # Synchronous wrapper around the async recv()
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # NOTE(review): the loop is closed before _cancel_all_tasks runs;
            # run_until_complete inside it would fail on a closed loop if any
            # task were still pending — verify this ordering is intentional
            self.loop.close()
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        # Run the coroutine *main* to completion on *loop* and return its result
        if not asyncio.coroutines.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        # Cancel every pending task on *loop*, reporting unexpected exceptions
        to_cancel = asyncio.tasks.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        loop.run_until_complete(
            asyncio.tasks.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5478
5479
# True when the optional websockets dependency could be imported via compat
has_websockets = bool(compat_websockets)
8b7539d2 5481
5482
def merge_headers(*dicts):
    """Merge dicts of network headers case insensitively, prioritizing the latter ones"""
    merged = {}
    # str.capitalize() normalizes the key casing so later dicts override
    # earlier ones regardless of how the header name was spelled
    for headers in dicts:
        for key, value in headers.items():
            merged[key.capitalize()] = value
    return merged