import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import importlib.util
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'mpg',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)


# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

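# Illustrative sketch (not part of the original module): how xpath_with_ns()
# expands prefixed path components. The 'media' prefix and URI below are example values.
def _example_xpath_with_ns():
    doc = compat_etree_fromstring(
        '<root xmlns:media="http://search.yahoo.com/mrss/">'
        '<media:content url="http://example.com/video.mp4"/></root>')
    path = xpath_with_ns('media:content', {'media': 'http://search.yahoo.com/mrss/'})
    # path == '{http://search.yahoo.com/mrss/}content'
    return doc.find(path).attrib['url']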

def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )

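# Illustrative sketch (not part of the original module): the class-based helpers
# defined above, applied to a hypothetical HTML snippet.
def _example_get_element_by_class():
    snippet = '<div class="title main">Hello &amp; welcome</div>'
    # The *_by_class helpers return the unescaped text content of the first match ...
    assert get_element_by_class('title', snippet) == 'Hello & welcome'
    # ... while the html variants return the whole matching element
    assert get_element_html_by_class('title', snippet) == snippet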
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')

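# Illustrative sketch (not part of the original module): get_element_text_and_html_by_tag()
# tracks nested tags of the same name and stops at the matching closing tag.
def _example_get_element_text_and_html_by_tag():
    doc = '<div>outer <div>inner</div> tail</div> trailing'
    text, whole = get_element_text_and_html_by_tag('div', doc)
    assert text == 'outer <div>inner</div> tail'
    assert whole == '<div>outer <div>inner</div> tail</div>'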

class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

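# Illustrative sketch (not part of the original module): clean_html() collapses
# whitespace, converts <br> and </p><p> into newlines, strips the remaining tags
# and unescapes HTML entities.
def _example_clean_html():
    snippet = '<p>Line one<br>Line   two</p><p>Fish &amp; chips</p>'
    assert clean_html(snippet) == 'Line one\nLine two\nFish & chips'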
class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            return self.raw_decode(s.lstrip())[0]
        return super().decode(s)

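# Illustrative sketch (not part of the original module): LenientJSONDecoder can strip
# an XSSI-style prefix via transform_source and tolerate trailing garbage with ignore_extra.
def _example_lenient_json_decoder():
    raw = ')]}\'\n{"ok": true}; trailing garbage'
    decoder = LenientJSONDecoder(
        transform_source=lambda s: s.partition('\n')[2],  # drop the ")]}'" prefix line
        ignore_extra=True)  # only decode the first JSON value
    assert decoder.decode(raw) == {'ok': True}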
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

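# Illustrative sketch (not part of the original module): sanitize_filename() in default
# mode vs restricted mode (ASCII only, accents transliterated, spaces replaced by "_").
def _example_sanitize_filename():
    title = 'Pépé: the "best" video? 12:34'
    assert sanitize_filename(title) == "Pépé - the 'best' video 12_34"
    assert sanitize_filename(title, restricted=True) == 'Pepe_-_the_best_video_12_34'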

def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)

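# Illustrative sketch (not part of the original module): sanitize_path() only rewrites
# paths on Windows (or when force=True); elsewhere the input is returned unchanged.
def _example_sanitize_path():
    path = os.path.join('downloads', 'what?', 'video: part 1.')
    if sys.platform == 'win32':
        assert sanitize_path(path) == 'downloads\\what#\\video# part 1#'
    else:
        assert sanitize_path(path) == path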
def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url


def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'

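# Illustrative sketch (not part of the original module): extract_basic_auth() strips the
# userinfo from a URL and turns it into an HTTP Basic "Authorization" header value.
def _example_extract_basic_auth():
    url, auth = extract_basic_auth('https://user:pass@example.com/feed')
    assert url == 'https://example.com/feed'
    assert auth == 'Basic dXNlcjpwYXNz'  # base64 of 'user:pass'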
def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())

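# Illustrative sketch (not part of the original module): orderedSet() keeps the first
# occurrence of each item and preserves order; lazy=True yields a generator instead.
def _example_orderedSet():
    assert orderedSet([1, 2, 1, 3, 2]) == [1, 2, 3]
    assert list(orderedSet('abacab', lazy=True)) == ['a', 'b', 'c']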

def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode

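# Illustrative sketch (not part of the original module): Popen.run() returns
# (stdout, stderr, returncode) and applies the hidden-window startupinfo on Windows.
def _example_popen_run():
    stdout, stderr, returncode = Popen.run(
        [sys.executable, '-c', 'print("hello")'],
        text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    assert returncode == 0 and stdout.strip() == 'hello'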
def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret

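# Illustrative sketch (not part of the original module): timetuple_from_msec() splits a
# millisecond count into named fields and formatSeconds() renders a duration from seconds.
def _example_formatSeconds():
    assert timetuple_from_msec(3723004) == (1, 2, 3, 4)
    assert formatSeconds(3723) == '1:02:03'
    assert formatSeconds(45.5, msec=True) == '45.500'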
def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass

def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc

def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response

def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection

class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise

ac668111 1474class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
f1a8511f
S
1475 """
1476 See [1] for cookie file format.
1477
1478 1. https://curl.haxx.se/docs/http-cookies.html
1479 """
e7e62441 1480 _HTTPONLY_PREFIX = '#HttpOnly_'
c380cc28
S
1481 _ENTRY_LEN = 7
1482 _HEADER = '''# Netscape HTTP Cookie File
7a5c1cfe 1483# This file is generated by yt-dlp. Do not edit.
c380cc28
S
1484
1485'''
1486 _CookieFileEntry = collections.namedtuple(
1487 'CookieFileEntry',
1488 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
e7e62441 1489
d76fa1f3 1490 def __init__(self, filename=None, *args, **kwargs):
1491 super().__init__(None, *args, **kwargs)
1492 if self.is_path(filename):
1493 filename = os.fspath(filename)
1494 self.filename = filename
1495
24146491 1496 @staticmethod
1497 def _true_or_false(cndn):
1498 return 'TRUE' if cndn else 'FALSE'
1499
d76fa1f3 1500 @staticmethod
1501 def is_path(file):
1502 return isinstance(file, (str, bytes, os.PathLike))
1503
1504 @contextlib.contextmanager
1505 def open(self, file, *, write=False):
1506 if self.is_path(file):
1507 with open(file, 'w' if write else 'r', encoding='utf-8') as f:
1508 yield f
1509 else:
1510 if write:
1511 file.truncate(0)
1512 yield file
1513
24146491 1514 def _really_save(self, f, ignore_discard=False, ignore_expires=False):
1515 now = time.time()
1516 for cookie in self:
1517 if (not ignore_discard and cookie.discard
1518 or not ignore_expires and cookie.is_expired(now)):
1519 continue
1520 name, value = cookie.name, cookie.value
1521 if value is None:
1522 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1523 # with no name, whereas http.cookiejar regards it as a
1524 # cookie with no value.
1525 name, value = '', name
1526 f.write('%s\n' % '\t'.join((
1527 cookie.domain,
1528 self._true_or_false(cookie.domain.startswith('.')),
1529 cookie.path,
1530 self._true_or_false(cookie.secure),
1531 str_or_none(cookie.expires, default=''),
1532 name, value
1533 )))
1534
1535 def save(self, filename=None, *args, **kwargs):
c380cc28
S
1536 """
1537 Save cookies to a file.
24146491 1538 Code is taken from CPython 3.6
1539 https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
c380cc28 1540
c380cc28
S
1541 if filename is None:
1542 if self.filename is not None:
1543 filename = self.filename
1544 else:
ac668111 1545 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
c380cc28 1546
24146491 1547 # Store session cookies with `expires` set to 0 instead of an empty string
1bab3437
S
1548 for cookie in self:
1549 if cookie.expires is None:
1550 cookie.expires = 0
c380cc28 1551
d76fa1f3 1552 with self.open(filename, write=True) as f:
c380cc28 1553 f.write(self._HEADER)
24146491 1554 self._really_save(f, *args, **kwargs)
1bab3437
S
1555
1556 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
e7e62441 1557 """Load cookies from a file."""
1558 if filename is None:
1559 if self.filename is not None:
1560 filename = self.filename
1561 else:
ac668111 1562 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
e7e62441 1563
c380cc28
S
1564 def prepare_line(line):
1565 if line.startswith(self._HTTPONLY_PREFIX):
1566 line = line[len(self._HTTPONLY_PREFIX):]
1567 # comments and empty lines are fine
1568 if line.startswith('#') or not line.strip():
1569 return line
1570 cookie_list = line.split('\t')
1571 if len(cookie_list) != self._ENTRY_LEN:
ac668111 1572 raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
c380cc28
S
1573 cookie = self._CookieFileEntry(*cookie_list)
1574 if cookie.expires_at and not cookie.expires_at.isdigit():
ac668111 1575 raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
c380cc28
S
1576 return line
1577
e7e62441 1578 cf = io.StringIO()
d76fa1f3 1579 with self.open(filename) as f:
e7e62441 1580 for line in f:
c380cc28
S
1581 try:
1582 cf.write(prepare_line(line))
ac668111 1583 except http.cookiejar.LoadError as e:
94aa0644 1584 if f'{line.strip()} '[0] in '[{"':
ac668111 1585 raise http.cookiejar.LoadError(
94aa0644
L
1586 'Cookies file must be Netscape formatted, not JSON. See '
1587 'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
19a03940 1588 write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
c380cc28 1589 continue
e7e62441 1590 cf.seek(0)
1591 self._really_load(cf, filename, ignore_discard, ignore_expires)
1bab3437
S
1592 # Session cookies are denoted by either `expires` field set to
1593 # an empty string or 0. MozillaCookieJar only recognizes the former
1594 # (see [1]). So we need to force the latter to be recognized as session
1595 # cookies on our own.
1596 # Session cookies may be important for cookies-based authentication,
1597 # e.g. when a user does not check the 'Remember me' box while logging
1598 # in on a site, some important cookies are stored as session cookies,
1599 # so failing to recognize them results in a failed login.
1600 # 1. https://bugs.python.org/issue17164
1601 for cookie in self:
1602 # Treat `expires=0` cookies as session cookies
1603 if cookie.expires == 0:
1604 cookie.expires = None
1605 cookie.discard = True
1606
1607
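A minimal usage sketch (not part of the original file) for the cookie jar defined above. The filename 'cookies.txt' is illustrative and assumed to already exist (e.g. exported from a browser); the ignore_* flags are passed because _really_save() skips discarded/expired cookies by default, and session cookies are stored with expires=0.

import yt_dlp.utils

jar = yt_dlp.utils.YoutubeDLCookieJar('cookies.txt')
# Accepts "#HttpOnly_"-prefixed lines and rejects JSON exports with an explanatory error
jar.load(ignore_discard=True, ignore_expires=True)
# Writes the "# Netscape HTTP Cookie File" header; session cookies are written with expires=0
jar.save(ignore_discard=True, ignore_expires=True)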
ac668111 1608class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
a6420bf5 1609 def __init__(self, cookiejar=None):
ac668111 1610 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
a6420bf5
S
1611
1612 def http_response(self, request, response):
ac668111 1613 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
a6420bf5 1614
ac668111 1615 https_request = urllib.request.HTTPCookieProcessor.http_request
a6420bf5
S
1616 https_response = http_response
1617
1618
ac668111 1619class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
201c1459 1620 """YoutubeDL redirect handler
1621
1622 The code is based on HTTPRedirectHandler implementation from CPython [1].
1623
1624 This redirect handler solves two issues:
1625 - ensures redirect URL is always unicode under python 2
1626 - introduces support for experimental HTTP response status code
1627 308 Permanent Redirect [2] used by some sites [3]
1628
1629 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1630 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1631 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1632 """
1633
ac668111 1634 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
201c1459 1635
1636 def redirect_request(self, req, fp, code, msg, headers, newurl):
1637 """Return a Request or None in response to a redirect.
1638
1639 This is called by the http_error_30x methods when a
1640 redirection response is received. If a redirection should
1641 take place, return a new Request to allow http_error_30x to
1642 perform the redirect. Otherwise, raise HTTPError if no-one
1643 else should try to handle this url. Return None if you can't
1644 but another Handler might.
1645 """
1646 m = req.get_method()
1647 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1648 or code in (301, 302, 303) and m == "POST")):
14f25df2 1649 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
201c1459 1650 # Strictly (according to RFC 2616), 301 or 302 in response to
1651 # a POST MUST NOT cause a redirection without confirmation
1652 # from the user (of urllib.request, in this case). In practice,
1653 # essentially all clients do redirect in this case, so we do
1654 # the same.
1655
201c1459 1656 # Be conciliant with URIs containing a space. This is mainly
1657 # redundant with the more complete encoding done in http_error_302(),
1658 # but it is kept for compatibility with other callers.
1659 newurl = newurl.replace(' ', '%20')
1660
1661 CONTENT_HEADERS = ("content-length", "content-type")
1662 # Strip the content headers so they are not carried over to the new request
86e5f3ed 1663 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
afac4caa 1664
1665 # A 303 must either use GET or HEAD for subsequent request
1666 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1667 if code == 303 and m != 'HEAD':
1668 m = 'GET'
1669 # 301 and 302 redirects are commonly turned into a GET from a POST
1670 # for subsequent requests by browsers, so we'll do the same.
1671 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1672 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1673 if code in (301, 302) and m == 'POST':
1674 m = 'GET'
1675
ac668111 1676 return urllib.request.Request(
201c1459 1677 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
afac4caa 1678 unverifiable=True, method=m)
fca6dba8
S
1679
1680
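As an illustrative sketch (assuming yt-dlp is importable), a handler like the one above can be installed into a urllib opener, where it replaces the stock HTTPRedirectHandler; the URL below is a placeholder.

import urllib.request
from yt_dlp.utils import YoutubeDLRedirectHandler

opener = urllib.request.build_opener(YoutubeDLRedirectHandler())
# A 303 answer to this POST would be retried as a GET, and 308 is handled like 301
# response = opener.open('https://example.com/submit', data=b'a=1')  # placeholder URL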
46f59e89
S
1681def extract_timezone(date_str):
1682 m = re.search(
f137e4c2 1683 r'''(?x)
1684 ^.{8,}? # >=8 char non-TZ prefix, if present
1685 (?P<tz>Z| # just the UTC Z, or
1686 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1687 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1688 [ ]? # optional space
1689 (?P<sign>\+|-) # +/-
1690 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1691 $)
1692 ''', date_str)
46f59e89
S
1693 if not m:
1694 timezone = datetime.timedelta()
1695 else:
1696 date_str = date_str[:-len(m.group('tz'))]
1697 if not m.group('sign'):
1698 timezone = datetime.timedelta()
1699 else:
1700 sign = 1 if m.group('sign') == '+' else -1
1701 timezone = datetime.timedelta(
1702 hours=sign * int(m.group('hours')),
1703 minutes=sign * int(m.group('minutes')))
1704 return timezone, date_str
1705
1706
08b38d54 1707def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1708 """ Return a UNIX timestamp from the given date """
1709
1710 if date_str is None:
1711 return None
1712
52c3a6e4
S
1713 date_str = re.sub(r'\.[0-9]+', '', date_str)
1714
08b38d54 1715 if timezone is None:
46f59e89
S
1716 timezone, date_str = extract_timezone(date_str)
1717
19a03940 1718 with contextlib.suppress(ValueError):
86e5f3ed 1719 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
52c3a6e4
S
1720 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1721 return calendar.timegm(dt.timetuple())
912b38b4
PH
1722
1723
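A few worked examples of parse_iso8601() above, with inputs chosen so the expected results follow directly from the code shown:

from yt_dlp.utils import parse_iso8601

parse_iso8601('1970-01-01T00:00:00Z')                # 0 - the UNIX epoch
parse_iso8601('1970-01-01T01:00:00+01:00')           # 0 - the +01:00 offset is subtracted
parse_iso8601('1970-01-01 00:00:00', delimiter=' ')  # 0 - custom date/time delimiter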
46f59e89
S
1724def date_formats(day_first=True):
1725 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1726
1727
42bdd9d0 1728def unified_strdate(date_str, day_first=True):
bf50b038 1729 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1730
1731 if date_str is None:
1732 return None
bf50b038 1733 upload_date = None
5f6a1245 1734 # Replace commas
026fcc04 1735 date_str = date_str.replace(',', ' ')
42bdd9d0 1736 # Remove AM/PM + timezone
9bb8e0a3 1737 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1738 _, date_str = extract_timezone(date_str)
42bdd9d0 1739
46f59e89 1740 for expression in date_formats(day_first):
19a03940 1741 with contextlib.suppress(ValueError):
bf50b038 1742 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
42393ce2
PH
1743 if upload_date is None:
1744 timetuple = email.utils.parsedate_tz(date_str)
1745 if timetuple:
19a03940 1746 with contextlib.suppress(ValueError):
c6b9cf05 1747 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
6a750402 1748 if upload_date is not None:
14f25df2 1749 return str(upload_date)
bf50b038 1750
5f6a1245 1751
46f59e89
S
1752def unified_timestamp(date_str, day_first=True):
1753 if date_str is None:
1754 return None
1755
2ae2ffda 1756 date_str = re.sub(r'[,|]', '', date_str)
46f59e89 1757
7dc2a74e 1758 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1759 timezone, date_str = extract_timezone(date_str)
1760
1761 # Remove AM/PM + timezone
1762 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1763
deef3195
S
1764 # Remove unrecognized timezones from ISO 8601-like timestamps
1765 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1766 if m:
1767 date_str = date_str[:-len(m.group('tz'))]
1768
f226880c
PH
1769 # Python only supports microseconds, so remove nanoseconds
1770 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1771 if m:
1772 date_str = m.group(1)
1773
46f59e89 1774 for expression in date_formats(day_first):
19a03940 1775 with contextlib.suppress(ValueError):
7dc2a74e 1776 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89 1777 return calendar.timegm(dt.timetuple())
46f59e89
S
1778 timetuple = email.utils.parsedate_tz(date_str)
1779 if timetuple:
7dc2a74e 1780 return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1781
1782
28e614de 1783def determine_ext(url, default_ext='unknown_video'):
85750f89 1784 if url is None or '.' not in url:
f4776371 1785 return default_ext
9cb9a5df 1786 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1787 if re.match(r'^[A-Za-z0-9]+$', guess):
1788 return guess
a7aaa398
S
1789 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1790 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1791 return guess.rstrip('/')
73e79f2a 1792 else:
cbdbb766 1793 return default_ext
73e79f2a 1794
5f6a1245 1795
824fa511
S
1796def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1797 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
d4051a8e 1798
5f6a1245 1799
9e62f283 1800def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
3d38b2d6 1801 R"""
1802 Return a datetime object from a string.
1803 Supported format:
1804 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1805
1806 @param format strftime format of DATE
1807 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1808 auto: round to the unit provided in date_str (if applicable).
9e62f283 1809 """
1810 auto_precision = False
1811 if precision == 'auto':
1812 auto_precision = True
1813 precision = 'microsecond'
396a76f7 1814 today = datetime_round(datetime.datetime.utcnow(), precision)
f8795e10 1815 if date_str in ('now', 'today'):
37254abc 1816 return today
f8795e10
PH
1817 if date_str == 'yesterday':
1818 return today - datetime.timedelta(days=1)
9e62f283 1819 match = re.match(
3d38b2d6 1820 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
9e62f283 1821 date_str)
37254abc 1822 if match is not None:
9e62f283 1823 start_time = datetime_from_str(match.group('start'), precision, format)
1824 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
37254abc 1825 unit = match.group('unit')
9e62f283 1826 if unit == 'month' or unit == 'year':
1827 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
37254abc 1828 unit = 'day'
9e62f283 1829 else:
1830 if unit == 'week':
1831 unit = 'day'
1832 time *= 7
1833 delta = datetime.timedelta(**{unit + 's': time})
1834 new_date = start_time + delta
1835 if auto_precision:
1836 return datetime_round(new_date, unit)
1837 return new_date
1838
1839 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1840
1841
d49f8db3 1842def date_from_str(date_str, format='%Y%m%d', strict=False):
3d38b2d6 1843 R"""
1844 Return a date object from a string using datetime_from_str
9e62f283 1845
3d38b2d6 1846 @param strict Restrict allowed patterns to "YYYYMMDD" and
1847 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
9e62f283 1848 """
3d38b2d6 1849 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1850 raise ValueError(f'Invalid date format "{date_str}"')
9e62f283 1851 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1852
1853
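Illustrative calls to the two helpers above (assuming yt-dlp is importable); the relative forms are resolved against the current UTC time:

import datetime
from yt_dlp.utils import date_from_str, datetime_from_str

date_from_str('today') == datetime.datetime.utcnow().date()  # True
date_from_str('now-1week') == date_from_str('today-7days')   # True
datetime_from_str('20200101+3months', precision='day')       # datetime.datetime(2020, 4, 1, 0, 0)
date_from_str('yesterday', strict=True)                      # accepted by the strict pattern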
1854def datetime_add_months(dt, months):
1855 """Increment/Decrement a datetime object by months."""
1856 month = dt.month + months - 1
1857 year = dt.year + month // 12
1858 month = month % 12 + 1
1859 day = min(dt.day, calendar.monthrange(year, month)[1])
1860 return dt.replace(year, month, day)
1861
1862
1863def datetime_round(dt, precision='day'):
1864 """
1865 Round a datetime object's time to a specific precision
1866 """
1867 if precision == 'microsecond':
1868 return dt
1869
1870 unit_seconds = {
1871 'day': 86400,
1872 'hour': 3600,
1873 'minute': 60,
1874 'second': 1,
1875 }
1876 roundto = lambda x, n: ((x + n / 2) // n) * n
1877 timestamp = calendar.timegm(dt.timetuple())
1878 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
5f6a1245
JW
1879
1880
e63fc1be 1881def hyphenate_date(date_str):
1882 """
1883 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1884 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1885 if match is not None:
1886 return '-'.join(match.groups())
1887 else:
1888 return date_str
1889
5f6a1245 1890
86e5f3ed 1891class DateRange:
bd558525 1892 """Represents a time interval between two dates"""
5f6a1245 1893
bd558525
JMF
1894 def __init__(self, start=None, end=None):
1895 """start and end must be strings in the format accepted by date"""
1896 if start is not None:
d49f8db3 1897 self.start = date_from_str(start, strict=True)
bd558525
JMF
1898 else:
1899 self.start = datetime.datetime.min.date()
1900 if end is not None:
d49f8db3 1901 self.end = date_from_str(end, strict=True)
bd558525
JMF
1902 else:
1903 self.end = datetime.datetime.max.date()
37254abc 1904 if self.start > self.end:
bd558525 1905 raise ValueError('Date range: "%s": the start date must be before the end date' % self)
5f6a1245 1906
bd558525
JMF
1907 @classmethod
1908 def day(cls, day):
1909 """Returns a range that only contains the given day"""
5f6a1245
JW
1910 return cls(day, day)
1911
bd558525
JMF
1912 def __contains__(self, date):
1913 """Check if the date is in the range"""
37254abc
JMF
1914 if not isinstance(date, datetime.date):
1915 date = date_from_str(date)
1916 return self.start <= date <= self.end
5f6a1245 1917
bd558525 1918 def __str__(self):
86e5f3ed 1919 return f'{self.start.isoformat()} - {self.end.isoformat()}'
c496ca96 1920
f2df4071 1921 def __eq__(self, other):
1922 return (isinstance(other, DateRange)
1923 and self.start == other.start and self.end == other.end)
1924
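A short sketch of the class above; the dates are arbitrary examples:

from yt_dlp.utils import DateRange

season = DateRange('20220101', '20220630')
'20220315' in season                                            # True - strings go through date_from_str()
'20221001' in season                                            # False
DateRange.day('20220101') == DateRange('20220101', '20220101')  # True, via __eq__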
c496ca96
PH
1925
1926def platform_name():
14f25df2 1927 """ Returns the platform name as a str """
b1f94422 1928 write_string('DeprecationWarning: yt_dlp.utils.platform_name is deprecated, use platform.platform instead')
1929 return platform.platform()
c496ca96 1930
b1f94422 1931
1932@functools.cache
1933def system_identifier():
1934 python_implementation = platform.python_implementation()
1935 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1936 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
1937
1938 return 'Python %s (%s %s) - %s %s' % (
1939 platform.python_version(),
1940 python_implementation,
1941 platform.architecture()[0],
1942 platform.platform(),
1943 format_field(join_nonempty(*platform.libc_ver(), delim=' '), None, '(%s)'),
1944 )
c257baff
PH
1945
1946
0b9c08b4 1947@functools.cache
49fa4d9a 1948def get_windows_version():
8a82af35 1949 ''' Get Windows version. Returns () if it's not running on Windows '''
49fa4d9a
N
1950 if compat_os_name == 'nt':
1951 return version_tuple(platform.win32_ver()[1])
1952 else:
8a82af35 1953 return ()
49fa4d9a
N
1954
1955
734f90bb 1956def write_string(s, out=None, encoding=None):
19a03940 1957 assert isinstance(s, str)
1958 out = out or sys.stderr
7459e3a2 1959
fe1daad3 1960 if compat_os_name == 'nt' and supports_terminal_sequences(out):
3fe75fdc 1961 s = re.sub(r'([\r\n]+)', r' \1', s)
59f943cd 1962
8a82af35 1963 enc, buffer = None, out
cfb0511d 1964 if 'b' in getattr(out, 'mode', ''):
c487cf00 1965 enc = encoding or preferredencoding()
104aa738 1966 elif hasattr(out, 'buffer'):
8a82af35 1967 buffer = out.buffer
104aa738 1968 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
c487cf00 1969
8a82af35 1970 buffer.write(s.encode(enc, 'ignore') if enc else s)
7459e3a2
PH
1971 out.flush()
1972
1973
48ea9cea
PH
1974def bytes_to_intlist(bs):
1975 if not bs:
1976 return []
1977 if isinstance(bs[0], int): # Python 3
1978 return list(bs)
1979 else:
1980 return [ord(c) for c in bs]
1981
c257baff 1982
cba892fa 1983def intlist_to_bytes(xs):
1984 if not xs:
1985 return b''
ac668111 1986 return struct.pack('%dB' % len(xs), *xs)
c38b1e77
PH
1987
1988
8a82af35 1989class LockingUnsupportedError(OSError):
1890fc63 1990 msg = 'File locking is not supported'
0edb3e33 1991
1992 def __init__(self):
1993 super().__init__(self.msg)
1994
1995
c1c9a79c
PH
1996# Cross-platform file locking
1997if sys.platform == 'win32':
1998 import ctypes.wintypes
1999 import msvcrt
2000
2001 class OVERLAPPED(ctypes.Structure):
2002 _fields_ = [
2003 ('Internal', ctypes.wintypes.LPVOID),
2004 ('InternalHigh', ctypes.wintypes.LPVOID),
2005 ('Offset', ctypes.wintypes.DWORD),
2006 ('OffsetHigh', ctypes.wintypes.DWORD),
2007 ('hEvent', ctypes.wintypes.HANDLE),
2008 ]
2009
2010 kernel32 = ctypes.windll.kernel32
2011 LockFileEx = kernel32.LockFileEx
2012 LockFileEx.argtypes = [
2013 ctypes.wintypes.HANDLE, # hFile
2014 ctypes.wintypes.DWORD, # dwFlags
2015 ctypes.wintypes.DWORD, # dwReserved
2016 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2017 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2018 ctypes.POINTER(OVERLAPPED) # Overlapped
2019 ]
2020 LockFileEx.restype = ctypes.wintypes.BOOL
2021 UnlockFileEx = kernel32.UnlockFileEx
2022 UnlockFileEx.argtypes = [
2023 ctypes.wintypes.HANDLE, # hFile
2024 ctypes.wintypes.DWORD, # dwReserved
2025 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2026 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2027 ctypes.POINTER(OVERLAPPED) # Overlapped
2028 ]
2029 UnlockFileEx.restype = ctypes.wintypes.BOOL
2030 whole_low = 0xffffffff
2031 whole_high = 0x7fffffff
2032
747c0bd1 2033 def _lock_file(f, exclusive, block):
c1c9a79c
PH
2034 overlapped = OVERLAPPED()
2035 overlapped.Offset = 0
2036 overlapped.OffsetHigh = 0
2037 overlapped.hEvent = 0
2038 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
747c0bd1 2039
2040 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2041 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2042 0, whole_low, whole_high, f._lock_file_overlapped_p):
2cb19820 2043 # NB: The no-argument form of "ctypes.FormatError" does not work on PyPy
2044 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
c1c9a79c
PH
2045
2046 def _unlock_file(f):
2047 assert f._lock_file_overlapped_p
2048 handle = msvcrt.get_osfhandle(f.fileno())
747c0bd1 2049 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
c1c9a79c
PH
2050 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2051
2052else:
399a76e6
YCH
2053 try:
2054 import fcntl
c1c9a79c 2055
a3125791 2056 def _lock_file(f, exclusive, block):
b63837bc 2057 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2058 if not block:
2059 flags |= fcntl.LOCK_NB
acea8d7c 2060 try:
b63837bc 2061 fcntl.flock(f, flags)
acea8d7c
JK
2062 except BlockingIOError:
2063 raise
2064 except OSError: # AOSP does not have flock()
b63837bc 2065 fcntl.lockf(f, flags)
c1c9a79c 2066
399a76e6 2067 def _unlock_file(f):
acea8d7c
JK
2068 try:
2069 fcntl.flock(f, fcntl.LOCK_UN)
2070 except OSError:
2071 fcntl.lockf(f, fcntl.LOCK_UN)
a3125791 2072
399a76e6 2073 except ImportError:
399a76e6 2074
a3125791 2075 def _lock_file(f, exclusive, block):
0edb3e33 2076 raise LockingUnsupportedError()
399a76e6
YCH
2077
2078 def _unlock_file(f):
0edb3e33 2079 raise LockingUnsupportedError()
c1c9a79c
PH
2080
2081
86e5f3ed 2082class locked_file:
0edb3e33 2083 locked = False
747c0bd1 2084
a3125791 2085 def __init__(self, filename, mode, block=True, encoding=None):
fcfa8853
JK
2086 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2087 raise NotImplementedError(mode)
2088 self.mode, self.block = mode, block
2089
2090 writable = any(f in mode for f in 'wax+')
2091 readable = any(f in mode for f in 'r+')
2092 flags = functools.reduce(operator.ior, (
2093 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2094 getattr(os, 'O_BINARY', 0), # Windows only
2095 getattr(os, 'O_NOINHERIT', 0), # Windows only
2096 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2097 os.O_APPEND if 'a' in mode else 0,
2098 os.O_EXCL if 'x' in mode else 0,
2099 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2100 ))
2101
98804d03 2102 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
c1c9a79c
PH
2103
2104 def __enter__(self):
a3125791 2105 exclusive = 'r' not in self.mode
c1c9a79c 2106 try:
a3125791 2107 _lock_file(self.f, exclusive, self.block)
0edb3e33 2108 self.locked = True
86e5f3ed 2109 except OSError:
c1c9a79c
PH
2110 self.f.close()
2111 raise
fcfa8853 2112 if 'w' in self.mode:
131e14dc
JK
2113 try:
2114 self.f.truncate()
2115 except OSError as e:
1890fc63 2116 if e.errno not in (
2117 errno.ESPIPE, # Illegal seek - expected for FIFO
2118 errno.EINVAL, # Invalid argument - expected for /dev/null
2119 ):
2120 raise
c1c9a79c
PH
2121 return self
2122
0edb3e33 2123 def unlock(self):
2124 if not self.locked:
2125 return
c1c9a79c 2126 try:
0edb3e33 2127 _unlock_file(self.f)
c1c9a79c 2128 finally:
0edb3e33 2129 self.locked = False
c1c9a79c 2130
0edb3e33 2131 def __exit__(self, *_):
2132 try:
2133 self.unlock()
2134 finally:
2135 self.f.close()
4eb7f1d1 2136
0edb3e33 2137 open = __enter__
2138 close = __exit__
a3125791 2139
0edb3e33 2140 def __getattr__(self, attr):
2141 return getattr(self.f, attr)
a3125791 2142
0edb3e33 2143 def __iter__(self):
2144 return iter(self.f)
a3125791 2145
4eb7f1d1 2146
0b9c08b4 2147@functools.cache
4644ac55
S
2148def get_filesystem_encoding():
2149 encoding = sys.getfilesystemencoding()
2150 return encoding if encoding is not None else 'utf-8'
2151
2152
4eb7f1d1 2153def shell_quote(args):
a6a173c2 2154 quoted_args = []
4644ac55 2155 encoding = get_filesystem_encoding()
a6a173c2
JMF
2156 for a in args:
2157 if isinstance(a, bytes):
2158 # We may get a filename encoded with 'encodeFilename'
2159 a = a.decode(encoding)
aefce8e6 2160 quoted_args.append(compat_shlex_quote(a))
28e614de 2161 return ' '.join(quoted_args)
9d4660ca
PH
2162
2163
2164def smuggle_url(url, data):
2165 """ Pass additional data in a URL for internal use. """
2166
81953d1a
RA
2167 url, idata = unsmuggle_url(url, {})
2168 data.update(idata)
14f25df2 2169 sdata = urllib.parse.urlencode(
28e614de
PH
2170 {'__youtubedl_smuggle': json.dumps(data)})
2171 return url + '#' + sdata
9d4660ca
PH
2172
2173
79f82953 2174def unsmuggle_url(smug_url, default=None):
83e865a3 2175 if '#__youtubedl_smuggle' not in smug_url:
79f82953 2176 return smug_url, default
28e614de 2177 url, _, sdata = smug_url.rpartition('#')
14f25df2 2178 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
2179 data = json.loads(jsond)
2180 return url, data
02dbf93f
PH
2181
2182
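An illustrative round trip through the two helpers above (example.com is a placeholder):

from yt_dlp.utils import smuggle_url, unsmuggle_url

url = smuggle_url('https://example.com/video', {'referer': 'https://example.com/'})
# -> 'https://example.com/video#__youtubedl_smuggle=...' (JSON payload in the fragment)
unsmuggle_url(url)                          # ('https://example.com/video', {'referer': 'https://example.com/'})
unsmuggle_url('https://example.com/v', {})  # ('https://example.com/v', {}) - nothing was smuggled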
e0fd9573 2183def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2184 """ Formats numbers with decimal sufixes like K, M, etc """
2185 num, factor = float_or_none(num), float(factor)
4c3f8c3f 2186 if num is None or num < 0:
e0fd9573 2187 return None
eeb2a770 2188 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2189 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2190 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
abbeeebc 2191 if factor == 1024:
2192 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 2193 converted = num / (factor ** exponent)
abbeeebc 2194 return fmt % (converted, suffix)
e0fd9573 2195
2196
02dbf93f 2197def format_bytes(bytes):
f02d24d8 2198 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
f53c966a 2199
1c088fa8 2200
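Worked examples for the two formatters above:

from yt_dlp.utils import format_bytes, format_decimal_suffix

format_bytes(1536)                        # '1.50KiB' - factor 1024 selects the binary "Ki" suffix
format_bytes(None)                        # 'N/A'
format_decimal_suffix(1200000, '%.1f%s')  # '1.2M' - default factor 1000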
fb47597b
S
2201def lookup_unit_table(unit_table, s):
2202 units_re = '|'.join(re.escape(u) for u in unit_table)
2203 m = re.match(
782b1b5b 2204 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
fb47597b
S
2205 if not m:
2206 return None
2207 num_str = m.group('num').replace(',', '.')
2208 mult = unit_table[m.group('unit')]
2209 return int(float(num_str) * mult)
2210
2211
be64b5b0
PH
2212def parse_filesize(s):
2213 if s is None:
2214 return None
2215
dfb1b146 2216 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
2217 # but we support those too
2218 _UNIT_TABLE = {
2219 'B': 1,
2220 'b': 1,
70852b47 2221 'bytes': 1,
be64b5b0
PH
2222 'KiB': 1024,
2223 'KB': 1000,
2224 'kB': 1024,
2225 'Kb': 1000,
13585d76 2226 'kb': 1000,
70852b47
YCH
2227 'kilobytes': 1000,
2228 'kibibytes': 1024,
be64b5b0
PH
2229 'MiB': 1024 ** 2,
2230 'MB': 1000 ** 2,
2231 'mB': 1024 ** 2,
2232 'Mb': 1000 ** 2,
13585d76 2233 'mb': 1000 ** 2,
70852b47
YCH
2234 'megabytes': 1000 ** 2,
2235 'mebibytes': 1024 ** 2,
be64b5b0
PH
2236 'GiB': 1024 ** 3,
2237 'GB': 1000 ** 3,
2238 'gB': 1024 ** 3,
2239 'Gb': 1000 ** 3,
13585d76 2240 'gb': 1000 ** 3,
70852b47
YCH
2241 'gigabytes': 1000 ** 3,
2242 'gibibytes': 1024 ** 3,
be64b5b0
PH
2243 'TiB': 1024 ** 4,
2244 'TB': 1000 ** 4,
2245 'tB': 1024 ** 4,
2246 'Tb': 1000 ** 4,
13585d76 2247 'tb': 1000 ** 4,
70852b47
YCH
2248 'terabytes': 1000 ** 4,
2249 'tebibytes': 1024 ** 4,
be64b5b0
PH
2250 'PiB': 1024 ** 5,
2251 'PB': 1000 ** 5,
2252 'pB': 1024 ** 5,
2253 'Pb': 1000 ** 5,
13585d76 2254 'pb': 1000 ** 5,
70852b47
YCH
2255 'petabytes': 1000 ** 5,
2256 'pebibytes': 1024 ** 5,
be64b5b0
PH
2257 'EiB': 1024 ** 6,
2258 'EB': 1000 ** 6,
2259 'eB': 1024 ** 6,
2260 'Eb': 1000 ** 6,
13585d76 2261 'eb': 1000 ** 6,
70852b47
YCH
2262 'exabytes': 1000 ** 6,
2263 'exbibytes': 1024 ** 6,
be64b5b0
PH
2264 'ZiB': 1024 ** 7,
2265 'ZB': 1000 ** 7,
2266 'zB': 1024 ** 7,
2267 'Zb': 1000 ** 7,
13585d76 2268 'zb': 1000 ** 7,
70852b47
YCH
2269 'zettabytes': 1000 ** 7,
2270 'zebibytes': 1024 ** 7,
be64b5b0
PH
2271 'YiB': 1024 ** 8,
2272 'YB': 1000 ** 8,
2273 'yB': 1024 ** 8,
2274 'Yb': 1000 ** 8,
13585d76 2275 'yb': 1000 ** 8,
70852b47
YCH
2276 'yottabytes': 1000 ** 8,
2277 'yobibytes': 1024 ** 8,
be64b5b0
PH
2278 }
2279
fb47597b
S
2280 return lookup_unit_table(_UNIT_TABLE, s)
2281
2282
2283def parse_count(s):
2284 if s is None:
be64b5b0
PH
2285 return None
2286
352d5da8 2287 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
2288
2289 if re.match(r'^[\d,.]+$', s):
2290 return str_to_int(s)
2291
2292 _UNIT_TABLE = {
2293 'k': 1000,
2294 'K': 1000,
2295 'm': 1000 ** 2,
2296 'M': 1000 ** 2,
2297 'kk': 1000 ** 2,
2298 'KK': 1000 ** 2,
352d5da8 2299 'b': 1000 ** 3,
2300 'B': 1000 ** 3,
fb47597b 2301 }
be64b5b0 2302
352d5da8 2303 ret = lookup_unit_table(_UNIT_TABLE, s)
2304 if ret is not None:
2305 return ret
2306
2307 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2308 if mobj:
2309 return str_to_int(mobj.group(1))
be64b5b0 2310
2f7ae819 2311
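Worked examples for the two parsers above:

from yt_dlp.utils import parse_count, parse_filesize

parse_filesize('1.5GiB')   # 1610612736 (1.5 * 1024 ** 3)
parse_filesize('500 MB')   # 500000000 - decimal unit
parse_count('1.2M views')  # 1200000
parse_count('8,493')       # 8493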
5d45484c 2312def parse_resolution(s, *, lenient=False):
b871d7e9
S
2313 if s is None:
2314 return {}
2315
5d45484c
LNO
2316 if lenient:
2317 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2318 else:
2319 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
2320 if mobj:
2321 return {
2322 'width': int(mobj.group('w')),
2323 'height': int(mobj.group('h')),
2324 }
2325
17ec8bcf 2326 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
2327 if mobj:
2328 return {'height': int(mobj.group(1))}
2329
2330 mobj = re.search(r'\b([48])[kK]\b', s)
2331 if mobj:
2332 return {'height': int(mobj.group(1)) * 540}
2333
2334 return {}
2335
2336
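Illustrative calls to parse_resolution(); the strings are made-up examples:

from yt_dlp.utils import parse_resolution

parse_resolution('1920x1080')                      # {'width': 1920, 'height': 1080}
parse_resolution('720p')                           # {'height': 720}
parse_resolution('4K')                             # {'height': 2160}
parse_resolution('format1920x1080', lenient=True)  # {'width': 1920, 'height': 1080} - strict mode rejects this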
0dc41787 2337def parse_bitrate(s):
14f25df2 2338 if not isinstance(s, str):
0dc41787
S
2339 return
2340 mobj = re.search(r'\b(\d+)\s*kbps', s)
2341 if mobj:
2342 return int(mobj.group(1))
2343
2344
a942d6cb 2345def month_by_name(name, lang='en'):
caefb1de
PH
2346 """ Return the number of a month by (locale-independently) English name """
2347
f6717dec 2348 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 2349
caefb1de 2350 try:
f6717dec 2351 return month_names.index(name) + 1
7105440c
YCH
2352 except ValueError:
2353 return None
2354
2355
2356def month_by_abbreviation(abbrev):
2357 """ Return the number of a month by (locale-independently) English
2358 abbreviations """
2359
2360 try:
2361 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
2362 except ValueError:
2363 return None
18258362
JMF
2364
2365
5aafe895 2366def fix_xml_ampersands(xml_str):
18258362 2367 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
2368 return re.sub(
2369 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 2370 '&amp;',
5aafe895 2371 xml_str)
e3946f98
PH
2372
2373
2374def setproctitle(title):
14f25df2 2375 assert isinstance(title, str)
c1c05c67
YCH
2376
2377 # ctypes in Jython is not complete
2378 # http://bugs.jython.org/issue2148
2379 if sys.platform.startswith('java'):
2380 return
2381
e3946f98 2382 try:
611c1dd9 2383 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
2384 except OSError:
2385 return
2f49bcd6
RC
2386 except TypeError:
2387 # LoadLibrary in Windows Python 2.7.13 only expects
2388 # a bytestring, but since unicode_literals turns
2389 # every string into a unicode string, it fails.
2390 return
0f06bcd7 2391 title_bytes = title.encode()
6eefe533
PH
2392 buf = ctypes.create_string_buffer(len(title_bytes))
2393 buf.value = title_bytes
e3946f98 2394 try:
6eefe533 2395 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
2396 except AttributeError:
2397 return # Strange libc, just skip this
d7dda168
PH
2398
2399
2400def remove_start(s, start):
46bc9b7d 2401 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
2402
2403
2b9faf55 2404def remove_end(s, end):
46bc9b7d 2405 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
2406
2407
31b2051e
S
2408def remove_quotes(s):
2409 if s is None or len(s) < 2:
2410 return s
2411 for quote in ('"', "'", ):
2412 if s[0] == quote and s[-1] == quote:
2413 return s[1:-1]
2414 return s
2415
2416
b6e0c7d2 2417def get_domain(url):
ebf99aaf 2418 """
2419 This implementation is inconsistent, but is kept for compatibility.
2420 Use this only for "webpage_url_domain"
2421 """
2422 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
b6e0c7d2
U
2423
2424
29eb5174 2425def url_basename(url):
14f25df2 2426 path = urllib.parse.urlparse(url).path
28e614de 2427 return path.strip('/').split('/')[-1]
aa94a6d3
PH
2428
2429
02dc0a36
S
2430def base_url(url):
2431 return re.match(r'https?://[^?#&]+/', url).group()
2432
2433
e34c3361 2434def urljoin(base, path):
4b5de77b 2435 if isinstance(path, bytes):
0f06bcd7 2436 path = path.decode()
14f25df2 2437 if not isinstance(path, str) or not path:
e34c3361 2438 return None
fad4ceb5 2439 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 2440 return path
4b5de77b 2441 if isinstance(base, bytes):
0f06bcd7 2442 base = base.decode()
14f25df2 2443 if not isinstance(base, str) or not re.match(
4b5de77b 2444 r'^(?:https?:)?//', base):
e34c3361 2445 return None
14f25df2 2446 return urllib.parse.urljoin(base, path)
e34c3361
S
2447
2448
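Small examples of the URL helpers above (example.com is a placeholder):

from yt_dlp.utils import base_url, url_basename, urljoin

url_basename('https://example.com/media/video.mp4?dl=1')  # 'video.mp4'
base_url('https://example.com/media/video.mp4')            # 'https://example.com/media/'
urljoin('https://example.com/media/', '/other/file.m3u8')  # 'https://example.com/other/file.m3u8'
urljoin('https://example.com/media/', None)                # None - non-string paths are rejected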
ac668111 2449class HEADRequest(urllib.request.Request):
aa94a6d3 2450 def get_method(self):
611c1dd9 2451 return 'HEAD'
7217e148
PH
2452
2453
ac668111 2454class PUTRequest(urllib.request.Request):
95cf60e8
S
2455 def get_method(self):
2456 return 'PUT'
2457
2458
9732d77e 2459def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
9e907ebd 2460 if get_attr and v is not None:
2461 v = getattr(v, get_attr, None)
1812afb7
S
2462 try:
2463 return int(v) * invscale // scale
31c49255 2464 except (ValueError, TypeError, OverflowError):
af98f8ff 2465 return default
9732d77e 2466
9572013d 2467
40a90862 2468def str_or_none(v, default=None):
14f25df2 2469 return default if v is None else str(v)
40a90862 2470
9732d77e
PH
2471
2472def str_to_int(int_str):
48d4681e 2473 """ A more relaxed version of int_or_none """
f9934b96 2474 if isinstance(int_str, int):
348c6bf1 2475 return int_str
14f25df2 2476 elif isinstance(int_str, str):
42db58ec
S
2477 int_str = re.sub(r'[,\.\+]', '', int_str)
2478 return int_or_none(int_str)
608d11f5
PH
2479
2480
9732d77e 2481def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
2482 if v is None:
2483 return default
2484 try:
2485 return float(v) * invscale / scale
5e1271c5 2486 except (ValueError, TypeError):
caf80631 2487 return default
43f775e4
PH
2488
2489
c7e327c4
S
2490def bool_or_none(v, default=None):
2491 return v if isinstance(v, bool) else default
2492
2493
53cd37ba 2494def strip_or_none(v, default=None):
14f25df2 2495 return v.strip() if isinstance(v, str) else default
b72b4431
S
2496
2497
af03000a 2498def url_or_none(url):
14f25df2 2499 if not url or not isinstance(url, str):
af03000a
S
2500 return None
2501 url = url.strip()
29f7c58a 2502 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2503
2504
3e9b66d7 2505def request_to_url(req):
ac668111 2506 if isinstance(req, urllib.request.Request):
3e9b66d7
LNO
2507 return req.get_full_url()
2508 else:
2509 return req
2510
2511
e29663c6 2512def strftime_or_none(timestamp, date_format, default=None):
2513 datetime_object = None
2514 try:
f9934b96 2515 if isinstance(timestamp, (int, float)): # unix timestamp
e29663c6 2516 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
14f25df2 2517 elif isinstance(timestamp, str): # assume YYYYMMDD
e29663c6 2518 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2519 return datetime_object.strftime(date_format)
2520 except (ValueError, TypeError, AttributeError):
2521 return default
2522
2523
608d11f5 2524def parse_duration(s):
f9934b96 2525 if not isinstance(s, str):
608d11f5 2526 return None
ca7b3246 2527 s = s.strip()
38d79fd1 2528 if not s:
2529 return None
ca7b3246 2530
acaff495 2531 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2532 m = re.match(r'''(?x)
2533 (?P<before_secs>
2534 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2535 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2536 (?P<ms>[.:][0-9]+)?Z?$
2537 ''', s)
acaff495 2538 if m:
8bd1c00b 2539 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2540 else:
2541 m = re.match(
056653bb
S
2542 r'''(?ix)(?:P?
2543 (?:
1c1b2f96 2544 [0-9]+\s*y(?:ears?)?,?\s*
056653bb
S
2545 )?
2546 (?:
1c1b2f96 2547 [0-9]+\s*m(?:onths?)?,?\s*
056653bb
S
2548 )?
2549 (?:
1c1b2f96 2550 [0-9]+\s*w(?:eeks?)?,?\s*
056653bb 2551 )?
8f4b58d7 2552 (?:
1c1b2f96 2553 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
8f4b58d7 2554 )?
056653bb 2555 T)?
acaff495 2556 (?:
1c1b2f96 2557 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
acaff495 2558 )?
2559 (?:
1c1b2f96 2560 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
acaff495 2561 )?
2562 (?:
2563 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2564 )?Z?$''', s)
acaff495 2565 if m:
2566 days, hours, mins, secs, ms = m.groups()
2567 else:
15846398 2568 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2569 if m:
2570 hours, mins = m.groups()
2571 else:
2572 return None
2573
acaff495 2574 if ms:
19a03940 2575 ms = ms.replace(':', '.')
2576 return sum(float(part or 0) * mult for part, mult in (
2577 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2578
2579
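Worked examples of parse_duration(), covering the three regex branches above:

from yt_dlp.utils import parse_duration

parse_duration('1:23:45')       # 5025.0 - H:MM:SS form
parse_duration('01:05.500')     # 65.5 - the trailing [.:]digits part is the fractional component
parse_duration('3 min 10 sec')  # 190.0 - free-form units
parse_duration('PT2H10M')       # 7800.0 - ISO 8601 style
parse_duration('2.5 hours')     # 9000.0 - "hours"/"mins" only form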
e65e4c88 2580def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2581 name, real_ext = os.path.splitext(filename)
e65e4c88 2582 return (
86e5f3ed 2583 f'{name}.{ext}{real_ext}'
e65e4c88 2584 if not expected_real_ext or real_ext[1:] == expected_real_ext
86e5f3ed 2585 else f'{filename}.{ext}')
d70ad093
PH
2586
2587
b3ed15b7
S
2588def replace_extension(filename, ext, expected_real_ext=None):
2589 name, real_ext = os.path.splitext(filename)
86e5f3ed 2590 return '{}.{}'.format(
b3ed15b7
S
2591 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2592 ext)
2593
2594
d70ad093
PH
2595def check_executable(exe, args=[]):
2596 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2597 args can be a list of arguments for a short output (like -version) """
2598 try:
f0c9fb96 2599 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
d70ad093
PH
2600 except OSError:
2601 return False
2602 return exe
b7ab0590
PH
2603
2604
8a7f68d0 2605def _get_exe_version_output(exe, args, *, to_screen=None):
2606 if to_screen:
2607 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
95807118 2608 try:
b64d04c1 2609 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2610 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2611 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
f0c9fb96 2612 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2613 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
95807118
PH
2614 except OSError:
2615 return False
f0c9fb96 2616 return stdout
cae97f65
PH
2617
2618
2619def detect_exe_version(output, version_re=None, unrecognized='present'):
14f25df2 2620 assert isinstance(output, str)
cae97f65
PH
2621 if version_re is None:
2622 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2623 m = re.search(version_re, output)
95807118
PH
2624 if m:
2625 return m.group(1)
2626 else:
2627 return unrecognized
2628
2629
9af98e17 2630def get_exe_version(exe, args=['--version'],
2631 version_re=None, unrecognized='present'):
2632 """ Returns the version of the specified executable,
2633 or False if the executable is not present """
2634 out = _get_exe_version_output(exe, args)
2635 return detect_exe_version(out, version_re, unrecognized) if out else False
2636
2637
7e88d7d7 2638def frange(start=0, stop=None, step=1):
2639 """Float range"""
2640 if stop is None:
2641 start, stop = 0, start
2642 sign = [-1, 1][step > 0] if step else 0
2643 while sign * start < sign * stop:
2644 yield start
2645 start += step
2646
2647
cb89cfc1 2648class LazyList(collections.abc.Sequence):
0f06bcd7 2649 """Lazy immutable list from an iterable
2650 Note that slices of a LazyList are lists and not LazyList"""
483336e7 2651
8e5fecc8 2652 class IndexError(IndexError):
2653 pass
2654
282f5709 2655 def __init__(self, iterable, *, reverse=False, _cache=None):
0f06bcd7 2656 self._iterable = iter(iterable)
2657 self._cache = [] if _cache is None else _cache
2658 self._reversed = reverse
483336e7 2659
2660 def __iter__(self):
0f06bcd7 2661 if self._reversed:
28419ca2 2662 # We need to consume the entire iterable to iterate in reverse
981052c9 2663 yield from self.exhaust()
28419ca2 2664 return
0f06bcd7 2665 yield from self._cache
2666 for item in self._iterable:
2667 self._cache.append(item)
483336e7 2668 yield item
2669
0f06bcd7 2670 def _exhaust(self):
2671 self._cache.extend(self._iterable)
2672 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2673 return self._cache
28419ca2 2674
981052c9 2675 def exhaust(self):
0f06bcd7 2676 """Evaluate the entire iterable"""
2677 return self._exhaust()[::-1 if self._reversed else 1]
981052c9 2678
28419ca2 2679 @staticmethod
0f06bcd7 2680 def _reverse_index(x):
f2df4071 2681 return None if x is None else ~x
483336e7 2682
2683 def __getitem__(self, idx):
2684 if isinstance(idx, slice):
0f06bcd7 2685 if self._reversed:
2686 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
e0f2b4b4 2687 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2688 elif isinstance(idx, int):
0f06bcd7 2689 if self._reversed:
2690 idx = self._reverse_index(idx)
e0f2b4b4 2691 start, stop, step = idx, idx, 0
483336e7 2692 else:
2693 raise TypeError('indices must be integers or slices')
e0f2b4b4 2694 if ((start or 0) < 0 or (stop or 0) < 0
2695 or (start is None and step < 0)
2696 or (stop is None and step > 0)):
483336e7 2697 # We need to consume the entire iterable to be able to slice from the end
2698 # Obviously, never use this with infinite iterables
0f06bcd7 2699 self._exhaust()
8e5fecc8 2700 try:
0f06bcd7 2701 return self._cache[idx]
8e5fecc8 2702 except IndexError as e:
2703 raise self.IndexError(e) from e
0f06bcd7 2704 n = max(start or 0, stop or 0) - len(self._cache) + 1
28419ca2 2705 if n > 0:
0f06bcd7 2706 self._cache.extend(itertools.islice(self._iterable, n))
8e5fecc8 2707 try:
0f06bcd7 2708 return self._cache[idx]
8e5fecc8 2709 except IndexError as e:
2710 raise self.IndexError(e) from e
483336e7 2711
2712 def __bool__(self):
2713 try:
0f06bcd7 2714 self[-1] if self._reversed else self[0]
8e5fecc8 2715 except self.IndexError:
483336e7 2716 return False
2717 return True
2718
2719 def __len__(self):
0f06bcd7 2720 self._exhaust()
2721 return len(self._cache)
483336e7 2722
282f5709 2723 def __reversed__(self):
0f06bcd7 2724 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
282f5709 2725
2726 def __copy__(self):
0f06bcd7 2727 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
282f5709 2728
28419ca2 2729 def __repr__(self):
2730 # repr and str should mimic a list. So we exhaust the iterable
2731 return repr(self.exhaust())
2732
2733 def __str__(self):
2734 return repr(self.exhaust())
2735
483336e7 2736
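A small sketch of LazyList with an infinite iterator, showing that items are materialized only on demand:

import itertools
from yt_dlp.utils import LazyList

lazy = LazyList(itertools.count())  # nothing is consumed yet
lazy[4]             # 4 - consumes only the first five items
lazy[:3]            # [0, 1, 2] - slices are plain lists, per the docstring
bool(LazyList([]))  # False - emptiness is checked without over-consuming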
7be9ccff 2737class PagedList:
c07a39ae 2738
2739 class IndexError(IndexError):
2740 pass
2741
dd26ced1
PH
2742 def __len__(self):
2743 # This is only useful for tests
2744 return len(self.getslice())
2745
7be9ccff 2746 def __init__(self, pagefunc, pagesize, use_cache=True):
2747 self._pagefunc = pagefunc
2748 self._pagesize = pagesize
f1d13090 2749 self._pagecount = float('inf')
7be9ccff 2750 self._use_cache = use_cache
2751 self._cache = {}
2752
2753 def getpage(self, pagenum):
d8cf8d97 2754 page_results = self._cache.get(pagenum)
2755 if page_results is None:
f1d13090 2756 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
7be9ccff 2757 if self._use_cache:
2758 self._cache[pagenum] = page_results
2759 return page_results
2760
2761 def getslice(self, start=0, end=None):
2762 return list(self._getslice(start, end))
2763
2764 def _getslice(self, start, end):
55575225 2765 raise NotImplementedError('This method must be implemented by subclasses')
2766
2767 def __getitem__(self, idx):
f1d13090 2768 assert self._use_cache, 'Indexing PagedList requires cache'
55575225 2769 if not isinstance(idx, int) or idx < 0:
2770 raise TypeError('indices must be non-negative integers')
2771 entries = self.getslice(idx, idx + 1)
d8cf8d97 2772 if not entries:
c07a39ae 2773 raise self.IndexError()
d8cf8d97 2774 return entries[0]
55575225 2775
9c44d242
PH
2776
2777class OnDemandPagedList(PagedList):
a44ca5a4 2778 """Download pages until a page with fewer than the maximum number of results"""
86e5f3ed 2779
7be9ccff 2780 def _getslice(self, start, end):
b7ab0590
PH
2781 for pagenum in itertools.count(start // self._pagesize):
2782 firstid = pagenum * self._pagesize
2783 nextfirstid = pagenum * self._pagesize + self._pagesize
2784 if start >= nextfirstid:
2785 continue
2786
b7ab0590
PH
2787 startv = (
2788 start % self._pagesize
2789 if firstid <= start < nextfirstid
2790 else 0)
b7ab0590
PH
2791 endv = (
2792 ((end - 1) % self._pagesize) + 1
2793 if (end is not None and firstid <= end <= nextfirstid)
2794 else None)
2795
f1d13090 2796 try:
2797 page_results = self.getpage(pagenum)
2798 except Exception:
2799 self._pagecount = pagenum - 1
2800 raise
b7ab0590
PH
2801 if startv != 0 or endv is not None:
2802 page_results = page_results[startv:endv]
7be9ccff 2803 yield from page_results
b7ab0590
PH
2804
2805 # A little optimization: if the current page is not "full", i.e. does
2806 # not contain page_size videos, then we can assume that this page
2807 # is the last one - there are no more ids on further pages -
2808 # so there is no need to query again.
2809 if len(page_results) + startv < self._pagesize:
2810 break
2811
2812 # If we got the whole page, but the next page is not interesting,
2813 # break out early as well
2814 if end == nextfirstid:
2815 break
81c2f20b
PH
2816
2817
9c44d242 2818class InAdvancePagedList(PagedList):
a44ca5a4 2819 """PagedList with total number of pages known in advance"""
86e5f3ed 2820
9c44d242 2821 def __init__(self, pagefunc, pagecount, pagesize):
7be9ccff 2822 PagedList.__init__(self, pagefunc, pagesize, True)
f1d13090 2823 self._pagecount = pagecount
9c44d242 2824
7be9ccff 2825 def _getslice(self, start, end):
9c44d242 2826 start_page = start // self._pagesize
d37707bd 2827 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
9c44d242
PH
2828 skip_elems = start - start_page * self._pagesize
2829 only_more = None if end is None else end - start
2830 for pagenum in range(start_page, end_page):
7be9ccff 2831 page_results = self.getpage(pagenum)
9c44d242 2832 if skip_elems:
7be9ccff 2833 page_results = page_results[skip_elems:]
9c44d242
PH
2834 skip_elems = None
2835 if only_more is not None:
7be9ccff 2836 if len(page_results) < only_more:
2837 only_more -= len(page_results)
9c44d242 2838 else:
7be9ccff 2839 yield from page_results[:only_more]
9c44d242 2840 break
7be9ccff 2841 yield from page_results
9c44d242
PH
2842
2843
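An illustrative page function (hypothetical, ten items served in pages of three) exercising the two classes above:

from yt_dlp.utils import InAdvancePagedList, OnDemandPagedList

def fetch_page(n):  # hypothetical pagefunc
    return iter(range(n * 3, min((n + 1) * 3, 10)))

OnDemandPagedList(fetch_page, 3).getslice()          # [0, 1, ..., 9] - stops at the first short page
InAdvancePagedList(fetch_page, 4, 3).getslice(4, 8)  # [4, 5, 6, 7] - page count known up front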
7e88d7d7 2844class PlaylistEntries:
2845 MissingEntry = object()
2846 is_exhausted = False
2847
2848 def __init__(self, ydl, info_dict):
7e9a6125 2849 self.ydl = ydl
2850
2851 # _entries must be assigned now since infodict can change during iteration
2852 entries = info_dict.get('entries')
2853 if entries is None:
2854 raise EntryNotInPlaylist('There are no entries')
2855 elif isinstance(entries, list):
2856 self.is_exhausted = True
2857
2858 requested_entries = info_dict.get('requested_entries')
2859 self.is_incomplete = bool(requested_entries)
2860 if self.is_incomplete:
2861 assert self.is_exhausted
2862 self._entries = [self.MissingEntry] * max(requested_entries)
2863 for i, entry in zip(requested_entries, entries):
2864 self._entries[i - 1] = entry
2865 elif isinstance(entries, (list, PagedList, LazyList)):
2866 self._entries = entries
2867 else:
2868 self._entries = LazyList(entries)
7e88d7d7 2869
2870 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2871 (?P<start>[+-]?\d+)?
2872 (?P<range>[:-]
2873 (?P<end>[+-]?\d+|inf(?:inite)?)?
2874 (?::(?P<step>[+-]?\d+))?
2875 )?''')
2876
2877 @classmethod
2878 def parse_playlist_items(cls, string):
2879 for segment in string.split(','):
2880 if not segment:
2881 raise ValueError('There are two or more consecutive commas')
2882 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2883 if not mobj:
2884 raise ValueError(f'{segment!r} is not a valid specification')
2885 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2886 if int_or_none(step) == 0:
2887 raise ValueError(f'Step in {segment!r} cannot be zero')
2888 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2889
2890 def get_requested_items(self):
2891 playlist_items = self.ydl.params.get('playlist_items')
2892 playlist_start = self.ydl.params.get('playliststart', 1)
2893 playlist_end = self.ydl.params.get('playlistend')
2894 # For backwards compatibility, interpret -1 as whole list
2895 if playlist_end in (-1, None):
2896 playlist_end = ''
2897 if not playlist_items:
2898 playlist_items = f'{playlist_start}:{playlist_end}'
2899 elif playlist_start != 1 or playlist_end:
2900 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2901
2902 for index in self.parse_playlist_items(playlist_items):
2903 for i, entry in self[index]:
2904 yield i, entry
1ac4fd80 2905 if not entry:
2906 continue
7e88d7d7 2907 try:
2908 # TODO: Add auto-generated fields
2909 self.ydl._match_entry(entry, incomplete=True, silent=True)
2910 except (ExistingVideoReached, RejectedVideoReached):
2911 return
2912
7e9a6125 2913 def get_full_count(self):
2914 if self.is_exhausted and not self.is_incomplete:
7e88d7d7 2915 return len(self)
2916 elif isinstance(self._entries, InAdvancePagedList):
2917 if self._entries._pagesize == 1:
2918 return self._entries._pagecount
2919
7e88d7d7 2920 @functools.cached_property
2921 def _getter(self):
2922 if isinstance(self._entries, list):
2923 def get_entry(i):
2924 try:
2925 entry = self._entries[i]
2926 except IndexError:
2927 entry = self.MissingEntry
2928 if not self.is_incomplete:
2929 raise self.IndexError()
2930 if entry is self.MissingEntry:
2931 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2932 return entry
2933 else:
2934 def get_entry(i):
2935 try:
2936 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2937 except (LazyList.IndexError, PagedList.IndexError):
2938 raise self.IndexError()
2939 return get_entry
2940
2941 def __getitem__(self, idx):
2942 if isinstance(idx, int):
2943 idx = slice(idx, idx)
2944
2945 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2946 step = 1 if idx.step is None else idx.step
2947 if idx.start is None:
2948 start = 0 if step > 0 else len(self) - 1
2949 else:
2950 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2951
2952 # NB: Do not call len(self) when idx == [:]
2953 if idx.stop is None:
2954 stop = 0 if step < 0 else float('inf')
2955 else:
2956 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2957 stop += [-1, 1][step > 0]
2958
2959 for i in frange(start, stop, step):
2960 if i < 0:
2961 continue
2962 try:
7e9a6125 2963 entry = self._getter(i)
2964 except self.IndexError:
2965 self.is_exhausted = True
2966 if step > 0:
7e88d7d7 2967 break
7e9a6125 2968 continue
7e88d7d7 2969 yield i + 1, entry
2970
2971 def __len__(self):
2972 return len(tuple(self[:]))
2973
2974 class IndexError(IndexError):
2975 pass
2976
2977
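The playlist_items selection grammar handled by PLAYLIST_ITEMS_RE above can be illustrated directly through the classmethod:

from yt_dlp.utils import PlaylistEntries

list(PlaylistEntries.parse_playlist_items('1,3:5'))  # [1, slice(3, 5.0, None)]
list(PlaylistEntries.parse_playlist_items('-5:'))    # [slice(-5, None, None)] - the last five entries
list(PlaylistEntries.parse_playlist_items('::2'))    # [slice(None, None, 2)] - every other entry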
81c2f20b 2978def uppercase_escape(s):
676eb3f2 2979 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2980 return re.sub(
a612753d 2981 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2982 lambda m: unicode_escape(m.group(0))[0],
2983 s)
0fe2ff78
YCH
2984
2985
2986def lowercase_escape(s):
2987 unicode_escape = codecs.getdecoder('unicode_escape')
2988 return re.sub(
2989 r'\\u[0-9a-fA-F]{4}',
2990 lambda m: unicode_escape(m.group(0))[0],
2991 s)
b53466e1 2992
d05cfe06
S
2993
2994def escape_rfc3986(s):
2995 """Escape non-ASCII characters as suggested by RFC 3986"""
f9934b96 2996 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2997
2998
2999def escape_url(url):
3000 """Escape URL as suggested by RFC 3986"""
14f25df2 3001 url_parsed = urllib.parse.urlparse(url)
d05cfe06 3002 return url_parsed._replace(
efbed08d 3003 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
3004 path=escape_rfc3986(url_parsed.path),
3005 params=escape_rfc3986(url_parsed.params),
3006 query=escape_rfc3986(url_parsed.query),
3007 fragment=escape_rfc3986(url_parsed.fragment)
3008 ).geturl()
3009
62e609ab 3010
4dfbf869 3011def parse_qs(url):
14f25df2 3012 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
4dfbf869 3013
3014
62e609ab
PH
3015def read_batch_urls(batch_fd):
3016 def fixup(url):
14f25df2 3017 if not isinstance(url, str):
62e609ab 3018 url = url.decode('utf-8', 'replace')
8c04f0be 3019 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3020 for bom in BOM_UTF8:
3021 if url.startswith(bom):
3022 url = url[len(bom):]
3023 url = url.lstrip()
3024 if not url or url.startswith(('#', ';', ']')):
62e609ab 3025 return False
8c04f0be 3026 # "#" cannot be stripped out since it is part of the URI
962ffcf8 3027 # However, it can be safely stripped out when preceded by whitespace
8c04f0be 3028 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
3029
3030 with contextlib.closing(batch_fd) as fd:
3031 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
3032
3033
3034def urlencode_postdata(*args, **kargs):
14f25df2 3035 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
3036
3037
38f9ef31 3038def update_url_query(url, query):
cacd9966
YCH
3039 if not query:
3040 return url
14f25df2 3041 parsed_url = urllib.parse.urlparse(url)
3042 qs = urllib.parse.parse_qs(parsed_url.query)
38f9ef31 3043 qs.update(query)
14f25df2 3044 return urllib.parse.urlunparse(parsed_url._replace(
3045 query=urllib.parse.urlencode(qs, True)))
16392824 3046
8e60dc75 3047
c043c246 3048def update_Request(req, url=None, data=None, headers=None, query=None):
ed0291d1 3049 req_headers = req.headers.copy()
c043c246 3050 req_headers.update(headers or {})
ed0291d1
S
3051 req_data = data or req.data
3052 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
3053 req_get_method = req.get_method()
3054 if req_get_method == 'HEAD':
3055 req_type = HEADRequest
3056 elif req_get_method == 'PUT':
3057 req_type = PUTRequest
3058 else:
ac668111 3059 req_type = urllib.request.Request
ed0291d1
S
3060 new_req = req_type(
3061 req_url, data=req_data, headers=req_headers,
3062 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3063 if hasattr(req, 'timeout'):
3064 new_req.timeout = req.timeout
3065 return new_req
3066
3067
10c87c15 3068def _multipart_encode_impl(data, boundary):
0c265486
YCH
3069 content_type = 'multipart/form-data; boundary=%s' % boundary
3070
3071 out = b''
3072 for k, v in data.items():
3073 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 3074 if isinstance(k, str):
0f06bcd7 3075 k = k.encode()
14f25df2 3076 if isinstance(v, str):
0f06bcd7 3077 v = v.encode()
0c265486
YCH
3078 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3079 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 3080 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
3081 if boundary.encode('ascii') in content:
3082 raise ValueError('Boundary overlaps with data')
3083 out += content
3084
3085 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3086
3087 return out, content_type
3088
3089
3090def multipart_encode(data, boundary=None):
3091 '''
3092 Encode a dict to RFC 7578-compliant form-data
3093
3094 data:
3095 A dict where keys and values can be either Unicode or bytes-like
3096 objects.
3097 boundary:
3098 If specified a Unicode object, it's used as the boundary. Otherwise
3099 a random boundary is generated.
3100
3101 Reference: https://tools.ietf.org/html/rfc7578
3102 '''
3103 has_specified_boundary = boundary is not None
3104
3105 while True:
3106 if boundary is None:
3107 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3108
3109 try:
10c87c15 3110 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
3111 break
3112 except ValueError:
3113 if has_specified_boundary:
3114 raise
3115 boundary = None
3116
3117 return out, content_type
3118
3119
86296ad2 3120def dict_get(d, key_or_keys, default=None, skip_false_values=True):
a44ca5a4 3121 for val in map(d.get, variadic(key_or_keys)):
3122 if val is not None and (val or not skip_false_values):
3123 return val
3124 return default
cbecc9b9
S
3125
3126
c4f60dd7 3127def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3128 for f in funcs:
a32a9a7e 3129 try:
c4f60dd7 3130 val = f(*args, **kwargs)
3131 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
a32a9a7e
S
3132 pass
3133 else:
c4f60dd7 3134 if expected_type is None or isinstance(val, expected_type):
3135 return val
3136
3137
3138def try_get(src, getter, expected_type=None):
3139 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
329ca3be
S
3140
3141
90137ca4 3142def filter_dict(dct, cndn=lambda _, v: v is not None):
3143 return {k: v for k, v in dct.items() if cndn(k, v)}
3144
3145
6cc62232
S
3146def merge_dicts(*dicts):
3147 merged = {}
3148 for a_dict in dicts:
3149 for k, v in a_dict.items():
90137ca4 3150 if (v is not None and k not in merged
3151 or isinstance(v, str) and merged[k] == ''):
6cc62232
S
3152 merged[k] = v
3153 return merged
3154
3155
8e60dc75 3156def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
14f25df2 3157 return string if isinstance(string, str) else str(string, encoding, errors)
8e60dc75 3158
16392824 3159
a1a530b0
PH
3160US_RATINGS = {
3161 'G': 0,
3162 'PG': 10,
3163 'PG-13': 13,
3164 'R': 16,
3165 'NC': 18,
3166}
fac55558
PH
3167
3168
a8795327 3169TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
3170 'TV-Y': 0,
3171 'TV-Y7': 7,
3172 'TV-G': 0,
3173 'TV-PG': 0,
3174 'TV-14': 14,
3175 'TV-MA': 17,
a8795327
S
3176}
3177
3178
146c80e2 3179def parse_age_limit(s):
19a03940 3180 # isinstance(False, int) is True. So type() must be used instead
c487cf00 3181 if type(s) is int: # noqa: E721
a8795327 3182 return s if 0 <= s <= 21 else None
19a03940 3183 elif not isinstance(s, str):
d838b1bd 3184 return None
146c80e2 3185 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
3186 if m:
3187 return int(m.group('age'))
5c5fae6d 3188 s = s.upper()
a8795327
S
3189 if s in US_RATINGS:
3190 return US_RATINGS[s]
5a16c9d9 3191 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 3192 if m:
5a16c9d9 3193 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 3194 return None
146c80e2
S
3195
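# Illustrative usage of parse_age_limit (a sketch, not from the source): plain
# integers, 'NN+' strings, US MPAA ratings and TV parental guidelines are accepted.
# >>> parse_age_limit('PG-13'), parse_age_limit('TV-MA'), parse_age_limit('18+')
# (13, 17, 18)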
3196
fac55558 3197def strip_jsonp(code):
609a61e3 3198 return re.sub(
5552c9eb 3199 r'''(?sx)^
e9c671d5 3200 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
3201 (?:\s*&&\s*(?P=func_name))?
3202 \s*\(\s*(?P<callback_data>.*)\);?
3203 \s*?(?://[^\n]*)*$''',
3204 r'\g<callback_data>', code)
478c2c61
PH
3205
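# Illustrative usage of strip_jsonp (a sketch, not from the source):
# >>> strip_jsonp('callback({"id": "x"});')
# '{"id": "x"}'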
3206
5c610515 3207def js_to_json(code, vars={}):
3208 # vars is a dict of var, val pairs to substitute
c843e685 3209 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
86e5f3ed 3210 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
4195096e 3211 INTEGER_TABLE = (
86e5f3ed 3212 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3213 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
4195096e
S
3214 )
3215
e05f6939 3216 def fix_kv(m):
e7b6d122
PH
3217 v = m.group(0)
3218 if v in ('true', 'false', 'null'):
3219 return v
421ddcb8
C
3220 elif v in ('undefined', 'void 0'):
3221 return 'null'
8bdd16b4 3222 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
bd1e4844 3223 return ""
3224
3225 if v[0] in ("'", '"'):
3226 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 3227 '"': '\\"',
bd1e4844 3228 "\\'": "'",
3229 '\\\n': '',
3230 '\\x': '\\u00',
3231 }.get(m.group(0), m.group(0)), v[1:-1])
8bdd16b4 3232 else:
3233 for regex, base in INTEGER_TABLE:
3234 im = re.match(regex, v)
3235 if im:
3236 i = int(im.group(1), base)
3237 return '"%d":' % i if v.endswith(':') else '%d' % i
89ac4a19 3238
5c610515 3239 if v in vars:
3240 return vars[v]
3241
e7b6d122 3242 return '"%s"' % v
e05f6939 3243
8072ef2b 3244 def create_map(mobj):
3245 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3246
febff4c1 3247 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
8072ef2b 3248 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
febff4c1 3249
bd1e4844 3250 return re.sub(r'''(?sx)
3251 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3252 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4195096e 3253 {comment}|,(?={skip}[\]}}])|
421ddcb8 3254 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
4195096e 3255 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
8bdd16b4 3256 [0-9]+(?={skip}:)|
3257 !+
4195096e 3258 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
e05f6939
PH
3259
3260
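# Illustrative usage of js_to_json (a sketch, not from the source): unquoted
# keys, single quotes, hex literals and trailing commas are normalised.
# >>> js_to_json("{abc_def: 'Hello World!', x: 0x10,}")
# '{"abc_def": "Hello World!", "x": 16}'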
478c2c61
PH
3261def qualities(quality_ids):
3262 """ Get a numeric quality value out of a list of possible values """
3263 def q(qid):
3264 try:
3265 return quality_ids.index(qid)
3266 except ValueError:
3267 return -1
3268 return q
3269
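# Illustrative usage of qualities (a sketch, not from the source): a higher
# index in the preference list means better quality; unknown ids sort last.
# >>> q = qualities(['240p', '480p', '720p'])
# >>> q('720p'), q('480p'), q('4k')
# (2, 1, -1)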
acd69589 3270
8aa0e7cd 3271POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
1e43a6f7 3272
3273
de6000d9 3274DEFAULT_OUTTMPL = {
3275 'default': '%(title)s [%(id)s].%(ext)s',
72755351 3276 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 3277}
3278OUTTMPL_TYPES = {
72755351 3279 'chapter': None,
de6000d9 3280 'subtitle': None,
3281 'thumbnail': None,
3282 'description': 'description',
3283 'annotation': 'annotations.xml',
3284 'infojson': 'info.json',
08438d2c 3285 'link': None,
3b603dbd 3286 'pl_video': None,
5112f26a 3287 'pl_thumbnail': None,
de6000d9 3288 'pl_description': 'description',
3289 'pl_infojson': 'info.json',
3290}
0a871f68 3291
143db31d 3292# As of [1], the format syntax is:
3293# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3294# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 3295STR_FORMAT_RE_TMPL = r'''(?x)
3296 (?<!%)(?P<prefix>(?:%%)*)
143db31d 3297 %
524e2e4f 3298 (?P<has_key>\((?P<key>{0})\))?
752cda38 3299 (?P<format>
524e2e4f 3300 (?P<conversion>[#0\-+ ]+)?
3301 (?P<min_width>\d+)?
3302 (?P<precision>\.\d+)?
3303 (?P<len_mod>[hlL])? # unused in python
901130bb 3304 {1} # conversion type
752cda38 3305 )
143db31d 3306'''
3307
7d1eb38a 3308
901130bb 3309STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3310
7d1eb38a 3311
a020a0dc
PH
3312def limit_length(s, length):
3313 """ Add ellipses to overly long strings """
3314 if s is None:
3315 return None
3316 ELLIPSES = '...'
3317 if len(s) > length:
3318 return s[:length - len(ELLIPSES)] + ELLIPSES
3319 return s
48844745
PH
3320
3321
3322def version_tuple(v):
5f9b8394 3323 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
3324
3325
3326def is_outdated_version(version, limit, assume_new=True):
3327 if not version:
3328 return not assume_new
3329 try:
3330 return version_tuple(version) < version_tuple(limit)
3331 except ValueError:
3332 return not assume_new
732ea2f0
PH
3333
3334
3335def ytdl_is_updateable():
7a5c1cfe 3336 """ Returns whether yt-dlp can be updated with -U """
735d865e 3337
5d535b4a 3338 from .update import is_non_updateable
732ea2f0 3339
5d535b4a 3340 return not is_non_updateable()
7d4111ed
PH
3341
3342
3343def args_to_str(args):
3344 # Get a short string representation for a subprocess command
702ccf2d 3345 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
3346
3347
9b9c5355 3348def error_to_compat_str(err):
cfb0511d 3349 return str(err)
fdae2358
S
3350
3351
a44ca5a4 3352def error_to_str(err):
3353 return f'{type(err).__name__}: {err}'
3354
3355
c460bdd5 3356def mimetype2ext(mt):
eb9ee194
S
3357 if mt is None:
3358 return None
3359
9359f3d4
F
3360 mt, _, params = mt.partition(';')
3361 mt = mt.strip()
3362
3363 FULL_MAP = {
765ac263 3364 'audio/mp4': 'm4a',
6c33d24b
YCH
3365 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 here
3366 # since it is the most common of the three
3367 'audio/mpeg': 'mp3',
ba39289d 3368 'audio/x-wav': 'wav',
9359f3d4
F
3369 'audio/wav': 'wav',
3370 'audio/wave': 'wav',
3371 }
3372
3373 ext = FULL_MAP.get(mt)
765ac263
JMF
3374 if ext is not None:
3375 return ext
3376
9359f3d4 3377 SUBTYPE_MAP = {
f6861ec9 3378 '3gpp': '3gp',
cafcf657 3379 'smptett+xml': 'tt',
cafcf657 3380 'ttaf+xml': 'dfxp',
a0d8d704 3381 'ttml+xml': 'ttml',
f6861ec9 3382 'x-flv': 'flv',
a0d8d704 3383 'x-mp4-fragmented': 'mp4',
d4f05d47 3384 'x-ms-sami': 'sami',
a0d8d704 3385 'x-ms-wmv': 'wmv',
b4173f15
RA
3386 'mpegurl': 'm3u8',
3387 'x-mpegurl': 'm3u8',
3388 'vnd.apple.mpegurl': 'm3u8',
3389 'dash+xml': 'mpd',
b4173f15 3390 'f4m+xml': 'f4m',
f164b971 3391 'hds+xml': 'f4m',
e910fe2f 3392 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 3393 'quicktime': 'mov',
98ce1a3f 3394 'mp2t': 'ts',
39e7107d 3395 'x-wav': 'wav',
9359f3d4
F
3396 'filmstrip+json': 'fs',
3397 'svg+xml': 'svg',
3398 }
3399
3400 _, _, subtype = mt.rpartition('/')
3401 ext = SUBTYPE_MAP.get(subtype.lower())
3402 if ext is not None:
3403 return ext
3404
3405 SUFFIX_MAP = {
3406 'json': 'json',
3407 'xml': 'xml',
3408 'zip': 'zip',
3409 'gzip': 'gz',
3410 }
3411
3412 _, _, suffix = subtype.partition('+')
3413 ext = SUFFIX_MAP.get(suffix)
3414 if ext is not None:
3415 return ext
3416
3417 return subtype.replace('+', '.')
c460bdd5
PH
3418
3419
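# Illustrative usage of mimetype2ext (a sketch, not from the source): MIME
# parameters after ';' are ignored and unmapped subtypes fall through unchanged.
# >>> mimetype2ext('audio/mp4; codecs="mp4a.40.2"'), mimetype2ext('video/mp4')
# ('m4a', 'mp4')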
2814f12b
THD
3420def ext2mimetype(ext_or_url):
3421 if not ext_or_url:
3422 return None
3423 if '.' not in ext_or_url:
3424 ext_or_url = f'file.{ext_or_url}'
3425 return mimetypes.guess_type(ext_or_url)[0]
3426
3427
4f3c5e06 3428def parse_codecs(codecs_str):
3429 # http://tools.ietf.org/html/rfc6381
3430 if not codecs_str:
3431 return {}
a0566bbf 3432 split_codecs = list(filter(None, map(
dbf5416a 3433 str.strip, codecs_str.strip().strip(',').split(','))))
3fe75fdc 3434 vcodec, acodec, scodec, hdr = None, None, None, None
a0566bbf 3435 for full_codec in split_codecs:
d816f61f 3436 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3437 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3438 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3439 if vcodec:
3440 continue
3441 vcodec = full_codec
3442 if parts[0] in ('dvh1', 'dvhe'):
3443 hdr = 'DV'
3444 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3445 hdr = 'HDR10'
3446 elif parts[:2] == ['vp9', '2']:
3447 hdr = 'HDR10'
3448 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
3449 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3450 acodec = acodec or full_codec
3451 elif parts[0] in ('stpp', 'wvtt'):
3452 scodec = scodec or full_codec
4f3c5e06 3453 else:
19a03940 3454 write_string(f'WARNING: Unknown codec {full_codec}\n')
3fe75fdc 3455 if vcodec or acodec or scodec:
4f3c5e06 3456 return {
3457 'vcodec': vcodec or 'none',
3458 'acodec': acodec or 'none',
176f1866 3459 'dynamic_range': hdr,
3fe75fdc 3460 **({'scodec': scodec} if scodec is not None else {}),
4f3c5e06 3461 }
b69fd25c 3462 elif len(split_codecs) == 2:
3463 return {
3464 'vcodec': split_codecs[0],
3465 'acodec': split_codecs[1],
3466 }
4f3c5e06 3467 return {}
3468
3469
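# Illustrative usage of parse_codecs (a sketch, not from the source):
# >>> parse_codecs('avc1.42001e, mp4a.40.2')
# {'vcodec': 'avc1.42001e', 'acodec': 'mp4a.40.2', 'dynamic_range': None}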
2ccd1b10 3470def urlhandle_detect_ext(url_handle):
79298173 3471 getheader = url_handle.headers.get
2ccd1b10 3472
b55ee18f
PH
3473 cd = getheader('Content-Disposition')
3474 if cd:
3475 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3476 if m:
3477 e = determine_ext(m.group('filename'), default_ext=None)
3478 if e:
3479 return e
3480
c460bdd5 3481 return mimetype2ext(getheader('Content-Type'))
05900629
PH
3482
3483
1e399778
YCH
3484def encode_data_uri(data, mime_type):
3485 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3486
3487
05900629 3488def age_restricted(content_limit, age_limit):
6ec6cb4e 3489 """ Returns True iff the content should be blocked """
05900629
PH
3490
3491 if age_limit is None: # No limit set
3492 return False
3493 if content_limit is None:
3494 return False # Content available for everyone
3495 return age_limit < content_limit
61ca9a80
PH
3496
3497
88f60feb 3498# List of known byte-order-marks (BOM)
a904a7f8
L
3499BOMS = [
3500 (b'\xef\xbb\xbf', 'utf-8'),
3501 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3502 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3503 (b'\xff\xfe', 'utf-16-le'),
3504 (b'\xfe\xff', 'utf-16-be'),
3505]
a904a7f8
L
3506
3507
61ca9a80
PH
3508def is_html(first_bytes):
3509 """ Detect whether a file contains HTML by examining its first bytes. """
3510
80e8493e 3511 encoding = 'utf-8'
61ca9a80 3512 for bom, enc in BOMS:
80e8493e 3513 while first_bytes.startswith(bom):
3514 encoding, first_bytes = enc, first_bytes[len(bom):]
61ca9a80 3515
80e8493e 3516 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
a055469f
PH
3517
3518
3519def determine_protocol(info_dict):
3520 protocol = info_dict.get('protocol')
3521 if protocol is not None:
3522 return protocol
3523
7de837a5 3524 url = sanitize_url(info_dict['url'])
a055469f
PH
3525 if url.startswith('rtmp'):
3526 return 'rtmp'
3527 elif url.startswith('mms'):
3528 return 'mms'
3529 elif url.startswith('rtsp'):
3530 return 'rtsp'
3531
3532 ext = determine_ext(url)
3533 if ext == 'm3u8':
3534 return 'm3u8'
3535 elif ext == 'f4m':
3536 return 'f4m'
3537
14f25df2 3538 return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3539
3540
c5e3f849 3541def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3542 """ Render a list of rows, each as a list of values.
3543 Text after a \t will be right aligned """
ec11a9f4 3544 def width(string):
c5e3f849 3545 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3546
3547 def get_max_lens(table):
ec11a9f4 3548 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3549
3550 def filter_using_list(row, filterArray):
d16df59d 3551 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
76d321f6 3552
d16df59d 3553 max_lens = get_max_lens(data) if hide_empty else []
3554 header_row = filter_using_list(header_row, max_lens)
3555 data = [filter_using_list(row, max_lens) for row in data]
76d321f6 3556
cfb56d1a 3557 table = [header_row] + data
76d321f6 3558 max_lens = get_max_lens(table)
c5e3f849 3559 extra_gap += 1
76d321f6 3560 if delim:
c5e3f849 3561 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
1ed7953a 3562 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
ec11a9f4 3563 for row in table:
3564 for pos, text in enumerate(map(str, row)):
c5e3f849 3565 if '\t' in text:
3566 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3567 else:
3568 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3569 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3570 return ret
347de493
PH
3571
3572
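# Illustrative usage of render_table (a sketch, not from the source): columns
# are padded to the widest cell; '\t' right-aligns the rest of a cell.
# >>> print(render_table(['ID', 'NAME'], [['1', 'foo'], ['22', 'bar']]))
# ID NAME
# 1  foo
# 22 bar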
8f18aca8 3573def _match_one(filter_part, dct, incomplete):
77b87f05 3574 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3575 STRING_OPERATORS = {
3576 '*=': operator.contains,
3577 '^=': lambda attr, value: attr.startswith(value),
3578 '$=': lambda attr, value: attr.endswith(value),
3579 '~=': lambda attr, value: re.search(value, attr),
3580 }
347de493 3581 COMPARISON_OPERATORS = {
a047eeb6 3582 **STRING_OPERATORS,
3583 '<=': operator.le, # "<=" must be defined above "<"
347de493 3584 '<': operator.lt,
347de493 3585 '>=': operator.ge,
a047eeb6 3586 '>': operator.gt,
347de493 3587 '=': operator.eq,
347de493 3588 }
a047eeb6 3589
6db9c4d5 3590 if isinstance(incomplete, bool):
3591 is_incomplete = lambda _: incomplete
3592 else:
3593 is_incomplete = lambda k: k in incomplete
3594
64fa820c 3595 operator_rex = re.compile(r'''(?x)
347de493 3596 (?P<key>[a-z_]+)
77b87f05 3597 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3598 (?:
a047eeb6 3599 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3600 (?P<strval>.+?)
347de493 3601 )
347de493 3602 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3603 m = operator_rex.fullmatch(filter_part.strip())
347de493 3604 if m:
18f96d12 3605 m = m.groupdict()
3606 unnegated_op = COMPARISON_OPERATORS[m['op']]
3607 if m['negation']:
77b87f05
MT
3608 op = lambda attr, value: not unnegated_op(attr, value)
3609 else:
3610 op = unnegated_op
18f96d12 3611 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3612 if m['quote']:
3613 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3614 actual_value = dct.get(m['key'])
3615 numeric_comparison = None
f9934b96 3616 if isinstance(actual_value, (int, float)):
e5a088dc
S
3617 # If the original field is a string and the matching comparison value is
3618 # a number, we should respect the origin of the original field
3619 # and process the comparison value as a string (see
18f96d12 3620 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3621 try:
18f96d12 3622 numeric_comparison = int(comparison_value)
347de493 3623 except ValueError:
18f96d12 3624 numeric_comparison = parse_filesize(comparison_value)
3625 if numeric_comparison is None:
3626 numeric_comparison = parse_filesize(f'{comparison_value}B')
3627 if numeric_comparison is None:
3628 numeric_comparison = parse_duration(comparison_value)
3629 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3630 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3631 if actual_value is None:
6db9c4d5 3632 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3633 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3634
3635 UNARY_OPERATORS = {
1cc47c66
S
3636 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3637 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3638 }
64fa820c 3639 operator_rex = re.compile(r'''(?x)
347de493 3640 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3641 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3642 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3643 if m:
3644 op = UNARY_OPERATORS[m.group('op')]
3645 actual_value = dct.get(m.group('key'))
6db9c4d5 3646 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3647 return True
347de493
PH
3648 return op(actual_value)
3649
3650 raise ValueError('Invalid filter part %r' % filter_part)
3651
3652
8f18aca8 3653def match_str(filter_str, dct, incomplete=False):
6db9c4d5 3654 """ Filter a dictionary with a simple string syntax.
3655 @returns Whether the filter passes
3656 @param incomplete Set of keys that are expected to be missing from dct.
3657 Can be True/False to indicate all/none of the keys may be missing.
3658 All conditions on incomplete keys pass if the key is missing
8f18aca8 3659 """
347de493 3660 return all(
8f18aca8 3661 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3662 for filter_part in re.split(r'(?<!\\)&', filter_str))
347de493
PH
3663
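# Illustrative usage of match_str (a sketch, not from the source): comparisons,
# none-inclusive '?' operators and unary key checks can be joined with '&'.
# >>> match_str('like_count > 100 & dislike_count <? 50 & description',
# ...           {'like_count': 190, 'dislike_count': 10, 'description': 'foo'})
# True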
3664
b1a7cd05 3665def match_filter_func(filters):
3666 if not filters:
d1b5f70b 3667 return None
492272fe 3668 filters = set(variadic(filters))
d1b5f70b 3669
492272fe 3670 interactive = '-' in filters
3671 if interactive:
3672 filters.remove('-')
3673
3674 def _match_func(info_dict, incomplete=False):
3675 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3676 return NO_DEFAULT if interactive and not incomplete else None
347de493 3677 else:
3bec830a 3678 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
b1a7cd05 3679 filter_str = ') | ('.join(map(str.strip, filters))
3680 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
347de493 3681 return _match_func
91410c9b
PH
3682
3683
f2df4071 3684class download_range_func:
3685 def __init__(self, chapters, ranges):
3686 self.chapters, self.ranges = chapters, ranges
3687
3688 def __call__(self, info_dict, ydl):
5ec1b6b7 3689 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
56ba69e4 3690 else 'Cannot match chapters since chapter information is unavailable')
f2df4071 3691 for regex in self.chapters or []:
5ec1b6b7 3692 for i, chapter in enumerate(info_dict.get('chapters') or []):
3693 if re.search(regex, chapter['title']):
3694 warning = None
3695 yield {**chapter, 'index': i}
f2df4071 3696 if self.chapters and warning:
5ec1b6b7 3697 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3698
f2df4071 3699 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
5ec1b6b7 3700
f2df4071 3701 def __eq__(self, other):
3702 return (isinstance(other, download_range_func)
3703 and self.chapters == other.chapters and self.ranges == other.ranges)
5ec1b6b7 3704
3705
bf6427d2
YCH
3706def parse_dfxp_time_expr(time_expr):
3707 if not time_expr:
d631d5f9 3708 return
bf6427d2 3709
1d485a1a 3710 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
bf6427d2
YCH
3711 if mobj:
3712 return float(mobj.group('time_offset'))
3713
db2fe38b 3714 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3715 if mobj:
db2fe38b 3716 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
3717
3718
c1c924ab 3719def srt_subtitles_timecode(seconds):
aa7785f8 3720 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3721
3722
3723def ass_subtitles_timecode(seconds):
3724 time = timetuple_from_msec(seconds * 1000)
3725 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
bf6427d2
YCH
3726
3727
3728def dfxp2srt(dfxp_data):
3869028f
YCH
3729 '''
3730 @param dfxp_data A bytes-like object containing DFXP data
3731 @returns A unicode object containing converted SRT data
3732 '''
5b995f71 3733 LEGACY_NAMESPACES = (
3869028f
YCH
3734 (b'http://www.w3.org/ns/ttml', [
3735 b'http://www.w3.org/2004/11/ttaf1',
3736 b'http://www.w3.org/2006/04/ttaf1',
3737 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3738 ]),
3869028f
YCH
3739 (b'http://www.w3.org/ns/ttml#styling', [
3740 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
3741 ]),
3742 )
3743
3744 SUPPORTED_STYLING = [
3745 'color',
3746 'fontFamily',
3747 'fontSize',
3748 'fontStyle',
3749 'fontWeight',
3750 'textDecoration'
3751 ]
3752
4e335771 3753 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 3754 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 3755 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 3756 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 3757 })
bf6427d2 3758
5b995f71
RA
3759 styles = {}
3760 default_style = {}
3761
86e5f3ed 3762 class TTMLPElementParser:
5b995f71
RA
3763 _out = ''
3764 _unclosed_elements = []
3765 _applied_styles = []
bf6427d2 3766
2b14cb56 3767 def start(self, tag, attrib):
5b995f71
RA
3768 if tag in (_x('ttml:br'), 'br'):
3769 self._out += '\n'
3770 else:
3771 unclosed_elements = []
3772 style = {}
3773 element_style_id = attrib.get('style')
3774 if default_style:
3775 style.update(default_style)
3776 if element_style_id:
3777 style.update(styles.get(element_style_id, {}))
3778 for prop in SUPPORTED_STYLING:
3779 prop_val = attrib.get(_x('tts:' + prop))
3780 if prop_val:
3781 style[prop] = prop_val
3782 if style:
3783 font = ''
3784 for k, v in sorted(style.items()):
3785 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3786 continue
3787 if k == 'color':
3788 font += ' color="%s"' % v
3789 elif k == 'fontSize':
3790 font += ' size="%s"' % v
3791 elif k == 'fontFamily':
3792 font += ' face="%s"' % v
3793 elif k == 'fontWeight' and v == 'bold':
3794 self._out += '<b>'
3795 unclosed_elements.append('b')
3796 elif k == 'fontStyle' and v == 'italic':
3797 self._out += '<i>'
3798 unclosed_elements.append('i')
3799 elif k == 'textDecoration' and v == 'underline':
3800 self._out += '<u>'
3801 unclosed_elements.append('u')
3802 if font:
3803 self._out += '<font' + font + '>'
3804 unclosed_elements.append('font')
3805 applied_style = {}
3806 if self._applied_styles:
3807 applied_style.update(self._applied_styles[-1])
3808 applied_style.update(style)
3809 self._applied_styles.append(applied_style)
3810 self._unclosed_elements.append(unclosed_elements)
bf6427d2 3811
2b14cb56 3812 def end(self, tag):
5b995f71
RA
3813 if tag not in (_x('ttml:br'), 'br'):
3814 unclosed_elements = self._unclosed_elements.pop()
3815 for element in reversed(unclosed_elements):
3816 self._out += '</%s>' % element
3817 if unclosed_elements and self._applied_styles:
3818 self._applied_styles.pop()
bf6427d2 3819
2b14cb56 3820 def data(self, data):
5b995f71 3821 self._out += data
2b14cb56 3822
3823 def close(self):
5b995f71 3824 return self._out.strip()
2b14cb56 3825
3826 def parse_node(node):
3827 target = TTMLPElementParser()
3828 parser = xml.etree.ElementTree.XMLParser(target=target)
3829 parser.feed(xml.etree.ElementTree.tostring(node))
3830 return parser.close()
bf6427d2 3831
5b995f71
RA
3832 for k, v in LEGACY_NAMESPACES:
3833 for ns in v:
3834 dfxp_data = dfxp_data.replace(ns, k)
3835
3869028f 3836 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 3837 out = []
5b995f71 3838 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
3839
3840 if not paras:
3841 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 3842
5b995f71
RA
3843 repeat = False
3844 while True:
3845 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
3846 style_id = style.get('id') or style.get(_x('xml:id'))
3847 if not style_id:
3848 continue
5b995f71
RA
3849 parent_style_id = style.get('style')
3850 if parent_style_id:
3851 if parent_style_id not in styles:
3852 repeat = True
3853 continue
3854 styles[style_id] = styles[parent_style_id].copy()
3855 for prop in SUPPORTED_STYLING:
3856 prop_val = style.get(_x('tts:' + prop))
3857 if prop_val:
3858 styles.setdefault(style_id, {})[prop] = prop_val
3859 if repeat:
3860 repeat = False
3861 else:
3862 break
3863
3864 for p in ('body', 'div'):
3865 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3866 if ele is None:
3867 continue
3868 style = styles.get(ele.get('style'))
3869 if not style:
3870 continue
3871 default_style.update(style)
3872
bf6427d2 3873 for para, index in zip(paras, itertools.count(1)):
d631d5f9 3874 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 3875 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
3876 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3877 if begin_time is None:
3878 continue
7dff0363 3879 if not end_time:
d631d5f9
YCH
3880 if not dur:
3881 continue
3882 end_time = begin_time + dur
bf6427d2
YCH
3883 out.append('%d\n%s --> %s\n%s\n\n' % (
3884 index,
c1c924ab
YCH
3885 srt_subtitles_timecode(begin_time),
3886 srt_subtitles_timecode(end_time),
bf6427d2
YCH
3887 parse_node(para)))
3888
3889 return ''.join(out)
3890
3891
c487cf00 3892def cli_option(params, command_option, param, separator=None):
66e289ba 3893 param = params.get(param)
c487cf00 3894 return ([] if param is None
3895 else [command_option, str(param)] if separator is None
3896 else [f'{command_option}{separator}{param}'])
66e289ba
S
3897
3898
3899def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3900 param = params.get(param)
c487cf00 3901 assert param in (True, False, None)
3902 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
66e289ba
S
3903
3904
3905def cli_valueless_option(params, command_option, param, expected_value=True):
c487cf00 3906 return [command_option] if params.get(param) == expected_value else []
66e289ba
S
3907
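# Illustrative usage of the cli_* helpers (a sketch, not from the source):
# >>> cli_option({'proxy': 'socks5://127.0.0.1:1080'}, '--proxy', 'proxy')
# ['--proxy', 'socks5://127.0.0.1:1080']
# >>> cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
# ['--no-check-certificate', 'true']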
3908
e92caff5 3909def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 3910 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 3911 if use_compat:
5b1ecbb3 3912 return argdict
3913 else:
3914 argdict = None
eab9b2bc 3915 if argdict is None:
5b1ecbb3 3916 return default
eab9b2bc 3917 assert isinstance(argdict, dict)
3918
e92caff5 3919 assert isinstance(keys, (list, tuple))
3920 for key_list in keys:
e92caff5 3921 arg_list = list(filter(
3922 lambda x: x is not None,
6606817a 3923 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 3924 if arg_list:
3925 return [arg for args in arg_list for arg in args]
3926 return default
66e289ba 3927
6251555f 3928
330690a2 3929def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3930 main_key, exe = main_key.lower(), exe.lower()
3931 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3932 keys = [f'{root_key}{k}' for k in (keys or [''])]
3933 if root_key in keys:
3934 if main_key != exe:
3935 keys.append((main_key, exe))
3936 keys.append('default')
3937 else:
3938 use_compat = False
3939 return cli_configuration_args(argdict, keys, default, use_compat)
3940
66e289ba 3941
86e5f3ed 3942class ISO639Utils:
39672624
YCH
3943 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3944 _lang_map = {
3945 'aa': 'aar',
3946 'ab': 'abk',
3947 'ae': 'ave',
3948 'af': 'afr',
3949 'ak': 'aka',
3950 'am': 'amh',
3951 'an': 'arg',
3952 'ar': 'ara',
3953 'as': 'asm',
3954 'av': 'ava',
3955 'ay': 'aym',
3956 'az': 'aze',
3957 'ba': 'bak',
3958 'be': 'bel',
3959 'bg': 'bul',
3960 'bh': 'bih',
3961 'bi': 'bis',
3962 'bm': 'bam',
3963 'bn': 'ben',
3964 'bo': 'bod',
3965 'br': 'bre',
3966 'bs': 'bos',
3967 'ca': 'cat',
3968 'ce': 'che',
3969 'ch': 'cha',
3970 'co': 'cos',
3971 'cr': 'cre',
3972 'cs': 'ces',
3973 'cu': 'chu',
3974 'cv': 'chv',
3975 'cy': 'cym',
3976 'da': 'dan',
3977 'de': 'deu',
3978 'dv': 'div',
3979 'dz': 'dzo',
3980 'ee': 'ewe',
3981 'el': 'ell',
3982 'en': 'eng',
3983 'eo': 'epo',
3984 'es': 'spa',
3985 'et': 'est',
3986 'eu': 'eus',
3987 'fa': 'fas',
3988 'ff': 'ful',
3989 'fi': 'fin',
3990 'fj': 'fij',
3991 'fo': 'fao',
3992 'fr': 'fra',
3993 'fy': 'fry',
3994 'ga': 'gle',
3995 'gd': 'gla',
3996 'gl': 'glg',
3997 'gn': 'grn',
3998 'gu': 'guj',
3999 'gv': 'glv',
4000 'ha': 'hau',
4001 'he': 'heb',
b7acc835 4002 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
4003 'hi': 'hin',
4004 'ho': 'hmo',
4005 'hr': 'hrv',
4006 'ht': 'hat',
4007 'hu': 'hun',
4008 'hy': 'hye',
4009 'hz': 'her',
4010 'ia': 'ina',
4011 'id': 'ind',
b7acc835 4012 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
4013 'ie': 'ile',
4014 'ig': 'ibo',
4015 'ii': 'iii',
4016 'ik': 'ipk',
4017 'io': 'ido',
4018 'is': 'isl',
4019 'it': 'ita',
4020 'iu': 'iku',
4021 'ja': 'jpn',
4022 'jv': 'jav',
4023 'ka': 'kat',
4024 'kg': 'kon',
4025 'ki': 'kik',
4026 'kj': 'kua',
4027 'kk': 'kaz',
4028 'kl': 'kal',
4029 'km': 'khm',
4030 'kn': 'kan',
4031 'ko': 'kor',
4032 'kr': 'kau',
4033 'ks': 'kas',
4034 'ku': 'kur',
4035 'kv': 'kom',
4036 'kw': 'cor',
4037 'ky': 'kir',
4038 'la': 'lat',
4039 'lb': 'ltz',
4040 'lg': 'lug',
4041 'li': 'lim',
4042 'ln': 'lin',
4043 'lo': 'lao',
4044 'lt': 'lit',
4045 'lu': 'lub',
4046 'lv': 'lav',
4047 'mg': 'mlg',
4048 'mh': 'mah',
4049 'mi': 'mri',
4050 'mk': 'mkd',
4051 'ml': 'mal',
4052 'mn': 'mon',
4053 'mr': 'mar',
4054 'ms': 'msa',
4055 'mt': 'mlt',
4056 'my': 'mya',
4057 'na': 'nau',
4058 'nb': 'nob',
4059 'nd': 'nde',
4060 'ne': 'nep',
4061 'ng': 'ndo',
4062 'nl': 'nld',
4063 'nn': 'nno',
4064 'no': 'nor',
4065 'nr': 'nbl',
4066 'nv': 'nav',
4067 'ny': 'nya',
4068 'oc': 'oci',
4069 'oj': 'oji',
4070 'om': 'orm',
4071 'or': 'ori',
4072 'os': 'oss',
4073 'pa': 'pan',
4074 'pi': 'pli',
4075 'pl': 'pol',
4076 'ps': 'pus',
4077 'pt': 'por',
4078 'qu': 'que',
4079 'rm': 'roh',
4080 'rn': 'run',
4081 'ro': 'ron',
4082 'ru': 'rus',
4083 'rw': 'kin',
4084 'sa': 'san',
4085 'sc': 'srd',
4086 'sd': 'snd',
4087 'se': 'sme',
4088 'sg': 'sag',
4089 'si': 'sin',
4090 'sk': 'slk',
4091 'sl': 'slv',
4092 'sm': 'smo',
4093 'sn': 'sna',
4094 'so': 'som',
4095 'sq': 'sqi',
4096 'sr': 'srp',
4097 'ss': 'ssw',
4098 'st': 'sot',
4099 'su': 'sun',
4100 'sv': 'swe',
4101 'sw': 'swa',
4102 'ta': 'tam',
4103 'te': 'tel',
4104 'tg': 'tgk',
4105 'th': 'tha',
4106 'ti': 'tir',
4107 'tk': 'tuk',
4108 'tl': 'tgl',
4109 'tn': 'tsn',
4110 'to': 'ton',
4111 'tr': 'tur',
4112 'ts': 'tso',
4113 'tt': 'tat',
4114 'tw': 'twi',
4115 'ty': 'tah',
4116 'ug': 'uig',
4117 'uk': 'ukr',
4118 'ur': 'urd',
4119 'uz': 'uzb',
4120 've': 'ven',
4121 'vi': 'vie',
4122 'vo': 'vol',
4123 'wa': 'wln',
4124 'wo': 'wol',
4125 'xh': 'xho',
4126 'yi': 'yid',
e9a50fba 4127 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
4128 'yo': 'yor',
4129 'za': 'zha',
4130 'zh': 'zho',
4131 'zu': 'zul',
4132 }
4133
4134 @classmethod
4135 def short2long(cls, code):
4136 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4137 return cls._lang_map.get(code[:2])
4138
4139 @classmethod
4140 def long2short(cls, code):
4141 """Convert language code from ISO 639-2/T to ISO 639-1"""
4142 for short_name, long_name in cls._lang_map.items():
4143 if long_name == code:
4144 return short_name
4145
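# Illustrative usage of ISO639Utils (a sketch, not from the source):
# >>> ISO639Utils.short2long('de'), ISO639Utils.long2short('deu')
# ('deu', 'de')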
4146
86e5f3ed 4147class ISO3166Utils:
4eb10f66
YCH
4148 # From http://data.okfn.org/data/core/country-list
4149 _country_map = {
4150 'AF': 'Afghanistan',
4151 'AX': 'Åland Islands',
4152 'AL': 'Albania',
4153 'DZ': 'Algeria',
4154 'AS': 'American Samoa',
4155 'AD': 'Andorra',
4156 'AO': 'Angola',
4157 'AI': 'Anguilla',
4158 'AQ': 'Antarctica',
4159 'AG': 'Antigua and Barbuda',
4160 'AR': 'Argentina',
4161 'AM': 'Armenia',
4162 'AW': 'Aruba',
4163 'AU': 'Australia',
4164 'AT': 'Austria',
4165 'AZ': 'Azerbaijan',
4166 'BS': 'Bahamas',
4167 'BH': 'Bahrain',
4168 'BD': 'Bangladesh',
4169 'BB': 'Barbados',
4170 'BY': 'Belarus',
4171 'BE': 'Belgium',
4172 'BZ': 'Belize',
4173 'BJ': 'Benin',
4174 'BM': 'Bermuda',
4175 'BT': 'Bhutan',
4176 'BO': 'Bolivia, Plurinational State of',
4177 'BQ': 'Bonaire, Sint Eustatius and Saba',
4178 'BA': 'Bosnia and Herzegovina',
4179 'BW': 'Botswana',
4180 'BV': 'Bouvet Island',
4181 'BR': 'Brazil',
4182 'IO': 'British Indian Ocean Territory',
4183 'BN': 'Brunei Darussalam',
4184 'BG': 'Bulgaria',
4185 'BF': 'Burkina Faso',
4186 'BI': 'Burundi',
4187 'KH': 'Cambodia',
4188 'CM': 'Cameroon',
4189 'CA': 'Canada',
4190 'CV': 'Cape Verde',
4191 'KY': 'Cayman Islands',
4192 'CF': 'Central African Republic',
4193 'TD': 'Chad',
4194 'CL': 'Chile',
4195 'CN': 'China',
4196 'CX': 'Christmas Island',
4197 'CC': 'Cocos (Keeling) Islands',
4198 'CO': 'Colombia',
4199 'KM': 'Comoros',
4200 'CG': 'Congo',
4201 'CD': 'Congo, the Democratic Republic of the',
4202 'CK': 'Cook Islands',
4203 'CR': 'Costa Rica',
4204 'CI': 'Côte d\'Ivoire',
4205 'HR': 'Croatia',
4206 'CU': 'Cuba',
4207 'CW': 'Curaçao',
4208 'CY': 'Cyprus',
4209 'CZ': 'Czech Republic',
4210 'DK': 'Denmark',
4211 'DJ': 'Djibouti',
4212 'DM': 'Dominica',
4213 'DO': 'Dominican Republic',
4214 'EC': 'Ecuador',
4215 'EG': 'Egypt',
4216 'SV': 'El Salvador',
4217 'GQ': 'Equatorial Guinea',
4218 'ER': 'Eritrea',
4219 'EE': 'Estonia',
4220 'ET': 'Ethiopia',
4221 'FK': 'Falkland Islands (Malvinas)',
4222 'FO': 'Faroe Islands',
4223 'FJ': 'Fiji',
4224 'FI': 'Finland',
4225 'FR': 'France',
4226 'GF': 'French Guiana',
4227 'PF': 'French Polynesia',
4228 'TF': 'French Southern Territories',
4229 'GA': 'Gabon',
4230 'GM': 'Gambia',
4231 'GE': 'Georgia',
4232 'DE': 'Germany',
4233 'GH': 'Ghana',
4234 'GI': 'Gibraltar',
4235 'GR': 'Greece',
4236 'GL': 'Greenland',
4237 'GD': 'Grenada',
4238 'GP': 'Guadeloupe',
4239 'GU': 'Guam',
4240 'GT': 'Guatemala',
4241 'GG': 'Guernsey',
4242 'GN': 'Guinea',
4243 'GW': 'Guinea-Bissau',
4244 'GY': 'Guyana',
4245 'HT': 'Haiti',
4246 'HM': 'Heard Island and McDonald Islands',
4247 'VA': 'Holy See (Vatican City State)',
4248 'HN': 'Honduras',
4249 'HK': 'Hong Kong',
4250 'HU': 'Hungary',
4251 'IS': 'Iceland',
4252 'IN': 'India',
4253 'ID': 'Indonesia',
4254 'IR': 'Iran, Islamic Republic of',
4255 'IQ': 'Iraq',
4256 'IE': 'Ireland',
4257 'IM': 'Isle of Man',
4258 'IL': 'Israel',
4259 'IT': 'Italy',
4260 'JM': 'Jamaica',
4261 'JP': 'Japan',
4262 'JE': 'Jersey',
4263 'JO': 'Jordan',
4264 'KZ': 'Kazakhstan',
4265 'KE': 'Kenya',
4266 'KI': 'Kiribati',
4267 'KP': 'Korea, Democratic People\'s Republic of',
4268 'KR': 'Korea, Republic of',
4269 'KW': 'Kuwait',
4270 'KG': 'Kyrgyzstan',
4271 'LA': 'Lao People\'s Democratic Republic',
4272 'LV': 'Latvia',
4273 'LB': 'Lebanon',
4274 'LS': 'Lesotho',
4275 'LR': 'Liberia',
4276 'LY': 'Libya',
4277 'LI': 'Liechtenstein',
4278 'LT': 'Lithuania',
4279 'LU': 'Luxembourg',
4280 'MO': 'Macao',
4281 'MK': 'Macedonia, the Former Yugoslav Republic of',
4282 'MG': 'Madagascar',
4283 'MW': 'Malawi',
4284 'MY': 'Malaysia',
4285 'MV': 'Maldives',
4286 'ML': 'Mali',
4287 'MT': 'Malta',
4288 'MH': 'Marshall Islands',
4289 'MQ': 'Martinique',
4290 'MR': 'Mauritania',
4291 'MU': 'Mauritius',
4292 'YT': 'Mayotte',
4293 'MX': 'Mexico',
4294 'FM': 'Micronesia, Federated States of',
4295 'MD': 'Moldova, Republic of',
4296 'MC': 'Monaco',
4297 'MN': 'Mongolia',
4298 'ME': 'Montenegro',
4299 'MS': 'Montserrat',
4300 'MA': 'Morocco',
4301 'MZ': 'Mozambique',
4302 'MM': 'Myanmar',
4303 'NA': 'Namibia',
4304 'NR': 'Nauru',
4305 'NP': 'Nepal',
4306 'NL': 'Netherlands',
4307 'NC': 'New Caledonia',
4308 'NZ': 'New Zealand',
4309 'NI': 'Nicaragua',
4310 'NE': 'Niger',
4311 'NG': 'Nigeria',
4312 'NU': 'Niue',
4313 'NF': 'Norfolk Island',
4314 'MP': 'Northern Mariana Islands',
4315 'NO': 'Norway',
4316 'OM': 'Oman',
4317 'PK': 'Pakistan',
4318 'PW': 'Palau',
4319 'PS': 'Palestine, State of',
4320 'PA': 'Panama',
4321 'PG': 'Papua New Guinea',
4322 'PY': 'Paraguay',
4323 'PE': 'Peru',
4324 'PH': 'Philippines',
4325 'PN': 'Pitcairn',
4326 'PL': 'Poland',
4327 'PT': 'Portugal',
4328 'PR': 'Puerto Rico',
4329 'QA': 'Qatar',
4330 'RE': 'Réunion',
4331 'RO': 'Romania',
4332 'RU': 'Russian Federation',
4333 'RW': 'Rwanda',
4334 'BL': 'Saint Barthélemy',
4335 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4336 'KN': 'Saint Kitts and Nevis',
4337 'LC': 'Saint Lucia',
4338 'MF': 'Saint Martin (French part)',
4339 'PM': 'Saint Pierre and Miquelon',
4340 'VC': 'Saint Vincent and the Grenadines',
4341 'WS': 'Samoa',
4342 'SM': 'San Marino',
4343 'ST': 'Sao Tome and Principe',
4344 'SA': 'Saudi Arabia',
4345 'SN': 'Senegal',
4346 'RS': 'Serbia',
4347 'SC': 'Seychelles',
4348 'SL': 'Sierra Leone',
4349 'SG': 'Singapore',
4350 'SX': 'Sint Maarten (Dutch part)',
4351 'SK': 'Slovakia',
4352 'SI': 'Slovenia',
4353 'SB': 'Solomon Islands',
4354 'SO': 'Somalia',
4355 'ZA': 'South Africa',
4356 'GS': 'South Georgia and the South Sandwich Islands',
4357 'SS': 'South Sudan',
4358 'ES': 'Spain',
4359 'LK': 'Sri Lanka',
4360 'SD': 'Sudan',
4361 'SR': 'Suriname',
4362 'SJ': 'Svalbard and Jan Mayen',
4363 'SZ': 'Swaziland',
4364 'SE': 'Sweden',
4365 'CH': 'Switzerland',
4366 'SY': 'Syrian Arab Republic',
4367 'TW': 'Taiwan, Province of China',
4368 'TJ': 'Tajikistan',
4369 'TZ': 'Tanzania, United Republic of',
4370 'TH': 'Thailand',
4371 'TL': 'Timor-Leste',
4372 'TG': 'Togo',
4373 'TK': 'Tokelau',
4374 'TO': 'Tonga',
4375 'TT': 'Trinidad and Tobago',
4376 'TN': 'Tunisia',
4377 'TR': 'Turkey',
4378 'TM': 'Turkmenistan',
4379 'TC': 'Turks and Caicos Islands',
4380 'TV': 'Tuvalu',
4381 'UG': 'Uganda',
4382 'UA': 'Ukraine',
4383 'AE': 'United Arab Emirates',
4384 'GB': 'United Kingdom',
4385 'US': 'United States',
4386 'UM': 'United States Minor Outlying Islands',
4387 'UY': 'Uruguay',
4388 'UZ': 'Uzbekistan',
4389 'VU': 'Vanuatu',
4390 'VE': 'Venezuela, Bolivarian Republic of',
4391 'VN': 'Viet Nam',
4392 'VG': 'Virgin Islands, British',
4393 'VI': 'Virgin Islands, U.S.',
4394 'WF': 'Wallis and Futuna',
4395 'EH': 'Western Sahara',
4396 'YE': 'Yemen',
4397 'ZM': 'Zambia',
4398 'ZW': 'Zimbabwe',
2f97cc61 4399 # Not ISO 3166 codes, but used for IP blocks
4400 'AP': 'Asia/Pacific Region',
4401 'EU': 'Europe',
4eb10f66
YCH
4402 }
4403
4404 @classmethod
4405 def short2full(cls, code):
4406 """Convert an ISO 3166-2 country code to the corresponding full name"""
4407 return cls._country_map.get(code.upper())
4408
4409
86e5f3ed 4410class GeoUtils:
773f291d
S
4411 # Major IPv4 address blocks per country
4412 _country_ip_map = {
53896ca5 4413 'AD': '46.172.224.0/19',
773f291d
S
4414 'AE': '94.200.0.0/13',
4415 'AF': '149.54.0.0/17',
4416 'AG': '209.59.64.0/18',
4417 'AI': '204.14.248.0/21',
4418 'AL': '46.99.0.0/16',
4419 'AM': '46.70.0.0/15',
4420 'AO': '105.168.0.0/13',
53896ca5
S
4421 'AP': '182.50.184.0/21',
4422 'AQ': '23.154.160.0/24',
773f291d
S
4423 'AR': '181.0.0.0/12',
4424 'AS': '202.70.112.0/20',
53896ca5 4425 'AT': '77.116.0.0/14',
773f291d
S
4426 'AU': '1.128.0.0/11',
4427 'AW': '181.41.0.0/18',
53896ca5
S
4428 'AX': '185.217.4.0/22',
4429 'AZ': '5.197.0.0/16',
773f291d
S
4430 'BA': '31.176.128.0/17',
4431 'BB': '65.48.128.0/17',
4432 'BD': '114.130.0.0/16',
4433 'BE': '57.0.0.0/8',
53896ca5 4434 'BF': '102.178.0.0/15',
773f291d
S
4435 'BG': '95.42.0.0/15',
4436 'BH': '37.131.0.0/17',
4437 'BI': '154.117.192.0/18',
4438 'BJ': '137.255.0.0/16',
53896ca5 4439 'BL': '185.212.72.0/23',
773f291d
S
4440 'BM': '196.12.64.0/18',
4441 'BN': '156.31.0.0/16',
4442 'BO': '161.56.0.0/16',
4443 'BQ': '161.0.80.0/20',
53896ca5 4444 'BR': '191.128.0.0/12',
773f291d
S
4445 'BS': '24.51.64.0/18',
4446 'BT': '119.2.96.0/19',
4447 'BW': '168.167.0.0/16',
4448 'BY': '178.120.0.0/13',
4449 'BZ': '179.42.192.0/18',
4450 'CA': '99.224.0.0/11',
4451 'CD': '41.243.0.0/16',
53896ca5
S
4452 'CF': '197.242.176.0/21',
4453 'CG': '160.113.0.0/16',
773f291d 4454 'CH': '85.0.0.0/13',
53896ca5 4455 'CI': '102.136.0.0/14',
773f291d
S
4456 'CK': '202.65.32.0/19',
4457 'CL': '152.172.0.0/14',
53896ca5 4458 'CM': '102.244.0.0/14',
773f291d
S
4459 'CN': '36.128.0.0/10',
4460 'CO': '181.240.0.0/12',
4461 'CR': '201.192.0.0/12',
4462 'CU': '152.206.0.0/15',
4463 'CV': '165.90.96.0/19',
4464 'CW': '190.88.128.0/17',
53896ca5 4465 'CY': '31.153.0.0/16',
773f291d
S
4466 'CZ': '88.100.0.0/14',
4467 'DE': '53.0.0.0/8',
4468 'DJ': '197.241.0.0/17',
4469 'DK': '87.48.0.0/12',
4470 'DM': '192.243.48.0/20',
4471 'DO': '152.166.0.0/15',
4472 'DZ': '41.96.0.0/12',
4473 'EC': '186.68.0.0/15',
4474 'EE': '90.190.0.0/15',
4475 'EG': '156.160.0.0/11',
4476 'ER': '196.200.96.0/20',
4477 'ES': '88.0.0.0/11',
4478 'ET': '196.188.0.0/14',
4479 'EU': '2.16.0.0/13',
4480 'FI': '91.152.0.0/13',
4481 'FJ': '144.120.0.0/16',
53896ca5 4482 'FK': '80.73.208.0/21',
773f291d
S
4483 'FM': '119.252.112.0/20',
4484 'FO': '88.85.32.0/19',
4485 'FR': '90.0.0.0/9',
4486 'GA': '41.158.0.0/15',
4487 'GB': '25.0.0.0/8',
4488 'GD': '74.122.88.0/21',
4489 'GE': '31.146.0.0/16',
4490 'GF': '161.22.64.0/18',
4491 'GG': '62.68.160.0/19',
53896ca5
S
4492 'GH': '154.160.0.0/12',
4493 'GI': '95.164.0.0/16',
773f291d
S
4494 'GL': '88.83.0.0/19',
4495 'GM': '160.182.0.0/15',
4496 'GN': '197.149.192.0/18',
4497 'GP': '104.250.0.0/19',
4498 'GQ': '105.235.224.0/20',
4499 'GR': '94.64.0.0/13',
4500 'GT': '168.234.0.0/16',
4501 'GU': '168.123.0.0/16',
4502 'GW': '197.214.80.0/20',
4503 'GY': '181.41.64.0/18',
4504 'HK': '113.252.0.0/14',
4505 'HN': '181.210.0.0/16',
4506 'HR': '93.136.0.0/13',
4507 'HT': '148.102.128.0/17',
4508 'HU': '84.0.0.0/14',
4509 'ID': '39.192.0.0/10',
4510 'IE': '87.32.0.0/12',
4511 'IL': '79.176.0.0/13',
4512 'IM': '5.62.80.0/20',
4513 'IN': '117.192.0.0/10',
4514 'IO': '203.83.48.0/21',
4515 'IQ': '37.236.0.0/14',
4516 'IR': '2.176.0.0/12',
4517 'IS': '82.221.0.0/16',
4518 'IT': '79.0.0.0/10',
4519 'JE': '87.244.64.0/18',
4520 'JM': '72.27.0.0/17',
4521 'JO': '176.29.0.0/16',
53896ca5 4522 'JP': '133.0.0.0/8',
773f291d
S
4523 'KE': '105.48.0.0/12',
4524 'KG': '158.181.128.0/17',
4525 'KH': '36.37.128.0/17',
4526 'KI': '103.25.140.0/22',
4527 'KM': '197.255.224.0/20',
53896ca5 4528 'KN': '198.167.192.0/19',
773f291d
S
4529 'KP': '175.45.176.0/22',
4530 'KR': '175.192.0.0/10',
4531 'KW': '37.36.0.0/14',
4532 'KY': '64.96.0.0/15',
4533 'KZ': '2.72.0.0/13',
4534 'LA': '115.84.64.0/18',
4535 'LB': '178.135.0.0/16',
53896ca5 4536 'LC': '24.92.144.0/20',
773f291d
S
4537 'LI': '82.117.0.0/19',
4538 'LK': '112.134.0.0/15',
53896ca5 4539 'LR': '102.183.0.0/16',
773f291d
S
4540 'LS': '129.232.0.0/17',
4541 'LT': '78.56.0.0/13',
4542 'LU': '188.42.0.0/16',
4543 'LV': '46.109.0.0/16',
4544 'LY': '41.252.0.0/14',
4545 'MA': '105.128.0.0/11',
4546 'MC': '88.209.64.0/18',
4547 'MD': '37.246.0.0/16',
4548 'ME': '178.175.0.0/17',
4549 'MF': '74.112.232.0/21',
4550 'MG': '154.126.0.0/17',
4551 'MH': '117.103.88.0/21',
4552 'MK': '77.28.0.0/15',
4553 'ML': '154.118.128.0/18',
4554 'MM': '37.111.0.0/17',
4555 'MN': '49.0.128.0/17',
4556 'MO': '60.246.0.0/16',
4557 'MP': '202.88.64.0/20',
4558 'MQ': '109.203.224.0/19',
4559 'MR': '41.188.64.0/18',
4560 'MS': '208.90.112.0/22',
4561 'MT': '46.11.0.0/16',
4562 'MU': '105.16.0.0/12',
4563 'MV': '27.114.128.0/18',
53896ca5 4564 'MW': '102.70.0.0/15',
773f291d
S
4565 'MX': '187.192.0.0/11',
4566 'MY': '175.136.0.0/13',
4567 'MZ': '197.218.0.0/15',
4568 'NA': '41.182.0.0/16',
4569 'NC': '101.101.0.0/18',
4570 'NE': '197.214.0.0/18',
4571 'NF': '203.17.240.0/22',
4572 'NG': '105.112.0.0/12',
4573 'NI': '186.76.0.0/15',
4574 'NL': '145.96.0.0/11',
4575 'NO': '84.208.0.0/13',
4576 'NP': '36.252.0.0/15',
4577 'NR': '203.98.224.0/19',
4578 'NU': '49.156.48.0/22',
4579 'NZ': '49.224.0.0/14',
4580 'OM': '5.36.0.0/15',
4581 'PA': '186.72.0.0/15',
4582 'PE': '186.160.0.0/14',
4583 'PF': '123.50.64.0/18',
4584 'PG': '124.240.192.0/19',
4585 'PH': '49.144.0.0/13',
4586 'PK': '39.32.0.0/11',
4587 'PL': '83.0.0.0/11',
4588 'PM': '70.36.0.0/20',
4589 'PR': '66.50.0.0/16',
4590 'PS': '188.161.0.0/16',
4591 'PT': '85.240.0.0/13',
4592 'PW': '202.124.224.0/20',
4593 'PY': '181.120.0.0/14',
4594 'QA': '37.210.0.0/15',
53896ca5 4595 'RE': '102.35.0.0/16',
773f291d 4596 'RO': '79.112.0.0/13',
53896ca5 4597 'RS': '93.86.0.0/15',
773f291d 4598 'RU': '5.136.0.0/13',
53896ca5 4599 'RW': '41.186.0.0/16',
773f291d
S
4600 'SA': '188.48.0.0/13',
4601 'SB': '202.1.160.0/19',
4602 'SC': '154.192.0.0/11',
53896ca5 4603 'SD': '102.120.0.0/13',
773f291d 4604 'SE': '78.64.0.0/12',
53896ca5 4605 'SG': '8.128.0.0/10',
773f291d
S
4606 'SI': '188.196.0.0/14',
4607 'SK': '78.98.0.0/15',
53896ca5 4608 'SL': '102.143.0.0/17',
773f291d
S
4609 'SM': '89.186.32.0/19',
4610 'SN': '41.82.0.0/15',
53896ca5 4611 'SO': '154.115.192.0/18',
773f291d
S
4612 'SR': '186.179.128.0/17',
4613 'SS': '105.235.208.0/21',
4614 'ST': '197.159.160.0/19',
4615 'SV': '168.243.0.0/16',
4616 'SX': '190.102.0.0/20',
4617 'SY': '5.0.0.0/16',
4618 'SZ': '41.84.224.0/19',
4619 'TC': '65.255.48.0/20',
4620 'TD': '154.68.128.0/19',
4621 'TG': '196.168.0.0/14',
4622 'TH': '171.96.0.0/13',
4623 'TJ': '85.9.128.0/18',
4624 'TK': '27.96.24.0/21',
4625 'TL': '180.189.160.0/20',
4626 'TM': '95.85.96.0/19',
4627 'TN': '197.0.0.0/11',
4628 'TO': '175.176.144.0/21',
4629 'TR': '78.160.0.0/11',
4630 'TT': '186.44.0.0/15',
4631 'TV': '202.2.96.0/19',
4632 'TW': '120.96.0.0/11',
4633 'TZ': '156.156.0.0/14',
53896ca5
S
4634 'UA': '37.52.0.0/14',
4635 'UG': '102.80.0.0/13',
4636 'US': '6.0.0.0/8',
773f291d 4637 'UY': '167.56.0.0/13',
53896ca5 4638 'UZ': '84.54.64.0/18',
773f291d 4639 'VA': '212.77.0.0/19',
53896ca5 4640 'VC': '207.191.240.0/21',
773f291d 4641 'VE': '186.88.0.0/13',
53896ca5 4642 'VG': '66.81.192.0/20',
773f291d
S
4643 'VI': '146.226.0.0/16',
4644 'VN': '14.160.0.0/11',
4645 'VU': '202.80.32.0/20',
4646 'WF': '117.20.32.0/21',
4647 'WS': '202.4.32.0/19',
4648 'YE': '134.35.0.0/16',
4649 'YT': '41.242.116.0/22',
4650 'ZA': '41.0.0.0/11',
53896ca5
S
4651 'ZM': '102.144.0.0/13',
4652 'ZW': '102.177.192.0/18',
773f291d
S
4653 }
4654
4655 @classmethod
5f95927a
S
4656 def random_ipv4(cls, code_or_block):
4657 if len(code_or_block) == 2:
4658 block = cls._country_ip_map.get(code_or_block.upper())
4659 if not block:
4660 return None
4661 else:
4662 block = code_or_block
773f291d 4663 addr, preflen = block.split('/')
ac668111 4664 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4665 addr_max = addr_min | (0xffffffff >> int(preflen))
14f25df2 4666 return str(socket.inet_ntoa(
ac668111 4667 struct.pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4668
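# Illustrative usage of GeoUtils.random_ipv4 (a sketch, not from the source;
# the result is random but always falls inside the country's listed block):
# >>> GeoUtils.random_ipv4('DE')
# e.g. '53.17.212.9' (some address inside 53.0.0.0/8)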
4669
ac668111 4670class PerRequestProxyHandler(urllib.request.ProxyHandler):
2461f79d
PH
4671 def __init__(self, proxies=None):
4672 # Set default handlers
4673 for type in ('http', 'https'):
4674 setattr(self, '%s_open' % type,
4675 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4676 meth(r, proxy, type))
ac668111 4677 urllib.request.ProxyHandler.__init__(self, proxies)
2461f79d 4678
91410c9b 4679 def proxy_open(self, req, proxy, type):
2461f79d 4680 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
4681 if req_proxy is not None:
4682 proxy = req_proxy
2461f79d
PH
4683 del req.headers['Ytdl-request-proxy']
4684
4685 if proxy == '__noproxy__':
4686 return None # No Proxy
14f25df2 4687 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188 4688 req.add_header('Ytdl-socks-proxy', proxy)
7a5c1cfe 4689 # yt-dlp's http/https handlers handle wrapping the socket with SOCKS
71aff188 4690 return None
ac668111 4691 return urllib.request.ProxyHandler.proxy_open(
91410c9b 4692 self, req, proxy, type)
5bc880b9
YCH
4693
4694
0a5445dd
YCH
4695# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4696# released into Public Domain
4697# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4698
4699def long_to_bytes(n, blocksize=0):
4700 """long_to_bytes(n:long, blocksize:int) : string
4701 Convert a long integer to a byte string.
4702
4703 If optional blocksize is given and greater than zero, pad the front of the
4704 byte string with binary zeros so that the length is a multiple of
4705 blocksize.
4706 """
4707 # after much testing, this algorithm was deemed to be the fastest
4708 s = b''
4709 n = int(n)
4710 while n > 0:
ac668111 4711 s = struct.pack('>I', n & 0xffffffff) + s
0a5445dd
YCH
4712 n = n >> 32
4713 # strip off leading zeros
4714 for i in range(len(s)):
4715 if s[i] != b'\000'[0]:
4716 break
4717 else:
4718 # only happens when n == 0
4719 s = b'\000'
4720 i = 0
4721 s = s[i:]
4722 # add back some pad bytes. this could be done more efficiently w.r.t. the
4723 # de-padding being done above, but sigh...
4724 if blocksize > 0 and len(s) % blocksize:
4725 s = (blocksize - len(s) % blocksize) * b'\000' + s
4726 return s
4727
4728
4729def bytes_to_long(s):
4730 """bytes_to_long(string) : long
4731 Convert a byte string to a long integer.
4732
4733 This is (essentially) the inverse of long_to_bytes().
4734 """
4735 acc = 0
4736 length = len(s)
4737 if length % 4:
4738 extra = (4 - length % 4)
4739 s = b'\000' * extra + s
4740 length = length + extra
4741 for i in range(0, length, 4):
ac668111 4742 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
0a5445dd
YCH
4743 return acc
4744
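# Illustrative round-trip (a sketch, not from the source):
# >>> long_to_bytes(65537)
# b'\x01\x00\x01'
# >>> bytes_to_long(b'\x01\x00\x01')
# 65537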
4745
5bc880b9
YCH
4746def ohdave_rsa_encrypt(data, exponent, modulus):
4747 '''
4748 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4749
4750 Input:
4751 data: data to encrypt, bytes-like object
4752 exponent, modulus: parameters e and N of the RSA algorithm, both integers
4753 Output: hex string of encrypted data
4754
4755 Limitation: supports one block encryption only
4756 '''
4757
4758 payload = int(binascii.hexlify(data[::-1]), 16)
4759 encrypted = pow(payload, exponent, modulus)
4760 return '%x' % encrypted
81bdc8fd
YCH
4761
4762
f48409c7
YCH
4763def pkcs1pad(data, length):
4764 """
4765 Pad input data with the PKCS#1 scheme
4766
4767 @param {int[]} data input data
4768 @param {int} length target length
4769 @returns {int[]} padded data
4770 """
4771 if len(data) > length - 11:
4772 raise ValueError('Input data too long for PKCS#1 padding')
4773
4774 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4775 return [0, 2] + pseudo_random + [0] + data
4776
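# Illustrative usage of pkcs1pad (a sketch, not from the source): the result is
# [0, 2, <random filler>, 0, *data] padded to the requested length.
# >>> padded = pkcs1pad([1, 2, 3], 16)
# >>> padded[:2], padded[-4:], len(padded)
# ([0, 2], [0, 1, 2, 3], 16)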
4777
7b2c3f47 4778def _base_n_table(n, table):
4779 if not table and not n:
4780 raise ValueError('Either table or n must be specified')
612f2be5 4781 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4782
44f14eb4 4783 if n and n != len(table):
612f2be5 4784 raise ValueError(f'base {n} exceeds table length {len(table)}')
4785 return table
59f898b7 4786
5eb6bdce 4787
7b2c3f47 4788def encode_base_n(num, n=None, table=None):
4789 """Convert given int to a base-n string"""
612f2be5 4790 table = _base_n_table(n, table)
7b2c3f47 4791 if not num:
5eb6bdce
YCH
4792 return table[0]
4793
7b2c3f47 4794 result, base = '', len(table)
81bdc8fd 4795 while num:
7b2c3f47 4796 result = table[num % base] + result
612f2be5 4797 num = num // base
7b2c3f47 4798 return result
4799
4800
4801def decode_base_n(string, n=None, table=None):
4802 """Convert given base-n string to int"""
4803 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4804 result, base = 0, len(table)
4805 for char in string:
4806 result = result * base + table[char]
4807 return result
4808
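# Illustrative round-trip with an explicit base (a sketch, not from the source):
# >>> encode_base_n(173, 16)
# 'ad'
# >>> decode_base_n('ad', 16)
# 173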
4809
4810def decode_base(value, digits):
4811 write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
4812 'and may be removed in a future version. Use yt_dlp.decode_base_n instead')
4813 return decode_base_n(value, table=digits)
f52354a8
YCH
4814
4815
4816def decode_packed_codes(code):
06b3fe29 4817 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 4818 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
4819 base = int(base)
4820 count = int(count)
4821 symbols = symbols.split('|')
4822 symbol_table = {}
4823
4824 while count:
4825 count -= 1
5eb6bdce 4826 base_n_count = encode_base_n(count, base)
f52354a8
YCH
4827 symbol_table[base_n_count] = symbols[count] or base_n_count
4828
4829 return re.sub(
4830 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 4831 obfuscated_code)
e154c651 4832
4833
1ced2221
S
4834def caesar(s, alphabet, shift):
4835 if shift == 0:
4836 return s
4837 l = len(alphabet)
4838 return ''.join(
4839 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4840 for c in s)
4841
4842
4843def rot47(s):
4844 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4845
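# Illustrative usage of rot47 (a sketch, not from the source): shifting by 47
# over the 94 printable ASCII characters makes the cipher its own inverse.
# >>> rot47('foobar')
# '7@@32C'
# >>> rot47(rot47('foobar'))
# 'foobar'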
4846
e154c651 4847def parse_m3u8_attributes(attrib):
4848 info = {}
4849 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4850 if val.startswith('"'):
4851 val = val[1:-1]
4852 info[key] = val
4853 return info
1143535d
YCH
4854
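# Illustrative usage of parse_m3u8_attributes (a sketch, not from the source):
# >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.64001f"')
# {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.64001f'}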
4855
4856def urshift(val, n):
4857 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
4858
4859
4860# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 4861# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
4862def decode_png(png_data):
4863 # Reference: https://www.w3.org/TR/PNG/
4864 header = png_data[8:]
4865
4866 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
86e5f3ed 4867 raise OSError('Not a valid PNG file.')
d3f8e038
YCH
4868
4869 int_map = {1: '>B', 2: '>H', 4: '>I'}
ac668111 4870 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
d3f8e038
YCH
4871
4872 chunks = []
4873
4874 while header:
4875 length = unpack_integer(header[:4])
4876 header = header[4:]
4877
4878 chunk_type = header[:4]
4879 header = header[4:]
4880
4881 chunk_data = header[:length]
4882 header = header[length:]
4883
4884 header = header[4:] # Skip CRC
4885
4886 chunks.append({
4887 'type': chunk_type,
4888 'length': length,
4889 'data': chunk_data
4890 })
4891
4892 ihdr = chunks[0]['data']
4893
4894 width = unpack_integer(ihdr[:4])
4895 height = unpack_integer(ihdr[4:8])
4896
4897 idat = b''
4898
4899 for chunk in chunks:
4900 if chunk['type'] == b'IDAT':
4901 idat += chunk['data']
4902
4903 if not idat:
86e5f3ed 4904 raise OSError('Unable to read PNG data.')
d3f8e038
YCH
4905
4906 decompressed_data = bytearray(zlib.decompress(idat))
4907
4908 stride = width * 3
4909 pixels = []
4910
4911 def _get_pixel(idx):
4912 x = idx % stride
4913 y = idx // stride
4914 return pixels[y][x]
4915
4916 for y in range(height):
4917 basePos = y * (1 + stride)
4918 filter_type = decompressed_data[basePos]
4919
4920 current_row = []
4921
4922 pixels.append(current_row)
4923
4924 for x in range(stride):
4925 color = decompressed_data[1 + basePos + x]
4926 basex = y * stride + x
4927 left = 0
4928 up = 0
4929
4930 if x > 2:
4931 left = _get_pixel(basex - 3)
4932 if y > 0:
4933 up = _get_pixel(basex - stride)
4934
4935 if filter_type == 1: # Sub
4936 color = (color + left) & 0xff
4937 elif filter_type == 2: # Up
4938 color = (color + up) & 0xff
4939 elif filter_type == 3: # Average
4940 color = (color + ((left + up) >> 1)) & 0xff
4941 elif filter_type == 4: # Paeth
4942 a = left
4943 b = up
4944 c = 0
4945
4946 if x > 2 and y > 0:
4947 c = _get_pixel(basex - stride - 3)
4948
4949 p = a + b - c
4950
4951 pa = abs(p - a)
4952 pb = abs(p - b)
4953 pc = abs(p - c)
4954
4955 if pa <= pb and pa <= pc:
4956 color = (color + a) & 0xff
4957 elif pb <= pc:
4958 color = (color + b) & 0xff
4959 else:
4960 color = (color + c) & 0xff
4961
4962 current_row.append(color)
4963
4964 return width, height, pixels
efa97bdc
YCH
4965
4966
4967def write_xattr(path, key, value):
6f7563be 4968 # Windows: Write xattrs to NTFS Alternate Data Streams:
4969 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4970 if compat_os_name == 'nt':
4971 assert ':' not in key
4972 assert os.path.exists(path)
efa97bdc
YCH
4973
4974 try:
6f7563be 4975 with open(f'{path}:{key}', 'wb') as f:
4976 f.write(value)
86e5f3ed 4977 except OSError as e:
efa97bdc 4978 raise XAttrMetadataError(e.errno, e.strerror)
6f7563be 4979 return
efa97bdc 4980
6f7563be 4981 # UNIX Method 1. Use xattrs/pyxattrs modules
efa97bdc 4982
6f7563be 4983 setxattr = None
4984 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4985 # Unicode arguments are not supported in pyxattr until version 0.5.0
4986 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4987 if version_tuple(xattr.__version__) >= (0, 5, 0):
4988 setxattr = xattr.set
4989 elif xattr:
4990 setxattr = xattr.setxattr
efa97bdc 4991
6f7563be 4992 if setxattr:
4993 try:
4994 setxattr(path, key, value)
4995 except OSError as e:
4996 raise XAttrMetadataError(e.errno, e.strerror)
4997 return
efa97bdc 4998
6f7563be 4999 # UNIX Method 2. Use setfattr/xattr executables
5000 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5001 else 'xattr' if check_executable('xattr', ['-h']) else None)
5002 if not exe:
5003 raise XAttrUnavailableError(
5004 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5005 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
efa97bdc 5006
0f06bcd7 5007 value = value.decode()
6f7563be 5008 try:
f0c9fb96 5009 _, stderr, returncode = Popen.run(
6f7563be 5010 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
e121e3ce 5011 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
6f7563be 5012 except OSError as e:
5013 raise XAttrMetadataError(e.errno, e.strerror)
f0c9fb96 5014 if returncode:
5015 raise XAttrMetadataError(returncode, stderr)
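
# Illustrative sketch of how the helper above is typically used; the path, key
# and value are placeholders, and the "user." namespace is what Linux
# filesystems generally require for unprivileged xattrs.
from yt_dlp.utils import write_xattr

write_xattr('video.mp4', 'user.xdg.origin.url', b'https://example.com/source')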
0c265486 5016
5017
5018def random_birthday(year_field, month_field, day_field):
aa374bc7 5019 start_date = datetime.date(1950, 1, 1)
5020 end_date = datetime.date(1995, 12, 31)
5021 offset = random.randint(0, (end_date - start_date).days)
5022 random_date = start_date + datetime.timedelta(offset)
0c265486 5023 return {
aa374bc7 5024 year_field: str(random_date.year),
5025 month_field: str(random_date.month),
5026 day_field: str(random_date.day),
0c265486 5027 }
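
# Illustrative sketch: filling an age-gate form with a random date of birth;
# the field names are arbitrary examples.
from yt_dlp.utils import random_birthday

payload = random_birthday('birth_year', 'birth_month', 'birth_day')
# e.g. {'birth_year': '1987', 'birth_month': '4', 'birth_day': '23'}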
732044af 5028
c76eb41b 5029
732044af 5030# Templates for internet shortcut files, which are plain text files.
e5a998f3 5031DOT_URL_LINK_TEMPLATE = '''\
732044af 5032[InternetShortcut]
5033URL=%(url)s
e5a998f3 5034'''
732044af 5035
e5a998f3 5036DOT_WEBLOC_LINK_TEMPLATE = '''\
732044af 5037<?xml version="1.0" encoding="UTF-8"?>
5038<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5039<plist version="1.0">
5040<dict>
5041\t<key>URL</key>
5042\t<string>%(url)s</string>
5043</dict>
5044</plist>
e5a998f3 5045'''
732044af 5046
e5a998f3 5047DOT_DESKTOP_LINK_TEMPLATE = '''\
732044af 5048[Desktop Entry]
5049Encoding=UTF-8
5050Name=%(filename)s
5051Type=Link
5052URL=%(url)s
5053Icon=text-html
e5a998f3 5054'''
732044af 5055
08438d2c 5056LINK_TEMPLATES = {
5057 'url': DOT_URL_LINK_TEMPLATE,
5058 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5059 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5060}
5061
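
# Illustrative sketch: rendering one of the templates above into an internet
# shortcut file (the file name and URL are placeholders).
from yt_dlp.utils import LINK_TEMPLATES

with open('example.url', 'w', encoding='utf-8') as f:
    f.write(LINK_TEMPLATES['url'] % {'url': 'https://example.com/watch?v=abc'})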
732044af 5062
5063def iri_to_uri(iri):
5064 """
5065 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5066
5067 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5068 """
5069
14f25df2 5070 iri_parts = urllib.parse.urlparse(iri)
732044af 5071
5072 if '[' in iri_parts.netloc:
5073 raise ValueError('IPv6 URIs are not, yet, supported.')
5074 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5075
5076 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5077
5078 net_location = ''
5079 if iri_parts.username:
f9934b96 5080 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
732044af 5081 if iri_parts.password is not None:
f9934b96 5082 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
732044af 5083 net_location += '@'
5084
0f06bcd7 5085 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
732044af 5086 # The 'idna' encoding produces ASCII text.
5087 if iri_parts.port is not None and iri_parts.port != 80:
5088 net_location += ':' + str(iri_parts.port)
5089
f9934b96 5090 return urllib.parse.urlunparse(
732044af 5091 (iri_parts.scheme,
5092 net_location,
5093
f9934b96 5094 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5095
5096 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
f9934b96 5097 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5098
5099 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
f9934b96 5100 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
732044af 5101
f9934b96 5102 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
732044af 5103
5104 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5105
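
# Illustrative sketch (made-up IRI): non-ASCII path and query characters are
# percent-encoded as UTF-8 while already-escaped parts are left alone.
from yt_dlp.utils import iri_to_uri

iri_to_uri('https://example.com/søk?q=översätt')
# -> 'https://example.com/s%C3%B8k?q=%C3%B6vers%C3%A4tt'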
5106
5107def to_high_limit_path(path):
5108 if sys.platform in ['win32', 'cygwin']:
5109 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
e5a998f3 5110 return '\\\\?\\' + os.path.abspath(path)
732044af 5111
5112 return path
76d321f6 5113
c76eb41b 5114
7b2c3f47 5115def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
e0ddbd02 5116 val = traverse_obj(obj, *variadic(field))
7b2c3f47 5117 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
e0ddbd02 5118 return default
7b2c3f47 5119 return template % func(val)
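
# Illustrative sketch with a made-up metadata dict: the template is only
# applied when the field has a useful value.
from yt_dlp.utils import format_field

info = {'view_count': 1234}
format_field(info, 'view_count', '%s views')  # -> '1234 views'
format_field(info, 'like_count', '%s likes')  # -> '' (missing, so the default is returned)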
00dd0cd5 5120
5121
5122def clean_podcast_url(url):
5123 return re.sub(r'''(?x)
5124 (?:
5125 (?:
5126 chtbl\.com/track|
5127 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5128 play\.podtrac\.com
5129 )/[^/]+|
5130 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5131 flex\.acast\.com|
5132 pd(?:
5133 cn\.co| # https://podcorn.com/analytics-prefix/
5134 st\.fm # https://podsights.com/docs/
5135 )/e
5136 )/''', '', url)
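
# Illustrative sketch (fabricated URL): the tracking prefix is stripped,
# leaving the direct media URL.
from yt_dlp.utils import clean_podcast_url

clean_podcast_url('https://chtbl.com/track/ABC123/media.example.com/ep1.mp3')
# -> 'https://media.example.com/ep1.mp3'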
ffcb8191 5137
5138
5139_HEX_TABLE = '0123456789abcdef'
5140
5141
5142def random_uuidv4():
5143 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 5144
5145
5146def make_dir(path, to_screen=None):
5147 try:
5148 dn = os.path.dirname(path)
5149 if dn and not os.path.exists(dn):
5150 os.makedirs(dn)
5151 return True
86e5f3ed 5152 except OSError as err:
0202b52a 5153 if callable(to_screen):
5154 to_screen('unable to create directory ' + error_to_compat_str(err))
5155 return False
f74980cb 5156
5157
5158def get_executable_path():
b5899f4f 5159 from .update import _get_variant_and_executable_path
c487cf00 5160
b5899f4f 5161 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
f74980cb 5162
5163
2f567473 5164def load_plugins(name, suffix, namespace):
3ae5e797 5165 classes = {}
19a03940 5166 with contextlib.suppress(FileNotFoundError):
019a94f7 5167 plugins_spec = importlib.util.spec_from_file_location(
5168 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5169 plugins = importlib.util.module_from_spec(plugins_spec)
5170 sys.modules[plugins_spec.name] = plugins
5171 plugins_spec.loader.exec_module(plugins)
f74980cb 5172 for name in dir(plugins):
2f567473 5173 if name in namespace:
5174 continue
5175 if not name.endswith(suffix):
f74980cb 5176 continue
5177 klass = getattr(plugins, name)
3ae5e797 5178 classes[name] = namespace[name] = klass
f74980cb 5179 return classes
06167fbb 5180
5181
325ebc17 5182def traverse_obj(
352d63fd 5183 obj, *path_list, default=None, expected_type=None, get_all=True,
325ebc17 5184 casesense=True, is_user_input=False, traverse_string=False):
324ad820 5185 ''' Traverse nested list/dict/tuple
8f334380 5186 @param path_list A list of paths which are checked one by one.
19a03940 5187 Each path is a list of keys where each key is a:
5188 - None: Do nothing
5189 - string: A dictionary key
5190 - int: An index into a list
5191 - tuple: A list of keys all of which will be traversed
5192 - Ellipsis: Fetch all values in the object
5193 - Function: Takes the key and value as arguments
5194 and returns whether the key matches or not
325ebc17 5195 @param default Default value to return
352d63fd 5196 @param expected_type Only accept final value of this type (Can also be any callable)
5197 @param get_all Return all the values obtained from a path or only the first one
324ad820 5198 @param casesense Whether to consider dictionary keys as case sensitive
5199 @param is_user_input Whether the keys are generated from user input. If True,
5200 strings are converted to int/slice if necessary
5201 @param traverse_string Whether to traverse inside strings. If True, any
5202 non-compatible object will also be converted into a string
8f334380 5203 # TODO: Write tests
324ad820 5204 '''
325ebc17 5205 if not casesense:
dbf5416a 5206 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
8f334380 5207 path_list = (map(_lower, variadic(path)) for path in path_list)
5208
5209 def _traverse_obj(obj, path, _current_depth=0):
5210 nonlocal depth
5211 path = tuple(variadic(path))
5212 for i, key in enumerate(path):
1797b073 5213 if None in (key, obj):
5214 return obj
8f334380 5215 if isinstance(key, (list, tuple)):
5216 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5217 key = ...
5218 if key is ...:
5219 obj = (obj.values() if isinstance(obj, dict)
5220 else obj if isinstance(obj, (list, tuple, LazyList))
5221 else str(obj) if traverse_string else [])
5222 _current_depth += 1
5223 depth = max(depth, _current_depth)
5224 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
2614f646 5225 elif callable(key):
5226 if isinstance(obj, (list, tuple, LazyList)):
5227 obj = enumerate(obj)
5228 elif isinstance(obj, dict):
5229 obj = obj.items()
5230 else:
5231 if not traverse_string:
5232 return None
5233 obj = str(obj)
5234 _current_depth += 1
5235 depth = max(depth, _current_depth)
e6f868a6 5236 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
575e17a1 5237 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
325ebc17 5238 obj = (obj.get(key) if casesense or (key in obj)
5239 else next((v for k, v in obj.items() if _lower(k) == key), None))
5240 else:
5241 if is_user_input:
5242 key = (int_or_none(key) if ':' not in key
5243 else slice(*map(int_or_none, key.split(':'))))
8f334380 5244 if key == slice(None):
575e17a1 5245 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
325ebc17 5246 if not isinstance(key, (int, slice)):
9fea350f 5247 return None
8f334380 5248 if not isinstance(obj, (list, tuple, LazyList)):
325ebc17 5249 if not traverse_string:
5250 return None
5251 obj = str(obj)
5252 try:
5253 obj = obj[key]
5254 except IndexError:
324ad820 5255 return None
325ebc17 5256 return obj
5257
352d63fd 5258 if isinstance(expected_type, type):
5259 type_test = lambda val: val if isinstance(val, expected_type) else None
352d63fd 5260 else:
7b2c3f47 5261 type_test = expected_type or IDENTITY
352d63fd 5262
8f334380 5263 for path in path_list:
5264 depth = 0
5265 val = _traverse_obj(obj, path)
325ebc17 5266 if val is not None:
8f334380 5267 if depth:
5268 for _ in range(depth - 1):
6586bca9 5269 val = itertools.chain.from_iterable(v for v in val if v is not None)
352d63fd 5270 val = [v for v in map(type_test, val) if v is not None]
8f334380 5271 if val:
352d63fd 5272 return val if get_all else val[0]
5273 else:
5274 val = type_test(val)
5275 if val is not None:
8f334380 5276 return val
325ebc17 5277 return default
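
# Illustrative sketch of the path syntax described in the docstring above,
# using a made-up nested structure:
from yt_dlp.utils import traverse_obj

data = {'playlist': {'entries': [{'id': 'a', 'view_count': 10}, {'id': 'b'}]}}
traverse_obj(data, ('playlist', 'entries', 0, 'id'))              # -> 'a'
traverse_obj(data, ('playlist', 'entries', ..., 'view_count'))    # -> [10]
traverse_obj(data, ('playlist', 'entries', 1, 'view_count'), default=0)  # -> 0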
324ad820 5278
5279
5280def traverse_dict(dictn, keys, casesense=True):
ee8dd27a 5281 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5282 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5283 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5284
5285
ff91cf74 5286def get_first(obj, keys, **kwargs):
5287 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5288
5289
4b4b7f74 5290def variadic(x, allowed_types=(str, bytes, dict)):
cb89cfc1 5291 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
bd50a52b 5292
5293
3e9b66d7 5294def time_seconds(**kwargs):
5295 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5296 return t.timestamp()
5297
5298
49fa4d9a 5299# create a JSON Web Signature (jws) with HS256 algorithm
5300# the resulting format is in JWS Compact Serialization
5301# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5302# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5303def jwt_encode_hs256(payload_data, key, headers={}):
5304 header_data = {
5305 'alg': 'HS256',
5306 'typ': 'JWT',
5307 }
5308 if headers:
5309 header_data.update(headers)
0f06bcd7 5310 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5311 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5312 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
49fa4d9a 5313 signature_b64 = base64.b64encode(h.digest())
5314 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5315 return token
819e0531 5316
5317
16b0d7e6 5318# can be extended in the future to verify the signature, parse the header and return the algorithm used if it's not HS256
5319def jwt_decode_hs256(jwt):
5320 header_b64, payload_b64, signature_b64 = jwt.split('.')
5321 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5322 return payload_data
5323
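
# Illustrative round-trip sketch; the key and payload are placeholders. Note
# that jwt_encode_hs256() returns bytes while jwt_decode_hs256() expects str.
from yt_dlp.utils import jwt_decode_hs256, jwt_encode_hs256

token = jwt_encode_hs256({'uid': 42}, 'secret-key')
assert jwt_decode_hs256(token.decode())['uid'] == 42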
5324
53973b4d 5325WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5326
5327
0b9c08b4 5328@functools.cache
819e0531 5329def supports_terminal_sequences(stream):
5330 if compat_os_name == 'nt':
8a82af35 5331 if not WINDOWS_VT_MODE:
819e0531 5332 return False
5333 elif not os.getenv('TERM'):
5334 return False
5335 try:
5336 return stream.isatty()
5337 except BaseException:
5338 return False
5339
5340
53973b4d 5341def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
8a82af35 5342 if get_windows_version() < (10, 0, 10586):
53973b4d 5343 return
5344 global WINDOWS_VT_MODE
53973b4d 5345 try:
f0c9fb96 5346 Popen.run('', shell=True)
53973b4d 5347 except Exception:
5348 return
5349
5350 WINDOWS_VT_MODE = True
5351 supports_terminal_sequences.cache_clear()
5352
5353
ec11a9f4 5354_terminal_sequences_re = re.compile('\033\\[[^m]+m')
5355
5356
5357def remove_terminal_sequences(string):
5358 return _terminal_sequences_re.sub('', string)
5359
5360
5361def number_of_digits(number):
5362 return len('%d' % number)
34921b43 5363
5364
5365def join_nonempty(*values, delim='-', from_dict=None):
5366 if from_dict is not None:
7b2c3f47 5367 values = (traverse_obj(from_dict, variadic(v)) for v in values)
34921b43 5368 return delim.join(map(str, filter(None, values)))
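
# Illustrative sketch: falsy values are dropped before joining.
from yt_dlp.utils import join_nonempty

join_nonempty('mp4', None, 1080, '', 'dash')  # -> 'mp4-1080-dash'
join_nonempty('en', 'US', delim='_')          # -> 'en_US'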
06e57990 5369
5370
27231526 5371def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5372 """
5373 Find the largest format dimensions in terms of video width and, for each thumbnail:
5374 * Modify the URL: Match the width with the provided regex and replace with the former width
5375 * Update dimensions
5376
5377 This function is useful with video services that scale the provided thumbnails on demand
5378 """
5379 _keys = ('width', 'height')
5380 max_dimensions = max(
86e5f3ed 5381 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
27231526 5382 default=(0, 0))
5383 if not max_dimensions[0]:
5384 return thumbnails
5385 return [
5386 merge_dicts(
5387 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5388 dict(zip(_keys, max_dimensions)), thumbnail)
5389 for thumbnail in thumbnails
5390 ]
5391
5392
93c8410d 5393def parse_http_range(range):
5394 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5395 if not range:
5396 return None, None, None
5397 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5398 if not crg:
5399 return None, None, None
5400 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
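
# Illustrative sketch: works for both request ("Range") and response
# ("Content-Range") headers; missing parts come back as None.
from yt_dlp.utils import parse_http_range

parse_http_range('bytes 0-499/1234')  # -> (0, 499, 1234)
parse_http_range('bytes=500-')        # -> (500, None, None)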
5401
5402
6b9e832d 5403def read_stdin(what):
5404 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5405 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5406 return sys.stdin
5407
5408
a904a7f8 5409def determine_file_encoding(data):
5410 """
88f60feb 5411 Detect the text encoding used
a904a7f8 5412 @returns (encoding, bytes to skip)
5413 """
5414
88f60feb 5415 # BOM marks are given priority over declarations
a904a7f8 5416 for bom, enc in BOMS:
a904a7f8 5417 if data.startswith(bom):
5418 return enc, len(bom)
5419
88f60feb 5420 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5421 # We ignore the endianness to get a good enough match
a904a7f8 5422 data = data.replace(b'\0', b'')
88f60feb 5423 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5424 return mobj.group(1).decode() if mobj else None, 0
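
# Illustrative sketch: sniffing a config file's encoding before decoding it,
# much like Config.read_file below (the file name and the utf-8 fallback are
# assumptions for the example).
from yt_dlp.utils import determine_file_encoding

with open('yt-dlp.conf', 'rb') as f:
    enc, skip = determine_file_encoding(f.read(512))
    f.seek(skip)
    text = f.read().decode(enc or 'utf-8')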
a904a7f8 5425
5426
06e57990 5427class Config:
5428 own_args = None
9e491463 5429 parsed_args = None
06e57990 5430 filename = None
5431 __initialized = False
5432
5433 def __init__(self, parser, label=None):
9e491463 5434 self.parser, self.label = parser, label
06e57990 5435 self._loaded_paths, self.configs = set(), []
5436
5437 def init(self, args=None, filename=None):
5438 assert not self.__initialized
284a60c5 5439 self.own_args, self.filename = args, filename
5440 return self.load_configs()
5441
5442 def load_configs(self):
65662dff 5443 directory = ''
284a60c5 5444 if self.filename:
5445 location = os.path.realpath(self.filename)
65662dff 5446 directory = os.path.dirname(location)
06e57990 5447 if location in self._loaded_paths:
5448 return False
5449 self._loaded_paths.add(location)
5450
284a60c5 5451 self.__initialized = True
5452 opts, _ = self.parser.parse_known_args(self.own_args)
5453 self.parsed_args = self.own_args
9e491463 5454 for location in opts.config_locations or []:
6b9e832d 5455 if location == '-':
5456 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5457 continue
65662dff 5458 location = os.path.join(directory, expand_path(location))
06e57990 5459 if os.path.isdir(location):
5460 location = os.path.join(location, 'yt-dlp.conf')
5461 if not os.path.exists(location):
9e491463 5462 self.parser.error(f'config location {location} does not exist')
06e57990 5463 self.append_config(self.read_file(location), location)
5464 return True
5465
5466 def __str__(self):
5467 label = join_nonempty(
5468 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5469 delim=' ')
5470 return join_nonempty(
5471 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5472 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5473 delim='\n')
5474
5475 @staticmethod
5476 def read_file(filename, default=[]):
5477 try:
a904a7f8 5478 optionf = open(filename, 'rb')
86e5f3ed 5479 except OSError:
06e57990 5480 return default # silently skip if file is not present
a904a7f8 5481 try:
5482 enc, skip = determine_file_encoding(optionf.read(512))
5483 optionf.seek(skip, io.SEEK_SET)
5484 except OSError:
5485 enc = None # silently skip read errors
06e57990 5486 try:
5487 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
a904a7f8 5488 contents = optionf.read().decode(enc or preferredencoding())
f9934b96 5489 res = shlex.split(contents, comments=True)
44a6fcff 5490 except Exception as err:
5491 raise ValueError(f'Unable to parse "{filename}": {err}')
06e57990 5492 finally:
5493 optionf.close()
5494 return res
5495
5496 @staticmethod
5497 def hide_login_info(opts):
86e5f3ed 5498 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
06e57990 5499 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5500
5501 def _scrub_eq(o):
5502 m = eqre.match(o)
5503 if m:
5504 return m.group('key') + '=PRIVATE'
5505 else:
5506 return o
5507
5508 opts = list(map(_scrub_eq, opts))
5509 for idx, opt in enumerate(opts):
5510 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5511 opts[idx + 1] = 'PRIVATE'
5512 return opts
5513
5514 def append_config(self, *args, label=None):
9e491463 5515 config = type(self)(self.parser, label)
06e57990 5516 config._loaded_paths = self._loaded_paths
5517 if config.init(*args):
5518 self.configs.append(config)
5519
5520 @property
5521 def all_args(self):
5522 for config in reversed(self.configs):
5523 yield from config.all_args
9e491463 5524 yield from self.parsed_args or []
5525
5526 def parse_known_args(self, **kwargs):
5527 return self.parser.parse_known_args(self.all_args, **kwargs)
06e57990 5528
5529 def parse_args(self):
9e491463 5530 return self.parser.parse_args(self.all_args)
da42679b 5531
5532
5533class WebSocketsWrapper:
5534 """Wraps websockets module to use in non-async scopes"""
abfecb7b 5535 pool = None
da42679b 5536
3cea3edd 5537 def __init__(self, url, headers=None, connect=True):
059bc4db 5538 self.loop = asyncio.new_event_loop()
9cd08050 5539 # XXX: "loop" is deprecated
5540 self.conn = websockets.connect(
5541 url, extra_headers=headers, ping_interval=None,
5542 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
3cea3edd 5543 if connect:
5544 self.__enter__()
15dfb392 5545 atexit.register(self.__exit__, None, None, None)
da42679b 5546
5547 def __enter__(self):
3cea3edd 5548 if not self.pool:
9cd08050 5549 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
da42679b 5550 return self
5551
5552 def send(self, *args):
5553 self.run_with_loop(self.pool.send(*args), self.loop)
5554
5555 def recv(self, *args):
5556 return self.run_with_loop(self.pool.recv(*args), self.loop)
5557
5558 def __exit__(self, type, value, traceback):
5559 try:
5560 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5561 finally:
5562 self.loop.close()
15dfb392 5563 self._cancel_all_tasks(self.loop)
da42679b 5564
5565 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5566 # for contributors: if any new library that uses asyncio needs to be run in non-async code, move these functions out of this class
5567 @staticmethod
5568 def run_with_loop(main, loop):
059bc4db 5569 if not asyncio.iscoroutine(main):
da42679b 5570 raise ValueError(f'a coroutine was expected, got {main!r}')
5571
5572 try:
5573 return loop.run_until_complete(main)
5574 finally:
5575 loop.run_until_complete(loop.shutdown_asyncgens())
5576 if hasattr(loop, 'shutdown_default_executor'):
5577 loop.run_until_complete(loop.shutdown_default_executor())
5578
5579 @staticmethod
5580 def _cancel_all_tasks(loop):
059bc4db 5581 to_cancel = asyncio.all_tasks(loop)
da42679b 5582
5583 if not to_cancel:
5584 return
5585
5586 for task in to_cancel:
5587 task.cancel()
5588
9cd08050 5589 # XXX: "loop" is removed in python 3.10+
da42679b 5590 loop.run_until_complete(
059bc4db 5591 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
da42679b 5592
5593 for task in to_cancel:
5594 if task.cancelled():
5595 continue
5596 if task.exception() is not None:
5597 loop.call_exception_handler({
5598 'message': 'unhandled exception during asyncio.run() shutdown',
5599 'exception': task.exception(),
5600 'task': task,
5601 })
5602
5603
8b7539d2 5604def merge_headers(*dicts):
08d30158 5605 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
76aa9913 5606 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
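
# Illustrative sketch: keys are normalised to Title-Case, so differently-cased
# duplicates collapse and later dicts win.
from yt_dlp.utils import merge_headers

merge_headers({'user-agent': 'A', 'Accept': '*/*'}, {'User-Agent': 'B'})
# -> {'User-Agent': 'B', 'Accept': '*/*'}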
28787f16 5607
5608
b1f94422 5609def cached_method(f):
5610 """Cache a method"""
5611 signature = inspect.signature(f)
5612
5613 @functools.wraps(f)
5614 def wrapper(self, *args, **kwargs):
5615 bound_args = signature.bind(self, *args, **kwargs)
5616 bound_args.apply_defaults()
5617 key = tuple(bound_args.arguments.values())
5618
5619 if not hasattr(self, '__cached_method__cache'):
5620 self.__cached_method__cache = {}
5621 cache = self.__cached_method__cache.setdefault(f.__name__, {})
5622 if key not in cache:
5623 cache[key] = f(self, *args, **kwargs)
5624 return cache[key]
5625 return wrapper
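
# Illustrative sketch with a made-up class: the decorated method runs once per
# distinct argument tuple per instance, after which the cached value is reused.
from yt_dlp.utils import cached_method

class Fetcher:
    calls = 0

    @cached_method
    def fetch(self, url):
        self.calls += 1  # stand-in for expensive work
        return f'page for {url}'

fetcher = Fetcher()
fetcher.fetch('https://example.com')
fetcher.fetch('https://example.com')
assert fetcher.calls == 1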
5626
5627
28787f16 5628class classproperty:
b1f94422 5629 """property access for class methods"""
c487cf00 5630
5631 def __init__(self, func):
5632 functools.update_wrapper(self, func)
5633 self.func = func
28787f16 5634
5635 def __get__(self, _, cls):
c487cf00 5636 return self.func(cls)
19a03940 5637
5638
64fa820c 5639class Namespace(types.SimpleNamespace):
591bb9d3 5640 """Immutable namespace"""
591bb9d3 5641
7896214c 5642 def __iter__(self):
64fa820c 5643 return iter(self.__dict__.values())
7896214c 5644
64fa820c 5645 @property
5646 def items_(self):
5647 return self.__dict__.items()
9b8ee23b 5648
5649
5650# Deprecated
5651has_certifi = bool(certifi)
5652has_websockets = bool(websockets)