import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import importlib.util
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
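

# Usage sketch (illustrative only, not part of the original module): how the xpath
# helpers above are typically combined with compat_etree_fromstring.
# `_demo_xpath_helpers` is a hypothetical helper added for this example.
def _demo_xpath_helpers():
    doc = compat_etree_fromstring('<root><media url="http://example.com/v.mp4">clip</media></root>')
    assert xpath_text(doc, 'media') == 'clip'
    assert xpath_attr(doc, 'media', 'url') == 'http://example.com/v.mp4'
    assert xpath_text(doc, 'missing', default=None) is None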


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document as a list"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document as a list"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
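

# Usage sketch (illustrative only, not part of the original module): the class/attribute
# helpers above return either the inner text or the whole element.
# `_demo_html_helpers` is a hypothetical helper added for this example.
def _demo_html_helpers():
    page = '<div class="title main">Hello &amp; welcome</div>'
    assert get_element_by_class('title', page) == 'Hello & welcome'
    assert get_element_html_by_class('title', page) == page
    assert get_element_by_attribute('class', 'title main', page) == 'Hello & welcome'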


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')


class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
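

# Usage sketch (illustrative only, not part of the original module): extract_attributes()
# on a typical tag. `_demo_extract_attributes` is a hypothetical helper added for this example.
def _demo_extract_attributes():
    attrs = extract_attributes('<a href="https://example.com/v?id=1&amp;hd=1" data-id=42 hidden>')
    assert attrs == {'href': 'https://example.com/v?id=1&hd=1', 'data-id': '42', 'hidden': None}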


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
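

# Usage sketch (illustrative only, not part of the original module): clean_html()
# collapses whitespace, converts <br> to newlines and strips the remaining markup.
# `_demo_clean_html` is a hypothetical helper added for this example.
def _demo_clean_html():
    assert clean_html('<p>first   line<br/>second &amp; last</p>') == 'first line\nsecond & last'
    assert clean_html(None) is None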


class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            return self.raw_decode(s.lstrip())[0]
        return super().decode(s)
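

# Usage sketch (illustrative only, not part of the original module): LenientJSONDecoder
# can pre-process the input and tolerate trailing garbage after the JSON value.
# `_demo_lenient_json` is a hypothetical helper added for this example.
def _demo_lenient_json():
    assert json.loads('{"a": 1});', cls=LenientJSONDecoder, ignore_extra=True) == {'a': 1}
    assert json.loads('var x = {"a": 1}', cls=LenientJSONDecoder,
                      transform_source=lambda s: s.partition('=')[2]) == {'a': 1}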


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream. Eg, when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
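

# Usage sketch (illustrative only, not part of the original module): timeconvert()
# turns an RFC 2822 date into a Unix timestamp and returns None for unparsable input.
# `_demo_timeconvert` is a hypothetical helper added for this example.
def _demo_timeconvert():
    assert timeconvert('Sun, 06 Nov 1994 08:49:37 GMT') == 784111777
    assert timeconvert('not a date') is None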


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
        result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
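

# Usage sketch (illustrative only, not part of the original module): sanitize_filename()
# replaces characters that are unsafe in file names. `_demo_sanitize_filename` is a
# hypothetical helper added for this example.
def _demo_sanitize_filename():
    assert sanitize_filename('New World record at 0:12:34') == 'New World record at 0_12_34'
    assert sanitize_filename('a/b\\c') == 'a_b_c'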


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
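

# Usage sketch (illustrative only, not part of the original module): sanitize_url()
# adds a scheme to protocol-relative URLs and fixes a few known typos.
# `_demo_sanitize_url` is a hypothetical helper added for this example.
def _demo_sanitize_url():
    assert sanitize_url('//example.com/video') == 'http://example.com/video'
    assert sanitize_url('//example.com/video', scheme='https') == 'https://example.com/video'
    assert sanitize_url('httpss://example.com') == 'https://example.com'
    assert sanitize_url('rmtpe://media.example.com/live') == 'rtmpe://media.example.com/live'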


def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
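

# Usage sketch (illustrative only, not part of the original module): extract_basic_auth()
# moves credentials embedded in the URL into a Basic auth header value.
# `_demo_extract_basic_auth` is a hypothetical helper added for this example.
def _demo_extract_basic_auth():
    assert extract_basic_auth('http://example.com/feed') == ('http://example.com/feed', None)
    assert extract_basic_auth('http://user:pass@example.com/feed') == (
        'http://example.com/feed', 'Basic dXNlcjpwYXNz')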


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())
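

# Usage sketch (illustrative only, not part of the original module): orderedSet()
# removes duplicates while preserving order; with lazy=True it returns a generator.
# `_demo_ordered_set` is a hypothetical helper added for this example.
def _demo_ordered_set():
    assert orderedSet([1, 2, 1, 3, 2]) == [1, 2, 3]
    assert list(orderedSet(['b', 'a', 'b'], lazy=True)) == ['b', 'a']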


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
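

# Usage sketch (illustrative only, not part of the original module): unescapeHTML()
# resolves named and numeric entities; unknown entities are left untouched.
# `_demo_unescape_html` is a hypothetical helper added for this example.
def _demo_unescape_html():
    assert unescapeHTML('&amp;') == '&'
    assert unescapeHTML('&#39;') == "'"
    assert unescapeHTML('&#x61;&eacute;') == 'aé'
    assert unescapeHTML('&nosuchentity;') == '&nosuchentity;'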


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
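

# Usage sketch (illustrative only, not part of the original module): the internal
# `Youtubedl-no-compression` marker header disables Accept-Encoding and is stripped.
# `_demo_handle_youtubedl_headers` is a hypothetical helper added for this example.
def _demo_handle_youtubedl_headers():
    headers = {'User-Agent': 'UA', 'Accept-Encoding': 'gzip', 'Youtubedl-no-compression': '1'}
    assert handle_youtubedl_headers(headers) == {'User-Agent': 'UA'}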


class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
1582 # (see [1]). So we need to force the latter to be recognized as session
1583 # cookies on our own.
1584 # Session cookies may be important for cookies-based authentication,
1585 # e.g. usually, when the user does not check the 'Remember me' box while
1586 # logging in on a site, some important cookies are stored as session
1587 # cookies, so not recognizing them will result in a failed login.
1588 # 1. https://bugs.python.org/issue17164
1589 for cookie in self:
1590 # Treat `expires=0` cookies as session cookies
1591 if cookie.expires == 0:
1592 cookie.expires = None
1593 cookie.discard = True
1594
1595
ac668111 1596class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
a6420bf5 1597 def __init__(self, cookiejar=None):
ac668111 1598 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
a6420bf5
S
1599
1600 def http_response(self, request, response):
ac668111 1601 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
a6420bf5 1602
ac668111 1603 https_request = urllib.request.HTTPCookieProcessor.http_request
a6420bf5
S
1604 https_response = http_response
1605
1606
ac668111 1607class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
201c1459 1608 """YoutubeDL redirect handler
1609
1610 The code is based on HTTPRedirectHandler implementation from CPython [1].
1611
1612 This redirect handler solves two issues:
1613 - ensures redirect URL is always unicode under python 2
1614 - introduces support for experimental HTTP response status code
1615 308 Permanent Redirect [2] used by some sites [3]
1616
1617 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1618 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1619 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1620 """
1621
ac668111 1622 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
201c1459 1623
1624 def redirect_request(self, req, fp, code, msg, headers, newurl):
1625 """Return a Request or None in response to a redirect.
1626
1627 This is called by the http_error_30x methods when a
1628 redirection response is received. If a redirection should
1629 take place, return a new Request to allow http_error_30x to
1630 perform the redirect. Otherwise, raise HTTPError if no-one
1631 else should try to handle this url. Return None if you can't
1632 but another Handler might.
1633 """
1634 m = req.get_method()
1635 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1636 or code in (301, 302, 303) and m == "POST")):
14f25df2 1637 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
201c1459 1638 # Strictly (according to RFC 2616), 301 or 302 in response to
1639 # a POST MUST NOT cause a redirection without confirmation
1640 # from the user (of urllib.request, in this case). In practice,
1641 # essentially all clients do redirect in this case, so we do
1642 # the same.
1643
201c1459 1644 # Be conciliant with URIs containing a space. This is mainly
1645 # redundant with the more complete encoding done in http_error_302(),
1646 # but it is kept for compatibility with other callers.
1647 newurl = newurl.replace(' ', '%20')
1648
1649 CONTENT_HEADERS = ("content-length", "content-type")
1650 # Strip the Content-* headers so they are recomputed for the redirected request
86e5f3ed 1651 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
afac4caa 1652
1653 # A 303 must either use GET or HEAD for subsequent request
1654 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1655 if code == 303 and m != 'HEAD':
1656 m = 'GET'
1657 # 301 and 302 redirects are commonly turned into a GET from a POST
1658 # for subsequent requests by browsers, so we'll do the same.
1659 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1660 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1661 if code in (301, 302) and m == 'POST':
1662 m = 'GET'
1663
ac668111 1664 return urllib.request.Request(
201c1459 1665 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
afac4caa 1666 unverifiable=True, method=m)
fca6dba8
S
1667
1668
46f59e89
S
1669def extract_timezone(date_str):
1670 m = re.search(
f137e4c2 1671 r'''(?x)
1672 ^.{8,}? # >=8 char non-TZ prefix, if present
1673 (?P<tz>Z| # just the UTC Z, or
1674 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1675 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1676 [ ]? # optional space
1677 (?P<sign>\+|-) # +/-
1678 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1679 $)
1680 ''', date_str)
46f59e89
S
1681 if not m:
1682 timezone = datetime.timedelta()
1683 else:
1684 date_str = date_str[:-len(m.group('tz'))]
1685 if not m.group('sign'):
1686 timezone = datetime.timedelta()
1687 else:
1688 sign = 1 if m.group('sign') == '+' else -1
1689 timezone = datetime.timedelta(
1690 hours=sign * int(m.group('hours')),
1691 minutes=sign * int(m.group('minutes')))
1692 return timezone, date_str
1693
1694
08b38d54 1695def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1696 """ Return a UNIX timestamp from the given date """
1697
1698 if date_str is None:
1699 return None
1700
52c3a6e4
S
1701 date_str = re.sub(r'\.[0-9]+', '', date_str)
1702
08b38d54 1703 if timezone is None:
46f59e89
S
1704 timezone, date_str = extract_timezone(date_str)
1705
19a03940 1706 with contextlib.suppress(ValueError):
86e5f3ed 1707 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
52c3a6e4
S
1708 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1709 return calendar.timegm(dt.timetuple())
912b38b4
PH
1710
1711
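# Illustrative usage sketch (editor's addition, not part of the original module):
# shows how parse_iso8601 combines extract_timezone with strptime to produce a
# UNIX timestamp. The sample inputs are assumptions chosen around the epoch so
# the expected values are easy to verify by hand.
def _parse_iso8601_example():
    assert parse_iso8601('1970-01-01T00:00:00Z') == 0
    # An explicit +02:00 offset is subtracted before the timestamp is computed
    assert parse_iso8601('1970-01-01T02:00:00+02:00') == 0
    # Fractional seconds are stripped, and a custom delimiter can be passed
    assert parse_iso8601('1970-01-01 00:01:00.500Z', delimiter=' ') == 60
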
46f59e89
S
1712def date_formats(day_first=True):
1713 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1714
1715
42bdd9d0 1716def unified_strdate(date_str, day_first=True):
bf50b038 1717 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1718
1719 if date_str is None:
1720 return None
bf50b038 1721 upload_date = None
5f6a1245 1722 # Replace commas
026fcc04 1723 date_str = date_str.replace(',', ' ')
42bdd9d0 1724 # Remove AM/PM + timezone
9bb8e0a3 1725 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1726 _, date_str = extract_timezone(date_str)
42bdd9d0 1727
46f59e89 1728 for expression in date_formats(day_first):
19a03940 1729 with contextlib.suppress(ValueError):
bf50b038 1730 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
42393ce2
PH
1731 if upload_date is None:
1732 timetuple = email.utils.parsedate_tz(date_str)
1733 if timetuple:
19a03940 1734 with contextlib.suppress(ValueError):
c6b9cf05 1735 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
6a750402 1736 if upload_date is not None:
14f25df2 1737 return str(upload_date)
bf50b038 1738
5f6a1245 1739
46f59e89
S
1740def unified_timestamp(date_str, day_first=True):
1741 if date_str is None:
1742 return None
1743
2ae2ffda 1744 date_str = re.sub(r'[,|]', '', date_str)
46f59e89 1745
7dc2a74e 1746 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1747 timezone, date_str = extract_timezone(date_str)
1748
1749 # Remove AM/PM + timezone
1750 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1751
deef3195
S
1752 # Remove unrecognized timezones from ISO 8601-like timestamps
1753 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1754 if m:
1755 date_str = date_str[:-len(m.group('tz'))]
1756
f226880c
PH
1757 # Python only supports microseconds, so remove nanoseconds
1758 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1759 if m:
1760 date_str = m.group(1)
1761
46f59e89 1762 for expression in date_formats(day_first):
19a03940 1763 with contextlib.suppress(ValueError):
7dc2a74e 1764 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89 1765 return calendar.timegm(dt.timetuple())
46f59e89
S
1766 timetuple = email.utils.parsedate_tz(date_str)
1767 if timetuple:
7dc2a74e 1768 return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1769
1770
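# Illustrative usage sketch (editor's addition, not part of the original module):
# unified_strdate normalizes many date spellings to YYYYMMDD and
# unified_timestamp returns a UNIX timestamp. Both depend on the module-level
# DATE_FORMATS tables defined elsewhere in this file; the samples below assume
# those tables cover the plain ISO and "Month DD, YYYY" spellings.
def _unified_date_example():
    assert unified_strdate('2014-12-11') == '20141211'
    assert unified_strdate('December 21, 2010') == '20101221'
    assert unified_timestamp('1970-01-01 00:01:00') == 60
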
28e614de 1771def determine_ext(url, default_ext='unknown_video'):
85750f89 1772 if url is None or '.' not in url:
f4776371 1773 return default_ext
9cb9a5df 1774 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1775 if re.match(r'^[A-Za-z0-9]+$', guess):
1776 return guess
a7aaa398
S
1777 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1778 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1779 return guess.rstrip('/')
73e79f2a 1780 else:
cbdbb766 1781 return default_ext
73e79f2a 1782
5f6a1245 1783
824fa511
S
1784def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1785 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
d4051a8e 1786
5f6a1245 1787
9e62f283 1788def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
3d38b2d6 1789 R"""
1790 Return a datetime object from a string.
1791 Supported format:
1792 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1793
1794 @param format strftime format of DATE
1795 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1796 auto: round to the unit provided in date_str (if applicable).
9e62f283 1797 """
1798 auto_precision = False
1799 if precision == 'auto':
1800 auto_precision = True
1801 precision = 'microsecond'
396a76f7 1802 today = datetime_round(datetime.datetime.utcnow(), precision)
f8795e10 1803 if date_str in ('now', 'today'):
37254abc 1804 return today
f8795e10
PH
1805 if date_str == 'yesterday':
1806 return today - datetime.timedelta(days=1)
9e62f283 1807 match = re.match(
3d38b2d6 1808 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
9e62f283 1809 date_str)
37254abc 1810 if match is not None:
9e62f283 1811 start_time = datetime_from_str(match.group('start'), precision, format)
1812 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
37254abc 1813 unit = match.group('unit')
9e62f283 1814 if unit == 'month' or unit == 'year':
1815 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
37254abc 1816 unit = 'day'
9e62f283 1817 else:
1818 if unit == 'week':
1819 unit = 'day'
1820 time *= 7
1821 delta = datetime.timedelta(**{unit + 's': time})
1822 new_date = start_time + delta
1823 if auto_precision:
1824 return datetime_round(new_date, unit)
1825 return new_date
1826
1827 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1828
1829
d49f8db3 1830def date_from_str(date_str, format='%Y%m%d', strict=False):
3d38b2d6 1831 R"""
1832 Return a date object from a string using datetime_from_str
9e62f283 1833
3d38b2d6 1834 @param strict Restrict allowed patterns to "YYYYMMDD" and
1835 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
9e62f283 1836 """
3d38b2d6 1837 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1838 raise ValueError(f'Invalid date format "{date_str}"')
9e62f283 1839 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1840
1841
1842def datetime_add_months(dt, months):
1843 """Increment/Decrement a datetime object by months."""
1844 month = dt.month + months - 1
1845 year = dt.year + month // 12
1846 month = month % 12 + 1
1847 day = min(dt.day, calendar.monthrange(year, month)[1])
1848 return dt.replace(year, month, day)
1849
1850
1851def datetime_round(dt, precision='day'):
1852 """
1853 Round a datetime object's time to a specific precision
1854 """
1855 if precision == 'microsecond':
1856 return dt
1857
1858 unit_seconds = {
1859 'day': 86400,
1860 'hour': 3600,
1861 'minute': 60,
1862 'second': 1,
1863 }
1864 roundto = lambda x, n: ((x + n / 2) // n) * n
1865 timestamp = calendar.timegm(dt.timetuple())
1866 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
5f6a1245
JW
1867
1868
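# Illustrative usage sketch (editor's addition): the relative-date helpers above
# accept expressions like 'now-1week' or 'DATE+2days'. Fixed dates are used here
# so the expected values are deterministic.
def _relative_date_example():
    assert datetime_from_str('19700101+2days', precision='day') == datetime.datetime(1970, 1, 3)
    assert date_from_str('19700101') == datetime.date(1970, 1, 1)
    # Month arithmetic clamps the day to the target month's length (2020 is a leap year)
    assert datetime_add_months(datetime.datetime(2020, 1, 31), 1) == datetime.datetime(2020, 2, 29)
    # Rounding to 'day' rounds half up: 13:00 rounds to the next midnight
    assert datetime_round(datetime.datetime(2020, 1, 1, 13), 'day') == datetime.datetime(2020, 1, 2)
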
e63fc1be 1869def hyphenate_date(date_str):
1870 """
1871 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1872 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1873 if match is not None:
1874 return '-'.join(match.groups())
1875 else:
1876 return date_str
1877
5f6a1245 1878
86e5f3ed 1879class DateRange:
bd558525 1880 """Represents a time interval between two dates"""
5f6a1245 1881
bd558525
JMF
1882 def __init__(self, start=None, end=None):
1883 """start and end must be strings in the format accepted by date"""
1884 if start is not None:
d49f8db3 1885 self.start = date_from_str(start, strict=True)
bd558525
JMF
1886 else:
1887 self.start = datetime.datetime.min.date()
1888 if end is not None:
d49f8db3 1889 self.end = date_from_str(end, strict=True)
bd558525
JMF
1890 else:
1891 self.end = datetime.datetime.max.date()
37254abc 1892 if self.start > self.end:
bd558525 1893 raise ValueError('Date range: "%s", the start date must be before the end date' % self)
5f6a1245 1894
bd558525
JMF
1895 @classmethod
1896 def day(cls, day):
1897 """Returns a range that only contains the given day"""
5f6a1245
JW
1898 return cls(day, day)
1899
bd558525
JMF
1900 def __contains__(self, date):
1901 """Check if the date is in the range"""
37254abc
JMF
1902 if not isinstance(date, datetime.date):
1903 date = date_from_str(date)
1904 return self.start <= date <= self.end
5f6a1245 1905
bd558525 1906 def __str__(self):
86e5f3ed 1907 return f'{self.start.isoformat()} - {self.end.isoformat()}'
c496ca96 1908
f2df4071 1909 def __eq__(self, other):
1910 return (isinstance(other, DateRange)
1911 and self.start == other.start and self.end == other.end)
1912
c496ca96
PH
1913
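# Illustrative usage sketch (editor's addition): DateRange accepts the same date
# strings as date_from_str and supports containment checks with either date
# objects or strings.
def _date_range_example():
    march_2020 = DateRange('20200301', '20200331')
    assert '20200315' in march_2020
    assert datetime.date(2020, 4, 1) not in march_2020
    assert DateRange.day('20200301') == DateRange('20200301', '20200301')
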
1914def platform_name():
14f25df2 1915 """ Returns the platform name as a str """
b1f94422 1916 write_string('DeprecationWarning: yt_dlp.utils.platform_name is deprecated, use platform.platform instead')
1917 return platform.platform()
c496ca96 1918
b1f94422 1919
1920@functools.cache
1921def system_identifier():
1922 python_implementation = platform.python_implementation()
1923 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1924 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
1925
1926 return 'Python %s (%s %s) - %s %s' % (
1927 platform.python_version(),
1928 python_implementation,
1929 platform.architecture()[0],
1930 platform.platform(),
1931 format_field(join_nonempty(*platform.libc_ver(), delim=' '), None, '(%s)'),
1932 )
c257baff
PH
1933
1934
0b9c08b4 1935@functools.cache
49fa4d9a 1936def get_windows_version():
8a82af35 1937 ''' Get Windows version. Returns () if not running on Windows '''
49fa4d9a
N
1938 if compat_os_name == 'nt':
1939 return version_tuple(platform.win32_ver()[1])
1940 else:
8a82af35 1941 return ()
49fa4d9a
N
1942
1943
734f90bb 1944def write_string(s, out=None, encoding=None):
19a03940 1945 assert isinstance(s, str)
1946 out = out or sys.stderr
7459e3a2 1947
fe1daad3 1948 if compat_os_name == 'nt' and supports_terminal_sequences(out):
3fe75fdc 1949 s = re.sub(r'([\r\n]+)', r' \1', s)
59f943cd 1950
8a82af35 1951 enc, buffer = None, out
cfb0511d 1952 if 'b' in getattr(out, 'mode', ''):
c487cf00 1953 enc = encoding or preferredencoding()
104aa738 1954 elif hasattr(out, 'buffer'):
8a82af35 1955 buffer = out.buffer
104aa738 1956 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
c487cf00 1957
8a82af35 1958 buffer.write(s.encode(enc, 'ignore') if enc else s)
7459e3a2
PH
1959 out.flush()
1960
1961
48ea9cea
PH
1962def bytes_to_intlist(bs):
1963 if not bs:
1964 return []
1965 if isinstance(bs[0], int): # Python 3
1966 return list(bs)
1967 else:
1968 return [ord(c) for c in bs]
1969
c257baff 1970
cba892fa 1971def intlist_to_bytes(xs):
1972 if not xs:
1973 return b''
ac668111 1974 return struct.pack('%dB' % len(xs), *xs)
c38b1e77
PH
1975
1976
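# Illustrative usage sketch (editor's addition): bytes_to_intlist and
# intlist_to_bytes convert between bytes and lists of ints and round-trip
# losslessly.
def _intlist_example():
    assert bytes_to_intlist(b'\x00\x01\xff') == [0, 1, 255]
    assert intlist_to_bytes([0, 1, 255]) == b'\x00\x01\xff'
    assert intlist_to_bytes(bytes_to_intlist(b'yt-dlp')) == b'yt-dlp'
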
8a82af35 1977class LockingUnsupportedError(OSError):
1890fc63 1978 msg = 'File locking is not supported'
0edb3e33 1979
1980 def __init__(self):
1981 super().__init__(self.msg)
1982
1983
c1c9a79c
PH
1984# Cross-platform file locking
1985if sys.platform == 'win32':
1986 import ctypes.wintypes
1987 import msvcrt
1988
1989 class OVERLAPPED(ctypes.Structure):
1990 _fields_ = [
1991 ('Internal', ctypes.wintypes.LPVOID),
1992 ('InternalHigh', ctypes.wintypes.LPVOID),
1993 ('Offset', ctypes.wintypes.DWORD),
1994 ('OffsetHigh', ctypes.wintypes.DWORD),
1995 ('hEvent', ctypes.wintypes.HANDLE),
1996 ]
1997
1998 kernel32 = ctypes.windll.kernel32
1999 LockFileEx = kernel32.LockFileEx
2000 LockFileEx.argtypes = [
2001 ctypes.wintypes.HANDLE, # hFile
2002 ctypes.wintypes.DWORD, # dwFlags
2003 ctypes.wintypes.DWORD, # dwReserved
2004 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2005 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2006 ctypes.POINTER(OVERLAPPED) # Overlapped
2007 ]
2008 LockFileEx.restype = ctypes.wintypes.BOOL
2009 UnlockFileEx = kernel32.UnlockFileEx
2010 UnlockFileEx.argtypes = [
2011 ctypes.wintypes.HANDLE, # hFile
2012 ctypes.wintypes.DWORD, # dwReserved
2013 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2014 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2015 ctypes.POINTER(OVERLAPPED) # Overlapped
2016 ]
2017 UnlockFileEx.restype = ctypes.wintypes.BOOL
2018 whole_low = 0xffffffff
2019 whole_high = 0x7fffffff
2020
747c0bd1 2021 def _lock_file(f, exclusive, block):
c1c9a79c
PH
2022 overlapped = OVERLAPPED()
2023 overlapped.Offset = 0
2024 overlapped.OffsetHigh = 0
2025 overlapped.hEvent = 0
2026 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
747c0bd1 2027
2028 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2029 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2030 0, whole_low, whole_high, f._lock_file_overlapped_p):
2cb19820 2031 # NB: No argument form of "ctypes.FormatError" does not work on PyPy
2032 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
c1c9a79c
PH
2033
2034 def _unlock_file(f):
2035 assert f._lock_file_overlapped_p
2036 handle = msvcrt.get_osfhandle(f.fileno())
747c0bd1 2037 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
c1c9a79c
PH
2038 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2039
2040else:
399a76e6
YCH
2041 try:
2042 import fcntl
c1c9a79c 2043
a3125791 2044 def _lock_file(f, exclusive, block):
b63837bc 2045 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2046 if not block:
2047 flags |= fcntl.LOCK_NB
acea8d7c 2048 try:
b63837bc 2049 fcntl.flock(f, flags)
acea8d7c
JK
2050 except BlockingIOError:
2051 raise
2052 except OSError: # AOSP does not have flock()
b63837bc 2053 fcntl.lockf(f, flags)
c1c9a79c 2054
399a76e6 2055 def _unlock_file(f):
acea8d7c
JK
2056 try:
2057 fcntl.flock(f, fcntl.LOCK_UN)
2058 except OSError:
2059 fcntl.lockf(f, fcntl.LOCK_UN)
a3125791 2060
399a76e6 2061 except ImportError:
399a76e6 2062
a3125791 2063 def _lock_file(f, exclusive, block):
0edb3e33 2064 raise LockingUnsupportedError()
399a76e6
YCH
2065
2066 def _unlock_file(f):
0edb3e33 2067 raise LockingUnsupportedError()
c1c9a79c
PH
2068
2069
86e5f3ed 2070class locked_file:
0edb3e33 2071 locked = False
747c0bd1 2072
a3125791 2073 def __init__(self, filename, mode, block=True, encoding=None):
fcfa8853
JK
2074 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2075 raise NotImplementedError(mode)
2076 self.mode, self.block = mode, block
2077
2078 writable = any(f in mode for f in 'wax+')
2079 readable = any(f in mode for f in 'r+')
2080 flags = functools.reduce(operator.ior, (
2081 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2082 getattr(os, 'O_BINARY', 0), # Windows only
2083 getattr(os, 'O_NOINHERIT', 0), # Windows only
2084 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2085 os.O_APPEND if 'a' in mode else 0,
2086 os.O_EXCL if 'x' in mode else 0,
2087 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2088 ))
2089
98804d03 2090 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
c1c9a79c
PH
2091
2092 def __enter__(self):
a3125791 2093 exclusive = 'r' not in self.mode
c1c9a79c 2094 try:
a3125791 2095 _lock_file(self.f, exclusive, self.block)
0edb3e33 2096 self.locked = True
86e5f3ed 2097 except OSError:
c1c9a79c
PH
2098 self.f.close()
2099 raise
fcfa8853 2100 if 'w' in self.mode:
131e14dc
JK
2101 try:
2102 self.f.truncate()
2103 except OSError as e:
1890fc63 2104 if e.errno not in (
2105 errno.ESPIPE, # Illegal seek - expected for FIFO
2106 errno.EINVAL, # Invalid argument - expected for /dev/null
2107 ):
2108 raise
c1c9a79c
PH
2109 return self
2110
0edb3e33 2111 def unlock(self):
2112 if not self.locked:
2113 return
c1c9a79c 2114 try:
0edb3e33 2115 _unlock_file(self.f)
c1c9a79c 2116 finally:
0edb3e33 2117 self.locked = False
c1c9a79c 2118
0edb3e33 2119 def __exit__(self, *_):
2120 try:
2121 self.unlock()
2122 finally:
2123 self.f.close()
4eb7f1d1 2124
0edb3e33 2125 open = __enter__
2126 close = __exit__
a3125791 2127
0edb3e33 2128 def __getattr__(self, attr):
2129 return getattr(self.f, attr)
a3125791 2130
0edb3e33 2131 def __iter__(self):
2132 return iter(self.f)
a3125791 2133
4eb7f1d1 2134
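# Illustrative usage sketch (editor's addition): locked_file wraps open() with
# the platform-specific locking primitives above. The filename below is a
# made-up example; any writable path works.
def _locked_file_example(path='demo.lock.txt'):
    with locked_file(path, 'w', block=True) as f:  # exclusive lock while writing
        f.write('written while holding an exclusive lock\n')
    with locked_file(path, 'r', block=True) as f:  # shared lock while reading
        assert f.read().startswith('written')
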
0b9c08b4 2135@functools.cache
4644ac55
S
2136def get_filesystem_encoding():
2137 encoding = sys.getfilesystemencoding()
2138 return encoding if encoding is not None else 'utf-8'
2139
2140
4eb7f1d1 2141def shell_quote(args):
a6a173c2 2142 quoted_args = []
4644ac55 2143 encoding = get_filesystem_encoding()
a6a173c2
JMF
2144 for a in args:
2145 if isinstance(a, bytes):
2146 # We may get a filename encoded with 'encodeFilename'
2147 a = a.decode(encoding)
aefce8e6 2148 quoted_args.append(compat_shlex_quote(a))
28e614de 2149 return ' '.join(quoted_args)
9d4660ca
PH
2150
2151
2152def smuggle_url(url, data):
2153 """ Pass additional data in a URL for internal use. """
2154
81953d1a
RA
2155 url, idata = unsmuggle_url(url, {})
2156 data.update(idata)
14f25df2 2157 sdata = urllib.parse.urlencode(
28e614de
PH
2158 {'__youtubedl_smuggle': json.dumps(data)})
2159 return url + '#' + sdata
9d4660ca
PH
2160
2161
79f82953 2162def unsmuggle_url(smug_url, default=None):
83e865a3 2163 if '#__youtubedl_smuggle' not in smug_url:
79f82953 2164 return smug_url, default
28e614de 2165 url, _, sdata = smug_url.rpartition('#')
14f25df2 2166 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
2167 data = json.loads(jsond)
2168 return url, data
02dbf93f
PH
2169
2170
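# Illustrative usage sketch (editor's addition): smuggle_url/unsmuggle_url
# round-trip JSON-serializable data through a URL fragment, which is how
# additional hints are passed around internally. The URL is a made-up example.
def _smuggle_example():
    url = smuggle_url('https://example.com/video', {'referer': 'https://example.com/'})
    assert unsmuggle_url(url) == ('https://example.com/video', {'referer': 'https://example.com/'})
    # URLs without smuggled data are returned unchanged, with the given default
    assert unsmuggle_url('https://example.com/video', {}) == ('https://example.com/video', {})
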
e0fd9573 2171def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2172 """ Formats numbers with decimal sufixes like K, M, etc """
2173 num, factor = float_or_none(num), float(factor)
4c3f8c3f 2174 if num is None or num < 0:
e0fd9573 2175 return None
eeb2a770 2176 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2177 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2178 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
abbeeebc 2179 if factor == 1024:
2180 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 2181 converted = num / (factor ** exponent)
abbeeebc 2182 return fmt % (converted, suffix)
e0fd9573 2183
2184
02dbf93f 2185def format_bytes(bytes):
f02d24d8 2186 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
f53c966a 2187
1c088fa8 2188
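# Illustrative usage sketch (editor's addition): format_decimal_suffix scales a
# number into k/M/G... (or Ki/Mi/Gi... when factor=1024), which format_bytes
# uses for human-readable sizes.
def _format_size_example():
    assert format_decimal_suffix(2500, '%.1f%s') == '2.5k'
    assert format_bytes(1024) == '1.00KiB'
    assert format_bytes(None) == 'N/A'
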
fb47597b
S
2189def lookup_unit_table(unit_table, s):
2190 units_re = '|'.join(re.escape(u) for u in unit_table)
2191 m = re.match(
782b1b5b 2192 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
fb47597b
S
2193 if not m:
2194 return None
2195 num_str = m.group('num').replace(',', '.')
2196 mult = unit_table[m.group('unit')]
2197 return int(float(num_str) * mult)
2198
2199
be64b5b0
PH
2200def parse_filesize(s):
2201 if s is None:
2202 return None
2203
dfb1b146 2204 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
2205 # but we support those too
2206 _UNIT_TABLE = {
2207 'B': 1,
2208 'b': 1,
70852b47 2209 'bytes': 1,
be64b5b0
PH
2210 'KiB': 1024,
2211 'KB': 1000,
2212 'kB': 1024,
2213 'Kb': 1000,
13585d76 2214 'kb': 1000,
70852b47
YCH
2215 'kilobytes': 1000,
2216 'kibibytes': 1024,
be64b5b0
PH
2217 'MiB': 1024 ** 2,
2218 'MB': 1000 ** 2,
2219 'mB': 1024 ** 2,
2220 'Mb': 1000 ** 2,
13585d76 2221 'mb': 1000 ** 2,
70852b47
YCH
2222 'megabytes': 1000 ** 2,
2223 'mebibytes': 1024 ** 2,
be64b5b0
PH
2224 'GiB': 1024 ** 3,
2225 'GB': 1000 ** 3,
2226 'gB': 1024 ** 3,
2227 'Gb': 1000 ** 3,
13585d76 2228 'gb': 1000 ** 3,
70852b47
YCH
2229 'gigabytes': 1000 ** 3,
2230 'gibibytes': 1024 ** 3,
be64b5b0
PH
2231 'TiB': 1024 ** 4,
2232 'TB': 1000 ** 4,
2233 'tB': 1024 ** 4,
2234 'Tb': 1000 ** 4,
13585d76 2235 'tb': 1000 ** 4,
70852b47
YCH
2236 'terabytes': 1000 ** 4,
2237 'tebibytes': 1024 ** 4,
be64b5b0
PH
2238 'PiB': 1024 ** 5,
2239 'PB': 1000 ** 5,
2240 'pB': 1024 ** 5,
2241 'Pb': 1000 ** 5,
13585d76 2242 'pb': 1000 ** 5,
70852b47
YCH
2243 'petabytes': 1000 ** 5,
2244 'pebibytes': 1024 ** 5,
be64b5b0
PH
2245 'EiB': 1024 ** 6,
2246 'EB': 1000 ** 6,
2247 'eB': 1024 ** 6,
2248 'Eb': 1000 ** 6,
13585d76 2249 'eb': 1000 ** 6,
70852b47
YCH
2250 'exabytes': 1000 ** 6,
2251 'exbibytes': 1024 ** 6,
be64b5b0
PH
2252 'ZiB': 1024 ** 7,
2253 'ZB': 1000 ** 7,
2254 'zB': 1024 ** 7,
2255 'Zb': 1000 ** 7,
13585d76 2256 'zb': 1000 ** 7,
70852b47
YCH
2257 'zettabytes': 1000 ** 7,
2258 'zebibytes': 1024 ** 7,
be64b5b0
PH
2259 'YiB': 1024 ** 8,
2260 'YB': 1000 ** 8,
2261 'yB': 1024 ** 8,
2262 'Yb': 1000 ** 8,
13585d76 2263 'yb': 1000 ** 8,
70852b47
YCH
2264 'yottabytes': 1000 ** 8,
2265 'yobibytes': 1024 ** 8,
be64b5b0
PH
2266 }
2267
fb47597b
S
2268 return lookup_unit_table(_UNIT_TABLE, s)
2269
2270
2271def parse_count(s):
2272 if s is None:
be64b5b0
PH
2273 return None
2274
352d5da8 2275 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
2276
2277 if re.match(r'^[\d,.]+$', s):
2278 return str_to_int(s)
2279
2280 _UNIT_TABLE = {
2281 'k': 1000,
2282 'K': 1000,
2283 'm': 1000 ** 2,
2284 'M': 1000 ** 2,
2285 'kk': 1000 ** 2,
2286 'KK': 1000 ** 2,
352d5da8 2287 'b': 1000 ** 3,
2288 'B': 1000 ** 3,
fb47597b 2289 }
be64b5b0 2290
352d5da8 2291 ret = lookup_unit_table(_UNIT_TABLE, s)
2292 if ret is not None:
2293 return ret
2294
2295 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2296 if mobj:
2297 return str_to_int(mobj.group(1))
be64b5b0 2298
2f7ae819 2299
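# Illustrative usage sketch (editor's addition): parse_filesize understands both
# SI and binary units via lookup_unit_table, while parse_count handles the
# abbreviated view/like counts commonly shown on sites.
def _parse_size_count_example():
    assert parse_filesize('2 MiB') == 2 * 1024 ** 2
    assert parse_filesize('1.5 GB') == 1500000000
    assert parse_count('1.5M') == 1500000
    assert parse_count('1,234') == 1234
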
5d45484c 2300def parse_resolution(s, *, lenient=False):
b871d7e9
S
2301 if s is None:
2302 return {}
2303
5d45484c
LNO
2304 if lenient:
2305 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2306 else:
2307 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
2308 if mobj:
2309 return {
2310 'width': int(mobj.group('w')),
2311 'height': int(mobj.group('h')),
2312 }
2313
17ec8bcf 2314 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
2315 if mobj:
2316 return {'height': int(mobj.group(1))}
2317
2318 mobj = re.search(r'\b([48])[kK]\b', s)
2319 if mobj:
2320 return {'height': int(mobj.group(1)) * 540}
2321
2322 return {}
2323
2324
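# Illustrative usage sketch (editor's addition): parse_resolution extracts
# width/height hints from free-form strings such as format names.
def _parse_resolution_example():
    assert parse_resolution('1920x1080') == {'width': 1920, 'height': 1080}
    assert parse_resolution('720p') == {'height': 720}
    assert parse_resolution('4K UHD') == {'height': 2160}
    assert parse_resolution('unknown') == {}
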
0dc41787 2325def parse_bitrate(s):
14f25df2 2326 if not isinstance(s, str):
0dc41787
S
2327 return
2328 mobj = re.search(r'\b(\d+)\s*kbps', s)
2329 if mobj:
2330 return int(mobj.group(1))
2331
2332
a942d6cb 2333def month_by_name(name, lang='en'):
caefb1de
PH
2334 """ Return the number of a month by (locale-independently) English name """
2335
f6717dec 2336 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 2337
caefb1de 2338 try:
f6717dec 2339 return month_names.index(name) + 1
7105440c
YCH
2340 except ValueError:
2341 return None
2342
2343
2344def month_by_abbreviation(abbrev):
2345 """ Return the number of a month by (locale-independently) English
2346 abbreviations """
2347
2348 try:
2349 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
2350 except ValueError:
2351 return None
18258362
JMF
2352
2353
5aafe895 2354def fix_xml_ampersands(xml_str):
18258362 2355 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
2356 return re.sub(
2357 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 2358 '&amp;',
5aafe895 2359 xml_str)
e3946f98
PH
2360
2361
2362def setproctitle(title):
14f25df2 2363 assert isinstance(title, str)
c1c05c67
YCH
2364
2365 # ctypes in Jython is not complete
2366 # http://bugs.jython.org/issue2148
2367 if sys.platform.startswith('java'):
2368 return
2369
e3946f98 2370 try:
611c1dd9 2371 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
2372 except OSError:
2373 return
2f49bcd6
RC
2374 except TypeError:
2375 # LoadLibrary in Windows Python 2.7.13 only expects
2376 # a bytestring, but since unicode_literals turns
2377 # every string into a unicode string, it fails.
2378 return
0f06bcd7 2379 title_bytes = title.encode()
6eefe533
PH
2380 buf = ctypes.create_string_buffer(len(title_bytes))
2381 buf.value = title_bytes
e3946f98 2382 try:
6eefe533 2383 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
2384 except AttributeError:
2385 return # Strange libc, just skip this
d7dda168
PH
2386
2387
2388def remove_start(s, start):
46bc9b7d 2389 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
2390
2391
2b9faf55 2392def remove_end(s, end):
46bc9b7d 2393 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
2394
2395
31b2051e
S
2396def remove_quotes(s):
2397 if s is None or len(s) < 2:
2398 return s
2399 for quote in ('"', "'", ):
2400 if s[0] == quote and s[-1] == quote:
2401 return s[1:-1]
2402 return s
2403
2404
b6e0c7d2 2405def get_domain(url):
ebf99aaf 2406 """
2407 This implementation is inconsistent, but is kept for compatibility.
2408 Use this only for "webpage_url_domain"
2409 """
2410 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
b6e0c7d2
U
2411
2412
29eb5174 2413def url_basename(url):
14f25df2 2414 path = urllib.parse.urlparse(url).path
28e614de 2415 return path.strip('/').split('/')[-1]
aa94a6d3
PH
2416
2417
02dc0a36
S
2418def base_url(url):
2419 return re.match(r'https?://[^?#&]+/', url).group()
2420
2421
e34c3361 2422def urljoin(base, path):
4b5de77b 2423 if isinstance(path, bytes):
0f06bcd7 2424 path = path.decode()
14f25df2 2425 if not isinstance(path, str) or not path:
e34c3361 2426 return None
fad4ceb5 2427 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 2428 return path
4b5de77b 2429 if isinstance(base, bytes):
0f06bcd7 2430 base = base.decode()
14f25df2 2431 if not isinstance(base, str) or not re.match(
4b5de77b 2432 r'^(?:https?:)?//', base):
e34c3361 2433 return None
14f25df2 2434 return urllib.parse.urljoin(base, path)
e34c3361
S
2435
2436
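# Illustrative usage sketch (editor's addition): these small URL helpers cover
# the common cases of taking a URL apart and joining relative references. The
# URLs are made-up examples.
def _url_helpers_example():
    assert url_basename('https://example.com/a/b.mp4?download=1') == 'b.mp4'
    assert base_url('https://example.com/a/b.mp4') == 'https://example.com/a/'
    assert urljoin('https://example.com/a/', 'b.mp4') == 'https://example.com/a/b.mp4'
    assert urljoin('https://example.com/a/', None) is None
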
ac668111 2437class HEADRequest(urllib.request.Request):
aa94a6d3 2438 def get_method(self):
611c1dd9 2439 return 'HEAD'
7217e148
PH
2440
2441
ac668111 2442class PUTRequest(urllib.request.Request):
95cf60e8
S
2443 def get_method(self):
2444 return 'PUT'
2445
2446
9732d77e 2447def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
9e907ebd 2448 if get_attr and v is not None:
2449 v = getattr(v, get_attr, None)
1812afb7
S
2450 try:
2451 return int(v) * invscale // scale
31c49255 2452 except (ValueError, TypeError, OverflowError):
af98f8ff 2453 return default
9732d77e 2454
9572013d 2455
40a90862 2456def str_or_none(v, default=None):
14f25df2 2457 return default if v is None else str(v)
40a90862 2458
9732d77e
PH
2459
2460def str_to_int(int_str):
48d4681e 2461 """ A more relaxed version of int_or_none """
f9934b96 2462 if isinstance(int_str, int):
348c6bf1 2463 return int_str
14f25df2 2464 elif isinstance(int_str, str):
42db58ec
S
2465 int_str = re.sub(r'[,\.\+]', '', int_str)
2466 return int_or_none(int_str)
608d11f5
PH
2467
2468
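# Illustrative usage sketch (editor's addition): the *_or_none coercion helpers
# return the default instead of raising on malformed input, which keeps calling
# code free of try/except noise.
def _int_coercion_example():
    assert int_or_none('42') == 42
    assert int_or_none('n/a', default=-1) == -1
    assert int_or_none(1500, scale=1000) == 1
    assert str_to_int('123,456') == 123456
    assert str_or_none(None, default='') == ''
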
9732d77e 2469def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
2470 if v is None:
2471 return default
2472 try:
2473 return float(v) * invscale / scale
5e1271c5 2474 except (ValueError, TypeError):
caf80631 2475 return default
43f775e4
PH
2476
2477
c7e327c4
S
2478def bool_or_none(v, default=None):
2479 return v if isinstance(v, bool) else default
2480
2481
53cd37ba 2482def strip_or_none(v, default=None):
14f25df2 2483 return v.strip() if isinstance(v, str) else default
b72b4431
S
2484
2485
af03000a 2486def url_or_none(url):
14f25df2 2487 if not url or not isinstance(url, str):
af03000a
S
2488 return None
2489 url = url.strip()
29f7c58a 2490 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2491
2492
3e9b66d7 2493def request_to_url(req):
ac668111 2494 if isinstance(req, urllib.request.Request):
3e9b66d7
LNO
2495 return req.get_full_url()
2496 else:
2497 return req
2498
2499
e29663c6 2500def strftime_or_none(timestamp, date_format, default=None):
2501 datetime_object = None
2502 try:
f9934b96 2503 if isinstance(timestamp, (int, float)): # unix timestamp
e29663c6 2504 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
14f25df2 2505 elif isinstance(timestamp, str): # assume YYYYMMDD
e29663c6 2506 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2507 return datetime_object.strftime(date_format)
2508 except (ValueError, TypeError, AttributeError):
2509 return default
2510
2511
608d11f5 2512def parse_duration(s):
f9934b96 2513 if not isinstance(s, str):
608d11f5 2514 return None
ca7b3246 2515 s = s.strip()
38d79fd1 2516 if not s:
2517 return None
ca7b3246 2518
acaff495 2519 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2520 m = re.match(r'''(?x)
2521 (?P<before_secs>
2522 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2523 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2524 (?P<ms>[.:][0-9]+)?Z?$
2525 ''', s)
acaff495 2526 if m:
8bd1c00b 2527 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2528 else:
2529 m = re.match(
056653bb
S
2530 r'''(?ix)(?:P?
2531 (?:
1c1b2f96 2532 [0-9]+\s*y(?:ears?)?,?\s*
056653bb
S
2533 )?
2534 (?:
1c1b2f96 2535 [0-9]+\s*m(?:onths?)?,?\s*
056653bb
S
2536 )?
2537 (?:
1c1b2f96 2538 [0-9]+\s*w(?:eeks?)?,?\s*
056653bb 2539 )?
8f4b58d7 2540 (?:
1c1b2f96 2541 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
8f4b58d7 2542 )?
056653bb 2543 T)?
acaff495 2544 (?:
1c1b2f96 2545 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
acaff495 2546 )?
2547 (?:
1c1b2f96 2548 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
acaff495 2549 )?
2550 (?:
2551 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2552 )?Z?$''', s)
acaff495 2553 if m:
2554 days, hours, mins, secs, ms = m.groups()
2555 else:
15846398 2556 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2557 if m:
2558 hours, mins = m.groups()
2559 else:
2560 return None
2561
acaff495 2562 if ms:
19a03940 2563 ms = ms.replace(':', '.')
2564 return sum(float(part or 0) * mult for part, mult in (
2565 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2566
2567
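# Illustrative usage sketch (editor's addition): parse_duration accepts
# clock-style, "Xh Ym Zs"-style and ISO 8601 durations and returns seconds.
def _parse_duration_example():
    assert parse_duration('1:23:45') == 5025
    assert parse_duration('3 min') == 180
    assert parse_duration('PT1H30M') == 5400
    assert parse_duration('') is None
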
e65e4c88 2568def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2569 name, real_ext = os.path.splitext(filename)
e65e4c88 2570 return (
86e5f3ed 2571 f'{name}.{ext}{real_ext}'
e65e4c88 2572 if not expected_real_ext or real_ext[1:] == expected_real_ext
86e5f3ed 2573 else f'{filename}.{ext}')
d70ad093
PH
2574
2575
b3ed15b7
S
2576def replace_extension(filename, ext, expected_real_ext=None):
2577 name, real_ext = os.path.splitext(filename)
86e5f3ed 2578 return '{}.{}'.format(
b3ed15b7
S
2579 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2580 ext)
2581
2582
d70ad093
PH
2583def check_executable(exe, args=[]):
2584 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2585 args can be a list of arguments for a short output (like -version) """
2586 try:
f0c9fb96 2587 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
d70ad093
PH
2588 except OSError:
2589 return False
2590 return exe
b7ab0590
PH
2591
2592
8a7f68d0 2593def _get_exe_version_output(exe, args, *, to_screen=None):
2594 if to_screen:
2595 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
95807118 2596 try:
b64d04c1 2597 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2598 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2599 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
f0c9fb96 2600 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2601 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
95807118
PH
2602 except OSError:
2603 return False
f0c9fb96 2604 return stdout
cae97f65
PH
2605
2606
2607def detect_exe_version(output, version_re=None, unrecognized='present'):
14f25df2 2608 assert isinstance(output, str)
cae97f65
PH
2609 if version_re is None:
2610 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2611 m = re.search(version_re, output)
95807118
PH
2612 if m:
2613 return m.group(1)
2614 else:
2615 return unrecognized
2616
2617
9af98e17 2618def get_exe_version(exe, args=['--version'],
2619 version_re=None, unrecognized='present'):
2620 """ Returns the version of the specified executable,
2621 or False if the executable is not present """
2622 out = _get_exe_version_output(exe, args)
2623 return detect_exe_version(out, version_re, unrecognized) if out else False
2624
2625
7e88d7d7 2626def frange(start=0, stop=None, step=1):
2627 """Float range"""
2628 if stop is None:
2629 start, stop = 0, start
2630 sign = [-1, 1][step > 0] if step else 0
2631 while sign * start < sign * stop:
2632 yield start
2633 start += step
2634
2635
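# Illustrative usage sketch (editor's addition): frange mirrors range() but
# supports float steps; it is used further below for playlist item selection.
def _frange_example():
    assert list(frange(3)) == [0, 1, 2]
    assert list(frange(0, 1, 0.25)) == [0, 0.25, 0.5, 0.75]
    assert list(frange(2, 0, -1)) == [2, 1]
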
cb89cfc1 2636class LazyList(collections.abc.Sequence):
0f06bcd7 2637 """Lazy immutable list from an iterable
2638 Note that slices of a LazyList are lists and not LazyList"""
483336e7 2639
8e5fecc8 2640 class IndexError(IndexError):
2641 pass
2642
282f5709 2643 def __init__(self, iterable, *, reverse=False, _cache=None):
0f06bcd7 2644 self._iterable = iter(iterable)
2645 self._cache = [] if _cache is None else _cache
2646 self._reversed = reverse
483336e7 2647
2648 def __iter__(self):
0f06bcd7 2649 if self._reversed:
28419ca2 2650 # We need to consume the entire iterable to iterate in reverse
981052c9 2651 yield from self.exhaust()
28419ca2 2652 return
0f06bcd7 2653 yield from self._cache
2654 for item in self._iterable:
2655 self._cache.append(item)
483336e7 2656 yield item
2657
0f06bcd7 2658 def _exhaust(self):
2659 self._cache.extend(self._iterable)
2660 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2661 return self._cache
28419ca2 2662
981052c9 2663 def exhaust(self):
0f06bcd7 2664 """Evaluate the entire iterable"""
2665 return self._exhaust()[::-1 if self._reversed else 1]
981052c9 2666
28419ca2 2667 @staticmethod
0f06bcd7 2668 def _reverse_index(x):
f2df4071 2669 return None if x is None else ~x
483336e7 2670
2671 def __getitem__(self, idx):
2672 if isinstance(idx, slice):
0f06bcd7 2673 if self._reversed:
2674 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
e0f2b4b4 2675 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2676 elif isinstance(idx, int):
0f06bcd7 2677 if self._reversed:
2678 idx = self._reverse_index(idx)
e0f2b4b4 2679 start, stop, step = idx, idx, 0
483336e7 2680 else:
2681 raise TypeError('indices must be integers or slices')
e0f2b4b4 2682 if ((start or 0) < 0 or (stop or 0) < 0
2683 or (start is None and step < 0)
2684 or (stop is None and step > 0)):
483336e7 2685 # We need to consume the entire iterable to be able to slice from the end
2686 # Obviously, never use this with infinite iterables
0f06bcd7 2687 self._exhaust()
8e5fecc8 2688 try:
0f06bcd7 2689 return self._cache[idx]
8e5fecc8 2690 except IndexError as e:
2691 raise self.IndexError(e) from e
0f06bcd7 2692 n = max(start or 0, stop or 0) - len(self._cache) + 1
28419ca2 2693 if n > 0:
0f06bcd7 2694 self._cache.extend(itertools.islice(self._iterable, n))
8e5fecc8 2695 try:
0f06bcd7 2696 return self._cache[idx]
8e5fecc8 2697 except IndexError as e:
2698 raise self.IndexError(e) from e
483336e7 2699
2700 def __bool__(self):
2701 try:
0f06bcd7 2702 self[-1] if self._reversed else self[0]
8e5fecc8 2703 except self.IndexError:
483336e7 2704 return False
2705 return True
2706
2707 def __len__(self):
0f06bcd7 2708 self._exhaust()
2709 return len(self._cache)
483336e7 2710
282f5709 2711 def __reversed__(self):
0f06bcd7 2712 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
282f5709 2713
2714 def __copy__(self):
0f06bcd7 2715 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
282f5709 2716
28419ca2 2717 def __repr__(self):
2718 # repr and str should mimic a list. So we exhaust the iterable
2719 return repr(self.exhaust())
2720
2721 def __str__(self):
2722 return repr(self.exhaust())
2723
483336e7 2724
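# Illustrative usage sketch (editor's addition): LazyList lets an infinite or
# expensive iterable be indexed and sliced while only consuming as much of it
# as needed.
def _lazy_list_example():
    import itertools
    lazy = LazyList(itertools.count())
    assert lazy[:5] == [0, 1, 2, 3, 4]   # slices are plain lists
    assert lazy[10] == 10                # consumes the iterator only up to index 10
    assert LazyList(range(3), reverse=True)[0] == 2
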
7be9ccff 2725class PagedList:
c07a39ae 2726
2727 class IndexError(IndexError):
2728 pass
2729
dd26ced1
PH
2730 def __len__(self):
2731 # This is only useful for tests
2732 return len(self.getslice())
2733
7be9ccff 2734 def __init__(self, pagefunc, pagesize, use_cache=True):
2735 self._pagefunc = pagefunc
2736 self._pagesize = pagesize
f1d13090 2737 self._pagecount = float('inf')
7be9ccff 2738 self._use_cache = use_cache
2739 self._cache = {}
2740
2741 def getpage(self, pagenum):
d8cf8d97 2742 page_results = self._cache.get(pagenum)
2743 if page_results is None:
f1d13090 2744 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
7be9ccff 2745 if self._use_cache:
2746 self._cache[pagenum] = page_results
2747 return page_results
2748
2749 def getslice(self, start=0, end=None):
2750 return list(self._getslice(start, end))
2751
2752 def _getslice(self, start, end):
55575225 2753 raise NotImplementedError('This method must be implemented by subclasses')
2754
2755 def __getitem__(self, idx):
f1d13090 2756 assert self._use_cache, 'Indexing PagedList requires cache'
55575225 2757 if not isinstance(idx, int) or idx < 0:
2758 raise TypeError('indices must be non-negative integers')
2759 entries = self.getslice(idx, idx + 1)
d8cf8d97 2760 if not entries:
c07a39ae 2761 raise self.IndexError()
d8cf8d97 2762 return entries[0]
55575225 2763
9c44d242
PH
2764
2765class OnDemandPagedList(PagedList):
a44ca5a4 2766 """Download pages until a page with fewer than the maximum number of results"""
86e5f3ed 2767
7be9ccff 2768 def _getslice(self, start, end):
b7ab0590
PH
2769 for pagenum in itertools.count(start // self._pagesize):
2770 firstid = pagenum * self._pagesize
2771 nextfirstid = pagenum * self._pagesize + self._pagesize
2772 if start >= nextfirstid:
2773 continue
2774
b7ab0590
PH
2775 startv = (
2776 start % self._pagesize
2777 if firstid <= start < nextfirstid
2778 else 0)
b7ab0590
PH
2779 endv = (
2780 ((end - 1) % self._pagesize) + 1
2781 if (end is not None and firstid <= end <= nextfirstid)
2782 else None)
2783
f1d13090 2784 try:
2785 page_results = self.getpage(pagenum)
2786 except Exception:
2787 self._pagecount = pagenum - 1
2788 raise
b7ab0590
PH
2789 if startv != 0 or endv is not None:
2790 page_results = page_results[startv:endv]
7be9ccff 2791 yield from page_results
b7ab0590
PH
2792
2793 # A little optimization - if the current page is not "full", i.e. does
2794 # not contain page_size videos, then we can assume that this page
2795 # is the last one - there are no more ids on further pages -
2796 # i.e. no need to query again.
2797 if len(page_results) + startv < self._pagesize:
2798 break
2799
2800 # If we got the whole page, but the next page is not interesting,
2801 # break out early as well
2802 if end == nextfirstid:
2803 break
81c2f20b
PH
2804
2805
9c44d242 2806class InAdvancePagedList(PagedList):
a44ca5a4 2807 """PagedList with total number of pages known in advance"""
86e5f3ed 2808
9c44d242 2809 def __init__(self, pagefunc, pagecount, pagesize):
7be9ccff 2810 PagedList.__init__(self, pagefunc, pagesize, True)
f1d13090 2811 self._pagecount = pagecount
9c44d242 2812
7be9ccff 2813 def _getslice(self, start, end):
9c44d242 2814 start_page = start // self._pagesize
d37707bd 2815 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
9c44d242
PH
2816 skip_elems = start - start_page * self._pagesize
2817 only_more = None if end is None else end - start
2818 for pagenum in range(start_page, end_page):
7be9ccff 2819 page_results = self.getpage(pagenum)
9c44d242 2820 if skip_elems:
7be9ccff 2821 page_results = page_results[skip_elems:]
9c44d242
PH
2822 skip_elems = None
2823 if only_more is not None:
7be9ccff 2824 if len(page_results) < only_more:
2825 only_more -= len(page_results)
9c44d242 2826 else:
7be9ccff 2827 yield from page_results[:only_more]
9c44d242 2828 break
7be9ccff 2829 yield from page_results
9c44d242
PH
2830
2831
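# Illustrative usage sketch (editor's addition): a paged list only calls its
# page function for the pages a slice actually touches. The page function below
# is a made-up stand-in for e.g. an API that returns 3 results per request.
def _paged_list_example():
    def fetch_page(pagenum):
        return [f'video-{pagenum}-{i}' for i in range(3)] if pagenum < 2 else []

    pl = OnDemandPagedList(fetch_page, 3)
    assert pl.getslice(0, 4) == ['video-0-0', 'video-0-1', 'video-0-2', 'video-1-0']
    assert InAdvancePagedList(fetch_page, 2, 3).getslice(3, 6) == ['video-1-0', 'video-1-1', 'video-1-2']
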
7e88d7d7 2832class PlaylistEntries:
2833 MissingEntry = object()
2834 is_exhausted = False
2835
2836 def __init__(self, ydl, info_dict):
7e9a6125 2837 self.ydl = ydl
2838
2839 # _entries must be assigned now since infodict can change during iteration
2840 entries = info_dict.get('entries')
2841 if entries is None:
2842 raise EntryNotInPlaylist('There are no entries')
2843 elif isinstance(entries, list):
2844 self.is_exhausted = True
2845
2846 requested_entries = info_dict.get('requested_entries')
2847 self.is_incomplete = bool(requested_entries)
2848 if self.is_incomplete:
2849 assert self.is_exhausted
2850 self._entries = [self.MissingEntry] * max(requested_entries)
2851 for i, entry in zip(requested_entries, entries):
2852 self._entries[i - 1] = entry
2853 elif isinstance(entries, (list, PagedList, LazyList)):
2854 self._entries = entries
2855 else:
2856 self._entries = LazyList(entries)
7e88d7d7 2857
2858 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2859 (?P<start>[+-]?\d+)?
2860 (?P<range>[:-]
2861 (?P<end>[+-]?\d+|inf(?:inite)?)?
2862 (?::(?P<step>[+-]?\d+))?
2863 )?''')
2864
2865 @classmethod
2866 def parse_playlist_items(cls, string):
2867 for segment in string.split(','):
2868 if not segment:
2869 raise ValueError('There are two or more consecutive commas')
2870 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2871 if not mobj:
2872 raise ValueError(f'{segment!r} is not a valid specification')
2873 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2874 if int_or_none(step) == 0:
2875 raise ValueError(f'Step in {segment!r} cannot be zero')
2876 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2877
2878 def get_requested_items(self):
2879 playlist_items = self.ydl.params.get('playlist_items')
2880 playlist_start = self.ydl.params.get('playliststart', 1)
2881 playlist_end = self.ydl.params.get('playlistend')
2882 # For backwards compatibility, interpret -1 as whole list
2883 if playlist_end in (-1, None):
2884 playlist_end = ''
2885 if not playlist_items:
2886 playlist_items = f'{playlist_start}:{playlist_end}'
2887 elif playlist_start != 1 or playlist_end:
2888 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2889
2890 for index in self.parse_playlist_items(playlist_items):
2891 for i, entry in self[index]:
2892 yield i, entry
1ac4fd80 2893 if not entry:
2894 continue
7e88d7d7 2895 try:
2896 # TODO: Add auto-generated fields
2897 self.ydl._match_entry(entry, incomplete=True, silent=True)
2898 except (ExistingVideoReached, RejectedVideoReached):
2899 return
2900
7e9a6125 2901 def get_full_count(self):
2902 if self.is_exhausted and not self.is_incomplete:
7e88d7d7 2903 return len(self)
2904 elif isinstance(self._entries, InAdvancePagedList):
2905 if self._entries._pagesize == 1:
2906 return self._entries._pagecount
2907
7e88d7d7 2908 @functools.cached_property
2909 def _getter(self):
2910 if isinstance(self._entries, list):
2911 def get_entry(i):
2912 try:
2913 entry = self._entries[i]
2914 except IndexError:
2915 entry = self.MissingEntry
2916 if not self.is_incomplete:
2917 raise self.IndexError()
2918 if entry is self.MissingEntry:
2919 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2920 return entry
2921 else:
2922 def get_entry(i):
2923 try:
2924 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2925 except (LazyList.IndexError, PagedList.IndexError):
2926 raise self.IndexError()
2927 return get_entry
2928
2929 def __getitem__(self, idx):
2930 if isinstance(idx, int):
2931 idx = slice(idx, idx)
2932
2933 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2934 step = 1 if idx.step is None else idx.step
2935 if idx.start is None:
2936 start = 0 if step > 0 else len(self) - 1
2937 else:
2938 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2939
2940 # NB: Do not call len(self) when idx == [:]
2941 if idx.stop is None:
2942 stop = 0 if step < 0 else float('inf')
2943 else:
2944 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2945 stop += [-1, 1][step > 0]
2946
2947 for i in frange(start, stop, step):
2948 if i < 0:
2949 continue
2950 try:
7e9a6125 2951 entry = self._getter(i)
2952 except self.IndexError:
2953 self.is_exhausted = True
2954 if step > 0:
7e88d7d7 2955 break
7e9a6125 2956 continue
7e88d7d7 2957 yield i + 1, entry
2958
2959 def __len__(self):
2960 return len(tuple(self[:]))
2961
2962 class IndexError(IndexError):
2963 pass
2964
2965
81c2f20b 2966def uppercase_escape(s):
676eb3f2 2967 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2968 return re.sub(
a612753d 2969 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2970 lambda m: unicode_escape(m.group(0))[0],
2971 s)
0fe2ff78
YCH
2972
2973
2974def lowercase_escape(s):
2975 unicode_escape = codecs.getdecoder('unicode_escape')
2976 return re.sub(
2977 r'\\u[0-9a-fA-F]{4}',
2978 lambda m: unicode_escape(m.group(0))[0],
2979 s)
b53466e1 2980
d05cfe06
S
2981
2982def escape_rfc3986(s):
2983 """Escape non-ASCII characters as suggested by RFC 3986"""
f9934b96 2984 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2985
2986
2987def escape_url(url):
2988 """Escape URL as suggested by RFC 3986"""
14f25df2 2989 url_parsed = urllib.parse.urlparse(url)
d05cfe06 2990 return url_parsed._replace(
efbed08d 2991 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
2992 path=escape_rfc3986(url_parsed.path),
2993 params=escape_rfc3986(url_parsed.params),
2994 query=escape_rfc3986(url_parsed.query),
2995 fragment=escape_rfc3986(url_parsed.fragment)
2996 ).geturl()
2997
62e609ab 2998
4dfbf869 2999def parse_qs(url):
14f25df2 3000 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
4dfbf869 3001
3002
62e609ab
PH
3003def read_batch_urls(batch_fd):
3004 def fixup(url):
14f25df2 3005 if not isinstance(url, str):
62e609ab 3006 url = url.decode('utf-8', 'replace')
8c04f0be 3007 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3008 for bom in BOM_UTF8:
3009 if url.startswith(bom):
3010 url = url[len(bom):]
3011 url = url.lstrip()
3012 if not url or url.startswith(('#', ';', ']')):
62e609ab 3013 return False
8c04f0be 3014 # "#" cannot be stripped out since it is part of the URI
962ffcf8 3015 # However, it can be safely stripped out if it follows whitespace
8c04f0be 3016 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
3017
3018 with contextlib.closing(batch_fd) as fd:
3019 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
3020
3021
3022def urlencode_postdata(*args, **kargs):
14f25df2 3023 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
3024
3025
38f9ef31 3026def update_url_query(url, query):
cacd9966
YCH
3027 if not query:
3028 return url
14f25df2 3029 parsed_url = urllib.parse.urlparse(url)
3030 qs = urllib.parse.parse_qs(parsed_url.query)
38f9ef31 3031 qs.update(query)
14f25df2 3032 return urllib.parse.urlunparse(parsed_url._replace(
3033 query=urllib.parse.urlencode(qs, True)))
16392824 3034
8e60dc75 3035
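# Illustrative usage sketch (editor's addition): update_url_query merges new
# parameters into the query string of an existing URL. The URL is a made-up
# example.
def _update_url_query_example():
    assert update_url_query('https://example.com/api?a=1', {'b': '2'}) == 'https://example.com/api?a=1&b=2'
    # An empty query leaves the URL untouched
    assert update_url_query('https://example.com/api?a=1', {}) == 'https://example.com/api?a=1'
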
c043c246 3036def update_Request(req, url=None, data=None, headers=None, query=None):
ed0291d1 3037 req_headers = req.headers.copy()
c043c246 3038 req_headers.update(headers or {})
ed0291d1
S
3039 req_data = data or req.data
3040 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
3041 req_get_method = req.get_method()
3042 if req_get_method == 'HEAD':
3043 req_type = HEADRequest
3044 elif req_get_method == 'PUT':
3045 req_type = PUTRequest
3046 else:
ac668111 3047 req_type = urllib.request.Request
ed0291d1
S
3048 new_req = req_type(
3049 req_url, data=req_data, headers=req_headers,
3050 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3051 if hasattr(req, 'timeout'):
3052 new_req.timeout = req.timeout
3053 return new_req
3054
3055
10c87c15 3056def _multipart_encode_impl(data, boundary):
0c265486
YCH
3057 content_type = 'multipart/form-data; boundary=%s' % boundary
3058
3059 out = b''
3060 for k, v in data.items():
3061 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 3062 if isinstance(k, str):
0f06bcd7 3063 k = k.encode()
14f25df2 3064 if isinstance(v, str):
0f06bcd7 3065 v = v.encode()
0c265486
YCH
3066 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3067 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 3068 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
3069 if boundary.encode('ascii') in content:
3070 raise ValueError('Boundary overlaps with data')
3071 out += content
3072
3073 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3074
3075 return out, content_type
3076
3077
3078def multipart_encode(data, boundary=None):
3079 '''
3080 Encode a dict to RFC 7578-compliant form-data
3081
3082 data:
3083 A dict where keys and values can be either Unicode or bytes-like
3084 objects.
3085 boundary:
3086 If specified, it must be a Unicode object and is used as the boundary.
3087 Otherwise a random boundary is generated.
3088
3089 Reference: https://tools.ietf.org/html/rfc7578
3090 '''
3091 has_specified_boundary = boundary is not None
3092
3093 while True:
3094 if boundary is None:
3095 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3096
3097 try:
10c87c15 3098 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
3099 break
3100 except ValueError:
3101 if has_specified_boundary:
3102 raise
3103 boundary = None
3104
3105 return out, content_type
3106
3107
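# Illustrative usage sketch (editor's addition): multipart_encode builds an
# RFC 7578 form-data body; passing an explicit (made-up) boundary makes the
# output deterministic for the check below.
def _multipart_example():
    body, content_type = multipart_encode({'field': 'value'}, boundary='xXx')
    assert content_type == 'multipart/form-data; boundary=xXx'
    assert body == (b'--xXx\r\n'
                    b'Content-Disposition: form-data; name="field"\r\n\r\n'
                    b'value\r\n'
                    b'--xXx--\r\n')
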
86296ad2 3108def dict_get(d, key_or_keys, default=None, skip_false_values=True):
a44ca5a4 3109 for val in map(d.get, variadic(key_or_keys)):
3110 if val is not None and (val or not skip_false_values):
3111 return val
3112 return default
cbecc9b9
S
3113
3114
c4f60dd7 3115def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3116 for f in funcs:
a32a9a7e 3117 try:
c4f60dd7 3118 val = f(*args, **kwargs)
3119 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
a32a9a7e
S
3120 pass
3121 else:
c4f60dd7 3122 if expected_type is None or isinstance(val, expected_type):
3123 return val
3124
3125
3126def try_get(src, getter, expected_type=None):
3127 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
329ca3be
S
3128
3129
90137ca4 3130def filter_dict(dct, cndn=lambda _, v: v is not None):
3131 return {k: v for k, v in dct.items() if cndn(k, v)}
3132
3133
6cc62232
S
3134def merge_dicts(*dicts):
3135 merged = {}
3136 for a_dict in dicts:
3137 for k, v in a_dict.items():
90137ca4 3138 if (v is not None and k not in merged
3139 or isinstance(v, str) and merged[k] == ''):
6cc62232
S
3140 merged[k] = v
3141 return merged
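
# Editor's sketch (hypothetical helper, not part of the original file): in
# merge_dicts, earlier dicts win, but an empty string can be overridden later.
def _example_merge_dicts():
    assert merge_dicts({'a': 1, 'b': ''}, {'a': 2, 'b': 'x', 'c': None}) == {'a': 1, 'b': 'x'}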
3142
3143
8e60dc75 3144def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
14f25df2 3145 return string if isinstance(string, str) else str(string, encoding, errors)
8e60dc75 3146
16392824 3147
a1a530b0
PH
3148US_RATINGS = {
3149 'G': 0,
3150 'PG': 10,
3151 'PG-13': 13,
3152 'R': 16,
3153 'NC': 18,
3154}
fac55558
PH
3155
3156
a8795327 3157TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
3158 'TV-Y': 0,
3159 'TV-Y7': 7,
3160 'TV-G': 0,
3161 'TV-PG': 0,
3162 'TV-14': 14,
3163 'TV-MA': 17,
a8795327
S
3164}
3165
3166
146c80e2 3167def parse_age_limit(s):
19a03940 3168 # isinstance(False, int) is True. So type() must be used instead
c487cf00 3169 if type(s) is int: # noqa: E721
a8795327 3170 return s if 0 <= s <= 21 else None
19a03940 3171 elif not isinstance(s, str):
d838b1bd 3172 return None
146c80e2 3173 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
3174 if m:
3175 return int(m.group('age'))
5c5fae6d 3176 s = s.upper()
a8795327
S
3177 if s in US_RATINGS:
3178 return US_RATINGS[s]
5a16c9d9 3179 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 3180 if m:
5a16c9d9 3181 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 3182 return None
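
# Editor's sketch (hypothetical helper, not part of the original file):
# parse_age_limit understands plain ages, US ratings and TV parental guidelines.
def _example_parse_age_limit():
    assert parse_age_limit('18+') == 18
    assert parse_age_limit('PG-13') == 13
    assert parse_age_limit('TV-MA') == 17
    assert parse_age_limit(False) is None  # booleans are deliberately rejected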
146c80e2
S
3183
3184
fac55558 3185def strip_jsonp(code):
609a61e3 3186 return re.sub(
5552c9eb 3187 r'''(?sx)^
e9c671d5 3188 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
3189 (?:\s*&&\s*(?P=func_name))?
3190 \s*\(\s*(?P<callback_data>.*)\);?
3191 \s*?(?://[^\n]*)*$''',
3192 r'\g<callback_data>', code)
478c2c61
PH
3193
3194
5c610515 3195def js_to_json(code, vars={}):
3196 # vars is a dict of var, val pairs to substitute
c843e685 3197 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
86e5f3ed 3198 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
4195096e 3199 INTEGER_TABLE = (
86e5f3ed 3200 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3201 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
4195096e
S
3202 )
3203
e05f6939 3204 def fix_kv(m):
e7b6d122
PH
3205 v = m.group(0)
3206 if v in ('true', 'false', 'null'):
3207 return v
421ddcb8
C
3208 elif v in ('undefined', 'void 0'):
3209 return 'null'
8bdd16b4 3210 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
bd1e4844 3211 return ""
3212
3213 if v[0] in ("'", '"'):
3214 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 3215 '"': '\\"',
bd1e4844 3216 "\\'": "'",
3217 '\\\n': '',
3218 '\\x': '\\u00',
3219 }.get(m.group(0), m.group(0)), v[1:-1])
8bdd16b4 3220 else:
3221 for regex, base in INTEGER_TABLE:
3222 im = re.match(regex, v)
3223 if im:
3224 i = int(im.group(1), base)
3225 return '"%d":' % i if v.endswith(':') else '%d' % i
89ac4a19 3226
5c610515 3227 if v in vars:
3228 return vars[v]
3229
e7b6d122 3230 return '"%s"' % v
e05f6939 3231
8072ef2b 3232 def create_map(mobj):
3233 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3234
febff4c1 3235 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
8072ef2b 3236 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
febff4c1 3237
bd1e4844 3238 return re.sub(r'''(?sx)
3239 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3240 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4195096e 3241 {comment}|,(?={skip}[\]}}])|
421ddcb8 3242 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
4195096e 3243 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
8bdd16b4 3244 [0-9]+(?={skip}:)|
3245 !+
4195096e 3246 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
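
# Editor's sketch (hypothetical helper, not part of the original file): js_to_json
# turns a loose JavaScript object literal (unquoted keys, hex numbers, comments,
# trailing commas) into valid JSON.
def _example_js_to_json():
    js = "{foo: 'bar', num: 0x1F, flag: undefined, /* comment */ tail: 1,}"
    assert json.loads(js_to_json(js)) == {'foo': 'bar', 'num': 31, 'flag': None, 'tail': 1}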
e05f6939
PH
3247
3248
478c2c61
PH
3249def qualities(quality_ids):
3250 """ Get a numeric quality value out of a list of possible values """
3251 def q(qid):
3252 try:
3253 return quality_ids.index(qid)
3254 except ValueError:
3255 return -1
3256 return q
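
# Editor's sketch (hypothetical helper, not part of the original file): qualities
# maps known quality ids to their position in the preference list, unknown ones to -1.
def _example_qualities():
    q = qualities(['240p', '360p', '720p', '1080p'])
    assert q('720p') == 2 and q('240p') == 0
    assert q('4320p') == -1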
3257
acd69589 3258
8aa0e7cd 3259POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
1e43a6f7 3260
3261
de6000d9 3262DEFAULT_OUTTMPL = {
3263 'default': '%(title)s [%(id)s].%(ext)s',
72755351 3264 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 3265}
3266OUTTMPL_TYPES = {
72755351 3267 'chapter': None,
de6000d9 3268 'subtitle': None,
3269 'thumbnail': None,
3270 'description': 'description',
3271 'annotation': 'annotations.xml',
3272 'infojson': 'info.json',
08438d2c 3273 'link': None,
3b603dbd 3274 'pl_video': None,
5112f26a 3275 'pl_thumbnail': None,
de6000d9 3276 'pl_description': 'description',
3277 'pl_infojson': 'info.json',
3278}
0a871f68 3279
143db31d 3280# As of [1], the printf-style format syntax is:
3281# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3282# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 3283STR_FORMAT_RE_TMPL = r'''(?x)
3284 (?<!%)(?P<prefix>(?:%%)*)
143db31d 3285 %
524e2e4f 3286 (?P<has_key>\((?P<key>{0})\))?
752cda38 3287 (?P<format>
524e2e4f 3288 (?P<conversion>[#0\-+ ]+)?
3289 (?P<min_width>\d+)?
3290 (?P<precision>\.\d+)?
3291 (?P<len_mod>[hlL])? # unused in python
901130bb 3292 {1} # conversion type
752cda38 3293 )
143db31d 3294'''
3295
7d1eb38a 3296
901130bb 3297STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3298
7d1eb38a 3299
a020a0dc
PH
3300def limit_length(s, length):
3301 """ Add ellipses to overly long strings """
3302 if s is None:
3303 return None
3304 ELLIPSES = '...'
3305 if len(s) > length:
3306 return s[:length - len(ELLIPSES)] + ELLIPSES
3307 return s
48844745
PH
3308
3309
3310def version_tuple(v):
5f9b8394 3311 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
3312
3313
3314def is_outdated_version(version, limit, assume_new=True):
3315 if not version:
3316 return not assume_new
3317 try:
3318 return version_tuple(version) < version_tuple(limit)
3319 except ValueError:
3320 return not assume_new
732ea2f0
PH
3321
3322
3323def ytdl_is_updateable():
7a5c1cfe 3324 """ Returns if yt-dlp can be updated with -U """
735d865e 3325
5d535b4a 3326 from .update import is_non_updateable
732ea2f0 3327
5d535b4a 3328 return not is_non_updateable()
7d4111ed
PH
3329
3330
3331def args_to_str(args):
3332 # Get a short string representation for a subprocess command
702ccf2d 3333 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
3334
3335
9b9c5355 3336def error_to_compat_str(err):
cfb0511d 3337 return str(err)
fdae2358
S
3338
3339
a44ca5a4 3340def error_to_str(err):
3341 return f'{type(err).__name__}: {err}'
3342
3343
c460bdd5 3344def mimetype2ext(mt):
eb9ee194
S
3345 if mt is None:
3346 return None
3347
9359f3d4
F
3348 mt, _, params = mt.partition(';')
3349 mt = mt.strip()
3350
3351 FULL_MAP = {
765ac263 3352 'audio/mp4': 'm4a',
6c33d24b
YCH
3353 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Use .mp3 here
3354 # since it is the most common of the three
3355 'audio/mpeg': 'mp3',
ba39289d 3356 'audio/x-wav': 'wav',
9359f3d4
F
3357 'audio/wav': 'wav',
3358 'audio/wave': 'wav',
3359 }
3360
3361 ext = FULL_MAP.get(mt)
765ac263
JMF
3362 if ext is not None:
3363 return ext
3364
9359f3d4 3365 SUBTYPE_MAP = {
f6861ec9 3366 '3gpp': '3gp',
cafcf657 3367 'smptett+xml': 'tt',
cafcf657 3368 'ttaf+xml': 'dfxp',
a0d8d704 3369 'ttml+xml': 'ttml',
f6861ec9 3370 'x-flv': 'flv',
a0d8d704 3371 'x-mp4-fragmented': 'mp4',
d4f05d47 3372 'x-ms-sami': 'sami',
a0d8d704 3373 'x-ms-wmv': 'wmv',
b4173f15
RA
3374 'mpegurl': 'm3u8',
3375 'x-mpegurl': 'm3u8',
3376 'vnd.apple.mpegurl': 'm3u8',
3377 'dash+xml': 'mpd',
b4173f15 3378 'f4m+xml': 'f4m',
f164b971 3379 'hds+xml': 'f4m',
e910fe2f 3380 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 3381 'quicktime': 'mov',
98ce1a3f 3382 'mp2t': 'ts',
39e7107d 3383 'x-wav': 'wav',
9359f3d4
F
3384 'filmstrip+json': 'fs',
3385 'svg+xml': 'svg',
3386 }
3387
3388 _, _, subtype = mt.rpartition('/')
3389 ext = SUBTYPE_MAP.get(subtype.lower())
3390 if ext is not None:
3391 return ext
3392
3393 SUFFIX_MAP = {
3394 'json': 'json',
3395 'xml': 'xml',
3396 'zip': 'zip',
3397 'gzip': 'gz',
3398 }
3399
3400 _, _, suffix = subtype.partition('+')
3401 ext = SUFFIX_MAP.get(suffix)
3402 if ext is not None:
3403 return ext
3404
3405 return subtype.replace('+', '.')
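
# Editor's sketch (hypothetical helper, not part of the original file): MIME
# parameters are ignored and the lookup falls back from the full type to the
# subtype and finally to the '+suffix'.
def _example_mimetype2ext():
    assert mimetype2ext('audio/mp4; codecs="mp4a.40.2"') == 'm4a'
    assert mimetype2ext('application/x-mpegurl') == 'm3u8'
    assert mimetype2ext('application/ld+json') == 'json'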
c460bdd5
PH
3406
3407
2814f12b
THD
3408def ext2mimetype(ext_or_url):
3409 if not ext_or_url:
3410 return None
3411 if '.' not in ext_or_url:
3412 ext_or_url = f'file.{ext_or_url}'
3413 return mimetypes.guess_type(ext_or_url)[0]
3414
3415
4f3c5e06 3416def parse_codecs(codecs_str):
3417 # http://tools.ietf.org/html/rfc6381
3418 if not codecs_str:
3419 return {}
a0566bbf 3420 split_codecs = list(filter(None, map(
dbf5416a 3421 str.strip, codecs_str.strip().strip(',').split(','))))
3fe75fdc 3422 vcodec, acodec, scodec, hdr = None, None, None, None
a0566bbf 3423 for full_codec in split_codecs:
d816f61f 3424 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3425 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3426 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3427 if vcodec:
3428 continue
3429 vcodec = full_codec
3430 if parts[0] in ('dvh1', 'dvhe'):
3431 hdr = 'DV'
3432 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3433 hdr = 'HDR10'
3434 elif parts[:2] == ['vp9', '2']:
3435 hdr = 'HDR10'
3436 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
3437 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3438 acodec = acodec or full_codec
3439 elif parts[0] in ('stpp', 'wvtt'):
3440 scodec = scodec or full_codec
4f3c5e06 3441 else:
19a03940 3442 write_string(f'WARNING: Unknown codec {full_codec}\n')
3fe75fdc 3443 if vcodec or acodec or scodec:
4f3c5e06 3444 return {
3445 'vcodec': vcodec or 'none',
3446 'acodec': acodec or 'none',
176f1866 3447 'dynamic_range': hdr,
3fe75fdc 3448 **({'scodec': scodec} if scodec is not None else {}),
4f3c5e06 3449 }
b69fd25c 3450 elif len(split_codecs) == 2:
3451 return {
3452 'vcodec': split_codecs[0],
3453 'acodec': split_codecs[1],
3454 }
4f3c5e06 3455 return {}
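
# Editor's sketch (hypothetical helper, not part of the original file):
# parse_codecs splits an RFC 6381 codecs= string into video/audio codecs and
# dynamic-range information.
def _example_parse_codecs():
    assert parse_codecs('avc1.42E01E, mp4a.40.2') == {
        'vcodec': 'avc1.42E01E', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
    assert parse_codecs('dvh1.05.01')['dynamic_range'] == 'DV'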
3456
3457
2ccd1b10 3458def urlhandle_detect_ext(url_handle):
79298173 3459 getheader = url_handle.headers.get
2ccd1b10 3460
b55ee18f
PH
3461 cd = getheader('Content-Disposition')
3462 if cd:
3463 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3464 if m:
3465 e = determine_ext(m.group('filename'), default_ext=None)
3466 if e:
3467 return e
3468
c460bdd5 3469 return mimetype2ext(getheader('Content-Type'))
05900629
PH
3470
3471
1e399778
YCH
3472def encode_data_uri(data, mime_type):
3473 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3474
3475
05900629 3476def age_restricted(content_limit, age_limit):
6ec6cb4e 3477 """ Returns True iff the content should be blocked """
05900629
PH
3478
3479 if age_limit is None: # No limit set
3480 return False
3481 if content_limit is None:
3482 return False # Content available for everyone
3483 return age_limit < content_limit
61ca9a80
PH
3484
3485
88f60feb 3486# List of known byte-order-marks (BOM)
a904a7f8
L
3487BOMS = [
3488 (b'\xef\xbb\xbf', 'utf-8'),
3489 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3490 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3491 (b'\xff\xfe', 'utf-16-le'),
3492 (b'\xfe\xff', 'utf-16-be'),
3493]
a904a7f8
L
3494
3495
61ca9a80
PH
3496def is_html(first_bytes):
3497 """ Detect whether a file contains HTML by examining its first bytes. """
3498
80e8493e 3499 encoding = 'utf-8'
61ca9a80 3500 for bom, enc in BOMS:
80e8493e 3501 while first_bytes.startswith(bom):
3502 encoding, first_bytes = enc, first_bytes[len(bom):]
61ca9a80 3503
80e8493e 3504 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
a055469f
PH
3505
3506
3507def determine_protocol(info_dict):
3508 protocol = info_dict.get('protocol')
3509 if protocol is not None:
3510 return protocol
3511
7de837a5 3512 url = sanitize_url(info_dict['url'])
a055469f
PH
3513 if url.startswith('rtmp'):
3514 return 'rtmp'
3515 elif url.startswith('mms'):
3516 return 'mms'
3517 elif url.startswith('rtsp'):
3518 return 'rtsp'
3519
3520 ext = determine_ext(url)
3521 if ext == 'm3u8':
3522 return 'm3u8'
3523 elif ext == 'f4m':
3524 return 'f4m'
3525
14f25df2 3526 return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3527
3528
c5e3f849 3529def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3530 """ Render a list of rows, each as a list of values.
3531 Text after a \t will be right aligned """
ec11a9f4 3532 def width(string):
c5e3f849 3533 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3534
3535 def get_max_lens(table):
ec11a9f4 3536 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3537
3538 def filter_using_list(row, filterArray):
d16df59d 3539 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
76d321f6 3540
d16df59d 3541 max_lens = get_max_lens(data) if hide_empty else []
3542 header_row = filter_using_list(header_row, max_lens)
3543 data = [filter_using_list(row, max_lens) for row in data]
76d321f6 3544
cfb56d1a 3545 table = [header_row] + data
76d321f6 3546 max_lens = get_max_lens(table)
c5e3f849 3547 extra_gap += 1
76d321f6 3548 if delim:
c5e3f849 3549 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
1ed7953a 3550 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
ec11a9f4 3551 for row in table:
3552 for pos, text in enumerate(map(str, row)):
c5e3f849 3553 if '\t' in text:
3554 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3555 else:
3556 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3557 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3558 return ret
347de493
PH
3559
3560
8f18aca8 3561def _match_one(filter_part, dct, incomplete):
77b87f05 3562 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3563 STRING_OPERATORS = {
3564 '*=': operator.contains,
3565 '^=': lambda attr, value: attr.startswith(value),
3566 '$=': lambda attr, value: attr.endswith(value),
3567 '~=': lambda attr, value: re.search(value, attr),
3568 }
347de493 3569 COMPARISON_OPERATORS = {
a047eeb6 3570 **STRING_OPERATORS,
3571 '<=': operator.le, # "<=" must be defined above "<"
347de493 3572 '<': operator.lt,
347de493 3573 '>=': operator.ge,
a047eeb6 3574 '>': operator.gt,
347de493 3575 '=': operator.eq,
347de493 3576 }
a047eeb6 3577
6db9c4d5 3578 if isinstance(incomplete, bool):
3579 is_incomplete = lambda _: incomplete
3580 else:
3581 is_incomplete = lambda k: k in incomplete
3582
64fa820c 3583 operator_rex = re.compile(r'''(?x)
347de493 3584 (?P<key>[a-z_]+)
77b87f05 3585 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3586 (?:
a047eeb6 3587 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3588 (?P<strval>.+?)
347de493 3589 )
347de493 3590 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3591 m = operator_rex.fullmatch(filter_part.strip())
347de493 3592 if m:
18f96d12 3593 m = m.groupdict()
3594 unnegated_op = COMPARISON_OPERATORS[m['op']]
3595 if m['negation']:
77b87f05
MT
3596 op = lambda attr, value: not unnegated_op(attr, value)
3597 else:
3598 op = unnegated_op
18f96d12 3599 comparison_value = m['quotedstrval'] or m['strval']
3600 if m['quote']:
3601 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3602 actual_value = dct.get(m['key'])
3603 numeric_comparison = None
f9934b96 3604 if isinstance(actual_value, (int, float)):
e5a088dc
S
3605 # If the original field is a string and the matching comparison value is
3606 # a number, we should respect the origin of the original field
3607 # and process the comparison value as a string (see
18f96d12 3608 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3609 try:
18f96d12 3610 numeric_comparison = int(comparison_value)
347de493 3611 except ValueError:
18f96d12 3612 numeric_comparison = parse_filesize(comparison_value)
3613 if numeric_comparison is None:
3614 numeric_comparison = parse_filesize(f'{comparison_value}B')
3615 if numeric_comparison is None:
3616 numeric_comparison = parse_duration(comparison_value)
3617 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3618 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3619 if actual_value is None:
6db9c4d5 3620 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3621 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3622
3623 UNARY_OPERATORS = {
1cc47c66
S
3624 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3625 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3626 }
64fa820c 3627 operator_rex = re.compile(r'''(?x)
347de493 3628 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3629 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3630 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3631 if m:
3632 op = UNARY_OPERATORS[m.group('op')]
3633 actual_value = dct.get(m.group('key'))
6db9c4d5 3634 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3635 return True
347de493
PH
3636 return op(actual_value)
3637
3638 raise ValueError('Invalid filter part %r' % filter_part)
3639
3640
8f18aca8 3641def match_str(filter_str, dct, incomplete=False):
6db9c4d5 3642 """ Filter a dictionary with a simple string syntax.
3643 @returns Whether the filter passes
3644 @param incomplete Set of keys that are expected to be missing from dct.
3645 Can be True/False to indicate that all/none of the keys may be missing.
3646 All conditions on incomplete keys pass if the key is missing.
8f18aca8 3647 """
347de493 3648 return all(
8f18aca8 3649 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3650 for filter_part in re.split(r'(?<!\\)&', filter_str))
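
# Editor's sketch (hypothetical helper, not part of the original file): the
# --match-filter mini-language is a set of &-separated comparisons on info-dict
# fields, with the `incomplete` argument controlling how missing keys are treated.
def _example_match_str():
    info = {'duration': 300, 'like_count': None, 'title': 'An Example'}
    assert match_str('duration > 60 & title ~= (?i)example', info)
    assert not match_str('like_count >= 100', info)
    assert match_str('like_count >= 100', info, incomplete={'like_count'})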
347de493
PH
3651
3652
b1a7cd05 3653def match_filter_func(filters):
3654 if not filters:
d1b5f70b 3655 return None
492272fe 3656 filters = set(variadic(filters))
d1b5f70b 3657
492272fe 3658 interactive = '-' in filters
3659 if interactive:
3660 filters.remove('-')
3661
3662 def _match_func(info_dict, incomplete=False):
3663 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3664 return NO_DEFAULT if interactive and not incomplete else None
347de493 3665 else:
3bec830a 3666 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
b1a7cd05 3667 filter_str = ') | ('.join(map(str.strip, filters))
3668 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
347de493 3669 return _match_func
91410c9b
PH
3670
3671
f2df4071 3672class download_range_func:
3673 def __init__(self, chapters, ranges):
3674 self.chapters, self.ranges = chapters, ranges
3675
3676 def __call__(self, info_dict, ydl):
5ec1b6b7 3677 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
56ba69e4 3678 else 'Cannot match chapters since chapter information is unavailable')
f2df4071 3679 for regex in self.chapters or []:
5ec1b6b7 3680 for i, chapter in enumerate(info_dict.get('chapters') or []):
3681 if re.search(regex, chapter['title']):
3682 warning = None
3683 yield {**chapter, 'index': i}
f2df4071 3684 if self.chapters and warning:
5ec1b6b7 3685 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3686
f2df4071 3687 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
5ec1b6b7 3688
f2df4071 3689 def __eq__(self, other):
3690 return (isinstance(other, download_range_func)
3691 and self.chapters == other.chapters and self.ranges == other.ranges)
5ec1b6b7 3692
3693
bf6427d2
YCH
3694def parse_dfxp_time_expr(time_expr):
3695 if not time_expr:
d631d5f9 3696 return
bf6427d2 3697
1d485a1a 3698 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
bf6427d2
YCH
3699 if mobj:
3700 return float(mobj.group('time_offset'))
3701
db2fe38b 3702 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3703 if mobj:
db2fe38b 3704 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
3705
3706
c1c924ab 3707def srt_subtitles_timecode(seconds):
aa7785f8 3708 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3709
3710
3711def ass_subtitles_timecode(seconds):
3712 time = timetuple_from_msec(seconds * 1000)
3713 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
bf6427d2
YCH
3714
3715
3716def dfxp2srt(dfxp_data):
3869028f
YCH
3717 '''
3718 @param dfxp_data A bytes-like object containing DFXP data
3719 @returns A unicode object containing converted SRT data
3720 '''
5b995f71 3721 LEGACY_NAMESPACES = (
3869028f
YCH
3722 (b'http://www.w3.org/ns/ttml', [
3723 b'http://www.w3.org/2004/11/ttaf1',
3724 b'http://www.w3.org/2006/04/ttaf1',
3725 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3726 ]),
3869028f
YCH
3727 (b'http://www.w3.org/ns/ttml#styling', [
3728 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
3729 ]),
3730 )
3731
3732 SUPPORTED_STYLING = [
3733 'color',
3734 'fontFamily',
3735 'fontSize',
3736 'fontStyle',
3737 'fontWeight',
3738 'textDecoration'
3739 ]
3740
4e335771 3741 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 3742 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 3743 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 3744 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 3745 })
bf6427d2 3746
5b995f71
RA
3747 styles = {}
3748 default_style = {}
3749
86e5f3ed 3750 class TTMLPElementParser:
5b995f71
RA
3751 _out = ''
3752 _unclosed_elements = []
3753 _applied_styles = []
bf6427d2 3754
2b14cb56 3755 def start(self, tag, attrib):
5b995f71
RA
3756 if tag in (_x('ttml:br'), 'br'):
3757 self._out += '\n'
3758 else:
3759 unclosed_elements = []
3760 style = {}
3761 element_style_id = attrib.get('style')
3762 if default_style:
3763 style.update(default_style)
3764 if element_style_id:
3765 style.update(styles.get(element_style_id, {}))
3766 for prop in SUPPORTED_STYLING:
3767 prop_val = attrib.get(_x('tts:' + prop))
3768 if prop_val:
3769 style[prop] = prop_val
3770 if style:
3771 font = ''
3772 for k, v in sorted(style.items()):
3773 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3774 continue
3775 if k == 'color':
3776 font += ' color="%s"' % v
3777 elif k == 'fontSize':
3778 font += ' size="%s"' % v
3779 elif k == 'fontFamily':
3780 font += ' face="%s"' % v
3781 elif k == 'fontWeight' and v == 'bold':
3782 self._out += '<b>'
3783 unclosed_elements.append('b')
3784 elif k == 'fontStyle' and v == 'italic':
3785 self._out += '<i>'
3786 unclosed_elements.append('i')
3787 elif k == 'textDecoration' and v == 'underline':
3788 self._out += '<u>'
3789 unclosed_elements.append('u')
3790 if font:
3791 self._out += '<font' + font + '>'
3792 unclosed_elements.append('font')
3793 applied_style = {}
3794 if self._applied_styles:
3795 applied_style.update(self._applied_styles[-1])
3796 applied_style.update(style)
3797 self._applied_styles.append(applied_style)
3798 self._unclosed_elements.append(unclosed_elements)
bf6427d2 3799
2b14cb56 3800 def end(self, tag):
5b995f71
RA
3801 if tag not in (_x('ttml:br'), 'br'):
3802 unclosed_elements = self._unclosed_elements.pop()
3803 for element in reversed(unclosed_elements):
3804 self._out += '</%s>' % element
3805 if unclosed_elements and self._applied_styles:
3806 self._applied_styles.pop()
bf6427d2 3807
2b14cb56 3808 def data(self, data):
5b995f71 3809 self._out += data
2b14cb56 3810
3811 def close(self):
5b995f71 3812 return self._out.strip()
2b14cb56 3813
3814 def parse_node(node):
3815 target = TTMLPElementParser()
3816 parser = xml.etree.ElementTree.XMLParser(target=target)
3817 parser.feed(xml.etree.ElementTree.tostring(node))
3818 return parser.close()
bf6427d2 3819
5b995f71
RA
3820 for k, v in LEGACY_NAMESPACES:
3821 for ns in v:
3822 dfxp_data = dfxp_data.replace(ns, k)
3823
3869028f 3824 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 3825 out = []
5b995f71 3826 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
3827
3828 if not paras:
3829 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 3830
5b995f71
RA
3831 repeat = False
3832 while True:
3833 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
3834 style_id = style.get('id') or style.get(_x('xml:id'))
3835 if not style_id:
3836 continue
5b995f71
RA
3837 parent_style_id = style.get('style')
3838 if parent_style_id:
3839 if parent_style_id not in styles:
3840 repeat = True
3841 continue
3842 styles[style_id] = styles[parent_style_id].copy()
3843 for prop in SUPPORTED_STYLING:
3844 prop_val = style.get(_x('tts:' + prop))
3845 if prop_val:
3846 styles.setdefault(style_id, {})[prop] = prop_val
3847 if repeat:
3848 repeat = False
3849 else:
3850 break
3851
3852 for p in ('body', 'div'):
3853 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3854 if ele is None:
3855 continue
3856 style = styles.get(ele.get('style'))
3857 if not style:
3858 continue
3859 default_style.update(style)
3860
bf6427d2 3861 for para, index in zip(paras, itertools.count(1)):
d631d5f9 3862 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 3863 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
3864 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3865 if begin_time is None:
3866 continue
7dff0363 3867 if not end_time:
d631d5f9
YCH
3868 if not dur:
3869 continue
3870 end_time = begin_time + dur
bf6427d2
YCH
3871 out.append('%d\n%s --> %s\n%s\n\n' % (
3872 index,
c1c924ab
YCH
3873 srt_subtitles_timecode(begin_time),
3874 srt_subtitles_timecode(end_time),
bf6427d2
YCH
3875 parse_node(para)))
3876
3877 return ''.join(out)
3878
3879
c487cf00 3880def cli_option(params, command_option, param, separator=None):
66e289ba 3881 param = params.get(param)
c487cf00 3882 return ([] if param is None
3883 else [command_option, str(param)] if separator is None
3884 else [f'{command_option}{separator}{param}'])
66e289ba
S
3885
3886
3887def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3888 param = params.get(param)
c487cf00 3889 assert param in (True, False, None)
3890 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
66e289ba
S
3891
3892
3893def cli_valueless_option(params, command_option, param, expected_value=True):
c487cf00 3894 return [command_option] if params.get(param) == expected_value else []
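
# Editor's sketch (hypothetical helper, not part of the original file): the cli_*
# helpers translate yt-dlp params into command-line arguments for external tools.
def _example_cli_options():
    params = {'proxy': 'socks5://127.0.0.1:1080', 'verbose': True, 'nocheckcertificate': False}
    assert cli_option(params, '--proxy', 'proxy') == ['--proxy', 'socks5://127.0.0.1:1080']
    assert cli_bool_option(params, '--check-cert', 'nocheckcertificate', 'false', 'true') == ['--check-cert', 'true']
    assert cli_valueless_option(params, '--verbose', 'verbose') == ['--verbose']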
66e289ba
S
3895
3896
e92caff5 3897def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 3898 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 3899 if use_compat:
5b1ecbb3 3900 return argdict
3901 else:
3902 argdict = None
eab9b2bc 3903 if argdict is None:
5b1ecbb3 3904 return default
eab9b2bc 3905 assert isinstance(argdict, dict)
3906
e92caff5 3907 assert isinstance(keys, (list, tuple))
3908 for key_list in keys:
e92caff5 3909 arg_list = list(filter(
3910 lambda x: x is not None,
6606817a 3911 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 3912 if arg_list:
3913 return [arg for args in arg_list for arg in args]
3914 return default
66e289ba 3915
6251555f 3916
330690a2 3917def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3918 main_key, exe = main_key.lower(), exe.lower()
3919 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3920 keys = [f'{root_key}{k}' for k in (keys or [''])]
3921 if root_key in keys:
3922 if main_key != exe:
3923 keys.append((main_key, exe))
3924 keys.append('default')
3925 else:
3926 use_compat = False
3927 return cli_configuration_args(argdict, keys, default, use_compat)
3928
66e289ba 3929
86e5f3ed 3930class ISO639Utils:
39672624
YCH
3931 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3932 _lang_map = {
3933 'aa': 'aar',
3934 'ab': 'abk',
3935 'ae': 'ave',
3936 'af': 'afr',
3937 'ak': 'aka',
3938 'am': 'amh',
3939 'an': 'arg',
3940 'ar': 'ara',
3941 'as': 'asm',
3942 'av': 'ava',
3943 'ay': 'aym',
3944 'az': 'aze',
3945 'ba': 'bak',
3946 'be': 'bel',
3947 'bg': 'bul',
3948 'bh': 'bih',
3949 'bi': 'bis',
3950 'bm': 'bam',
3951 'bn': 'ben',
3952 'bo': 'bod',
3953 'br': 'bre',
3954 'bs': 'bos',
3955 'ca': 'cat',
3956 'ce': 'che',
3957 'ch': 'cha',
3958 'co': 'cos',
3959 'cr': 'cre',
3960 'cs': 'ces',
3961 'cu': 'chu',
3962 'cv': 'chv',
3963 'cy': 'cym',
3964 'da': 'dan',
3965 'de': 'deu',
3966 'dv': 'div',
3967 'dz': 'dzo',
3968 'ee': 'ewe',
3969 'el': 'ell',
3970 'en': 'eng',
3971 'eo': 'epo',
3972 'es': 'spa',
3973 'et': 'est',
3974 'eu': 'eus',
3975 'fa': 'fas',
3976 'ff': 'ful',
3977 'fi': 'fin',
3978 'fj': 'fij',
3979 'fo': 'fao',
3980 'fr': 'fra',
3981 'fy': 'fry',
3982 'ga': 'gle',
3983 'gd': 'gla',
3984 'gl': 'glg',
3985 'gn': 'grn',
3986 'gu': 'guj',
3987 'gv': 'glv',
3988 'ha': 'hau',
3989 'he': 'heb',
b7acc835 3990 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
3991 'hi': 'hin',
3992 'ho': 'hmo',
3993 'hr': 'hrv',
3994 'ht': 'hat',
3995 'hu': 'hun',
3996 'hy': 'hye',
3997 'hz': 'her',
3998 'ia': 'ina',
3999 'id': 'ind',
b7acc835 4000 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
4001 'ie': 'ile',
4002 'ig': 'ibo',
4003 'ii': 'iii',
4004 'ik': 'ipk',
4005 'io': 'ido',
4006 'is': 'isl',
4007 'it': 'ita',
4008 'iu': 'iku',
4009 'ja': 'jpn',
4010 'jv': 'jav',
4011 'ka': 'kat',
4012 'kg': 'kon',
4013 'ki': 'kik',
4014 'kj': 'kua',
4015 'kk': 'kaz',
4016 'kl': 'kal',
4017 'km': 'khm',
4018 'kn': 'kan',
4019 'ko': 'kor',
4020 'kr': 'kau',
4021 'ks': 'kas',
4022 'ku': 'kur',
4023 'kv': 'kom',
4024 'kw': 'cor',
4025 'ky': 'kir',
4026 'la': 'lat',
4027 'lb': 'ltz',
4028 'lg': 'lug',
4029 'li': 'lim',
4030 'ln': 'lin',
4031 'lo': 'lao',
4032 'lt': 'lit',
4033 'lu': 'lub',
4034 'lv': 'lav',
4035 'mg': 'mlg',
4036 'mh': 'mah',
4037 'mi': 'mri',
4038 'mk': 'mkd',
4039 'ml': 'mal',
4040 'mn': 'mon',
4041 'mr': 'mar',
4042 'ms': 'msa',
4043 'mt': 'mlt',
4044 'my': 'mya',
4045 'na': 'nau',
4046 'nb': 'nob',
4047 'nd': 'nde',
4048 'ne': 'nep',
4049 'ng': 'ndo',
4050 'nl': 'nld',
4051 'nn': 'nno',
4052 'no': 'nor',
4053 'nr': 'nbl',
4054 'nv': 'nav',
4055 'ny': 'nya',
4056 'oc': 'oci',
4057 'oj': 'oji',
4058 'om': 'orm',
4059 'or': 'ori',
4060 'os': 'oss',
4061 'pa': 'pan',
4062 'pi': 'pli',
4063 'pl': 'pol',
4064 'ps': 'pus',
4065 'pt': 'por',
4066 'qu': 'que',
4067 'rm': 'roh',
4068 'rn': 'run',
4069 'ro': 'ron',
4070 'ru': 'rus',
4071 'rw': 'kin',
4072 'sa': 'san',
4073 'sc': 'srd',
4074 'sd': 'snd',
4075 'se': 'sme',
4076 'sg': 'sag',
4077 'si': 'sin',
4078 'sk': 'slk',
4079 'sl': 'slv',
4080 'sm': 'smo',
4081 'sn': 'sna',
4082 'so': 'som',
4083 'sq': 'sqi',
4084 'sr': 'srp',
4085 'ss': 'ssw',
4086 'st': 'sot',
4087 'su': 'sun',
4088 'sv': 'swe',
4089 'sw': 'swa',
4090 'ta': 'tam',
4091 'te': 'tel',
4092 'tg': 'tgk',
4093 'th': 'tha',
4094 'ti': 'tir',
4095 'tk': 'tuk',
4096 'tl': 'tgl',
4097 'tn': 'tsn',
4098 'to': 'ton',
4099 'tr': 'tur',
4100 'ts': 'tso',
4101 'tt': 'tat',
4102 'tw': 'twi',
4103 'ty': 'tah',
4104 'ug': 'uig',
4105 'uk': 'ukr',
4106 'ur': 'urd',
4107 'uz': 'uzb',
4108 've': 'ven',
4109 'vi': 'vie',
4110 'vo': 'vol',
4111 'wa': 'wln',
4112 'wo': 'wol',
4113 'xh': 'xho',
4114 'yi': 'yid',
e9a50fba 4115 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
4116 'yo': 'yor',
4117 'za': 'zha',
4118 'zh': 'zho',
4119 'zu': 'zul',
4120 }
4121
4122 @classmethod
4123 def short2long(cls, code):
4124 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4125 return cls._lang_map.get(code[:2])
4126
4127 @classmethod
4128 def long2short(cls, code):
4129 """Convert language code from ISO 639-2/T to ISO 639-1"""
4130 for short_name, long_name in cls._lang_map.items():
4131 if long_name == code:
4132 return short_name
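
# Editor's sketch (hypothetical helper, not part of the original file): converting
# between ISO 639-1 and ISO 639-2/T language codes.
def _example_iso639():
    assert ISO639Utils.short2long('en') == 'eng'
    assert ISO639Utils.long2short('deu') == 'de'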
4133
4134
86e5f3ed 4135class ISO3166Utils:
4eb10f66
YCH
4136 # From http://data.okfn.org/data/core/country-list
4137 _country_map = {
4138 'AF': 'Afghanistan',
4139 'AX': 'Åland Islands',
4140 'AL': 'Albania',
4141 'DZ': 'Algeria',
4142 'AS': 'American Samoa',
4143 'AD': 'Andorra',
4144 'AO': 'Angola',
4145 'AI': 'Anguilla',
4146 'AQ': 'Antarctica',
4147 'AG': 'Antigua and Barbuda',
4148 'AR': 'Argentina',
4149 'AM': 'Armenia',
4150 'AW': 'Aruba',
4151 'AU': 'Australia',
4152 'AT': 'Austria',
4153 'AZ': 'Azerbaijan',
4154 'BS': 'Bahamas',
4155 'BH': 'Bahrain',
4156 'BD': 'Bangladesh',
4157 'BB': 'Barbados',
4158 'BY': 'Belarus',
4159 'BE': 'Belgium',
4160 'BZ': 'Belize',
4161 'BJ': 'Benin',
4162 'BM': 'Bermuda',
4163 'BT': 'Bhutan',
4164 'BO': 'Bolivia, Plurinational State of',
4165 'BQ': 'Bonaire, Sint Eustatius and Saba',
4166 'BA': 'Bosnia and Herzegovina',
4167 'BW': 'Botswana',
4168 'BV': 'Bouvet Island',
4169 'BR': 'Brazil',
4170 'IO': 'British Indian Ocean Territory',
4171 'BN': 'Brunei Darussalam',
4172 'BG': 'Bulgaria',
4173 'BF': 'Burkina Faso',
4174 'BI': 'Burundi',
4175 'KH': 'Cambodia',
4176 'CM': 'Cameroon',
4177 'CA': 'Canada',
4178 'CV': 'Cape Verde',
4179 'KY': 'Cayman Islands',
4180 'CF': 'Central African Republic',
4181 'TD': 'Chad',
4182 'CL': 'Chile',
4183 'CN': 'China',
4184 'CX': 'Christmas Island',
4185 'CC': 'Cocos (Keeling) Islands',
4186 'CO': 'Colombia',
4187 'KM': 'Comoros',
4188 'CG': 'Congo',
4189 'CD': 'Congo, the Democratic Republic of the',
4190 'CK': 'Cook Islands',
4191 'CR': 'Costa Rica',
4192 'CI': 'Côte d\'Ivoire',
4193 'HR': 'Croatia',
4194 'CU': 'Cuba',
4195 'CW': 'Curaçao',
4196 'CY': 'Cyprus',
4197 'CZ': 'Czech Republic',
4198 'DK': 'Denmark',
4199 'DJ': 'Djibouti',
4200 'DM': 'Dominica',
4201 'DO': 'Dominican Republic',
4202 'EC': 'Ecuador',
4203 'EG': 'Egypt',
4204 'SV': 'El Salvador',
4205 'GQ': 'Equatorial Guinea',
4206 'ER': 'Eritrea',
4207 'EE': 'Estonia',
4208 'ET': 'Ethiopia',
4209 'FK': 'Falkland Islands (Malvinas)',
4210 'FO': 'Faroe Islands',
4211 'FJ': 'Fiji',
4212 'FI': 'Finland',
4213 'FR': 'France',
4214 'GF': 'French Guiana',
4215 'PF': 'French Polynesia',
4216 'TF': 'French Southern Territories',
4217 'GA': 'Gabon',
4218 'GM': 'Gambia',
4219 'GE': 'Georgia',
4220 'DE': 'Germany',
4221 'GH': 'Ghana',
4222 'GI': 'Gibraltar',
4223 'GR': 'Greece',
4224 'GL': 'Greenland',
4225 'GD': 'Grenada',
4226 'GP': 'Guadeloupe',
4227 'GU': 'Guam',
4228 'GT': 'Guatemala',
4229 'GG': 'Guernsey',
4230 'GN': 'Guinea',
4231 'GW': 'Guinea-Bissau',
4232 'GY': 'Guyana',
4233 'HT': 'Haiti',
4234 'HM': 'Heard Island and McDonald Islands',
4235 'VA': 'Holy See (Vatican City State)',
4236 'HN': 'Honduras',
4237 'HK': 'Hong Kong',
4238 'HU': 'Hungary',
4239 'IS': 'Iceland',
4240 'IN': 'India',
4241 'ID': 'Indonesia',
4242 'IR': 'Iran, Islamic Republic of',
4243 'IQ': 'Iraq',
4244 'IE': 'Ireland',
4245 'IM': 'Isle of Man',
4246 'IL': 'Israel',
4247 'IT': 'Italy',
4248 'JM': 'Jamaica',
4249 'JP': 'Japan',
4250 'JE': 'Jersey',
4251 'JO': 'Jordan',
4252 'KZ': 'Kazakhstan',
4253 'KE': 'Kenya',
4254 'KI': 'Kiribati',
4255 'KP': 'Korea, Democratic People\'s Republic of',
4256 'KR': 'Korea, Republic of',
4257 'KW': 'Kuwait',
4258 'KG': 'Kyrgyzstan',
4259 'LA': 'Lao People\'s Democratic Republic',
4260 'LV': 'Latvia',
4261 'LB': 'Lebanon',
4262 'LS': 'Lesotho',
4263 'LR': 'Liberia',
4264 'LY': 'Libya',
4265 'LI': 'Liechtenstein',
4266 'LT': 'Lithuania',
4267 'LU': 'Luxembourg',
4268 'MO': 'Macao',
4269 'MK': 'Macedonia, the Former Yugoslav Republic of',
4270 'MG': 'Madagascar',
4271 'MW': 'Malawi',
4272 'MY': 'Malaysia',
4273 'MV': 'Maldives',
4274 'ML': 'Mali',
4275 'MT': 'Malta',
4276 'MH': 'Marshall Islands',
4277 'MQ': 'Martinique',
4278 'MR': 'Mauritania',
4279 'MU': 'Mauritius',
4280 'YT': 'Mayotte',
4281 'MX': 'Mexico',
4282 'FM': 'Micronesia, Federated States of',
4283 'MD': 'Moldova, Republic of',
4284 'MC': 'Monaco',
4285 'MN': 'Mongolia',
4286 'ME': 'Montenegro',
4287 'MS': 'Montserrat',
4288 'MA': 'Morocco',
4289 'MZ': 'Mozambique',
4290 'MM': 'Myanmar',
4291 'NA': 'Namibia',
4292 'NR': 'Nauru',
4293 'NP': 'Nepal',
4294 'NL': 'Netherlands',
4295 'NC': 'New Caledonia',
4296 'NZ': 'New Zealand',
4297 'NI': 'Nicaragua',
4298 'NE': 'Niger',
4299 'NG': 'Nigeria',
4300 'NU': 'Niue',
4301 'NF': 'Norfolk Island',
4302 'MP': 'Northern Mariana Islands',
4303 'NO': 'Norway',
4304 'OM': 'Oman',
4305 'PK': 'Pakistan',
4306 'PW': 'Palau',
4307 'PS': 'Palestine, State of',
4308 'PA': 'Panama',
4309 'PG': 'Papua New Guinea',
4310 'PY': 'Paraguay',
4311 'PE': 'Peru',
4312 'PH': 'Philippines',
4313 'PN': 'Pitcairn',
4314 'PL': 'Poland',
4315 'PT': 'Portugal',
4316 'PR': 'Puerto Rico',
4317 'QA': 'Qatar',
4318 'RE': 'Réunion',
4319 'RO': 'Romania',
4320 'RU': 'Russian Federation',
4321 'RW': 'Rwanda',
4322 'BL': 'Saint Barthélemy',
4323 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4324 'KN': 'Saint Kitts and Nevis',
4325 'LC': 'Saint Lucia',
4326 'MF': 'Saint Martin (French part)',
4327 'PM': 'Saint Pierre and Miquelon',
4328 'VC': 'Saint Vincent and the Grenadines',
4329 'WS': 'Samoa',
4330 'SM': 'San Marino',
4331 'ST': 'Sao Tome and Principe',
4332 'SA': 'Saudi Arabia',
4333 'SN': 'Senegal',
4334 'RS': 'Serbia',
4335 'SC': 'Seychelles',
4336 'SL': 'Sierra Leone',
4337 'SG': 'Singapore',
4338 'SX': 'Sint Maarten (Dutch part)',
4339 'SK': 'Slovakia',
4340 'SI': 'Slovenia',
4341 'SB': 'Solomon Islands',
4342 'SO': 'Somalia',
4343 'ZA': 'South Africa',
4344 'GS': 'South Georgia and the South Sandwich Islands',
4345 'SS': 'South Sudan',
4346 'ES': 'Spain',
4347 'LK': 'Sri Lanka',
4348 'SD': 'Sudan',
4349 'SR': 'Suriname',
4350 'SJ': 'Svalbard and Jan Mayen',
4351 'SZ': 'Swaziland',
4352 'SE': 'Sweden',
4353 'CH': 'Switzerland',
4354 'SY': 'Syrian Arab Republic',
4355 'TW': 'Taiwan, Province of China',
4356 'TJ': 'Tajikistan',
4357 'TZ': 'Tanzania, United Republic of',
4358 'TH': 'Thailand',
4359 'TL': 'Timor-Leste',
4360 'TG': 'Togo',
4361 'TK': 'Tokelau',
4362 'TO': 'Tonga',
4363 'TT': 'Trinidad and Tobago',
4364 'TN': 'Tunisia',
4365 'TR': 'Turkey',
4366 'TM': 'Turkmenistan',
4367 'TC': 'Turks and Caicos Islands',
4368 'TV': 'Tuvalu',
4369 'UG': 'Uganda',
4370 'UA': 'Ukraine',
4371 'AE': 'United Arab Emirates',
4372 'GB': 'United Kingdom',
4373 'US': 'United States',
4374 'UM': 'United States Minor Outlying Islands',
4375 'UY': 'Uruguay',
4376 'UZ': 'Uzbekistan',
4377 'VU': 'Vanuatu',
4378 'VE': 'Venezuela, Bolivarian Republic of',
4379 'VN': 'Viet Nam',
4380 'VG': 'Virgin Islands, British',
4381 'VI': 'Virgin Islands, U.S.',
4382 'WF': 'Wallis and Futuna',
4383 'EH': 'Western Sahara',
4384 'YE': 'Yemen',
4385 'ZM': 'Zambia',
4386 'ZW': 'Zimbabwe',
2f97cc61 4387 # Not ISO 3166 codes, but used for IP blocks
4388 'AP': 'Asia/Pacific Region',
4389 'EU': 'Europe',
4eb10f66
YCH
4390 }
4391
4392 @classmethod
4393 def short2full(cls, code):
4394 """Convert an ISO 3166-2 country code to the corresponding full name"""
4395 return cls._country_map.get(code.upper())
4396
4397
86e5f3ed 4398class GeoUtils:
773f291d
S
4399 # Major IPv4 address blocks per country
4400 _country_ip_map = {
53896ca5 4401 'AD': '46.172.224.0/19',
773f291d
S
4402 'AE': '94.200.0.0/13',
4403 'AF': '149.54.0.0/17',
4404 'AG': '209.59.64.0/18',
4405 'AI': '204.14.248.0/21',
4406 'AL': '46.99.0.0/16',
4407 'AM': '46.70.0.0/15',
4408 'AO': '105.168.0.0/13',
53896ca5
S
4409 'AP': '182.50.184.0/21',
4410 'AQ': '23.154.160.0/24',
773f291d
S
4411 'AR': '181.0.0.0/12',
4412 'AS': '202.70.112.0/20',
53896ca5 4413 'AT': '77.116.0.0/14',
773f291d
S
4414 'AU': '1.128.0.0/11',
4415 'AW': '181.41.0.0/18',
53896ca5
S
4416 'AX': '185.217.4.0/22',
4417 'AZ': '5.197.0.0/16',
773f291d
S
4418 'BA': '31.176.128.0/17',
4419 'BB': '65.48.128.0/17',
4420 'BD': '114.130.0.0/16',
4421 'BE': '57.0.0.0/8',
53896ca5 4422 'BF': '102.178.0.0/15',
773f291d
S
4423 'BG': '95.42.0.0/15',
4424 'BH': '37.131.0.0/17',
4425 'BI': '154.117.192.0/18',
4426 'BJ': '137.255.0.0/16',
53896ca5 4427 'BL': '185.212.72.0/23',
773f291d
S
4428 'BM': '196.12.64.0/18',
4429 'BN': '156.31.0.0/16',
4430 'BO': '161.56.0.0/16',
4431 'BQ': '161.0.80.0/20',
53896ca5 4432 'BR': '191.128.0.0/12',
773f291d
S
4433 'BS': '24.51.64.0/18',
4434 'BT': '119.2.96.0/19',
4435 'BW': '168.167.0.0/16',
4436 'BY': '178.120.0.0/13',
4437 'BZ': '179.42.192.0/18',
4438 'CA': '99.224.0.0/11',
4439 'CD': '41.243.0.0/16',
53896ca5
S
4440 'CF': '197.242.176.0/21',
4441 'CG': '160.113.0.0/16',
773f291d 4442 'CH': '85.0.0.0/13',
53896ca5 4443 'CI': '102.136.0.0/14',
773f291d
S
4444 'CK': '202.65.32.0/19',
4445 'CL': '152.172.0.0/14',
53896ca5 4446 'CM': '102.244.0.0/14',
773f291d
S
4447 'CN': '36.128.0.0/10',
4448 'CO': '181.240.0.0/12',
4449 'CR': '201.192.0.0/12',
4450 'CU': '152.206.0.0/15',
4451 'CV': '165.90.96.0/19',
4452 'CW': '190.88.128.0/17',
53896ca5 4453 'CY': '31.153.0.0/16',
773f291d
S
4454 'CZ': '88.100.0.0/14',
4455 'DE': '53.0.0.0/8',
4456 'DJ': '197.241.0.0/17',
4457 'DK': '87.48.0.0/12',
4458 'DM': '192.243.48.0/20',
4459 'DO': '152.166.0.0/15',
4460 'DZ': '41.96.0.0/12',
4461 'EC': '186.68.0.0/15',
4462 'EE': '90.190.0.0/15',
4463 'EG': '156.160.0.0/11',
4464 'ER': '196.200.96.0/20',
4465 'ES': '88.0.0.0/11',
4466 'ET': '196.188.0.0/14',
4467 'EU': '2.16.0.0/13',
4468 'FI': '91.152.0.0/13',
4469 'FJ': '144.120.0.0/16',
53896ca5 4470 'FK': '80.73.208.0/21',
773f291d
S
4471 'FM': '119.252.112.0/20',
4472 'FO': '88.85.32.0/19',
4473 'FR': '90.0.0.0/9',
4474 'GA': '41.158.0.0/15',
4475 'GB': '25.0.0.0/8',
4476 'GD': '74.122.88.0/21',
4477 'GE': '31.146.0.0/16',
4478 'GF': '161.22.64.0/18',
4479 'GG': '62.68.160.0/19',
53896ca5
S
4480 'GH': '154.160.0.0/12',
4481 'GI': '95.164.0.0/16',
773f291d
S
4482 'GL': '88.83.0.0/19',
4483 'GM': '160.182.0.0/15',
4484 'GN': '197.149.192.0/18',
4485 'GP': '104.250.0.0/19',
4486 'GQ': '105.235.224.0/20',
4487 'GR': '94.64.0.0/13',
4488 'GT': '168.234.0.0/16',
4489 'GU': '168.123.0.0/16',
4490 'GW': '197.214.80.0/20',
4491 'GY': '181.41.64.0/18',
4492 'HK': '113.252.0.0/14',
4493 'HN': '181.210.0.0/16',
4494 'HR': '93.136.0.0/13',
4495 'HT': '148.102.128.0/17',
4496 'HU': '84.0.0.0/14',
4497 'ID': '39.192.0.0/10',
4498 'IE': '87.32.0.0/12',
4499 'IL': '79.176.0.0/13',
4500 'IM': '5.62.80.0/20',
4501 'IN': '117.192.0.0/10',
4502 'IO': '203.83.48.0/21',
4503 'IQ': '37.236.0.0/14',
4504 'IR': '2.176.0.0/12',
4505 'IS': '82.221.0.0/16',
4506 'IT': '79.0.0.0/10',
4507 'JE': '87.244.64.0/18',
4508 'JM': '72.27.0.0/17',
4509 'JO': '176.29.0.0/16',
53896ca5 4510 'JP': '133.0.0.0/8',
773f291d
S
4511 'KE': '105.48.0.0/12',
4512 'KG': '158.181.128.0/17',
4513 'KH': '36.37.128.0/17',
4514 'KI': '103.25.140.0/22',
4515 'KM': '197.255.224.0/20',
53896ca5 4516 'KN': '198.167.192.0/19',
773f291d
S
4517 'KP': '175.45.176.0/22',
4518 'KR': '175.192.0.0/10',
4519 'KW': '37.36.0.0/14',
4520 'KY': '64.96.0.0/15',
4521 'KZ': '2.72.0.0/13',
4522 'LA': '115.84.64.0/18',
4523 'LB': '178.135.0.0/16',
53896ca5 4524 'LC': '24.92.144.0/20',
773f291d
S
4525 'LI': '82.117.0.0/19',
4526 'LK': '112.134.0.0/15',
53896ca5 4527 'LR': '102.183.0.0/16',
773f291d
S
4528 'LS': '129.232.0.0/17',
4529 'LT': '78.56.0.0/13',
4530 'LU': '188.42.0.0/16',
4531 'LV': '46.109.0.0/16',
4532 'LY': '41.252.0.0/14',
4533 'MA': '105.128.0.0/11',
4534 'MC': '88.209.64.0/18',
4535 'MD': '37.246.0.0/16',
4536 'ME': '178.175.0.0/17',
4537 'MF': '74.112.232.0/21',
4538 'MG': '154.126.0.0/17',
4539 'MH': '117.103.88.0/21',
4540 'MK': '77.28.0.0/15',
4541 'ML': '154.118.128.0/18',
4542 'MM': '37.111.0.0/17',
4543 'MN': '49.0.128.0/17',
4544 'MO': '60.246.0.0/16',
4545 'MP': '202.88.64.0/20',
4546 'MQ': '109.203.224.0/19',
4547 'MR': '41.188.64.0/18',
4548 'MS': '208.90.112.0/22',
4549 'MT': '46.11.0.0/16',
4550 'MU': '105.16.0.0/12',
4551 'MV': '27.114.128.0/18',
53896ca5 4552 'MW': '102.70.0.0/15',
773f291d
S
4553 'MX': '187.192.0.0/11',
4554 'MY': '175.136.0.0/13',
4555 'MZ': '197.218.0.0/15',
4556 'NA': '41.182.0.0/16',
4557 'NC': '101.101.0.0/18',
4558 'NE': '197.214.0.0/18',
4559 'NF': '203.17.240.0/22',
4560 'NG': '105.112.0.0/12',
4561 'NI': '186.76.0.0/15',
4562 'NL': '145.96.0.0/11',
4563 'NO': '84.208.0.0/13',
4564 'NP': '36.252.0.0/15',
4565 'NR': '203.98.224.0/19',
4566 'NU': '49.156.48.0/22',
4567 'NZ': '49.224.0.0/14',
4568 'OM': '5.36.0.0/15',
4569 'PA': '186.72.0.0/15',
4570 'PE': '186.160.0.0/14',
4571 'PF': '123.50.64.0/18',
4572 'PG': '124.240.192.0/19',
4573 'PH': '49.144.0.0/13',
4574 'PK': '39.32.0.0/11',
4575 'PL': '83.0.0.0/11',
4576 'PM': '70.36.0.0/20',
4577 'PR': '66.50.0.0/16',
4578 'PS': '188.161.0.0/16',
4579 'PT': '85.240.0.0/13',
4580 'PW': '202.124.224.0/20',
4581 'PY': '181.120.0.0/14',
4582 'QA': '37.210.0.0/15',
53896ca5 4583 'RE': '102.35.0.0/16',
773f291d 4584 'RO': '79.112.0.0/13',
53896ca5 4585 'RS': '93.86.0.0/15',
773f291d 4586 'RU': '5.136.0.0/13',
53896ca5 4587 'RW': '41.186.0.0/16',
773f291d
S
4588 'SA': '188.48.0.0/13',
4589 'SB': '202.1.160.0/19',
4590 'SC': '154.192.0.0/11',
53896ca5 4591 'SD': '102.120.0.0/13',
773f291d 4592 'SE': '78.64.0.0/12',
53896ca5 4593 'SG': '8.128.0.0/10',
773f291d
S
4594 'SI': '188.196.0.0/14',
4595 'SK': '78.98.0.0/15',
53896ca5 4596 'SL': '102.143.0.0/17',
773f291d
S
4597 'SM': '89.186.32.0/19',
4598 'SN': '41.82.0.0/15',
53896ca5 4599 'SO': '154.115.192.0/18',
773f291d
S
4600 'SR': '186.179.128.0/17',
4601 'SS': '105.235.208.0/21',
4602 'ST': '197.159.160.0/19',
4603 'SV': '168.243.0.0/16',
4604 'SX': '190.102.0.0/20',
4605 'SY': '5.0.0.0/16',
4606 'SZ': '41.84.224.0/19',
4607 'TC': '65.255.48.0/20',
4608 'TD': '154.68.128.0/19',
4609 'TG': '196.168.0.0/14',
4610 'TH': '171.96.0.0/13',
4611 'TJ': '85.9.128.0/18',
4612 'TK': '27.96.24.0/21',
4613 'TL': '180.189.160.0/20',
4614 'TM': '95.85.96.0/19',
4615 'TN': '197.0.0.0/11',
4616 'TO': '175.176.144.0/21',
4617 'TR': '78.160.0.0/11',
4618 'TT': '186.44.0.0/15',
4619 'TV': '202.2.96.0/19',
4620 'TW': '120.96.0.0/11',
4621 'TZ': '156.156.0.0/14',
53896ca5
S
4622 'UA': '37.52.0.0/14',
4623 'UG': '102.80.0.0/13',
4624 'US': '6.0.0.0/8',
773f291d 4625 'UY': '167.56.0.0/13',
53896ca5 4626 'UZ': '84.54.64.0/18',
773f291d 4627 'VA': '212.77.0.0/19',
53896ca5 4628 'VC': '207.191.240.0/21',
773f291d 4629 'VE': '186.88.0.0/13',
53896ca5 4630 'VG': '66.81.192.0/20',
773f291d
S
4631 'VI': '146.226.0.0/16',
4632 'VN': '14.160.0.0/11',
4633 'VU': '202.80.32.0/20',
4634 'WF': '117.20.32.0/21',
4635 'WS': '202.4.32.0/19',
4636 'YE': '134.35.0.0/16',
4637 'YT': '41.242.116.0/22',
4638 'ZA': '41.0.0.0/11',
53896ca5
S
4639 'ZM': '102.144.0.0/13',
4640 'ZW': '102.177.192.0/18',
773f291d
S
4641 }
4642
4643 @classmethod
5f95927a
S
4644 def random_ipv4(cls, code_or_block):
4645 if len(code_or_block) == 2:
4646 block = cls._country_ip_map.get(code_or_block.upper())
4647 if not block:
4648 return None
4649 else:
4650 block = code_or_block
773f291d 4651 addr, preflen = block.split('/')
ac668111 4652 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4653 addr_max = addr_min | (0xffffffff >> int(preflen))
14f25df2 4654 return str(socket.inet_ntoa(
ac668111 4655 struct.pack('!L', random.randint(addr_min, addr_max))))
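
# Editor's sketch (hypothetical helper, not part of the original file):
# GeoUtils.random_ipv4 picks a random address either from a country's major
# block (two-letter code) or from an explicit CIDR block; used for geo bypass.
def _example_geo_bypass_ip():
    assert GeoUtils.random_ipv4('DE').split('.')[0] == '53'  # 'DE' maps to 53.0.0.0/8
    assert GeoUtils.random_ipv4('192.168.0.0/24').startswith('192.168.0.')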
773f291d
S
4656
4657
ac668111 4658class PerRequestProxyHandler(urllib.request.ProxyHandler):
2461f79d
PH
4659 def __init__(self, proxies=None):
4660 # Set default handlers
4661 for type in ('http', 'https'):
4662 setattr(self, '%s_open' % type,
4663 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4664 meth(r, proxy, type))
ac668111 4665 urllib.request.ProxyHandler.__init__(self, proxies)
2461f79d 4666
91410c9b 4667 def proxy_open(self, req, proxy, type):
2461f79d 4668 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
4669 if req_proxy is not None:
4670 proxy = req_proxy
2461f79d
PH
4671 del req.headers['Ytdl-request-proxy']
4672
4673 if proxy == '__noproxy__':
4674 return None # No Proxy
14f25df2 4675 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188 4676 req.add_header('Ytdl-socks-proxy', proxy)
7a5c1cfe 4677 # yt-dlp's http/https handlers handle wrapping the socket with SOCKS
71aff188 4678 return None
ac668111 4679 return urllib.request.ProxyHandler.proxy_open(
91410c9b 4680 self, req, proxy, type)
5bc880b9
YCH
4681
4682
0a5445dd
YCH
4683# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4684# released into Public Domain
4685# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4686
4687def long_to_bytes(n, blocksize=0):
4688 """long_to_bytes(n:long, blocksize:int) : string
4689 Convert a long integer to a byte string.
4690
4691 If optional blocksize is given and greater than zero, pad the front of the
4692 byte string with binary zeros so that the length is a multiple of
4693 blocksize.
4694 """
4695 # after much testing, this algorithm was deemed to be the fastest
4696 s = b''
4697 n = int(n)
4698 while n > 0:
ac668111 4699 s = struct.pack('>I', n & 0xffffffff) + s
0a5445dd
YCH
4700 n = n >> 32
4701 # strip off leading zeros
4702 for i in range(len(s)):
4703 if s[i] != b'\000'[0]:
4704 break
4705 else:
4706 # only happens when n == 0
4707 s = b'\000'
4708 i = 0
4709 s = s[i:]
4710 # add back some pad bytes. this could be done more efficiently w.r.t. the
4711 # de-padding being done above, but sigh...
4712 if blocksize > 0 and len(s) % blocksize:
4713 s = (blocksize - len(s) % blocksize) * b'\000' + s
4714 return s
4715
4716
4717def bytes_to_long(s):
4718 """bytes_to_long(string) : long
4719 Convert a byte string to a long integer.
4720
4721 This is (essentially) the inverse of long_to_bytes().
4722 """
4723 acc = 0
4724 length = len(s)
4725 if length % 4:
4726 extra = (4 - length % 4)
4727 s = b'\000' * extra + s
4728 length = length + extra
4729 for i in range(0, length, 4):
ac668111 4730 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
0a5445dd
YCH
4731 return acc
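
# Editor's sketch (hypothetical helper, not part of the original file):
# long_to_bytes/bytes_to_long round-trip between big-endian byte strings and ints.
def _example_long_bytes_roundtrip():
    assert long_to_bytes(65537) == b'\x01\x00\x01'
    assert long_to_bytes(65537, blocksize=4) == b'\x00\x01\x00\x01'  # zero-padded to the block size
    assert bytes_to_long(b'\x01\x00\x01') == 65537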
4732
4733
5bc880b9
YCH
4734def ohdave_rsa_encrypt(data, exponent, modulus):
4735 '''
4736 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4737
4738 Input:
4739 data: data to encrypt, bytes-like object
4740 exponent, modulus: parameter e and N of RSA algorithm, both integer
4741 Output: hex string of encrypted data
4742
4743 Limitation: supports one block encryption only
4744 '''
4745
4746 payload = int(binascii.hexlify(data[::-1]), 16)
4747 encrypted = pow(payload, exponent, modulus)
4748 return '%x' % encrypted
81bdc8fd
YCH
4749
4750
f48409c7
YCH
4751def pkcs1pad(data, length):
4752 """
4753 Padding input data with PKCS#1 scheme
4754
4755 @param {int[]} data input data
4756 @param {int} length target length
4757 @returns {int[]} padded data
4758 """
4759 if len(data) > length - 11:
4760 raise ValueError('Input data too long for PKCS#1 padding')
4761
4762 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4763 return [0, 2] + pseudo_random + [0] + data
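
# Editor's sketch (hypothetical helper, not part of the original file): pkcs1pad
# frames the data as 0x00 0x02 <random filler> 0x00 <data>. Note that the filler
# comes from random.randint(0, 254), so unlike strict PKCS#1 it may contain zero bytes.
def _example_pkcs1pad():
    padded = pkcs1pad([0x41, 0x42], 16)
    assert len(padded) == 16
    assert padded[:2] == [0, 2] and padded[-3:] == [0, 0x41, 0x42]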
4764
4765
7b2c3f47 4766def _base_n_table(n, table):
4767 if not table and not n:
4768 raise ValueError('Either table or n must be specified')
612f2be5 4769 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4770
44f14eb4 4771 if n and n != len(table):
612f2be5 4772 raise ValueError(f'base {n} exceeds table length {len(table)}')
4773 return table
59f898b7 4774
5eb6bdce 4775
7b2c3f47 4776def encode_base_n(num, n=None, table=None):
4777 """Convert given int to a base-n string"""
612f2be5 4778 table = _base_n_table(n, table)
7b2c3f47 4779 if not num:
5eb6bdce
YCH
4780 return table[0]
4781
7b2c3f47 4782 result, base = '', len(table)
81bdc8fd 4783 while num:
7b2c3f47 4784 result = table[num % base] + result
612f2be5 4785 num = num // base
7b2c3f47 4786 return result
4787
4788
4789def decode_base_n(string, n=None, table=None):
4790 """Convert given base-n string to int"""
4791 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4792 result, base = 0, len(table)
4793 for char in string:
4794 result = result * base + table[char]
4795 return result
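
# Editor's sketch (hypothetical helper, not part of the original file):
# encode_base_n/decode_base_n convert between ints and base-n strings using the
# default 0-9a-zA-Z alphabet (or a custom table).
def _example_base_n():
    assert encode_base_n(255, 16) == 'ff'
    assert decode_base_n('ff', 16) == 255
    assert encode_base_n(0, 2) == '0'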
4796
4797
4798def decode_base(value, digits):
4799 write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
4800 'and may be removed in a future version. Use yt_dlp.utils.decode_base_n instead')
4801 return decode_base_n(value, table=digits)
f52354a8
YCH
4802
4803
4804def decode_packed_codes(code):
06b3fe29 4805 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 4806 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
4807 base = int(base)
4808 count = int(count)
4809 symbols = symbols.split('|')
4810 symbol_table = {}
4811
4812 while count:
4813 count -= 1
5eb6bdce 4814 base_n_count = encode_base_n(count, base)
f52354a8
YCH
4815 symbol_table[base_n_count] = symbols[count] or base_n_count
4816
4817 return re.sub(
4818 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 4819 obfuscated_code)
e154c651 4820
4821
1ced2221
S
4822def caesar(s, alphabet, shift):
4823 if shift == 0:
4824 return s
4825 l = len(alphabet)
4826 return ''.join(
4827 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4828 for c in s)
4829
4830
4831def rot47(s):
4832 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
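
# Editor's sketch (hypothetical helper, not part of the original file): rot47 is a
# caesar shift of 47 over the printable ASCII range and is therefore its own inverse.
def _example_rot47():
    assert rot47('Hello, World!') == 'w6==@[ (@C=5P'
    assert rot47(rot47('yt-dlp')) == 'yt-dlp'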
4833
4834
e154c651 4835def parse_m3u8_attributes(attrib):
4836 info = {}
4837 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4838 if val.startswith('"'):
4839 val = val[1:-1]
4840 info[key] = val
4841 return info
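
# Editor's sketch (hypothetical helper, not part of the original file):
# parse_m3u8_attributes splits an EXT-X attribute list, keeping quoted values intact.
def _example_parse_m3u8_attributes():
    attrs = parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.64001f,mp4a.40.2",RESOLUTION=1920x1080')
    assert attrs == {'BANDWIDTH': '1280000', 'CODECS': 'avc1.64001f,mp4a.40.2', 'RESOLUTION': '1920x1080'}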
1143535d
YCH
4842
4843
4844def urshift(val, n):
4845 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
4846
4847
4848# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 4849# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
4850def decode_png(png_data):
4851 # Reference: https://www.w3.org/TR/PNG/
4852 header = png_data[8:]
4853
4854 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
86e5f3ed 4855 raise OSError('Not a valid PNG file.')
d3f8e038
YCH
4856
4857 int_map = {1: '>B', 2: '>H', 4: '>I'}
ac668111 4858 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
d3f8e038
YCH
4859
4860 chunks = []
4861
4862 while header:
4863 length = unpack_integer(header[:4])
4864 header = header[4:]
4865
4866 chunk_type = header[:4]
4867 header = header[4:]
4868
4869 chunk_data = header[:length]
4870 header = header[length:]
4871
4872 header = header[4:] # Skip CRC
4873
4874 chunks.append({
4875 'type': chunk_type,
4876 'length': length,
4877 'data': chunk_data
4878 })
4879
4880 ihdr = chunks[0]['data']
4881
4882 width = unpack_integer(ihdr[:4])
4883 height = unpack_integer(ihdr[4:8])
4884
4885 idat = b''
4886
4887 for chunk in chunks:
4888 if chunk['type'] == b'IDAT':
4889 idat += chunk['data']
4890
4891 if not idat:
86e5f3ed 4892 raise OSError('Unable to read PNG data.')
d3f8e038
YCH
4893
4894 decompressed_data = bytearray(zlib.decompress(idat))
4895
4896 stride = width * 3
4897 pixels = []
4898
4899 def _get_pixel(idx):
4900 x = idx % stride
4901 y = idx // stride
4902 return pixels[y][x]
4903
4904 for y in range(height):
4905 basePos = y * (1 + stride)
4906 filter_type = decompressed_data[basePos]
4907
4908 current_row = []
4909
4910 pixels.append(current_row)
4911
4912 for x in range(stride):
4913 color = decompressed_data[1 + basePos + x]
4914 basex = y * stride + x
4915 left = 0
4916 up = 0
4917
4918 if x > 2:
4919 left = _get_pixel(basex - 3)
4920 if y > 0:
4921 up = _get_pixel(basex - stride)
4922
4923 if filter_type == 1: # Sub
4924 color = (color + left) & 0xff
4925 elif filter_type == 2: # Up
4926 color = (color + up) & 0xff
4927 elif filter_type == 3: # Average
4928 color = (color + ((left + up) >> 1)) & 0xff
4929 elif filter_type == 4: # Paeth
4930 a = left
4931 b = up
4932 c = 0
4933
4934 if x > 2 and y > 0:
4935 c = _get_pixel(basex - stride - 3)
4936
4937 p = a + b - c
4938
4939 pa = abs(p - a)
4940 pb = abs(p - b)
4941 pc = abs(p - c)
4942
4943 if pa <= pb and pa <= pc:
4944 color = (color + a) & 0xff
4945 elif pb <= pc:
4946 color = (color + b) & 0xff
4947 else:
4948 color = (color + c) & 0xff
4949
4950 current_row.append(color)
4951
4952 return width, height, pixels
efa97bdc
YCH
4953
4954
4955def write_xattr(path, key, value):
6f7563be 4956 # Windows: Write xattrs to NTFS Alternate Data Streams:
4957 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4958 if compat_os_name == 'nt':
4959 assert ':' not in key
4960 assert os.path.exists(path)
efa97bdc
YCH
4961
4962 try:
6f7563be 4963 with open(f'{path}:{key}', 'wb') as f:
4964 f.write(value)
86e5f3ed 4965 except OSError as e:
efa97bdc 4966 raise XAttrMetadataError(e.errno, e.strerror)
6f7563be 4967 return
efa97bdc 4968
6f7563be 4969 # UNIX Method 1. Use xattrs/pyxattrs modules
efa97bdc 4970
6f7563be 4971 setxattr = None
4972 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4973 # Unicode arguments are not supported in pyxattr until version 0.5.0
4974 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4975 if version_tuple(xattr.__version__) >= (0, 5, 0):
4976 setxattr = xattr.set
4977 elif xattr:
4978 setxattr = xattr.setxattr
efa97bdc 4979
6f7563be 4980 if setxattr:
4981 try:
4982 setxattr(path, key, value)
4983 except OSError as e:
4984 raise XAttrMetadataError(e.errno, e.strerror)
4985 return
efa97bdc 4986
6f7563be 4987 # UNIX Method 2. Use setfattr/xattr executables
4988 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4989 else 'xattr' if check_executable('xattr', ['-h']) else None)
4990 if not exe:
4991 raise XAttrUnavailableError(
4992 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4993 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
efa97bdc 4994
0f06bcd7 4995 value = value.decode()
6f7563be 4996 try:
f0c9fb96 4997 _, stderr, returncode = Popen.run(
6f7563be 4998 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
e121e3ce 4999 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
6f7563be 5000 except OSError as e:
5001 raise XAttrMetadataError(e.errno, e.strerror)
f0c9fb96 5002 if returncode:
5003 raise XAttrMetadataError(returncode, stderr)
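
# Illustrative sketch (hypothetical helper and values): attach the source URL to a
# downloaded file as an extended attribute, treating xattrs as best-effort metadata.
def _example_write_xattr(path):
    try:
        write_xattr(path, 'user.xdg.origin.url', b'https://example.com/video')
    except (XAttrUnavailableError, XAttrMetadataError):
        pass  # the platform or filesystem may not support xattrs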
5004
5005
5006def random_birthday(year_field, month_field, day_field):
5007 start_date = datetime.date(1950, 1, 1)
5008 end_date = datetime.date(1995, 12, 31)
5009 offset = random.randint(0, (end_date - start_date).days)
5010 random_date = start_date + datetime.timedelta(offset)
0c265486 5011 return {
5012 year_field: str(random_date.year),
5013 month_field: str(random_date.month),
5014 day_field: str(random_date.day),
0c265486 5015 }
732044af 5016
c76eb41b 5017
732044af 5018# Templates for internet shortcut files, which are plain text files.
e5a998f3 5019DOT_URL_LINK_TEMPLATE = '''\
732044af 5020[InternetShortcut]
5021URL=%(url)s
e5a998f3 5022'''
732044af 5023
e5a998f3 5024DOT_WEBLOC_LINK_TEMPLATE = '''\
732044af 5025<?xml version="1.0" encoding="UTF-8"?>
5026<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5027<plist version="1.0">
5028<dict>
5029\t<key>URL</key>
5030\t<string>%(url)s</string>
5031</dict>
5032</plist>
e5a998f3 5033'''
732044af 5034
e5a998f3 5035DOT_DESKTOP_LINK_TEMPLATE = '''\
732044af 5036[Desktop Entry]
5037Encoding=UTF-8
5038Name=%(filename)s
5039Type=Link
5040URL=%(url)s
5041Icon=text-html
e5a998f3 5042'''
732044af 5043
08438d2c 5044LINK_TEMPLATES = {
5045 'url': DOT_URL_LINK_TEMPLATE,
5046 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5047 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5048}
5049
732044af 5050
5051def iri_to_uri(iri):
5052 """
5053 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5054
 5055 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes (using UTF-8) only those characters that are not already escaped, leaving existing escape sequences intact.
5056 """
5057
14f25df2 5058 iri_parts = urllib.parse.urlparse(iri)
732044af 5059
5060 if '[' in iri_parts.netloc:
 5061 raise ValueError('IPv6 URIs are not yet supported.')
5062 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5063
5064 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5065
5066 net_location = ''
5067 if iri_parts.username:
f9934b96 5068 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
732044af 5069 if iri_parts.password is not None:
f9934b96 5070 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
732044af 5071 net_location += '@'
5072
0f06bcd7 5073 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
732044af 5074 # The 'idna' encoding produces ASCII text.
5075 if iri_parts.port is not None and iri_parts.port != 80:
5076 net_location += ':' + str(iri_parts.port)
5077
f9934b96 5078 return urllib.parse.urlunparse(
732044af 5079 (iri_parts.scheme,
5080 net_location,
5081
f9934b96 5082 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5083
5084 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
f9934b96 5085 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5086
5087 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
f9934b96 5088 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
732044af 5089
f9934b96 5090 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
732044af 5091
5092 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5093
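
# Illustrative sketch (hypothetical input): non-ASCII characters are UTF-8
# percent-encoded, the hostname goes through IDNA, and sequences that are already
# percent-escaped are left untouched.
def _example_iri_to_uri():
    return iri_to_uri('https://example.com/pälli?q=söme value&escaped=%3C')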
5094
5095def to_high_limit_path(path):
5096 if sys.platform in ['win32', 'cygwin']:
5097 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
e5a998f3 5098 return '\\\\?\\' + os.path.abspath(path)
732044af 5099
5100 return path
76d321f6 5101
c76eb41b 5102
7b2c3f47 5103def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
e0ddbd02 5104 val = traverse_obj(obj, *variadic(field))
7b2c3f47 5105 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
e0ddbd02 5106 return default
7b2c3f47 5107 return template % func(val)
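
# Illustrative sketch (hypothetical data): format an optional field, falling back to
# the default when the value is missing or falsy.
def _example_format_field():
    info = {'width': 1920, 'fps': None}
    return (
        format_field(info, 'width', '%dpx'),  # '1920px'
        format_field(info, 'fps', '%d fps'),  # '' (the default), since the value is falsy
    )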
00dd0cd5 5108
5109
5110def clean_podcast_url(url):
5111 return re.sub(r'''(?x)
5112 (?:
5113 (?:
5114 chtbl\.com/track|
5115 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5116 play\.podtrac\.com
5117 )/[^/]+|
5118 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5119 flex\.acast\.com|
5120 pd(?:
5121 cn\.co| # https://podcorn.com/analytics-prefix/
5122 st\.fm # https://podsights.com/docs/
5123 )/e
5124 )/''', '', url)
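
# Illustrative sketch (hypothetical URL): strip a known tracking prefix so only the
# direct enclosure URL remains.
def _example_clean_podcast_url():
    return clean_podcast_url('https://dts.podtrac.com/redirect.mp3/example.com/episode.mp3')
    # -> 'https://example.com/episode.mp3'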
5125
5126
5127_HEX_TABLE = '0123456789abcdef'
5128
5129
5130def random_uuidv4():
5131 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 5132
5133
5134def make_dir(path, to_screen=None):
5135 try:
5136 dn = os.path.dirname(path)
5137 if dn and not os.path.exists(dn):
5138 os.makedirs(dn)
5139 return True
86e5f3ed 5140 except OSError as err:
0202b52a 5141 if callable(to_screen):
5142 to_screen('unable to create directory ' + error_to_compat_str(err))
5143 return False
f74980cb 5144
5145
5146def get_executable_path():
b5899f4f 5147 from .update import _get_variant_and_executable_path
c487cf00 5148
b5899f4f 5149 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
f74980cb 5150
5151
2f567473 5152def load_plugins(name, suffix, namespace):
3ae5e797 5153 classes = {}
19a03940 5154 with contextlib.suppress(FileNotFoundError):
5155 plugins_spec = importlib.util.spec_from_file_location(
5156 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5157 plugins = importlib.util.module_from_spec(plugins_spec)
5158 sys.modules[plugins_spec.name] = plugins
5159 plugins_spec.loader.exec_module(plugins)
f74980cb 5160 for name in dir(plugins):
2f567473 5161 if name in namespace:
5162 continue
5163 if not name.endswith(suffix):
f74980cb 5164 continue
5165 klass = getattr(plugins, name)
3ae5e797 5166 classes[name] = namespace[name] = klass
f74980cb 5167 return classes
06167fbb 5168
5169
325ebc17 5170def traverse_obj(
352d63fd 5171 obj, *path_list, default=None, expected_type=None, get_all=True,
325ebc17 5172 casesense=True, is_user_input=False, traverse_string=False):
324ad820 5173 ''' Traverse nested list/dict/tuple
8f334380 5174 @param path_list A list of paths which are checked one by one.
19a03940 5175 Each path is a list of keys where each key is a:
5176 - None: Do nothing
5177 - string: A dictionary key
5178 - int: An index into a list
 5179 - tuple: A list/tuple of keys, all of which will be traversed
5180 - Ellipsis: Fetch all values in the object
5181 - Function: Takes the key and value as arguments
5182 and returns whether the key matches or not
325ebc17 5183 @param default Default value to return
352d63fd 5184 @param expected_type Only accept final value of this type (Can also be any callable)
5185 @param get_all Return all the values obtained from a path or only the first one
324ad820 5186 @param casesense Whether to consider dictionary keys as case sensitive
5187 @param is_user_input Whether the keys are generated from user input. If True,
5188 strings are converted to int/slice if necessary
5189 @param traverse_string Whether to traverse inside strings. If True, any
5190 non-compatible object will also be converted into a string
8f334380 5191 # TODO: Write tests
324ad820 5192 '''
325ebc17 5193 if not casesense:
dbf5416a 5194 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
8f334380 5195 path_list = (map(_lower, variadic(path)) for path in path_list)
5196
5197 def _traverse_obj(obj, path, _current_depth=0):
5198 nonlocal depth
5199 path = tuple(variadic(path))
5200 for i, key in enumerate(path):
1797b073 5201 if None in (key, obj):
5202 return obj
8f334380 5203 if isinstance(key, (list, tuple)):
5204 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5205 key = ...
5206 if key is ...:
5207 obj = (obj.values() if isinstance(obj, dict)
5208 else obj if isinstance(obj, (list, tuple, LazyList))
5209 else str(obj) if traverse_string else [])
5210 _current_depth += 1
5211 depth = max(depth, _current_depth)
5212 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
2614f646 5213 elif callable(key):
5214 if isinstance(obj, (list, tuple, LazyList)):
5215 obj = enumerate(obj)
5216 elif isinstance(obj, dict):
5217 obj = obj.items()
5218 else:
5219 if not traverse_string:
5220 return None
5221 obj = str(obj)
5222 _current_depth += 1
5223 depth = max(depth, _current_depth)
e6f868a6 5224 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
575e17a1 5225 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
325ebc17 5226 obj = (obj.get(key) if casesense or (key in obj)
5227 else next((v for k, v in obj.items() if _lower(k) == key), None))
5228 else:
5229 if is_user_input:
5230 key = (int_or_none(key) if ':' not in key
5231 else slice(*map(int_or_none, key.split(':'))))
8f334380 5232 if key == slice(None):
575e17a1 5233 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
325ebc17 5234 if not isinstance(key, (int, slice)):
9fea350f 5235 return None
8f334380 5236 if not isinstance(obj, (list, tuple, LazyList)):
325ebc17 5237 if not traverse_string:
5238 return None
5239 obj = str(obj)
5240 try:
5241 obj = obj[key]
5242 except IndexError:
324ad820 5243 return None
325ebc17 5244 return obj
5245
352d63fd 5246 if isinstance(expected_type, type):
5247 type_test = lambda val: val if isinstance(val, expected_type) else None
352d63fd 5248 else:
7b2c3f47 5249 type_test = expected_type or IDENTITY
352d63fd 5250
8f334380 5251 for path in path_list:
5252 depth = 0
5253 val = _traverse_obj(obj, path)
325ebc17 5254 if val is not None:
8f334380 5255 if depth:
5256 for _ in range(depth - 1):
6586bca9 5257 val = itertools.chain.from_iterable(v for v in val if v is not None)
352d63fd 5258 val = [v for v in map(type_test, val) if v is not None]
8f334380 5259 if val:
352d63fd 5260 return val if get_all else val[0]
5261 else:
5262 val = type_test(val)
5263 if val is not None:
8f334380 5264 return val
325ebc17 5265 return default
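
# Illustrative sketch of the path syntax documented above (hypothetical data):
def _example_traverse_obj():
    data = {'formats': [{'url': 'http://a', 'height': 720}, {'url': 'http://b'}]}
    first_url = traverse_obj(data, ('formats', 0, 'url'))     # 'http://a'
    all_urls = traverse_obj(data, ('formats', ..., 'url'))    # ['http://a', 'http://b']
    heights = traverse_obj(data, ('formats', ..., 'height'))  # [720] - missing values are dropped
    title = traverse_obj(data, 'title', 'fulltitle', default='')  # '' - paths are tried in order
    return first_url, all_urls, heights, title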
324ad820 5266
5267
5268def traverse_dict(dictn, keys, casesense=True):
ee8dd27a 5269 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5270 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5271 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5272
5273
ff91cf74 5274def get_first(obj, keys, **kwargs):
5275 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5276
5277
4b4b7f74 5278def variadic(x, allowed_types=(str, bytes, dict)):
cb89cfc1 5279 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
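
# Illustrative sketch: variadic() normalizes "one item or a sequence of items" arguments;
# strings, bytes and dicts count as single items even though they are iterable.
def _example_variadic():
    assert variadic('spam') == ('spam',)
    assert variadic(['spam', 'eggs']) == ['spam', 'eggs']
    assert variadic({'spam': 1}) == ({'spam': 1},)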
5280
5281
5282def time_seconds(**kwargs):
5283 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5284 return t.timestamp()
5285
5286
 5287 # Create a JSON Web Signature (JWS) with the HS256 algorithm
 5288 # The resulting format is JWS Compact Serialization
5289# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5290# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5291def jwt_encode_hs256(payload_data, key, headers={}):
5292 header_data = {
5293 'alg': 'HS256',
5294 'typ': 'JWT',
5295 }
5296 if headers:
5297 header_data.update(headers)
0f06bcd7 5298 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5299 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5300 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5301 signature_b64 = base64.b64encode(h.digest())
5302 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5303 return token
819e0531 5304
5305
16b0d7e6 5306 # Can be extended in the future to verify the signature, parse the header and return the algorithm used if it's not HS256
5307def jwt_decode_hs256(jwt):
5308 header_b64, payload_b64, signature_b64 = jwt.split('.')
 5309 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))  # re-add padding that may have been stripped; superfluous ='s are ignored
5310 return payload_data
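
# Illustrative sketch (hypothetical key and claims): create a token and read its payload
# back. jwt_decode_hs256() does not verify the signature, and jwt_encode_hs256() uses
# standard padded base64 rather than the unpadded base64url that RFC 7515 specifies,
# so its tokens may contain '+', '/' or '='.
def _example_jwt_roundtrip():
    token = jwt_encode_hs256({'sub': 'demo'}, 'secret-key', headers={'kid': 'demo-key'})
    return jwt_decode_hs256(token.decode())  # {'sub': 'demo'}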
5311
5312
53973b4d 5313WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5314
5315
0b9c08b4 5316@functools.cache
819e0531 5317def supports_terminal_sequences(stream):
5318 if compat_os_name == 'nt':
8a82af35 5319 if not WINDOWS_VT_MODE:
819e0531 5320 return False
5321 elif not os.getenv('TERM'):
5322 return False
5323 try:
5324 return stream.isatty()
5325 except BaseException:
5326 return False
5327
5328
53973b4d 5329def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
8a82af35 5330 if get_windows_version() < (10, 0, 10586):
53973b4d 5331 return
5332 global WINDOWS_VT_MODE
53973b4d 5333 try:
f0c9fb96 5334 Popen.run('', shell=True)
53973b4d 5335 except Exception:
5336 return
5337
5338 WINDOWS_VT_MODE = True
5339 supports_terminal_sequences.cache_clear()
5340
5341
ec11a9f4 5342_terminal_sequences_re = re.compile('\033\\[[^m]+m')
5343
5344
5345def remove_terminal_sequences(string):
5346 return _terminal_sequences_re.sub('', string)
5347
5348
5349def number_of_digits(number):
5350 return len('%d' % number)
34921b43 5351
5352
5353def join_nonempty(*values, delim='-', from_dict=None):
5354 if from_dict is not None:
7b2c3f47 5355 values = (traverse_obj(from_dict, variadic(v)) for v in values)
34921b43 5356 return delim.join(map(str, filter(None, values)))
06e57990 5357
5358
5359def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5360 """
5361 Find the largest format dimensions in terms of video width and, for each thumbnail:
5362 * Modify the URL: Match the width with the provided regex and replace with the former width
5363 * Update dimensions
5364
5365 This function is useful with video services that scale the provided thumbnails on demand
5366 """
5367 _keys = ('width', 'height')
5368 max_dimensions = max(
86e5f3ed 5369 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5370 default=(0, 0))
5371 if not max_dimensions[0]:
5372 return thumbnails
5373 return [
5374 merge_dicts(
5375 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5376 dict(zip(_keys, max_dimensions)), thumbnail)
5377 for thumbnail in thumbnails
5378 ]
5379
5380
5381def parse_http_range(range):
5382 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5383 if not range:
5384 return None, None, None
5385 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5386 if not crg:
5387 return None, None, None
5388 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
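
# Illustrative sketch: both "Range" and "Content-Range" header values are accepted.
def _example_parse_http_range():
    assert parse_http_range('bytes 0-499/1234') == (0, 499, 1234)
    assert parse_http_range('bytes=500-') == (500, None, None)
    assert parse_http_range(None) == (None, None, None)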
5389
5390
6b9e832d 5391def read_stdin(what):
5392 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5393 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5394 return sys.stdin
5395
5396
5397def determine_file_encoding(data):
5398 """
88f60feb 5399 Detect the text encoding used
5400 @returns (encoding, bytes to skip)
5401 """
5402
88f60feb 5403 # BOM marks are given priority over declarations
a904a7f8 5404 for bom, enc in BOMS:
5405 if data.startswith(bom):
5406 return enc, len(bom)
5407
88f60feb 5408 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5409 # We ignore the endianness to get a good enough match
a904a7f8 5410 data = data.replace(b'\0', b'')
88f60feb 5411 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5412 return mobj.group(1).decode() if mobj else None, 0
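
# Illustrative sketch: a config file may declare its encoding on the first line;
# a BOM, if present, takes priority over such a declaration.
def _example_determine_file_encoding():
    return determine_file_encoding(b'# coding: utf-8\n--some-option\n')  # ('utf-8', 0)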
5413
5414
06e57990 5415class Config:
5416 own_args = None
9e491463 5417 parsed_args = None
06e57990 5418 filename = None
5419 __initialized = False
5420
5421 def __init__(self, parser, label=None):
9e491463 5422 self.parser, self.label = parser, label
06e57990 5423 self._loaded_paths, self.configs = set(), []
5424
5425 def init(self, args=None, filename=None):
5426 assert not self.__initialized
284a60c5 5427 self.own_args, self.filename = args, filename
5428 return self.load_configs()
5429
5430 def load_configs(self):
65662dff 5431 directory = ''
284a60c5 5432 if self.filename:
5433 location = os.path.realpath(self.filename)
65662dff 5434 directory = os.path.dirname(location)
06e57990 5435 if location in self._loaded_paths:
5436 return False
5437 self._loaded_paths.add(location)
5438
284a60c5 5439 self.__initialized = True
5440 opts, _ = self.parser.parse_known_args(self.own_args)
5441 self.parsed_args = self.own_args
9e491463 5442 for location in opts.config_locations or []:
6b9e832d 5443 if location == '-':
5444 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5445 continue
65662dff 5446 location = os.path.join(directory, expand_path(location))
06e57990 5447 if os.path.isdir(location):
5448 location = os.path.join(location, 'yt-dlp.conf')
5449 if not os.path.exists(location):
9e491463 5450 self.parser.error(f'config location {location} does not exist')
06e57990 5451 self.append_config(self.read_file(location), location)
5452 return True
5453
5454 def __str__(self):
5455 label = join_nonempty(
5456 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5457 delim=' ')
5458 return join_nonempty(
5459 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5460 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5461 delim='\n')
5462
5463 @staticmethod
5464 def read_file(filename, default=[]):
5465 try:
a904a7f8 5466 optionf = open(filename, 'rb')
86e5f3ed 5467 except OSError:
06e57990 5468 return default # silently skip if file is not present
5469 try:
5470 enc, skip = determine_file_encoding(optionf.read(512))
5471 optionf.seek(skip, io.SEEK_SET)
5472 except OSError:
5473 enc = None # silently skip read errors
06e57990 5474 try:
5475 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
a904a7f8 5476 contents = optionf.read().decode(enc or preferredencoding())
f9934b96 5477 res = shlex.split(contents, comments=True)
44a6fcff 5478 except Exception as err:
5479 raise ValueError(f'Unable to parse "{filename}": {err}')
06e57990 5480 finally:
5481 optionf.close()
5482 return res
5483
5484 @staticmethod
5485 def hide_login_info(opts):
86e5f3ed 5486 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
06e57990 5487 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5488
5489 def _scrub_eq(o):
5490 m = eqre.match(o)
5491 if m:
5492 return m.group('key') + '=PRIVATE'
5493 else:
5494 return o
5495
5496 opts = list(map(_scrub_eq, opts))
5497 for idx, opt in enumerate(opts):
5498 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5499 opts[idx + 1] = 'PRIVATE'
5500 return opts
5501
5502 def append_config(self, *args, label=None):
9e491463 5503 config = type(self)(self.parser, label)
06e57990 5504 config._loaded_paths = self._loaded_paths
5505 if config.init(*args):
5506 self.configs.append(config)
5507
5508 @property
5509 def all_args(self):
5510 for config in reversed(self.configs):
5511 yield from config.all_args
9e491463 5512 yield from self.parsed_args or []
5513
5514 def parse_known_args(self, **kwargs):
5515 return self.parser.parse_known_args(self.all_args, **kwargs)
06e57990 5516
5517 def parse_args(self):
9e491463 5518 return self.parser.parse_args(self.all_args)
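
# Illustrative sketch (hypothetical arguments): credentials are masked before config
# contents are echoed back to the user.
def _example_hide_login_info():
    return Config.hide_login_info(['-u', 'me@example.com', '--password=hunter2', '-v'])
    # -> ['-u', 'PRIVATE', '--password=PRIVATE', '-v']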
5519
5520
 5521class WebSocketsWrapper:
 5522 """Wraps the websockets module for use in non-async scopes"""
abfecb7b 5523 pool = None
da42679b 5524
3cea3edd 5525 def __init__(self, url, headers=None, connect=True):
059bc4db 5526 self.loop = asyncio.new_event_loop()
9cd08050 5527 # XXX: "loop" is deprecated
5528 self.conn = websockets.connect(
5529 url, extra_headers=headers, ping_interval=None,
5530 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5531 if connect:
5532 self.__enter__()
15dfb392 5533 atexit.register(self.__exit__, None, None, None)
5534
5535 def __enter__(self):
3cea3edd 5536 if not self.pool:
9cd08050 5537 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5538 return self
5539
5540 def send(self, *args):
5541 self.run_with_loop(self.pool.send(*args), self.loop)
5542
5543 def recv(self, *args):
5544 return self.run_with_loop(self.pool.recv(*args), self.loop)
5545
5546 def __exit__(self, type, value, traceback):
5547 try:
5548 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5549 finally:
 5550 self._cancel_all_tasks(self.loop)  # cancel pending tasks before closing the loop
15dfb392 5551 self.loop.close()
5552
5553 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
 5554 # For contributors: if any new library that uses asyncio needs to be run from non-async code, move these functions out of this class
5555 @staticmethod
5556 def run_with_loop(main, loop):
059bc4db 5557 if not asyncio.iscoroutine(main):
5558 raise ValueError(f'a coroutine was expected, got {main!r}')
5559
5560 try:
5561 return loop.run_until_complete(main)
5562 finally:
5563 loop.run_until_complete(loop.shutdown_asyncgens())
5564 if hasattr(loop, 'shutdown_default_executor'):
5565 loop.run_until_complete(loop.shutdown_default_executor())
5566
5567 @staticmethod
5568 def _cancel_all_tasks(loop):
059bc4db 5569 to_cancel = asyncio.all_tasks(loop)
5570
5571 if not to_cancel:
5572 return
5573
5574 for task in to_cancel:
5575 task.cancel()
5576
9cd08050 5577 # XXX: "loop" is removed in python 3.10+
da42679b 5578 loop.run_until_complete(
059bc4db 5579 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5580
5581 for task in to_cancel:
5582 if task.cancelled():
5583 continue
5584 if task.exception() is not None:
5585 loop.call_exception_handler({
5586 'message': 'unhandled exception during asyncio.run() shutdown',
5587 'exception': task.exception(),
5588 'task': task,
5589 })
5590
5591
8b7539d2 5592def merge_headers(*dicts):
08d30158 5593 """Merge dicts of HTTP headers case-insensitively, giving priority to the later ones"""
76aa9913 5594 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
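
# Illustrative sketch: later dicts win and header names are normalized via str.title().
def _example_merge_headers():
    return merge_headers({'content-type': 'text/html'}, {'Content-Type': 'application/json'})
    # -> {'Content-Type': 'application/json'}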
28787f16 5595
5596
b1f94422 5597def cached_method(f):
5598 """Cache a method"""
5599 signature = inspect.signature(f)
5600
5601 @functools.wraps(f)
5602 def wrapper(self, *args, **kwargs):
5603 bound_args = signature.bind(self, *args, **kwargs)
5604 bound_args.apply_defaults()
5605 key = tuple(bound_args.arguments.values())
5606
5607 if not hasattr(self, '__cached_method__cache'):
5608 self.__cached_method__cache = {}
5609 cache = self.__cached_method__cache.setdefault(f.__name__, {})
5610 if key not in cache:
5611 cache[key] = f(self, *args, **kwargs)
5612 return cache[key]
5613 return wrapper
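
# Illustrative sketch (hypothetical class): memoize an expensive per-instance lookup;
# repeated calls with the same arguments hit the cache instead of the method body.
class _ExampleCachedClient:
    def __init__(self):
        self.calls = 0

    @cached_method
    def fetch(self, key):
        self.calls += 1  # only incremented on cache misses
        return key.upper()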
5614
5615
28787f16 5616class classproperty:
b1f94422 5617 """property access for class methods"""
c487cf00 5618
5619 def __init__(self, func):
5620 functools.update_wrapper(self, func)
5621 self.func = func
28787f16 5622
5623 def __get__(self, _, cls):
c487cf00 5624 return self.func(cls)
19a03940 5625
5626
64fa820c 5627class Namespace(types.SimpleNamespace):
591bb9d3 5628 """Immutable namespace"""
591bb9d3 5629
7896214c 5630 def __iter__(self):
64fa820c 5631 return iter(self.__dict__.values())
7896214c 5632
64fa820c 5633 @property
5634 def items_(self):
5635 return self.__dict__.items()
9b8ee23b 5636
5637
8dc59305 5638MEDIA_EXTENSIONS = Namespace(
5639 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5640 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5641 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5642 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
5643 thumbnails=('jpg', 'png', 'webp'),
5644 storyboards=('mhtml', ),
5645 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5646 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5647)
5648MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5649MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5650
5651KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5652
5653
be5c1ae8 5654class RetryManager:
5655 """Usage:
5656 for retry in RetryManager(...):
5657 try:
5658 ...
5659 except SomeException as err:
5660 retry.error = err
5661 continue
5662 """
5663 attempt, _error = 0, None
5664
5665 def __init__(self, _retries, _error_callback, **kwargs):
5666 self.retries = _retries or 0
5667 self.error_callback = functools.partial(_error_callback, **kwargs)
5668
5669 def _should_retry(self):
5670 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5671
5672 @property
5673 def error(self):
5674 if self._error is NO_DEFAULT:
5675 return None
5676 return self._error
5677
5678 @error.setter
5679 def error(self, value):
5680 self._error = value
5681
5682 def __iter__(self):
5683 while self._should_retry():
5684 self.error = NO_DEFAULT
5685 self.attempt += 1
5686 yield self
5687 if self.error:
5688 self.error_callback(self.error, self.attempt, self.retries)
5689
5690 @staticmethod
5691 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5692 """Utility function for reporting retries"""
5693 if count > retries:
5694 if error:
5695 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5696 raise e
5697
5698 if not count:
5699 return warn(e)
5700 elif isinstance(e, ExtractorError):
5701 e = remove_end(e.cause or e.orig_msg, '.')
5702 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5703
5704 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5705 if delay:
5706 info(f'Sleeping {delay:.2f} seconds ...')
5707 time.sleep(delay)
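
# Illustrative sketch (hypothetical fetch/logger): retry a flaky operation up to three
# times, using RetryManager.report_retry to emit progress messages between attempts.
def _example_retry(fetch, logger):
    for retry in RetryManager(3, RetryManager.report_retry, sleep_func=1,
                              info=logger.info, warn=logger.warning):
        try:
            return fetch()
        except OSError as err:
            retry.error = err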
5708
5709
9b8ee23b 5710# Deprecated
5711has_certifi = bool(certifi)
5712has_websockets = bool(websockets)