yt_dlp/utils/_utils.py
6929b41a 1import asyncio
15dfb392 2import atexit
1e399778 3import base64
5bc880b9 4import binascii
912b38b4 5import calendar
676eb3f2 6import codecs
c380cc28 7import collections
ab029d7e 8import collections.abc
62e609ab 9import contextlib
c496ca96 10import datetime
0c265486 11import email.header
f8271158 12import email.utils
f45c185f 13import errno
d77c3dfd 14import gzip
15import hashlib
16import hmac
ac668111 17import html.entities
18import html.parser
54007a45 19import http.client
20import http.cookiejar
b1f94422 21import inspect
03f9daab 22import io
79a2e94e 23import itertools
f4bfd65f 24import json
d77c3dfd 25import locale
02dbf93f 26import math
f8271158 27import mimetypes
347de493 28import operator
d77c3dfd 29import os
c496ca96 30import platform
773f291d 31import random
d77c3dfd 32import re
f8271158 33import shlex
c496ca96 34import socket
79a2e94e 35import ssl
ac668111 36import struct
1c088fa8 37import subprocess
d77c3dfd 38import sys
181c8655 39import tempfile
c380cc28 40import time
01951dda 41import traceback
64fa820c 42import types
989a01c2 43import unicodedata
14f25df2 44import urllib.error
f8271158 45import urllib.parse
ac668111 46import urllib.request
bcf89ce6 47import xml.etree.ElementTree
d77c3dfd 48import zlib
d77c3dfd 49
69bec673 50from . import traversal
51
52from ..compat import functools # isort: split
53from ..compat import (
36e6f62c 54 compat_etree_fromstring,
51098426 55 compat_expanduser,
f8271158 56 compat_HTMLParseError,
efa97bdc 57 compat_os_name,
702ccf2d 58 compat_shlex_quote,
8c25f81b 59)
69bec673 60from ..dependencies import brotli, certifi, websockets, xattr
61from ..socks import ProxyType, sockssocket
51fb4995 62
46f1370e 63__name__ = __name__.rsplit('.', 1)[0] # Pretend to be the parent module
64
65# This is not clearly defined otherwise
66compiled_regex_type = type(re.compile(''))
67
68
69def random_user_agent():
70 _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
71 _CHROME_VERSIONS = (
19b4c74d 72 '90.0.4430.212',
73 '90.0.4430.24',
74 '90.0.4430.70',
75 '90.0.4430.72',
76 '90.0.4430.85',
77 '90.0.4430.93',
78 '91.0.4472.101',
79 '91.0.4472.106',
80 '91.0.4472.114',
81 '91.0.4472.124',
82 '91.0.4472.164',
83 '91.0.4472.19',
84 '91.0.4472.77',
85 '92.0.4515.107',
86 '92.0.4515.115',
87 '92.0.4515.131',
88 '92.0.4515.159',
89 '92.0.4515.43',
90 '93.0.4556.0',
91 '93.0.4577.15',
92 '93.0.4577.63',
93 '93.0.4577.82',
94 '94.0.4606.41',
95 '94.0.4606.54',
96 '94.0.4606.61',
97 '94.0.4606.71',
98 '94.0.4606.81',
99 '94.0.4606.85',
100 '95.0.4638.17',
101 '95.0.4638.50',
102 '95.0.4638.54',
103 '95.0.4638.69',
104 '95.0.4638.74',
105 '96.0.4664.18',
106 '96.0.4664.45',
107 '96.0.4664.55',
108 '96.0.4664.93',
109 '97.0.4692.20',
110 )
111 return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
112
113
4390d5ec 114SUPPORTED_ENCODINGS = [
115 'gzip', 'deflate'
116]
9b8ee23b 117if brotli:
4390d5ec 118 SUPPORTED_ENCODINGS.append('br')
119
3e669f36 120std_headers = {
f7a147e3 121 'User-Agent': random_user_agent(),
59ae15a5 122 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
59ae15a5 123 'Accept-Language': 'en-us,en;q=0.5',
b1156c1e 124 'Sec-Fetch-Mode': 'navigate',
3e669f36 125}
f427df17 126
5f6a1245 127
128USER_AGENTS = {
129 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
130}
131
132
4823ec9f 133class NO_DEFAULT:
134 pass
135
136
137def IDENTITY(x):
138 return x
139
bf42a990 140
141ENGLISH_MONTH_NAMES = [
142 'January', 'February', 'March', 'April', 'May', 'June',
143 'July', 'August', 'September', 'October', 'November', 'December']
144
145MONTH_NAMES = {
146 'en': ENGLISH_MONTH_NAMES,
147 'fr': [
148 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
149 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
78545664 150 # these follow the genitive grammatical case (dopełniacz)
151 # some websites might be using nominative, which will require another month list
152 # https://en.wikibooks.org/wiki/Polish/Noun_cases
153 'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
154 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
f6717dec 155}
a942d6cb 156
8f53dc44 157# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
158TIMEZONE_NAMES = {
159 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
160 'AST': -4, 'ADT': -3, # Atlantic (used in Canada)
161 'EST': -5, 'EDT': -4, # Eastern
162 'CST': -6, 'CDT': -5, # Central
163 'MST': -7, 'MDT': -6, # Mountain
164 'PST': -8, 'PDT': -7 # Pacific
165}
166
c587cbb7 167# needed for sanitizing filenames in restricted mode
c8827027 168ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
169 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
170 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 171
172DATE_FORMATS = (
173 '%d %B %Y',
174 '%d %b %Y',
175 '%B %d %Y',
176 '%B %dst %Y',
177 '%B %dnd %Y',
9d30c213 178 '%B %drd %Y',
cb655f34 179 '%B %dth %Y',
46f59e89 180 '%b %d %Y',
181 '%b %dst %Y',
182 '%b %dnd %Y',
9d30c213 183 '%b %drd %Y',
cb655f34 184 '%b %dth %Y',
185 '%b %dst %Y %I:%M',
186 '%b %dnd %Y %I:%M',
9d30c213 187 '%b %drd %Y %I:%M',
188 '%b %dth %Y %I:%M',
189 '%Y %m %d',
190 '%Y-%m-%d',
bccdbd22 191 '%Y.%m.%d.',
46f59e89 192 '%Y/%m/%d',
81c13222 193 '%Y/%m/%d %H:%M',
46f59e89 194 '%Y/%m/%d %H:%M:%S',
195 '%Y%m%d%H%M',
196 '%Y%m%d%H%M%S',
4f3fa23e 197 '%Y%m%d',
0c1c6f4b 198 '%Y-%m-%d %H:%M',
199 '%Y-%m-%d %H:%M:%S',
200 '%Y-%m-%d %H:%M:%S.%f',
5014558a 201 '%Y-%m-%d %H:%M:%S:%f',
202 '%d.%m.%Y %H:%M',
203 '%d.%m.%Y %H.%M',
204 '%Y-%m-%dT%H:%M:%SZ',
205 '%Y-%m-%dT%H:%M:%S.%fZ',
206 '%Y-%m-%dT%H:%M:%S.%f0Z',
207 '%Y-%m-%dT%H:%M:%S',
208 '%Y-%m-%dT%H:%M:%S.%f',
209 '%Y-%m-%dT%H:%M',
210 '%b %d %Y at %H:%M',
211 '%b %d %Y at %H:%M:%S',
212 '%B %d %Y at %H:%M',
213 '%B %d %Y at %H:%M:%S',
a63d9bd0 214 '%H:%M %d-%b-%Y',
215)
216
217DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
218DATE_FORMATS_DAY_FIRST.extend([
219 '%d-%m-%Y',
220 '%d.%m.%Y',
221 '%d.%m.%y',
222 '%d/%m/%Y',
223 '%d/%m/%y',
224 '%d/%m/%Y %H:%M:%S',
47304e07 225 '%d-%m-%Y %H:%M',
226])
227
228DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
229DATE_FORMATS_MONTH_FIRST.extend([
230 '%m-%d-%Y',
231 '%m.%d.%Y',
232 '%m/%d/%Y',
233 '%m/%d/%y',
234 '%m/%d/%Y %H:%M:%S',
235])
236
06b3fe29 237PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
0f60ba6e 238JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
06b3fe29 239
1d485a1a 240NUMBER_RE = r'\d+(?:\.\d+)?'
241
7105440c 242
0b9c08b4 243@functools.cache
d77c3dfd 244def preferredencoding():
59ae15a5 245 """Get preferred encoding.
d77c3dfd 246
247 Returns the best encoding scheme for the system, based on
248 locale.getpreferredencoding() and some further tweaks.
249 """
250 try:
251 pref = locale.getpreferredencoding()
28e614de 252 'TEST'.encode(pref)
70a1165b 253 except Exception:
59ae15a5 254 pref = 'UTF-8'
bae611f2 255
59ae15a5 256 return pref
d77c3dfd 257
f4bfd65f 258
181c8655 259def write_json_file(obj, fn):
1394646a 260 """ Encode obj as JSON and write it to fn, atomically if possible """
181c8655 261
cfb0511d 262 tf = tempfile.NamedTemporaryFile(
263 prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
264 suffix='.tmp', delete=False, mode='w', encoding='utf-8')
265
266 try:
267 with tf:
45d86abe 268 json.dump(obj, tf, ensure_ascii=False)
269 if sys.platform == 'win32':
270 # Need to remove existing file on Windows, else os.rename raises
271 # WindowsError or FileExistsError.
19a03940 272 with contextlib.suppress(OSError):
1394646a 273 os.unlink(fn)
19a03940 274 with contextlib.suppress(OSError):
275 mask = os.umask(0)
276 os.umask(mask)
277 os.chmod(tf.name, 0o666 & ~mask)
181c8655 278 os.rename(tf.name, fn)
70a1165b 279 except Exception:
19a03940 280 with contextlib.suppress(OSError):
181c8655 281 os.remove(tf.name)
282 raise
283
284
cfb0511d 285def find_xpath_attr(node, xpath, key, val=None):
286 """ Find the xpath xpath[@key=val] """
287 assert re.match(r'^[a-zA-Z_-]+$', key)
86e5f3ed 288 expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
cfb0511d 289 return node.find(expr)
59ae56fa 290
291# On python2.6 the xml.etree.ElementTree.Element methods don't support
292# the namespace parameter
293
294
295def xpath_with_ns(path, ns_map):
296 components = [c.split(':') for c in path.split('/')]
297 replaced = []
298 for c in components:
299 if len(c) == 1:
300 replaced.append(c[0])
301 else:
302 ns, tag = c
303 replaced.append('{%s}%s' % (ns_map[ns], tag))
304 return '/'.join(replaced)
305
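# Illustrative usage sketch (editor's addition, not part of the original module):
# xpath_with_ns() expands 'prefix:tag' components into the '{uri}tag' form that
# xml.etree.ElementTree expects. The namespace URI below is a made-up example.
def _xpath_with_ns_example():
    ns_map = {'media': 'http://example.com/ns/'}
    assert xpath_with_ns('media:song/media:author', ns_map) == \
        '{http://example.com/ns/}song/{http://example.com/ns/}author'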
d77c3dfd 306
a41fb80c 307def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
578c0745 308 def _find_xpath(xpath):
f9934b96 309 return node.find(xpath)
578c0745 310
14f25df2 311 if isinstance(xpath, str):
312 n = _find_xpath(xpath)
313 else:
314 for xp in xpath:
315 n = _find_xpath(xp)
316 if n is not None:
317 break
d74bebd5 318
8e636da4 319 if n is None:
320 if default is not NO_DEFAULT:
321 return default
322 elif fatal:
323 name = xpath if name is None else name
324 raise ExtractorError('Could not find XML element %s' % name)
325 else:
326 return None
327 return n
328
329
330def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
331 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
332 if n is None or n == default:
333 return n
334 if n.text is None:
335 if default is not NO_DEFAULT:
336 return default
337 elif fatal:
338 name = xpath if name is None else name
339 raise ExtractorError('Could not find XML element\'s text %s' % name)
340 else:
341 return None
342 return n.text
343
344
345def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
346 n = find_xpath_attr(node, xpath, key)
347 if n is None:
348 if default is not NO_DEFAULT:
349 return default
350 elif fatal:
86e5f3ed 351 name = f'{xpath}[@{key}]' if name is None else name
352 raise ExtractorError('Could not find XML attribute %s' % name)
353 else:
354 return None
355 return n.attrib[key]
356
357
c487cf00 358def get_element_by_id(id, html, **kwargs):
43e8fafd 359 """Return the content of the tag with the specified ID in the passed HTML document"""
c487cf00 360 return get_element_by_attribute('id', id, html, **kwargs)
43e8fafd 361
12ea2f30 362
c487cf00 363def get_element_html_by_id(id, html, **kwargs):
6f32a0b5 364 """Return the html of the tag with the specified ID in the passed HTML document"""
c487cf00 365 return get_element_html_by_attribute('id', id, html, **kwargs)
366
367
84c237fb 368def get_element_by_class(class_name, html):
369 """Return the content of the first tag with the specified class in the passed HTML document"""
370 retval = get_elements_by_class(class_name, html)
371 return retval[0] if retval else None
372
373
374def get_element_html_by_class(class_name, html):
375 """Return the html of the first tag with the specified class in the passed HTML document"""
376 retval = get_elements_html_by_class(class_name, html)
377 return retval[0] if retval else None
378
379
c487cf00 380def get_element_by_attribute(attribute, value, html, **kwargs):
381 retval = get_elements_by_attribute(attribute, value, html, **kwargs)
382 return retval[0] if retval else None
383
384
c487cf00 385def get_element_html_by_attribute(attribute, value, html, **kargs):
386 retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
387 return retval[0] if retval else None
388
389
c487cf00 390def get_elements_by_class(class_name, html, **kargs):
391 """Return the content of all tags with the specified class in the passed HTML document as a list"""
392 return get_elements_by_attribute(
64fa820c 393 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
394 html, escape_value=False)
395
396
397def get_elements_html_by_class(class_name, html):
398 """Return the html of all tags with the specified class in the passed HTML document as a list"""
399 return get_elements_html_by_attribute(
64fa820c 400 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
401 html, escape_value=False)
402
403
404def get_elements_by_attribute(*args, **kwargs):
43e8fafd 405 """Return the content of the tag with the specified attribute in the passed HTML document"""
406 return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
407
408
409def get_elements_html_by_attribute(*args, **kwargs):
410 """Return the html of the tag with the specified attribute in the passed HTML document"""
411 return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
412
413
4c9a1a3b 414def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
415 """
416 Return the text (content) and the html (whole) of the tag with the specified
417 attribute in the passed HTML document
418 """
419 if not value:
420 return
9e6dd238 421
86e5f3ed 422 quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
0254f162 423
424 value = re.escape(value) if escape_value else value
425
86e5f3ed 426 partial_element_re = rf'''(?x)
4c9a1a3b 427 <(?P<tag>{tag})
0254f162 428 (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
86e5f3ed 429 \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
430 '''
38285056 431
432 for m in re.finditer(partial_element_re, html):
433 content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
a921f407 434
435 yield (
436 unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
437 whole
438 )
a921f407 439
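# Illustrative usage sketch (editor's addition, not part of the original module):
# the class/attribute helpers above return either the tag's content or the whole
# element. The sample document and expected values are hand-derived assumptions.
def _get_element_helpers_example():
    doc = '<div class="foo bar">some text</div>'
    assert get_element_by_class('bar', doc) == 'some text'
    assert get_element_html_by_class('foo', doc) == doc
    assert get_elements_by_attribute('class', 'foo bar', doc) == ['some text']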
c5229f39 440
ac668111 441class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
442 """
443 HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
444 closing tag for the first opening tag it has encountered, and can be used
445 as a context manager
446 """
447
448 class HTMLBreakOnClosingTagException(Exception):
449 pass
450
451 def __init__(self):
452 self.tagstack = collections.deque()
ac668111 453 html.parser.HTMLParser.__init__(self)
454
455 def __enter__(self):
456 return self
457
458 def __exit__(self, *_):
459 self.close()
460
461 def close(self):
462 # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
463 # so data remains buffered; we no longer have any interest in it, thus
464 # override this method to discard it
465 pass
466
467 def handle_starttag(self, tag, _):
468 self.tagstack.append(tag)
469
470 def handle_endtag(self, tag):
471 if not self.tagstack:
472 raise compat_HTMLParseError('no tags in the stack')
473 while self.tagstack:
474 inner_tag = self.tagstack.pop()
475 if inner_tag == tag:
476 break
477 else:
478 raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
479 if not self.tagstack:
480 raise self.HTMLBreakOnClosingTagException()
481
482
46d09f87 483# XXX: This should be far less strict
484def get_element_text_and_html_by_tag(tag, html):
485 """
486 For the first element with the specified tag in the passed HTML document
 487 return its content (text) and the whole element (html)
488 """
489 def find_or_raise(haystack, needle, exc):
490 try:
491 return haystack.index(needle)
492 except ValueError:
493 raise exc
494 closing_tag = f'</{tag}>'
495 whole_start = find_or_raise(
496 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
497 content_start = find_or_raise(
498 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
499 content_start += whole_start + 1
500 with HTMLBreakOnClosingTagParser() as parser:
501 parser.feed(html[whole_start:content_start])
502 if not parser.tagstack or parser.tagstack[0] != tag:
503 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
504 offset = content_start
505 while offset < len(html):
506 next_closing_tag_start = find_or_raise(
507 html[offset:], closing_tag,
508 compat_HTMLParseError(f'closing {tag} tag not found'))
509 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
510 try:
511 parser.feed(html[offset:offset + next_closing_tag_end])
512 offset += next_closing_tag_end
513 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
514 return html[content_start:offset + next_closing_tag_start], \
515 html[whole_start:offset + next_closing_tag_end]
516 raise compat_HTMLParseError('unexpected end of html')
517
518
ac668111 519class HTMLAttributeParser(html.parser.HTMLParser):
8bb56eee 520 """Trivial HTML parser to gather the attributes for a single element"""
b6e0c7d2 521
8bb56eee 522 def __init__(self):
c5229f39 523 self.attrs = {}
ac668111 524 html.parser.HTMLParser.__init__(self)
525
526 def handle_starttag(self, tag, attrs):
527 self.attrs = dict(attrs)
7053aa3a 528 raise compat_HTMLParseError('done')
8bb56eee 529
c5229f39 530
ac668111 531class HTMLListAttrsParser(html.parser.HTMLParser):
532 """HTML parser to gather the attributes for the elements of a list"""
533
534 def __init__(self):
ac668111 535 html.parser.HTMLParser.__init__(self)
536 self.items = []
537 self._level = 0
538
539 def handle_starttag(self, tag, attrs):
540 if tag == 'li' and self._level == 0:
541 self.items.append(dict(attrs))
542 self._level += 1
543
544 def handle_endtag(self, tag):
545 self._level -= 1
546
547
548def extract_attributes(html_element):
549 """Given a string for an HTML element such as
550 <el
551 a="foo" B="bar" c="&98;az" d=boz
552 empty= noval entity="&amp;"
553 sq='"' dq="'"
554 >
555 Decode and return a dictionary of attributes.
556 {
557 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
558 'empty': '', 'noval': None, 'entity': '&',
559 'sq': '"', 'dq': '\''
560 }.
561 """
562 parser = HTMLAttributeParser()
19a03940 563 with contextlib.suppress(compat_HTMLParseError):
564 parser.feed(html_element)
565 parser.close()
8bb56eee 566 return parser.attrs
9e6dd238 567
c5229f39 568
569def parse_list(webpage):
 570 """Given a string for a series of HTML <li> elements,
571 return a dictionary of their attributes"""
572 parser = HTMLListAttrsParser()
573 parser.feed(webpage)
574 parser.close()
575 return parser.items
576
577
9e6dd238 578def clean_html(html):
59ae15a5 579 """Clean an HTML snippet into a readable string"""
580
581 if html is None: # Convenience for sanitizing descriptions etc.
582 return html
583
49185227 584 html = re.sub(r'\s+', ' ', html)
585 html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
586 html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
587 # Strip html tags
588 html = re.sub('<.*?>', '', html)
589 # Replace html entities
590 html = unescapeHTML(html)
7decf895 591 return html.strip()
592
593
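# Illustrative usage sketch (editor's addition, not part of the original module):
# clean_html() collapses whitespace, turns <br> and </p><p> into newlines,
# strips the remaining tags and unescapes entities.
def _clean_html_example():
    assert clean_html('<p>Some <b>bold</b> text<br/>next line</p>') == 'Some bold text\nnext line'
    assert clean_html(None) is None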
b7c47b74 594class LenientJSONDecoder(json.JSONDecoder):
cc090836 595 # TODO: Write tests
596 def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
b7c47b74 597 self.transform_source, self.ignore_extra = transform_source, ignore_extra
cc090836 598 self._close_attempts = 2 * close_objects
b7c47b74 599 super().__init__(*args, **kwargs)
600
cc090836 601 @staticmethod
602 def _close_object(err):
603 doc = err.doc[:err.pos]
604 # We need to add comma first to get the correct error message
605 if err.msg.startswith('Expecting \',\''):
606 return doc + ','
607 elif not doc.endswith(','):
608 return
609
610 if err.msg.startswith('Expecting property name'):
611 return doc[:-1] + '}'
612 elif err.msg.startswith('Expecting value'):
613 return doc[:-1] + ']'
614
b7c47b74 615 def decode(self, s):
616 if self.transform_source:
617 s = self.transform_source(s)
cc090836 618 for attempt in range(self._close_attempts + 1):
619 try:
620 if self.ignore_extra:
621 return self.raw_decode(s.lstrip())[0]
622 return super().decode(s)
623 except json.JSONDecodeError as e:
624 if e.pos is None:
625 raise
626 elif attempt < self._close_attempts:
627 s = self._close_object(e)
628 if s is not None:
629 continue
2fa669f7 630 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
cc090836 631 assert False, 'Too many attempts to decode JSON'
b7c47b74 632
633
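# Illustrative usage sketch (editor's addition, not part of the original module):
# json.loads() forwards unknown keyword arguments to the decoder class, so the
# lenient options can be passed straight through. The input blob is a made-up example.
def _lenient_json_decoder_example():
    blob = '{"status": "ok"};window.config = {};'
    assert json.loads(blob, cls=LenientJSONDecoder, ignore_extra=True) == {'status': 'ok'}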
d77c3dfd 634def sanitize_open(filename, open_mode):
635 """Try to open the given filename, and slightly tweak it if this fails.
636
637 Attempts to open the given filename. If this fails, it tries to change
638 the filename slightly, step by step, until it's either able to open it
639 or it fails and raises a final exception, like the standard open()
640 function.
641
642 It returns the tuple (stream, definitive_file_name).
643 """
0edb3e33 644 if filename == '-':
645 if sys.platform == 'win32':
646 import msvcrt
be5c1ae8 647
62b58c09 648 # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
daef7911 649 with contextlib.suppress(io.UnsupportedOperation):
650 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
0edb3e33 651 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
59ae15a5 652
0edb3e33 653 for attempt in range(2):
654 try:
655 try:
89737671 656 if sys.platform == 'win32':
b506289f 657 # FIXME: An exclusive lock also locks the file from being read.
658 # Since windows locks are mandatory, don't lock the file on windows (for now).
659 # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
89737671 660 raise LockingUnsupportedError()
0edb3e33 661 stream = locked_file(filename, open_mode, block=False).__enter__()
8a82af35 662 except OSError:
0edb3e33 663 stream = open(filename, open_mode)
8a82af35 664 return stream, filename
86e5f3ed 665 except OSError as err:
0edb3e33 666 if attempt or err.errno in (errno.EACCES,):
667 raise
668 old_filename, filename = filename, sanitize_path(filename)
669 if old_filename == filename:
670 raise
671
672
673def timeconvert(timestr):
674 """Convert RFC 2822 defined time string into system timestamp"""
675 timestamp = None
676 timetuple = email.utils.parsedate_tz(timestr)
677 if timetuple is not None:
678 timestamp = email.utils.mktime_tz(timetuple)
679 return timestamp
1c469a94 680
5f6a1245 681
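# Illustrative usage sketch (editor's addition, not part of the original module):
# timeconvert() parses an RFC 2822 date into a Unix timestamp and returns None
# for unparseable input. The expected value below is hand-computed.
def _timeconvert_example():
    assert timeconvert('Mon, 05 Jun 2023 10:00:00 +0000') == 1685959200  # 2023-06-05T10:00:00Z
    assert timeconvert('not a date') is None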
5c3895ff 682def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
59ae15a5 683 """Sanitizes a string so it could be used as part of a filename.
5c3895ff 684 @param restricted Use a stricter subset of allowed characters
685 @param is_id Whether this is an ID that should be kept unchanged if possible.
686 If unset, yt-dlp's new sanitization rules are in effect
59ae15a5 687 """
5c3895ff 688 if s == '':
689 return ''
690
59ae15a5 691 def replace_insane(char):
692 if restricted and char in ACCENT_CHARS:
693 return ACCENT_CHARS[char]
91dd88b9 694 elif not restricted and char == '\n':
5c3895ff 695 return '\0 '
989a01c2 696 elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
697 # Replace with their full-width unicode counterparts
698 return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
91dd88b9 699 elif char == '?' or ord(char) < 32 or ord(char) == 127:
700 return ''
701 elif char == '"':
702 return '' if restricted else '\''
703 elif char == ':':
5c3895ff 704 return '\0_\0-' if restricted else '\0 \0-'
59ae15a5 705 elif char in '\\/|*<>':
5c3895ff 706 return '\0_'
707 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
708 return '\0_'
709 return char
710
db4678e4 711 # Replace look-alike Unicode glyphs
712 if restricted and (is_id is NO_DEFAULT or not is_id):
989a01c2 713 s = unicodedata.normalize('NFKC', s)
5c3895ff 714 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
28e614de 715 result = ''.join(map(replace_insane, s))
5c3895ff 716 if is_id is NO_DEFAULT:
ae61d108 717 result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result) # Remove repeated substitute chars
718 STRIP_RE = r'(?:\0.|[ _-])*'
5c3895ff 719 result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
720 result = result.replace('\0', '') or '_'
721
722 if not is_id:
723 while '__' in result:
724 result = result.replace('__', '_')
725 result = result.strip('_')
726 # Common case of "Foreign band name - English song title"
727 if restricted and result.startswith('-_'):
728 result = result[2:]
729 if result.startswith('-'):
730 result = '_' + result[len('-'):]
a7440261 731 result = result.lstrip('.')
732 if not result:
733 result = '_'
59ae15a5 734 return result
d77c3dfd 735
5f6a1245 736
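# Illustrative usage sketch (editor's addition, not part of the original module):
# with the default settings timestamps keep their digits but lose the colons,
# while restricted=True additionally maps spaces and other unsafe characters to '_'.
def _sanitize_filename_example():
    assert sanitize_filename('New World record at 0:12:34') == 'New World record at 0_12_34'
    assert sanitize_filename('New World record at 0:12:34', restricted=True) == 'New_World_record_at_0_12_34'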
c2934512 737def sanitize_path(s, force=False):
a2aaf4db 738 """Sanitizes and normalizes path on Windows"""
c2934512 739 if sys.platform == 'win32':
c4218ac3 740 force = False
c2934512 741 drive_or_unc, _ = os.path.splitdrive(s)
c2934512 742 elif force:
743 drive_or_unc = ''
744 else:
a2aaf4db 745 return s
c2934512 746
747 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
748 if drive_or_unc:
749 norm_path.pop(0)
750 sanitized_path = [
ec85ded8 751 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
a2aaf4db 752 for path_part in norm_path]
753 if drive_or_unc:
754 sanitized_path.insert(0, drive_or_unc + os.path.sep)
4abea8ca 755 elif force and s and s[0] == os.path.sep:
c4218ac3 756 sanitized_path.insert(0, os.path.sep)
757 return os.path.join(*sanitized_path)
758
759
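# Illustrative usage sketch (editor's addition, not part of the original module):
# sanitize_path() only rewrites paths on Windows (or when force=True elsewhere);
# the expected values are hand-derived from the implementation above.
def _sanitize_path_example():
    if sys.platform == 'win32':
        assert sanitize_path('C:\\foo\\bar?baz') == 'C:\\foo\\bar#baz'
    else:
        assert sanitize_path('a|b?c') == 'a|b?c'  # returned unchanged without force=True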
8f97a15d 760def sanitize_url(url, *, scheme='http'):
761 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
762 # the number of unwanted failures due to missing protocol
21633673 763 if url is None:
764 return
765 elif url.startswith('//'):
8f97a15d 766 return f'{scheme}:{url}'
767 # Fix some common typos seen so far
768 COMMON_TYPOS = (
067aa17e 769 # https://github.com/ytdl-org/youtube-dl/issues/15649
770 (r'^httpss://', r'https://'),
771 # https://bx1.be/lives/direct-tv/
772 (r'^rmtp([es]?)://', r'rtmp\1://'),
773 )
774 for mistake, fixup in COMMON_TYPOS:
775 if re.match(mistake, url):
776 return re.sub(mistake, fixup, url)
bc6b9bcd 777 return url
778
779
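# Illustrative usage sketch (editor's addition, not part of the original module):
# protocol-less URLs get a scheme prepended and a couple of common scheme typos
# are fixed up.
def _sanitize_url_example():
    assert sanitize_url('//example.com/watch') == 'http://example.com/watch'
    assert sanitize_url('httpss://example.com') == 'https://example.com'
    assert sanitize_url('rmtp://example.com/live') == 'rtmp://example.com/live'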
5435dcf9 780def extract_basic_auth(url):
14f25df2 781 parts = urllib.parse.urlsplit(url)
782 if parts.username is None:
783 return url, None
14f25df2 784 url = urllib.parse.urlunsplit(parts._replace(netloc=(
785 parts.hostname if parts.port is None
786 else '%s:%d' % (parts.hostname, parts.port))))
787 auth_payload = base64.b64encode(
0f06bcd7 788 ('%s:%s' % (parts.username, parts.password or '')).encode())
789 return url, f'Basic {auth_payload.decode()}'
790
791
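# Illustrative usage sketch (editor's addition, not part of the original module):
# credentials embedded in the URL are stripped and returned as a ready-to-use
# Authorization header value ('dXNlcjpwYXNz' is base64 of 'user:pass').
def _extract_basic_auth_example():
    assert extract_basic_auth('http://user:pass@example.com/feed') == \
        ('http://example.com/feed', 'Basic dXNlcjpwYXNz')
    assert extract_basic_auth('http://example.com/feed') == ('http://example.com/feed', None)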
67dda517 792def sanitized_Request(url, *args, **kwargs):
bc6b9bcd 793 url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
794 if auth_header is not None:
795 headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
796 headers['Authorization'] = auth_header
ac668111 797 return urllib.request.Request(url, *args, **kwargs)
798
799
51098426 800def expand_path(s):
2fa669f7 801 """Expand shell variables and ~"""
802 return os.path.expandvars(compat_expanduser(s))
803
804
7e9a6125 805def orderedSet(iterable, *, lazy=False):
806 """Remove all duplicates from the input iterable"""
807 def _iter():
808 seen = [] # Do not use set since the items can be unhashable
809 for x in iterable:
810 if x not in seen:
811 seen.append(x)
812 yield x
813
814 return _iter() if lazy else list(_iter())
d77c3dfd 815
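# Illustrative usage sketch (editor's addition, not part of the original module):
# orderedSet() keeps the first occurrence of each item; membership is checked
# against a list, so unhashable items work too.
def _ordered_set_example():
    assert orderedSet([3, 1, 3, 2, 1]) == [3, 1, 2]
    assert orderedSet([[1], [2], [1]]) == [[1], [2]]
    assert list(orderedSet([1, 1, 2], lazy=True)) == [1, 2]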
912b38b4 816
55b2f099 817def _htmlentity_transform(entity_with_semicolon):
4e408e47 818 """Transforms an HTML entity to a character."""
819 entity = entity_with_semicolon[:-1]
820
4e408e47 821 # Known non-numeric HTML entity
ac668111 822 if entity in html.entities.name2codepoint:
823 return chr(html.entities.name2codepoint[entity])
4e408e47 824
825 # TODO: HTML5 allows entities without a semicolon.
826 # E.g. '&Eacuteric' should be decoded as 'Éric'.
ac668111 827 if entity_with_semicolon in html.entities.html5:
828 return html.entities.html5[entity_with_semicolon]
55b2f099 829
91757b0f 830 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
831 if mobj is not None:
832 numstr = mobj.group(1)
28e614de 833 if numstr.startswith('x'):
4e408e47 834 base = 16
28e614de 835 numstr = '0%s' % numstr
836 else:
837 base = 10
067aa17e 838 # See https://github.com/ytdl-org/youtube-dl/issues/7518
19a03940 839 with contextlib.suppress(ValueError):
ac668111 840 return chr(int(numstr, base))
841
842 # Unknown entity in name, return its literal representation
7a3f0c00 843 return '&%s;' % entity
844
845
d77c3dfd 846def unescapeHTML(s):
847 if s is None:
848 return None
19a03940 849 assert isinstance(s, str)
d77c3dfd 850
4e408e47 851 return re.sub(
95f3f7c2 852 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 853
8bf48f23 854
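# Illustrative usage sketch (editor's addition, not part of the original module):
# named, decimal and hexadecimal entities are decoded; unknown entities are kept
# as their literal representation.
def _unescape_html_example():
    assert unescapeHTML('&amp;') == '&'
    assert unescapeHTML('&eacute;') == '\xe9'
    assert unescapeHTML('&#x2F;') == '/'
    assert unescapeHTML('&unknownentity;') == '&unknownentity;'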
cdb19aa4 855def escapeHTML(text):
856 return (
857 text
858 .replace('&', '&amp;')
859 .replace('<', '&lt;')
860 .replace('>', '&gt;')
861 .replace('"', '&quot;')
862 .replace("'", '&#39;')
863 )
864
865
f5b1bca9 866def process_communicate_or_kill(p, *args, **kwargs):
da4db748 867 deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
868 f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
8a82af35 869 return Popen.communicate_or_kill(p, *args, **kwargs)
f5b1bca9 870
871
d3c93ec2 872class Popen(subprocess.Popen):
873 if sys.platform == 'win32':
874 _startupinfo = subprocess.STARTUPINFO()
875 _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
876 else:
877 _startupinfo = None
878
879 @staticmethod
880 def _fix_pyinstaller_ld_path(env):
881 """Restore LD_LIBRARY_PATH when using PyInstaller
882 Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
883 https://github.com/yt-dlp/yt-dlp/issues/4573
884 """
885 if not hasattr(sys, '_MEIPASS'):
886 return
887
888 def _fix(key):
889 orig = env.get(f'{key}_ORIG')
890 if orig is None:
891 env.pop(key, None)
892 else:
893 env[key] = orig
894
895 _fix('LD_LIBRARY_PATH') # Linux
896 _fix('DYLD_LIBRARY_PATH') # macOS
897
898 def __init__(self, *args, env=None, text=False, **kwargs):
899 if env is None:
900 env = os.environ.copy()
901 self._fix_pyinstaller_ld_path(env)
902
da8e2912 903 self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
f0c9fb96 904 if text is True:
905 kwargs['universal_newlines'] = True # For 3.6 compatibility
906 kwargs.setdefault('encoding', 'utf-8')
907 kwargs.setdefault('errors', 'replace')
82ea226c 908 super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
d3c93ec2 909
910 def communicate_or_kill(self, *args, **kwargs):
8a82af35 911 try:
912 return self.communicate(*args, **kwargs)
913 except BaseException: # Including KeyboardInterrupt
f0c9fb96 914 self.kill(timeout=None)
8a82af35 915 raise
d3c93ec2 916
f0c9fb96 917 def kill(self, *, timeout=0):
918 super().kill()
919 if timeout != 0:
920 self.wait(timeout=timeout)
921
922 @classmethod
992dc6b4 923 def run(cls, *args, timeout=None, **kwargs):
f0c9fb96 924 with cls(*args, **kwargs) as proc:
da8e2912 925 default = '' if proc.__text_mode else b''
992dc6b4 926 stdout, stderr = proc.communicate_or_kill(timeout=timeout)
914491b8 927 return stdout or default, stderr or default, proc.returncode
f0c9fb96 928
d3c93ec2 929
f07b74fc 930def encodeArgument(s):
cfb0511d 931 # Legacy code that uses byte strings
932 # Uncomment the following line after fixing all post processors
14f25df2 933 # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
cfb0511d 934 return s if isinstance(s, str) else s.decode('ascii')
935
936
aa7785f8 937_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
938
939
940def timetuple_from_msec(msec):
941 secs, msec = divmod(msec, 1000)
942 mins, secs = divmod(secs, 60)
943 hrs, mins = divmod(mins, 60)
944 return _timetuple(hrs, mins, secs, msec)
945
946
cdb19aa4 947def formatSeconds(secs, delim=':', msec=False):
aa7785f8 948 time = timetuple_from_msec(secs * 1000)
949 if time.hours:
950 ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
951 elif time.minutes:
952 ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
4539dd30 953 else:
aa7785f8 954 ret = '%d' % time.seconds
955 return '%s.%03d' % (ret, time.milliseconds) if msec else ret
4539dd30 956
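# Illustrative usage sketch (editor's addition, not part of the original module):
# timetuple_from_msec() splits milliseconds into h/m/s/ms and formatSeconds()
# renders seconds with the shortest sensible format.
def _format_seconds_example():
    assert timetuple_from_msec(90061001) == (25, 1, 1, 1)
    assert formatSeconds(3661) == '1:01:01'
    assert formatSeconds(61.5, msec=True) == '1:01.500'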
a0ddb8a2 957
77562778 958def _ssl_load_windows_store_certs(ssl_context, storename):
959 # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
960 try:
961 certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
962 if encoding == 'x509_asn' and (
963 trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
964 except PermissionError:
965 return
966 for cert in certs:
19a03940 967 with contextlib.suppress(ssl.SSLError):
77562778 968 ssl_context.load_verify_locations(cadata=cert)
a2366922 969
77562778 970
971def make_HTTPS_handler(params, **kwargs):
972 opts_check_certificate = not params.get('nocheckcertificate')
973 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
974 context.check_hostname = opts_check_certificate
f81c62a6 975 if params.get('legacyserverconnect'):
976 context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT
4f28b537 977 # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
978 context.set_ciphers('DEFAULT')
979 elif (
980 sys.version_info < (3, 10)
981 and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
982 and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
983 ):
984 # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
985 # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
986 # in some situations [2][3].
987 # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
988 # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
ac8e69dd 989 # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
990 # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
991 # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
992 # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
993 # 4. https://peps.python.org/pep-0644/
994 # 5. https://peps.python.org/pep-0644/#libressl-support
995 # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
996 context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
997 context.minimum_version = ssl.TLSVersion.TLSv1_2
8a82af35 998
77562778 999 context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
1000 if opts_check_certificate:
69bec673 1001 if certifi and 'no-certifi' not in params.get('compat_opts', []):
d5820461 1002 context.load_verify_locations(cafile=certifi.where())
168bbc4f 1003 else:
1004 try:
1005 context.load_default_certs()
1006 # Work around the issue in load_default_certs when there are bad certificates. See:
1007 # https://github.com/yt-dlp/yt-dlp/issues/1060,
1008 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
1009 except ssl.SSLError:
1010 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
1011 if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
1012 for storename in ('CA', 'ROOT'):
1013 _ssl_load_windows_store_certs(context, storename)
1014 context.set_default_verify_paths()
8a82af35 1015
bb58c9ed 1016 client_certfile = params.get('client_certificate')
1017 if client_certfile:
1018 try:
1019 context.load_cert_chain(
1020 client_certfile, keyfile=params.get('client_certificate_key'),
1021 password=params.get('client_certificate_password'))
1022 except ssl.SSLError:
1023 raise YoutubeDLError('Unable to load client certificate')
2c6dcb65 1024
1025 # Some servers may reject requests if ALPN extension is not sent. See:
1026 # https://github.com/python/cpython/issues/85140
1027 # https://github.com/yt-dlp/yt-dlp/issues/3878
1028 with contextlib.suppress(NotImplementedError):
1029 context.set_alpn_protocols(['http/1.1'])
1030
77562778 1031 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 1032
732ea2f0 1033
5873d4cc 1034def bug_reports_message(before=';'):
69bec673 1035 from ..update import REPOSITORY
57e0f077 1036
1037 msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
1038 'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')
1039
1040 before = before.rstrip()
1041 if not before or before.endswith(('.', '!', '?')):
1042 msg = msg[0].title() + msg[1:]
1043
1044 return (before + ' ' if before else '') + msg
1045
1046
1047class YoutubeDLError(Exception):
1048 """Base exception for YoutubeDL errors."""
aa9369a2 1049 msg = None
1050
1051 def __init__(self, msg=None):
1052 if msg is not None:
1053 self.msg = msg
1054 elif self.msg is None:
1055 self.msg = type(self).__name__
1056 super().__init__(self.msg)
1057
1058
ac668111 1059network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
3158150c 1060if hasattr(ssl, 'CertificateError'):
1061 network_exceptions.append(ssl.CertificateError)
1062network_exceptions = tuple(network_exceptions)
1063
1064
bf5b9d85 1065class ExtractorError(YoutubeDLError):
1c256f70 1066 """Error during info extraction."""
5f6a1245 1067
1151c407 1068 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
9a82b238 1069 """ tb, if given, is the original traceback (so that it can be printed out).
7a5c1cfe 1070 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
9a82b238 1071 """
3158150c 1072 if sys.exc_info()[0] in network_exceptions:
9a82b238 1073 expected = True
d5979c5d 1074
7265a219 1075 self.orig_msg = str(msg)
1c256f70 1076 self.traceback = tb
1151c407 1077 self.expected = expected
2eabb802 1078 self.cause = cause
d11271dd 1079 self.video_id = video_id
1151c407 1080 self.ie = ie
1081 self.exc_info = sys.exc_info() # preserve original exception
5df14442 1082 if isinstance(self.exc_info[1], ExtractorError):
1083 self.exc_info = self.exc_info[1].exc_info
9bcfe33b 1084 super().__init__(self.__msg)
1151c407 1085
9bcfe33b 1086 @property
1087 def __msg(self):
1088 return ''.join((
1089 format_field(self.ie, None, '[%s] '),
1090 format_field(self.video_id, None, '%s: '),
1091 self.orig_msg,
1092 format_field(self.cause, None, ' (caused by %r)'),
1093 '' if self.expected else bug_reports_message()))
1c256f70 1094
01951dda 1095 def format_traceback(self):
497d2fab 1096 return join_nonempty(
1097 self.traceback and ''.join(traceback.format_tb(self.traceback)),
e491d06d 1098 self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
497d2fab 1099 delim='\n') or None
01951dda 1100
9bcfe33b 1101 def __setattr__(self, name, value):
1102 super().__setattr__(name, value)
1103 if getattr(self, 'msg', None) and name not in ('msg', 'args'):
1104 self.msg = self.__msg or type(self).__name__
1105 self.args = (self.msg, ) # Cannot be property
1106
1c256f70 1107
1108class UnsupportedError(ExtractorError):
1109 def __init__(self, url):
86e5f3ed 1110 super().__init__(
1111 'Unsupported URL: %s' % url, expected=True)
1112 self.url = url
1113
1114
1115class RegexNotFoundError(ExtractorError):
1116 """Error when a regex didn't match"""
1117 pass
1118
1119
1120class GeoRestrictedError(ExtractorError):
1121 """Geographic restriction Error exception.
1122
1123 This exception may be thrown when a video is not available from your
1124 geographic location due to geographic restrictions imposed by a website.
1125 """
b6e0c7d2 1126
0db3bae8 1127 def __init__(self, msg, countries=None, **kwargs):
1128 kwargs['expected'] = True
86e5f3ed 1129 super().__init__(msg, **kwargs)
1130 self.countries = countries
1131
1132
693f0600 1133class UserNotLive(ExtractorError):
1134 """Error when a channel/user is not live"""
1135
1136 def __init__(self, msg=None, **kwargs):
1137 kwargs['expected'] = True
1138 super().__init__(msg or 'The channel is not currently live', **kwargs)
1139
1140
bf5b9d85 1141class DownloadError(YoutubeDLError):
59ae15a5 1142 """Download Error exception.
d77c3dfd 1143
1144 This exception may be thrown by FileDownloader objects if they are not
1145 configured to continue on errors. They will contain the appropriate
1146 error message.
1147 """
5f6a1245 1148
1149 def __init__(self, msg, exc_info=None):
1150 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
86e5f3ed 1151 super().__init__(msg)
8cc83b8d 1152 self.exc_info = exc_info
1153
1154
498f5606 1155class EntryNotInPlaylist(YoutubeDLError):
1156 """Entry not in playlist exception.
1157
1158 This exception will be thrown by YoutubeDL when a requested entry
1159 is not found in the playlist info_dict
1160 """
aa9369a2 1161 msg = 'Entry not found in info'
498f5606 1162
1163
bf5b9d85 1164class SameFileError(YoutubeDLError):
59ae15a5 1165 """Same File exception.
d77c3dfd 1166
1167 This exception will be thrown by FileDownloader objects if they detect
1168 multiple files would have to be downloaded to the same file on disk.
1169 """
aa9369a2 1170 msg = 'Fixed output name but more than one file to download'
1171
1172 def __init__(self, filename=None):
1173 if filename is not None:
1174 self.msg += f': {filename}'
1175 super().__init__(self.msg)
1176
1177
bf5b9d85 1178class PostProcessingError(YoutubeDLError):
59ae15a5 1179 """Post Processing exception.
d77c3dfd 1180
1181 This exception may be raised by PostProcessor's .run() method to
1182 indicate an error in the postprocessing task.
1183 """
5f6a1245 1184
5f6a1245 1185
48f79687 1186class DownloadCancelled(YoutubeDLError):
1187 """ Exception raised when the download queue should be interrupted """
1188 msg = 'The download was cancelled'
8b0d7497 1189
8b0d7497 1190
48f79687 1191class ExistingVideoReached(DownloadCancelled):
1192 """ --break-on-existing triggered """
1193 msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1194
48f79687 1195
1196class RejectedVideoReached(DownloadCancelled):
fe2ce85a 1197 """ --break-match-filter triggered """
1198 msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
51d9739f 1199
1200
48f79687 1201class MaxDownloadsReached(DownloadCancelled):
59ae15a5 1202 """ --max-downloads limit has been reached. """
48f79687 1203 msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1204
1205
f2ebc5c7 1206class ReExtractInfo(YoutubeDLError):
1207 """ Video info needs to be re-extracted. """
1208
1209 def __init__(self, msg, expected=False):
1210 super().__init__(msg)
1211 self.expected = expected
1212
1213
1214class ThrottledDownload(ReExtractInfo):
48f79687 1215 """ Download speed below --throttled-rate. """
aa9369a2 1216 msg = 'The download speed is below throttle limit'
d77c3dfd 1217
43b22906 1218 def __init__(self):
1219 super().__init__(self.msg, expected=False)
f2ebc5c7 1220
d77c3dfd 1221
bf5b9d85 1222class UnavailableVideoError(YoutubeDLError):
59ae15a5 1223 """Unavailable Format exception.
d77c3dfd 1224
1225 This exception will be thrown when a video is requested
1226 in a format that is not available for that video.
1227 """
aa9369a2 1228 msg = 'Unable to download video'
1229
1230 def __init__(self, err=None):
1231 if err is not None:
1232 self.msg += f': {err}'
1233 super().__init__(self.msg)
1234
1235
bf5b9d85 1236class ContentTooShortError(YoutubeDLError):
59ae15a5 1237 """Content Too Short exception.
d77c3dfd 1238
1239 This exception may be raised by FileDownloader objects when a file they
1240 download is too small for what the server announced first, indicating
1241 the connection was probably interrupted.
1242 """
d77c3dfd 1243
59ae15a5 1244 def __init__(self, downloaded, expected):
86e5f3ed 1245 super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
2c7ed247 1246 # Both in bytes
1247 self.downloaded = downloaded
1248 self.expected = expected
d77c3dfd 1249
5f6a1245 1250
bf5b9d85 1251class XAttrMetadataError(YoutubeDLError):
efa97bdc 1252 def __init__(self, code=None, msg='Unknown error'):
86e5f3ed 1253 super().__init__(msg)
efa97bdc 1254 self.code = code
bd264412 1255 self.msg = msg
1256
1257 # Parsing code and msg
3089bc74 1258 if (self.code in (errno.ENOSPC, errno.EDQUOT)
a0566bbf 1259 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
1260 self.reason = 'NO_SPACE'
1261 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1262 self.reason = 'VALUE_TOO_LONG'
1263 else:
1264 self.reason = 'NOT_SUPPORTED'
1265
1266
bf5b9d85 1267class XAttrUnavailableError(YoutubeDLError):
1268 pass
1269
1270
c5a59d93 1271def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
f9934b96 1272 hc = http_class(*args, **kwargs)
be4a824d 1273 source_address = ydl_handler._params.get('source_address')
8959018a 1274
be4a824d 1275 if source_address is not None:
1276 # This is to workaround _create_connection() from socket where it will try all
1277 # address data from getaddrinfo() including IPv6. This filters the result from
1278 # getaddrinfo() based on the source_address value.
1279 # This is based on the cpython socket.create_connection() function.
1280 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1281 def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1282 host, port = address
1283 err = None
1284 addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
1285 af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1286 ip_addrs = [addr for addr in addrs if addr[0] == af]
1287 if addrs and not ip_addrs:
1288 ip_version = 'v4' if af == socket.AF_INET else 'v6'
86e5f3ed 1289 raise OSError(
1290 "No remote IP%s addresses available for connect, can't use '%s' as source address"
1291 % (ip_version, source_address[0]))
1292 for res in ip_addrs:
1293 af, socktype, proto, canonname, sa = res
1294 sock = None
1295 try:
1296 sock = socket.socket(af, socktype, proto)
1297 if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1298 sock.settimeout(timeout)
1299 sock.bind(source_address)
1300 sock.connect(sa)
1301 err = None # Explicitly break reference cycle
1302 return sock
86e5f3ed 1303 except OSError as _:
1304 err = _
1305 if sock is not None:
1306 sock.close()
1307 if err is not None:
1308 raise err
1309 else:
86e5f3ed 1310 raise OSError('getaddrinfo returns an empty list')
1311 if hasattr(hc, '_create_connection'):
1312 hc._create_connection = _create_connection
cfb0511d 1313 hc.source_address = (source_address, 0)
1314
1315 return hc
1316
1317
ac668111 1318class YoutubeDLHandler(urllib.request.HTTPHandler):
1319 """Handler for HTTP requests and responses.
1320
1321 This class, when installed with an OpenerDirector, automatically adds
955c8958 1322 the standard headers to every HTTP request and handles gzipped, deflated and
1323 brotli responses from web servers.
1324
1325 Part of this code was copied from:
1326
1327 http://techknack.net/python-urllib2-handlers/
1328
1329 Andrew Rowls, the author of that code, agreed to release it to the
1330 public domain.
1331 """
1332
be4a824d 1333 def __init__(self, params, *args, **kwargs):
ac668111 1334 urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
1335 self._params = params
1336
1337 def http_open(self, req):
ac668111 1338 conn_class = http.client.HTTPConnection
1339
1340 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1341 if socks_proxy:
1342 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1343 del req.headers['Ytdl-socks-proxy']
1344
be4a824d 1345 return self.do_open(functools.partial(
71aff188 1346 _create_http_connection, self, conn_class, False),
1347 req)
1348
1349 @staticmethod
1350 def deflate(data):
fc2119f2 1351 if not data:
1352 return data
1353 try:
1354 return zlib.decompress(data, -zlib.MAX_WBITS)
1355 except zlib.error:
1356 return zlib.decompress(data)
1357
4390d5ec 1358 @staticmethod
1359 def brotli(data):
1360 if not data:
1361 return data
9b8ee23b 1362 return brotli.decompress(data)
4390d5ec 1363
daafbf49 1364 @staticmethod
1365 def gz(data):
1366 gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
1367 try:
1368 return gz.read()
1369 except OSError as original_oserror:
1370 # There may be junk at the end of the file
1371 # See http://stackoverflow.com/q/4928560/35070 for details
1372 for i in range(1, 1024):
1373 try:
1374 gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
1375 return gz.read()
1376 except OSError:
1377 continue
1378 else:
1379 raise original_oserror
1380
acebc9cd 1381 def http_request(self, req):
1382 # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
1383 # always respected by websites - some tend to give out URLs with non-percent-encoded
1384 # non-ASCII characters (see telemb.py, ard.py [#3412])
1385 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1386 # To work around aforementioned issue we will replace request's original URL with
1387 # percent-encoded one
1388 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1389 # the code of this workaround has been moved here from YoutubeDL.urlopen()
1390 url = req.get_full_url()
1391 url_escaped = escape_url(url)
1392
1393 # Substitute URL if any change after escaping
1394 if url != url_escaped:
15d260eb 1395 req = update_Request(req, url=url_escaped)
51f267d9 1396
8b7539d2 1397 for h, v in self._params.get('http_headers', std_headers).items():
1398 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1399 # The dict keys are capitalized because of this bug by urllib
1400 if h.capitalize() not in req.headers:
33ac271b 1401 req.add_header(h, v)
87f0e62d 1402
955c8958 1403 if 'Youtubedl-no-compression' in req.headers: # deprecated
1404 req.headers.pop('Youtubedl-no-compression', None)
1405 req.add_header('Accept-encoding', 'identity')
1406
af14914b 1407 if 'Accept-encoding' not in req.headers:
1408 req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
1409
379a4f16 1410 return super().do_request_(req)
59ae15a5 1411
acebc9cd 1412 def http_response(self, req, resp):
59ae15a5 1413 old_resp = resp
daafbf49 1414
1415 # Content-Encoding header lists the encodings in the order that they were applied [1].
1416 # To decompress, we simply do the reverse.
1417 # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
1418 decoded_response = None
1419 for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
1420 if encoding == 'gzip':
1421 decoded_response = self.gz(decoded_response or resp.read())
1422 elif encoding == 'deflate':
1423 decoded_response = self.deflate(decoded_response or resp.read())
1424 elif encoding == 'br' and brotli:
1425 decoded_response = self.brotli(decoded_response or resp.read())
1426
1427 if decoded_response is not None:
1428 resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
4390d5ec 1429 resp.msg = old_resp.msg
ad729172 1430 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
067aa17e 1431 # https://github.com/ytdl-org/youtube-dl/issues/6457).
1432 if 300 <= resp.code < 400:
1433 location = resp.headers.get('Location')
1434 if location:
1435 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
0f06bcd7 1436 location = location.encode('iso-8859-1').decode()
1437 location_escaped = escape_url(location)
1438 if location != location_escaped:
1439 del resp.headers['Location']
1440 resp.headers['Location'] = location_escaped
59ae15a5 1441 return resp
0f8d03f8 1442
1443 https_request = http_request
1444 https_response = http_response
bf50b038 1445
5de90176 1446
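# Editor's sketch (not part of the original module) of the decoding order used by
# http_response() above: a response sent with "Content-Encoding: deflate, gzip" was
# deflate-compressed first and gzip-compressed second, so it is decoded in reverse
# of the listed order. Only stdlib codecs are used here to keep the sketch runnable.
def _content_encoding_order_example():
    payload = b'example payload'
    encoded = gzip.compress(zlib.compress(payload))  # applied: deflate, then gzip
    data = encoded
    for encoding in reversed(['deflate', 'gzip']):
        data = gzip.decompress(data) if encoding == 'gzip' else zlib.decompress(data)
    assert data == payload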
1447def make_socks_conn_class(base_class, socks_proxy):
1448 assert issubclass(base_class, (
ac668111 1449 http.client.HTTPConnection, http.client.HTTPSConnection))
71aff188 1450
14f25df2 1451 url_components = urllib.parse.urlparse(socks_proxy)
1452 if url_components.scheme.lower() == 'socks5':
1453 socks_type = ProxyType.SOCKS5
1454 elif url_components.scheme.lower() in ('socks', 'socks4'):
1455 socks_type = ProxyType.SOCKS4
1456 elif url_components.scheme.lower() == 'socks4a':
1457 socks_type = ProxyType.SOCKS4A
71aff188 1458
1459 def unquote_if_non_empty(s):
1460 if not s:
1461 return s
ac668111 1462 return urllib.parse.unquote_plus(s)
cdd94c2e 1463
1464 proxy_args = (
1465 socks_type,
1466 url_components.hostname, url_components.port or 1080,
1467 True, # Remote DNS
1468 unquote_if_non_empty(url_components.username),
1469 unquote_if_non_empty(url_components.password),
1470 )
1471
1472 class SocksConnection(base_class):
1473 def connect(self):
1474 self.sock = sockssocket()
1475 self.sock.setproxy(*proxy_args)
19a03940 1476 if isinstance(self.timeout, (int, float)):
1477 self.sock.settimeout(self.timeout)
1478 self.sock.connect((self.host, self.port))
1479
ac668111 1480 if isinstance(self, http.client.HTTPSConnection):
1481 if hasattr(self, '_context'): # Python > 2.6
1482 self.sock = self._context.wrap_socket(
1483 self.sock, server_hostname=self.host)
1484 else:
1485 self.sock = ssl.wrap_socket(self.sock)
1486
1487 return SocksConnection
1488
1489
ac668111 1490class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
be4a824d 1491 def __init__(self, params, https_conn_class=None, *args, **kwargs):
ac668111 1492 urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
1493 self._https_conn_class = https_conn_class or http.client.HTTPSConnection
1494 self._params = params
1495
1496 def https_open(self, req):
4f264c02 1497 kwargs = {}
1498 conn_class = self._https_conn_class
1499
1500 if hasattr(self, '_context'): # python > 2.6
1501 kwargs['context'] = self._context
1502 if hasattr(self, '_check_hostname'): # python 3.x
1503 kwargs['check_hostname'] = self._check_hostname
1504
1505 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1506 if socks_proxy:
1507 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1508 del req.headers['Ytdl-socks-proxy']
1509
4f28b537 1510 try:
1511 return self.do_open(
1512 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1513 except urllib.error.URLError as e:
1514 if (isinstance(e.reason, ssl.SSLError)
1515 and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1516 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1517 raise
1518
1519
941e881e 1520def is_path_like(f):
1521 return isinstance(f, (str, bytes, os.PathLike))
1522
1523
ac668111 1524class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
a6420bf5 1525 def __init__(self, cookiejar=None):
ac668111 1526 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
a6420bf5
S
1527
1528 def http_response(self, request, response):
ac668111 1529 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
a6420bf5 1530
ac668111 1531 https_request = urllib.request.HTTPCookieProcessor.http_request
a6420bf5
S
1532 https_response = http_response
1533
1534
ac668111 1535class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
201c1459 1536 """YoutubeDL redirect handler
1537
1538 The code is based on HTTPRedirectHandler implementation from CPython [1].
1539
08916a49 1540 This redirect handler fixes and improves the logic to better align with RFC 7231
1541 and what browsers tend to do [2][3]
201c1459 1542
1543 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
08916a49 1544 2. https://datatracker.ietf.org/doc/html/rfc7231
1545 3. https://github.com/python/cpython/issues/91306
201c1459 1546 """
1547
ac668111 1548 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
201c1459 1549
1550 def redirect_request(self, req, fp, code, msg, headers, newurl):
08916a49 1551 if code not in (301, 302, 303, 307, 308):
14f25df2 1552 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
afac4caa 1553
08916a49 1554 new_method = req.get_method()
1555 new_data = req.data
1556 remove_headers = []
afac4caa 1557 # A 303 must use either GET or HEAD for the subsequent request
1558 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
08916a49 1559 if code == 303 and req.get_method() != 'HEAD':
1560 new_method = 'GET'
afac4caa 1561 # 301 and 302 redirects are commonly turned into a GET from a POST
1562 # for subsequent requests by browsers, so we'll do the same.
1563 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1564 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
08916a49 1565 elif code in (301, 302) and req.get_method() == 'POST':
1566 new_method = 'GET'
1567
1568 # only remove payload if method changed (e.g. POST to GET)
1569 if new_method != req.get_method():
1570 new_data = None
1571 remove_headers.extend(['Content-Length', 'Content-Type'])
1572
1573 new_headers = {k: v for k, v in req.headers.items() if k.lower() not in remove_headers}
afac4caa 1574
ac668111 1575 return urllib.request.Request(
08916a49 1576 newurl, headers=new_headers, origin_req_host=req.origin_req_host,
1577 unverifiable=True, method=new_method, data=new_data)
fca6dba8
S
1578
1579
46f59e89
S
1580def extract_timezone(date_str):
1581 m = re.search(
f137e4c2 1582 r'''(?x)
1583 ^.{8,}? # >=8 char non-TZ prefix, if present
1584 (?P<tz>Z| # just the UTC Z, or
1585 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1586 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1587 [ ]? # optional space
1588 (?P<sign>\+|-) # +/-
1589 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1590 $)
1591 ''', date_str)
46f59e89 1592 if not m:
8f53dc44 1593 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1594 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1595 if timezone is not None:
1596 date_str = date_str[:-len(m.group('tz'))]
1597 timezone = datetime.timedelta(hours=timezone or 0)
46f59e89
S
1598 else:
1599 date_str = date_str[:-len(m.group('tz'))]
1600 if not m.group('sign'):
1601 timezone = datetime.timedelta()
1602 else:
1603 sign = 1 if m.group('sign') == '+' else -1
1604 timezone = datetime.timedelta(
1605 hours=sign * int(m.group('hours')),
1606 minutes=sign * int(m.group('minutes')))
1607 return timezone, date_str
1608
1609
08b38d54 1610def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1611 """ Return a UNIX timestamp from the given date """
1612
1613 if date_str is None:
1614 return None
1615
52c3a6e4
S
1616 date_str = re.sub(r'\.[0-9]+', '', date_str)
1617
08b38d54 1618 if timezone is None:
46f59e89
S
1619 timezone, date_str = extract_timezone(date_str)
1620
19a03940 1621 with contextlib.suppress(ValueError):
86e5f3ed 1622 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
52c3a6e4
S
1623 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1624 return calendar.timegm(dt.timetuple())
912b38b4
PH
1625
1626
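# A minimal usage sketch of parse_iso8601 (hypothetical `_demo_parse_iso8601` helper,
# not part of the upstream module); expected values assume the default 'T' delimiter
# and the timezone extraction above.
def _demo_parse_iso8601():
    assert parse_iso8601('2014-03-23T22:04:26Z') == 1395612266
    assert parse_iso8601('2014-03-23T23:04:26+0100') == 1395612266  # offset folded into UTC
    assert parse_iso8601(None) is None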
46f59e89
S
1627def date_formats(day_first=True):
1628 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1629
1630
42bdd9d0 1631def unified_strdate(date_str, day_first=True):
bf50b038 1632 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1633
1634 if date_str is None:
1635 return None
bf50b038 1636 upload_date = None
5f6a1245 1637 # Replace commas
026fcc04 1638 date_str = date_str.replace(',', ' ')
42bdd9d0 1639 # Remove AM/PM + timezone
9bb8e0a3 1640 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1641 _, date_str = extract_timezone(date_str)
42bdd9d0 1642
46f59e89 1643 for expression in date_formats(day_first):
19a03940 1644 with contextlib.suppress(ValueError):
bf50b038 1645 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
42393ce2
PH
1646 if upload_date is None:
1647 timetuple = email.utils.parsedate_tz(date_str)
1648 if timetuple:
19a03940 1649 with contextlib.suppress(ValueError):
c6b9cf05 1650 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
6a750402 1651 if upload_date is not None:
14f25df2 1652 return str(upload_date)
bf50b038 1653
5f6a1245 1654
46f59e89
S
1655def unified_timestamp(date_str, day_first=True):
1656 if date_str is None:
1657 return None
1658
8f53dc44 1659 date_str = re.sub(r'\s+', ' ', re.sub(
1660 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
46f59e89 1661
7dc2a74e 1662 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1663 timezone, date_str = extract_timezone(date_str)
1664
1665 # Remove AM/PM + timezone
1666 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1667
deef3195
S
1668 # Remove unrecognized timezones from ISO 8601-like timestamps
1669 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1670 if m:
1671 date_str = date_str[:-len(m.group('tz'))]
1672
f226880c
PH
1673 # Python only supports microseconds, so remove nanoseconds
1674 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1675 if m:
1676 date_str = m.group(1)
1677
46f59e89 1678 for expression in date_formats(day_first):
19a03940 1679 with contextlib.suppress(ValueError):
7dc2a74e 1680 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89 1681 return calendar.timegm(dt.timetuple())
8f53dc44 1682
46f59e89
S
1683 timetuple = email.utils.parsedate_tz(date_str)
1684 if timetuple:
8f53dc44 1685 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
46f59e89
S
1686
1687
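# Quick illustration of the two date helpers above (hypothetical `_demo_unified_dates`
# helper, not part of the upstream module); expected values assume the module's
# standard DATE_FORMATS tables, which are day-first by default.
def _demo_unified_dates():
    assert unified_strdate('December 21, 2010') == '20101221'
    assert unified_strdate('8/7/2009') == '20090708'                  # day_first=True
    assert unified_strdate('8/7/2009', day_first=False) == '20090807'
    assert unified_timestamp('December 15, 2017 10:55 PM') == 1513378500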
28e614de 1688def determine_ext(url, default_ext='unknown_video'):
85750f89 1689 if url is None or '.' not in url:
f4776371 1690 return default_ext
9cb9a5df 1691 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1692 if re.match(r'^[A-Za-z0-9]+$', guess):
1693 return guess
a7aaa398
S
1694 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1695 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1696 return guess.rstrip('/')
73e79f2a 1697 else:
cbdbb766 1698 return default_ext
73e79f2a 1699
5f6a1245 1700
824fa511
S
1701def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1702 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
d4051a8e 1703
5f6a1245 1704
9e62f283 1705def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
3d38b2d6 1706 R"""
1707 Return a datetime object from a string.
1708 Supported format:
1709 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1710
1711 @param format strftime format of DATE
1712 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1713 auto: round to the unit provided in date_str (if applicable).
9e62f283 1714 """
1715 auto_precision = False
1716 if precision == 'auto':
1717 auto_precision = True
1718 precision = 'microsecond'
396a76f7 1719 today = datetime_round(datetime.datetime.utcnow(), precision)
f8795e10 1720 if date_str in ('now', 'today'):
37254abc 1721 return today
f8795e10
PH
1722 if date_str == 'yesterday':
1723 return today - datetime.timedelta(days=1)
9e62f283 1724 match = re.match(
3d38b2d6 1725 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
9e62f283 1726 date_str)
37254abc 1727 if match is not None:
9e62f283 1728 start_time = datetime_from_str(match.group('start'), precision, format)
1729 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
37254abc 1730 unit = match.group('unit')
9e62f283 1731 if unit == 'month' or unit == 'year':
1732 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
37254abc 1733 unit = 'day'
9e62f283 1734 else:
1735 if unit == 'week':
1736 unit = 'day'
1737 time *= 7
1738 delta = datetime.timedelta(**{unit + 's': time})
1739 new_date = start_time + delta
1740 if auto_precision:
1741 return datetime_round(new_date, unit)
1742 return new_date
1743
1744 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1745
1746
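# A short sketch of the relative-date grammar documented above (hypothetical
# `_demo_datetime_from_str` helper, not part of the upstream module); fixed start
# dates are used so the results do not depend on the current time.
def _demo_datetime_from_str():
    assert datetime_from_str('20200229+2weeks', precision='day') == datetime.datetime(2020, 3, 14)
    assert datetime_from_str('20200229+1month', precision='day') == datetime.datetime(2020, 3, 29)
    # 'now', 'today' and 'yesterday' are also accepted, e.g. datetime_from_str('now-3hours')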
d49f8db3 1747def date_from_str(date_str, format='%Y%m%d', strict=False):
3d38b2d6 1748 R"""
1749 Return a date object from a string using datetime_from_str
9e62f283 1750
3d38b2d6 1751 @param strict Restrict allowed patterns to "YYYYMMDD" and
1752 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
9e62f283 1753 """
3d38b2d6 1754 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1755 raise ValueError(f'Invalid date format "{date_str}"')
9e62f283 1756 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1757
1758
1759def datetime_add_months(dt, months):
1760 """Increment/Decrement a datetime object by months."""
1761 month = dt.month + months - 1
1762 year = dt.year + month // 12
1763 month = month % 12 + 1
1764 day = min(dt.day, calendar.monthrange(year, month)[1])
1765 return dt.replace(year, month, day)
1766
1767
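# Illustration of the month arithmetic above (hypothetical `_demo_datetime_add_months`
# helper, not part of the upstream module): the day is clamped to the target month.
def _demo_datetime_add_months():
    assert datetime_add_months(datetime.datetime(2020, 1, 31), 1) == datetime.datetime(2020, 2, 29)
    assert datetime_add_months(datetime.datetime(2020, 11, 15), 3) == datetime.datetime(2021, 2, 15)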
1768def datetime_round(dt, precision='day'):
1769 """
1770 Round a datetime object's time to a specific precision
1771 """
1772 if precision == 'microsecond':
1773 return dt
1774
1775 unit_seconds = {
1776 'day': 86400,
1777 'hour': 3600,
1778 'minute': 60,
1779 'second': 1,
1780 }
1781 roundto = lambda x, n: ((x + n / 2) // n) * n
1782 timestamp = calendar.timegm(dt.timetuple())
1783 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
5f6a1245
JW
1784
1785
e63fc1be 1786def hyphenate_date(date_str):
1787 """
1788 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1789 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1790 if match is not None:
1791 return '-'.join(match.groups())
1792 else:
1793 return date_str
1794
5f6a1245 1795
86e5f3ed 1796class DateRange:
bd558525 1797 """Represents a time interval between two dates"""
5f6a1245 1798
bd558525
JMF
1799 def __init__(self, start=None, end=None):
1800 """start and end must be strings in the format accepted by date"""
1801 if start is not None:
d49f8db3 1802 self.start = date_from_str(start, strict=True)
bd558525
JMF
1803 else:
1804 self.start = datetime.datetime.min.date()
1805 if end is not None:
d49f8db3 1806 self.end = date_from_str(end, strict=True)
bd558525
JMF
1807 else:
1808 self.end = datetime.datetime.max.date()
37254abc 1809 if self.start > self.end:
bd558525 1810 raise ValueError('Date range: "%s", the start date must be before the end date' % self)
5f6a1245 1811
bd558525
JMF
1812 @classmethod
1813 def day(cls, day):
1814 """Returns a range that only contains the given day"""
5f6a1245
JW
1815 return cls(day, day)
1816
bd558525
JMF
1817 def __contains__(self, date):
1818 """Check if the date is in the range"""
37254abc
JMF
1819 if not isinstance(date, datetime.date):
1820 date = date_from_str(date)
1821 return self.start <= date <= self.end
5f6a1245 1822
46f1370e 1823 def __repr__(self):
1824 return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'
c496ca96 1825
f2df4071 1826 def __eq__(self, other):
1827 return (isinstance(other, DateRange)
1828 and self.start == other.start and self.end == other.end)
1829
c496ca96 1830
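# Small usage sketch for DateRange (hypothetical `_demo_date_range` helper, not part
# of the upstream module); membership accepts date objects or strings understood by
# date_from_str().
def _demo_date_range():
    rng = DateRange('20200101', '20200630')
    assert '20200315' in rng
    assert datetime.date(2020, 7, 1) not in rng
    assert datetime.date(2020, 1, 1) in DateRange.day('20200101')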
b1f94422 1831@functools.cache
1832def system_identifier():
1833 python_implementation = platform.python_implementation()
1834 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1835 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
dab284f8 1836 libc_ver = []
1837 with contextlib.suppress(OSError): # We may not have access to the executable
1838 libc_ver = platform.libc_ver()
b1f94422 1839
17fc3dc4 1840 return 'Python %s (%s %s %s) - %s (%s%s)' % (
b1f94422 1841 platform.python_version(),
1842 python_implementation,
17fc3dc4 1843 platform.machine(),
b1f94422 1844 platform.architecture()[0],
1845 platform.platform(),
5b9f253f
M
1846 ssl.OPENSSL_VERSION,
1847 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
b1f94422 1848 )
c257baff
PH
1849
1850
0b9c08b4 1851@functools.cache
49fa4d9a 1852def get_windows_version():
8a82af35 1853 ''' Get the Windows version. Returns () if not running on Windows '''
49fa4d9a
N
1854 if compat_os_name == 'nt':
1855 return version_tuple(platform.win32_ver()[1])
1856 else:
8a82af35 1857 return ()
49fa4d9a
N
1858
1859
734f90bb 1860def write_string(s, out=None, encoding=None):
19a03940 1861 assert isinstance(s, str)
1862 out = out or sys.stderr
3b479100
SS
1863 # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
1864 if not out:
1865 return
7459e3a2 1866
fe1daad3 1867 if compat_os_name == 'nt' and supports_terminal_sequences(out):
3fe75fdc 1868 s = re.sub(r'([\r\n]+)', r' \1', s)
59f943cd 1869
8a82af35 1870 enc, buffer = None, out
cfb0511d 1871 if 'b' in getattr(out, 'mode', ''):
c487cf00 1872 enc = encoding or preferredencoding()
104aa738 1873 elif hasattr(out, 'buffer'):
8a82af35 1874 buffer = out.buffer
104aa738 1875 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
c487cf00 1876
8a82af35 1877 buffer.write(s.encode(enc, 'ignore') if enc else s)
7459e3a2
PH
1878 out.flush()
1879
1880
da4db748 1881def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
69bec673 1882 from .. import _IN_CLI
da4db748 1883 if _IN_CLI:
1884 if msg in deprecation_warning._cache:
1885 return
1886 deprecation_warning._cache.add(msg)
1887 if printer:
1888 return printer(f'{msg}{bug_reports_message()}', **kwargs)
1889 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
1890 else:
1891 import warnings
1892 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
1893
1894
1895deprecation_warning._cache = set()
1896
1897
48ea9cea
PH
1898def bytes_to_intlist(bs):
1899 if not bs:
1900 return []
1901 if isinstance(bs[0], int): # Python 3
1902 return list(bs)
1903 else:
1904 return [ord(c) for c in bs]
1905
c257baff 1906
cba892fa 1907def intlist_to_bytes(xs):
1908 if not xs:
1909 return b''
ac668111 1910 return struct.pack('%dB' % len(xs), *xs)
c38b1e77
PH
1911
1912
8a82af35 1913class LockingUnsupportedError(OSError):
1890fc63 1914 msg = 'File locking is not supported'
0edb3e33 1915
1916 def __init__(self):
1917 super().__init__(self.msg)
1918
1919
c1c9a79c
PH
1920# Cross-platform file locking
1921if sys.platform == 'win32':
fe0918bb 1922 import ctypes
c1c9a79c
PH
1923 import ctypes.wintypes
1924 import msvcrt
1925
1926 class OVERLAPPED(ctypes.Structure):
1927 _fields_ = [
1928 ('Internal', ctypes.wintypes.LPVOID),
1929 ('InternalHigh', ctypes.wintypes.LPVOID),
1930 ('Offset', ctypes.wintypes.DWORD),
1931 ('OffsetHigh', ctypes.wintypes.DWORD),
1932 ('hEvent', ctypes.wintypes.HANDLE),
1933 ]
1934
37e325b9 1935 kernel32 = ctypes.WinDLL('kernel32')
c1c9a79c
PH
1936 LockFileEx = kernel32.LockFileEx
1937 LockFileEx.argtypes = [
1938 ctypes.wintypes.HANDLE, # hFile
1939 ctypes.wintypes.DWORD, # dwFlags
1940 ctypes.wintypes.DWORD, # dwReserved
1941 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1942 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1943 ctypes.POINTER(OVERLAPPED) # Overlapped
1944 ]
1945 LockFileEx.restype = ctypes.wintypes.BOOL
1946 UnlockFileEx = kernel32.UnlockFileEx
1947 UnlockFileEx.argtypes = [
1948 ctypes.wintypes.HANDLE, # hFile
1949 ctypes.wintypes.DWORD, # dwReserved
1950 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1951 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1952 ctypes.POINTER(OVERLAPPED) # Overlapped
1953 ]
1954 UnlockFileEx.restype = ctypes.wintypes.BOOL
1955 whole_low = 0xffffffff
1956 whole_high = 0x7fffffff
1957
747c0bd1 1958 def _lock_file(f, exclusive, block):
c1c9a79c
PH
1959 overlapped = OVERLAPPED()
1960 overlapped.Offset = 0
1961 overlapped.OffsetHigh = 0
1962 overlapped.hEvent = 0
1963 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
747c0bd1 1964
1965 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
1966 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
1967 0, whole_low, whole_high, f._lock_file_overlapped_p):
2cb19820 1968 # NB: No argument form of "ctypes.FormatError" does not work on PyPy
1969 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
c1c9a79c
PH
1970
1971 def _unlock_file(f):
1972 assert f._lock_file_overlapped_p
1973 handle = msvcrt.get_osfhandle(f.fileno())
747c0bd1 1974 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
c1c9a79c
PH
1975 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1976
1977else:
399a76e6
YCH
1978 try:
1979 import fcntl
c1c9a79c 1980
a3125791 1981 def _lock_file(f, exclusive, block):
b63837bc 1982 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
1983 if not block:
1984 flags |= fcntl.LOCK_NB
acea8d7c 1985 try:
b63837bc 1986 fcntl.flock(f, flags)
acea8d7c
JK
1987 except BlockingIOError:
1988 raise
1989 except OSError: # AOSP does not have flock()
b63837bc 1990 fcntl.lockf(f, flags)
c1c9a79c 1991
399a76e6 1992 def _unlock_file(f):
45998b3e
E
1993 with contextlib.suppress(OSError):
1994 return fcntl.flock(f, fcntl.LOCK_UN)
1995 with contextlib.suppress(OSError):
1996 return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock()
1997 return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking
a3125791 1998
399a76e6 1999 except ImportError:
399a76e6 2000
a3125791 2001 def _lock_file(f, exclusive, block):
0edb3e33 2002 raise LockingUnsupportedError()
399a76e6
YCH
2003
2004 def _unlock_file(f):
0edb3e33 2005 raise LockingUnsupportedError()
c1c9a79c
PH
2006
2007
86e5f3ed 2008class locked_file:
0edb3e33 2009 locked = False
747c0bd1 2010
a3125791 2011 def __init__(self, filename, mode, block=True, encoding=None):
fcfa8853
JK
2012 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2013 raise NotImplementedError(mode)
2014 self.mode, self.block = mode, block
2015
2016 writable = any(f in mode for f in 'wax+')
2017 readable = any(f in mode for f in 'r+')
2018 flags = functools.reduce(operator.ior, (
2019 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2020 getattr(os, 'O_BINARY', 0), # Windows only
2021 getattr(os, 'O_NOINHERIT', 0), # Windows only
2022 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2023 os.O_APPEND if 'a' in mode else 0,
2024 os.O_EXCL if 'x' in mode else 0,
2025 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2026 ))
2027
98804d03 2028 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
c1c9a79c
PH
2029
2030 def __enter__(self):
a3125791 2031 exclusive = 'r' not in self.mode
c1c9a79c 2032 try:
a3125791 2033 _lock_file(self.f, exclusive, self.block)
0edb3e33 2034 self.locked = True
86e5f3ed 2035 except OSError:
c1c9a79c
PH
2036 self.f.close()
2037 raise
fcfa8853 2038 if 'w' in self.mode:
131e14dc
JK
2039 try:
2040 self.f.truncate()
2041 except OSError as e:
1890fc63 2042 if e.errno not in (
2043 errno.ESPIPE, # Illegal seek - expected for FIFO
2044 errno.EINVAL, # Invalid argument - expected for /dev/null
2045 ):
2046 raise
c1c9a79c
PH
2047 return self
2048
0edb3e33 2049 def unlock(self):
2050 if not self.locked:
2051 return
c1c9a79c 2052 try:
0edb3e33 2053 _unlock_file(self.f)
c1c9a79c 2054 finally:
0edb3e33 2055 self.locked = False
c1c9a79c 2056
0edb3e33 2057 def __exit__(self, *_):
2058 try:
2059 self.unlock()
2060 finally:
2061 self.f.close()
4eb7f1d1 2062
0edb3e33 2063 open = __enter__
2064 close = __exit__
a3125791 2065
0edb3e33 2066 def __getattr__(self, attr):
2067 return getattr(self.f, attr)
a3125791 2068
0edb3e33 2069 def __iter__(self):
2070 return iter(self.f)
a3125791 2071
4eb7f1d1 2072
0b9c08b4 2073@functools.cache
4644ac55
S
2074def get_filesystem_encoding():
2075 encoding = sys.getfilesystemencoding()
2076 return encoding if encoding is not None else 'utf-8'
2077
2078
4eb7f1d1 2079def shell_quote(args):
a6a173c2 2080 quoted_args = []
4644ac55 2081 encoding = get_filesystem_encoding()
a6a173c2
JMF
2082 for a in args:
2083 if isinstance(a, bytes):
2084 # We may get a filename encoded with 'encodeFilename'
2085 a = a.decode(encoding)
aefce8e6 2086 quoted_args.append(compat_shlex_quote(a))
28e614de 2087 return ' '.join(quoted_args)
9d4660ca
PH
2088
2089
2090def smuggle_url(url, data):
2091 """ Pass additional data in a URL for internal use. """
2092
81953d1a
RA
2093 url, idata = unsmuggle_url(url, {})
2094 data.update(idata)
14f25df2 2095 sdata = urllib.parse.urlencode(
28e614de
PH
2096 {'__youtubedl_smuggle': json.dumps(data)})
2097 return url + '#' + sdata
9d4660ca
PH
2098
2099
79f82953 2100def unsmuggle_url(smug_url, default=None):
83e865a3 2101 if '#__youtubedl_smuggle' not in smug_url:
79f82953 2102 return smug_url, default
28e614de 2103 url, _, sdata = smug_url.rpartition('#')
14f25df2 2104 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
2105 data = json.loads(jsond)
2106 return url, data
02dbf93f
PH
2107
2108
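# Round-trip sketch for the smuggling helpers above (hypothetical `_demo_smuggle_url`
# helper, not part of the upstream module); the payload travels in the URL fragment.
def _demo_smuggle_url():
    url = smuggle_url('https://example.com/video', {'referrer': 'https://example.com/'})
    assert url.startswith('https://example.com/video#__youtubedl_smuggle=')
    assert unsmuggle_url(url) == ('https://example.com/video', {'referrer': 'https://example.com/'})
    # URLs without smuggled data come back unchanged, together with the supplied default
    assert unsmuggle_url('https://example.com/video', {}) == ('https://example.com/video', {})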
e0fd9573 2109def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2110 """ Formats numbers with decimal sufixes like K, M, etc """
2111 num, factor = float_or_none(num), float(factor)
4c3f8c3f 2112 if num is None or num < 0:
e0fd9573 2113 return None
eeb2a770 2114 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2115 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2116 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
abbeeebc 2117 if factor == 1024:
2118 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 2119 converted = num / (factor ** exponent)
abbeeebc 2120 return fmt % (converted, suffix)
e0fd9573 2121
2122
02dbf93f 2123def format_bytes(bytes):
f02d24d8 2124 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
f53c966a 2125
1c088fa8 2126
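# Expected formatting of the two helpers above (hypothetical `_demo_format_bytes`
# helper, not part of the upstream module); factor=1024 switches to binary (Ki/Mi/...)
# suffixes.
def _demo_format_bytes():
    assert format_decimal_suffix(1500, '%.1f%s') == '1.5k'
    assert format_decimal_suffix(1024, '%.2f%sB', factor=1024) == '1.00KiB'
    assert format_bytes(123456789) == '117.74MiB'
    assert format_bytes(None) == 'N/A'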
64c464a1 2127def lookup_unit_table(unit_table, s, strict=False):
2128 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
fb47597b 2129 units_re = '|'.join(re.escape(u) for u in unit_table)
64c464a1 2130 m = (re.fullmatch if strict else re.match)(
2131 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
fb47597b
S
2132 if not m:
2133 return None
64c464a1 2134
2135 num = float(m.group('num').replace(',', '.'))
fb47597b 2136 mult = unit_table[m.group('unit')]
64c464a1 2137 return round(num * mult)
2138
2139
2140def parse_bytes(s):
2141 """Parse a string indicating a byte quantity into an integer"""
2142 return lookup_unit_table(
2143 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
2144 s.upper(), strict=True)
fb47597b
S
2145
2146
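# Expected behaviour of parse_bytes (hypothetical `_demo_parse_bytes` helper, not part
# of the upstream module), assuming the module's NUMBER_RE (digits with an optional
# fractional part): single-letter suffixes, binary multiples, case-insensitive.
def _demo_parse_bytes():
    assert parse_bytes('500') == 500
    assert parse_bytes('500k') == 512000   # 500 * 1024
    assert parse_bytes('1.5M') == 1572864  # 1.5 * 1024 ** 2
    assert parse_bytes('not a size') is None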
be64b5b0
PH
2147def parse_filesize(s):
2148 if s is None:
2149 return None
2150
dfb1b146 2151 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
2152 # but we support those too
2153 _UNIT_TABLE = {
2154 'B': 1,
2155 'b': 1,
70852b47 2156 'bytes': 1,
be64b5b0
PH
2157 'KiB': 1024,
2158 'KB': 1000,
2159 'kB': 1024,
2160 'Kb': 1000,
13585d76 2161 'kb': 1000,
70852b47
YCH
2162 'kilobytes': 1000,
2163 'kibibytes': 1024,
be64b5b0
PH
2164 'MiB': 1024 ** 2,
2165 'MB': 1000 ** 2,
2166 'mB': 1024 ** 2,
2167 'Mb': 1000 ** 2,
13585d76 2168 'mb': 1000 ** 2,
70852b47
YCH
2169 'megabytes': 1000 ** 2,
2170 'mebibytes': 1024 ** 2,
be64b5b0
PH
2171 'GiB': 1024 ** 3,
2172 'GB': 1000 ** 3,
2173 'gB': 1024 ** 3,
2174 'Gb': 1000 ** 3,
13585d76 2175 'gb': 1000 ** 3,
70852b47
YCH
2176 'gigabytes': 1000 ** 3,
2177 'gibibytes': 1024 ** 3,
be64b5b0
PH
2178 'TiB': 1024 ** 4,
2179 'TB': 1000 ** 4,
2180 'tB': 1024 ** 4,
2181 'Tb': 1000 ** 4,
13585d76 2182 'tb': 1000 ** 4,
70852b47
YCH
2183 'terabytes': 1000 ** 4,
2184 'tebibytes': 1024 ** 4,
be64b5b0
PH
2185 'PiB': 1024 ** 5,
2186 'PB': 1000 ** 5,
2187 'pB': 1024 ** 5,
2188 'Pb': 1000 ** 5,
13585d76 2189 'pb': 1000 ** 5,
70852b47
YCH
2190 'petabytes': 1000 ** 5,
2191 'pebibytes': 1024 ** 5,
be64b5b0
PH
2192 'EiB': 1024 ** 6,
2193 'EB': 1000 ** 6,
2194 'eB': 1024 ** 6,
2195 'Eb': 1000 ** 6,
13585d76 2196 'eb': 1000 ** 6,
70852b47
YCH
2197 'exabytes': 1000 ** 6,
2198 'exbibytes': 1024 ** 6,
be64b5b0
PH
2199 'ZiB': 1024 ** 7,
2200 'ZB': 1000 ** 7,
2201 'zB': 1024 ** 7,
2202 'Zb': 1000 ** 7,
13585d76 2203 'zb': 1000 ** 7,
70852b47
YCH
2204 'zettabytes': 1000 ** 7,
2205 'zebibytes': 1024 ** 7,
be64b5b0
PH
2206 'YiB': 1024 ** 8,
2207 'YB': 1000 ** 8,
2208 'yB': 1024 ** 8,
2209 'Yb': 1000 ** 8,
13585d76 2210 'yb': 1000 ** 8,
70852b47
YCH
2211 'yottabytes': 1000 ** 8,
2212 'yobibytes': 1024 ** 8,
be64b5b0
PH
2213 }
2214
fb47597b
S
2215 return lookup_unit_table(_UNIT_TABLE, s)
2216
2217
2218def parse_count(s):
2219 if s is None:
be64b5b0
PH
2220 return None
2221
352d5da8 2222 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
2223
2224 if re.match(r'^[\d,.]+$', s):
2225 return str_to_int(s)
2226
2227 _UNIT_TABLE = {
2228 'k': 1000,
2229 'K': 1000,
2230 'm': 1000 ** 2,
2231 'M': 1000 ** 2,
2232 'kk': 1000 ** 2,
2233 'KK': 1000 ** 2,
352d5da8 2234 'b': 1000 ** 3,
2235 'B': 1000 ** 3,
fb47597b 2236 }
be64b5b0 2237
352d5da8 2238 ret = lookup_unit_table(_UNIT_TABLE, s)
2239 if ret is not None:
2240 return ret
2241
2242 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2243 if mobj:
2244 return str_to_int(mobj.group(1))
be64b5b0 2245
2f7ae819 2246
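# Expected behaviour of the two parsers above (hypothetical `_demo_parse_amounts`
# helper, not part of the upstream module): parse_filesize distinguishes SI from
# binary units, parse_count understands abbreviated view/like counts.
def _demo_parse_amounts():
    assert parse_filesize('1.5GB') == 1500000000
    assert parse_filesize('1.5GiB') == 1610612736
    assert parse_count('1.23M') == 1230000
    assert parse_count('1,480 views') == 1480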
5d45484c 2247def parse_resolution(s, *, lenient=False):
b871d7e9
S
2248 if s is None:
2249 return {}
2250
5d45484c
LNO
2251 if lenient:
2252 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2253 else:
2254 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
2255 if mobj:
2256 return {
2257 'width': int(mobj.group('w')),
2258 'height': int(mobj.group('h')),
2259 }
2260
17ec8bcf 2261 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
2262 if mobj:
2263 return {'height': int(mobj.group(1))}
2264
2265 mobj = re.search(r'\b([48])[kK]\b', s)
2266 if mobj:
2267 return {'height': int(mobj.group(1)) * 540}
2268
2269 return {}
2270
2271
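# Expected behaviour of parse_resolution (hypothetical `_demo_parse_resolution`
# helper, not part of the upstream module).
def _demo_parse_resolution():
    assert parse_resolution('1920x1080') == {'width': 1920, 'height': 1080}
    assert parse_resolution('720p') == {'height': 720}
    assert parse_resolution('4k') == {'height': 2160}
    assert parse_resolution('no resolution here') == {}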
0dc41787 2272def parse_bitrate(s):
14f25df2 2273 if not isinstance(s, str):
0dc41787
S
2274 return
2275 mobj = re.search(r'\b(\d+)\s*kbps', s)
2276 if mobj:
2277 return int(mobj.group(1))
2278
2279
a942d6cb 2280def month_by_name(name, lang='en'):
caefb1de
PH
2281 """ Return the number of a month by (locale-independently) English name """
2282
f6717dec 2283 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 2284
caefb1de 2285 try:
f6717dec 2286 return month_names.index(name) + 1
7105440c
YCH
2287 except ValueError:
2288 return None
2289
2290
2291def month_by_abbreviation(abbrev):
2292 """ Return the number of a month by (locale-independently) English
2293 abbreviations """
2294
2295 try:
2296 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
2297 except ValueError:
2298 return None
18258362
JMF
2299
2300
5aafe895 2301def fix_xml_ampersands(xml_str):
18258362 2302 """Escape bare '&' characters as '&amp;' in XML"""
5aafe895
PH
2303 return re.sub(
2304 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 2305 '&amp;',
5aafe895 2306 xml_str)
e3946f98
PH
2307
2308
2309def setproctitle(title):
14f25df2 2310 assert isinstance(title, str)
c1c05c67 2311
fe0918bb 2312 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2313 try:
2314 import ctypes
2315 except ImportError:
c1c05c67
YCH
2316 return
2317
e3946f98 2318 try:
611c1dd9 2319 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
2320 except OSError:
2321 return
2f49bcd6
RC
2322 except TypeError:
2323 # LoadLibrary in Windows Python 2.7.13 only expects
2324 # a bytestring, but since unicode_literals turns
2325 # every string into a unicode string, it fails.
2326 return
0f06bcd7 2327 title_bytes = title.encode()
6eefe533
PH
2328 buf = ctypes.create_string_buffer(len(title_bytes))
2329 buf.value = title_bytes
e3946f98 2330 try:
6eefe533 2331 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
2332 except AttributeError:
2333 return # Strange libc, just skip this
d7dda168
PH
2334
2335
2336def remove_start(s, start):
46bc9b7d 2337 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
2338
2339
2b9faf55 2340def remove_end(s, end):
46bc9b7d 2341 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
2342
2343
31b2051e
S
2344def remove_quotes(s):
2345 if s is None or len(s) < 2:
2346 return s
2347 for quote in ('"', "'", ):
2348 if s[0] == quote and s[-1] == quote:
2349 return s[1:-1]
2350 return s
2351
2352
b6e0c7d2 2353def get_domain(url):
ebf99aaf 2354 """
2355 This implementation is inconsistent, but is kept for compatibility.
2356 Use this only for "webpage_url_domain"
2357 """
2358 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
b6e0c7d2
U
2359
2360
29eb5174 2361def url_basename(url):
14f25df2 2362 path = urllib.parse.urlparse(url).path
28e614de 2363 return path.strip('/').split('/')[-1]
aa94a6d3
PH
2364
2365
02dc0a36 2366def base_url(url):
7657ec7e 2367 return re.match(r'https?://[^?#]+/', url).group()
02dc0a36
S
2368
2369
e34c3361 2370def urljoin(base, path):
4b5de77b 2371 if isinstance(path, bytes):
0f06bcd7 2372 path = path.decode()
14f25df2 2373 if not isinstance(path, str) or not path:
e34c3361 2374 return None
fad4ceb5 2375 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 2376 return path
4b5de77b 2377 if isinstance(base, bytes):
0f06bcd7 2378 base = base.decode()
14f25df2 2379 if not isinstance(base, str) or not re.match(
4b5de77b 2380 r'^(?:https?:)?//', base):
e34c3361 2381 return None
14f25df2 2382 return urllib.parse.urljoin(base, path)
e34c3361
S
2383
2384
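# Quick sketch of the URL helpers above (hypothetical `_demo_url_helpers` helper, not
# part of the upstream module); urljoin() returns None instead of raising on bad input.
def _demo_url_helpers():
    assert url_basename('https://example.com/dir/file.mp4?x=1') == 'file.mp4'
    assert base_url('https://example.com/dir/file.mp4?x=1') == 'https://example.com/dir/'
    assert urljoin('https://example.com/a/', 'b/c.txt') == 'https://example.com/a/b/c.txt'
    assert urljoin('not a url', 'b/c.txt') is None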
ac668111 2385class HEADRequest(urllib.request.Request):
aa94a6d3 2386 def get_method(self):
611c1dd9 2387 return 'HEAD'
7217e148
PH
2388
2389
ac668111 2390class PUTRequest(urllib.request.Request):
95cf60e8
S
2391 def get_method(self):
2392 return 'PUT'
2393
2394
9732d77e 2395def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
9e907ebd 2396 if get_attr and v is not None:
2397 v = getattr(v, get_attr, None)
1812afb7
S
2398 try:
2399 return int(v) * invscale // scale
31c49255 2400 except (ValueError, TypeError, OverflowError):
af98f8ff 2401 return default
9732d77e 2402
9572013d 2403
40a90862 2404def str_or_none(v, default=None):
14f25df2 2405 return default if v is None else str(v)
40a90862 2406
9732d77e
PH
2407
2408def str_to_int(int_str):
48d4681e 2409 """ A more relaxed version of int_or_none """
f9934b96 2410 if isinstance(int_str, int):
348c6bf1 2411 return int_str
14f25df2 2412 elif isinstance(int_str, str):
42db58ec
S
2413 int_str = re.sub(r'[,\.\+]', '', int_str)
2414 return int_or_none(int_str)
608d11f5
PH
2415
2416
9732d77e 2417def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
2418 if v is None:
2419 return default
2420 try:
2421 return float(v) * invscale / scale
5e1271c5 2422 except (ValueError, TypeError):
caf80631 2423 return default
43f775e4
PH
2424
2425
c7e327c4
S
2426def bool_or_none(v, default=None):
2427 return v if isinstance(v, bool) else default
2428
2429
53cd37ba 2430def strip_or_none(v, default=None):
14f25df2 2431 return v.strip() if isinstance(v, str) else default
b72b4431
S
2432
2433
af03000a 2434def url_or_none(url):
14f25df2 2435 if not url or not isinstance(url, str):
af03000a
S
2436 return None
2437 url = url.strip()
29f7c58a 2438 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2439
2440
3e9b66d7 2441def request_to_url(req):
ac668111 2442 if isinstance(req, urllib.request.Request):
3e9b66d7
LNO
2443 return req.get_full_url()
2444 else:
2445 return req
2446
2447
e29663c6 2448def strftime_or_none(timestamp, date_format, default=None):
2449 datetime_object = None
2450 try:
f9934b96 2451 if isinstance(timestamp, (int, float)): # unix timestamp
d509c1f5 2452 # Using naive datetime here can break timestamp() in Windows
2453 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2454 datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
14f25df2 2455 elif isinstance(timestamp, str): # assume YYYYMMDD
e29663c6 2456 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
9665f15a 2457 date_format = re.sub( # Support %s on windows
2458 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
e29663c6 2459 return datetime_object.strftime(date_format)
2460 except (ValueError, TypeError, AttributeError):
2461 return default
2462
2463
608d11f5 2464def parse_duration(s):
f9934b96 2465 if not isinstance(s, str):
608d11f5 2466 return None
ca7b3246 2467 s = s.strip()
38d79fd1 2468 if not s:
2469 return None
ca7b3246 2470
acaff495 2471 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2472 m = re.match(r'''(?x)
2473 (?P<before_secs>
2474 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2475 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2476 (?P<ms>[.:][0-9]+)?Z?$
2477 ''', s)
acaff495 2478 if m:
8bd1c00b 2479 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2480 else:
2481 m = re.match(
056653bb
S
2482 r'''(?ix)(?:P?
2483 (?:
1c1b2f96 2484 [0-9]+\s*y(?:ears?)?,?\s*
056653bb
S
2485 )?
2486 (?:
1c1b2f96 2487 [0-9]+\s*m(?:onths?)?,?\s*
056653bb
S
2488 )?
2489 (?:
1c1b2f96 2490 [0-9]+\s*w(?:eeks?)?,?\s*
056653bb 2491 )?
8f4b58d7 2492 (?:
1c1b2f96 2493 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
8f4b58d7 2494 )?
056653bb 2495 T)?
acaff495 2496 (?:
1c1b2f96 2497 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
acaff495 2498 )?
2499 (?:
1c1b2f96 2500 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
acaff495 2501 )?
2502 (?:
2503 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2504 )?Z?$''', s)
acaff495 2505 if m:
2506 days, hours, mins, secs, ms = m.groups()
2507 else:
15846398 2508 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2509 if m:
2510 hours, mins = m.groups()
2511 else:
2512 return None
2513
acaff495 2514 if ms:
19a03940 2515 ms = ms.replace(':', '.')
2516 return sum(float(part or 0) * mult for part, mult in (
2517 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2518
2519
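# Expected behaviour of parse_duration (hypothetical `_demo_parse_duration` helper,
# not part of the upstream module); durations come back in seconds.
def _demo_parse_duration():
    assert parse_duration('9:12:43') == 33163
    assert parse_duration('01:02:03.05') == 3723.05
    assert parse_duration('3h11m53s') == 11513
    assert parse_duration('PT3M30S') == 210
    assert parse_duration('not a duration') is None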
e65e4c88 2520def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2521 name, real_ext = os.path.splitext(filename)
e65e4c88 2522 return (
86e5f3ed 2523 f'{name}.{ext}{real_ext}'
e65e4c88 2524 if not expected_real_ext or real_ext[1:] == expected_real_ext
86e5f3ed 2525 else f'{filename}.{ext}')
d70ad093
PH
2526
2527
b3ed15b7
S
2528def replace_extension(filename, ext, expected_real_ext=None):
2529 name, real_ext = os.path.splitext(filename)
86e5f3ed 2530 return '{}.{}'.format(
b3ed15b7
S
2531 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2532 ext)
2533
2534
d70ad093
PH
2535def check_executable(exe, args=[]):
2536 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2537 args can be a list of arguments for a short output (like -version) """
2538 try:
f0c9fb96 2539 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
d70ad093
PH
2540 except OSError:
2541 return False
2542 return exe
b7ab0590
PH
2543
2544
7aaf4cd2 2545def _get_exe_version_output(exe, args):
95807118 2546 try:
b64d04c1 2547 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2548 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2549 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
1cdda329 2550 stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
2551 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2552 if ret:
2553 return None
95807118
PH
2554 except OSError:
2555 return False
f0c9fb96 2556 return stdout
cae97f65
PH
2557
2558
2559def detect_exe_version(output, version_re=None, unrecognized='present'):
14f25df2 2560 assert isinstance(output, str)
cae97f65
PH
2561 if version_re is None:
2562 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2563 m = re.search(version_re, output)
95807118
PH
2564 if m:
2565 return m.group(1)
2566 else:
2567 return unrecognized
2568
2569
9af98e17 2570def get_exe_version(exe, args=['--version'],
1cdda329 2571 version_re=None, unrecognized=('present', 'broken')):
9af98e17 2572 """ Returns the version of the specified executable,
2573 or False if the executable is not present """
1cdda329 2574 unrecognized = variadic(unrecognized)
2575 assert len(unrecognized) in (1, 2)
9af98e17 2576 out = _get_exe_version_output(exe, args)
1cdda329 2577 if out is None:
2578 return unrecognized[-1]
2579 return out and detect_exe_version(out, version_re, unrecognized[0])
9af98e17 2580
2581
7e88d7d7 2582def frange(start=0, stop=None, step=1):
2583 """Float range"""
2584 if stop is None:
2585 start, stop = 0, start
2586 sign = [-1, 1][step > 0] if step else 0
2587 while sign * start < sign * stop:
2588 yield start
2589 start += step
2590
2591
cb89cfc1 2592class LazyList(collections.abc.Sequence):
0f06bcd7 2593 """Lazy immutable list from an iterable
2594 Note that slices of a LazyList are lists and not LazyList"""
483336e7 2595
8e5fecc8 2596 class IndexError(IndexError):
2597 pass
2598
282f5709 2599 def __init__(self, iterable, *, reverse=False, _cache=None):
0f06bcd7 2600 self._iterable = iter(iterable)
2601 self._cache = [] if _cache is None else _cache
2602 self._reversed = reverse
483336e7 2603
2604 def __iter__(self):
0f06bcd7 2605 if self._reversed:
28419ca2 2606 # We need to consume the entire iterable to iterate in reverse
981052c9 2607 yield from self.exhaust()
28419ca2 2608 return
0f06bcd7 2609 yield from self._cache
2610 for item in self._iterable:
2611 self._cache.append(item)
483336e7 2612 yield item
2613
0f06bcd7 2614 def _exhaust(self):
2615 self._cache.extend(self._iterable)
2616 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2617 return self._cache
28419ca2 2618
981052c9 2619 def exhaust(self):
0f06bcd7 2620 """Evaluate the entire iterable"""
2621 return self._exhaust()[::-1 if self._reversed else 1]
981052c9 2622
28419ca2 2623 @staticmethod
0f06bcd7 2624 def _reverse_index(x):
f2df4071 2625 return None if x is None else ~x
483336e7 2626
2627 def __getitem__(self, idx):
2628 if isinstance(idx, slice):
0f06bcd7 2629 if self._reversed:
2630 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
e0f2b4b4 2631 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2632 elif isinstance(idx, int):
0f06bcd7 2633 if self._reversed:
2634 idx = self._reverse_index(idx)
e0f2b4b4 2635 start, stop, step = idx, idx, 0
483336e7 2636 else:
2637 raise TypeError('indices must be integers or slices')
e0f2b4b4 2638 if ((start or 0) < 0 or (stop or 0) < 0
2639 or (start is None and step < 0)
2640 or (stop is None and step > 0)):
483336e7 2641 # We need to consume the entire iterable to be able to slice from the end
2642 # Obviously, never use this with infinite iterables
0f06bcd7 2643 self._exhaust()
8e5fecc8 2644 try:
0f06bcd7 2645 return self._cache[idx]
8e5fecc8 2646 except IndexError as e:
2647 raise self.IndexError(e) from e
0f06bcd7 2648 n = max(start or 0, stop or 0) - len(self._cache) + 1
28419ca2 2649 if n > 0:
0f06bcd7 2650 self._cache.extend(itertools.islice(self._iterable, n))
8e5fecc8 2651 try:
0f06bcd7 2652 return self._cache[idx]
8e5fecc8 2653 except IndexError as e:
2654 raise self.IndexError(e) from e
483336e7 2655
2656 def __bool__(self):
2657 try:
0f06bcd7 2658 self[-1] if self._reversed else self[0]
8e5fecc8 2659 except self.IndexError:
483336e7 2660 return False
2661 return True
2662
2663 def __len__(self):
0f06bcd7 2664 self._exhaust()
2665 return len(self._cache)
483336e7 2666
282f5709 2667 def __reversed__(self):
0f06bcd7 2668 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
282f5709 2669
2670 def __copy__(self):
0f06bcd7 2671 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
282f5709 2672
28419ca2 2673 def __repr__(self):
2674 # repr and str should mimic a list. So we exhaust the iterable
2675 return repr(self.exhaust())
2676
2677 def __str__(self):
2678 return repr(self.exhaust())
2679
483336e7 2680
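# Behavioural sketch for LazyList (hypothetical `_demo_lazy_list` helper, not part of
# the upstream module): items are pulled from the iterable only as far as needed.
def _demo_lazy_list():
    lazy = LazyList(itertools.count())   # an infinite iterator is fine...
    assert lazy[:5] == [0, 1, 2, 3, 4]   # ...as long as slices stay bounded
    assert lazy[7] == 7
    assert isinstance(lazy[:5], list)    # slices are plain lists, not LazyList
    assert list(LazyList(range(5), reverse=True)) == [4, 3, 2, 1, 0]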
7be9ccff 2681class PagedList:
c07a39ae 2682
2683 class IndexError(IndexError):
2684 pass
2685
dd26ced1
PH
2686 def __len__(self):
2687 # This is only useful for tests
2688 return len(self.getslice())
2689
7be9ccff 2690 def __init__(self, pagefunc, pagesize, use_cache=True):
2691 self._pagefunc = pagefunc
2692 self._pagesize = pagesize
f1d13090 2693 self._pagecount = float('inf')
7be9ccff 2694 self._use_cache = use_cache
2695 self._cache = {}
2696
2697 def getpage(self, pagenum):
d8cf8d97 2698 page_results = self._cache.get(pagenum)
2699 if page_results is None:
f1d13090 2700 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
7be9ccff 2701 if self._use_cache:
2702 self._cache[pagenum] = page_results
2703 return page_results
2704
2705 def getslice(self, start=0, end=None):
2706 return list(self._getslice(start, end))
2707
2708 def _getslice(self, start, end):
55575225 2709 raise NotImplementedError('This method must be implemented by subclasses')
2710
2711 def __getitem__(self, idx):
f1d13090 2712 assert self._use_cache, 'Indexing PagedList requires cache'
55575225 2713 if not isinstance(idx, int) or idx < 0:
2714 raise TypeError('indices must be non-negative integers')
2715 entries = self.getslice(idx, idx + 1)
d8cf8d97 2716 if not entries:
c07a39ae 2717 raise self.IndexError()
d8cf8d97 2718 return entries[0]
55575225 2719
9c44d242
PH
2720
2721class OnDemandPagedList(PagedList):
a44ca5a4 2722 """Download pages until a page with fewer than the maximum number of results"""
86e5f3ed 2723
7be9ccff 2724 def _getslice(self, start, end):
b7ab0590
PH
2725 for pagenum in itertools.count(start // self._pagesize):
2726 firstid = pagenum * self._pagesize
2727 nextfirstid = pagenum * self._pagesize + self._pagesize
2728 if start >= nextfirstid:
2729 continue
2730
b7ab0590
PH
2731 startv = (
2732 start % self._pagesize
2733 if firstid <= start < nextfirstid
2734 else 0)
b7ab0590
PH
2735 endv = (
2736 ((end - 1) % self._pagesize) + 1
2737 if (end is not None and firstid <= end <= nextfirstid)
2738 else None)
2739
f1d13090 2740 try:
2741 page_results = self.getpage(pagenum)
2742 except Exception:
2743 self._pagecount = pagenum - 1
2744 raise
b7ab0590
PH
2745 if startv != 0 or endv is not None:
2746 page_results = page_results[startv:endv]
7be9ccff 2747 yield from page_results
b7ab0590
PH
2748
2749 # A little optimization - if the current page is not "full", i.e. does
2750 # not contain page_size videos, then we can assume that this page
2751 # is the last one - there are no more ids on further pages -
2752 # so there is no need to query again.
2753 if len(page_results) + startv < self._pagesize:
2754 break
2755
2756 # If we got the whole page, but the next page is not interesting,
2757 # break out early as well
2758 if end == nextfirstid:
2759 break
81c2f20b
PH
2760
2761
9c44d242 2762class InAdvancePagedList(PagedList):
a44ca5a4 2763 """PagedList with total number of pages known in advance"""
86e5f3ed 2764
9c44d242 2765 def __init__(self, pagefunc, pagecount, pagesize):
7be9ccff 2766 PagedList.__init__(self, pagefunc, pagesize, True)
f1d13090 2767 self._pagecount = pagecount
9c44d242 2768
7be9ccff 2769 def _getslice(self, start, end):
9c44d242 2770 start_page = start // self._pagesize
d37707bd 2771 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
9c44d242
PH
2772 skip_elems = start - start_page * self._pagesize
2773 only_more = None if end is None else end - start
2774 for pagenum in range(start_page, end_page):
7be9ccff 2775 page_results = self.getpage(pagenum)
9c44d242 2776 if skip_elems:
7be9ccff 2777 page_results = page_results[skip_elems:]
9c44d242
PH
2778 skip_elems = None
2779 if only_more is not None:
7be9ccff 2780 if len(page_results) < only_more:
2781 only_more -= len(page_results)
9c44d242 2782 else:
7be9ccff 2783 yield from page_results[:only_more]
9c44d242 2784 break
7be9ccff 2785 yield from page_results
9c44d242
PH
2786
2787
7e88d7d7 2788class PlaylistEntries:
2789 MissingEntry = object()
2790 is_exhausted = False
2791
2792 def __init__(self, ydl, info_dict):
7e9a6125 2793 self.ydl = ydl
2794
2795 # _entries must be assigned now since infodict can change during iteration
2796 entries = info_dict.get('entries')
2797 if entries is None:
2798 raise EntryNotInPlaylist('There are no entries')
2799 elif isinstance(entries, list):
2800 self.is_exhausted = True
2801
2802 requested_entries = info_dict.get('requested_entries')
bc5c2f8a 2803 self.is_incomplete = requested_entries is not None
7e9a6125 2804 if self.is_incomplete:
2805 assert self.is_exhausted
bc5c2f8a 2806 self._entries = [self.MissingEntry] * max(requested_entries or [0])
7e9a6125 2807 for i, entry in zip(requested_entries, entries):
2808 self._entries[i - 1] = entry
2809 elif isinstance(entries, (list, PagedList, LazyList)):
2810 self._entries = entries
2811 else:
2812 self._entries = LazyList(entries)
7e88d7d7 2813
2814 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2815 (?P<start>[+-]?\d+)?
2816 (?P<range>[:-]
2817 (?P<end>[+-]?\d+|inf(?:inite)?)?
2818 (?::(?P<step>[+-]?\d+))?
2819 )?''')
2820
2821 @classmethod
2822 def parse_playlist_items(cls, string):
2823 for segment in string.split(','):
2824 if not segment:
2825 raise ValueError('There are two or more consecutive commas')
2826 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2827 if not mobj:
2828 raise ValueError(f'{segment!r} is not a valid specification')
2829 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2830 if int_or_none(step) == 0:
2831 raise ValueError(f'Step in {segment!r} cannot be zero')
2832 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2833
2834 def get_requested_items(self):
2835 playlist_items = self.ydl.params.get('playlist_items')
2836 playlist_start = self.ydl.params.get('playliststart', 1)
2837 playlist_end = self.ydl.params.get('playlistend')
2838 # For backwards compatibility, interpret -1 as whole list
2839 if playlist_end in (-1, None):
2840 playlist_end = ''
2841 if not playlist_items:
2842 playlist_items = f'{playlist_start}:{playlist_end}'
2843 elif playlist_start != 1 or playlist_end:
2844 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2845
2846 for index in self.parse_playlist_items(playlist_items):
2847 for i, entry in self[index]:
2848 yield i, entry
1ac4fd80 2849 if not entry:
2850 continue
7e88d7d7 2851 try:
d21056f4 2852 # The item may have just been added to archive. Don't break due to it
2853 if not self.ydl.params.get('lazy_playlist'):
2854 # TODO: Add auto-generated fields
2855 self.ydl._match_entry(entry, incomplete=True, silent=True)
7e88d7d7 2856 except (ExistingVideoReached, RejectedVideoReached):
2857 return
2858
7e9a6125 2859 def get_full_count(self):
2860 if self.is_exhausted and not self.is_incomplete:
7e88d7d7 2861 return len(self)
2862 elif isinstance(self._entries, InAdvancePagedList):
2863 if self._entries._pagesize == 1:
2864 return self._entries._pagecount
2865
7e88d7d7 2866 @functools.cached_property
2867 def _getter(self):
2868 if isinstance(self._entries, list):
2869 def get_entry(i):
2870 try:
2871 entry = self._entries[i]
2872 except IndexError:
2873 entry = self.MissingEntry
2874 if not self.is_incomplete:
2875 raise self.IndexError()
2876 if entry is self.MissingEntry:
bc5c2f8a 2877 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
7e88d7d7 2878 return entry
2879 else:
2880 def get_entry(i):
2881 try:
2882 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2883 except (LazyList.IndexError, PagedList.IndexError):
2884 raise self.IndexError()
2885 return get_entry
2886
2887 def __getitem__(self, idx):
2888 if isinstance(idx, int):
2889 idx = slice(idx, idx)
2890
2891 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2892 step = 1 if idx.step is None else idx.step
2893 if idx.start is None:
2894 start = 0 if step > 0 else len(self) - 1
2895 else:
2896 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2897
2898 # NB: Do not call len(self) when idx == [:]
2899 if idx.stop is None:
2900 stop = 0 if step < 0 else float('inf')
2901 else:
2902 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2903 stop += [-1, 1][step > 0]
2904
2905 for i in frange(start, stop, step):
2906 if i < 0:
2907 continue
2908 try:
7e9a6125 2909 entry = self._getter(i)
2910 except self.IndexError:
2911 self.is_exhausted = True
2912 if step > 0:
7e88d7d7 2913 break
7e9a6125 2914 continue
7e88d7d7 2915 yield i + 1, entry
2916
2917 def __len__(self):
2918 return len(tuple(self[:]))
2919
2920 class IndexError(IndexError):
2921 pass
2922
2923
81c2f20b 2924def uppercase_escape(s):
676eb3f2 2925 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2926 return re.sub(
a612753d 2927 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2928 lambda m: unicode_escape(m.group(0))[0],
2929 s)
0fe2ff78
YCH
2930
2931
2932def lowercase_escape(s):
2933 unicode_escape = codecs.getdecoder('unicode_escape')
2934 return re.sub(
2935 r'\\u[0-9a-fA-F]{4}',
2936 lambda m: unicode_escape(m.group(0))[0],
2937 s)
b53466e1 2938
d05cfe06
S
2939
2940def escape_rfc3986(s):
2941 """Escape non-ASCII characters as suggested by RFC 3986"""
f9934b96 2942 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2943
2944
2945def escape_url(url):
2946 """Escape URL as suggested by RFC 3986"""
14f25df2 2947 url_parsed = urllib.parse.urlparse(url)
d05cfe06 2948 return url_parsed._replace(
efbed08d 2949 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
2950 path=escape_rfc3986(url_parsed.path),
2951 params=escape_rfc3986(url_parsed.params),
2952 query=escape_rfc3986(url_parsed.query),
2953 fragment=escape_rfc3986(url_parsed.fragment)
2954 ).geturl()
2955
62e609ab 2956
96b9e9cf 2957def parse_qs(url, **kwargs):
2958 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
4dfbf869 2959
2960
62e609ab
PH
2961def read_batch_urls(batch_fd):
2962 def fixup(url):
14f25df2 2963 if not isinstance(url, str):
62e609ab 2964 url = url.decode('utf-8', 'replace')
8c04f0be 2965 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2966 for bom in BOM_UTF8:
2967 if url.startswith(bom):
2968 url = url[len(bom):]
2969 url = url.lstrip()
2970 if not url or url.startswith(('#', ';', ']')):
62e609ab 2971 return False
8c04f0be 2972 # "#" cannot be stripped out since it is part of the URI
962ffcf8 2973 # However, it can be safely stripped out if it follows a whitespace
8c04f0be 2974 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
2975
2976 with contextlib.closing(batch_fd) as fd:
2977 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2978
2979
2980def urlencode_postdata(*args, **kargs):
14f25df2 2981 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2982
2983
45b2ee6f 2984def update_url(url, *, query_update=None, **kwargs):
2985 """Replace URL components specified by kwargs
2986 @param url str or parse url tuple
2987 @param query_update update query
2988 @returns str
2989 """
2990 if isinstance(url, str):
2991 if not kwargs and not query_update:
2992 return url
2993 else:
2994 url = urllib.parse.urlparse(url)
2995 if query_update:
2996 assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
2997 kwargs['query'] = urllib.parse.urlencode({
2998 **urllib.parse.parse_qs(url.query),
2999 **query_update
3000 }, True)
3001 return urllib.parse.urlunparse(url._replace(**kwargs))
3002
3003
38f9ef31 3004def update_url_query(url, query):
45b2ee6f 3005 return update_url(url, query_update=query)
16392824 3006
8e60dc75 3007
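# Expected behaviour of the URL update helpers above (hypothetical `_demo_update_url`
# helper, not part of the upstream module).
def _demo_update_url():
    assert update_url_query('https://example.com/path?a=1', {'b': '2'}) == 'https://example.com/path?a=1&b=2'
    assert update_url('https://example.com/path?a=1', fragment='top') == 'https://example.com/path?a=1#top'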
c043c246 3008def update_Request(req, url=None, data=None, headers=None, query=None):
ed0291d1 3009 req_headers = req.headers.copy()
c043c246 3010 req_headers.update(headers or {})
ed0291d1
S
3011 req_data = data or req.data
3012 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
3013 req_get_method = req.get_method()
3014 if req_get_method == 'HEAD':
3015 req_type = HEADRequest
3016 elif req_get_method == 'PUT':
3017 req_type = PUTRequest
3018 else:
ac668111 3019 req_type = urllib.request.Request
ed0291d1
S
3020 new_req = req_type(
3021 req_url, data=req_data, headers=req_headers,
3022 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3023 if hasattr(req, 'timeout'):
3024 new_req.timeout = req.timeout
3025 return new_req
3026
3027
10c87c15 3028def _multipart_encode_impl(data, boundary):
0c265486
YCH
3029 content_type = 'multipart/form-data; boundary=%s' % boundary
3030
3031 out = b''
3032 for k, v in data.items():
3033 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 3034 if isinstance(k, str):
0f06bcd7 3035 k = k.encode()
14f25df2 3036 if isinstance(v, str):
0f06bcd7 3037 v = v.encode()
0c265486
YCH
3038 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3039 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 3040 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
3041 if boundary.encode('ascii') in content:
3042 raise ValueError('Boundary overlaps with data')
3043 out += content
3044
3045 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3046
3047 return out, content_type
3048
3049
3050def multipart_encode(data, boundary=None):
3051 '''
3052 Encode a dict to RFC 7578-compliant form-data
3053
3054 data:
3055 A dict where keys and values can be either Unicode or bytes-like
3056 objects.
3057 boundary:
3058 If specified, it must be a Unicode object and is used as the boundary.
3059 Otherwise, a random boundary is generated.
3060
3061 Reference: https://tools.ietf.org/html/rfc7578
3062 '''
3063 has_specified_boundary = boundary is not None
3064
3065 while True:
3066 if boundary is None:
3067 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3068
3069 try:
10c87c15 3070 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
3071 break
3072 except ValueError:
3073 if has_specified_boundary:
3074 raise
3075 boundary = None
3076
3077 return out, content_type
3078
3079
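# Expected output of multipart_encode for a fixed boundary (hypothetical
# `_demo_multipart_encode` helper, not part of the upstream module).
def _demo_multipart_encode():
    out, content_type = multipart_encode({'field': 'value'}, boundary='AAAAAA')
    assert content_type == 'multipart/form-data; boundary=AAAAAA'
    assert out == (b'--AAAAAA\r\n'
                   b'Content-Disposition: form-data; name="field"\r\n\r\n'
                   b'value\r\n'
                   b'--AAAAAA--\r\n')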
b079c26f
SS
3080def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
3081 if blocked_types is NO_DEFAULT:
3082 blocked_types = (str, bytes, collections.abc.Mapping)
3083 return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
3084
3085
3086def variadic(x, allowed_types=NO_DEFAULT):
4823ec9f 3087 if not isinstance(allowed_types, (tuple, type)):
3088 deprecation_warning('allowed_types should be a tuple or a type')
3089 allowed_types = tuple(allowed_types)
6f2287cb 3090 return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
304ad45a 3091
3092
c4f60dd7 3093def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3094 for f in funcs:
a32a9a7e 3095 try:
c4f60dd7 3096 val = f(*args, **kwargs)
ab029d7e 3097 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
a32a9a7e
S
3098 pass
3099 else:
c4f60dd7 3100 if expected_type is None or isinstance(val, expected_type):
3101 return val
3102
3103
3104def try_get(src, getter, expected_type=None):
3105 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
329ca3be
S
3106
3107
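# Quick sketch of the tolerant getters above (hypothetical `_demo_try_get` helper, not
# part of the upstream module): exceptions raised inside the getters are swallowed.
def _demo_try_get():
    info = {'formats': [{'height': 1080}]}
    assert try_get(info, lambda x: x['formats'][0]['height'], int) == 1080
    assert try_get(info, lambda x: x['missing']['key']) is None
    assert try_call(lambda: 1 // 0, lambda: 42) == 42  # first callable fails, second wins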
90137ca4 3108def filter_dict(dct, cndn=lambda _, v: v is not None):
3109 return {k: v for k, v in dct.items() if cndn(k, v)}
3110
3111
6cc62232
S
3112def merge_dicts(*dicts):
3113 merged = {}
3114 for a_dict in dicts:
3115 for k, v in a_dict.items():
90137ca4 3116 if (v is not None and k not in merged
3117 or isinstance(v, str) and merged[k] == ''):
6cc62232
S
3118 merged[k] = v
3119 return merged
3120
3121
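# Editor's note: merge_dicts gives precedence to earlier dicts, but None and
# empty-string values do not shadow later, more useful values. A small sketch
# with made-up keys:
#
#   >>> merge_dicts({'a': None, 'b': 1, 't': ''}, {'a': 2, 'b': 3, 't': 'x'})
#   {'b': 1, 't': 'x', 'a': 2}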
8e60dc75 3122def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
14f25df2 3123 return string if isinstance(string, str) else str(string, encoding, errors)
8e60dc75 3124
16392824 3125
a1a530b0
PH
3126US_RATINGS = {
3127 'G': 0,
3128 'PG': 10,
3129 'PG-13': 13,
3130 'R': 16,
3131 'NC': 18,
3132}
fac55558
PH
3133
3134
a8795327 3135TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
3136 'TV-Y': 0,
3137 'TV-Y7': 7,
3138 'TV-G': 0,
3139 'TV-PG': 0,
3140 'TV-14': 14,
3141 'TV-MA': 17,
a8795327
S
3142}
3143
3144
146c80e2 3145def parse_age_limit(s):
19a03940 3146 # isinstance(False, int) is True. So type() must be used instead
c487cf00 3147 if type(s) is int: # noqa: E721
a8795327 3148 return s if 0 <= s <= 21 else None
19a03940 3149 elif not isinstance(s, str):
d838b1bd 3150 return None
146c80e2 3151 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
3152 if m:
3153 return int(m.group('age'))
5c5fae6d 3154 s = s.upper()
a8795327
S
3155 if s in US_RATINGS:
3156 return US_RATINGS[s]
5a16c9d9 3157 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 3158 if m:
5a16c9d9 3159 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 3160 return None
146c80e2
S
3161
3162
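# Editor's sketch: parse_age_limit normalises several rating conventions to a plain
# integer age (the inputs below are just examples):
#
#   >>> parse_age_limit(18), parse_age_limit('18+'), parse_age_limit('PG-13'), parse_age_limit('TV-MA')
#   (18, 18, 13, 17)
#   >>> parse_age_limit(False) is None   # booleans are rejected on purpose
#   True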
fac55558 3163def strip_jsonp(code):
609a61e3 3164 return re.sub(
5552c9eb 3165 r'''(?sx)^
e9c671d5 3166 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
3167 (?:\s*&&\s*(?P=func_name))?
3168 \s*\(\s*(?P<callback_data>.*)\);?
3169 \s*?(?://[^\n]*)*$''',
3170 r'\g<callback_data>', code)
478c2c61
PH
3171
3172
8f53dc44 3173def js_to_json(code, vars={}, *, strict=False):
5c610515 3174 # vars is a dict of var, val pairs to substitute
0898c5c8 3175 STRING_QUOTES = '\'"`'
a71b812f 3176 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
c843e685 3177 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
86e5f3ed 3178 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
4195096e 3179 INTEGER_TABLE = (
86e5f3ed 3180 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3181 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
4195096e
S
3182 )
3183
a71b812f
SS
3184 def process_escape(match):
3185 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3186 escape = match.group(1) or match.group(2)
3187
3188 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3189 else R'\u00' if escape == 'x'
3190 else '' if escape == '\n'
3191 else escape)
3192
0898c5c8
SS
3193 def template_substitute(match):
3194 evaluated = js_to_json(match.group(1), vars, strict=strict)
3195 if evaluated[0] == '"':
3196 return json.loads(evaluated)
3197 return evaluated
3198
e05f6939 3199 def fix_kv(m):
e7b6d122
PH
3200 v = m.group(0)
3201 if v in ('true', 'false', 'null'):
3202 return v
421ddcb8
C
3203 elif v in ('undefined', 'void 0'):
3204 return 'null'
8bdd16b4 3205 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
a71b812f
SS
3206 return ''
3207
3208 if v[0] in STRING_QUOTES:
0898c5c8
SS
3209 v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
3210 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
a71b812f
SS
3211 return f'"{escaped}"'
3212
3213 for regex, base in INTEGER_TABLE:
3214 im = re.match(regex, v)
3215 if im:
3216 i = int(im.group(1), base)
3217 return f'"{i}":' if v.endswith(':') else str(i)
3218
3219 if v in vars:
d5f043d1
C
3220 try:
3221 if not strict:
3222 json.loads(vars[v])
08e29b9f 3223 except json.JSONDecodeError:
d5f043d1
C
3224 return json.dumps(vars[v])
3225 else:
3226 return vars[v]
89ac4a19 3227
a71b812f
SS
3228 if not strict:
3229 return f'"{v}"'
5c610515 3230
a71b812f 3231 raise ValueError(f'Unknown value: {v}')
e05f6939 3232
8072ef2b 3233 def create_map(mobj):
3234 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3235
8072ef2b 3236 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
8f53dc44 3237 if not strict:
3238 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
f55523cf 3239 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
389896df 3240 code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
3241 code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
febff4c1 3242
a71b812f
SS
3243 return re.sub(rf'''(?sx)
3244 {STRING_RE}|
3245 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
421ddcb8 3246 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
a71b812f
SS
3247 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3248 [0-9]+(?={SKIP_RE}:)|
8bdd16b4 3249 !+
a71b812f 3250 ''', fix_kv, code)
e05f6939
PH
3251
3252
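# Editor's sketch of what js_to_json accepts -- unquoted keys, single quotes,
# JS-only literals, hex numbers and trailing commas (the input is made up):
#
#   >>> js_to_json("{key: 'value', flag: undefined, n: 0x1f,}")
#   '{"key": "value", "flag": null, "n": 31}'
#
# The result is regular JSON and can be fed straight into json.loads().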
478c2c61
PH
3253def qualities(quality_ids):
3254 """ Get a numeric quality value out of a list of possible values """
3255 def q(qid):
3256 try:
3257 return quality_ids.index(qid)
3258 except ValueError:
3259 return -1
3260 return q
3261
acd69589 3262
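# Editor's sketch: qualities() builds a comparator from an ordered list of labels,
# which can then be used as a sort key (the labels below are made up):
#
#   >>> q = qualities(['240p', '480p', '720p'])
#   >>> q('720p'), q('240p'), q('4k')
#   (2, 0, -1)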
119e40ef 3263POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
1e43a6f7 3264
3265
de6000d9 3266DEFAULT_OUTTMPL = {
3267 'default': '%(title)s [%(id)s].%(ext)s',
72755351 3268 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 3269}
3270OUTTMPL_TYPES = {
72755351 3271 'chapter': None,
de6000d9 3272 'subtitle': None,
3273 'thumbnail': None,
3274 'description': 'description',
3275 'annotation': 'annotations.xml',
3276 'infojson': 'info.json',
08438d2c 3277 'link': None,
3b603dbd 3278 'pl_video': None,
5112f26a 3279 'pl_thumbnail': None,
de6000d9 3280 'pl_description': 'description',
3281 'pl_infojson': 'info.json',
3282}
0a871f68 3283
143db31d 3284# As of [1], the format syntax is:
3285# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3286# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 3287STR_FORMAT_RE_TMPL = r'''(?x)
3288 (?<!%)(?P<prefix>(?:%%)*)
143db31d 3289 %
524e2e4f 3290 (?P<has_key>\((?P<key>{0})\))?
752cda38 3291 (?P<format>
524e2e4f 3292 (?P<conversion>[#0\-+ ]+)?
3293 (?P<min_width>\d+)?
3294 (?P<precision>\.\d+)?
3295 (?P<len_mod>[hlL])? # unused in python
901130bb 3296 {1} # conversion type
752cda38 3297 )
143db31d 3298'''
3299
7d1eb38a 3300
901130bb 3301STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3302
7d1eb38a 3303
a020a0dc
PH
3304def limit_length(s, length):
3305 """ Add ellipses to overly long strings """
3306 if s is None:
3307 return None
3308 ELLIPSES = '...'
3309 if len(s) > length:
3310 return s[:length - len(ELLIPSES)] + ELLIPSES
3311 return s
48844745
PH
3312
3313
3314def version_tuple(v):
5f9b8394 3315 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
3316
3317
3318def is_outdated_version(version, limit, assume_new=True):
3319 if not version:
3320 return not assume_new
3321 try:
3322 return version_tuple(version) < version_tuple(limit)
3323 except ValueError:
3324 return not assume_new
732ea2f0
PH
3325
3326
3327def ytdl_is_updateable():
7a5c1cfe 3328 """ Returns whether yt-dlp can be updated with -U """
735d865e 3329
69bec673 3330 from ..update import is_non_updateable
732ea2f0 3331
5d535b4a 3332 return not is_non_updateable()
7d4111ed
PH
3333
3334
3335def args_to_str(args):
3336 # Get a short string representation for a subprocess command
702ccf2d 3337 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
3338
3339
a44ca5a4 3340def error_to_str(err):
3341 return f'{type(err).__name__}: {err}'
3342
3343
2647c933 3344def mimetype2ext(mt, default=NO_DEFAULT):
3345 if not isinstance(mt, str):
3346 if default is not NO_DEFAULT:
3347 return default
eb9ee194
S
3348 return None
3349
2647c933 3350 MAP = {
3351 # video
f6861ec9 3352 '3gpp': '3gp',
2647c933 3353 'mp2t': 'ts',
3354 'mp4': 'mp4',
3355 'mpeg': 'mpeg',
3356 'mpegurl': 'm3u8',
3357 'quicktime': 'mov',
3358 'webm': 'webm',
3359 'vp9': 'vp9',
f6861ec9 3360 'x-flv': 'flv',
2647c933 3361 'x-m4v': 'm4v',
3362 'x-matroska': 'mkv',
3363 'x-mng': 'mng',
a0d8d704 3364 'x-mp4-fragmented': 'mp4',
2647c933 3365 'x-ms-asf': 'asf',
a0d8d704 3366 'x-ms-wmv': 'wmv',
2647c933 3367 'x-msvideo': 'avi',
3368
3369 # application (streaming playlists)
b4173f15 3370 'dash+xml': 'mpd',
b4173f15 3371 'f4m+xml': 'f4m',
f164b971 3372 'hds+xml': 'f4m',
2647c933 3373 'vnd.apple.mpegurl': 'm3u8',
e910fe2f 3374 'vnd.ms-sstr+xml': 'ism',
2647c933 3375 'x-mpegurl': 'm3u8',
3376
3377 # audio
3378 'audio/mp4': 'm4a',
3379 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
3380 # Using .mp3 as it's the most popular one
3381 'audio/mpeg': 'mp3',
d80ca5de 3382 'audio/webm': 'webm',
2647c933 3383 'audio/x-matroska': 'mka',
3384 'audio/x-mpegurl': 'm3u',
3385 'midi': 'mid',
3386 'ogg': 'ogg',
3387 'wav': 'wav',
3388 'wave': 'wav',
3389 'x-aac': 'aac',
3390 'x-flac': 'flac',
3391 'x-m4a': 'm4a',
3392 'x-realaudio': 'ra',
39e7107d 3393 'x-wav': 'wav',
9359f3d4 3394
2647c933 3395 # image
3396 'avif': 'avif',
3397 'bmp': 'bmp',
3398 'gif': 'gif',
3399 'jpeg': 'jpg',
3400 'png': 'png',
3401 'svg+xml': 'svg',
3402 'tiff': 'tif',
3403 'vnd.wap.wbmp': 'wbmp',
3404 'webp': 'webp',
3405 'x-icon': 'ico',
3406 'x-jng': 'jng',
3407 'x-ms-bmp': 'bmp',
3408
3409 # caption
3410 'filmstrip+json': 'fs',
3411 'smptett+xml': 'tt',
3412 'ttaf+xml': 'dfxp',
3413 'ttml+xml': 'ttml',
3414 'x-ms-sami': 'sami',
9359f3d4 3415
2647c933 3416 # misc
3417 'gzip': 'gz',
9359f3d4
F
3418 'json': 'json',
3419 'xml': 'xml',
3420 'zip': 'zip',
9359f3d4
F
3421 }
3422
2647c933 3423 mimetype = mt.partition(';')[0].strip().lower()
3424 _, _, subtype = mimetype.rpartition('/')
9359f3d4 3425
69bec673 3426 ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
2647c933 3427 if ext:
3428 return ext
3429 elif default is not NO_DEFAULT:
3430 return default
9359f3d4 3431 return subtype.replace('+', '.')
c460bdd5
PH
3432
3433
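# Editor's sketch of mimetype2ext (illustrative inputs only; parameters after ';'
# are ignored and the lookup is case-insensitive):
#
#   >>> mimetype2ext('video/mp4; codecs="avc1.42E01E"')
#   'mp4'
#   >>> mimetype2ext('application/x-mpegURL')
#   'm3u8'
#   >>> mimetype2ext(None, default='unknown_video')
#   'unknown_video'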
2814f12b
THD
3434def ext2mimetype(ext_or_url):
3435 if not ext_or_url:
3436 return None
3437 if '.' not in ext_or_url:
3438 ext_or_url = f'file.{ext_or_url}'
3439 return mimetypes.guess_type(ext_or_url)[0]
3440
3441
4f3c5e06 3442def parse_codecs(codecs_str):
3443 # http://tools.ietf.org/html/rfc6381
3444 if not codecs_str:
3445 return {}
a0566bbf 3446 split_codecs = list(filter(None, map(
dbf5416a 3447 str.strip, codecs_str.strip().strip(',').split(','))))
3fe75fdc 3448 vcodec, acodec, scodec, hdr = None, None, None, None
a0566bbf 3449 for full_codec in split_codecs:
d816f61f 3450 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3451 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3452 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3453 if vcodec:
3454 continue
3455 vcodec = full_codec
3456 if parts[0] in ('dvh1', 'dvhe'):
3457 hdr = 'DV'
69bec673 3458 elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
d816f61f 3459 hdr = 'HDR10'
3460 elif parts[:2] == ['vp9', '2']:
3461 hdr = 'HDR10'
71082216 3462 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
d816f61f 3463 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3464 acodec = acodec or full_codec
3465 elif parts[0] in ('stpp', 'wvtt'):
3466 scodec = scodec or full_codec
4f3c5e06 3467 else:
19a03940 3468 write_string(f'WARNING: Unknown codec {full_codec}\n')
3fe75fdc 3469 if vcodec or acodec or scodec:
4f3c5e06 3470 return {
3471 'vcodec': vcodec or 'none',
3472 'acodec': acodec or 'none',
176f1866 3473 'dynamic_range': hdr,
3fe75fdc 3474 **({'scodec': scodec} if scodec is not None else {}),
4f3c5e06 3475 }
b69fd25c 3476 elif len(split_codecs) == 2:
3477 return {
3478 'vcodec': split_codecs[0],
3479 'acodec': split_codecs[1],
3480 }
4f3c5e06 3481 return {}
3482
3483
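# Editor's sketch of parse_codecs (the codec strings resemble typical DASH/HLS
# CODECS attributes but are made up):
#
#   >>> parse_codecs('avc1.64001F, mp4a.40.2')
#   {'vcodec': 'avc1.64001F', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
#   >>> parse_codecs('dvh1.05.01')['dynamic_range']
#   'DV'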
fc61aff4
LL
3484def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3485 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3486
3487 allow_mkv = not preferences or 'mkv' in preferences
3488
3489 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3490 return 'mkv' # TODO: does any other format allow this?
3491
3492 # TODO: Not all codecs supported by parse_codecs are handled here
3493 COMPATIBLE_CODECS = {
3494 'mp4': {
71082216 3495 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
81b6102d 3496 'h264', 'aacl', 'ec-3', # Set in ISM
fc61aff4
LL
3497 },
3498 'webm': {
3499 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3500 'vp9x', 'vp8x', # in the webm spec
3501 },
3502 }
3503
69bec673 3504 sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
8f84770a 3505 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
fc61aff4
LL
3506
3507 for ext in preferences or COMPATIBLE_CODECS.keys():
3508 codec_set = COMPATIBLE_CODECS.get(ext, set())
3509 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3510 return ext
3511
3512 COMPATIBLE_EXTS = (
3513 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
fbb73833 3514 {'webm', 'weba'},
fc61aff4
LL
3515 )
3516 for ext in preferences or vexts:
3517 current_exts = {ext, *vexts, *aexts}
3518 if ext == 'mkv' or current_exts == {ext} or any(
3519 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3520 return ext
3521 return 'mkv' if allow_mkv else preferences[-1]
3522
3523
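# Editor's sketch of how get_compatible_ext picks a merge container (the
# codec/extension combinations are hypothetical):
#
#   >>> get_compatible_ext(vcodecs=['avc1'], acodecs=['mp4a'], vexts=['mp4'], aexts=['m4a'])
#   'mp4'
#   >>> get_compatible_ext(vcodecs=['vp9'], acodecs=['opus'], vexts=['webm'], aexts=['webm'])
#   'webm'
#
# When no preference-compatible container is found, it falls back to mkv (if allowed).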
2647c933 3524def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
79298173 3525 getheader = url_handle.headers.get
2ccd1b10 3526
b55ee18f
PH
3527 cd = getheader('Content-Disposition')
3528 if cd:
3529 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3530 if m:
3531 e = determine_ext(m.group('filename'), default_ext=None)
3532 if e:
3533 return e
3534
2647c933 3535 meta_ext = getheader('x-amz-meta-name')
3536 if meta_ext:
3537 e = meta_ext.rpartition('.')[2]
3538 if e:
3539 return e
3540
3541 return mimetype2ext(getheader('Content-Type'), default=default)
05900629
PH
3542
3543
1e399778
YCH
3544def encode_data_uri(data, mime_type):
3545 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3546
3547
05900629 3548def age_restricted(content_limit, age_limit):
6ec6cb4e 3549 """ Returns True iff the content should be blocked """
05900629
PH
3550
3551 if age_limit is None: # No limit set
3552 return False
3553 if content_limit is None:
3554 return False # Content available for everyone
3555 return age_limit < content_limit
61ca9a80
PH
3556
3557
88f60feb 3558# List of known byte-order marks (BOMs)
a904a7f8
L
3559BOMS = [
3560 (b'\xef\xbb\xbf', 'utf-8'),
3561 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3562 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3563 (b'\xff\xfe', 'utf-16-le'),
3564 (b'\xfe\xff', 'utf-16-be'),
3565]
a904a7f8
L
3566
3567
61ca9a80
PH
3568def is_html(first_bytes):
3569 """ Detect whether a file contains HTML by examining its first bytes. """
3570
80e8493e 3571 encoding = 'utf-8'
61ca9a80 3572 for bom, enc in BOMS:
80e8493e 3573 while first_bytes.startswith(bom):
3574 encoding, first_bytes = enc, first_bytes[len(bom):]
61ca9a80 3575
80e8493e 3576 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
a055469f
PH
3577
3578
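# Editor's sketch: is_html() is BOM-aware; two illustrative probes with made-up data:
#
#   >>> bool(is_html(b'\xef\xbb\xbf  <!DOCTYPE html><html></html>'))
#   True
#   >>> bool(is_html(b'\x00\x00\x01\xba binary junk'))
#   False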
3579def determine_protocol(info_dict):
3580 protocol = info_dict.get('protocol')
3581 if protocol is not None:
3582 return protocol
3583
7de837a5 3584 url = sanitize_url(info_dict['url'])
a055469f
PH
3585 if url.startswith('rtmp'):
3586 return 'rtmp'
3587 elif url.startswith('mms'):
3588 return 'mms'
3589 elif url.startswith('rtsp'):
3590 return 'rtsp'
3591
3592 ext = determine_ext(url)
3593 if ext == 'm3u8':
deae7c17 3594 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
a055469f
PH
3595 elif ext == 'f4m':
3596 return 'f4m'
3597
14f25df2 3598 return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3599
3600
c5e3f849 3601def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3602 """ Render a list of rows, each as a list of values.
3603 Text after a \t will be right aligned """
ec11a9f4 3604 def width(string):
c5e3f849 3605 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3606
3607 def get_max_lens(table):
ec11a9f4 3608 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3609
3610 def filter_using_list(row, filterArray):
d16df59d 3611 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
76d321f6 3612
d16df59d 3613 max_lens = get_max_lens(data) if hide_empty else []
3614 header_row = filter_using_list(header_row, max_lens)
3615 data = [filter_using_list(row, max_lens) for row in data]
76d321f6 3616
cfb56d1a 3617 table = [header_row] + data
76d321f6 3618 max_lens = get_max_lens(table)
c5e3f849 3619 extra_gap += 1
76d321f6 3620 if delim:
c5e3f849 3621 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
1ed7953a 3622 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
ec11a9f4 3623 for row in table:
3624 for pos, text in enumerate(map(str, row)):
c5e3f849 3625 if '\t' in text:
3626 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3627 else:
3628 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3629 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3630 return ret
347de493
PH
3631
3632
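# Editor's sketch of render_table with made-up rows. Columns are padded to the
# widest cell; text after a '\t' inside a cell is right-aligned instead:
#
#   >>> print(render_table(['ID', 'NAME'], [['1', 'foo'], ['22', 'barbaz']]))
#   ID NAME
#   1  foo
#   22 barbaz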
8f18aca8 3633def _match_one(filter_part, dct, incomplete):
77b87f05 3634 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3635 STRING_OPERATORS = {
3636 '*=': operator.contains,
3637 '^=': lambda attr, value: attr.startswith(value),
3638 '$=': lambda attr, value: attr.endswith(value),
3639 '~=': lambda attr, value: re.search(value, attr),
3640 }
347de493 3641 COMPARISON_OPERATORS = {
a047eeb6 3642 **STRING_OPERATORS,
3643 '<=': operator.le, # "<=" must be defined above "<"
347de493 3644 '<': operator.lt,
347de493 3645 '>=': operator.ge,
a047eeb6 3646 '>': operator.gt,
347de493 3647 '=': operator.eq,
347de493 3648 }
a047eeb6 3649
6db9c4d5 3650 if isinstance(incomplete, bool):
3651 is_incomplete = lambda _: incomplete
3652 else:
3653 is_incomplete = lambda k: k in incomplete
3654
64fa820c 3655 operator_rex = re.compile(r'''(?x)
347de493 3656 (?P<key>[a-z_]+)
77b87f05 3657 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3658 (?:
a047eeb6 3659 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3660 (?P<strval>.+?)
347de493 3661 )
347de493 3662 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3663 m = operator_rex.fullmatch(filter_part.strip())
347de493 3664 if m:
18f96d12 3665 m = m.groupdict()
3666 unnegated_op = COMPARISON_OPERATORS[m['op']]
3667 if m['negation']:
77b87f05
MT
3668 op = lambda attr, value: not unnegated_op(attr, value)
3669 else:
3670 op = unnegated_op
18f96d12 3671 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3672 if m['quote']:
3673 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3674 actual_value = dct.get(m['key'])
3675 numeric_comparison = None
f9934b96 3676 if isinstance(actual_value, (int, float)):
e5a088dc
S
3677 # If the original field is a string and the matching comparison value is
3678 # a number, we should respect the origin of the original field
3679 # and process the comparison value as a string (see
18f96d12 3680 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3681 try:
18f96d12 3682 numeric_comparison = int(comparison_value)
347de493 3683 except ValueError:
18f96d12 3684 numeric_comparison = parse_filesize(comparison_value)
3685 if numeric_comparison is None:
3686 numeric_comparison = parse_filesize(f'{comparison_value}B')
3687 if numeric_comparison is None:
3688 numeric_comparison = parse_duration(comparison_value)
3689 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3690 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3691 if actual_value is None:
6db9c4d5 3692 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3693 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3694
3695 UNARY_OPERATORS = {
1cc47c66
S
3696 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3697 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3698 }
64fa820c 3699 operator_rex = re.compile(r'''(?x)
347de493 3700 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3701 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3702 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3703 if m:
3704 op = UNARY_OPERATORS[m.group('op')]
3705 actual_value = dct.get(m.group('key'))
6db9c4d5 3706 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3707 return True
347de493
PH
3708 return op(actual_value)
3709
3710 raise ValueError('Invalid filter part %r' % filter_part)
3711
3712
8f18aca8 3713def match_str(filter_str, dct, incomplete=False):
6db9c4d5 3714 """ Filter a dictionary with a simple string syntax.
3715 @returns Whether the filter passes
3716 @param incomplete Set of keys that are expected to be missing from dct.
3717 Can be True/False to indicate all/none of the keys may be missing.
3718 All conditions on incomplete keys pass if the key is missing
8f18aca8 3719 """
347de493 3720 return all(
8f18aca8 3721 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3722 for filter_part in re.split(r'(?<!\\)&', filter_str))
347de493
PH
3723
3724
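# Editor's sketch of the filter mini-language accepted by match_str (the field
# names are typical info-dict keys; the values are made up):
#
#   >>> info = {'duration': 90, 'like_count': None, 'title': 'An Example'}
#   >>> match_str('duration > 60 & title ~= (?i)example', info)
#   True
#   >>> match_str('like_count > 100', info)     # missing/None field fails ...
#   False
#   >>> match_str('like_count >? 100', info)    # ... unless the operator carries '?'
#   True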
fe2ce85a 3725def match_filter_func(filters, breaking_filters=None):
3726 if not filters and not breaking_filters:
d1b5f70b 3727 return None
fe2ce85a 3728 breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
3729 filters = set(variadic(filters or []))
d1b5f70b 3730
492272fe 3731 interactive = '-' in filters
3732 if interactive:
3733 filters.remove('-')
3734
3735 def _match_func(info_dict, incomplete=False):
fe2ce85a 3736 ret = breaking_filters(info_dict, incomplete)
3737 if ret is not None:
3738 raise RejectedVideoReached(ret)
3739
492272fe 3740 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3741 return NO_DEFAULT if interactive and not incomplete else None
347de493 3742 else:
3bec830a 3743 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
b1a7cd05 3744 filter_str = ') | ('.join(map(str.strip, filters))
3745 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
347de493 3746 return _match_func
91410c9b
PH
3747
3748
f2df4071 3749class download_range_func:
3750 def __init__(self, chapters, ranges):
3751 self.chapters, self.ranges = chapters, ranges
3752
3753 def __call__(self, info_dict, ydl):
0500ee3d 3754 if not self.ranges and not self.chapters:
3755 yield {}
3756
5ec1b6b7 3757 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
56ba69e4 3758 else 'Cannot match chapters since chapter information is unavailable')
f2df4071 3759 for regex in self.chapters or []:
5ec1b6b7 3760 for i, chapter in enumerate(info_dict.get('chapters') or []):
3761 if re.search(regex, chapter['title']):
3762 warning = None
3763 yield {**chapter, 'index': i}
f2df4071 3764 if self.chapters and warning:
5ec1b6b7 3765 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3766
f2df4071 3767 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
5ec1b6b7 3768
f2df4071 3769 def __eq__(self, other):
3770 return (isinstance(other, download_range_func)
3771 and self.chapters == other.chapters and self.ranges == other.ranges)
5ec1b6b7 3772
71df9b7f 3773 def __repr__(self):
a5387729 3774 return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
71df9b7f 3775
5ec1b6b7 3776
bf6427d2
YCH
3777def parse_dfxp_time_expr(time_expr):
3778 if not time_expr:
d631d5f9 3779 return
bf6427d2 3780
1d485a1a 3781 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
bf6427d2
YCH
3782 if mobj:
3783 return float(mobj.group('time_offset'))
3784
db2fe38b 3785 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3786 if mobj:
db2fe38b 3787 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
3788
3789
c1c924ab 3790def srt_subtitles_timecode(seconds):
aa7785f8 3791 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3792
3793
3794def ass_subtitles_timecode(seconds):
3795 time = timetuple_from_msec(seconds * 1000)
3796 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
bf6427d2
YCH
3797
3798
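# Editor's sketch: the two subtitle timecode helpers above only differ in their
# output format (the input is an arbitrary example):
#
#   >>> srt_subtitles_timecode(3661.5)
#   '01:01:01,500'
#   >>> ass_subtitles_timecode(3661.5)
#   '1:01:01.50'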
3799def dfxp2srt(dfxp_data):
3869028f
YCH
3800 '''
3801 @param dfxp_data A bytes-like object containing DFXP data
3802 @returns A unicode object containing converted SRT data
3803 '''
5b995f71 3804 LEGACY_NAMESPACES = (
3869028f
YCH
3805 (b'http://www.w3.org/ns/ttml', [
3806 b'http://www.w3.org/2004/11/ttaf1',
3807 b'http://www.w3.org/2006/04/ttaf1',
3808 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3809 ]),
3869028f
YCH
3810 (b'http://www.w3.org/ns/ttml#styling', [
3811 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
3812 ]),
3813 )
3814
3815 SUPPORTED_STYLING = [
3816 'color',
3817 'fontFamily',
3818 'fontSize',
3819 'fontStyle',
3820 'fontWeight',
3821 'textDecoration'
3822 ]
3823
4e335771 3824 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 3825 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 3826 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 3827 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 3828 })
bf6427d2 3829
5b995f71
RA
3830 styles = {}
3831 default_style = {}
3832
86e5f3ed 3833 class TTMLPElementParser:
5b995f71
RA
3834 _out = ''
3835 _unclosed_elements = []
3836 _applied_styles = []
bf6427d2 3837
2b14cb56 3838 def start(self, tag, attrib):
5b995f71
RA
3839 if tag in (_x('ttml:br'), 'br'):
3840 self._out += '\n'
3841 else:
3842 unclosed_elements = []
3843 style = {}
3844 element_style_id = attrib.get('style')
3845 if default_style:
3846 style.update(default_style)
3847 if element_style_id:
3848 style.update(styles.get(element_style_id, {}))
3849 for prop in SUPPORTED_STYLING:
3850 prop_val = attrib.get(_x('tts:' + prop))
3851 if prop_val:
3852 style[prop] = prop_val
3853 if style:
3854 font = ''
3855 for k, v in sorted(style.items()):
3856 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3857 continue
3858 if k == 'color':
3859 font += ' color="%s"' % v
3860 elif k == 'fontSize':
3861 font += ' size="%s"' % v
3862 elif k == 'fontFamily':
3863 font += ' face="%s"' % v
3864 elif k == 'fontWeight' and v == 'bold':
3865 self._out += '<b>'
3866 unclosed_elements.append('b')
3867 elif k == 'fontStyle' and v == 'italic':
3868 self._out += '<i>'
3869 unclosed_elements.append('i')
3870 elif k == 'textDecoration' and v == 'underline':
3871 self._out += '<u>'
3872 unclosed_elements.append('u')
3873 if font:
3874 self._out += '<font' + font + '>'
3875 unclosed_elements.append('font')
3876 applied_style = {}
3877 if self._applied_styles:
3878 applied_style.update(self._applied_styles[-1])
3879 applied_style.update(style)
3880 self._applied_styles.append(applied_style)
3881 self._unclosed_elements.append(unclosed_elements)
bf6427d2 3882
2b14cb56 3883 def end(self, tag):
5b995f71
RA
3884 if tag not in (_x('ttml:br'), 'br'):
3885 unclosed_elements = self._unclosed_elements.pop()
3886 for element in reversed(unclosed_elements):
3887 self._out += '</%s>' % element
3888 if unclosed_elements and self._applied_styles:
3889 self._applied_styles.pop()
bf6427d2 3890
2b14cb56 3891 def data(self, data):
5b995f71 3892 self._out += data
2b14cb56 3893
3894 def close(self):
5b995f71 3895 return self._out.strip()
2b14cb56 3896
6a765f13 3897 # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
3898 # This will not trigger false positives since only UTF-8 text is being replaced
3899 dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
3900
2b14cb56 3901 def parse_node(node):
3902 target = TTMLPElementParser()
3903 parser = xml.etree.ElementTree.XMLParser(target=target)
3904 parser.feed(xml.etree.ElementTree.tostring(node))
3905 return parser.close()
bf6427d2 3906
5b995f71
RA
3907 for k, v in LEGACY_NAMESPACES:
3908 for ns in v:
3909 dfxp_data = dfxp_data.replace(ns, k)
3910
3869028f 3911 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 3912 out = []
5b995f71 3913 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
3914
3915 if not paras:
3916 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 3917
5b995f71
RA
3918 repeat = False
3919 while True:
3920 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
3921 style_id = style.get('id') or style.get(_x('xml:id'))
3922 if not style_id:
3923 continue
5b995f71
RA
3924 parent_style_id = style.get('style')
3925 if parent_style_id:
3926 if parent_style_id not in styles:
3927 repeat = True
3928 continue
3929 styles[style_id] = styles[parent_style_id].copy()
3930 for prop in SUPPORTED_STYLING:
3931 prop_val = style.get(_x('tts:' + prop))
3932 if prop_val:
3933 styles.setdefault(style_id, {})[prop] = prop_val
3934 if repeat:
3935 repeat = False
3936 else:
3937 break
3938
3939 for p in ('body', 'div'):
3940 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3941 if ele is None:
3942 continue
3943 style = styles.get(ele.get('style'))
3944 if not style:
3945 continue
3946 default_style.update(style)
3947
bf6427d2 3948 for para, index in zip(paras, itertools.count(1)):
d631d5f9 3949 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 3950 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
3951 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3952 if begin_time is None:
3953 continue
7dff0363 3954 if not end_time:
d631d5f9
YCH
3955 if not dur:
3956 continue
3957 end_time = begin_time + dur
bf6427d2
YCH
3958 out.append('%d\n%s --> %s\n%s\n\n' % (
3959 index,
c1c924ab
YCH
3960 srt_subtitles_timecode(begin_time),
3961 srt_subtitles_timecode(end_time),
bf6427d2
YCH
3962 parse_node(para)))
3963
3964 return ''.join(out)
3965
3966
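# Editor's rough sketch of dfxp2srt in use (hand-written minimal TTML, not taken
# from a real service). Each <p> element becomes one numbered SRT cue, so this
# should print something like the cue shown below:
#
#   >>> ttml = (b'<tt xmlns="http://www.w3.org/ns/ttml"><body><div>'
#   ...         b'<p begin="0.0" end="1.5">Hello</p></div></body></tt>')
#   >>> print(dfxp2srt(ttml))
#   1
#   00:00:00,000 --> 00:00:01,500
#   Hello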
c487cf00 3967def cli_option(params, command_option, param, separator=None):
66e289ba 3968 param = params.get(param)
c487cf00 3969 return ([] if param is None
3970 else [command_option, str(param)] if separator is None
3971 else [f'{command_option}{separator}{param}'])
66e289ba
S
3972
3973
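# Editor's sketch of cli_option (the option name and value are hypothetical):
#
#   >>> cli_option({'proxy': 'http://127.0.0.1:3128'}, '--proxy', 'proxy')
#   ['--proxy', 'http://127.0.0.1:3128']
#   >>> cli_option({'proxy': 'http://127.0.0.1:3128'}, '--proxy', 'proxy', separator='=')
#   ['--proxy=http://127.0.0.1:3128']
#   >>> cli_option({}, '--proxy', 'proxy')
#   []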
3974def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3975 param = params.get(param)
c487cf00 3976 assert param in (True, False, None)
3977 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
66e289ba
S
3978
3979
3980def cli_valueless_option(params, command_option, param, expected_value=True):
c487cf00 3981 return [command_option] if params.get(param) == expected_value else []
66e289ba
S
3982
3983
e92caff5 3984def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 3985 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 3986 if use_compat:
5b1ecbb3 3987 return argdict
3988 else:
3989 argdict = None
eab9b2bc 3990 if argdict is None:
5b1ecbb3 3991 return default
eab9b2bc 3992 assert isinstance(argdict, dict)
3993
e92caff5 3994 assert isinstance(keys, (list, tuple))
3995 for key_list in keys:
e92caff5 3996 arg_list = list(filter(
3997 lambda x: x is not None,
6606817a 3998 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 3999 if arg_list:
4000 return [arg for args in arg_list for arg in args]
4001 return default
66e289ba 4002
6251555f 4003
330690a2 4004def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4005 main_key, exe = main_key.lower(), exe.lower()
4006 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4007 keys = [f'{root_key}{k}' for k in (keys or [''])]
4008 if root_key in keys:
4009 if main_key != exe:
4010 keys.append((main_key, exe))
4011 keys.append('default')
4012 else:
4013 use_compat = False
4014 return cli_configuration_args(argdict, keys, default, use_compat)
4015
66e289ba 4016
86e5f3ed 4017class ISO639Utils:
39672624
YCH
4018 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4019 _lang_map = {
4020 'aa': 'aar',
4021 'ab': 'abk',
4022 'ae': 'ave',
4023 'af': 'afr',
4024 'ak': 'aka',
4025 'am': 'amh',
4026 'an': 'arg',
4027 'ar': 'ara',
4028 'as': 'asm',
4029 'av': 'ava',
4030 'ay': 'aym',
4031 'az': 'aze',
4032 'ba': 'bak',
4033 'be': 'bel',
4034 'bg': 'bul',
4035 'bh': 'bih',
4036 'bi': 'bis',
4037 'bm': 'bam',
4038 'bn': 'ben',
4039 'bo': 'bod',
4040 'br': 'bre',
4041 'bs': 'bos',
4042 'ca': 'cat',
4043 'ce': 'che',
4044 'ch': 'cha',
4045 'co': 'cos',
4046 'cr': 'cre',
4047 'cs': 'ces',
4048 'cu': 'chu',
4049 'cv': 'chv',
4050 'cy': 'cym',
4051 'da': 'dan',
4052 'de': 'deu',
4053 'dv': 'div',
4054 'dz': 'dzo',
4055 'ee': 'ewe',
4056 'el': 'ell',
4057 'en': 'eng',
4058 'eo': 'epo',
4059 'es': 'spa',
4060 'et': 'est',
4061 'eu': 'eus',
4062 'fa': 'fas',
4063 'ff': 'ful',
4064 'fi': 'fin',
4065 'fj': 'fij',
4066 'fo': 'fao',
4067 'fr': 'fra',
4068 'fy': 'fry',
4069 'ga': 'gle',
4070 'gd': 'gla',
4071 'gl': 'glg',
4072 'gn': 'grn',
4073 'gu': 'guj',
4074 'gv': 'glv',
4075 'ha': 'hau',
4076 'he': 'heb',
b7acc835 4077 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
4078 'hi': 'hin',
4079 'ho': 'hmo',
4080 'hr': 'hrv',
4081 'ht': 'hat',
4082 'hu': 'hun',
4083 'hy': 'hye',
4084 'hz': 'her',
4085 'ia': 'ina',
4086 'id': 'ind',
b7acc835 4087 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
4088 'ie': 'ile',
4089 'ig': 'ibo',
4090 'ii': 'iii',
4091 'ik': 'ipk',
4092 'io': 'ido',
4093 'is': 'isl',
4094 'it': 'ita',
4095 'iu': 'iku',
4096 'ja': 'jpn',
4097 'jv': 'jav',
4098 'ka': 'kat',
4099 'kg': 'kon',
4100 'ki': 'kik',
4101 'kj': 'kua',
4102 'kk': 'kaz',
4103 'kl': 'kal',
4104 'km': 'khm',
4105 'kn': 'kan',
4106 'ko': 'kor',
4107 'kr': 'kau',
4108 'ks': 'kas',
4109 'ku': 'kur',
4110 'kv': 'kom',
4111 'kw': 'cor',
4112 'ky': 'kir',
4113 'la': 'lat',
4114 'lb': 'ltz',
4115 'lg': 'lug',
4116 'li': 'lim',
4117 'ln': 'lin',
4118 'lo': 'lao',
4119 'lt': 'lit',
4120 'lu': 'lub',
4121 'lv': 'lav',
4122 'mg': 'mlg',
4123 'mh': 'mah',
4124 'mi': 'mri',
4125 'mk': 'mkd',
4126 'ml': 'mal',
4127 'mn': 'mon',
4128 'mr': 'mar',
4129 'ms': 'msa',
4130 'mt': 'mlt',
4131 'my': 'mya',
4132 'na': 'nau',
4133 'nb': 'nob',
4134 'nd': 'nde',
4135 'ne': 'nep',
4136 'ng': 'ndo',
4137 'nl': 'nld',
4138 'nn': 'nno',
4139 'no': 'nor',
4140 'nr': 'nbl',
4141 'nv': 'nav',
4142 'ny': 'nya',
4143 'oc': 'oci',
4144 'oj': 'oji',
4145 'om': 'orm',
4146 'or': 'ori',
4147 'os': 'oss',
4148 'pa': 'pan',
4149 'pi': 'pli',
4150 'pl': 'pol',
4151 'ps': 'pus',
4152 'pt': 'por',
4153 'qu': 'que',
4154 'rm': 'roh',
4155 'rn': 'run',
4156 'ro': 'ron',
4157 'ru': 'rus',
4158 'rw': 'kin',
4159 'sa': 'san',
4160 'sc': 'srd',
4161 'sd': 'snd',
4162 'se': 'sme',
4163 'sg': 'sag',
4164 'si': 'sin',
4165 'sk': 'slk',
4166 'sl': 'slv',
4167 'sm': 'smo',
4168 'sn': 'sna',
4169 'so': 'som',
4170 'sq': 'sqi',
4171 'sr': 'srp',
4172 'ss': 'ssw',
4173 'st': 'sot',
4174 'su': 'sun',
4175 'sv': 'swe',
4176 'sw': 'swa',
4177 'ta': 'tam',
4178 'te': 'tel',
4179 'tg': 'tgk',
4180 'th': 'tha',
4181 'ti': 'tir',
4182 'tk': 'tuk',
4183 'tl': 'tgl',
4184 'tn': 'tsn',
4185 'to': 'ton',
4186 'tr': 'tur',
4187 'ts': 'tso',
4188 'tt': 'tat',
4189 'tw': 'twi',
4190 'ty': 'tah',
4191 'ug': 'uig',
4192 'uk': 'ukr',
4193 'ur': 'urd',
4194 'uz': 'uzb',
4195 've': 'ven',
4196 'vi': 'vie',
4197 'vo': 'vol',
4198 'wa': 'wln',
4199 'wo': 'wol',
4200 'xh': 'xho',
4201 'yi': 'yid',
e9a50fba 4202 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
4203 'yo': 'yor',
4204 'za': 'zha',
4205 'zh': 'zho',
4206 'zu': 'zul',
4207 }
4208
4209 @classmethod
4210 def short2long(cls, code):
4211 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4212 return cls._lang_map.get(code[:2])
4213
4214 @classmethod
4215 def long2short(cls, code):
4216 """Convert language code from ISO 639-2/T to ISO 639-1"""
4217 for short_name, long_name in cls._lang_map.items():
4218 if long_name == code:
4219 return short_name
4220
4221
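# Editor's sketch of the two ISO639Utils conversions (inputs are arbitrary examples):
#
#   >>> ISO639Utils.short2long('en'), ISO639Utils.short2long('en-US')
#   ('eng', 'eng')
#   >>> ISO639Utils.long2short('deu')
#   'de'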
86e5f3ed 4222class ISO3166Utils:
4eb10f66
YCH
4223 # From http://data.okfn.org/data/core/country-list
4224 _country_map = {
4225 'AF': 'Afghanistan',
4226 'AX': 'Åland Islands',
4227 'AL': 'Albania',
4228 'DZ': 'Algeria',
4229 'AS': 'American Samoa',
4230 'AD': 'Andorra',
4231 'AO': 'Angola',
4232 'AI': 'Anguilla',
4233 'AQ': 'Antarctica',
4234 'AG': 'Antigua and Barbuda',
4235 'AR': 'Argentina',
4236 'AM': 'Armenia',
4237 'AW': 'Aruba',
4238 'AU': 'Australia',
4239 'AT': 'Austria',
4240 'AZ': 'Azerbaijan',
4241 'BS': 'Bahamas',
4242 'BH': 'Bahrain',
4243 'BD': 'Bangladesh',
4244 'BB': 'Barbados',
4245 'BY': 'Belarus',
4246 'BE': 'Belgium',
4247 'BZ': 'Belize',
4248 'BJ': 'Benin',
4249 'BM': 'Bermuda',
4250 'BT': 'Bhutan',
4251 'BO': 'Bolivia, Plurinational State of',
4252 'BQ': 'Bonaire, Sint Eustatius and Saba',
4253 'BA': 'Bosnia and Herzegovina',
4254 'BW': 'Botswana',
4255 'BV': 'Bouvet Island',
4256 'BR': 'Brazil',
4257 'IO': 'British Indian Ocean Territory',
4258 'BN': 'Brunei Darussalam',
4259 'BG': 'Bulgaria',
4260 'BF': 'Burkina Faso',
4261 'BI': 'Burundi',
4262 'KH': 'Cambodia',
4263 'CM': 'Cameroon',
4264 'CA': 'Canada',
4265 'CV': 'Cape Verde',
4266 'KY': 'Cayman Islands',
4267 'CF': 'Central African Republic',
4268 'TD': 'Chad',
4269 'CL': 'Chile',
4270 'CN': 'China',
4271 'CX': 'Christmas Island',
4272 'CC': 'Cocos (Keeling) Islands',
4273 'CO': 'Colombia',
4274 'KM': 'Comoros',
4275 'CG': 'Congo',
4276 'CD': 'Congo, the Democratic Republic of the',
4277 'CK': 'Cook Islands',
4278 'CR': 'Costa Rica',
4279 'CI': 'Côte d\'Ivoire',
4280 'HR': 'Croatia',
4281 'CU': 'Cuba',
4282 'CW': 'Curaçao',
4283 'CY': 'Cyprus',
4284 'CZ': 'Czech Republic',
4285 'DK': 'Denmark',
4286 'DJ': 'Djibouti',
4287 'DM': 'Dominica',
4288 'DO': 'Dominican Republic',
4289 'EC': 'Ecuador',
4290 'EG': 'Egypt',
4291 'SV': 'El Salvador',
4292 'GQ': 'Equatorial Guinea',
4293 'ER': 'Eritrea',
4294 'EE': 'Estonia',
4295 'ET': 'Ethiopia',
4296 'FK': 'Falkland Islands (Malvinas)',
4297 'FO': 'Faroe Islands',
4298 'FJ': 'Fiji',
4299 'FI': 'Finland',
4300 'FR': 'France',
4301 'GF': 'French Guiana',
4302 'PF': 'French Polynesia',
4303 'TF': 'French Southern Territories',
4304 'GA': 'Gabon',
4305 'GM': 'Gambia',
4306 'GE': 'Georgia',
4307 'DE': 'Germany',
4308 'GH': 'Ghana',
4309 'GI': 'Gibraltar',
4310 'GR': 'Greece',
4311 'GL': 'Greenland',
4312 'GD': 'Grenada',
4313 'GP': 'Guadeloupe',
4314 'GU': 'Guam',
4315 'GT': 'Guatemala',
4316 'GG': 'Guernsey',
4317 'GN': 'Guinea',
4318 'GW': 'Guinea-Bissau',
4319 'GY': 'Guyana',
4320 'HT': 'Haiti',
4321 'HM': 'Heard Island and McDonald Islands',
4322 'VA': 'Holy See (Vatican City State)',
4323 'HN': 'Honduras',
4324 'HK': 'Hong Kong',
4325 'HU': 'Hungary',
4326 'IS': 'Iceland',
4327 'IN': 'India',
4328 'ID': 'Indonesia',
4329 'IR': 'Iran, Islamic Republic of',
4330 'IQ': 'Iraq',
4331 'IE': 'Ireland',
4332 'IM': 'Isle of Man',
4333 'IL': 'Israel',
4334 'IT': 'Italy',
4335 'JM': 'Jamaica',
4336 'JP': 'Japan',
4337 'JE': 'Jersey',
4338 'JO': 'Jordan',
4339 'KZ': 'Kazakhstan',
4340 'KE': 'Kenya',
4341 'KI': 'Kiribati',
4342 'KP': 'Korea, Democratic People\'s Republic of',
4343 'KR': 'Korea, Republic of',
4344 'KW': 'Kuwait',
4345 'KG': 'Kyrgyzstan',
4346 'LA': 'Lao People\'s Democratic Republic',
4347 'LV': 'Latvia',
4348 'LB': 'Lebanon',
4349 'LS': 'Lesotho',
4350 'LR': 'Liberia',
4351 'LY': 'Libya',
4352 'LI': 'Liechtenstein',
4353 'LT': 'Lithuania',
4354 'LU': 'Luxembourg',
4355 'MO': 'Macao',
4356 'MK': 'Macedonia, the Former Yugoslav Republic of',
4357 'MG': 'Madagascar',
4358 'MW': 'Malawi',
4359 'MY': 'Malaysia',
4360 'MV': 'Maldives',
4361 'ML': 'Mali',
4362 'MT': 'Malta',
4363 'MH': 'Marshall Islands',
4364 'MQ': 'Martinique',
4365 'MR': 'Mauritania',
4366 'MU': 'Mauritius',
4367 'YT': 'Mayotte',
4368 'MX': 'Mexico',
4369 'FM': 'Micronesia, Federated States of',
4370 'MD': 'Moldova, Republic of',
4371 'MC': 'Monaco',
4372 'MN': 'Mongolia',
4373 'ME': 'Montenegro',
4374 'MS': 'Montserrat',
4375 'MA': 'Morocco',
4376 'MZ': 'Mozambique',
4377 'MM': 'Myanmar',
4378 'NA': 'Namibia',
4379 'NR': 'Nauru',
4380 'NP': 'Nepal',
4381 'NL': 'Netherlands',
4382 'NC': 'New Caledonia',
4383 'NZ': 'New Zealand',
4384 'NI': 'Nicaragua',
4385 'NE': 'Niger',
4386 'NG': 'Nigeria',
4387 'NU': 'Niue',
4388 'NF': 'Norfolk Island',
4389 'MP': 'Northern Mariana Islands',
4390 'NO': 'Norway',
4391 'OM': 'Oman',
4392 'PK': 'Pakistan',
4393 'PW': 'Palau',
4394 'PS': 'Palestine, State of',
4395 'PA': 'Panama',
4396 'PG': 'Papua New Guinea',
4397 'PY': 'Paraguay',
4398 'PE': 'Peru',
4399 'PH': 'Philippines',
4400 'PN': 'Pitcairn',
4401 'PL': 'Poland',
4402 'PT': 'Portugal',
4403 'PR': 'Puerto Rico',
4404 'QA': 'Qatar',
4405 'RE': 'Réunion',
4406 'RO': 'Romania',
4407 'RU': 'Russian Federation',
4408 'RW': 'Rwanda',
4409 'BL': 'Saint Barthélemy',
4410 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4411 'KN': 'Saint Kitts and Nevis',
4412 'LC': 'Saint Lucia',
4413 'MF': 'Saint Martin (French part)',
4414 'PM': 'Saint Pierre and Miquelon',
4415 'VC': 'Saint Vincent and the Grenadines',
4416 'WS': 'Samoa',
4417 'SM': 'San Marino',
4418 'ST': 'Sao Tome and Principe',
4419 'SA': 'Saudi Arabia',
4420 'SN': 'Senegal',
4421 'RS': 'Serbia',
4422 'SC': 'Seychelles',
4423 'SL': 'Sierra Leone',
4424 'SG': 'Singapore',
4425 'SX': 'Sint Maarten (Dutch part)',
4426 'SK': 'Slovakia',
4427 'SI': 'Slovenia',
4428 'SB': 'Solomon Islands',
4429 'SO': 'Somalia',
4430 'ZA': 'South Africa',
4431 'GS': 'South Georgia and the South Sandwich Islands',
4432 'SS': 'South Sudan',
4433 'ES': 'Spain',
4434 'LK': 'Sri Lanka',
4435 'SD': 'Sudan',
4436 'SR': 'Suriname',
4437 'SJ': 'Svalbard and Jan Mayen',
4438 'SZ': 'Swaziland',
4439 'SE': 'Sweden',
4440 'CH': 'Switzerland',
4441 'SY': 'Syrian Arab Republic',
4442 'TW': 'Taiwan, Province of China',
4443 'TJ': 'Tajikistan',
4444 'TZ': 'Tanzania, United Republic of',
4445 'TH': 'Thailand',
4446 'TL': 'Timor-Leste',
4447 'TG': 'Togo',
4448 'TK': 'Tokelau',
4449 'TO': 'Tonga',
4450 'TT': 'Trinidad and Tobago',
4451 'TN': 'Tunisia',
4452 'TR': 'Turkey',
4453 'TM': 'Turkmenistan',
4454 'TC': 'Turks and Caicos Islands',
4455 'TV': 'Tuvalu',
4456 'UG': 'Uganda',
4457 'UA': 'Ukraine',
4458 'AE': 'United Arab Emirates',
4459 'GB': 'United Kingdom',
4460 'US': 'United States',
4461 'UM': 'United States Minor Outlying Islands',
4462 'UY': 'Uruguay',
4463 'UZ': 'Uzbekistan',
4464 'VU': 'Vanuatu',
4465 'VE': 'Venezuela, Bolivarian Republic of',
4466 'VN': 'Viet Nam',
4467 'VG': 'Virgin Islands, British',
4468 'VI': 'Virgin Islands, U.S.',
4469 'WF': 'Wallis and Futuna',
4470 'EH': 'Western Sahara',
4471 'YE': 'Yemen',
4472 'ZM': 'Zambia',
4473 'ZW': 'Zimbabwe',
2f97cc61 4474 # Not ISO 3166 codes, but used for IP blocks
4475 'AP': 'Asia/Pacific Region',
4476 'EU': 'Europe',
4eb10f66
YCH
4477 }
4478
4479 @classmethod
4480 def short2full(cls, code):
4481 """Convert an ISO 3166-2 country code to the corresponding full name"""
4482 return cls._country_map.get(code.upper())
4483
4484
86e5f3ed 4485class GeoUtils:
773f291d
S
4486 # Major IPv4 address blocks per country
4487 _country_ip_map = {
53896ca5 4488 'AD': '46.172.224.0/19',
773f291d
S
4489 'AE': '94.200.0.0/13',
4490 'AF': '149.54.0.0/17',
4491 'AG': '209.59.64.0/18',
4492 'AI': '204.14.248.0/21',
4493 'AL': '46.99.0.0/16',
4494 'AM': '46.70.0.0/15',
4495 'AO': '105.168.0.0/13',
53896ca5
S
4496 'AP': '182.50.184.0/21',
4497 'AQ': '23.154.160.0/24',
773f291d
S
4498 'AR': '181.0.0.0/12',
4499 'AS': '202.70.112.0/20',
53896ca5 4500 'AT': '77.116.0.0/14',
773f291d
S
4501 'AU': '1.128.0.0/11',
4502 'AW': '181.41.0.0/18',
53896ca5
S
4503 'AX': '185.217.4.0/22',
4504 'AZ': '5.197.0.0/16',
773f291d
S
4505 'BA': '31.176.128.0/17',
4506 'BB': '65.48.128.0/17',
4507 'BD': '114.130.0.0/16',
4508 'BE': '57.0.0.0/8',
53896ca5 4509 'BF': '102.178.0.0/15',
773f291d
S
4510 'BG': '95.42.0.0/15',
4511 'BH': '37.131.0.0/17',
4512 'BI': '154.117.192.0/18',
4513 'BJ': '137.255.0.0/16',
53896ca5 4514 'BL': '185.212.72.0/23',
773f291d
S
4515 'BM': '196.12.64.0/18',
4516 'BN': '156.31.0.0/16',
4517 'BO': '161.56.0.0/16',
4518 'BQ': '161.0.80.0/20',
53896ca5 4519 'BR': '191.128.0.0/12',
773f291d
S
4520 'BS': '24.51.64.0/18',
4521 'BT': '119.2.96.0/19',
4522 'BW': '168.167.0.0/16',
4523 'BY': '178.120.0.0/13',
4524 'BZ': '179.42.192.0/18',
4525 'CA': '99.224.0.0/11',
4526 'CD': '41.243.0.0/16',
53896ca5
S
4527 'CF': '197.242.176.0/21',
4528 'CG': '160.113.0.0/16',
773f291d 4529 'CH': '85.0.0.0/13',
53896ca5 4530 'CI': '102.136.0.0/14',
773f291d
S
4531 'CK': '202.65.32.0/19',
4532 'CL': '152.172.0.0/14',
53896ca5 4533 'CM': '102.244.0.0/14',
773f291d
S
4534 'CN': '36.128.0.0/10',
4535 'CO': '181.240.0.0/12',
4536 'CR': '201.192.0.0/12',
4537 'CU': '152.206.0.0/15',
4538 'CV': '165.90.96.0/19',
4539 'CW': '190.88.128.0/17',
53896ca5 4540 'CY': '31.153.0.0/16',
773f291d
S
4541 'CZ': '88.100.0.0/14',
4542 'DE': '53.0.0.0/8',
4543 'DJ': '197.241.0.0/17',
4544 'DK': '87.48.0.0/12',
4545 'DM': '192.243.48.0/20',
4546 'DO': '152.166.0.0/15',
4547 'DZ': '41.96.0.0/12',
4548 'EC': '186.68.0.0/15',
4549 'EE': '90.190.0.0/15',
4550 'EG': '156.160.0.0/11',
4551 'ER': '196.200.96.0/20',
4552 'ES': '88.0.0.0/11',
4553 'ET': '196.188.0.0/14',
4554 'EU': '2.16.0.0/13',
4555 'FI': '91.152.0.0/13',
4556 'FJ': '144.120.0.0/16',
53896ca5 4557 'FK': '80.73.208.0/21',
773f291d
S
4558 'FM': '119.252.112.0/20',
4559 'FO': '88.85.32.0/19',
4560 'FR': '90.0.0.0/9',
4561 'GA': '41.158.0.0/15',
4562 'GB': '25.0.0.0/8',
4563 'GD': '74.122.88.0/21',
4564 'GE': '31.146.0.0/16',
4565 'GF': '161.22.64.0/18',
4566 'GG': '62.68.160.0/19',
53896ca5
S
4567 'GH': '154.160.0.0/12',
4568 'GI': '95.164.0.0/16',
773f291d
S
4569 'GL': '88.83.0.0/19',
4570 'GM': '160.182.0.0/15',
4571 'GN': '197.149.192.0/18',
4572 'GP': '104.250.0.0/19',
4573 'GQ': '105.235.224.0/20',
4574 'GR': '94.64.0.0/13',
4575 'GT': '168.234.0.0/16',
4576 'GU': '168.123.0.0/16',
4577 'GW': '197.214.80.0/20',
4578 'GY': '181.41.64.0/18',
4579 'HK': '113.252.0.0/14',
4580 'HN': '181.210.0.0/16',
4581 'HR': '93.136.0.0/13',
4582 'HT': '148.102.128.0/17',
4583 'HU': '84.0.0.0/14',
4584 'ID': '39.192.0.0/10',
4585 'IE': '87.32.0.0/12',
4586 'IL': '79.176.0.0/13',
4587 'IM': '5.62.80.0/20',
4588 'IN': '117.192.0.0/10',
4589 'IO': '203.83.48.0/21',
4590 'IQ': '37.236.0.0/14',
4591 'IR': '2.176.0.0/12',
4592 'IS': '82.221.0.0/16',
4593 'IT': '79.0.0.0/10',
4594 'JE': '87.244.64.0/18',
4595 'JM': '72.27.0.0/17',
4596 'JO': '176.29.0.0/16',
53896ca5 4597 'JP': '133.0.0.0/8',
773f291d
S
4598 'KE': '105.48.0.0/12',
4599 'KG': '158.181.128.0/17',
4600 'KH': '36.37.128.0/17',
4601 'KI': '103.25.140.0/22',
4602 'KM': '197.255.224.0/20',
53896ca5 4603 'KN': '198.167.192.0/19',
773f291d
S
4604 'KP': '175.45.176.0/22',
4605 'KR': '175.192.0.0/10',
4606 'KW': '37.36.0.0/14',
4607 'KY': '64.96.0.0/15',
4608 'KZ': '2.72.0.0/13',
4609 'LA': '115.84.64.0/18',
4610 'LB': '178.135.0.0/16',
53896ca5 4611 'LC': '24.92.144.0/20',
773f291d
S
4612 'LI': '82.117.0.0/19',
4613 'LK': '112.134.0.0/15',
53896ca5 4614 'LR': '102.183.0.0/16',
773f291d
S
4615 'LS': '129.232.0.0/17',
4616 'LT': '78.56.0.0/13',
4617 'LU': '188.42.0.0/16',
4618 'LV': '46.109.0.0/16',
4619 'LY': '41.252.0.0/14',
4620 'MA': '105.128.0.0/11',
4621 'MC': '88.209.64.0/18',
4622 'MD': '37.246.0.0/16',
4623 'ME': '178.175.0.0/17',
4624 'MF': '74.112.232.0/21',
4625 'MG': '154.126.0.0/17',
4626 'MH': '117.103.88.0/21',
4627 'MK': '77.28.0.0/15',
4628 'ML': '154.118.128.0/18',
4629 'MM': '37.111.0.0/17',
4630 'MN': '49.0.128.0/17',
4631 'MO': '60.246.0.0/16',
4632 'MP': '202.88.64.0/20',
4633 'MQ': '109.203.224.0/19',
4634 'MR': '41.188.64.0/18',
4635 'MS': '208.90.112.0/22',
4636 'MT': '46.11.0.0/16',
4637 'MU': '105.16.0.0/12',
4638 'MV': '27.114.128.0/18',
53896ca5 4639 'MW': '102.70.0.0/15',
773f291d
S
4640 'MX': '187.192.0.0/11',
4641 'MY': '175.136.0.0/13',
4642 'MZ': '197.218.0.0/15',
4643 'NA': '41.182.0.0/16',
4644 'NC': '101.101.0.0/18',
4645 'NE': '197.214.0.0/18',
4646 'NF': '203.17.240.0/22',
4647 'NG': '105.112.0.0/12',
4648 'NI': '186.76.0.0/15',
4649 'NL': '145.96.0.0/11',
4650 'NO': '84.208.0.0/13',
4651 'NP': '36.252.0.0/15',
4652 'NR': '203.98.224.0/19',
4653 'NU': '49.156.48.0/22',
4654 'NZ': '49.224.0.0/14',
4655 'OM': '5.36.0.0/15',
4656 'PA': '186.72.0.0/15',
4657 'PE': '186.160.0.0/14',
4658 'PF': '123.50.64.0/18',
4659 'PG': '124.240.192.0/19',
4660 'PH': '49.144.0.0/13',
4661 'PK': '39.32.0.0/11',
4662 'PL': '83.0.0.0/11',
4663 'PM': '70.36.0.0/20',
4664 'PR': '66.50.0.0/16',
4665 'PS': '188.161.0.0/16',
4666 'PT': '85.240.0.0/13',
4667 'PW': '202.124.224.0/20',
4668 'PY': '181.120.0.0/14',
4669 'QA': '37.210.0.0/15',
53896ca5 4670 'RE': '102.35.0.0/16',
773f291d 4671 'RO': '79.112.0.0/13',
53896ca5 4672 'RS': '93.86.0.0/15',
773f291d 4673 'RU': '5.136.0.0/13',
53896ca5 4674 'RW': '41.186.0.0/16',
773f291d
S
4675 'SA': '188.48.0.0/13',
4676 'SB': '202.1.160.0/19',
4677 'SC': '154.192.0.0/11',
53896ca5 4678 'SD': '102.120.0.0/13',
773f291d 4679 'SE': '78.64.0.0/12',
53896ca5 4680 'SG': '8.128.0.0/10',
773f291d
S
4681 'SI': '188.196.0.0/14',
4682 'SK': '78.98.0.0/15',
53896ca5 4683 'SL': '102.143.0.0/17',
773f291d
S
4684 'SM': '89.186.32.0/19',
4685 'SN': '41.82.0.0/15',
53896ca5 4686 'SO': '154.115.192.0/18',
773f291d
S
4687 'SR': '186.179.128.0/17',
4688 'SS': '105.235.208.0/21',
4689 'ST': '197.159.160.0/19',
4690 'SV': '168.243.0.0/16',
4691 'SX': '190.102.0.0/20',
4692 'SY': '5.0.0.0/16',
4693 'SZ': '41.84.224.0/19',
4694 'TC': '65.255.48.0/20',
4695 'TD': '154.68.128.0/19',
4696 'TG': '196.168.0.0/14',
4697 'TH': '171.96.0.0/13',
4698 'TJ': '85.9.128.0/18',
4699 'TK': '27.96.24.0/21',
4700 'TL': '180.189.160.0/20',
4701 'TM': '95.85.96.0/19',
4702 'TN': '197.0.0.0/11',
4703 'TO': '175.176.144.0/21',
4704 'TR': '78.160.0.0/11',
4705 'TT': '186.44.0.0/15',
4706 'TV': '202.2.96.0/19',
4707 'TW': '120.96.0.0/11',
4708 'TZ': '156.156.0.0/14',
53896ca5
S
4709 'UA': '37.52.0.0/14',
4710 'UG': '102.80.0.0/13',
4711 'US': '6.0.0.0/8',
773f291d 4712 'UY': '167.56.0.0/13',
53896ca5 4713 'UZ': '84.54.64.0/18',
773f291d 4714 'VA': '212.77.0.0/19',
53896ca5 4715 'VC': '207.191.240.0/21',
773f291d 4716 'VE': '186.88.0.0/13',
53896ca5 4717 'VG': '66.81.192.0/20',
773f291d
S
4718 'VI': '146.226.0.0/16',
4719 'VN': '14.160.0.0/11',
4720 'VU': '202.80.32.0/20',
4721 'WF': '117.20.32.0/21',
4722 'WS': '202.4.32.0/19',
4723 'YE': '134.35.0.0/16',
4724 'YT': '41.242.116.0/22',
4725 'ZA': '41.0.0.0/11',
53896ca5
S
4726 'ZM': '102.144.0.0/13',
4727 'ZW': '102.177.192.0/18',
773f291d
S
4728 }
4729
4730 @classmethod
5f95927a
S
4731 def random_ipv4(cls, code_or_block):
4732 if len(code_or_block) == 2:
4733 block = cls._country_ip_map.get(code_or_block.upper())
4734 if not block:
4735 return None
4736 else:
4737 block = code_or_block
773f291d 4738 addr, preflen = block.split('/')
ac668111 4739 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4740 addr_max = addr_min | (0xffffffff >> int(preflen))
14f25df2 4741 return str(socket.inet_ntoa(
ac668111 4742 struct.pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4743
4744
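# Editor's sketch: GeoUtils.random_ipv4 accepts either a two-letter country code or
# an explicit CIDR block (the addresses returned vary per call):
#
#   >>> GeoUtils.random_ipv4('US').startswith('6.')             # 'US' maps to 6.0.0.0/8
#   True
#   >>> GeoUtils.random_ipv4('192.168.0.0/16').startswith('192.168.')
#   True
#   >>> GeoUtils.random_ipv4('XX') is None                      # unknown country code
#   True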
ac668111 4745class PerRequestProxyHandler(urllib.request.ProxyHandler):
2461f79d
PH
4746 def __init__(self, proxies=None):
4747 # Set default handlers
4748 for type in ('http', 'https'):
4749 setattr(self, '%s_open' % type,
4750 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4751 meth(r, proxy, type))
ac668111 4752 urllib.request.ProxyHandler.__init__(self, proxies)
2461f79d 4753
91410c9b 4754 def proxy_open(self, req, proxy, type):
2461f79d 4755 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
4756 if req_proxy is not None:
4757 proxy = req_proxy
2461f79d
PH
4758 del req.headers['Ytdl-request-proxy']
4759
4760 if proxy == '__noproxy__':
4761 return None # No Proxy
14f25df2 4762 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188 4763 req.add_header('Ytdl-socks-proxy', proxy)
7a5c1cfe 4764 # yt-dlp's http/https handlers take care of wrapping the socket with SOCKS
71aff188 4765 return None
ac668111 4766 return urllib.request.ProxyHandler.proxy_open(
91410c9b 4767 self, req, proxy, type)
5bc880b9
YCH
4768
4769
0a5445dd
YCH
4770# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4771# released into Public Domain
4772# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4773
4774def long_to_bytes(n, blocksize=0):
4775 """long_to_bytes(n:long, blocksize:int) : string
4776 Convert a long integer to a byte string.
4777
4778 If optional blocksize is given and greater than zero, pad the front of the
4779 byte string with binary zeros so that the length is a multiple of
4780 blocksize.
4781 """
4782 # after much testing, this algorithm was deemed to be the fastest
4783 s = b''
4784 n = int(n)
4785 while n > 0:
ac668111 4786 s = struct.pack('>I', n & 0xffffffff) + s
0a5445dd
YCH
4787 n = n >> 32
4788 # strip off leading zeros
4789 for i in range(len(s)):
4790 if s[i] != b'\000'[0]:
4791 break
4792 else:
4793 # only happens when n == 0
4794 s = b'\000'
4795 i = 0
4796 s = s[i:]
4797 # add back some pad bytes. this could be done more efficiently w.r.t. the
4798 # de-padding being done above, but sigh...
4799 if blocksize > 0 and len(s) % blocksize:
4800 s = (blocksize - len(s) % blocksize) * b'\000' + s
4801 return s
4802
4803
4804def bytes_to_long(s):
4805 """bytes_to_long(string) : long
4806 Convert a byte string to a long integer.
4807
4808 This is (essentially) the inverse of long_to_bytes().
4809 """
4810 acc = 0
4811 length = len(s)
4812 if length % 4:
4813 extra = (4 - length % 4)
4814 s = b'\000' * extra + s
4815 length = length + extra
4816 for i in range(0, length, 4):
ac668111 4817 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
0a5445dd
YCH
4818 return acc
4819
4820
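# Editor's sketch: long_to_bytes and bytes_to_long are inverses of each other
# (arbitrary example values):
#
#   >>> bytes_to_long(b'\x01\x00')
#   256
#   >>> long_to_bytes(256)
#   b'\x01\x00'
#   >>> long_to_bytes(1, blocksize=4)   # front-padded to a multiple of blocksize
#   b'\x00\x00\x00\x01'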
5bc880b9
YCH
4821def ohdave_rsa_encrypt(data, exponent, modulus):
4822 '''
4823 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4824
4825 Input:
4826 data: data to encrypt, bytes-like object
4827 exponent, modulus: parameter e and N of RSA algorithm, both integer
4828 Output: hex string of encrypted data
4829
4830 Limitation: supports one block encryption only
4831 '''
4832
4833 payload = int(binascii.hexlify(data[::-1]), 16)
4834 encrypted = pow(payload, exponent, modulus)
4835 return '%x' % encrypted
81bdc8fd
YCH
4836
4837
f48409c7
YCH
4838def pkcs1pad(data, length):
4839 """
4840 Pad input data using the PKCS#1 scheme
4841
4842 @param {int[]} data input data
4843 @param {int} length target length
4844 @returns {int[]} padded data
4845 """
4846 if len(data) > length - 11:
4847 raise ValueError('Input data too long for PKCS#1 padding')
4848
4849 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4850 return [0, 2] + pseudo_random + [0] + data
4851
4852
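# Editor's sketch: pkcs1pad works on lists of ints (byte values), not bytes objects;
# the layout is [0, 2, <random non-deterministic filler>, 0, <data>] (example input
# is made up, the filler bytes differ per call):
#
#   >>> padded = pkcs1pad(list(b'msg'), 16)
#   >>> len(padded), padded[:2], padded[-4:]
#   (16, [0, 2], [0, 109, 115, 103])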
7b2c3f47 4853def _base_n_table(n, table):
4854 if not table and not n:
4855 raise ValueError('Either table or n must be specified')
612f2be5 4856 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4857
44f14eb4 4858 if n and n != len(table):
612f2be5 4859 raise ValueError(f'base {n} exceeds table length {len(table)}')
4860 return table
59f898b7 4861
5eb6bdce 4862
7b2c3f47 4863def encode_base_n(num, n=None, table=None):
4864 """Convert given int to a base-n string"""
612f2be5 4865 table = _base_n_table(n, table)
7b2c3f47 4866 if not num:
5eb6bdce
YCH
4867 return table[0]
4868
7b2c3f47 4869 result, base = '', len(table)
81bdc8fd 4870 while num:
7b2c3f47 4871 result = table[num % base] + result
612f2be5 4872 num = num // base
7b2c3f47 4873 return result
4874
4875
4876def decode_base_n(string, n=None, table=None):
4877 """Convert given base-n string to int"""
4878 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4879 result, base = 0, len(table)
4880 for char in string:
4881 result = result * base + table[char]
4882 return result
4883
4884
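# Editor's sketch: encode_base_n/decode_base_n round-trip with the default table,
# and a custom table implies the base (example values are arbitrary):
#
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> decode_base_n('ff', 16)
#   255
#   >>> encode_base_n(255, table='01')
#   '11111111'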
f52354a8 4885def decode_packed_codes(code):
06b3fe29 4886 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 4887 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
4888 base = int(base)
4889 count = int(count)
4890 symbols = symbols.split('|')
4891 symbol_table = {}
4892
4893 while count:
4894 count -= 1
5eb6bdce 4895 base_n_count = encode_base_n(count, base)
f52354a8
YCH
4896 symbol_table[base_n_count] = symbols[count] or base_n_count
4897
4898 return re.sub(
4899 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 4900 obfuscated_code)
e154c651 4901
4902
1ced2221
S
4903def caesar(s, alphabet, shift):
4904 if shift == 0:
4905 return s
4906 l = len(alphabet)
4907 return ''.join(
4908 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4909 for c in s)
4910
4911
4912def rot47(s):
4913 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
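# Illustrative sketch (not part of the original file): rot47() shifts printable ASCII
# by half of the 94-character range, so applying it twice restores the input
#   >>> rot47('abc')
#   '234'
#   >>> rot47(rot47('abc'))
#   'abc'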
4914
4915
e154c651 4916def parse_m3u8_attributes(attrib):
4917 info = {}
4918 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4919 if val.startswith('"'):
4920 val = val[1:-1]
4921 info[key] = val
4922 return info
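# Illustrative sketch (not part of the original file): quoted values may contain
# commas, unquoted values end at the next comma
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401f"')
#   {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401f'}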
1143535d
YCH
4923
4924
4925def urshift(val, n):
4926 return val >> n if val >= 0 else (val + 0x100000000) >> n
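# Illustrative sketch (not part of the original file): urshift() behaves like an
# unsigned 32-bit right shift (JavaScript's `>>>`) for negative inputs, e.g.
#   >>> urshift(-1, 8)
#   16777215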
d3f8e038
YCH
4927
4928
efa97bdc 4929def write_xattr(path, key, value):
6f7563be 4930 # Windows: Write xattrs to NTFS Alternate Data Streams:
4931 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4932 if compat_os_name == 'nt':
4933 assert ':' not in key
4934 assert os.path.exists(path)
efa97bdc
YCH
4935
4936 try:
6f7563be 4937 with open(f'{path}:{key}', 'wb') as f:
4938 f.write(value)
86e5f3ed 4939 except OSError as e:
efa97bdc 4940 raise XAttrMetadataError(e.errno, e.strerror)
6f7563be 4941 return
efa97bdc 4942
6f7563be 4943 # UNIX Method 1. Use the xattr/pyxattr modules
efa97bdc 4944
6f7563be 4945 setxattr = None
4946 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4947 # Unicode arguments are not supported in pyxattr until version 0.5.0
4948 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4949 if version_tuple(xattr.__version__) >= (0, 5, 0):
4950 setxattr = xattr.set
4951 elif xattr:
4952 setxattr = xattr.setxattr
efa97bdc 4953
6f7563be 4954 if setxattr:
4955 try:
4956 setxattr(path, key, value)
4957 except OSError as e:
4958 raise XAttrMetadataError(e.errno, e.strerror)
4959 return
efa97bdc 4960
6f7563be 4961 # UNIX Method 2. Use setfattr/xattr executables
4962 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4963 else 'xattr' if check_executable('xattr', ['-h']) else None)
4964 if not exe:
4965 raise XAttrUnavailableError(
4966 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4967 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
efa97bdc 4968
0f06bcd7 4969 value = value.decode()
6f7563be 4970 try:
f0c9fb96 4971 _, stderr, returncode = Popen.run(
6f7563be 4972 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
e121e3ce 4973 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
6f7563be 4974 except OSError as e:
4975 raise XAttrMetadataError(e.errno, e.strerror)
f0c9fb96 4976 if returncode:
4977 raise XAttrMetadataError(returncode, stderr)
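# Illustrative usage sketch (not part of the original file; filename and key are
# hypothetical). The value must be bytes, and the key must not contain ':' on Windows:
#   write_xattr('clip.mp4', 'user.xdg.referrer.url', b'https://example.com/page')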
0c265486
YCH
4978
4979
4980def random_birthday(year_field, month_field, day_field):
aa374bc7
AS
4981 start_date = datetime.date(1950, 1, 1)
4982 end_date = datetime.date(1995, 12, 31)
4983 offset = random.randint(0, (end_date - start_date).days)
4984 random_date = start_date + datetime.timedelta(offset)
0c265486 4985 return {
aa374bc7
AS
4986 year_field: str(random_date.year),
4987 month_field: str(random_date.month),
4988 day_field: str(random_date.day),
0c265486 4989 }
732044af 4990
c76eb41b 4991
8c53322c
L
4992def find_available_port(interface=''):
4993 try:
4994 with socket.socket() as sock:
4995 sock.bind((interface, 0))
4996 return sock.getsockname()[1]
4997 except OSError:
4998 return None
4999
5000
732044af 5001# Templates for internet shortcut files, which are plain text files.
e5a998f3 5002DOT_URL_LINK_TEMPLATE = '''\
732044af 5003[InternetShortcut]
5004URL=%(url)s
e5a998f3 5005'''
732044af 5006
e5a998f3 5007DOT_WEBLOC_LINK_TEMPLATE = '''\
732044af 5008<?xml version="1.0" encoding="UTF-8"?>
5009<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5010<plist version="1.0">
5011<dict>
5012\t<key>URL</key>
5013\t<string>%(url)s</string>
5014</dict>
5015</plist>
e5a998f3 5016'''
732044af 5017
e5a998f3 5018DOT_DESKTOP_LINK_TEMPLATE = '''\
732044af 5019[Desktop Entry]
5020Encoding=UTF-8
5021Name=%(filename)s
5022Type=Link
5023URL=%(url)s
5024Icon=text-html
e5a998f3 5025'''
732044af 5026
08438d2c 5027LINK_TEMPLATES = {
5028 'url': DOT_URL_LINK_TEMPLATE,
5029 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5030 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5031}
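# Illustrative sketch (not part of the original file): the templates are plain
# %-format strings, e.g.
#   >>> DOT_URL_LINK_TEMPLATE % {'url': 'https://example.com'}
#   '[InternetShortcut]\nURL=https://example.com\n'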
5032
732044af 5033
5034def iri_to_uri(iri):
5035 """
5036 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5037
5038 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes, using UTF-8 as the underlying encoding, only those characters that are not already escaped, leaving existing escapes intact.
5039 """
5040
14f25df2 5041 iri_parts = urllib.parse.urlparse(iri)
732044af 5042
5043 if '[' in iri_parts.netloc:
5044 raise ValueError('IPv6 URIs are not yet supported.')
5045 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5046
5047 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5048
5049 net_location = ''
5050 if iri_parts.username:
f9934b96 5051 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
732044af 5052 if iri_parts.password is not None:
f9934b96 5053 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
732044af 5054 net_location += '@'
5055
0f06bcd7 5056 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
732044af 5057 # The 'idna' encoding produces ASCII text.
5058 if iri_parts.port is not None and iri_parts.port != 80:
5059 net_location += ':' + str(iri_parts.port)
5060
f9934b96 5061 return urllib.parse.urlunparse(
732044af 5062 (iri_parts.scheme,
5063 net_location,
5064
f9934b96 5065 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5066
5067 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
f9934b96 5068 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5069
5070 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
f9934b96 5071 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
732044af 5072
f9934b96 5073 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
732044af 5074
5075 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
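# Illustrative sketch (not part of the original file): non-ASCII characters are
# UTF-8 percent-encoded while existing escapes and the URI structure are preserved, e.g.
#   >>> iri_to_uri('http://example.com/föö?q=1')
#   'http://example.com/f%C3%B6%C3%B6?q=1'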
5076
5077
5078def to_high_limit_path(path):
5079 if sys.platform in ['win32', 'cygwin']:
5080 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
e5a998f3 5081 return '\\\\?\\' + os.path.abspath(path)
732044af 5082
5083 return path
76d321f6 5084
c76eb41b 5085
7b2c3f47 5086def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
69bec673 5087 val = traversal.traverse_obj(obj, *variadic(field))
6f2287cb 5088 if not val if ignore is NO_DEFAULT else val in variadic(ignore):
e0ddbd02 5089 return default
7b2c3f47 5090 return template % func(val)
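# Illustrative sketch (not part of the original file):
#   >>> format_field({'height': 720}, 'height', '%dp')
#   '720p'
#   >>> format_field({}, 'height', '%dp', default='unknown')
#   'unknown'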
00dd0cd5 5091
5092
5093def clean_podcast_url(url):
5094 return re.sub(r'''(?x)
5095 (?:
5096 (?:
5097 chtbl\.com/track|
5098 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5099 play\.podtrac\.com
5100 )/[^/]+|
5101 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5102 flex\.acast\.com|
5103 pd(?:
5104 cn\.co| # https://podcorn.com/analytics-prefix/
5105 st\.fm # https://podsights.com/docs/
5106 )/e
5107 )/''', '', url)
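# Illustrative sketch (not part of the original file): known tracking/redirect
# prefixes are stripped from the URL, e.g.
#   >>> clean_podcast_url('https://chtbl.com/track/ABC123/traffic.example.com/ep1.mp3')
#   'https://traffic.example.com/ep1.mp3'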
ffcb8191
THD
5108
5109
5110_HEX_TABLE = '0123456789abcdef'
5111
5112
5113def random_uuidv4():
5114 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 5115
5116
5117def make_dir(path, to_screen=None):
5118 try:
5119 dn = os.path.dirname(path)
b25d6cb9
AI
5120 if dn:
5121 os.makedirs(dn, exist_ok=True)
0202b52a 5122 return True
86e5f3ed 5123 except OSError as err:
0202b52a 5124 if callable(to_screen):
69bec673 5125 to_screen(f'unable to create directory {err}')
0202b52a 5126 return False
f74980cb 5127
5128
5129def get_executable_path():
69bec673 5130 from ..update import _get_variant_and_executable_path
c487cf00 5131
b5899f4f 5132 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
f74980cb 5133
5134
8e40b9d1 5135def get_user_config_dirs(package_name):
8e40b9d1
M
5136 # .config (e.g. ~/.config/package_name)
5137 xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
773c272d 5138 yield os.path.join(xdg_config_home, package_name)
8e40b9d1
M
5139
5140 # appdata (%APPDATA%/package_name)
5141 appdata_dir = os.getenv('appdata')
5142 if appdata_dir:
773c272d 5143 yield os.path.join(appdata_dir, package_name)
8e40b9d1
M
5144
5145 # home (~/.package_name)
773c272d 5146 yield os.path.join(compat_expanduser('~'), f'.{package_name}')
8e40b9d1
M
5147
5148
5149def get_system_config_dirs(package_name):
8e40b9d1 5150 # /etc/package_name
773c272d 5151 yield os.path.join('/etc', package_name)
06167fbb 5152
5153
3e9b66d7 5154def time_seconds(**kwargs):
83c4970e
L
5155 """
5156 Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
5157 """
5158 return time.time() + datetime.timedelta(**kwargs).total_seconds()
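# Illustrative sketch (not part of the original file): handy for expiry timestamps
# relative to now, e.g. roughly one hour in the future:
#   expires_at = time_seconds(hours=1)  # time.time() + 3600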
3e9b66d7
LNO
5159
5160
49fa4d9a
N
5161# create a JSON Web Signature (jws) with HS256 algorithm
5162# the resulting format is in JWS Compact Serialization
5163# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5164# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5165def jwt_encode_hs256(payload_data, key, headers={}):
5166 header_data = {
5167 'alg': 'HS256',
5168 'typ': 'JWT',
5169 }
5170 if headers:
5171 header_data.update(headers)
0f06bcd7 5172 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5173 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5174 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
49fa4d9a
N
5175 signature_b64 = base64.b64encode(h.digest())
5176 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5177 return token
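# Illustrative sketch (not part of the original file; the key is hypothetical): the
# token is bytes made of three base64 segments joined by b'.'
#   token = jwt_encode_hs256({'user': 'example'}, 'hypothetical-secret')
#   header_b64, payload_b64, signature_b64 = token.split(b'.')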
819e0531 5178
5179
16b0d7e6 5180# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5181def jwt_decode_hs256(jwt):
5182 header_b64, payload_b64, signature_b64 = jwt.split('.')
2c98d998 5183 # add trailing ='s that may have been stripped; superfluous ='s are ignored
5184 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
16b0d7e6 5185 return payload_data
5186
5187
53973b4d 5188WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5189
5190
7a32c70d 5191@functools.cache
819e0531 5192def supports_terminal_sequences(stream):
5193 if compat_os_name == 'nt':
8a82af35 5194 if not WINDOWS_VT_MODE:
819e0531 5195 return False
5196 elif not os.getenv('TERM'):
5197 return False
5198 try:
5199 return stream.isatty()
5200 except BaseException:
5201 return False
5202
5203
c53a18f0 5204def windows_enable_vt_mode():
5205 """Ref: https://bugs.python.org/issue30075 """
8a82af35 5206 if get_windows_version() < (10, 0, 10586):
53973b4d 5207 return
53973b4d 5208
c53a18f0 5209 import ctypes
5210 import ctypes.wintypes
5211 import msvcrt
5212
5213 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
5214
5215 dll = ctypes.WinDLL('kernel32', use_last_error=False)
5216 handle = os.open('CONOUT$', os.O_RDWR)
c53a18f0 5217 try:
5218 h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
5219 dw_original_mode = ctypes.wintypes.DWORD()
5220 success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
5221 if not success:
5222 raise Exception('GetConsoleMode failed')
5223
5224 success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
5225 dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
5226 if not success:
5227 raise Exception('SetConsoleMode failed')
c53a18f0 5228 finally:
5229 os.close(handle)
53973b4d 5230
f0795149 5231 global WINDOWS_VT_MODE
5232 WINDOWS_VT_MODE = True
5233 supports_terminal_sequences.cache_clear()
5234
53973b4d 5235
ec11a9f4 5236_terminal_sequences_re = re.compile('\033\\[[^m]+m')
5237
5238
5239def remove_terminal_sequences(string):
5240 return _terminal_sequences_re.sub('', string)
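# Illustrative sketch (not part of the original file): ANSI formatting escapes are
# stripped while the text is kept, e.g.
#   >>> remove_terminal_sequences('\033[0;31mERROR\033[0m')
#   'ERROR'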
5241
5242
5243def number_of_digits(number):
5244 return len('%d' % number)
34921b43 5245
5246
5247def join_nonempty(*values, delim='-', from_dict=None):
5248 if from_dict is not None:
69bec673 5249 values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
34921b43 5250 return delim.join(map(str, filter(None, values)))
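# Illustrative sketch (not part of the original file): falsy values are dropped
# before joining, e.g.
#   >>> join_nonempty('mp4', None, '', 1080, delim='.')
#   'mp4.1080'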
06e57990 5251
5252
27231526
ZM
5253def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5254 """
5255 Find the largest format dimensions in terms of video width and, for each thumbnail:
5256 * Modify the URL: Match the width with the provided regex and replace it with the largest format width
5257 * Update dimensions
5258
5259 This function is useful with video services that scale the provided thumbnails on demand
5260 """
5261 _keys = ('width', 'height')
5262 max_dimensions = max(
86e5f3ed 5263 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
27231526
ZM
5264 default=(0, 0))
5265 if not max_dimensions[0]:
5266 return thumbnails
5267 return [
5268 merge_dicts(
5269 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5270 dict(zip(_keys, max_dimensions)), thumbnail)
5271 for thumbnail in thumbnails
5272 ]
5273
5274
93c8410d
LNO
5275def parse_http_range(range):
5276 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5277 if not range:
5278 return None, None, None
5279 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5280 if not crg:
5281 return None, None, None
5282 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
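# Illustrative sketch (not part of the original file):
#   >>> parse_http_range('bytes 0-499/1234')
#   (0, 499, 1234)
#   >>> parse_http_range('bytes=100-')
#   (100, None, None)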
5283
5284
6b9e832d 5285def read_stdin(what):
5286 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5287 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5288 return sys.stdin
5289
5290
a904a7f8
L
5291def determine_file_encoding(data):
5292 """
88f60feb 5293 Detect the text encoding used
a904a7f8
L
5294 @returns (encoding, bytes to skip)
5295 """
5296
88f60feb 5297 # BOM marks are given priority over declarations
a904a7f8 5298 for bom, enc in BOMS:
a904a7f8
L
5299 if data.startswith(bom):
5300 return enc, len(bom)
5301
88f60feb 5302 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5303 # We ignore the endianness to get a good enough match
a904a7f8 5304 data = data.replace(b'\0', b'')
88f60feb 5305 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5306 return mobj.group(1).decode() if mobj else None, 0
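# Illustrative sketch (not part of the original file), assuming no BOM entry matches:
# a PEP 263-style declaration at the start of the data is picked up
#   >>> determine_file_encoding(b'# coding: utf-8\n-v --no-mtime')
#   ('utf-8', 0)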
a904a7f8
L
5307
5308
06e57990 5309class Config:
5310 own_args = None
9e491463 5311 parsed_args = None
06e57990 5312 filename = None
5313 __initialized = False
5314
5315 def __init__(self, parser, label=None):
9e491463 5316 self.parser, self.label = parser, label
06e57990 5317 self._loaded_paths, self.configs = set(), []
5318
5319 def init(self, args=None, filename=None):
5320 assert not self.__initialized
284a60c5 5321 self.own_args, self.filename = args, filename
5322 return self.load_configs()
5323
5324 def load_configs(self):
65662dff 5325 directory = ''
284a60c5 5326 if self.filename:
5327 location = os.path.realpath(self.filename)
65662dff 5328 directory = os.path.dirname(location)
06e57990 5329 if location in self._loaded_paths:
5330 return False
5331 self._loaded_paths.add(location)
5332
284a60c5 5333 self.__initialized = True
5334 opts, _ = self.parser.parse_known_args(self.own_args)
5335 self.parsed_args = self.own_args
9e491463 5336 for location in opts.config_locations or []:
6b9e832d 5337 if location == '-':
1060f82f 5338 if location in self._loaded_paths:
5339 continue
5340 self._loaded_paths.add(location)
6b9e832d 5341 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5342 continue
65662dff 5343 location = os.path.join(directory, expand_path(location))
06e57990 5344 if os.path.isdir(location):
5345 location = os.path.join(location, 'yt-dlp.conf')
5346 if not os.path.exists(location):
9e491463 5347 self.parser.error(f'config location {location} does not exist')
06e57990 5348 self.append_config(self.read_file(location), location)
5349 return True
5350
5351 def __str__(self):
5352 label = join_nonempty(
5353 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5354 delim=' ')
5355 return join_nonempty(
5356 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5357 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5358 delim='\n')
5359
7a32c70d 5360 @staticmethod
06e57990 5361 def read_file(filename, default=[]):
5362 try:
a904a7f8 5363 optionf = open(filename, 'rb')
86e5f3ed 5364 except OSError:
06e57990 5365 return default # silently skip if file is not present
a904a7f8
L
5366 try:
5367 enc, skip = determine_file_encoding(optionf.read(512))
5368 optionf.seek(skip, io.SEEK_SET)
5369 except OSError:
5370 enc = None # silently skip read errors
06e57990 5371 try:
5372 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
a904a7f8 5373 contents = optionf.read().decode(enc or preferredencoding())
f9934b96 5374 res = shlex.split(contents, comments=True)
44a6fcff 5375 except Exception as err:
5376 raise ValueError(f'Unable to parse "{filename}": {err}')
06e57990 5377 finally:
5378 optionf.close()
5379 return res
5380
7a32c70d 5381 @staticmethod
06e57990 5382 def hide_login_info(opts):
86e5f3ed 5383 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
06e57990 5384 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5385
5386 def _scrub_eq(o):
5387 m = eqre.match(o)
5388 if m:
5389 return m.group('key') + '=PRIVATE'
5390 else:
5391 return o
5392
5393 opts = list(map(_scrub_eq, opts))
5394 for idx, opt in enumerate(opts):
5395 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5396 opts[idx + 1] = 'PRIVATE'
5397 return opts
5398
5399 def append_config(self, *args, label=None):
9e491463 5400 config = type(self)(self.parser, label)
06e57990 5401 config._loaded_paths = self._loaded_paths
5402 if config.init(*args):
5403 self.configs.append(config)
5404
7a32c70d 5405 @property
06e57990 5406 def all_args(self):
5407 for config in reversed(self.configs):
5408 yield from config.all_args
9e491463 5409 yield from self.parsed_args or []
5410
5411 def parse_known_args(self, **kwargs):
5412 return self.parser.parse_known_args(self.all_args, **kwargs)
06e57990 5413
5414 def parse_args(self):
9e491463 5415 return self.parser.parse_args(self.all_args)
da42679b
LNO
5416
5417
d5d1df8a 5418class WebSocketsWrapper:
da42679b 5419 """Wraps websockets module to use in non-async scopes"""
abfecb7b 5420 pool = None
da42679b 5421
3cea3edd 5422 def __init__(self, url, headers=None, connect=True):
059bc4db 5423 self.loop = asyncio.new_event_loop()
9cd08050 5424 # XXX: "loop" is deprecated
5425 self.conn = websockets.connect(
5426 url, extra_headers=headers, ping_interval=None,
5427 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
3cea3edd
LNO
5428 if connect:
5429 self.__enter__()
15dfb392 5430 atexit.register(self.__exit__, None, None, None)
da42679b
LNO
5431
5432 def __enter__(self):
3cea3edd 5433 if not self.pool:
9cd08050 5434 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
da42679b
LNO
5435 return self
5436
5437 def send(self, *args):
5438 self.run_with_loop(self.pool.send(*args), self.loop)
5439
5440 def recv(self, *args):
5441 return self.run_with_loop(self.pool.recv(*args), self.loop)
5442
5443 def __exit__(self, type, value, traceback):
5444 try:
5445 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5446 finally:
5447 self.loop.close()
15dfb392 5448 self._cancel_all_tasks(self.loop)
da42679b
LNO
5449
5450 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5451 # for contributors: if any new library using asyncio needs to be run in non-async code, move these functions out of this class
7a32c70d 5452 @staticmethod
da42679b 5453 def run_with_loop(main, loop):
059bc4db 5454 if not asyncio.iscoroutine(main):
da42679b
LNO
5455 raise ValueError(f'a coroutine was expected, got {main!r}')
5456
5457 try:
5458 return loop.run_until_complete(main)
5459 finally:
5460 loop.run_until_complete(loop.shutdown_asyncgens())
5461 if hasattr(loop, 'shutdown_default_executor'):
5462 loop.run_until_complete(loop.shutdown_default_executor())
5463
7a32c70d 5464 @staticmethod
da42679b 5465 def _cancel_all_tasks(loop):
059bc4db 5466 to_cancel = asyncio.all_tasks(loop)
da42679b
LNO
5467
5468 if not to_cancel:
5469 return
5470
5471 for task in to_cancel:
5472 task.cancel()
5473
9cd08050 5474 # XXX: "loop" is removed in python 3.10+
da42679b 5475 loop.run_until_complete(
059bc4db 5476 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
da42679b
LNO
5477
5478 for task in to_cancel:
5479 if task.cancelled():
5480 continue
5481 if task.exception() is not None:
5482 loop.call_exception_handler({
5483 'message': 'unhandled exception during asyncio.run() shutdown',
5484 'exception': task.exception(),
5485 'task': task,
5486 })
5487
5488
8b7539d2 5489def merge_headers(*dicts):
08d30158 5490 """Merge dicts of HTTP headers case-insensitively, prioritizing the latter ones"""
76aa9913 5491 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
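# Illustrative sketch (not part of the original file): keys are title-cased so
# differently-cased duplicates collapse, with later dicts taking priority
#   >>> merge_headers({'user-agent': 'UA-1', 'Accept': '*/*'}, {'User-Agent': 'UA-2'})
#   {'User-Agent': 'UA-2', 'Accept': '*/*'}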
28787f16 5492
5493
b1f94422 5494def cached_method(f):
5495 """Cache a method"""
5496 signature = inspect.signature(f)
5497
7a32c70d 5498 @functools.wraps(f)
b1f94422 5499 def wrapper(self, *args, **kwargs):
5500 bound_args = signature.bind(self, *args, **kwargs)
5501 bound_args.apply_defaults()
d5d1df8a 5502 key = tuple(bound_args.arguments.values())[1:]
b1f94422 5503
6368e2e6 5504 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
b1f94422 5505 if key not in cache:
5506 cache[key] = f(self, *args, **kwargs)
5507 return cache[key]
5508 return wrapper
5509
5510
28787f16 5511class classproperty:
83cc7b8a 5512 """property access for class methods with optional caching"""
5513 def __new__(cls, func=None, *args, **kwargs):
5514 if not func:
5515 return functools.partial(cls, *args, **kwargs)
5516 return super().__new__(cls)
c487cf00 5517
83cc7b8a 5518 def __init__(self, func, *, cache=False):
c487cf00 5519 functools.update_wrapper(self, func)
5520 self.func = func
83cc7b8a 5521 self._cache = {} if cache else None
28787f16 5522
5523 def __get__(self, _, cls):
83cc7b8a 5524 if self._cache is None:
5525 return self.func(cls)
5526 elif cls not in self._cache:
5527 self._cache[cls] = self.func(cls)
5528 return self._cache[cls]
19a03940 5529
5530
a5387729 5531class function_with_repr:
b2e0343b 5532 def __init__(self, func, repr_=None):
a5387729 5533 functools.update_wrapper(self, func)
b2e0343b 5534 self.func, self.__repr = func, repr_
a5387729 5535
5536 def __call__(self, *args, **kwargs):
5537 return self.func(*args, **kwargs)
5538
5539 def __repr__(self):
b2e0343b 5540 if self.__repr:
5541 return self.__repr
a5387729 5542 return f'{self.func.__module__}.{self.func.__qualname__}'
5543
5544
64fa820c 5545class Namespace(types.SimpleNamespace):
591bb9d3 5546 """Immutable namespace"""
591bb9d3 5547
7896214c 5548 def __iter__(self):
64fa820c 5549 return iter(self.__dict__.values())
7896214c 5550
7a32c70d 5551 @property
64fa820c 5552 def items_(self):
5553 return self.__dict__.items()
9b8ee23b 5554
5555
8dc59305 5556MEDIA_EXTENSIONS = Namespace(
5557 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5558 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5559 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
fbb73833 5560 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
8dc59305 5561 thumbnails=('jpg', 'png', 'webp'),
5562 storyboards=('mhtml', ),
5563 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5564 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5565)
5566MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5567MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5568
5569KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5570
5571
be5c1ae8 5572class RetryManager:
5573 """Usage:
5574 for retry in RetryManager(...):
5575 try:
5576 ...
5577 except SomeException as err:
5578 retry.error = err
5579 continue
5580 """
5581 attempt, _error = 0, None
5582
5583 def __init__(self, _retries, _error_callback, **kwargs):
5584 self.retries = _retries or 0
5585 self.error_callback = functools.partial(_error_callback, **kwargs)
5586
5587 def _should_retry(self):
5588 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5589
7a32c70d 5590 @property
be5c1ae8 5591 def error(self):
5592 if self._error is NO_DEFAULT:
5593 return None
5594 return self._error
5595
7a32c70d 5596 @error.setter
be5c1ae8 5597 def error(self, value):
5598 self._error = value
5599
5600 def __iter__(self):
5601 while self._should_retry():
5602 self.error = NO_DEFAULT
5603 self.attempt += 1
5604 yield self
5605 if self.error:
5606 self.error_callback(self.error, self.attempt, self.retries)
5607
7a32c70d 5608 @staticmethod
be5c1ae8 5609 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5610 """Utility function for reporting retries"""
5611 if count > retries:
5612 if error:
5613 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5614 raise e
5615
5616 if not count:
5617 return warn(e)
5618 elif isinstance(e, ExtractorError):
3ce29336 5619 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
be5c1ae8 5620 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5621
5622 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5623 if delay:
5624 info(f'Sleeping {delay:.2f} seconds ...')
5625 time.sleep(delay)
5626
5627
0647d925 5628def make_archive_id(ie, video_id):
5629 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5630 return f'{ie_key.lower()} {video_id}'
5631
5632
a1c5bd82 5633def truncate_string(s, left, right=0):
5634 assert left > 3 and right >= 0
5635 if s is None or len(s) <= left + right:
5636 return s
71df9b7f 5637 return f'{s[:left-3]}...{s[-right:] if right else ""}'
a1c5bd82 5638
5639
5314b521 5640def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
5641 assert 'all' in alias_dict, '"all" alias is required'
5642 requested = list(start or [])
5643 for val in options:
5644 discard = val.startswith('-')
5645 if discard:
5646 val = val[1:]
5647
5648 if val in alias_dict:
5649 val = alias_dict[val] if not discard else [
5650 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
5651 # NB: Do not allow regex in aliases for performance
5652 requested = orderedSet_from_options(val, alias_dict, start=requested)
5653 continue
5654
5655 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
5656 else [val] if val in alias_dict['all'] else None)
5657 if current is None:
5658 raise ValueError(val)
5659
5660 if discard:
5661 for item in current:
5662 while item in requested:
5663 requested.remove(item)
5664 else:
5665 requested.extend(current)
5666
5667 return orderedSet(requested)
5668
5669
d0d74b71 5670class FormatSorter:
5671 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
5672
5673 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
5674 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
5675 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
5676 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
5677 'height', 'width', 'proto', 'vext', 'abr', 'aext',
5678 'fps', 'fs_approx', 'source', 'id')
5679
5680 settings = {
5681 'vcodec': {'type': 'ordered', 'regex': True,
5682 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
5683 'acodec': {'type': 'ordered', 'regex': True,
71082216 5684 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
d0d74b71 5685 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
5686 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
5687 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
5688 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
5689 'vext': {'type': 'ordered', 'field': 'video_ext',
29ca4082 5690 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
5691 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
fbb73833 5692 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
5693 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
5694 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
d0d74b71 5695 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
5696 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
5697 'field': ('vcodec', 'acodec'),
5698 'function': lambda it: int(any(v != 'none' for v in it))},
5699 'ie_pref': {'priority': True, 'type': 'extractor'},
5700 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
5701 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
5702 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
5703 'quality': {'convert': 'float', 'default': -1},
5704 'filesize': {'convert': 'bytes'},
5705 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
5706 'id': {'convert': 'string', 'field': 'format_id'},
5707 'height': {'convert': 'float_none'},
5708 'width': {'convert': 'float_none'},
5709 'fps': {'convert': 'float_none'},
5710 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
5711 'tbr': {'convert': 'float_none'},
5712 'vbr': {'convert': 'float_none'},
5713 'abr': {'convert': 'float_none'},
5714 'asr': {'convert': 'float_none'},
5715 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
5716
5717 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
5718 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
5719 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
5720 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
5721 'res': {'type': 'multiple', 'field': ('height', 'width'),
5722 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
5723
5724 # Actual field names
5725 'format_id': {'type': 'alias', 'field': 'id'},
5726 'preference': {'type': 'alias', 'field': 'ie_pref'},
5727 'language_preference': {'type': 'alias', 'field': 'lang'},
5728 'source_preference': {'type': 'alias', 'field': 'source'},
5729 'protocol': {'type': 'alias', 'field': 'proto'},
5730 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
5731 'audio_channels': {'type': 'alias', 'field': 'channels'},
5732
5733 # Deprecated
5734 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
5735 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
5736 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
5737 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
5738 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
5739 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
5740 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
5741 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
5742 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
5743 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
5744 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
5745 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
5746 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
5747 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
5748 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5749 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5750 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5751 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5752 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5753 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5754 }
5755
5756 def __init__(self, ydl, field_preference):
5757 self.ydl = ydl
5758 self._order = []
5759 self.evaluate_params(self.ydl.params, field_preference)
5760 if ydl.params.get('verbose'):
5761 self.print_verbose_info(self.ydl.write_debug)
5762
5763 def _get_field_setting(self, field, key):
5764 if field not in self.settings:
5765 if key in ('forced', 'priority'):
5766 return False
5767 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
5768 'deprecated and may be removed in a future version')
5769 self.settings[field] = {}
5770 propObj = self.settings[field]
5771 if key not in propObj:
5772 type = propObj.get('type')
5773 if key == 'field':
5774 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
5775 elif key == 'convert':
5776 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
5777 else:
5778 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
5779 propObj[key] = default
5780 return propObj[key]
5781
5782 def _resolve_field_value(self, field, value, convertNone=False):
5783 if value is None:
5784 if not convertNone:
5785 return None
5786 else:
5787 value = value.lower()
5788 conversion = self._get_field_setting(field, 'convert')
5789 if conversion == 'ignore':
5790 return None
5791 if conversion == 'string':
5792 return value
5793 elif conversion == 'float_none':
5794 return float_or_none(value)
5795 elif conversion == 'bytes':
5796 return parse_bytes(value)
5797 elif conversion == 'order':
5798 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
5799 use_regex = self._get_field_setting(field, 'regex')
5800 list_length = len(order_list)
5801 empty_pos = order_list.index('') if '' in order_list else list_length + 1
5802 if use_regex and value is not None:
5803 for i, regex in enumerate(order_list):
5804 if regex and re.match(regex, value):
5805 return list_length - i
5806 return list_length - empty_pos # not in list
5807 else: # not regex or value = None
5808 return list_length - (order_list.index(value) if value in order_list else empty_pos)
5809 else:
5810 if value.isnumeric():
5811 return float(value)
5812 else:
5813 self.settings[field]['convert'] = 'string'
5814 return value
5815
5816 def evaluate_params(self, params, sort_extractor):
5817 self._use_free_order = params.get('prefer_free_formats', False)
5818 self._sort_user = params.get('format_sort', [])
5819 self._sort_extractor = sort_extractor
5820
5821 def add_item(field, reverse, closest, limit_text):
5822 field = field.lower()
5823 if field in self._order:
5824 return
5825 self._order.append(field)
5826 limit = self._resolve_field_value(field, limit_text)
5827 data = {
5828 'reverse': reverse,
5829 'closest': False if limit is None else closest,
5830 'limit_text': limit_text,
5831 'limit': limit}
5832 if field in self.settings:
5833 self.settings[field].update(data)
5834 else:
5835 self.settings[field] = data
5836
5837 sort_list = (
5838 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5839 + (tuple() if params.get('format_sort_force', False)
5840 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5841 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5842
5843 for item in sort_list:
5844 match = re.match(self.regex, item)
5845 if match is None:
5846 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
5847 field = match.group('field')
5848 if field is None:
5849 continue
5850 if self._get_field_setting(field, 'type') == 'alias':
5851 alias, field = field, self._get_field_setting(field, 'field')
5852 if self._get_field_setting(alias, 'deprecated'):
5853 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5854 f'be removed in a future version. Please use {field} instead')
5855 reverse = match.group('reverse') is not None
5856 closest = match.group('separator') == '~'
5857 limit_text = match.group('limit')
5858
5859 has_limit = limit_text is not None
5860 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5861 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5862
5863 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
5864 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
5865 limit_count = len(limits)
5866 for (i, f) in enumerate(fields):
5867 add_item(f, reverse, closest,
5868 limits[i] if i < limit_count
5869 else limits[0] if has_limit and not has_multiple_limits
5870 else None)
5871
5872 def print_verbose_info(self, write_debug):
5873 if self._sort_user:
5874 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
5875 if self._sort_extractor:
5876 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
5877 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
5878 '+' if self._get_field_setting(field, 'reverse') else '', field,
5879 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
5880 self._get_field_setting(field, 'limit_text'),
5881 self._get_field_setting(field, 'limit'))
5882 if self._get_field_setting(field, 'limit_text') is not None else '')
5883 for field in self._order if self._get_field_setting(field, 'visible')]))
5884
5885 def _calculate_field_preference_from_value(self, format, field, type, value):
5886 reverse = self._get_field_setting(field, 'reverse')
5887 closest = self._get_field_setting(field, 'closest')
5888 limit = self._get_field_setting(field, 'limit')
5889
5890 if type == 'extractor':
5891 maximum = self._get_field_setting(field, 'max')
5892 if value is None or (maximum is not None and value >= maximum):
5893 value = -1
5894 elif type == 'boolean':
5895 in_list = self._get_field_setting(field, 'in_list')
5896 not_in_list = self._get_field_setting(field, 'not_in_list')
5897 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
5898 elif type == 'ordered':
5899 value = self._resolve_field_value(field, value, True)
5900
5901 # try to convert to number
5902 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
5903 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
5904 if is_num:
5905 value = val_num
5906
5907 return ((-10, 0) if value is None
5908 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
5909 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
5910 else (0, value, 0) if not reverse and (limit is None or value <= limit)
5911 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
5912 else (-1, value, 0))
5913
5914 def _calculate_field_preference(self, format, field):
5915 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
5916 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
5917 if type == 'multiple':
5918 type = 'field' # Only 'field' is allowed in multiple for now
5919 actual_fields = self._get_field_setting(field, 'field')
5920
5921 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
5922 else:
5923 value = get_value(field)
5924 return self._calculate_field_preference_from_value(format, field, type, value)
5925
5926 def calculate_preference(self, format):
5927 # Determine missing protocol
5928 if not format.get('protocol'):
5929 format['protocol'] = determine_protocol(format)
5930
5931 # Determine missing ext
5932 if not format.get('ext') and 'url' in format:
5933 format['ext'] = determine_ext(format['url'])
5934 if format.get('vcodec') == 'none':
5935 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
5936 format['video_ext'] = 'none'
5937 else:
5938 format['video_ext'] = format['ext']
5939 format['audio_ext'] = 'none'
5940 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
5941 # format['preference'] = -1000
5942
5424dbaf
L
5943 if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
5944 # HEVC-over-FLV is not part of the original FLV spec
5945 # ref. https://trac.ffmpeg.org/ticket/6389
5946 # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
5947 format['preference'] = -100
5948
d0d74b71 5949 # Determine missing bitrates
5950 if format.get('tbr') is None:
5951 if format.get('vbr') is not None and format.get('abr') is not None:
5952 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
5953 else:
5954 if format.get('vcodec') != 'none' and format.get('vbr') is None:
5955 format['vbr'] = format.get('tbr') - format.get('abr', 0)
5956 if format.get('acodec') != 'none' and format.get('abr') is None:
5957 format['abr'] = format.get('tbr') - format.get('vbr', 0)
5958
5959 return tuple(self._calculate_field_preference(format, field) for field in self._order)