6929b41a 1import asyncio
15dfb392 2import atexit
1e399778 3import base64
5bc880b9 4import binascii
912b38b4 5import calendar
676eb3f2 6import codecs
c380cc28 7import collections
ab029d7e 8import collections.abc
62e609ab 9import contextlib
c496ca96 10import datetime
0c265486 11import email.header
f8271158 12import email.utils
f45c185f 13import errno
d77c3dfd 14import gzip
15import hashlib
16import hmac
ac668111 17import html.entities
18import html.parser
54007a45 19import http.client
20import http.cookiejar
b1f94422 21import inspect
03f9daab 22import io
79a2e94e 23import itertools
f4bfd65f 24import json
d77c3dfd 25import locale
02dbf93f 26import math
f8271158 27import mimetypes
db3ad8a6 28import netrc
347de493 29import operator
d77c3dfd 30import os
c496ca96 31import platform
773f291d 32import random
d77c3dfd 33import re
f8271158 34import shlex
c496ca96 35import socket
79a2e94e 36import ssl
ac668111 37import struct
1c088fa8 38import subprocess
d77c3dfd 39import sys
181c8655 40import tempfile
c380cc28 41import time
01951dda 42import traceback
64fa820c 43import types
989a01c2 44import unicodedata
14f25df2 45import urllib.error
f8271158 46import urllib.parse
ac668111 47import urllib.request
bcf89ce6 48import xml.etree.ElementTree
d77c3dfd 49import zlib
d77c3dfd 50
69bec673 51from . import traversal
52
53from ..compat import functools # isort: split
54from ..compat import (
36e6f62c 55 compat_etree_fromstring,
51098426 56 compat_expanduser,
f8271158 57 compat_HTMLParseError,
efa97bdc 58 compat_os_name,
702ccf2d 59 compat_shlex_quote,
8c25f81b 60)
69bec673 61from ..dependencies import brotli, certifi, websockets, xattr
62from ..socks import ProxyType, sockssocket
51fb4995 63
46f1370e 64__name__ = __name__.rsplit('.', 1)[0] # Pretend to be the parent module
65
66# This is not clearly defined otherwise
67compiled_regex_type = type(re.compile(''))
68
69
70def random_user_agent():
71 _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
72 _CHROME_VERSIONS = (
19b4c74d 73 '90.0.4430.212',
74 '90.0.4430.24',
75 '90.0.4430.70',
76 '90.0.4430.72',
77 '90.0.4430.85',
78 '90.0.4430.93',
79 '91.0.4472.101',
80 '91.0.4472.106',
81 '91.0.4472.114',
82 '91.0.4472.124',
83 '91.0.4472.164',
84 '91.0.4472.19',
85 '91.0.4472.77',
86 '92.0.4515.107',
87 '92.0.4515.115',
88 '92.0.4515.131',
89 '92.0.4515.159',
90 '92.0.4515.43',
91 '93.0.4556.0',
92 '93.0.4577.15',
93 '93.0.4577.63',
94 '93.0.4577.82',
95 '94.0.4606.41',
96 '94.0.4606.54',
97 '94.0.4606.61',
98 '94.0.4606.71',
99 '94.0.4606.81',
100 '94.0.4606.85',
101 '95.0.4638.17',
102 '95.0.4638.50',
103 '95.0.4638.54',
104 '95.0.4638.69',
105 '95.0.4638.74',
106 '96.0.4664.18',
107 '96.0.4664.45',
108 '96.0.4664.55',
109 '96.0.4664.93',
110 '97.0.4692.20',
111 )
112 return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
113
114
4390d5ec 115SUPPORTED_ENCODINGS = [
116 'gzip', 'deflate'
117]
9b8ee23b 118if brotli:
4390d5ec 119 SUPPORTED_ENCODINGS.append('br')
120
3e669f36 121std_headers = {
f7a147e3 122 'User-Agent': random_user_agent(),
59ae15a5 123 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
59ae15a5 124 'Accept-Language': 'en-us,en;q=0.5',
b1156c1e 125 'Sec-Fetch-Mode': 'navigate',
3e669f36 126}
f427df17 127
5f6a1245 128
129USER_AGENTS = {
130 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
131}
132
133
4823ec9f 134class NO_DEFAULT:
135 pass
136
137
138def IDENTITY(x):
139 return x
140
bf42a990 141
142ENGLISH_MONTH_NAMES = [
143 'January', 'February', 'March', 'April', 'May', 'June',
144 'July', 'August', 'September', 'October', 'November', 'December']
145
146MONTH_NAMES = {
147 'en': ENGLISH_MONTH_NAMES,
148 'fr': [
149 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
150 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
78545664 151 # these follow the genitive grammatical case (dopełniacz)
152 # some websites might be using nominative, which will require another month list
153 # https://en.wikibooks.org/wiki/Polish/Noun_cases
154 'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
155 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
f6717dec 156}
a942d6cb 157
8f53dc44 158# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
159TIMEZONE_NAMES = {
160 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
161 'AST': -4, 'ADT': -3, # Atlantic (used in Canada)
162 'EST': -5, 'EDT': -4, # Eastern
163 'CST': -6, 'CDT': -5, # Central
164 'MST': -7, 'MDT': -6, # Mountain
165 'PST': -8, 'PDT': -7 # Pacific
166}
167
c587cbb7 168# needed for sanitizing filenames in restricted mode
c8827027 169ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
170 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
171 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 172
173DATE_FORMATS = (
174 '%d %B %Y',
175 '%d %b %Y',
176 '%B %d %Y',
177 '%B %dst %Y',
178 '%B %dnd %Y',
9d30c213 179 '%B %drd %Y',
cb655f34 180 '%B %dth %Y',
46f59e89 181 '%b %d %Y',
182 '%b %dst %Y',
183 '%b %dnd %Y',
9d30c213 184 '%b %drd %Y',
cb655f34 185 '%b %dth %Y',
186 '%b %dst %Y %I:%M',
187 '%b %dnd %Y %I:%M',
9d30c213 188 '%b %drd %Y %I:%M',
189 '%b %dth %Y %I:%M',
190 '%Y %m %d',
191 '%Y-%m-%d',
bccdbd22 192 '%Y.%m.%d.',
46f59e89 193 '%Y/%m/%d',
81c13222 194 '%Y/%m/%d %H:%M',
46f59e89 195 '%Y/%m/%d %H:%M:%S',
196 '%Y%m%d%H%M',
197 '%Y%m%d%H%M%S',
4f3fa23e 198 '%Y%m%d',
0c1c6f4b 199 '%Y-%m-%d %H:%M',
200 '%Y-%m-%d %H:%M:%S',
201 '%Y-%m-%d %H:%M:%S.%f',
5014558a 202 '%Y-%m-%d %H:%M:%S:%f',
203 '%d.%m.%Y %H:%M',
204 '%d.%m.%Y %H.%M',
205 '%Y-%m-%dT%H:%M:%SZ',
206 '%Y-%m-%dT%H:%M:%S.%fZ',
207 '%Y-%m-%dT%H:%M:%S.%f0Z',
208 '%Y-%m-%dT%H:%M:%S',
209 '%Y-%m-%dT%H:%M:%S.%f',
210 '%Y-%m-%dT%H:%M',
211 '%b %d %Y at %H:%M',
212 '%b %d %Y at %H:%M:%S',
213 '%B %d %Y at %H:%M',
214 '%B %d %Y at %H:%M:%S',
a63d9bd0 215 '%H:%M %d-%b-%Y',
216)
217
218DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
219DATE_FORMATS_DAY_FIRST.extend([
220 '%d-%m-%Y',
221 '%d.%m.%Y',
222 '%d.%m.%y',
223 '%d/%m/%Y',
224 '%d/%m/%y',
225 '%d/%m/%Y %H:%M:%S',
47304e07 226 '%d-%m-%Y %H:%M',
4cbfa570 227 '%H:%M %d/%m/%Y',
228])
229
230DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
231DATE_FORMATS_MONTH_FIRST.extend([
232 '%m-%d-%Y',
233 '%m.%d.%Y',
234 '%m/%d/%Y',
235 '%m/%d/%y',
236 '%m/%d/%Y %H:%M:%S',
237])
238
06b3fe29 239PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
0f60ba6e 240JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
06b3fe29 241
1d485a1a 242NUMBER_RE = r'\d+(?:\.\d+)?'
243
7105440c 244
0b9c08b4 245@functools.cache
d77c3dfd 246def preferredencoding():
59ae15a5 247 """Get preferred encoding.
d77c3dfd 248
249 Returns the best encoding scheme for the system, based on
250 locale.getpreferredencoding() and some further tweaks.
251 """
252 try:
253 pref = locale.getpreferredencoding()
28e614de 254 'TEST'.encode(pref)
70a1165b 255 except Exception:
59ae15a5 256 pref = 'UTF-8'
bae611f2 257
59ae15a5 258 return pref
d77c3dfd 259
f4bfd65f 260
181c8655 261def write_json_file(obj, fn):
1394646a 262 """ Encode obj as JSON and write it to fn, atomically if possible """
181c8655 263
cfb0511d 264 tf = tempfile.NamedTemporaryFile(
265 prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
266 suffix='.tmp', delete=False, mode='w', encoding='utf-8')
267
268 try:
269 with tf:
45d86abe 270 json.dump(obj, tf, ensure_ascii=False)
271 if sys.platform == 'win32':
272 # Need to remove existing file on Windows, else os.rename raises
273 # WindowsError or FileExistsError.
19a03940 274 with contextlib.suppress(OSError):
1394646a 275 os.unlink(fn)
19a03940 276 with contextlib.suppress(OSError):
277 mask = os.umask(0)
278 os.umask(mask)
279 os.chmod(tf.name, 0o666 & ~mask)
181c8655 280 os.rename(tf.name, fn)
70a1165b 281 except Exception:
19a03940 282 with contextlib.suppress(OSError):
181c8655 283 os.remove(tf.name)
284 raise
285
286
cfb0511d 287def find_xpath_attr(node, xpath, key, val=None):
288 """ Find the xpath xpath[@key=val] """
289 assert re.match(r'^[a-zA-Z_-]+$', key)
86e5f3ed 290 expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
cfb0511d 291 return node.find(expr)
59ae56fa 292
293# On python2.6 the xml.etree.ElementTree.Element methods don't support
294# the namespace parameter
295
296
297def xpath_with_ns(path, ns_map):
298 components = [c.split(':') for c in path.split('/')]
299 replaced = []
300 for c in components:
301 if len(c) == 1:
302 replaced.append(c[0])
303 else:
304 ns, tag = c
305 replaced.append('{%s}%s' % (ns_map[ns], tag))
306 return '/'.join(replaced)
307
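# Example (illustrative sketch; the MRSS namespace URI below is just an assumed value):
#   >>> xpath_with_ns('media:group/media:title', {'media': 'http://search.yahoo.com/mrss/'})
#   '{http://search.yahoo.com/mrss/}group/{http://search.yahoo.com/mrss/}title'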
d77c3dfd 308
a41fb80c 309def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
578c0745 310 def _find_xpath(xpath):
f9934b96 311 return node.find(xpath)
578c0745 312
14f25df2 313 if isinstance(xpath, str):
314 n = _find_xpath(xpath)
315 else:
316 for xp in xpath:
317 n = _find_xpath(xp)
318 if n is not None:
319 break
d74bebd5 320
8e636da4 321 if n is None:
322 if default is not NO_DEFAULT:
323 return default
324 elif fatal:
325 name = xpath if name is None else name
326 raise ExtractorError('Could not find XML element %s' % name)
327 else:
328 return None
329 return n
330
331
332def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
333 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
334 if n is None or n == default:
335 return n
336 if n.text is None:
337 if default is not NO_DEFAULT:
338 return default
339 elif fatal:
340 name = xpath if name is None else name
341 raise ExtractorError('Could not find XML element\'s text %s' % name)
342 else:
343 return None
344 return n.text
345
346
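# Example (illustrative sketch using the xpath helpers above; markup is made up):
#   >>> doc = compat_etree_fromstring('<root><a><b>1</b></a></root>')
#   >>> xpath_text(doc, './a/b')
#   '1'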
347def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
348 n = find_xpath_attr(node, xpath, key)
349 if n is None:
350 if default is not NO_DEFAULT:
351 return default
352 elif fatal:
86e5f3ed 353 name = f'{xpath}[@{key}]' if name is None else name
354 raise ExtractorError('Could not find XML attribute %s' % name)
355 else:
356 return None
357 return n.attrib[key]
358
359
c487cf00 360def get_element_by_id(id, html, **kwargs):
43e8fafd 361 """Return the content of the tag with the specified ID in the passed HTML document"""
c487cf00 362 return get_element_by_attribute('id', id, html, **kwargs)
43e8fafd 363
12ea2f30 364
c487cf00 365def get_element_html_by_id(id, html, **kwargs):
6f32a0b5 366 """Return the html of the tag with the specified ID in the passed HTML document"""
c487cf00 367 return get_element_html_by_attribute('id', id, html, **kwargs)
368
369
84c237fb 370def get_element_by_class(class_name, html):
371 """Return the content of the first tag with the specified class in the passed HTML document"""
372 retval = get_elements_by_class(class_name, html)
373 return retval[0] if retval else None
374
375
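# Example (illustrative sketch; markup is made up):
#   >>> get_element_by_class('title', '<div class="title">Video</div>')
#   'Video'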
376def get_element_html_by_class(class_name, html):
377 """Return the html of the first tag with the specified class in the passed HTML document"""
378 retval = get_elements_html_by_class(class_name, html)
379 return retval[0] if retval else None
380
381
c487cf00 382def get_element_by_attribute(attribute, value, html, **kwargs):
383 retval = get_elements_by_attribute(attribute, value, html, **kwargs)
384 return retval[0] if retval else None
385
386
c487cf00 387def get_element_html_by_attribute(attribute, value, html, **kargs):
388 retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
389 return retval[0] if retval else None
390
391
c487cf00 392def get_elements_by_class(class_name, html, **kargs):
393 """Return the content of all tags with the specified class in the passed HTML document as a list"""
394 return get_elements_by_attribute(
64fa820c 395 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
396 html, escape_value=False)
397
398
399def get_elements_html_by_class(class_name, html):
400 """Return the html of all tags with the specified class in the passed HTML document as a list"""
401 return get_elements_html_by_attribute(
64fa820c 402 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
403 html, escape_value=False)
404
405
406def get_elements_by_attribute(*args, **kwargs):
43e8fafd 407 """Return the content of the tag with the specified attribute in the passed HTML document"""
408 return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
409
410
411def get_elements_html_by_attribute(*args, **kwargs):
 412 """Return the html of all tags with the specified attribute in the passed HTML document, as a list"""
413 return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
414
415
4c9a1a3b 416def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
417 """
418 Return the text (content) and the html (whole) of the tag with the specified
419 attribute in the passed HTML document
420 """
421 if not value:
422 return
9e6dd238 423
86e5f3ed 424 quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
0254f162 425
426 value = re.escape(value) if escape_value else value
427
86e5f3ed 428 partial_element_re = rf'''(?x)
4c9a1a3b 429 <(?P<tag>{tag})
0254f162 430 (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
86e5f3ed 431 \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
432 '''
38285056 433
434 for m in re.finditer(partial_element_re, html):
435 content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
a921f407 436
437 yield (
438 unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
439 whole
440 )
a921f407 441
c5229f39 442
ac668111 443class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
444 """
445 HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
446 closing tag for the first opening tag it has encountered, and can be used
447 as a context manager
448 """
449
450 class HTMLBreakOnClosingTagException(Exception):
451 pass
452
453 def __init__(self):
454 self.tagstack = collections.deque()
ac668111 455 html.parser.HTMLParser.__init__(self)
456
457 def __enter__(self):
458 return self
459
460 def __exit__(self, *_):
461 self.close()
462
463 def close(self):
464 # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
465 # so data remains buffered; we no longer have any interest in it, thus
466 # override this method to discard it
467 pass
468
469 def handle_starttag(self, tag, _):
470 self.tagstack.append(tag)
471
472 def handle_endtag(self, tag):
473 if not self.tagstack:
474 raise compat_HTMLParseError('no tags in the stack')
475 while self.tagstack:
476 inner_tag = self.tagstack.pop()
477 if inner_tag == tag:
478 break
479 else:
480 raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
481 if not self.tagstack:
482 raise self.HTMLBreakOnClosingTagException()
483
484
46d09f87 485# XXX: This should be far less strict
486def get_element_text_and_html_by_tag(tag, html):
487 """
488 For the first element with the specified tag in the passed HTML document
 489 return its content (text) and the whole element (html)
490 """
491 def find_or_raise(haystack, needle, exc):
492 try:
493 return haystack.index(needle)
494 except ValueError:
495 raise exc
496 closing_tag = f'</{tag}>'
497 whole_start = find_or_raise(
498 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
499 content_start = find_or_raise(
500 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
501 content_start += whole_start + 1
502 with HTMLBreakOnClosingTagParser() as parser:
503 parser.feed(html[whole_start:content_start])
504 if not parser.tagstack or parser.tagstack[0] != tag:
505 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
506 offset = content_start
507 while offset < len(html):
508 next_closing_tag_start = find_or_raise(
509 html[offset:], closing_tag,
510 compat_HTMLParseError(f'closing {tag} tag not found'))
511 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
512 try:
513 parser.feed(html[offset:offset + next_closing_tag_end])
514 offset += next_closing_tag_end
515 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
516 return html[content_start:offset + next_closing_tag_start], \
517 html[whole_start:offset + next_closing_tag_end]
518 raise compat_HTMLParseError('unexpected end of html')
519
520
ac668111 521class HTMLAttributeParser(html.parser.HTMLParser):
8bb56eee 522 """Trivial HTML parser to gather the attributes for a single element"""
b6e0c7d2 523
8bb56eee 524 def __init__(self):
c5229f39 525 self.attrs = {}
ac668111 526 html.parser.HTMLParser.__init__(self)
527
528 def handle_starttag(self, tag, attrs):
529 self.attrs = dict(attrs)
7053aa3a 530 raise compat_HTMLParseError('done')
8bb56eee 531
c5229f39 532
ac668111 533class HTMLListAttrsParser(html.parser.HTMLParser):
534 """HTML parser to gather the attributes for the elements of a list"""
535
536 def __init__(self):
ac668111 537 html.parser.HTMLParser.__init__(self)
538 self.items = []
539 self._level = 0
540
541 def handle_starttag(self, tag, attrs):
542 if tag == 'li' and self._level == 0:
543 self.items.append(dict(attrs))
544 self._level += 1
545
546 def handle_endtag(self, tag):
547 self._level -= 1
548
549
550def extract_attributes(html_element):
551 """Given a string for an HTML element such as
552 <el
 553 a="foo" B="bar" c="&#98;az" d=boz
554 empty= noval entity="&amp;"
555 sq='"' dq="'"
556 >
557 Decode and return a dictionary of attributes.
558 {
 559 'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
560 'empty': '', 'noval': None, 'entity': '&',
561 'sq': '"', 'dq': '\''
562 }.
563 """
564 parser = HTMLAttributeParser()
19a03940 565 with contextlib.suppress(compat_HTMLParseError):
566 parser.feed(html_element)
567 parser.close()
8bb56eee 568 return parser.attrs
9e6dd238 569
c5229f39 570
571def parse_list(webpage):
 572 """Given a string for a series of HTML <li> elements,
 573 return a list of their attribute dictionaries"""
574 parser = HTMLListAttrsParser()
575 parser.feed(webpage)
576 parser.close()
577 return parser.items
578
579
9e6dd238 580def clean_html(html):
59ae15a5 581 """Clean an HTML snippet into a readable string"""
582
583 if html is None: # Convenience for sanitizing descriptions etc.
584 return html
585
49185227 586 html = re.sub(r'\s+', ' ', html)
587 html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
588 html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
589 # Strip html tags
590 html = re.sub('<.*?>', '', html)
591 # Replace html entities
592 html = unescapeHTML(html)
7decf895 593 return html.strip()
594
595
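# Example (illustrative sketch):
#   >>> clean_html('<p>Hello&nbsp;<b>world</b></p> ')
#   'Hello\xa0world'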
b7c47b74 596class LenientJSONDecoder(json.JSONDecoder):
cc090836 597 # TODO: Write tests
598 def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
b7c47b74 599 self.transform_source, self.ignore_extra = transform_source, ignore_extra
cc090836 600 self._close_attempts = 2 * close_objects
b7c47b74 601 super().__init__(*args, **kwargs)
602
cc090836 603 @staticmethod
604 def _close_object(err):
605 doc = err.doc[:err.pos]
606 # We need to add comma first to get the correct error message
607 if err.msg.startswith('Expecting \',\''):
608 return doc + ','
609 elif not doc.endswith(','):
610 return
611
612 if err.msg.startswith('Expecting property name'):
613 return doc[:-1] + '}'
614 elif err.msg.startswith('Expecting value'):
615 return doc[:-1] + ']'
616
b7c47b74 617 def decode(self, s):
618 if self.transform_source:
619 s = self.transform_source(s)
cc090836 620 for attempt in range(self._close_attempts + 1):
621 try:
622 if self.ignore_extra:
623 return self.raw_decode(s.lstrip())[0]
624 return super().decode(s)
625 except json.JSONDecodeError as e:
626 if e.pos is None:
627 raise
628 elif attempt < self._close_attempts:
629 s = self._close_object(e)
630 if s is not None:
631 continue
2fa669f7 632 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
cc090836 633 assert False, 'Too many attempts to decode JSON'
b7c47b74 634
635
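# Example (illustrative sketch of recovering truncated JSON via close_objects; not an exhaustive test):
#   >>> LenientJSONDecoder(close_objects=1).decode('{"a": 1')
#   {'a': 1}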
d77c3dfd 636def sanitize_open(filename, open_mode):
637 """Try to open the given filename, and slightly tweak it if this fails.
638
639 Attempts to open the given filename. If this fails, it tries to change
640 the filename slightly, step by step, until it's either able to open it
641 or it fails and raises a final exception, like the standard open()
642 function.
643
644 It returns the tuple (stream, definitive_file_name).
645 """
0edb3e33 646 if filename == '-':
647 if sys.platform == 'win32':
648 import msvcrt
be5c1ae8 649
62b58c09 650 # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
daef7911 651 with contextlib.suppress(io.UnsupportedOperation):
652 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
0edb3e33 653 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
59ae15a5 654
0edb3e33 655 for attempt in range(2):
656 try:
657 try:
89737671 658 if sys.platform == 'win32':
b506289f 659 # FIXME: An exclusive lock also locks the file from being read.
660 # Since windows locks are mandatory, don't lock the file on windows (for now).
661 # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
89737671 662 raise LockingUnsupportedError()
0edb3e33 663 stream = locked_file(filename, open_mode, block=False).__enter__()
8a82af35 664 except OSError:
0edb3e33 665 stream = open(filename, open_mode)
8a82af35 666 return stream, filename
86e5f3ed 667 except OSError as err:
0edb3e33 668 if attempt or err.errno in (errno.EACCES,):
669 raise
670 old_filename, filename = filename, sanitize_path(filename)
671 if old_filename == filename:
672 raise
673
674
675def timeconvert(timestr):
676 """Convert RFC 2822 defined time string into system timestamp"""
677 timestamp = None
678 timetuple = email.utils.parsedate_tz(timestr)
679 if timetuple is not None:
680 timestamp = email.utils.mktime_tz(timetuple)
681 return timestamp
1c469a94 682
5f6a1245 683
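# Example (illustrative sketch; returns a Unix timestamp in seconds, or None if unparsable):
#   >>> timeconvert('Sun, 06 Nov 1994 08:49:37 GMT')
#   784111777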
5c3895ff 684def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
59ae15a5 685 """Sanitizes a string so it could be used as part of a filename.
5c3895ff 686 @param restricted Use a stricter subset of allowed characters
687 @param is_id Whether this is an ID that should be kept unchanged if possible.
688 If unset, yt-dlp's new sanitization rules are in effect
59ae15a5 689 """
5c3895ff 690 if s == '':
691 return ''
692
59ae15a5 693 def replace_insane(char):
694 if restricted and char in ACCENT_CHARS:
695 return ACCENT_CHARS[char]
91dd88b9 696 elif not restricted and char == '\n':
5c3895ff 697 return '\0 '
989a01c2 698 elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
699 # Replace with their full-width unicode counterparts
700 return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
91dd88b9 701 elif char == '?' or ord(char) < 32 or ord(char) == 127:
702 return ''
703 elif char == '"':
704 return '' if restricted else '\''
705 elif char == ':':
5c3895ff 706 return '\0_\0-' if restricted else '\0 \0-'
59ae15a5 707 elif char in '\\/|*<>':
5c3895ff 708 return '\0_'
709 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
710 return '\0_'
711 return char
712
db4678e4 713 # Replace look-alike Unicode glyphs
714 if restricted and (is_id is NO_DEFAULT or not is_id):
989a01c2 715 s = unicodedata.normalize('NFKC', s)
5c3895ff 716 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
28e614de 717 result = ''.join(map(replace_insane, s))
5c3895ff 718 if is_id is NO_DEFAULT:
ae61d108 719 result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result) # Remove repeated substitute chars
720 STRIP_RE = r'(?:\0.|[ _-])*'
5c3895ff 721 result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
722 result = result.replace('\0', '') or '_'
723
724 if not is_id:
725 while '__' in result:
726 result = result.replace('__', '_')
727 result = result.strip('_')
728 # Common case of "Foreign band name - English song title"
729 if restricted and result.startswith('-_'):
730 result = result[2:]
731 if result.startswith('-'):
732 result = '_' + result[len('-'):]
a7440261 733 result = result.lstrip('.')
734 if not result:
735 result = '_'
59ae15a5 736 return result
d77c3dfd 737
5f6a1245 738
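# Examples (illustrative sketch of both modes; expected values assumed from the rules above):
#   >>> sanitize_filename('foo/bar: baz', restricted=True)
#   'foo_bar_-_baz'
#   >>> sanitize_filename('foo|bar')   # unsupported characters become full-width counterparts
#   'foo｜bar'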
c2934512 739def sanitize_path(s, force=False):
a2aaf4db 740 """Sanitizes and normalizes path on Windows"""
c2934512 741 if sys.platform == 'win32':
c4218ac3 742 force = False
c2934512 743 drive_or_unc, _ = os.path.splitdrive(s)
c2934512 744 elif force:
745 drive_or_unc = ''
746 else:
a2aaf4db 747 return s
c2934512 748
749 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
750 if drive_or_unc:
751 norm_path.pop(0)
752 sanitized_path = [
ec85ded8 753 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
a2aaf4db 754 for path_part in norm_path]
755 if drive_or_unc:
756 sanitized_path.insert(0, drive_or_unc + os.path.sep)
4abea8ca 757 elif force and s and s[0] == os.path.sep:
c4218ac3 758 sanitized_path.insert(0, os.path.sep)
759 return os.path.join(*sanitized_path)
760
761
8f97a15d 762def sanitize_url(url, *, scheme='http'):
763 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
764 # the number of unwanted failures due to missing protocol
21633673 765 if url is None:
766 return
767 elif url.startswith('//'):
8f97a15d 768 return f'{scheme}:{url}'
769 # Fix some common typos seen so far
770 COMMON_TYPOS = (
067aa17e 771 # https://github.com/ytdl-org/youtube-dl/issues/15649
772 (r'^httpss://', r'https://'),
773 # https://bx1.be/lives/direct-tv/
774 (r'^rmtp([es]?)://', r'rtmp\1://'),
775 )
776 for mistake, fixup in COMMON_TYPOS:
777 if re.match(mistake, url):
778 return re.sub(mistake, fixup, url)
bc6b9bcd 779 return url
780
781
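# Examples (illustrative sketch; hosts are made up):
#   >>> sanitize_url('//cdn.example.com/video.mp4')
#   'http://cdn.example.com/video.mp4'
#   >>> sanitize_url('rmtps://media.example.com/stream')
#   'rtmps://media.example.com/stream'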
5435dcf9 782def extract_basic_auth(url):
14f25df2 783 parts = urllib.parse.urlsplit(url)
784 if parts.username is None:
785 return url, None
14f25df2 786 url = urllib.parse.urlunsplit(parts._replace(netloc=(
787 parts.hostname if parts.port is None
788 else '%s:%d' % (parts.hostname, parts.port))))
789 auth_payload = base64.b64encode(
0f06bcd7 790 ('%s:%s' % (parts.username, parts.password or '')).encode())
791 return url, f'Basic {auth_payload.decode()}'
792
793
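# Example (illustrative sketch; credentials and host are made up):
#   >>> extract_basic_auth('https://user:pass@example.com/feed')
#   ('https://example.com/feed', 'Basic dXNlcjpwYXNz')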
67dda517 794def sanitized_Request(url, *args, **kwargs):
bc6b9bcd 795 url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
796 if auth_header is not None:
797 headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
798 headers['Authorization'] = auth_header
ac668111 799 return urllib.request.Request(url, *args, **kwargs)
800
801
51098426 802def expand_path(s):
2fa669f7 803 """Expand shell variables and ~"""
804 return os.path.expandvars(compat_expanduser(s))
805
806
7e9a6125 807def orderedSet(iterable, *, lazy=False):
808 """Remove all duplicates from the input iterable"""
809 def _iter():
810 seen = [] # Do not use set since the items can be unhashable
811 for x in iterable:
812 if x not in seen:
813 seen.append(x)
814 yield x
815
816 return _iter() if lazy else list(_iter())
d77c3dfd 817
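# Example (illustrative sketch):
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]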
912b38b4 818
55b2f099 819def _htmlentity_transform(entity_with_semicolon):
4e408e47 820 """Transforms an HTML entity to a character."""
821 entity = entity_with_semicolon[:-1]
822
4e408e47 823 # Known non-numeric HTML entity
ac668111 824 if entity in html.entities.name2codepoint:
825 return chr(html.entities.name2codepoint[entity])
4e408e47 826
827 # TODO: HTML5 allows entities without a semicolon.
828 # E.g. '&Eacuteric' should be decoded as 'Éric'.
ac668111 829 if entity_with_semicolon in html.entities.html5:
830 return html.entities.html5[entity_with_semicolon]
55b2f099 831
91757b0f 832 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
833 if mobj is not None:
834 numstr = mobj.group(1)
28e614de 835 if numstr.startswith('x'):
4e408e47 836 base = 16
28e614de 837 numstr = '0%s' % numstr
838 else:
839 base = 10
067aa17e 840 # See https://github.com/ytdl-org/youtube-dl/issues/7518
19a03940 841 with contextlib.suppress(ValueError):
ac668111 842 return chr(int(numstr, base))
843
844 # Unknown entity in name, return its literal representation
7a3f0c00 845 return '&%s;' % entity
846
847
d77c3dfd 848def unescapeHTML(s):
849 if s is None:
850 return None
19a03940 851 assert isinstance(s, str)
d77c3dfd 852
4e408e47 853 return re.sub(
95f3f7c2 854 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 855
8bf48f23 856
cdb19aa4 857def escapeHTML(text):
858 return (
859 text
860 .replace('&', '&amp;')
861 .replace('<', '&lt;')
862 .replace('>', '&gt;')
863 .replace('"', '&quot;')
864 .replace("'", '&#39;')
865 )
866
867
868class netrc_from_content(netrc.netrc):
869 def __init__(self, content):
870 self.hosts, self.macros = {}, {}
871 with io.StringIO(content) as stream:
872 self._parse('-', stream, False)
873
874
d3c93ec2 875class Popen(subprocess.Popen):
876 if sys.platform == 'win32':
877 _startupinfo = subprocess.STARTUPINFO()
878 _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
879 else:
880 _startupinfo = None
881
882 @staticmethod
883 def _fix_pyinstaller_ld_path(env):
884 """Restore LD_LIBRARY_PATH when using PyInstaller
885 Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
886 https://github.com/yt-dlp/yt-dlp/issues/4573
887 """
888 if not hasattr(sys, '_MEIPASS'):
889 return
890
891 def _fix(key):
892 orig = env.get(f'{key}_ORIG')
893 if orig is None:
894 env.pop(key, None)
895 else:
896 env[key] = orig
897
898 _fix('LD_LIBRARY_PATH') # Linux
899 _fix('DYLD_LIBRARY_PATH') # macOS
900
901 def __init__(self, *args, env=None, text=False, **kwargs):
902 if env is None:
903 env = os.environ.copy()
904 self._fix_pyinstaller_ld_path(env)
905
da8e2912 906 self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
f0c9fb96 907 if text is True:
908 kwargs['universal_newlines'] = True # For 3.6 compatibility
909 kwargs.setdefault('encoding', 'utf-8')
910 kwargs.setdefault('errors', 'replace')
82ea226c 911 super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
d3c93ec2 912
913 def communicate_or_kill(self, *args, **kwargs):
8a82af35 914 try:
915 return self.communicate(*args, **kwargs)
916 except BaseException: # Including KeyboardInterrupt
f0c9fb96 917 self.kill(timeout=None)
8a82af35 918 raise
d3c93ec2 919
f0c9fb96 920 def kill(self, *, timeout=0):
921 super().kill()
922 if timeout != 0:
923 self.wait(timeout=timeout)
924
925 @classmethod
992dc6b4 926 def run(cls, *args, timeout=None, **kwargs):
f0c9fb96 927 with cls(*args, **kwargs) as proc:
da8e2912 928 default = '' if proc.__text_mode else b''
992dc6b4 929 stdout, stderr = proc.communicate_or_kill(timeout=timeout)
914491b8 930 return stdout or default, stderr or default, proc.returncode
f0c9fb96 931
d3c93ec2 932
f07b74fc 933def encodeArgument(s):
cfb0511d 934 # Legacy code that uses byte strings
935 # Uncomment the following line after fixing all post processors
14f25df2 936 # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
cfb0511d 937 return s if isinstance(s, str) else s.decode('ascii')
938
939
aa7785f8 940_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
941
942
943def timetuple_from_msec(msec):
944 secs, msec = divmod(msec, 1000)
945 mins, secs = divmod(secs, 60)
946 hrs, mins = divmod(mins, 60)
947 return _timetuple(hrs, mins, secs, msec)
948
949
cdb19aa4 950def formatSeconds(secs, delim=':', msec=False):
aa7785f8 951 time = timetuple_from_msec(secs * 1000)
952 if time.hours:
953 ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
954 elif time.minutes:
955 ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
4539dd30 956 else:
aa7785f8 957 ret = '%d' % time.seconds
958 return '%s.%03d' % (ret, time.milliseconds) if msec else ret
4539dd30 959
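# Examples (illustrative sketch):
#   >>> timetuple_from_msec(3661500)
#   Time(hours=1, minutes=1, seconds=1, milliseconds=500)
#   >>> formatSeconds(3661.5, msec=True)
#   '1:01:01.500'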
a0ddb8a2 960
77562778 961def _ssl_load_windows_store_certs(ssl_context, storename):
962 # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
963 try:
964 certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
965 if encoding == 'x509_asn' and (
966 trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
967 except PermissionError:
968 return
969 for cert in certs:
19a03940 970 with contextlib.suppress(ssl.SSLError):
77562778 971 ssl_context.load_verify_locations(cadata=cert)
a2366922 972
77562778 973
974def make_HTTPS_handler(params, **kwargs):
975 opts_check_certificate = not params.get('nocheckcertificate')
976 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
977 context.check_hostname = opts_check_certificate
f81c62a6 978 if params.get('legacyserverconnect'):
979 context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT
4f28b537 980 # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
981 context.set_ciphers('DEFAULT')
982 elif (
983 sys.version_info < (3, 10)
984 and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
985 and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
986 ):
987 # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
988 # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
989 # in some situations [2][3].
990 # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
991 # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
ac8e69dd 992 # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
993 # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
994 # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
995 # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
996 # 4. https://peps.python.org/pep-0644/
997 # 5. https://peps.python.org/pep-0644/#libressl-support
998 # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
999 context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
1000 context.minimum_version = ssl.TLSVersion.TLSv1_2
8a82af35 1001
77562778 1002 context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
1003 if opts_check_certificate:
69bec673 1004 if certifi and 'no-certifi' not in params.get('compat_opts', []):
d5820461 1005 context.load_verify_locations(cafile=certifi.where())
168bbc4f 1006 else:
1007 try:
1008 context.load_default_certs()
1009 # Work around the issue in load_default_certs when there are bad certificates. See:
1010 # https://github.com/yt-dlp/yt-dlp/issues/1060,
1011 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
1012 except ssl.SSLError:
1013 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
1014 if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
1015 for storename in ('CA', 'ROOT'):
1016 _ssl_load_windows_store_certs(context, storename)
1017 context.set_default_verify_paths()
8a82af35 1018
bb58c9ed 1019 client_certfile = params.get('client_certificate')
1020 if client_certfile:
1021 try:
1022 context.load_cert_chain(
1023 client_certfile, keyfile=params.get('client_certificate_key'),
1024 password=params.get('client_certificate_password'))
1025 except ssl.SSLError:
1026 raise YoutubeDLError('Unable to load client certificate')
2c6dcb65 1027
1028 # Some servers may reject requests if ALPN extension is not sent. See:
1029 # https://github.com/python/cpython/issues/85140
1030 # https://github.com/yt-dlp/yt-dlp/issues/3878
1031 with contextlib.suppress(NotImplementedError):
1032 context.set_alpn_protocols(['http/1.1'])
1033
77562778 1034 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 1035
732ea2f0 1036
5873d4cc 1037def bug_reports_message(before=';'):
69bec673 1038 from ..update import REPOSITORY
57e0f077 1039
1040 msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
1041 'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')
1042
1043 before = before.rstrip()
1044 if not before or before.endswith(('.', '!', '?')):
1045 msg = msg[0].title() + msg[1:]
1046
1047 return (before + ' ' if before else '') + msg
1048
1049
1050class YoutubeDLError(Exception):
1051 """Base exception for YoutubeDL errors."""
aa9369a2 1052 msg = None
1053
1054 def __init__(self, msg=None):
1055 if msg is not None:
1056 self.msg = msg
1057 elif self.msg is None:
1058 self.msg = type(self).__name__
1059 super().__init__(self.msg)
1060
1061
ac668111 1062network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
3158150c 1063if hasattr(ssl, 'CertificateError'):
1064 network_exceptions.append(ssl.CertificateError)
1065network_exceptions = tuple(network_exceptions)
1066
1067
bf5b9d85 1068class ExtractorError(YoutubeDLError):
1c256f70 1069 """Error during info extraction."""
5f6a1245 1070
1151c407 1071 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
9a82b238 1072 """ tb, if given, is the original traceback (so that it can be printed out).
7a5c1cfe 1073 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
9a82b238 1074 """
3158150c 1075 if sys.exc_info()[0] in network_exceptions:
9a82b238 1076 expected = True
d5979c5d 1077
7265a219 1078 self.orig_msg = str(msg)
1c256f70 1079 self.traceback = tb
1151c407 1080 self.expected = expected
2eabb802 1081 self.cause = cause
d11271dd 1082 self.video_id = video_id
1151c407 1083 self.ie = ie
1084 self.exc_info = sys.exc_info() # preserve original exception
5df14442 1085 if isinstance(self.exc_info[1], ExtractorError):
1086 self.exc_info = self.exc_info[1].exc_info
9bcfe33b 1087 super().__init__(self.__msg)
1151c407 1088
9bcfe33b 1089 @property
1090 def __msg(self):
1091 return ''.join((
1092 format_field(self.ie, None, '[%s] '),
1093 format_field(self.video_id, None, '%s: '),
1094 self.orig_msg,
1095 format_field(self.cause, None, ' (caused by %r)'),
1096 '' if self.expected else bug_reports_message()))
1c256f70 1097
01951dda 1098 def format_traceback(self):
497d2fab 1099 return join_nonempty(
1100 self.traceback and ''.join(traceback.format_tb(self.traceback)),
e491d06d 1101 self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
497d2fab 1102 delim='\n') or None
01951dda 1103
9bcfe33b 1104 def __setattr__(self, name, value):
1105 super().__setattr__(name, value)
1106 if getattr(self, 'msg', None) and name not in ('msg', 'args'):
1107 self.msg = self.__msg or type(self).__name__
1108 self.args = (self.msg, ) # Cannot be property
1109
1c256f70 1110
1111class UnsupportedError(ExtractorError):
1112 def __init__(self, url):
86e5f3ed 1113 super().__init__(
1114 'Unsupported URL: %s' % url, expected=True)
1115 self.url = url
1116
1117
1118class RegexNotFoundError(ExtractorError):
1119 """Error when a regex didn't match"""
1120 pass
1121
1122
1123class GeoRestrictedError(ExtractorError):
1124 """Geographic restriction Error exception.
1125
1126 This exception may be thrown when a video is not available from your
1127 geographic location due to geographic restrictions imposed by a website.
1128 """
b6e0c7d2 1129
0db3bae8 1130 def __init__(self, msg, countries=None, **kwargs):
1131 kwargs['expected'] = True
86e5f3ed 1132 super().__init__(msg, **kwargs)
1133 self.countries = countries
1134
1135
693f0600 1136class UserNotLive(ExtractorError):
1137 """Error when a channel/user is not live"""
1138
1139 def __init__(self, msg=None, **kwargs):
1140 kwargs['expected'] = True
1141 super().__init__(msg or 'The channel is not currently live', **kwargs)
1142
1143
bf5b9d85 1144class DownloadError(YoutubeDLError):
59ae15a5 1145 """Download Error exception.
d77c3dfd 1146
1147 This exception may be thrown by FileDownloader objects if they are not
1148 configured to continue on errors. They will contain the appropriate
1149 error message.
1150 """
5f6a1245 1151
1152 def __init__(self, msg, exc_info=None):
1153 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
86e5f3ed 1154 super().__init__(msg)
8cc83b8d 1155 self.exc_info = exc_info
1156
1157
498f5606 1158class EntryNotInPlaylist(YoutubeDLError):
1159 """Entry not in playlist exception.
1160
1161 This exception will be thrown by YoutubeDL when a requested entry
1162 is not found in the playlist info_dict
1163 """
aa9369a2 1164 msg = 'Entry not found in info'
498f5606 1165
1166
bf5b9d85 1167class SameFileError(YoutubeDLError):
59ae15a5 1168 """Same File exception.
d77c3dfd 1169
1170 This exception will be thrown by FileDownloader objects if they detect
1171 multiple files would have to be downloaded to the same file on disk.
1172 """
aa9369a2 1173 msg = 'Fixed output name but more than one file to download'
1174
1175 def __init__(self, filename=None):
1176 if filename is not None:
1177 self.msg += f': {filename}'
1178 super().__init__(self.msg)
1179
1180
bf5b9d85 1181class PostProcessingError(YoutubeDLError):
59ae15a5 1182 """Post Processing exception.
d77c3dfd 1183
1184 This exception may be raised by PostProcessor's .run() method to
1185 indicate an error in the postprocessing task.
1186 """
5f6a1245 1187
5f6a1245 1188
48f79687 1189class DownloadCancelled(YoutubeDLError):
1190 """ Exception raised when the download queue should be interrupted """
1191 msg = 'The download was cancelled'
8b0d7497 1192
8b0d7497 1193
48f79687 1194class ExistingVideoReached(DownloadCancelled):
1195 """ --break-on-existing triggered """
1196 msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1197
48f79687 1198
1199class RejectedVideoReached(DownloadCancelled):
fe2ce85a 1200 """ --break-match-filter triggered """
1201 msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
51d9739f 1202
1203
48f79687 1204class MaxDownloadsReached(DownloadCancelled):
59ae15a5 1205 """ --max-downloads limit has been reached. """
48f79687 1206 msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1207
1208
f2ebc5c7 1209class ReExtractInfo(YoutubeDLError):
1210 """ Video info needs to be re-extracted. """
1211
1212 def __init__(self, msg, expected=False):
1213 super().__init__(msg)
1214 self.expected = expected
1215
1216
1217class ThrottledDownload(ReExtractInfo):
48f79687 1218 """ Download speed below --throttled-rate. """
aa9369a2 1219 msg = 'The download speed is below throttle limit'
d77c3dfd 1220
43b22906 1221 def __init__(self):
1222 super().__init__(self.msg, expected=False)
f2ebc5c7 1223
d77c3dfd 1224
bf5b9d85 1225class UnavailableVideoError(YoutubeDLError):
59ae15a5 1226 """Unavailable Format exception.
d77c3dfd 1227
1228 This exception will be thrown when a video is requested
1229 in a format that is not available for that video.
1230 """
aa9369a2 1231 msg = 'Unable to download video'
1232
1233 def __init__(self, err=None):
1234 if err is not None:
1235 self.msg += f': {err}'
1236 super().__init__(self.msg)
1237
1238
bf5b9d85 1239class ContentTooShortError(YoutubeDLError):
59ae15a5 1240 """Content Too Short exception.
d77c3dfd 1241
1242 This exception may be raised by FileDownloader objects when a file they
1243 download is too small for what the server announced first, indicating
1244 the connection was probably interrupted.
1245 """
d77c3dfd 1246
59ae15a5 1247 def __init__(self, downloaded, expected):
86e5f3ed 1248 super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
2c7ed247 1249 # Both in bytes
1250 self.downloaded = downloaded
1251 self.expected = expected
d77c3dfd 1252
5f6a1245 1253
bf5b9d85 1254class XAttrMetadataError(YoutubeDLError):
efa97bdc 1255 def __init__(self, code=None, msg='Unknown error'):
86e5f3ed 1256 super().__init__(msg)
efa97bdc 1257 self.code = code
bd264412 1258 self.msg = msg
1259
1260 # Parsing code and msg
3089bc74 1261 if (self.code in (errno.ENOSPC, errno.EDQUOT)
a0566bbf 1262 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
1263 self.reason = 'NO_SPACE'
1264 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1265 self.reason = 'VALUE_TOO_LONG'
1266 else:
1267 self.reason = 'NOT_SUPPORTED'
1268
1269
bf5b9d85 1270class XAttrUnavailableError(YoutubeDLError):
1271 pass
1272
1273
c5a59d93 1274def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
f9934b96 1275 hc = http_class(*args, **kwargs)
be4a824d 1276 source_address = ydl_handler._params.get('source_address')
8959018a 1277
be4a824d 1278 if source_address is not None:
1279 # This is to workaround _create_connection() from socket where it will try all
1280 # address data from getaddrinfo() including IPv6. This filters the result from
1281 # getaddrinfo() based on the source_address value.
1282 # This is based on the cpython socket.create_connection() function.
1283 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1284 def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1285 host, port = address
1286 err = None
1287 addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
1288 af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1289 ip_addrs = [addr for addr in addrs if addr[0] == af]
1290 if addrs and not ip_addrs:
1291 ip_version = 'v4' if af == socket.AF_INET else 'v6'
86e5f3ed 1292 raise OSError(
1293 "No remote IP%s addresses available for connect, can't use '%s' as source address"
1294 % (ip_version, source_address[0]))
1295 for res in ip_addrs:
1296 af, socktype, proto, canonname, sa = res
1297 sock = None
1298 try:
1299 sock = socket.socket(af, socktype, proto)
1300 if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1301 sock.settimeout(timeout)
1302 sock.bind(source_address)
1303 sock.connect(sa)
1304 err = None # Explicitly break reference cycle
1305 return sock
86e5f3ed 1306 except OSError as _:
1307 err = _
1308 if sock is not None:
1309 sock.close()
1310 if err is not None:
1311 raise err
1312 else:
86e5f3ed 1313 raise OSError('getaddrinfo returns an empty list')
1314 if hasattr(hc, '_create_connection'):
1315 hc._create_connection = _create_connection
cfb0511d 1316 hc.source_address = (source_address, 0)
1317
1318 return hc
1319
1320
ac668111 1321class YoutubeDLHandler(urllib.request.HTTPHandler):
1322 """Handler for HTTP requests and responses.
1323
1324 This class, when installed with an OpenerDirector, automatically adds
955c8958 1325 the standard headers to every HTTP request and handles gzipped, deflated and
1326 brotli responses from web servers.
1327
1328 Part of this code was copied from:
1329
1330 http://techknack.net/python-urllib2-handlers/
1331
1332 Andrew Rowls, the author of that code, agreed to release it to the
1333 public domain.
1334 """
1335
be4a824d 1336 def __init__(self, params, *args, **kwargs):
ac668111 1337 urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
1338 self._params = params
1339
1340 def http_open(self, req):
ac668111 1341 conn_class = http.client.HTTPConnection
1342
1343 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1344 if socks_proxy:
1345 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1346 del req.headers['Ytdl-socks-proxy']
1347
be4a824d 1348 return self.do_open(functools.partial(
71aff188 1349 _create_http_connection, self, conn_class, False),
1350 req)
1351
1352 @staticmethod
1353 def deflate(data):
fc2119f2 1354 if not data:
1355 return data
1356 try:
1357 return zlib.decompress(data, -zlib.MAX_WBITS)
1358 except zlib.error:
1359 return zlib.decompress(data)
1360
4390d5ec 1361 @staticmethod
1362 def brotli(data):
1363 if not data:
1364 return data
9b8ee23b 1365 return brotli.decompress(data)
4390d5ec 1366
daafbf49 1367 @staticmethod
1368 def gz(data):
1369 gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
1370 try:
1371 return gz.read()
1372 except OSError as original_oserror:
 1373 # There may be junk at the end of the file
1374 # See http://stackoverflow.com/q/4928560/35070 for details
1375 for i in range(1, 1024):
1376 try:
1377 gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
1378 return gz.read()
1379 except OSError:
1380 continue
1381 else:
1382 raise original_oserror
1383
acebc9cd 1384 def http_request(self, req):
 1385 # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
1386 # always respected by websites, some tend to give out URLs with non percent-encoded
1387 # non-ASCII characters (see telemb.py, ard.py [#3412])
1388 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1389 # To work around aforementioned issue we will replace request's original URL with
1390 # percent-encoded one
1391 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1392 # the code of this workaround has been moved here from YoutubeDL.urlopen()
1393 url = req.get_full_url()
1394 url_escaped = escape_url(url)
1395
1396 # Substitute URL if any change after escaping
1397 if url != url_escaped:
15d260eb 1398 req = update_Request(req, url=url_escaped)
51f267d9 1399
8b7539d2 1400 for h, v in self._params.get('http_headers', std_headers).items():
1401 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1402 # The dict keys are capitalized because of this bug by urllib
1403 if h.capitalize() not in req.headers:
33ac271b 1404 req.add_header(h, v)
87f0e62d 1405
955c8958 1406 if 'Youtubedl-no-compression' in req.headers: # deprecated
1407 req.headers.pop('Youtubedl-no-compression', None)
1408 req.add_header('Accept-encoding', 'identity')
1409
af14914b 1410 if 'Accept-encoding' not in req.headers:
1411 req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
1412
379a4f16 1413 return super().do_request_(req)
59ae15a5 1414
acebc9cd 1415 def http_response(self, req, resp):
59ae15a5 1416 old_resp = resp
daafbf49 1417
1418 # Content-Encoding header lists the encodings in order that they were applied [1].
1419 # To decompress, we simply do the reverse.
1420 # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
1421 decoded_response = None
1422 for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
1423 if encoding == 'gzip':
1424 decoded_response = self.gz(decoded_response or resp.read())
1425 elif encoding == 'deflate':
1426 decoded_response = self.deflate(decoded_response or resp.read())
1427 elif encoding == 'br' and brotli:
1428 decoded_response = self.brotli(decoded_response or resp.read())
1429
1430 if decoded_response is not None:
1431 resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
4390d5ec 1432 resp.msg = old_resp.msg
ad729172 1433 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
067aa17e 1434 # https://github.com/ytdl-org/youtube-dl/issues/6457).
1435 if 300 <= resp.code < 400:
1436 location = resp.headers.get('Location')
1437 if location:
1438 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
0f06bcd7 1439 location = location.encode('iso-8859-1').decode()
1440 location_escaped = escape_url(location)
1441 if location != location_escaped:
1442 del resp.headers['Location']
1443 resp.headers['Location'] = location_escaped
59ae15a5 1444 return resp
0f8d03f8 1445
1446 https_request = http_request
1447 https_response = http_response
bf50b038 1448
5de90176 1449
1450def make_socks_conn_class(base_class, socks_proxy):
1451 assert issubclass(base_class, (
ac668111 1452 http.client.HTTPConnection, http.client.HTTPSConnection))
71aff188 1453
14f25df2 1454 url_components = urllib.parse.urlparse(socks_proxy)
1455 if url_components.scheme.lower() == 'socks5':
1456 socks_type = ProxyType.SOCKS5
1457 elif url_components.scheme.lower() in ('socks', 'socks4'):
1458 socks_type = ProxyType.SOCKS4
1459 elif url_components.scheme.lower() == 'socks4a':
1460 socks_type = ProxyType.SOCKS4A
71aff188 1461
1462 def unquote_if_non_empty(s):
1463 if not s:
1464 return s
ac668111 1465 return urllib.parse.unquote_plus(s)
cdd94c2e 1466
1467 proxy_args = (
1468 socks_type,
1469 url_components.hostname, url_components.port or 1080,
1470 True, # Remote DNS
1471 unquote_if_non_empty(url_components.username),
1472 unquote_if_non_empty(url_components.password),
1473 )
1474
1475 class SocksConnection(base_class):
1476 def connect(self):
1477 self.sock = sockssocket()
1478 self.sock.setproxy(*proxy_args)
19a03940 1479 if isinstance(self.timeout, (int, float)):
1480 self.sock.settimeout(self.timeout)
1481 self.sock.connect((self.host, self.port))
1482
ac668111 1483 if isinstance(self, http.client.HTTPSConnection):
1484 if hasattr(self, '_context'): # Python > 2.6
1485 self.sock = self._context.wrap_socket(
1486 self.sock, server_hostname=self.host)
1487 else:
1488 self.sock = ssl.wrap_socket(self.sock)
1489
1490 return SocksConnection
1491
1492
ac668111 1493class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
be4a824d 1494 def __init__(self, params, https_conn_class=None, *args, **kwargs):
ac668111 1495 urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
1496 self._https_conn_class = https_conn_class or http.client.HTTPSConnection
1497 self._params = params
1498
1499 def https_open(self, req):
4f264c02 1500 kwargs = {}
1501 conn_class = self._https_conn_class
1502
1503 if hasattr(self, '_context'): # python > 2.6
1504 kwargs['context'] = self._context
1505 if hasattr(self, '_check_hostname'): # python 3.x
1506 kwargs['check_hostname'] = self._check_hostname
1507
1508 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1509 if socks_proxy:
1510 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1511 del req.headers['Ytdl-socks-proxy']
1512
4f28b537 1513 try:
1514 return self.do_open(
1515 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1516 except urllib.error.URLError as e:
1517 if (isinstance(e.reason, ssl.SSLError)
1518 and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1519 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1520 raise
1521
1522
941e881e 1523def is_path_like(f):
1524 return isinstance(f, (str, bytes, os.PathLike))
1525
1526
ac668111 1527class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
a6420bf5 1528 def __init__(self, cookiejar=None):
ac668111 1529 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
a6420bf5
S
1530
1531 def http_response(self, request, response):
ac668111 1532 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
a6420bf5 1533
ac668111 1534 https_request = urllib.request.HTTPCookieProcessor.http_request
a6420bf5
S
1535 https_response = http_response
1536
1537
ac668111 1538class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
201c1459 1539 """YoutubeDL redirect handler
1540
1541 The code is based on HTTPRedirectHandler implementation from CPython [1].
1542
08916a49 1543 This redirect handler fixes and improves the logic to better align with RFC 7231
1544 and what browsers tend to do [2][3]
201c1459 1545
1546 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
08916a49 1547 2. https://datatracker.ietf.org/doc/html/rfc7231
1548 3. https://github.com/python/cpython/issues/91306
201c1459 1549 """
1550
ac668111 1551 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
201c1459 1552
1553 def redirect_request(self, req, fp, code, msg, headers, newurl):
08916a49 1554 if code not in (301, 302, 303, 307, 308):
14f25df2 1555 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
afac4caa 1556
08916a49 1557 new_method = req.get_method()
1558 new_data = req.data
f8b4bcc0 1559
1560 # Technically the Cookie header should be in unredirected_hdrs,
1561 # however in practice some may set it in normal headers anyway.
1562 # We will remove it here to prevent any leaks.
1563 remove_headers = ['Cookie']
1564
afac4caa 1565 # A 303 must use either GET or HEAD for the subsequent request
1566 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
08916a49 1567 if code == 303 and req.get_method() != 'HEAD':
1568 new_method = 'GET'
afac4caa 1569 # 301 and 302 redirects are commonly turned into a GET from a POST
1570 # for subsequent requests by browsers, so we'll do the same.
1571 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1572 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
08916a49 1573 elif code in (301, 302) and req.get_method() == 'POST':
1574 new_method = 'GET'
1575
1576 # only remove payload if method changed (e.g. POST to GET)
1577 if new_method != req.get_method():
1578 new_data = None
1579 remove_headers.extend(['Content-Length', 'Content-Type'])
1580
f8b4bcc0 1581 new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}
afac4caa 1582
ac668111 1583 return urllib.request.Request(
08916a49 1584 newurl, headers=new_headers, origin_req_host=req.origin_req_host,
1585 unverifiable=True, method=new_method, data=new_data)
fca6dba8
S
1586
1587
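# Illustrative sketch of the handler above (not part of the original module;
# the URLs and header values below are made up for demonstration): a 302 after
# a POST is retried as a GET, with the payload, the Content-* headers and any
# Cookie header that leaked into the normal headers dropped.
#   handler = YoutubeDLRedirectHandler()
#   req = urllib.request.Request('http://example.com/a', data=b'x',
#                                method='POST', headers={'Cookie': 'id=1'})
#   new_req = handler.redirect_request(req, None, 302, 'Found', {}, 'http://example.com/b')
#   assert new_req.get_method() == 'GET'
#   assert new_req.data is None and 'Cookie' not in new_req.headers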
46f59e89
S
1588def extract_timezone(date_str):
1589 m = re.search(
f137e4c2 1590 r'''(?x)
1591 ^.{8,}? # >=8 char non-TZ prefix, if present
1592 (?P<tz>Z| # just the UTC Z, or
1593 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1594 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1595 [ ]? # optional space
1596 (?P<sign>\+|-) # +/-
1597 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1598 $)
1599 ''', date_str)
46f59e89 1600 if not m:
8f53dc44 1601 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1602 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1603 if timezone is not None:
1604 date_str = date_str[:-len(m.group('tz'))]
1605 timezone = datetime.timedelta(hours=timezone or 0)
46f59e89
S
1606 else:
1607 date_str = date_str[:-len(m.group('tz'))]
1608 if not m.group('sign'):
1609 timezone = datetime.timedelta()
1610 else:
1611 sign = 1 if m.group('sign') == '+' else -1
1612 timezone = datetime.timedelta(
1613 hours=sign * int(m.group('hours')),
1614 minutes=sign * int(m.group('minutes')))
1615 return timezone, date_str
1616
1617
08b38d54 1618def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1619 """ Return a UNIX timestamp from the given date """
1620
1621 if date_str is None:
1622 return None
1623
52c3a6e4
S
1624 date_str = re.sub(r'\.[0-9]+', '', date_str)
1625
08b38d54 1626 if timezone is None:
46f59e89
S
1627 timezone, date_str = extract_timezone(date_str)
1628
19a03940 1629 with contextlib.suppress(ValueError):
86e5f3ed 1630 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
52c3a6e4
S
1631 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1632 return calendar.timegm(dt.timetuple())
912b38b4
PH
1633
1634
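# Illustrative examples (not part of the original module; values worked out
# from the implementation above):
#   parse_iso8601('2014-12-04T12:00:00Z')      == 1417694400
#   parse_iso8601('2014-12-04T13:00:00+01:00') == 1417694400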
46f59e89
S
1635def date_formats(day_first=True):
1636 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1637
1638
42bdd9d0 1639def unified_strdate(date_str, day_first=True):
bf50b038 1640 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1641
1642 if date_str is None:
1643 return None
bf50b038 1644 upload_date = None
5f6a1245 1645 # Replace commas
026fcc04 1646 date_str = date_str.replace(',', ' ')
42bdd9d0 1647 # Remove AM/PM + timezone
9bb8e0a3 1648 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1649 _, date_str = extract_timezone(date_str)
42bdd9d0 1650
46f59e89 1651 for expression in date_formats(day_first):
19a03940 1652 with contextlib.suppress(ValueError):
bf50b038 1653 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
42393ce2
PH
1654 if upload_date is None:
1655 timetuple = email.utils.parsedate_tz(date_str)
1656 if timetuple:
19a03940 1657 with contextlib.suppress(ValueError):
c6b9cf05 1658 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
6a750402 1659 if upload_date is not None:
14f25df2 1660 return str(upload_date)
bf50b038 1661
5f6a1245 1662
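# Illustrative examples (not part of the original module):
#   unified_strdate('December 21, 2010') == '20101221'
#   unified_strdate('8/7/2009')          == '20090708'  # day_first=True by default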
46f59e89 1663def unified_timestamp(date_str, day_first=True):
ad54c913 1664 if not isinstance(date_str, str):
46f59e89
S
1665 return None
1666
8f53dc44 1667 date_str = re.sub(r'\s+', ' ', re.sub(
1668 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
46f59e89 1669
7dc2a74e 1670 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1671 timezone, date_str = extract_timezone(date_str)
1672
1673 # Remove AM/PM + timezone
1674 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1675
deef3195
S
1676 # Remove unrecognized timezones from ISO 8601-like timestamps
1677 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1678 if m:
1679 date_str = date_str[:-len(m.group('tz'))]
1680
f226880c
PH
1681 # Python only supports microseconds, so remove nanoseconds
1682 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1683 if m:
1684 date_str = m.group(1)
1685
46f59e89 1686 for expression in date_formats(day_first):
19a03940 1687 with contextlib.suppress(ValueError):
7dc2a74e 1688 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89 1689 return calendar.timegm(dt.timetuple())
8f53dc44 1690
46f59e89
S
1691 timetuple = email.utils.parsedate_tz(date_str)
1692 if timetuple:
8f53dc44 1693 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
46f59e89
S
1694
1695
28e614de 1696def determine_ext(url, default_ext='unknown_video'):
85750f89 1697 if url is None or '.' not in url:
f4776371 1698 return default_ext
9cb9a5df 1699 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1700 if re.match(r'^[A-Za-z0-9]+$', guess):
1701 return guess
a7aaa398
S
1702 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1703 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1704 return guess.rstrip('/')
73e79f2a 1705 else:
cbdbb766 1706 return default_ext
73e79f2a 1707
5f6a1245 1708
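# Illustrative examples (not part of the original module):
#   determine_ext('http://example.com/foo/bar.mp4/?download') == 'mp4'
#   determine_ext('http://example.com/page')                  == 'unknown_video'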
824fa511
S
1709def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1710 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
d4051a8e 1711
5f6a1245 1712
9e62f283 1713def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
3d38b2d6 1714 R"""
1715 Return a datetime object from a string.
1716 Supported format:
1717 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1718
1719 @param format strftime format of DATE
1720 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1721 auto: round to the unit provided in date_str (if applicable).
9e62f283 1722 """
1723 auto_precision = False
1724 if precision == 'auto':
1725 auto_precision = True
1726 precision = 'microsecond'
396a76f7 1727 today = datetime_round(datetime.datetime.utcnow(), precision)
f8795e10 1728 if date_str in ('now', 'today'):
37254abc 1729 return today
f8795e10
PH
1730 if date_str == 'yesterday':
1731 return today - datetime.timedelta(days=1)
9e62f283 1732 match = re.match(
3d38b2d6 1733 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
9e62f283 1734 date_str)
37254abc 1735 if match is not None:
9e62f283 1736 start_time = datetime_from_str(match.group('start'), precision, format)
1737 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
37254abc 1738 unit = match.group('unit')
9e62f283 1739 if unit == 'month' or unit == 'year':
1740 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
37254abc 1741 unit = 'day'
9e62f283 1742 else:
1743 if unit == 'week':
1744 unit = 'day'
1745 time *= 7
1746 delta = datetime.timedelta(**{unit + 's': time})
1747 new_date = start_time + delta
1748 if auto_precision:
1749 return datetime_round(new_date, unit)
1750 return new_date
1751
1752 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1753
1754
d49f8db3 1755def date_from_str(date_str, format='%Y%m%d', strict=False):
3d38b2d6 1756 R"""
1757 Return a date object from a string using datetime_from_str
9e62f283 1758
3d38b2d6 1759 @param strict Restrict allowed patterns to "YYYYMMDD" and
1760 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
9e62f283 1761 """
3d38b2d6 1762 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1763 raise ValueError(f'Invalid date format "{date_str}"')
9e62f283 1764 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1765
1766
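# Illustrative examples (not part of the original module; the first result
# depends on the current UTC date):
#   date_from_str('now-1week')                        # the date seven days ago
#   date_from_str('20250101+3months').isoformat() == '2025-04-01'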
1767def datetime_add_months(dt, months):
1768 """Increment/Decrement a datetime object by months."""
1769 month = dt.month + months - 1
1770 year = dt.year + month // 12
1771 month = month % 12 + 1
1772 day = min(dt.day, calendar.monthrange(year, month)[1])
1773 return dt.replace(year, month, day)
1774
1775
1776def datetime_round(dt, precision='day'):
1777 """
1778 Round a datetime object's time to a specific precision
1779 """
1780 if precision == 'microsecond':
1781 return dt
1782
1783 unit_seconds = {
1784 'day': 86400,
1785 'hour': 3600,
1786 'minute': 60,
1787 'second': 1,
1788 }
1789 roundto = lambda x, n: ((x + n / 2) // n) * n
1790 timestamp = calendar.timegm(dt.timetuple())
1791 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
5f6a1245
JW
1792
1793
e63fc1be 1794def hyphenate_date(date_str):
1795 """
1796 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1797 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1798 if match is not None:
1799 return '-'.join(match.groups())
1800 else:
1801 return date_str
1802
5f6a1245 1803
86e5f3ed 1804class DateRange:
bd558525 1805 """Represents a time interval between two dates"""
5f6a1245 1806
bd558525
JMF
1807 def __init__(self, start=None, end=None):
1808 """start and end must be strings in the format accepted by date"""
1809 if start is not None:
d49f8db3 1810 self.start = date_from_str(start, strict=True)
bd558525
JMF
1811 else:
1812 self.start = datetime.datetime.min.date()
1813 if end is not None:
d49f8db3 1814 self.end = date_from_str(end, strict=True)
bd558525
JMF
1815 else:
1816 self.end = datetime.datetime.max.date()
37254abc 1817 if self.start > self.end:
bd558525 1818 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
5f6a1245 1819
bd558525
JMF
1820 @classmethod
1821 def day(cls, day):
1822 """Returns a range that only contains the given day"""
5f6a1245
JW
1823 return cls(day, day)
1824
bd558525
JMF
1825 def __contains__(self, date):
1826 """Check if the date is in the range"""
37254abc
JMF
1827 if not isinstance(date, datetime.date):
1828 date = date_from_str(date)
1829 return self.start <= date <= self.end
5f6a1245 1830
46f1370e 1831 def __repr__(self):
1832 return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'
c496ca96 1833
f2df4071 1834 def __eq__(self, other):
1835 return (isinstance(other, DateRange)
1836 and self.start == other.start and self.end == other.end)
1837
c496ca96 1838
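# Illustrative usage (not part of the original module):
#   '20200315' in DateRange('20200101', '20200630')  # True
#   '20210101' in DateRange('20200101', '20200630')  # False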
b1f94422 1839@functools.cache
1840def system_identifier():
1841 python_implementation = platform.python_implementation()
1842 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1843 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
dab284f8 1844 libc_ver = []
1845 with contextlib.suppress(OSError): # We may not have access to the executable
1846 libc_ver = platform.libc_ver()
b1f94422 1847
17fc3dc4 1848 return 'Python %s (%s %s %s) - %s (%s%s)' % (
b1f94422 1849 platform.python_version(),
1850 python_implementation,
17fc3dc4 1851 platform.machine(),
b1f94422 1852 platform.architecture()[0],
1853 platform.platform(),
5b9f253f
M
1854 ssl.OPENSSL_VERSION,
1855 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
b1f94422 1856 )
c257baff
PH
1857
1858
0b9c08b4 1859@functools.cache
49fa4d9a 1860def get_windows_version():
8a82af35 1861 ''' Get Windows version. returns () if it's not running on Windows '''
49fa4d9a
N
1862 if compat_os_name == 'nt':
1863 return version_tuple(platform.win32_ver()[1])
1864 else:
8a82af35 1865 return ()
49fa4d9a
N
1866
1867
734f90bb 1868def write_string(s, out=None, encoding=None):
19a03940 1869 assert isinstance(s, str)
1870 out = out or sys.stderr
3b479100
SS
1871 # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
1872 if not out:
1873 return
7459e3a2 1874
fe1daad3 1875 if compat_os_name == 'nt' and supports_terminal_sequences(out):
3fe75fdc 1876 s = re.sub(r'([\r\n]+)', r' \1', s)
59f943cd 1877
8a82af35 1878 enc, buffer = None, out
cfb0511d 1879 if 'b' in getattr(out, 'mode', ''):
c487cf00 1880 enc = encoding or preferredencoding()
104aa738 1881 elif hasattr(out, 'buffer'):
8a82af35 1882 buffer = out.buffer
104aa738 1883 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
c487cf00 1884
8a82af35 1885 buffer.write(s.encode(enc, 'ignore') if enc else s)
7459e3a2
PH
1886 out.flush()
1887
1888
da4db748 1889def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
69bec673 1890 from .. import _IN_CLI
da4db748 1891 if _IN_CLI:
1892 if msg in deprecation_warning._cache:
1893 return
1894 deprecation_warning._cache.add(msg)
1895 if printer:
1896 return printer(f'{msg}{bug_reports_message()}', **kwargs)
1897 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
1898 else:
1899 import warnings
1900 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
1901
1902
1903deprecation_warning._cache = set()
1904
1905
48ea9cea
PH
1906def bytes_to_intlist(bs):
1907 if not bs:
1908 return []
1909 if isinstance(bs[0], int): # Python 3
1910 return list(bs)
1911 else:
1912 return [ord(c) for c in bs]
1913
c257baff 1914
cba892fa 1915def intlist_to_bytes(xs):
1916 if not xs:
1917 return b''
ac668111 1918 return struct.pack('%dB' % len(xs), *xs)
c38b1e77
PH
1919
1920
8a82af35 1921class LockingUnsupportedError(OSError):
1890fc63 1922 msg = 'File locking is not supported'
0edb3e33 1923
1924 def __init__(self):
1925 super().__init__(self.msg)
1926
1927
c1c9a79c
PH
1928# Cross-platform file locking
1929if sys.platform == 'win32':
fe0918bb 1930 import ctypes
c1c9a79c
PH
1931 import ctypes.wintypes
1932 import msvcrt
1933
1934 class OVERLAPPED(ctypes.Structure):
1935 _fields_ = [
1936 ('Internal', ctypes.wintypes.LPVOID),
1937 ('InternalHigh', ctypes.wintypes.LPVOID),
1938 ('Offset', ctypes.wintypes.DWORD),
1939 ('OffsetHigh', ctypes.wintypes.DWORD),
1940 ('hEvent', ctypes.wintypes.HANDLE),
1941 ]
1942
37e325b9 1943 kernel32 = ctypes.WinDLL('kernel32')
c1c9a79c
PH
1944 LockFileEx = kernel32.LockFileEx
1945 LockFileEx.argtypes = [
1946 ctypes.wintypes.HANDLE, # hFile
1947 ctypes.wintypes.DWORD, # dwFlags
1948 ctypes.wintypes.DWORD, # dwReserved
1949 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1950 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1951 ctypes.POINTER(OVERLAPPED) # Overlapped
1952 ]
1953 LockFileEx.restype = ctypes.wintypes.BOOL
1954 UnlockFileEx = kernel32.UnlockFileEx
1955 UnlockFileEx.argtypes = [
1956 ctypes.wintypes.HANDLE, # hFile
1957 ctypes.wintypes.DWORD, # dwReserved
1958 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1959 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1960 ctypes.POINTER(OVERLAPPED) # Overlapped
1961 ]
1962 UnlockFileEx.restype = ctypes.wintypes.BOOL
1963 whole_low = 0xffffffff
1964 whole_high = 0x7fffffff
1965
747c0bd1 1966 def _lock_file(f, exclusive, block):
c1c9a79c
PH
1967 overlapped = OVERLAPPED()
1968 overlapped.Offset = 0
1969 overlapped.OffsetHigh = 0
1970 overlapped.hEvent = 0
1971 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
747c0bd1 1972
1973 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
1974 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
1975 0, whole_low, whole_high, f._lock_file_overlapped_p):
2cb19820 1976 # NB: The no-argument form of "ctypes.FormatError" does not work on PyPy
1977 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
c1c9a79c
PH
1978
1979 def _unlock_file(f):
1980 assert f._lock_file_overlapped_p
1981 handle = msvcrt.get_osfhandle(f.fileno())
747c0bd1 1982 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
c1c9a79c
PH
1983 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1984
1985else:
399a76e6
YCH
1986 try:
1987 import fcntl
c1c9a79c 1988
a3125791 1989 def _lock_file(f, exclusive, block):
b63837bc 1990 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
1991 if not block:
1992 flags |= fcntl.LOCK_NB
acea8d7c 1993 try:
b63837bc 1994 fcntl.flock(f, flags)
acea8d7c
JK
1995 except BlockingIOError:
1996 raise
1997 except OSError: # AOSP does not have flock()
b63837bc 1998 fcntl.lockf(f, flags)
c1c9a79c 1999
399a76e6 2000 def _unlock_file(f):
45998b3e
E
2001 with contextlib.suppress(OSError):
2002 return fcntl.flock(f, fcntl.LOCK_UN)
2003 with contextlib.suppress(OSError):
2004 return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock()
2005 return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking
a3125791 2006
399a76e6 2007 except ImportError:
399a76e6 2008
a3125791 2009 def _lock_file(f, exclusive, block):
0edb3e33 2010 raise LockingUnsupportedError()
399a76e6
YCH
2011
2012 def _unlock_file(f):
0edb3e33 2013 raise LockingUnsupportedError()
c1c9a79c
PH
2014
2015
86e5f3ed 2016class locked_file:
0edb3e33 2017 locked = False
747c0bd1 2018
a3125791 2019 def __init__(self, filename, mode, block=True, encoding=None):
fcfa8853
JK
2020 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2021 raise NotImplementedError(mode)
2022 self.mode, self.block = mode, block
2023
2024 writable = any(f in mode for f in 'wax+')
2025 readable = any(f in mode for f in 'r+')
2026 flags = functools.reduce(operator.ior, (
2027 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2028 getattr(os, 'O_BINARY', 0), # Windows only
2029 getattr(os, 'O_NOINHERIT', 0), # Windows only
2030 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2031 os.O_APPEND if 'a' in mode else 0,
2032 os.O_EXCL if 'x' in mode else 0,
2033 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2034 ))
2035
98804d03 2036 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
c1c9a79c
PH
2037
2038 def __enter__(self):
a3125791 2039 exclusive = 'r' not in self.mode
c1c9a79c 2040 try:
a3125791 2041 _lock_file(self.f, exclusive, self.block)
0edb3e33 2042 self.locked = True
86e5f3ed 2043 except OSError:
c1c9a79c
PH
2044 self.f.close()
2045 raise
fcfa8853 2046 if 'w' in self.mode:
131e14dc
JK
2047 try:
2048 self.f.truncate()
2049 except OSError as e:
1890fc63 2050 if e.errno not in (
2051 errno.ESPIPE, # Illegal seek - expected for FIFO
2052 errno.EINVAL, # Invalid argument - expected for /dev/null
2053 ):
2054 raise
c1c9a79c
PH
2055 return self
2056
0edb3e33 2057 def unlock(self):
2058 if not self.locked:
2059 return
c1c9a79c 2060 try:
0edb3e33 2061 _unlock_file(self.f)
c1c9a79c 2062 finally:
0edb3e33 2063 self.locked = False
c1c9a79c 2064
0edb3e33 2065 def __exit__(self, *_):
2066 try:
2067 self.unlock()
2068 finally:
2069 self.f.close()
4eb7f1d1 2070
0edb3e33 2071 open = __enter__
2072 close = __exit__
a3125791 2073
0edb3e33 2074 def __getattr__(self, attr):
2075 return getattr(self.f, attr)
a3125791 2076
0edb3e33 2077 def __iter__(self):
2078 return iter(self.f)
a3125791 2079
4eb7f1d1 2080
0b9c08b4 2081@functools.cache
4644ac55
S
2082def get_filesystem_encoding():
2083 encoding = sys.getfilesystemencoding()
2084 return encoding if encoding is not None else 'utf-8'
2085
2086
4eb7f1d1 2087def shell_quote(args):
a6a173c2 2088 quoted_args = []
4644ac55 2089 encoding = get_filesystem_encoding()
a6a173c2
JMF
2090 for a in args:
2091 if isinstance(a, bytes):
2092 # We may get a filename encoded with 'encodeFilename'
2093 a = a.decode(encoding)
aefce8e6 2094 quoted_args.append(compat_shlex_quote(a))
28e614de 2095 return ' '.join(quoted_args)
9d4660ca
PH
2096
2097
2098def smuggle_url(url, data):
2099 """ Pass additional data in a URL for internal use. """
2100
81953d1a
RA
2101 url, idata = unsmuggle_url(url, {})
2102 data.update(idata)
14f25df2 2103 sdata = urllib.parse.urlencode(
28e614de
PH
2104 {'__youtubedl_smuggle': json.dumps(data)})
2105 return url + '#' + sdata
9d4660ca
PH
2106
2107
79f82953 2108def unsmuggle_url(smug_url, default=None):
83e865a3 2109 if '#__youtubedl_smuggle' not in smug_url:
79f82953 2110 return smug_url, default
28e614de 2111 url, _, sdata = smug_url.rpartition('#')
14f25df2 2112 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
2113 data = json.loads(jsond)
2114 return url, data
02dbf93f
PH
2115
2116
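# Illustrative round trip (not part of the original module; the URL and keys
# are made up for demonstration):
#   url = smuggle_url('https://example.com/video', {'referrer': 'https://example.com/'})
#   unsmuggle_url(url) == ('https://example.com/video', {'referrer': 'https://example.com/'})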
e0fd9573 2117def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2118 """ Formats numbers with decimal sufixes like K, M, etc """
2119 num, factor = float_or_none(num), float(factor)
4c3f8c3f 2120 if num is None or num < 0:
e0fd9573 2121 return None
eeb2a770 2122 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2123 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2124 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
abbeeebc 2125 if factor == 1024:
2126 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 2127 converted = num / (factor ** exponent)
abbeeebc 2128 return fmt % (converted, suffix)
e0fd9573 2129
2130
02dbf93f 2131def format_bytes(bytes):
f02d24d8 2132 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
f53c966a 2133
1c088fa8 2134
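# Illustrative examples (not part of the original module; values worked out
# from the implementation above):
#   format_decimal_suffix(2500) == '2k'
#   format_bytes(3 * 1024)      == '3.00KiB'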
64c464a1 2135def lookup_unit_table(unit_table, s, strict=False):
2136 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
fb47597b 2137 units_re = '|'.join(re.escape(u) for u in unit_table)
64c464a1 2138 m = (re.fullmatch if strict else re.match)(
2139 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
fb47597b
S
2140 if not m:
2141 return None
64c464a1 2142
2143 num = float(m.group('num').replace(',', '.'))
fb47597b 2144 mult = unit_table[m.group('unit')]
64c464a1 2145 return round(num * mult)
2146
2147
2148def parse_bytes(s):
2149 """Parse a string indicating a byte quantity into an integer"""
2150 return lookup_unit_table(
2151 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
2152 s.upper(), strict=True)
fb47597b
S
2153
2154
be64b5b0
PH
2155def parse_filesize(s):
2156 if s is None:
2157 return None
2158
dfb1b146 2159 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
2160 # but we support those too
2161 _UNIT_TABLE = {
2162 'B': 1,
2163 'b': 1,
70852b47 2164 'bytes': 1,
be64b5b0
PH
2165 'KiB': 1024,
2166 'KB': 1000,
2167 'kB': 1024,
2168 'Kb': 1000,
13585d76 2169 'kb': 1000,
70852b47
YCH
2170 'kilobytes': 1000,
2171 'kibibytes': 1024,
be64b5b0
PH
2172 'MiB': 1024 ** 2,
2173 'MB': 1000 ** 2,
2174 'mB': 1024 ** 2,
2175 'Mb': 1000 ** 2,
13585d76 2176 'mb': 1000 ** 2,
70852b47
YCH
2177 'megabytes': 1000 ** 2,
2178 'mebibytes': 1024 ** 2,
be64b5b0
PH
2179 'GiB': 1024 ** 3,
2180 'GB': 1000 ** 3,
2181 'gB': 1024 ** 3,
2182 'Gb': 1000 ** 3,
13585d76 2183 'gb': 1000 ** 3,
70852b47
YCH
2184 'gigabytes': 1000 ** 3,
2185 'gibibytes': 1024 ** 3,
be64b5b0
PH
2186 'TiB': 1024 ** 4,
2187 'TB': 1000 ** 4,
2188 'tB': 1024 ** 4,
2189 'Tb': 1000 ** 4,
13585d76 2190 'tb': 1000 ** 4,
70852b47
YCH
2191 'terabytes': 1000 ** 4,
2192 'tebibytes': 1024 ** 4,
be64b5b0
PH
2193 'PiB': 1024 ** 5,
2194 'PB': 1000 ** 5,
2195 'pB': 1024 ** 5,
2196 'Pb': 1000 ** 5,
13585d76 2197 'pb': 1000 ** 5,
70852b47
YCH
2198 'petabytes': 1000 ** 5,
2199 'pebibytes': 1024 ** 5,
be64b5b0
PH
2200 'EiB': 1024 ** 6,
2201 'EB': 1000 ** 6,
2202 'eB': 1024 ** 6,
2203 'Eb': 1000 ** 6,
13585d76 2204 'eb': 1000 ** 6,
70852b47
YCH
2205 'exabytes': 1000 ** 6,
2206 'exbibytes': 1024 ** 6,
be64b5b0
PH
2207 'ZiB': 1024 ** 7,
2208 'ZB': 1000 ** 7,
2209 'zB': 1024 ** 7,
2210 'Zb': 1000 ** 7,
13585d76 2211 'zb': 1000 ** 7,
70852b47
YCH
2212 'zettabytes': 1000 ** 7,
2213 'zebibytes': 1024 ** 7,
be64b5b0
PH
2214 'YiB': 1024 ** 8,
2215 'YB': 1000 ** 8,
2216 'yB': 1024 ** 8,
2217 'Yb': 1000 ** 8,
13585d76 2218 'yb': 1000 ** 8,
70852b47
YCH
2219 'yottabytes': 1000 ** 8,
2220 'yobibytes': 1024 ** 8,
be64b5b0
PH
2221 }
2222
fb47597b
S
2223 return lookup_unit_table(_UNIT_TABLE, s)
2224
2225
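# Illustrative examples (not part of the original module):
#   parse_filesize('1.2Tb')  == 1200000000000
#   parse_filesize('5 MiB')  == 5242880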
2226def parse_count(s):
2227 if s is None:
be64b5b0
PH
2228 return None
2229
352d5da8 2230 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
2231
2232 if re.match(r'^[\d,.]+$', s):
2233 return str_to_int(s)
2234
2235 _UNIT_TABLE = {
2236 'k': 1000,
2237 'K': 1000,
2238 'm': 1000 ** 2,
2239 'M': 1000 ** 2,
2240 'kk': 1000 ** 2,
2241 'KK': 1000 ** 2,
352d5da8 2242 'b': 1000 ** 3,
2243 'B': 1000 ** 3,
fb47597b 2244 }
be64b5b0 2245
352d5da8 2246 ret = lookup_unit_table(_UNIT_TABLE, s)
2247 if ret is not None:
2248 return ret
2249
2250 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2251 if mobj:
2252 return str_to_int(mobj.group(1))
be64b5b0 2253
2f7ae819 2254
5d45484c 2255def parse_resolution(s, *, lenient=False):
b871d7e9
S
2256 if s is None:
2257 return {}
2258
5d45484c
LNO
2259 if lenient:
2260 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2261 else:
2262 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
2263 if mobj:
2264 return {
2265 'width': int(mobj.group('w')),
2266 'height': int(mobj.group('h')),
2267 }
2268
17ec8bcf 2269 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
2270 if mobj:
2271 return {'height': int(mobj.group(1))}
2272
2273 mobj = re.search(r'\b([48])[kK]\b', s)
2274 if mobj:
2275 return {'height': int(mobj.group(1)) * 540}
2276
2277 return {}
2278
2279
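# Illustrative examples (not part of the original module):
#   parse_resolution('1920x1080') == {'width': 1920, 'height': 1080}
#   parse_resolution('720p')      == {'height': 720}
#   parse_resolution('4k')        == {'height': 2160}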
0dc41787 2280def parse_bitrate(s):
14f25df2 2281 if not isinstance(s, str):
0dc41787
S
2282 return
2283 mobj = re.search(r'\b(\d+)\s*kbps', s)
2284 if mobj:
2285 return int(mobj.group(1))
2286
2287
a942d6cb 2288def month_by_name(name, lang='en'):
caefb1de
PH
2289 """ Return the number of a month by (locale-independently) English name """
2290
f6717dec 2291 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 2292
caefb1de 2293 try:
f6717dec 2294 return month_names.index(name) + 1
7105440c
YCH
2295 except ValueError:
2296 return None
2297
2298
2299def month_by_abbreviation(abbrev):
2300 """ Return the number of a month by (locale-independently) English
2301 abbreviations """
2302
2303 try:
2304 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
2305 except ValueError:
2306 return None
18258362
JMF
2307
2308
5aafe895 2309def fix_xml_ampersands(xml_str):
18258362 2310 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
2311 return re.sub(
2312 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 2313 '&amp;',
5aafe895 2314 xml_str)
e3946f98
PH
2315
2316
2317def setproctitle(title):
14f25df2 2318 assert isinstance(title, str)
c1c05c67 2319
fe0918bb 2320 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2321 try:
2322 import ctypes
2323 except ImportError:
c1c05c67
YCH
2324 return
2325
e3946f98 2326 try:
611c1dd9 2327 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
2328 except OSError:
2329 return
2f49bcd6
RC
2330 except TypeError:
2331 # LoadLibrary in Windows Python 2.7.13 only expects
2332 # a bytestring, but since unicode_literals turns
2333 # every string into a unicode string, it fails.
2334 return
0f06bcd7 2335 title_bytes = title.encode()
6eefe533
PH
2336 buf = ctypes.create_string_buffer(len(title_bytes))
2337 buf.value = title_bytes
e3946f98 2338 try:
6eefe533 2339 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
2340 except AttributeError:
2341 return # Strange libc, just skip this
d7dda168
PH
2342
2343
2344def remove_start(s, start):
46bc9b7d 2345 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
2346
2347
2b9faf55 2348def remove_end(s, end):
46bc9b7d 2349 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
2350
2351
31b2051e
S
2352def remove_quotes(s):
2353 if s is None or len(s) < 2:
2354 return s
2355 for quote in ('"', "'", ):
2356 if s[0] == quote and s[-1] == quote:
2357 return s[1:-1]
2358 return s
2359
2360
b6e0c7d2 2361def get_domain(url):
ebf99aaf 2362 """
2363 This implementation is inconsistent, but is kept for compatibility.
2364 Use this only for "webpage_url_domain"
2365 """
2366 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
b6e0c7d2
U
2367
2368
29eb5174 2369def url_basename(url):
14f25df2 2370 path = urllib.parse.urlparse(url).path
28e614de 2371 return path.strip('/').split('/')[-1]
aa94a6d3
PH
2372
2373
02dc0a36 2374def base_url(url):
7657ec7e 2375 return re.match(r'https?://[^?#]+/', url).group()
02dc0a36
S
2376
2377
e34c3361 2378def urljoin(base, path):
4b5de77b 2379 if isinstance(path, bytes):
0f06bcd7 2380 path = path.decode()
14f25df2 2381 if not isinstance(path, str) or not path:
e34c3361 2382 return None
fad4ceb5 2383 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 2384 return path
4b5de77b 2385 if isinstance(base, bytes):
0f06bcd7 2386 base = base.decode()
14f25df2 2387 if not isinstance(base, str) or not re.match(
4b5de77b 2388 r'^(?:https?:)?//', base):
e34c3361 2389 return None
14f25df2 2390 return urllib.parse.urljoin(base, path)
e34c3361
S
2391
2392
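# Illustrative examples (not part of the original module):
#   urljoin('http://foo.de/', '/a/b/c.txt')  == 'http://foo.de/a/b/c.txt'
#   urljoin('http://foo.de/', 'rtmp://foo.de') == 'rtmp://foo.de'  # absolute paths pass through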
ac668111 2393class HEADRequest(urllib.request.Request):
aa94a6d3 2394 def get_method(self):
611c1dd9 2395 return 'HEAD'
7217e148
PH
2396
2397
ac668111 2398class PUTRequest(urllib.request.Request):
95cf60e8
S
2399 def get_method(self):
2400 return 'PUT'
2401
2402
9732d77e 2403def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
9e907ebd 2404 if get_attr and v is not None:
2405 v = getattr(v, get_attr, None)
1812afb7
S
2406 try:
2407 return int(v) * invscale // scale
31c49255 2408 except (ValueError, TypeError, OverflowError):
af98f8ff 2409 return default
9732d77e 2410
9572013d 2411
40a90862 2412def str_or_none(v, default=None):
14f25df2 2413 return default if v is None else str(v)
40a90862 2414
9732d77e
PH
2415
2416def str_to_int(int_str):
48d4681e 2417 """ A more relaxed version of int_or_none """
f9934b96 2418 if isinstance(int_str, int):
348c6bf1 2419 return int_str
14f25df2 2420 elif isinstance(int_str, str):
42db58ec
S
2421 int_str = re.sub(r'[,\.\+]', '', int_str)
2422 return int_or_none(int_str)
608d11f5
PH
2423
2424
9732d77e 2425def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
2426 if v is None:
2427 return default
2428 try:
2429 return float(v) * invscale / scale
5e1271c5 2430 except (ValueError, TypeError):
caf80631 2431 return default
43f775e4
PH
2432
2433
c7e327c4
S
2434def bool_or_none(v, default=None):
2435 return v if isinstance(v, bool) else default
2436
2437
53cd37ba 2438def strip_or_none(v, default=None):
14f25df2 2439 return v.strip() if isinstance(v, str) else default
b72b4431
S
2440
2441
af03000a 2442def url_or_none(url):
14f25df2 2443 if not url or not isinstance(url, str):
af03000a
S
2444 return None
2445 url = url.strip()
29f7c58a 2446 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2447
2448
3e9b66d7 2449def request_to_url(req):
ac668111 2450 if isinstance(req, urllib.request.Request):
3e9b66d7
LNO
2451 return req.get_full_url()
2452 else:
2453 return req
2454
2455
ad54c913 2456def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
e29663c6 2457 datetime_object = None
2458 try:
f9934b96 2459 if isinstance(timestamp, (int, float)): # unix timestamp
d509c1f5 2460 # Using naive datetime here can break timestamp() in Windows
2461 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
a35af430 2462 # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
2463 # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
2464 datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
2465 + datetime.timedelta(seconds=timestamp))
14f25df2 2466 elif isinstance(timestamp, str): # assume YYYYMMDD
e29663c6 2467 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
9665f15a 2468 date_format = re.sub( # Support %s on windows
2469 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
e29663c6 2470 return datetime_object.strftime(date_format)
2471 except (ValueError, TypeError, AttributeError):
2472 return default
2473
2474
608d11f5 2475def parse_duration(s):
f9934b96 2476 if not isinstance(s, str):
608d11f5 2477 return None
ca7b3246 2478 s = s.strip()
38d79fd1 2479 if not s:
2480 return None
ca7b3246 2481
acaff495 2482 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2483 m = re.match(r'''(?x)
2484 (?P<before_secs>
2485 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2486 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2487 (?P<ms>[.:][0-9]+)?Z?$
2488 ''', s)
acaff495 2489 if m:
8bd1c00b 2490 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2491 else:
2492 m = re.match(
056653bb
S
2493 r'''(?ix)(?:P?
2494 (?:
1c1b2f96 2495 [0-9]+\s*y(?:ears?)?,?\s*
056653bb
S
2496 )?
2497 (?:
1c1b2f96 2498 [0-9]+\s*m(?:onths?)?,?\s*
056653bb
S
2499 )?
2500 (?:
1c1b2f96 2501 [0-9]+\s*w(?:eeks?)?,?\s*
056653bb 2502 )?
8f4b58d7 2503 (?:
1c1b2f96 2504 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
8f4b58d7 2505 )?
056653bb 2506 T)?
acaff495 2507 (?:
1c1b2f96 2508 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
acaff495 2509 )?
2510 (?:
1c1b2f96 2511 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
acaff495 2512 )?
2513 (?:
2514 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2515 )?Z?$''', s)
acaff495 2516 if m:
2517 days, hours, mins, secs, ms = m.groups()
2518 else:
15846398 2519 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2520 if m:
2521 hours, mins = m.groups()
2522 else:
2523 return None
2524
acaff495 2525 if ms:
19a03940 2526 ms = ms.replace(':', '.')
2527 return sum(float(part or 0) * mult for part, mult in (
2528 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2529
2530
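# Illustrative examples (not part of the original module; the helper returns
# a float, so the comparisons below still hold):
#   parse_duration('1:23:45') == 5025
#   parse_duration('3 min')   == 180
#   parse_duration('PT1H30M') == 5400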
e65e4c88 2531def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2532 name, real_ext = os.path.splitext(filename)
e65e4c88 2533 return (
86e5f3ed 2534 f'{name}.{ext}{real_ext}'
e65e4c88 2535 if not expected_real_ext or real_ext[1:] == expected_real_ext
86e5f3ed 2536 else f'{filename}.{ext}')
d70ad093
PH
2537
2538
b3ed15b7
S
2539def replace_extension(filename, ext, expected_real_ext=None):
2540 name, real_ext = os.path.splitext(filename)
86e5f3ed 2541 return '{}.{}'.format(
b3ed15b7
S
2542 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2543 ext)
2544
2545
d70ad093
PH
2546def check_executable(exe, args=[]):
2547 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2548 args can be a list of arguments for a short output (like -version) """
2549 try:
f0c9fb96 2550 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
d70ad093
PH
2551 except OSError:
2552 return False
2553 return exe
b7ab0590
PH
2554
2555
7aaf4cd2 2556def _get_exe_version_output(exe, args):
95807118 2557 try:
b64d04c1 2558 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2559 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2560 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
1cdda329 2561 stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
2562 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2563 if ret:
2564 return None
95807118
PH
2565 except OSError:
2566 return False
f0c9fb96 2567 return stdout
cae97f65
PH
2568
2569
2570def detect_exe_version(output, version_re=None, unrecognized='present'):
14f25df2 2571 assert isinstance(output, str)
cae97f65
PH
2572 if version_re is None:
2573 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2574 m = re.search(version_re, output)
95807118
PH
2575 if m:
2576 return m.group(1)
2577 else:
2578 return unrecognized
2579
2580
9af98e17 2581def get_exe_version(exe, args=['--version'],
1cdda329 2582 version_re=None, unrecognized=('present', 'broken')):
9af98e17 2583 """ Returns the version of the specified executable,
2584 or False if the executable is not present """
1cdda329 2585 unrecognized = variadic(unrecognized)
2586 assert len(unrecognized) in (1, 2)
9af98e17 2587 out = _get_exe_version_output(exe, args)
1cdda329 2588 if out is None:
2589 return unrecognized[-1]
2590 return out and detect_exe_version(out, version_re, unrecognized[0])
9af98e17 2591
2592
7e88d7d7 2593def frange(start=0, stop=None, step=1):
2594 """Float range"""
2595 if stop is None:
2596 start, stop = 0, start
2597 sign = [-1, 1][step > 0] if step else 0
2598 while sign * start < sign * stop:
2599 yield start
2600 start += step
2601
2602
cb89cfc1 2603class LazyList(collections.abc.Sequence):
0f06bcd7 2604 """Lazy immutable list from an iterable
2605 Note that slices of a LazyList are lists and not LazyList"""
483336e7 2606
8e5fecc8 2607 class IndexError(IndexError):
2608 pass
2609
282f5709 2610 def __init__(self, iterable, *, reverse=False, _cache=None):
0f06bcd7 2611 self._iterable = iter(iterable)
2612 self._cache = [] if _cache is None else _cache
2613 self._reversed = reverse
483336e7 2614
2615 def __iter__(self):
0f06bcd7 2616 if self._reversed:
28419ca2 2617 # We need to consume the entire iterable to iterate in reverse
981052c9 2618 yield from self.exhaust()
28419ca2 2619 return
0f06bcd7 2620 yield from self._cache
2621 for item in self._iterable:
2622 self._cache.append(item)
483336e7 2623 yield item
2624
0f06bcd7 2625 def _exhaust(self):
2626 self._cache.extend(self._iterable)
2627 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2628 return self._cache
28419ca2 2629
981052c9 2630 def exhaust(self):
0f06bcd7 2631 """Evaluate the entire iterable"""
2632 return self._exhaust()[::-1 if self._reversed else 1]
981052c9 2633
28419ca2 2634 @staticmethod
0f06bcd7 2635 def _reverse_index(x):
f2df4071 2636 return None if x is None else ~x
483336e7 2637
2638 def __getitem__(self, idx):
2639 if isinstance(idx, slice):
0f06bcd7 2640 if self._reversed:
2641 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
e0f2b4b4 2642 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2643 elif isinstance(idx, int):
0f06bcd7 2644 if self._reversed:
2645 idx = self._reverse_index(idx)
e0f2b4b4 2646 start, stop, step = idx, idx, 0
483336e7 2647 else:
2648 raise TypeError('indices must be integers or slices')
e0f2b4b4 2649 if ((start or 0) < 0 or (stop or 0) < 0
2650 or (start is None and step < 0)
2651 or (stop is None and step > 0)):
483336e7 2652 # We need to consume the entire iterable to be able to slice from the end
2653 # Obviously, never use this with infinite iterables
0f06bcd7 2654 self._exhaust()
8e5fecc8 2655 try:
0f06bcd7 2656 return self._cache[idx]
8e5fecc8 2657 except IndexError as e:
2658 raise self.IndexError(e) from e
0f06bcd7 2659 n = max(start or 0, stop or 0) - len(self._cache) + 1
28419ca2 2660 if n > 0:
0f06bcd7 2661 self._cache.extend(itertools.islice(self._iterable, n))
8e5fecc8 2662 try:
0f06bcd7 2663 return self._cache[idx]
8e5fecc8 2664 except IndexError as e:
2665 raise self.IndexError(e) from e
483336e7 2666
2667 def __bool__(self):
2668 try:
0f06bcd7 2669 self[-1] if self._reversed else self[0]
8e5fecc8 2670 except self.IndexError:
483336e7 2671 return False
2672 return True
2673
2674 def __len__(self):
0f06bcd7 2675 self._exhaust()
2676 return len(self._cache)
483336e7 2677
282f5709 2678 def __reversed__(self):
0f06bcd7 2679 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
282f5709 2680
2681 def __copy__(self):
0f06bcd7 2682 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
282f5709 2683
28419ca2 2684 def __repr__(self):
2685 # repr and str should mimic a list. So we exhaust the iterable
2686 return repr(self.exhaust())
2687
2688 def __str__(self):
2689 return repr(self.exhaust())
2690
483336e7 2691
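# Illustrative usage (not part of the original module; itertools is imported
# at the top of this file):
#   lst = LazyList(itertools.count())   # lazily evaluated, may be infinite
#   lst[:5] == [0, 1, 2, 3, 4]          # slices are plain lists
#   lst[10] == 10                       # only the needed items are consumed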
7be9ccff 2692class PagedList:
c07a39ae 2693
2694 class IndexError(IndexError):
2695 pass
2696
dd26ced1
PH
2697 def __len__(self):
2698 # This is only useful for tests
2699 return len(self.getslice())
2700
7be9ccff 2701 def __init__(self, pagefunc, pagesize, use_cache=True):
2702 self._pagefunc = pagefunc
2703 self._pagesize = pagesize
f1d13090 2704 self._pagecount = float('inf')
7be9ccff 2705 self._use_cache = use_cache
2706 self._cache = {}
2707
2708 def getpage(self, pagenum):
d8cf8d97 2709 page_results = self._cache.get(pagenum)
2710 if page_results is None:
f1d13090 2711 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
7be9ccff 2712 if self._use_cache:
2713 self._cache[pagenum] = page_results
2714 return page_results
2715
2716 def getslice(self, start=0, end=None):
2717 return list(self._getslice(start, end))
2718
2719 def _getslice(self, start, end):
55575225 2720 raise NotImplementedError('This method must be implemented by subclasses')
2721
2722 def __getitem__(self, idx):
f1d13090 2723 assert self._use_cache, 'Indexing PagedList requires cache'
55575225 2724 if not isinstance(idx, int) or idx < 0:
2725 raise TypeError('indices must be non-negative integers')
2726 entries = self.getslice(idx, idx + 1)
d8cf8d97 2727 if not entries:
c07a39ae 2728 raise self.IndexError()
d8cf8d97 2729 return entries[0]
55575225 2730
9c44d242
PH
2731
2732class OnDemandPagedList(PagedList):
a44ca5a4 2733 """Download pages until a page with less than maximum results"""
86e5f3ed 2734
7be9ccff 2735 def _getslice(self, start, end):
b7ab0590
PH
2736 for pagenum in itertools.count(start // self._pagesize):
2737 firstid = pagenum * self._pagesize
2738 nextfirstid = pagenum * self._pagesize + self._pagesize
2739 if start >= nextfirstid:
2740 continue
2741
b7ab0590
PH
2742 startv = (
2743 start % self._pagesize
2744 if firstid <= start < nextfirstid
2745 else 0)
b7ab0590
PH
2746 endv = (
2747 ((end - 1) % self._pagesize) + 1
2748 if (end is not None and firstid <= end <= nextfirstid)
2749 else None)
2750
f1d13090 2751 try:
2752 page_results = self.getpage(pagenum)
2753 except Exception:
2754 self._pagecount = pagenum - 1
2755 raise
b7ab0590
PH
2756 if startv != 0 or endv is not None:
2757 page_results = page_results[startv:endv]
7be9ccff 2758 yield from page_results
b7ab0590
PH
2759
2760 # A little optimization - if the current page is not "full", i.e. does
2761 # not contain page_size videos then we can assume that this page
2762 # is the last one - there are no more ids on further pages -
2763 # i.e. no need to query again.
2764 if len(page_results) + startv < self._pagesize:
2765 break
2766
2767 # If we got the whole page, but the next page is not interesting,
2768 # break out early as well
2769 if end == nextfirstid:
2770 break
81c2f20b
PH
2771
2772
9c44d242 2773class InAdvancePagedList(PagedList):
a44ca5a4 2774 """PagedList with total number of pages known in advance"""
86e5f3ed 2775
9c44d242 2776 def __init__(self, pagefunc, pagecount, pagesize):
7be9ccff 2777 PagedList.__init__(self, pagefunc, pagesize, True)
f1d13090 2778 self._pagecount = pagecount
9c44d242 2779
7be9ccff 2780 def _getslice(self, start, end):
9c44d242 2781 start_page = start // self._pagesize
d37707bd 2782 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
9c44d242
PH
2783 skip_elems = start - start_page * self._pagesize
2784 only_more = None if end is None else end - start
2785 for pagenum in range(start_page, end_page):
7be9ccff 2786 page_results = self.getpage(pagenum)
9c44d242 2787 if skip_elems:
7be9ccff 2788 page_results = page_results[skip_elems:]
9c44d242
PH
2789 skip_elems = None
2790 if only_more is not None:
7be9ccff 2791 if len(page_results) < only_more:
2792 only_more -= len(page_results)
9c44d242 2793 else:
7be9ccff 2794 yield from page_results[:only_more]
9c44d242 2795 break
7be9ccff 2796 yield from page_results
9c44d242
PH
2797
2798
7e88d7d7 2799class PlaylistEntries:
2800 MissingEntry = object()
2801 is_exhausted = False
2802
2803 def __init__(self, ydl, info_dict):
7e9a6125 2804 self.ydl = ydl
2805
2806 # _entries must be assigned now since infodict can change during iteration
2807 entries = info_dict.get('entries')
2808 if entries is None:
2809 raise EntryNotInPlaylist('There are no entries')
2810 elif isinstance(entries, list):
2811 self.is_exhausted = True
2812
2813 requested_entries = info_dict.get('requested_entries')
bc5c2f8a 2814 self.is_incomplete = requested_entries is not None
7e9a6125 2815 if self.is_incomplete:
2816 assert self.is_exhausted
bc5c2f8a 2817 self._entries = [self.MissingEntry] * max(requested_entries or [0])
7e9a6125 2818 for i, entry in zip(requested_entries, entries):
2819 self._entries[i - 1] = entry
2820 elif isinstance(entries, (list, PagedList, LazyList)):
2821 self._entries = entries
2822 else:
2823 self._entries = LazyList(entries)
7e88d7d7 2824
2825 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2826 (?P<start>[+-]?\d+)?
2827 (?P<range>[:-]
2828 (?P<end>[+-]?\d+|inf(?:inite)?)?
2829 (?::(?P<step>[+-]?\d+))?
2830 )?''')
2831
2832 @classmethod
2833 def parse_playlist_items(cls, string):
2834 for segment in string.split(','):
2835 if not segment:
2836 raise ValueError('There are two or more consecutive commas')
2837 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2838 if not mobj:
2839 raise ValueError(f'{segment!r} is not a valid specification')
2840 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2841 if int_or_none(step) == 0:
2842 raise ValueError(f'Step in {segment!r} cannot be zero')
2843 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2844
2845 def get_requested_items(self):
2846 playlist_items = self.ydl.params.get('playlist_items')
2847 playlist_start = self.ydl.params.get('playliststart', 1)
2848 playlist_end = self.ydl.params.get('playlistend')
2849 # For backwards compatibility, interpret -1 as whole list
2850 if playlist_end in (-1, None):
2851 playlist_end = ''
2852 if not playlist_items:
2853 playlist_items = f'{playlist_start}:{playlist_end}'
2854 elif playlist_start != 1 or playlist_end:
2855 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2856
2857 for index in self.parse_playlist_items(playlist_items):
2858 for i, entry in self[index]:
2859 yield i, entry
1ac4fd80 2860 if not entry:
2861 continue
7e88d7d7 2862 try:
d21056f4 2863 # The item may have just been added to archive. Don't break due to it
2864 if not self.ydl.params.get('lazy_playlist'):
2865 # TODO: Add auto-generated fields
2866 self.ydl._match_entry(entry, incomplete=True, silent=True)
7e88d7d7 2867 except (ExistingVideoReached, RejectedVideoReached):
2868 return
2869
7e9a6125 2870 def get_full_count(self):
2871 if self.is_exhausted and not self.is_incomplete:
7e88d7d7 2872 return len(self)
2873 elif isinstance(self._entries, InAdvancePagedList):
2874 if self._entries._pagesize == 1:
2875 return self._entries._pagecount
2876
7e88d7d7 2877 @functools.cached_property
2878 def _getter(self):
2879 if isinstance(self._entries, list):
2880 def get_entry(i):
2881 try:
2882 entry = self._entries[i]
2883 except IndexError:
2884 entry = self.MissingEntry
2885 if not self.is_incomplete:
2886 raise self.IndexError()
2887 if entry is self.MissingEntry:
bc5c2f8a 2888 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
7e88d7d7 2889 return entry
2890 else:
2891 def get_entry(i):
2892 try:
2893 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2894 except (LazyList.IndexError, PagedList.IndexError):
2895 raise self.IndexError()
2896 return get_entry
2897
2898 def __getitem__(self, idx):
2899 if isinstance(idx, int):
2900 idx = slice(idx, idx)
2901
2902 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2903 step = 1 if idx.step is None else idx.step
2904 if idx.start is None:
2905 start = 0 if step > 0 else len(self) - 1
2906 else:
2907 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2908
2909 # NB: Do not call len(self) when idx == [:]
2910 if idx.stop is None:
2911 stop = 0 if step < 0 else float('inf')
2912 else:
2913 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2914 stop += [-1, 1][step > 0]
2915
2916 for i in frange(start, stop, step):
2917 if i < 0:
2918 continue
2919 try:
7e9a6125 2920 entry = self._getter(i)
2921 except self.IndexError:
2922 self.is_exhausted = True
2923 if step > 0:
7e88d7d7 2924 break
7e9a6125 2925 continue
7e88d7d7 2926 yield i + 1, entry
2927
2928 def __len__(self):
2929 return len(tuple(self[:]))
2930
2931 class IndexError(IndexError):
2932 pass
2933
2934
81c2f20b 2935def uppercase_escape(s):
676eb3f2 2936 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2937 return re.sub(
a612753d 2938 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2939 lambda m: unicode_escape(m.group(0))[0],
2940 s)
0fe2ff78
YCH
2941
2942
2943def lowercase_escape(s):
2944 unicode_escape = codecs.getdecoder('unicode_escape')
2945 return re.sub(
2946 r'\\u[0-9a-fA-F]{4}',
2947 lambda m: unicode_escape(m.group(0))[0],
2948 s)
b53466e1 2949
d05cfe06
S
2950
2951def escape_rfc3986(s):
2952 """Escape non-ASCII characters as suggested by RFC 3986"""
f9934b96 2953 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2954
2955
2956def escape_url(url):
2957 """Escape URL as suggested by RFC 3986"""
14f25df2 2958 url_parsed = urllib.parse.urlparse(url)
d05cfe06 2959 return url_parsed._replace(
efbed08d 2960 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
2961 path=escape_rfc3986(url_parsed.path),
2962 params=escape_rfc3986(url_parsed.params),
2963 query=escape_rfc3986(url_parsed.query),
2964 fragment=escape_rfc3986(url_parsed.fragment)
2965 ).geturl()
2966
62e609ab 2967
96b9e9cf 2968def parse_qs(url, **kwargs):
2969 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
4dfbf869 2970
2971
62e609ab
PH
2972def read_batch_urls(batch_fd):
2973 def fixup(url):
14f25df2 2974 if not isinstance(url, str):
62e609ab 2975 url = url.decode('utf-8', 'replace')
8c04f0be 2976 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2977 for bom in BOM_UTF8:
2978 if url.startswith(bom):
2979 url = url[len(bom):]
2980 url = url.lstrip()
2981 if not url or url.startswith(('#', ';', ']')):
62e609ab 2982 return False
8c04f0be 2983 # "#" cannot be stripped out since it is part of the URI
962ffcf8 2984 # However, it can be safely stripped out when preceded by whitespace
8c04f0be 2985 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
2986
2987 with contextlib.closing(batch_fd) as fd:
2988 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2989
2990
2991def urlencode_postdata(*args, **kargs):
14f25df2 2992 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2993
2994
45b2ee6f 2995def update_url(url, *, query_update=None, **kwargs):
2996 """Replace URL components specified by kwargs
2997 @param url str or parse url tuple
2998 @param query_update update query
2999 @returns str
3000 """
3001 if isinstance(url, str):
3002 if not kwargs and not query_update:
3003 return url
3004 else:
3005 url = urllib.parse.urlparse(url)
3006 if query_update:
3007 assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
3008 kwargs['query'] = urllib.parse.urlencode({
3009 **urllib.parse.parse_qs(url.query),
3010 **query_update
3011 }, True)
3012 return urllib.parse.urlunparse(url._replace(**kwargs))
3013
3014
38f9ef31 3015def update_url_query(url, query):
45b2ee6f 3016 return update_url(url, query_update=query)
16392824 3017
8e60dc75 3018
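# Illustrative example (not part of the original module; the URL is made up
# for demonstration):
#   update_url_query('https://example.com/path?a=1', {'b': 'two'}) \
#       == 'https://example.com/path?a=1&b=two'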
c043c246 3019def update_Request(req, url=None, data=None, headers=None, query=None):
ed0291d1 3020 req_headers = req.headers.copy()
c043c246 3021 req_headers.update(headers or {})
ed0291d1
S
3022 req_data = data or req.data
3023 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
3024 req_get_method = req.get_method()
3025 if req_get_method == 'HEAD':
3026 req_type = HEADRequest
3027 elif req_get_method == 'PUT':
3028 req_type = PUTRequest
3029 else:
ac668111 3030 req_type = urllib.request.Request
ed0291d1
S
3031 new_req = req_type(
3032 req_url, data=req_data, headers=req_headers,
3033 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3034 if hasattr(req, 'timeout'):
3035 new_req.timeout = req.timeout
3036 return new_req
3037
3038
10c87c15 3039def _multipart_encode_impl(data, boundary):
0c265486
YCH
3040 content_type = 'multipart/form-data; boundary=%s' % boundary
3041
3042 out = b''
3043 for k, v in data.items():
3044 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 3045 if isinstance(k, str):
0f06bcd7 3046 k = k.encode()
14f25df2 3047 if isinstance(v, str):
0f06bcd7 3048 v = v.encode()
0c265486
YCH
3049 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3050 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 3051 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
3052 if boundary.encode('ascii') in content:
3053 raise ValueError('Boundary overlaps with data')
3054 out += content
3055
3056 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3057
3058 return out, content_type
3059
3060
3061def multipart_encode(data, boundary=None):
3062 '''
3063 Encode a dict to RFC 7578-compliant form-data
3064
3065 data:
3066 A dict where keys and values can be either Unicode or bytes-like
3067 objects.
3068 boundary:
3069 If specified a Unicode object, it's used as the boundary. Otherwise
3070 a random boundary is generated.
3071
3072 Reference: https://tools.ietf.org/html/rfc7578
3073 '''
3074 has_specified_boundary = boundary is not None
3075
3076 while True:
3077 if boundary is None:
3078 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3079
3080 try:
10c87c15 3081 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
3082 break
3083 except ValueError:
3084 if has_specified_boundary:
3085 raise
3086 boundary = None
3087
3088 return out, content_type
3089
3090
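# Illustrative example (not part of the original module):
#   multipart_encode({b'field': b'value'}, boundary='AAAAAA')[0] == (
#       b'--AAAAAA\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--AAAAAA--\r\n')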
b079c26f
SS
3091def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
3092 if blocked_types is NO_DEFAULT:
3093 blocked_types = (str, bytes, collections.abc.Mapping)
3094 return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
3095
3096
3097def variadic(x, allowed_types=NO_DEFAULT):
4823ec9f 3098 if not isinstance(allowed_types, (tuple, type)):
3099 deprecation_warning('allowed_types should be a tuple or a type')
3100 allowed_types = tuple(allowed_types)
6f2287cb 3101 return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
304ad45a 3102
3103
c4f60dd7 3104def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3105 for f in funcs:
a32a9a7e 3106 try:
c4f60dd7 3107 val = f(*args, **kwargs)
ab029d7e 3108 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
a32a9a7e
S
3109 pass
3110 else:
c4f60dd7 3111 if expected_type is None or isinstance(val, expected_type):
3112 return val
3113
3114
3115def try_get(src, getter, expected_type=None):
3116 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
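# Illustrative usage (added examples, not from the original source): exceptions
# raised by the getter are swallowed and None is returned instead.
#   >>> try_get({'a': {'b': 42}}, lambda x: x['a']['b'], expected_type=int)
#   42
#   >>> try_get({}, lambda x: x['a']['b'])  # KeyError -> None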
329ca3be
S
3117
3118
90137ca4 3119def filter_dict(dct, cndn=lambda _, v: v is not None):
3120 return {k: v for k, v in dct.items() if cndn(k, v)}
3121
3122
6cc62232
S
3123def merge_dicts(*dicts):
3124 merged = {}
3125 for a_dict in dicts:
3126 for k, v in a_dict.items():
90137ca4 3127 if (v is not None and k not in merged
3128 or isinstance(v, str) and merged[k] == ''):
6cc62232
S
3129 merged[k] = v
3130 return merged
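# Illustrative usage (added example, not from the original source): earlier dicts
# take precedence, except that empty strings are overridden by later values.
#   >>> merge_dicts({'a': 1, 'b': ''}, {'a': 2, 'b': 'x'})
#   {'a': 1, 'b': 'x'}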
3131
3132
8e60dc75 3133def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
14f25df2 3134 return string if isinstance(string, str) else str(string, encoding, errors)
8e60dc75 3135
16392824 3136
a1a530b0
PH
3137US_RATINGS = {
3138 'G': 0,
3139 'PG': 10,
3140 'PG-13': 13,
3141 'R': 16,
3142 'NC': 18,
3143}
fac55558
PH
3144
3145
a8795327 3146TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
3147 'TV-Y': 0,
3148 'TV-Y7': 7,
3149 'TV-G': 0,
3150 'TV-PG': 0,
3151 'TV-14': 14,
3152 'TV-MA': 17,
a8795327
S
3153}
3154
3155
146c80e2 3156def parse_age_limit(s):
19a03940 3157 # isinstance(False, int) is True. So type() must be used instead
c487cf00 3158 if type(s) is int: # noqa: E721
a8795327 3159 return s if 0 <= s <= 21 else None
19a03940 3160 elif not isinstance(s, str):
d838b1bd 3161 return None
146c80e2 3162 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
3163 if m:
3164 return int(m.group('age'))
5c5fae6d 3165 s = s.upper()
a8795327
S
3166 if s in US_RATINGS:
3167 return US_RATINGS[s]
5a16c9d9 3168 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 3169 if m:
5a16c9d9 3170 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 3171 return None
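# Illustrative usage (added examples, not from the original source):
#   >>> parse_age_limit('PG-13')
#   13
#   >>> parse_age_limit('TV-MA')
#   17
#   >>> parse_age_limit('18+')
#   18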
146c80e2
S
3172
3173
fac55558 3174def strip_jsonp(code):
609a61e3 3175 return re.sub(
5552c9eb 3176 r'''(?sx)^
e9c671d5 3177 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
3178 (?:\s*&&\s*(?P=func_name))?
3179 \s*\(\s*(?P<callback_data>.*)\);?
3180 \s*?(?://[^\n]*)*$''',
3181 r'\g<callback_data>', code)
478c2c61
PH
3182
3183
8f53dc44 3184def js_to_json(code, vars={}, *, strict=False):
5c610515 3185 # vars is a dict of var, val pairs to substitute
0898c5c8 3186 STRING_QUOTES = '\'"`'
a71b812f 3187 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
c843e685 3188 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
86e5f3ed 3189 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
4195096e 3190 INTEGER_TABLE = (
86e5f3ed 3191 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3192 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
4195096e
S
3193 )
3194
a71b812f
SS
3195 def process_escape(match):
3196 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3197 escape = match.group(1) or match.group(2)
3198
3199 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3200 else R'\u00' if escape == 'x'
3201 else '' if escape == '\n'
3202 else escape)
3203
0898c5c8
SS
3204 def template_substitute(match):
3205 evaluated = js_to_json(match.group(1), vars, strict=strict)
3206 if evaluated[0] == '"':
3207 return json.loads(evaluated)
3208 return evaluated
3209
e05f6939 3210 def fix_kv(m):
e7b6d122
PH
3211 v = m.group(0)
3212 if v in ('true', 'false', 'null'):
3213 return v
421ddcb8
C
3214 elif v in ('undefined', 'void 0'):
3215 return 'null'
8bdd16b4 3216 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
a71b812f
SS
3217 return ''
3218
3219 if v[0] in STRING_QUOTES:
0898c5c8
SS
3220 v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
3221 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
a71b812f
SS
3222 return f'"{escaped}"'
3223
3224 for regex, base in INTEGER_TABLE:
3225 im = re.match(regex, v)
3226 if im:
3227 i = int(im.group(1), base)
3228 return f'"{i}":' if v.endswith(':') else str(i)
3229
3230 if v in vars:
d5f043d1
C
3231 try:
3232 if not strict:
3233 json.loads(vars[v])
08e29b9f 3234 except json.JSONDecodeError:
d5f043d1
C
3235 return json.dumps(vars[v])
3236 else:
3237 return vars[v]
89ac4a19 3238
a71b812f
SS
3239 if not strict:
3240 return f'"{v}"'
5c610515 3241
a71b812f 3242 raise ValueError(f'Unknown value: {v}')
e05f6939 3243
8072ef2b 3244 def create_map(mobj):
3245 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3246
8072ef2b 3247 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
8f53dc44 3248 if not strict:
3249 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
f55523cf 3250 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
389896df 3251 code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
3252 code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
febff4c1 3253
a71b812f
SS
3254 return re.sub(rf'''(?sx)
3255 {STRING_RE}|
3256 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
421ddcb8 3257 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
a71b812f
SS
3258 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3259 [0-9]+(?={SKIP_RE}:)|
8bdd16b4 3260 !+
a71b812f 3261 ''', fix_kv, code)
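# Illustrative usage (added examples, not from the original source): unquoted keys,
# single-quoted strings, hex literals and `undefined` are converted to valid JSON.
#   >>> js_to_json("{a: 'b', c: undefined}")
#   '{"a": "b", "c": null}'
#   >>> js_to_json('{x: 0xff}')
#   '{"x": 255}'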
e05f6939
PH
3262
3263
478c2c61
PH
3264def qualities(quality_ids):
3265 """ Get a numeric quality value out of a list of possible values """
3266 def q(qid):
3267 try:
3268 return quality_ids.index(qid)
3269 except ValueError:
3270 return -1
3271 return q
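# Illustrative usage (added example, not from the original source): the returned
# callable maps known quality ids to their index and unknown ones to -1.
#   >>> q = qualities(['240p', '480p', '720p'])
#   >>> q('480p'), q('1080p')
#   (1, -1)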
3272
acd69589 3273
119e40ef 3274POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
1e43a6f7 3275
3276
de6000d9 3277DEFAULT_OUTTMPL = {
3278 'default': '%(title)s [%(id)s].%(ext)s',
72755351 3279 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 3280}
3281OUTTMPL_TYPES = {
72755351 3282 'chapter': None,
de6000d9 3283 'subtitle': None,
3284 'thumbnail': None,
3285 'description': 'description',
3286 'annotation': 'annotations.xml',
3287 'infojson': 'info.json',
08438d2c 3288 'link': None,
3b603dbd 3289 'pl_video': None,
5112f26a 3290 'pl_thumbnail': None,
de6000d9 3291 'pl_description': 'description',
3292 'pl_infojson': 'info.json',
3293}
0a871f68 3294
143db31d 3295# As of [1] format syntax is:
3296# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3297# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 3298STR_FORMAT_RE_TMPL = r'''(?x)
3299 (?<!%)(?P<prefix>(?:%%)*)
143db31d 3300 %
524e2e4f 3301 (?P<has_key>\((?P<key>{0})\))?
752cda38 3302 (?P<format>
524e2e4f 3303 (?P<conversion>[#0\-+ ]+)?
3304 (?P<min_width>\d+)?
3305 (?P<precision>\.\d+)?
3306 (?P<len_mod>[hlL])? # unused in python
901130bb 3307 {1} # conversion type
752cda38 3308 )
143db31d 3309'''
3310
7d1eb38a 3311
ebe1b4e3 3312STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
a020a0dc 3313
7d1eb38a 3314
a020a0dc
PH
3315def limit_length(s, length):
3316 """ Add ellipses to overly long strings """
3317 if s is None:
3318 return None
3319 ELLIPSES = '...'
3320 if len(s) > length:
3321 return s[:length - len(ELLIPSES)] + ELLIPSES
3322 return s
48844745
PH
3323
3324
3325def version_tuple(v):
5f9b8394 3326 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
3327
3328
3329def is_outdated_version(version, limit, assume_new=True):
3330 if not version:
3331 return not assume_new
3332 try:
3333 return version_tuple(version) < version_tuple(limit)
3334 except ValueError:
3335 return not assume_new
732ea2f0
PH
3336
3337
3338def ytdl_is_updateable():
7a5c1cfe 3339 """ Returns whether yt-dlp can be updated with -U """
735d865e 3340
69bec673 3341 from ..update import is_non_updateable
732ea2f0 3342
5d535b4a 3343 return not is_non_updateable()
7d4111ed
PH
3344
3345
3346def args_to_str(args):
3347 # Get a short string representation for a subprocess command
702ccf2d 3348 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
3349
3350
a44ca5a4 3351def error_to_str(err):
3352 return f'{type(err).__name__}: {err}'
3353
3354
2647c933 3355def mimetype2ext(mt, default=NO_DEFAULT):
3356 if not isinstance(mt, str):
3357 if default is not NO_DEFAULT:
3358 return default
eb9ee194
S
3359 return None
3360
2647c933 3361 MAP = {
3362 # video
f6861ec9 3363 '3gpp': '3gp',
2647c933 3364 'mp2t': 'ts',
3365 'mp4': 'mp4',
3366 'mpeg': 'mpeg',
3367 'mpegurl': 'm3u8',
3368 'quicktime': 'mov',
3369 'webm': 'webm',
3370 'vp9': 'vp9',
f6861ec9 3371 'x-flv': 'flv',
2647c933 3372 'x-m4v': 'm4v',
3373 'x-matroska': 'mkv',
3374 'x-mng': 'mng',
a0d8d704 3375 'x-mp4-fragmented': 'mp4',
2647c933 3376 'x-ms-asf': 'asf',
a0d8d704 3377 'x-ms-wmv': 'wmv',
2647c933 3378 'x-msvideo': 'avi',
3379
3380 # application (streaming playlists)
b4173f15 3381 'dash+xml': 'mpd',
b4173f15 3382 'f4m+xml': 'f4m',
f164b971 3383 'hds+xml': 'f4m',
2647c933 3384 'vnd.apple.mpegurl': 'm3u8',
e910fe2f 3385 'vnd.ms-sstr+xml': 'ism',
2647c933 3386 'x-mpegurl': 'm3u8',
3387
3388 # audio
3389 'audio/mp4': 'm4a',
3390 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
3391 # Using .mp3 as it's the most popular one
3392 'audio/mpeg': 'mp3',
d80ca5de 3393 'audio/webm': 'webm',
2647c933 3394 'audio/x-matroska': 'mka',
3395 'audio/x-mpegurl': 'm3u',
3396 'midi': 'mid',
3397 'ogg': 'ogg',
3398 'wav': 'wav',
3399 'wave': 'wav',
3400 'x-aac': 'aac',
3401 'x-flac': 'flac',
3402 'x-m4a': 'm4a',
3403 'x-realaudio': 'ra',
39e7107d 3404 'x-wav': 'wav',
9359f3d4 3405
2647c933 3406 # image
3407 'avif': 'avif',
3408 'bmp': 'bmp',
3409 'gif': 'gif',
3410 'jpeg': 'jpg',
3411 'png': 'png',
3412 'svg+xml': 'svg',
3413 'tiff': 'tif',
3414 'vnd.wap.wbmp': 'wbmp',
3415 'webp': 'webp',
3416 'x-icon': 'ico',
3417 'x-jng': 'jng',
3418 'x-ms-bmp': 'bmp',
3419
3420 # caption
3421 'filmstrip+json': 'fs',
3422 'smptett+xml': 'tt',
3423 'ttaf+xml': 'dfxp',
3424 'ttml+xml': 'ttml',
3425 'x-ms-sami': 'sami',
9359f3d4 3426
2647c933 3427 # misc
3428 'gzip': 'gz',
9359f3d4
F
3429 'json': 'json',
3430 'xml': 'xml',
3431 'zip': 'zip',
9359f3d4
F
3432 }
3433
2647c933 3434 mimetype = mt.partition(';')[0].strip().lower()
3435 _, _, subtype = mimetype.rpartition('/')
9359f3d4 3436
69bec673 3437 ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
2647c933 3438 if ext:
3439 return ext
3440 elif default is not NO_DEFAULT:
3441 return default
9359f3d4 3442 return subtype.replace('+', '.')
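# Illustrative usage (added examples, not from the original source): MIME parameters
# are stripped and both full types and bare subtypes are looked up.
#   >>> mimetype2ext('audio/mp4; codecs="mp4a.40.2"')
#   'm4a'
#   >>> mimetype2ext('application/x-mpegurl')
#   'm3u8'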
c460bdd5
PH
3443
3444
2814f12b
THD
3445def ext2mimetype(ext_or_url):
3446 if not ext_or_url:
3447 return None
3448 if '.' not in ext_or_url:
3449 ext_or_url = f'file.{ext_or_url}'
3450 return mimetypes.guess_type(ext_or_url)[0]
3451
3452
4f3c5e06 3453def parse_codecs(codecs_str):
3454 # http://tools.ietf.org/html/rfc6381
3455 if not codecs_str:
3456 return {}
a0566bbf 3457 split_codecs = list(filter(None, map(
dbf5416a 3458 str.strip, codecs_str.strip().strip(',').split(','))))
3fe75fdc 3459 vcodec, acodec, scodec, hdr = None, None, None, None
a0566bbf 3460 for full_codec in split_codecs:
d816f61f 3461 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3462 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3463 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3464 if vcodec:
3465 continue
3466 vcodec = full_codec
3467 if parts[0] in ('dvh1', 'dvhe'):
3468 hdr = 'DV'
69bec673 3469 elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
d816f61f 3470 hdr = 'HDR10'
3471 elif parts[:2] == ['vp9', '2']:
3472 hdr = 'HDR10'
71082216 3473 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
d816f61f 3474 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3475 acodec = acodec or full_codec
3476 elif parts[0] in ('stpp', 'wvtt'):
3477 scodec = scodec or full_codec
4f3c5e06 3478 else:
19a03940 3479 write_string(f'WARNING: Unknown codec {full_codec}\n')
3fe75fdc 3480 if vcodec or acodec or scodec:
4f3c5e06 3481 return {
3482 'vcodec': vcodec or 'none',
3483 'acodec': acodec or 'none',
176f1866 3484 'dynamic_range': hdr,
3fe75fdc 3485 **({'scodec': scodec} if scodec is not None else {}),
4f3c5e06 3486 }
b69fd25c 3487 elif len(split_codecs) == 2:
3488 return {
3489 'vcodec': split_codecs[0],
3490 'acodec': split_codecs[1],
3491 }
4f3c5e06 3492 return {}
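# Illustrative usage (added example, not from the original source):
#   >>> parse_codecs('avc1.42E01E, mp4a.40.2')
#   {'vcodec': 'avc1.42E01E', 'acodec': 'mp4a.40.2', 'dynamic_range': None}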
3493
3494
fc61aff4
LL
3495def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3496 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3497
3498 allow_mkv = not preferences or 'mkv' in preferences
3499
3500 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3501 return 'mkv' # TODO: does any other format allow this?
3502
3503 # TODO: Not all codecs supported by parse_codecs are handled here
3504 COMPATIBLE_CODECS = {
3505 'mp4': {
71082216 3506 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
81b6102d 3507 'h264', 'aacl', 'ec-3', # Set in ISM
fc61aff4
LL
3508 },
3509 'webm': {
3510 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3511 'vp9x', 'vp8x', # in the webm spec
3512 },
3513 }
3514
812cdfa0 3515 sanitize_codec = functools.partial(
3516 try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
8f84770a 3517 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
fc61aff4
LL
3518
3519 for ext in preferences or COMPATIBLE_CODECS.keys():
3520 codec_set = COMPATIBLE_CODECS.get(ext, set())
3521 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3522 return ext
3523
3524 COMPATIBLE_EXTS = (
3525 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
fbb73833 3526 {'webm', 'weba'},
fc61aff4
LL
3527 )
3528 for ext in preferences or vexts:
3529 current_exts = {ext, *vexts, *aexts}
3530 if ext == 'mkv' or current_exts == {ext} or any(
3531 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3532 return ext
3533 return 'mkv' if allow_mkv else preferences[-1]
3534
3535
2647c933 3536def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
79298173 3537 getheader = url_handle.headers.get
2ccd1b10 3538
b55ee18f
PH
3539 cd = getheader('Content-Disposition')
3540 if cd:
3541 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3542 if m:
3543 e = determine_ext(m.group('filename'), default_ext=None)
3544 if e:
3545 return e
3546
2647c933 3547 meta_ext = getheader('x-amz-meta-name')
3548 if meta_ext:
3549 e = meta_ext.rpartition('.')[2]
3550 if e:
3551 return e
3552
3553 return mimetype2ext(getheader('Content-Type'), default=default)
05900629
PH
3554
3555
1e399778
YCH
3556def encode_data_uri(data, mime_type):
3557 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3558
3559
05900629 3560def age_restricted(content_limit, age_limit):
6ec6cb4e 3561 """ Returns True iff the content should be blocked """
05900629
PH
3562
3563 if age_limit is None: # No limit set
3564 return False
3565 if content_limit is None:
3566 return False # Content available for everyone
3567 return age_limit < content_limit
61ca9a80
PH
3568
3569
88f60feb 3570# List of known byte-order-marks (BOM)
a904a7f8
L
3571BOMS = [
3572 (b'\xef\xbb\xbf', 'utf-8'),
3573 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3574 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3575 (b'\xff\xfe', 'utf-16-le'),
3576 (b'\xfe\xff', 'utf-16-be'),
3577]
a904a7f8
L
3578
3579
61ca9a80
PH
3580def is_html(first_bytes):
3581 """ Detect whether a file contains HTML by examining its first bytes. """
3582
80e8493e 3583 encoding = 'utf-8'
61ca9a80 3584 for bom, enc in BOMS:
80e8493e 3585 while first_bytes.startswith(bom):
3586 encoding, first_bytes = enc, first_bytes[len(bom):]
61ca9a80 3587
80e8493e 3588 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
a055469f
PH
3589
3590
3591def determine_protocol(info_dict):
3592 protocol = info_dict.get('protocol')
3593 if protocol is not None:
3594 return protocol
3595
7de837a5 3596 url = sanitize_url(info_dict['url'])
a055469f
PH
3597 if url.startswith('rtmp'):
3598 return 'rtmp'
3599 elif url.startswith('mms'):
3600 return 'mms'
3601 elif url.startswith('rtsp'):
3602 return 'rtsp'
3603
3604 ext = determine_ext(url)
3605 if ext == 'm3u8':
deae7c17 3606 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
a055469f
PH
3607 elif ext == 'f4m':
3608 return 'f4m'
3609
14f25df2 3610 return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3611
3612
c5e3f849 3613def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3614 """ Render a list of rows, each as a list of values.
3615 Text after a \t will be right aligned """
ec11a9f4 3616 def width(string):
c5e3f849 3617 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3618
3619 def get_max_lens(table):
ec11a9f4 3620 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3621
3622 def filter_using_list(row, filterArray):
d16df59d 3623 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
76d321f6 3624
d16df59d 3625 max_lens = get_max_lens(data) if hide_empty else []
3626 header_row = filter_using_list(header_row, max_lens)
3627 data = [filter_using_list(row, max_lens) for row in data]
76d321f6 3628
cfb56d1a 3629 table = [header_row] + data
76d321f6 3630 max_lens = get_max_lens(table)
c5e3f849 3631 extra_gap += 1
76d321f6 3632 if delim:
c5e3f849 3633 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
1ed7953a 3634 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
ec11a9f4 3635 for row in table:
3636 for pos, text in enumerate(map(str, row)):
c5e3f849 3637 if '\t' in text:
3638 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3639 else:
3640 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3641 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3642 return ret
347de493
PH
3643
3644
8f18aca8 3645def _match_one(filter_part, dct, incomplete):
77b87f05 3646 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3647 STRING_OPERATORS = {
3648 '*=': operator.contains,
3649 '^=': lambda attr, value: attr.startswith(value),
3650 '$=': lambda attr, value: attr.endswith(value),
3651 '~=': lambda attr, value: re.search(value, attr),
3652 }
347de493 3653 COMPARISON_OPERATORS = {
a047eeb6 3654 **STRING_OPERATORS,
3655 '<=': operator.le, # "<=" must be defined above "<"
347de493 3656 '<': operator.lt,
347de493 3657 '>=': operator.ge,
a047eeb6 3658 '>': operator.gt,
347de493 3659 '=': operator.eq,
347de493 3660 }
a047eeb6 3661
6db9c4d5 3662 if isinstance(incomplete, bool):
3663 is_incomplete = lambda _: incomplete
3664 else:
3665 is_incomplete = lambda k: k in incomplete
3666
64fa820c 3667 operator_rex = re.compile(r'''(?x)
347de493 3668 (?P<key>[a-z_]+)
77b87f05 3669 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3670 (?:
a047eeb6 3671 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3672 (?P<strval>.+?)
347de493 3673 )
347de493 3674 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3675 m = operator_rex.fullmatch(filter_part.strip())
347de493 3676 if m:
18f96d12 3677 m = m.groupdict()
3678 unnegated_op = COMPARISON_OPERATORS[m['op']]
3679 if m['negation']:
77b87f05
MT
3680 op = lambda attr, value: not unnegated_op(attr, value)
3681 else:
3682 op = unnegated_op
18f96d12 3683 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3684 if m['quote']:
3685 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3686 actual_value = dct.get(m['key'])
3687 numeric_comparison = None
f9934b96 3688 if isinstance(actual_value, (int, float)):
e5a088dc
S
3689 # If the original field is a string and the matching comparison value is
3690 # a number, we should respect the origin of the original field
3691 # and process the comparison value as a string (see
18f96d12 3692 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3693 try:
18f96d12 3694 numeric_comparison = int(comparison_value)
347de493 3695 except ValueError:
18f96d12 3696 numeric_comparison = parse_filesize(comparison_value)
3697 if numeric_comparison is None:
3698 numeric_comparison = parse_filesize(f'{comparison_value}B')
3699 if numeric_comparison is None:
3700 numeric_comparison = parse_duration(comparison_value)
3701 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3702 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3703 if actual_value is None:
6db9c4d5 3704 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3705 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3706
3707 UNARY_OPERATORS = {
1cc47c66
S
3708 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3709 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3710 }
64fa820c 3711 operator_rex = re.compile(r'''(?x)
347de493 3712 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3713 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3714 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3715 if m:
3716 op = UNARY_OPERATORS[m.group('op')]
3717 actual_value = dct.get(m.group('key'))
6db9c4d5 3718 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3719 return True
347de493
PH
3720 return op(actual_value)
3721
3722 raise ValueError('Invalid filter part %r' % filter_part)
3723
3724
8f18aca8 3725def match_str(filter_str, dct, incomplete=False):
6db9c4d5 3726 """ Filter a dictionary with a simple string syntax.
3727 @returns Whether the filter passes
3728 @param incomplete Set of keys that are expected to be missing from dct.
3729 Can be True/False to indicate all/none of the keys may be missing.
3730 All conditions on incomplete keys pass if the key is missing.
8f18aca8 3731 """
347de493 3732 return all(
8f18aca8 3733 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3734 for filter_part in re.split(r'(?<!\\)&', filter_str))
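# Illustrative usage (added example, not from the original source): conditions are
# separated by '&'; '~=' performs a regex search on string fields.
#   >>> match_str(r'like_count > 100 & title~=(?i)\bcats\b',
#   ...           {'like_count': 190, 'title': 'Cats and dogs'})
#   True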
347de493
PH
3735
3736
fe2ce85a 3737def match_filter_func(filters, breaking_filters=None):
3738 if not filters and not breaking_filters:
d1b5f70b 3739 return None
fe2ce85a 3740 breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
3741 filters = set(variadic(filters or []))
d1b5f70b 3742
492272fe 3743 interactive = '-' in filters
3744 if interactive:
3745 filters.remove('-')
3746
3747 def _match_func(info_dict, incomplete=False):
fe2ce85a 3748 ret = breaking_filters(info_dict, incomplete)
3749 if ret is not None:
3750 raise RejectedVideoReached(ret)
3751
492272fe 3752 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3753 return NO_DEFAULT if interactive and not incomplete else None
347de493 3754 else:
3bec830a 3755 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
b1a7cd05 3756 filter_str = ') | ('.join(map(str.strip, filters))
3757 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
347de493 3758 return _match_func
91410c9b
PH
3759
3760
f2df4071 3761class download_range_func:
b4e0d758 3762 def __init__(self, chapters, ranges, from_info=False):
3763 self.chapters, self.ranges, self.from_info = chapters, ranges, from_info
f2df4071 3764
3765 def __call__(self, info_dict, ydl):
0500ee3d 3766
5ec1b6b7 3767 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
56ba69e4 3768 else 'Cannot match chapters since chapter information is unavailable')
f2df4071 3769 for regex in self.chapters or []:
5ec1b6b7 3770 for i, chapter in enumerate(info_dict.get('chapters') or []):
3771 if re.search(regex, chapter['title']):
3772 warning = None
3773 yield {**chapter, 'index': i}
f2df4071 3774 if self.chapters and warning:
5ec1b6b7 3775 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3776
b4e0d758 3777 for start, end in self.ranges or []:
3778 yield {
3779 'start_time': self._handle_negative_timestamp(start, info_dict),
3780 'end_time': self._handle_negative_timestamp(end, info_dict),
3781 }
3782
3783 if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
3784 yield {
e59e2074 3785 'start_time': info_dict.get('start_time') or 0,
3786 'end_time': info_dict.get('end_time') or float('inf'),
b4e0d758 3787 }
e59e2074 3788 elif not self.ranges and not self.chapters:
3789 yield {}
b4e0d758 3790
3791 @staticmethod
3792 def _handle_negative_timestamp(time, info):
3793 return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time
5ec1b6b7 3794
f2df4071 3795 def __eq__(self, other):
3796 return (isinstance(other, download_range_func)
3797 and self.chapters == other.chapters and self.ranges == other.ranges)
5ec1b6b7 3798
71df9b7f 3799 def __repr__(self):
a5387729 3800 return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
71df9b7f 3801
5ec1b6b7 3802
bf6427d2
YCH
3803def parse_dfxp_time_expr(time_expr):
3804 if not time_expr:
d631d5f9 3805 return
bf6427d2 3806
1d485a1a 3807 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
bf6427d2
YCH
3808 if mobj:
3809 return float(mobj.group('time_offset'))
3810
db2fe38b 3811 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3812 if mobj:
db2fe38b 3813 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
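# Illustrative usage (added examples, not from the original source; NUMBER_RE is
# defined elsewhere in this module):
#   >>> parse_dfxp_time_expr('5.2s')
#   5.2
#   >>> parse_dfxp_time_expr('00:01:08.92')
#   68.92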
bf6427d2
YCH
3814
3815
c1c924ab 3816def srt_subtitles_timecode(seconds):
aa7785f8 3817 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3818
3819
3820def ass_subtitles_timecode(seconds):
3821 time = timetuple_from_msec(seconds * 1000)
3822 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
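# Illustrative usage (added examples, not from the original source; relies on
# timetuple_from_msec defined elsewhere in this module):
#   >>> srt_subtitles_timecode(1.5)
#   '00:00:01,500'
#   >>> ass_subtitles_timecode(1.5)
#   '0:00:01.50'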
bf6427d2
YCH
3823
3824
3825def dfxp2srt(dfxp_data):
3869028f
YCH
3826 '''
3827 @param dfxp_data A bytes-like object containing DFXP data
3828 @returns A unicode object containing converted SRT data
3829 '''
5b995f71 3830 LEGACY_NAMESPACES = (
3869028f
YCH
3831 (b'http://www.w3.org/ns/ttml', [
3832 b'http://www.w3.org/2004/11/ttaf1',
3833 b'http://www.w3.org/2006/04/ttaf1',
3834 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3835 ]),
3869028f
YCH
3836 (b'http://www.w3.org/ns/ttml#styling', [
3837 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
3838 ]),
3839 )
3840
3841 SUPPORTED_STYLING = [
3842 'color',
3843 'fontFamily',
3844 'fontSize',
3845 'fontStyle',
3846 'fontWeight',
3847 'textDecoration'
3848 ]
3849
4e335771 3850 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 3851 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 3852 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 3853 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 3854 })
bf6427d2 3855
5b995f71
RA
3856 styles = {}
3857 default_style = {}
3858
86e5f3ed 3859 class TTMLPElementParser:
5b995f71
RA
3860 _out = ''
3861 _unclosed_elements = []
3862 _applied_styles = []
bf6427d2 3863
2b14cb56 3864 def start(self, tag, attrib):
5b995f71
RA
3865 if tag in (_x('ttml:br'), 'br'):
3866 self._out += '\n'
3867 else:
3868 unclosed_elements = []
3869 style = {}
3870 element_style_id = attrib.get('style')
3871 if default_style:
3872 style.update(default_style)
3873 if element_style_id:
3874 style.update(styles.get(element_style_id, {}))
3875 for prop in SUPPORTED_STYLING:
3876 prop_val = attrib.get(_x('tts:' + prop))
3877 if prop_val:
3878 style[prop] = prop_val
3879 if style:
3880 font = ''
3881 for k, v in sorted(style.items()):
3882 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3883 continue
3884 if k == 'color':
3885 font += ' color="%s"' % v
3886 elif k == 'fontSize':
3887 font += ' size="%s"' % v
3888 elif k == 'fontFamily':
3889 font += ' face="%s"' % v
3890 elif k == 'fontWeight' and v == 'bold':
3891 self._out += '<b>'
3892 unclosed_elements.append('b')
3893 elif k == 'fontStyle' and v == 'italic':
3894 self._out += '<i>'
3895 unclosed_elements.append('i')
3896 elif k == 'textDecoration' and v == 'underline':
3897 self._out += '<u>'
3898 unclosed_elements.append('u')
3899 if font:
3900 self._out += '<font' + font + '>'
3901 unclosed_elements.append('font')
3902 applied_style = {}
3903 if self._applied_styles:
3904 applied_style.update(self._applied_styles[-1])
3905 applied_style.update(style)
3906 self._applied_styles.append(applied_style)
3907 self._unclosed_elements.append(unclosed_elements)
bf6427d2 3908
2b14cb56 3909 def end(self, tag):
5b995f71
RA
3910 if tag not in (_x('ttml:br'), 'br'):
3911 unclosed_elements = self._unclosed_elements.pop()
3912 for element in reversed(unclosed_elements):
3913 self._out += '</%s>' % element
3914 if unclosed_elements and self._applied_styles:
3915 self._applied_styles.pop()
bf6427d2 3916
2b14cb56 3917 def data(self, data):
5b995f71 3918 self._out += data
2b14cb56 3919
3920 def close(self):
5b995f71 3921 return self._out.strip()
2b14cb56 3922
6a765f13 3923 # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
3924 # This will not trigger false positives since only UTF-8 text is being replaced
3925 dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
3926
2b14cb56 3927 def parse_node(node):
3928 target = TTMLPElementParser()
3929 parser = xml.etree.ElementTree.XMLParser(target=target)
3930 parser.feed(xml.etree.ElementTree.tostring(node))
3931 return parser.close()
bf6427d2 3932
5b995f71
RA
3933 for k, v in LEGACY_NAMESPACES:
3934 for ns in v:
3935 dfxp_data = dfxp_data.replace(ns, k)
3936
3869028f 3937 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 3938 out = []
5b995f71 3939 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
3940
3941 if not paras:
3942 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 3943
5b995f71
RA
3944 repeat = False
3945 while True:
3946 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
3947 style_id = style.get('id') or style.get(_x('xml:id'))
3948 if not style_id:
3949 continue
5b995f71
RA
3950 parent_style_id = style.get('style')
3951 if parent_style_id:
3952 if parent_style_id not in styles:
3953 repeat = True
3954 continue
3955 styles[style_id] = styles[parent_style_id].copy()
3956 for prop in SUPPORTED_STYLING:
3957 prop_val = style.get(_x('tts:' + prop))
3958 if prop_val:
3959 styles.setdefault(style_id, {})[prop] = prop_val
3960 if repeat:
3961 repeat = False
3962 else:
3963 break
3964
3965 for p in ('body', 'div'):
3966 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3967 if ele is None:
3968 continue
3969 style = styles.get(ele.get('style'))
3970 if not style:
3971 continue
3972 default_style.update(style)
3973
bf6427d2 3974 for para, index in zip(paras, itertools.count(1)):
d631d5f9 3975 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 3976 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
3977 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3978 if begin_time is None:
3979 continue
7dff0363 3980 if not end_time:
d631d5f9
YCH
3981 if not dur:
3982 continue
3983 end_time = begin_time + dur
bf6427d2
YCH
3984 out.append('%d\n%s --> %s\n%s\n\n' % (
3985 index,
c1c924ab
YCH
3986 srt_subtitles_timecode(begin_time),
3987 srt_subtitles_timecode(end_time),
bf6427d2
YCH
3988 parse_node(para)))
3989
3990 return ''.join(out)
3991
3992
c487cf00 3993def cli_option(params, command_option, param, separator=None):
66e289ba 3994 param = params.get(param)
c487cf00 3995 return ([] if param is None
3996 else [command_option, str(param)] if separator is None
3997 else [f'{command_option}{separator}{param}'])
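# Illustrative usage (added examples, not from the original source):
#   >>> cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy')
#   ['--proxy', '127.0.0.1:3128']
#   >>> cli_option({'retries': 10}, '--retries', 'retries', separator='=')
#   ['--retries=10']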
66e289ba
S
3998
3999
4000def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4001 param = params.get(param)
c487cf00 4002 assert param in (True, False, None)
4003 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
66e289ba
S
4004
4005
4006def cli_valueless_option(params, command_option, param, expected_value=True):
c487cf00 4007 return [command_option] if params.get(param) == expected_value else []
66e289ba
S
4008
4009
e92caff5 4010def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 4011 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 4012 if use_compat:
5b1ecbb3 4013 return argdict
4014 else:
4015 argdict = None
eab9b2bc 4016 if argdict is None:
5b1ecbb3 4017 return default
eab9b2bc 4018 assert isinstance(argdict, dict)
4019
e92caff5 4020 assert isinstance(keys, (list, tuple))
4021 for key_list in keys:
e92caff5 4022 arg_list = list(filter(
4023 lambda x: x is not None,
6606817a 4024 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 4025 if arg_list:
4026 return [arg for args in arg_list for arg in args]
4027 return default
66e289ba 4028
6251555f 4029
330690a2 4030def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4031 main_key, exe = main_key.lower(), exe.lower()
4032 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4033 keys = [f'{root_key}{k}' for k in (keys or [''])]
4034 if root_key in keys:
4035 if main_key != exe:
4036 keys.append((main_key, exe))
4037 keys.append('default')
4038 else:
4039 use_compat = False
4040 return cli_configuration_args(argdict, keys, default, use_compat)
4041
66e289ba 4042
86e5f3ed 4043class ISO639Utils:
39672624
YCH
4044 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4045 _lang_map = {
4046 'aa': 'aar',
4047 'ab': 'abk',
4048 'ae': 'ave',
4049 'af': 'afr',
4050 'ak': 'aka',
4051 'am': 'amh',
4052 'an': 'arg',
4053 'ar': 'ara',
4054 'as': 'asm',
4055 'av': 'ava',
4056 'ay': 'aym',
4057 'az': 'aze',
4058 'ba': 'bak',
4059 'be': 'bel',
4060 'bg': 'bul',
4061 'bh': 'bih',
4062 'bi': 'bis',
4063 'bm': 'bam',
4064 'bn': 'ben',
4065 'bo': 'bod',
4066 'br': 'bre',
4067 'bs': 'bos',
4068 'ca': 'cat',
4069 'ce': 'che',
4070 'ch': 'cha',
4071 'co': 'cos',
4072 'cr': 'cre',
4073 'cs': 'ces',
4074 'cu': 'chu',
4075 'cv': 'chv',
4076 'cy': 'cym',
4077 'da': 'dan',
4078 'de': 'deu',
4079 'dv': 'div',
4080 'dz': 'dzo',
4081 'ee': 'ewe',
4082 'el': 'ell',
4083 'en': 'eng',
4084 'eo': 'epo',
4085 'es': 'spa',
4086 'et': 'est',
4087 'eu': 'eus',
4088 'fa': 'fas',
4089 'ff': 'ful',
4090 'fi': 'fin',
4091 'fj': 'fij',
4092 'fo': 'fao',
4093 'fr': 'fra',
4094 'fy': 'fry',
4095 'ga': 'gle',
4096 'gd': 'gla',
4097 'gl': 'glg',
4098 'gn': 'grn',
4099 'gu': 'guj',
4100 'gv': 'glv',
4101 'ha': 'hau',
4102 'he': 'heb',
b7acc835 4103 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
4104 'hi': 'hin',
4105 'ho': 'hmo',
4106 'hr': 'hrv',
4107 'ht': 'hat',
4108 'hu': 'hun',
4109 'hy': 'hye',
4110 'hz': 'her',
4111 'ia': 'ina',
4112 'id': 'ind',
b7acc835 4113 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
4114 'ie': 'ile',
4115 'ig': 'ibo',
4116 'ii': 'iii',
4117 'ik': 'ipk',
4118 'io': 'ido',
4119 'is': 'isl',
4120 'it': 'ita',
4121 'iu': 'iku',
4122 'ja': 'jpn',
4123 'jv': 'jav',
4124 'ka': 'kat',
4125 'kg': 'kon',
4126 'ki': 'kik',
4127 'kj': 'kua',
4128 'kk': 'kaz',
4129 'kl': 'kal',
4130 'km': 'khm',
4131 'kn': 'kan',
4132 'ko': 'kor',
4133 'kr': 'kau',
4134 'ks': 'kas',
4135 'ku': 'kur',
4136 'kv': 'kom',
4137 'kw': 'cor',
4138 'ky': 'kir',
4139 'la': 'lat',
4140 'lb': 'ltz',
4141 'lg': 'lug',
4142 'li': 'lim',
4143 'ln': 'lin',
4144 'lo': 'lao',
4145 'lt': 'lit',
4146 'lu': 'lub',
4147 'lv': 'lav',
4148 'mg': 'mlg',
4149 'mh': 'mah',
4150 'mi': 'mri',
4151 'mk': 'mkd',
4152 'ml': 'mal',
4153 'mn': 'mon',
4154 'mr': 'mar',
4155 'ms': 'msa',
4156 'mt': 'mlt',
4157 'my': 'mya',
4158 'na': 'nau',
4159 'nb': 'nob',
4160 'nd': 'nde',
4161 'ne': 'nep',
4162 'ng': 'ndo',
4163 'nl': 'nld',
4164 'nn': 'nno',
4165 'no': 'nor',
4166 'nr': 'nbl',
4167 'nv': 'nav',
4168 'ny': 'nya',
4169 'oc': 'oci',
4170 'oj': 'oji',
4171 'om': 'orm',
4172 'or': 'ori',
4173 'os': 'oss',
4174 'pa': 'pan',
7bcd4813 4175 'pe': 'per',
39672624
YCH
4176 'pi': 'pli',
4177 'pl': 'pol',
4178 'ps': 'pus',
4179 'pt': 'por',
4180 'qu': 'que',
4181 'rm': 'roh',
4182 'rn': 'run',
4183 'ro': 'ron',
4184 'ru': 'rus',
4185 'rw': 'kin',
4186 'sa': 'san',
4187 'sc': 'srd',
4188 'sd': 'snd',
4189 'se': 'sme',
4190 'sg': 'sag',
4191 'si': 'sin',
4192 'sk': 'slk',
4193 'sl': 'slv',
4194 'sm': 'smo',
4195 'sn': 'sna',
4196 'so': 'som',
4197 'sq': 'sqi',
4198 'sr': 'srp',
4199 'ss': 'ssw',
4200 'st': 'sot',
4201 'su': 'sun',
4202 'sv': 'swe',
4203 'sw': 'swa',
4204 'ta': 'tam',
4205 'te': 'tel',
4206 'tg': 'tgk',
4207 'th': 'tha',
4208 'ti': 'tir',
4209 'tk': 'tuk',
4210 'tl': 'tgl',
4211 'tn': 'tsn',
4212 'to': 'ton',
4213 'tr': 'tur',
4214 'ts': 'tso',
4215 'tt': 'tat',
4216 'tw': 'twi',
4217 'ty': 'tah',
4218 'ug': 'uig',
4219 'uk': 'ukr',
4220 'ur': 'urd',
4221 'uz': 'uzb',
4222 've': 'ven',
4223 'vi': 'vie',
4224 'vo': 'vol',
4225 'wa': 'wln',
4226 'wo': 'wol',
4227 'xh': 'xho',
4228 'yi': 'yid',
e9a50fba 4229 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
4230 'yo': 'yor',
4231 'za': 'zha',
4232 'zh': 'zho',
4233 'zu': 'zul',
4234 }
4235
4236 @classmethod
4237 def short2long(cls, code):
4238 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4239 return cls._lang_map.get(code[:2])
4240
4241 @classmethod
4242 def long2short(cls, code):
4243 """Convert language code from ISO 639-2/T to ISO 639-1"""
4244 for short_name, long_name in cls._lang_map.items():
4245 if long_name == code:
4246 return short_name
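# Illustrative usage (added examples, not from the original source):
#   >>> ISO639Utils.short2long('en')
#   'eng'
#   >>> ISO639Utils.long2short('fra')
#   'fr'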
4247
4248
86e5f3ed 4249class ISO3166Utils:
4eb10f66
YCH
4250 # From http://data.okfn.org/data/core/country-list
4251 _country_map = {
4252 'AF': 'Afghanistan',
4253 'AX': 'Åland Islands',
4254 'AL': 'Albania',
4255 'DZ': 'Algeria',
4256 'AS': 'American Samoa',
4257 'AD': 'Andorra',
4258 'AO': 'Angola',
4259 'AI': 'Anguilla',
4260 'AQ': 'Antarctica',
4261 'AG': 'Antigua and Barbuda',
4262 'AR': 'Argentina',
4263 'AM': 'Armenia',
4264 'AW': 'Aruba',
4265 'AU': 'Australia',
4266 'AT': 'Austria',
4267 'AZ': 'Azerbaijan',
4268 'BS': 'Bahamas',
4269 'BH': 'Bahrain',
4270 'BD': 'Bangladesh',
4271 'BB': 'Barbados',
4272 'BY': 'Belarus',
4273 'BE': 'Belgium',
4274 'BZ': 'Belize',
4275 'BJ': 'Benin',
4276 'BM': 'Bermuda',
4277 'BT': 'Bhutan',
4278 'BO': 'Bolivia, Plurinational State of',
4279 'BQ': 'Bonaire, Sint Eustatius and Saba',
4280 'BA': 'Bosnia and Herzegovina',
4281 'BW': 'Botswana',
4282 'BV': 'Bouvet Island',
4283 'BR': 'Brazil',
4284 'IO': 'British Indian Ocean Territory',
4285 'BN': 'Brunei Darussalam',
4286 'BG': 'Bulgaria',
4287 'BF': 'Burkina Faso',
4288 'BI': 'Burundi',
4289 'KH': 'Cambodia',
4290 'CM': 'Cameroon',
4291 'CA': 'Canada',
4292 'CV': 'Cape Verde',
4293 'KY': 'Cayman Islands',
4294 'CF': 'Central African Republic',
4295 'TD': 'Chad',
4296 'CL': 'Chile',
4297 'CN': 'China',
4298 'CX': 'Christmas Island',
4299 'CC': 'Cocos (Keeling) Islands',
4300 'CO': 'Colombia',
4301 'KM': 'Comoros',
4302 'CG': 'Congo',
4303 'CD': 'Congo, the Democratic Republic of the',
4304 'CK': 'Cook Islands',
4305 'CR': 'Costa Rica',
4306 'CI': 'Côte d\'Ivoire',
4307 'HR': 'Croatia',
4308 'CU': 'Cuba',
4309 'CW': 'Curaçao',
4310 'CY': 'Cyprus',
4311 'CZ': 'Czech Republic',
4312 'DK': 'Denmark',
4313 'DJ': 'Djibouti',
4314 'DM': 'Dominica',
4315 'DO': 'Dominican Republic',
4316 'EC': 'Ecuador',
4317 'EG': 'Egypt',
4318 'SV': 'El Salvador',
4319 'GQ': 'Equatorial Guinea',
4320 'ER': 'Eritrea',
4321 'EE': 'Estonia',
4322 'ET': 'Ethiopia',
4323 'FK': 'Falkland Islands (Malvinas)',
4324 'FO': 'Faroe Islands',
4325 'FJ': 'Fiji',
4326 'FI': 'Finland',
4327 'FR': 'France',
4328 'GF': 'French Guiana',
4329 'PF': 'French Polynesia',
4330 'TF': 'French Southern Territories',
4331 'GA': 'Gabon',
4332 'GM': 'Gambia',
4333 'GE': 'Georgia',
4334 'DE': 'Germany',
4335 'GH': 'Ghana',
4336 'GI': 'Gibraltar',
4337 'GR': 'Greece',
4338 'GL': 'Greenland',
4339 'GD': 'Grenada',
4340 'GP': 'Guadeloupe',
4341 'GU': 'Guam',
4342 'GT': 'Guatemala',
4343 'GG': 'Guernsey',
4344 'GN': 'Guinea',
4345 'GW': 'Guinea-Bissau',
4346 'GY': 'Guyana',
4347 'HT': 'Haiti',
4348 'HM': 'Heard Island and McDonald Islands',
4349 'VA': 'Holy See (Vatican City State)',
4350 'HN': 'Honduras',
4351 'HK': 'Hong Kong',
4352 'HU': 'Hungary',
4353 'IS': 'Iceland',
4354 'IN': 'India',
4355 'ID': 'Indonesia',
4356 'IR': 'Iran, Islamic Republic of',
4357 'IQ': 'Iraq',
4358 'IE': 'Ireland',
4359 'IM': 'Isle of Man',
4360 'IL': 'Israel',
4361 'IT': 'Italy',
4362 'JM': 'Jamaica',
4363 'JP': 'Japan',
4364 'JE': 'Jersey',
4365 'JO': 'Jordan',
4366 'KZ': 'Kazakhstan',
4367 'KE': 'Kenya',
4368 'KI': 'Kiribati',
4369 'KP': 'Korea, Democratic People\'s Republic of',
4370 'KR': 'Korea, Republic of',
4371 'KW': 'Kuwait',
4372 'KG': 'Kyrgyzstan',
4373 'LA': 'Lao People\'s Democratic Republic',
4374 'LV': 'Latvia',
4375 'LB': 'Lebanon',
4376 'LS': 'Lesotho',
4377 'LR': 'Liberia',
4378 'LY': 'Libya',
4379 'LI': 'Liechtenstein',
4380 'LT': 'Lithuania',
4381 'LU': 'Luxembourg',
4382 'MO': 'Macao',
4383 'MK': 'Macedonia, the Former Yugoslav Republic of',
4384 'MG': 'Madagascar',
4385 'MW': 'Malawi',
4386 'MY': 'Malaysia',
4387 'MV': 'Maldives',
4388 'ML': 'Mali',
4389 'MT': 'Malta',
4390 'MH': 'Marshall Islands',
4391 'MQ': 'Martinique',
4392 'MR': 'Mauritania',
4393 'MU': 'Mauritius',
4394 'YT': 'Mayotte',
4395 'MX': 'Mexico',
4396 'FM': 'Micronesia, Federated States of',
4397 'MD': 'Moldova, Republic of',
4398 'MC': 'Monaco',
4399 'MN': 'Mongolia',
4400 'ME': 'Montenegro',
4401 'MS': 'Montserrat',
4402 'MA': 'Morocco',
4403 'MZ': 'Mozambique',
4404 'MM': 'Myanmar',
4405 'NA': 'Namibia',
4406 'NR': 'Nauru',
4407 'NP': 'Nepal',
4408 'NL': 'Netherlands',
4409 'NC': 'New Caledonia',
4410 'NZ': 'New Zealand',
4411 'NI': 'Nicaragua',
4412 'NE': 'Niger',
4413 'NG': 'Nigeria',
4414 'NU': 'Niue',
4415 'NF': 'Norfolk Island',
4416 'MP': 'Northern Mariana Islands',
4417 'NO': 'Norway',
4418 'OM': 'Oman',
4419 'PK': 'Pakistan',
4420 'PW': 'Palau',
4421 'PS': 'Palestine, State of',
4422 'PA': 'Panama',
4423 'PG': 'Papua New Guinea',
4424 'PY': 'Paraguay',
4425 'PE': 'Peru',
4426 'PH': 'Philippines',
4427 'PN': 'Pitcairn',
4428 'PL': 'Poland',
4429 'PT': 'Portugal',
4430 'PR': 'Puerto Rico',
4431 'QA': 'Qatar',
4432 'RE': 'Réunion',
4433 'RO': 'Romania',
4434 'RU': 'Russian Federation',
4435 'RW': 'Rwanda',
4436 'BL': 'Saint Barthélemy',
4437 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4438 'KN': 'Saint Kitts and Nevis',
4439 'LC': 'Saint Lucia',
4440 'MF': 'Saint Martin (French part)',
4441 'PM': 'Saint Pierre and Miquelon',
4442 'VC': 'Saint Vincent and the Grenadines',
4443 'WS': 'Samoa',
4444 'SM': 'San Marino',
4445 'ST': 'Sao Tome and Principe',
4446 'SA': 'Saudi Arabia',
4447 'SN': 'Senegal',
4448 'RS': 'Serbia',
4449 'SC': 'Seychelles',
4450 'SL': 'Sierra Leone',
4451 'SG': 'Singapore',
4452 'SX': 'Sint Maarten (Dutch part)',
4453 'SK': 'Slovakia',
4454 'SI': 'Slovenia',
4455 'SB': 'Solomon Islands',
4456 'SO': 'Somalia',
4457 'ZA': 'South Africa',
4458 'GS': 'South Georgia and the South Sandwich Islands',
4459 'SS': 'South Sudan',
4460 'ES': 'Spain',
4461 'LK': 'Sri Lanka',
4462 'SD': 'Sudan',
4463 'SR': 'Suriname',
4464 'SJ': 'Svalbard and Jan Mayen',
4465 'SZ': 'Swaziland',
4466 'SE': 'Sweden',
4467 'CH': 'Switzerland',
4468 'SY': 'Syrian Arab Republic',
4469 'TW': 'Taiwan, Province of China',
4470 'TJ': 'Tajikistan',
4471 'TZ': 'Tanzania, United Republic of',
4472 'TH': 'Thailand',
4473 'TL': 'Timor-Leste',
4474 'TG': 'Togo',
4475 'TK': 'Tokelau',
4476 'TO': 'Tonga',
4477 'TT': 'Trinidad and Tobago',
4478 'TN': 'Tunisia',
4479 'TR': 'Turkey',
4480 'TM': 'Turkmenistan',
4481 'TC': 'Turks and Caicos Islands',
4482 'TV': 'Tuvalu',
4483 'UG': 'Uganda',
4484 'UA': 'Ukraine',
4485 'AE': 'United Arab Emirates',
4486 'GB': 'United Kingdom',
4487 'US': 'United States',
4488 'UM': 'United States Minor Outlying Islands',
4489 'UY': 'Uruguay',
4490 'UZ': 'Uzbekistan',
4491 'VU': 'Vanuatu',
4492 'VE': 'Venezuela, Bolivarian Republic of',
4493 'VN': 'Viet Nam',
4494 'VG': 'Virgin Islands, British',
4495 'VI': 'Virgin Islands, U.S.',
4496 'WF': 'Wallis and Futuna',
4497 'EH': 'Western Sahara',
4498 'YE': 'Yemen',
4499 'ZM': 'Zambia',
4500 'ZW': 'Zimbabwe',
2f97cc61 4501 # Not ISO 3166 codes, but used for IP blocks
4502 'AP': 'Asia/Pacific Region',
4503 'EU': 'Europe',
4eb10f66
YCH
4504 }
4505
4506 @classmethod
4507 def short2full(cls, code):
4508 """Convert an ISO 3166-2 country code to the corresponding full name"""
4509 return cls._country_map.get(code.upper())
4510
4511
86e5f3ed 4512class GeoUtils:
773f291d
S
4513 # Major IPv4 address blocks per country
4514 _country_ip_map = {
53896ca5 4515 'AD': '46.172.224.0/19',
773f291d
S
4516 'AE': '94.200.0.0/13',
4517 'AF': '149.54.0.0/17',
4518 'AG': '209.59.64.0/18',
4519 'AI': '204.14.248.0/21',
4520 'AL': '46.99.0.0/16',
4521 'AM': '46.70.0.0/15',
4522 'AO': '105.168.0.0/13',
53896ca5
S
4523 'AP': '182.50.184.0/21',
4524 'AQ': '23.154.160.0/24',
773f291d
S
4525 'AR': '181.0.0.0/12',
4526 'AS': '202.70.112.0/20',
53896ca5 4527 'AT': '77.116.0.0/14',
773f291d
S
4528 'AU': '1.128.0.0/11',
4529 'AW': '181.41.0.0/18',
53896ca5
S
4530 'AX': '185.217.4.0/22',
4531 'AZ': '5.197.0.0/16',
773f291d
S
4532 'BA': '31.176.128.0/17',
4533 'BB': '65.48.128.0/17',
4534 'BD': '114.130.0.0/16',
4535 'BE': '57.0.0.0/8',
53896ca5 4536 'BF': '102.178.0.0/15',
773f291d
S
4537 'BG': '95.42.0.0/15',
4538 'BH': '37.131.0.0/17',
4539 'BI': '154.117.192.0/18',
4540 'BJ': '137.255.0.0/16',
53896ca5 4541 'BL': '185.212.72.0/23',
773f291d
S
4542 'BM': '196.12.64.0/18',
4543 'BN': '156.31.0.0/16',
4544 'BO': '161.56.0.0/16',
4545 'BQ': '161.0.80.0/20',
53896ca5 4546 'BR': '191.128.0.0/12',
773f291d
S
4547 'BS': '24.51.64.0/18',
4548 'BT': '119.2.96.0/19',
4549 'BW': '168.167.0.0/16',
4550 'BY': '178.120.0.0/13',
4551 'BZ': '179.42.192.0/18',
4552 'CA': '99.224.0.0/11',
4553 'CD': '41.243.0.0/16',
53896ca5
S
4554 'CF': '197.242.176.0/21',
4555 'CG': '160.113.0.0/16',
773f291d 4556 'CH': '85.0.0.0/13',
53896ca5 4557 'CI': '102.136.0.0/14',
773f291d
S
4558 'CK': '202.65.32.0/19',
4559 'CL': '152.172.0.0/14',
53896ca5 4560 'CM': '102.244.0.0/14',
773f291d
S
4561 'CN': '36.128.0.0/10',
4562 'CO': '181.240.0.0/12',
4563 'CR': '201.192.0.0/12',
4564 'CU': '152.206.0.0/15',
4565 'CV': '165.90.96.0/19',
4566 'CW': '190.88.128.0/17',
53896ca5 4567 'CY': '31.153.0.0/16',
773f291d
S
4568 'CZ': '88.100.0.0/14',
4569 'DE': '53.0.0.0/8',
4570 'DJ': '197.241.0.0/17',
4571 'DK': '87.48.0.0/12',
4572 'DM': '192.243.48.0/20',
4573 'DO': '152.166.0.0/15',
4574 'DZ': '41.96.0.0/12',
4575 'EC': '186.68.0.0/15',
4576 'EE': '90.190.0.0/15',
4577 'EG': '156.160.0.0/11',
4578 'ER': '196.200.96.0/20',
4579 'ES': '88.0.0.0/11',
4580 'ET': '196.188.0.0/14',
4581 'EU': '2.16.0.0/13',
4582 'FI': '91.152.0.0/13',
4583 'FJ': '144.120.0.0/16',
53896ca5 4584 'FK': '80.73.208.0/21',
773f291d
S
4585 'FM': '119.252.112.0/20',
4586 'FO': '88.85.32.0/19',
4587 'FR': '90.0.0.0/9',
4588 'GA': '41.158.0.0/15',
4589 'GB': '25.0.0.0/8',
4590 'GD': '74.122.88.0/21',
4591 'GE': '31.146.0.0/16',
4592 'GF': '161.22.64.0/18',
4593 'GG': '62.68.160.0/19',
53896ca5
S
4594 'GH': '154.160.0.0/12',
4595 'GI': '95.164.0.0/16',
773f291d
S
4596 'GL': '88.83.0.0/19',
4597 'GM': '160.182.0.0/15',
4598 'GN': '197.149.192.0/18',
4599 'GP': '104.250.0.0/19',
4600 'GQ': '105.235.224.0/20',
4601 'GR': '94.64.0.0/13',
4602 'GT': '168.234.0.0/16',
4603 'GU': '168.123.0.0/16',
4604 'GW': '197.214.80.0/20',
4605 'GY': '181.41.64.0/18',
4606 'HK': '113.252.0.0/14',
4607 'HN': '181.210.0.0/16',
4608 'HR': '93.136.0.0/13',
4609 'HT': '148.102.128.0/17',
4610 'HU': '84.0.0.0/14',
4611 'ID': '39.192.0.0/10',
4612 'IE': '87.32.0.0/12',
4613 'IL': '79.176.0.0/13',
4614 'IM': '5.62.80.0/20',
4615 'IN': '117.192.0.0/10',
4616 'IO': '203.83.48.0/21',
4617 'IQ': '37.236.0.0/14',
4618 'IR': '2.176.0.0/12',
4619 'IS': '82.221.0.0/16',
4620 'IT': '79.0.0.0/10',
4621 'JE': '87.244.64.0/18',
4622 'JM': '72.27.0.0/17',
4623 'JO': '176.29.0.0/16',
53896ca5 4624 'JP': '133.0.0.0/8',
773f291d
S
4625 'KE': '105.48.0.0/12',
4626 'KG': '158.181.128.0/17',
4627 'KH': '36.37.128.0/17',
4628 'KI': '103.25.140.0/22',
4629 'KM': '197.255.224.0/20',
53896ca5 4630 'KN': '198.167.192.0/19',
773f291d
S
4631 'KP': '175.45.176.0/22',
4632 'KR': '175.192.0.0/10',
4633 'KW': '37.36.0.0/14',
4634 'KY': '64.96.0.0/15',
4635 'KZ': '2.72.0.0/13',
4636 'LA': '115.84.64.0/18',
4637 'LB': '178.135.0.0/16',
53896ca5 4638 'LC': '24.92.144.0/20',
773f291d
S
4639 'LI': '82.117.0.0/19',
4640 'LK': '112.134.0.0/15',
53896ca5 4641 'LR': '102.183.0.0/16',
773f291d
S
4642 'LS': '129.232.0.0/17',
4643 'LT': '78.56.0.0/13',
4644 'LU': '188.42.0.0/16',
4645 'LV': '46.109.0.0/16',
4646 'LY': '41.252.0.0/14',
4647 'MA': '105.128.0.0/11',
4648 'MC': '88.209.64.0/18',
4649 'MD': '37.246.0.0/16',
4650 'ME': '178.175.0.0/17',
4651 'MF': '74.112.232.0/21',
4652 'MG': '154.126.0.0/17',
4653 'MH': '117.103.88.0/21',
4654 'MK': '77.28.0.0/15',
4655 'ML': '154.118.128.0/18',
4656 'MM': '37.111.0.0/17',
4657 'MN': '49.0.128.0/17',
4658 'MO': '60.246.0.0/16',
4659 'MP': '202.88.64.0/20',
4660 'MQ': '109.203.224.0/19',
4661 'MR': '41.188.64.0/18',
4662 'MS': '208.90.112.0/22',
4663 'MT': '46.11.0.0/16',
4664 'MU': '105.16.0.0/12',
4665 'MV': '27.114.128.0/18',
53896ca5 4666 'MW': '102.70.0.0/15',
773f291d
S
4667 'MX': '187.192.0.0/11',
4668 'MY': '175.136.0.0/13',
4669 'MZ': '197.218.0.0/15',
4670 'NA': '41.182.0.0/16',
4671 'NC': '101.101.0.0/18',
4672 'NE': '197.214.0.0/18',
4673 'NF': '203.17.240.0/22',
4674 'NG': '105.112.0.0/12',
4675 'NI': '186.76.0.0/15',
4676 'NL': '145.96.0.0/11',
4677 'NO': '84.208.0.0/13',
4678 'NP': '36.252.0.0/15',
4679 'NR': '203.98.224.0/19',
4680 'NU': '49.156.48.0/22',
4681 'NZ': '49.224.0.0/14',
4682 'OM': '5.36.0.0/15',
4683 'PA': '186.72.0.0/15',
4684 'PE': '186.160.0.0/14',
4685 'PF': '123.50.64.0/18',
4686 'PG': '124.240.192.0/19',
4687 'PH': '49.144.0.0/13',
4688 'PK': '39.32.0.0/11',
4689 'PL': '83.0.0.0/11',
4690 'PM': '70.36.0.0/20',
4691 'PR': '66.50.0.0/16',
4692 'PS': '188.161.0.0/16',
4693 'PT': '85.240.0.0/13',
4694 'PW': '202.124.224.0/20',
4695 'PY': '181.120.0.0/14',
4696 'QA': '37.210.0.0/15',
53896ca5 4697 'RE': '102.35.0.0/16',
773f291d 4698 'RO': '79.112.0.0/13',
53896ca5 4699 'RS': '93.86.0.0/15',
773f291d 4700 'RU': '5.136.0.0/13',
53896ca5 4701 'RW': '41.186.0.0/16',
773f291d
S
4702 'SA': '188.48.0.0/13',
4703 'SB': '202.1.160.0/19',
4704 'SC': '154.192.0.0/11',
53896ca5 4705 'SD': '102.120.0.0/13',
773f291d 4706 'SE': '78.64.0.0/12',
53896ca5 4707 'SG': '8.128.0.0/10',
773f291d
S
4708 'SI': '188.196.0.0/14',
4709 'SK': '78.98.0.0/15',
53896ca5 4710 'SL': '102.143.0.0/17',
773f291d
S
4711 'SM': '89.186.32.0/19',
4712 'SN': '41.82.0.0/15',
53896ca5 4713 'SO': '154.115.192.0/18',
773f291d
S
4714 'SR': '186.179.128.0/17',
4715 'SS': '105.235.208.0/21',
4716 'ST': '197.159.160.0/19',
4717 'SV': '168.243.0.0/16',
4718 'SX': '190.102.0.0/20',
4719 'SY': '5.0.0.0/16',
4720 'SZ': '41.84.224.0/19',
4721 'TC': '65.255.48.0/20',
4722 'TD': '154.68.128.0/19',
4723 'TG': '196.168.0.0/14',
4724 'TH': '171.96.0.0/13',
4725 'TJ': '85.9.128.0/18',
4726 'TK': '27.96.24.0/21',
4727 'TL': '180.189.160.0/20',
4728 'TM': '95.85.96.0/19',
4729 'TN': '197.0.0.0/11',
4730 'TO': '175.176.144.0/21',
4731 'TR': '78.160.0.0/11',
4732 'TT': '186.44.0.0/15',
4733 'TV': '202.2.96.0/19',
4734 'TW': '120.96.0.0/11',
4735 'TZ': '156.156.0.0/14',
53896ca5
S
4736 'UA': '37.52.0.0/14',
4737 'UG': '102.80.0.0/13',
4738 'US': '6.0.0.0/8',
773f291d 4739 'UY': '167.56.0.0/13',
53896ca5 4740 'UZ': '84.54.64.0/18',
773f291d 4741 'VA': '212.77.0.0/19',
53896ca5 4742 'VC': '207.191.240.0/21',
773f291d 4743 'VE': '186.88.0.0/13',
53896ca5 4744 'VG': '66.81.192.0/20',
773f291d
S
4745 'VI': '146.226.0.0/16',
4746 'VN': '14.160.0.0/11',
4747 'VU': '202.80.32.0/20',
4748 'WF': '117.20.32.0/21',
4749 'WS': '202.4.32.0/19',
4750 'YE': '134.35.0.0/16',
4751 'YT': '41.242.116.0/22',
4752 'ZA': '41.0.0.0/11',
53896ca5
S
4753 'ZM': '102.144.0.0/13',
4754 'ZW': '102.177.192.0/18',
773f291d
S
4755 }
4756
4757 @classmethod
5f95927a
S
4758 def random_ipv4(cls, code_or_block):
4759 if len(code_or_block) == 2:
4760 block = cls._country_ip_map.get(code_or_block.upper())
4761 if not block:
4762 return None
4763 else:
4764 block = code_or_block
773f291d 4765 addr, preflen = block.split('/')
ac668111 4766 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4767 addr_max = addr_min | (0xffffffff >> int(preflen))
14f25df2 4768 return str(socket.inet_ntoa(
ac668111 4769 struct.pack('!L', random.randint(addr_min, addr_max))))
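# Illustrative usage (added note, not from the original source): the result is
# random, so only its network can be checked; a CIDR block may also be passed
# directly instead of a two-letter country code.
#   >>> GeoUtils.random_ipv4('US')              # some address inside 6.0.0.0/8
#   >>> GeoUtils.random_ipv4('203.0.113.0/24')  # some address inside that block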
773f291d
S
4770
4771
ac668111 4772class PerRequestProxyHandler(urllib.request.ProxyHandler):
2461f79d
PH
4773 def __init__(self, proxies=None):
4774 # Set default handlers
4775 for type in ('http', 'https'):
4776 setattr(self, '%s_open' % type,
4777 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4778 meth(r, proxy, type))
ac668111 4779 urllib.request.ProxyHandler.__init__(self, proxies)
2461f79d 4780
91410c9b 4781 def proxy_open(self, req, proxy, type):
2461f79d 4782 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
4783 if req_proxy is not None:
4784 proxy = req_proxy
2461f79d
PH
4785 del req.headers['Ytdl-request-proxy']
4786
4787 if proxy == '__noproxy__':
4788 return None # No Proxy
14f25df2 4789 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188 4790 req.add_header('Ytdl-socks-proxy', proxy)
7a5c1cfe 4791 # yt-dlp's http/https handlers take care of wrapping the socket with socks
71aff188 4792 return None
ac668111 4793 return urllib.request.ProxyHandler.proxy_open(
91410c9b 4794 self, req, proxy, type)
5bc880b9
YCH
4795
4796
0a5445dd
YCH
4797# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4798# released into Public Domain
4799# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4800
4801def long_to_bytes(n, blocksize=0):
4802 """long_to_bytes(n:long, blocksize:int) : string
4803 Convert a long integer to a byte string.
4804
4805 If optional blocksize is given and greater than zero, pad the front of the
4806 byte string with binary zeros so that the length is a multiple of
4807 blocksize.
4808 """
4809 # after much testing, this algorithm was deemed to be the fastest
4810 s = b''
4811 n = int(n)
4812 while n > 0:
ac668111 4813 s = struct.pack('>I', n & 0xffffffff) + s
0a5445dd
YCH
4814 n = n >> 32
4815 # strip off leading zeros
4816 for i in range(len(s)):
4817 if s[i] != b'\000'[0]:
4818 break
4819 else:
4820 # only happens when n == 0
4821 s = b'\000'
4822 i = 0
4823 s = s[i:]
4824 # add back some pad bytes. this could be done more efficiently w.r.t. the
4825 # de-padding being done above, but sigh...
4826 if blocksize > 0 and len(s) % blocksize:
4827 s = (blocksize - len(s) % blocksize) * b'\000' + s
4828 return s
4829
4830
4831def bytes_to_long(s):
4832 """bytes_to_long(string) : long
4833 Convert a byte string to a long integer.
4834
4835 This is (essentially) the inverse of long_to_bytes().
4836 """
4837 acc = 0
4838 length = len(s)
4839 if length % 4:
4840 extra = (4 - length % 4)
4841 s = b'\000' * extra + s
4842 length = length + extra
4843 for i in range(0, length, 4):
ac668111 4844 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
0a5445dd
YCH
4845 return acc
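# Illustrative usage (added examples, not from the original source): the two
# functions are inverses of each other.
#   >>> long_to_bytes(256)
#   b'\x01\x00'
#   >>> bytes_to_long(b'\x01\x00')
#   256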
4846
4847
5bc880b9
YCH
4848def ohdave_rsa_encrypt(data, exponent, modulus):
4849 '''
4850 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4851
4852 Input:
4853 data: data to encrypt, bytes-like object
4854 exponent, modulus: parameter e and N of RSA algorithm, both integer
4855 Output: hex string of encrypted data
4856
4857 Limitation: supports one block encryption only
4858 '''
4859
4860 payload = int(binascii.hexlify(data[::-1]), 16)
4861 encrypted = pow(payload, exponent, modulus)
4862 return '%x' % encrypted
81bdc8fd
YCH
4863
4864
f48409c7
YCH
4865def pkcs1pad(data, length):
4866 """
4867 Padding input data with PKCS#1 scheme
4868
4869 @param {int[]} data input data
4870 @param {int} length target length
4871 @returns {int[]} padded data
4872 """
4873 if len(data) > length - 11:
4874 raise ValueError('Input data too long for PKCS#1 padding')
4875
4876 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4877 return [0, 2] + pseudo_random + [0] + data
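# Illustrative usage (added note, not from the original source): the padding bytes
# are pseudo-random, so only the layout of the result is fixed.
#   >>> pkcs1pad([42], 12)   # -> [0, 2, <8 pseudo-random ints>, 0, 42]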
4878
4879
7b2c3f47 4880def _base_n_table(n, table):
4881 if not table and not n:
4882 raise ValueError('Either table or n must be specified')
612f2be5 4883 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4884
44f14eb4 4885 if n and n != len(table):
612f2be5 4886 raise ValueError(f'base {n} exceeds table length {len(table)}')
4887 return table
59f898b7 4888
5eb6bdce 4889
7b2c3f47 4890def encode_base_n(num, n=None, table=None):
4891 """Convert given int to a base-n string"""
612f2be5 4892 table = _base_n_table(n, table)
7b2c3f47 4893 if not num:
5eb6bdce
YCH
4894 return table[0]
4895
7b2c3f47 4896 result, base = '', len(table)
81bdc8fd 4897 while num:
7b2c3f47 4898 result = table[num % base] + result
612f2be5 4899 num = num // base
7b2c3f47 4900 return result
4901
4902
4903def decode_base_n(string, n=None, table=None):
4904 """Convert given base-n string to int"""
4905 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4906 result, base = 0, len(table)
4907 for char in string:
4908 result = result * base + table[char]
4909 return result
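# Illustrative usage (added sketch): base conversion with the default table, and
# with a custom two-character table (which implies base 2).
# >>> encode_base_n(255, 16)
# 'ff'
# >>> decode_base_n('ff', 16)
# 255
# >>> encode_base_n(255, table='01')
# '11111111'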
4910
4911
f52354a8 4912def decode_packed_codes(code):
06b3fe29 4913 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 4914 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
4915 base = int(base)
4916 count = int(count)
4917 symbols = symbols.split('|')
4918 symbol_table = {}
4919
4920 while count:
4921 count -= 1
5eb6bdce 4922 base_n_count = encode_base_n(count, base)
f52354a8
YCH
4923 symbol_table[base_n_count] = symbols[count] or base_n_count
4924
4925 return re.sub(
4926 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 4927 obfuscated_code)
e154c651 4928
4929
1ced2221
S
4930def caesar(s, alphabet, shift):
4931 if shift == 0:
4932 return s
4933 l = len(alphabet)
4934 return ''.join(
4935 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4936 for c in s)
4937
4938
4939def rot47(s):
4940 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4941
4942
e154c651 4943def parse_m3u8_attributes(attrib):
4944 info = {}
4945 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4946 if val.startswith('"'):
4947 val = val[1:-1]
4948 info[key] = val
4949 return info
1143535d
YCH
4950
4951
4952def urshift(val, n):
4953 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
4954
4955
efa97bdc 4956def write_xattr(path, key, value):
6f7563be 4957 # Windows: Write xattrs to NTFS Alternate Data Streams:
4958 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4959 if compat_os_name == 'nt':
4960 assert ':' not in key
4961 assert os.path.exists(path)
efa97bdc
YCH
4962
4963 try:
6f7563be 4964 with open(f'{path}:{key}', 'wb') as f:
4965 f.write(value)
86e5f3ed 4966 except OSError as e:
efa97bdc 4967 raise XAttrMetadataError(e.errno, e.strerror)
6f7563be 4968 return
efa97bdc 4969
6f7563be 4970 # UNIX Method 1. Use xattrs/pyxattrs modules
efa97bdc 4971
6f7563be 4972 setxattr = None
4973 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4974 # Unicode arguments are not supported in pyxattr until version 0.5.0
4975 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4976 if version_tuple(xattr.__version__) >= (0, 5, 0):
4977 setxattr = xattr.set
4978 elif xattr:
4979 setxattr = xattr.setxattr
efa97bdc 4980
6f7563be 4981 if setxattr:
4982 try:
4983 setxattr(path, key, value)
4984 except OSError as e:
4985 raise XAttrMetadataError(e.errno, e.strerror)
4986 return
efa97bdc 4987
6f7563be 4988 # UNIX Method 2. Use setfattr/xattr executables
4989 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4990 else 'xattr' if check_executable('xattr', ['-h']) else None)
4991 if not exe:
4992 raise XAttrUnavailableError(
4993 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4994 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
efa97bdc 4995
0f06bcd7 4996 value = value.decode()
6f7563be 4997 try:
f0c9fb96 4998 _, stderr, returncode = Popen.run(
6f7563be 4999 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
e121e3ce 5000 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
6f7563be 5001 except OSError as e:
5002 raise XAttrMetadataError(e.errno, e.strerror)
f0c9fb96 5003 if returncode:
5004 raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
5005
5006
5007def random_birthday(year_field, month_field, day_field):
aa374bc7
AS
5008 start_date = datetime.date(1950, 1, 1)
5009 end_date = datetime.date(1995, 12, 31)
5010 offset = random.randint(0, (end_date - start_date).days)
5011 random_date = start_date + datetime.timedelta(offset)
0c265486 5012 return {
aa374bc7
AS
5013 year_field: str(random_date.year),
5014 month_field: str(random_date.month),
5015 day_field: str(random_date.day),
0c265486 5016 }
732044af 5017
c76eb41b 5018
8c53322c
L
5019def find_available_port(interface=''):
5020 try:
5021 with socket.socket() as sock:
5022 sock.bind((interface, 0))
5023 return sock.getsockname()[1]
5024 except OSError:
5025 return None
5026
5027
732044af 5028# Templates for internet shortcut files, which are plain text files.
e5a998f3 5029DOT_URL_LINK_TEMPLATE = '''\
732044af 5030[InternetShortcut]
5031URL=%(url)s
e5a998f3 5032'''
732044af 5033
e5a998f3 5034DOT_WEBLOC_LINK_TEMPLATE = '''\
732044af 5035<?xml version="1.0" encoding="UTF-8"?>
5036<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5037<plist version="1.0">
5038<dict>
5039\t<key>URL</key>
5040\t<string>%(url)s</string>
5041</dict>
5042</plist>
e5a998f3 5043'''
732044af 5044
e5a998f3 5045DOT_DESKTOP_LINK_TEMPLATE = '''\
732044af 5046[Desktop Entry]
5047Encoding=UTF-8
5048Name=%(filename)s
5049Type=Link
5050URL=%(url)s
5051Icon=text-html
e5a998f3 5052'''
732044af 5053
08438d2c 5054LINK_TEMPLATES = {
5055 'url': DOT_URL_LINK_TEMPLATE,
5056 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5057 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5058}
5059
732044af 5060
5061def iri_to_uri(iri):
5062 """
5063 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5064
5065 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5066 """
5067
14f25df2 5068 iri_parts = urllib.parse.urlparse(iri)
732044af 5069
5070 if '[' in iri_parts.netloc:
5071 raise ValueError('IPv6 URIs are not yet supported.')
5072 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5073
5074 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5075
5076 net_location = ''
5077 if iri_parts.username:
f9934b96 5078 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
732044af 5079 if iri_parts.password is not None:
f9934b96 5080 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
732044af 5081 net_location += '@'
5082
0f06bcd7 5083 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
732044af 5084 # The 'idna' encoding produces ASCII text.
5085 if iri_parts.port is not None and iri_parts.port != 80:
5086 net_location += ':' + str(iri_parts.port)
5087
f9934b96 5088 return urllib.parse.urlunparse(
732044af 5089 (iri_parts.scheme,
5090 net_location,
5091
f9934b96 5092 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5093
5094 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
f9934b96 5095 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5096
5097 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
f9934b96 5098 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
732044af 5099
f9934b96 5100 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
732044af 5101
5102 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5103
5104
5105def to_high_limit_path(path):
5106 if sys.platform in ['win32', 'cygwin']:
5107 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
e5a998f3 5108 return '\\\\?\\' + os.path.abspath(path)
732044af 5109
5110 return path
76d321f6 5111
c76eb41b 5112
7b2c3f47 5113def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
69bec673 5114 val = traversal.traverse_obj(obj, *variadic(field))
6f2287cb 5115 if not val if ignore is NO_DEFAULT else val in variadic(ignore):
e0ddbd02 5116 return default
7b2c3f47 5117 return template % func(val)
00dd0cd5 5118
5119
5120def clean_podcast_url(url):
91302ed3 5121 url = re.sub(r'''(?x)
00dd0cd5 5122 (?:
5123 (?:
5124 chtbl\.com/track|
5125 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5126 play\.podtrac\.com
5127 )/[^/]+|
5128 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5129 flex\.acast\.com|
5130 pd(?:
5131 cn\.co| # https://podcorn.com/analytics-prefix/
5132 st\.fm # https://podsights.com/docs/
5133 )/e
5134 )/''', '', url)
91302ed3 5135 return re.sub(r'^\w+://(\w+://)', r'\1', url)
ffcb8191
THD
5136
5137
5138_HEX_TABLE = '0123456789abcdef'
5139
5140
5141def random_uuidv4():
5142 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 5143
5144
5145def make_dir(path, to_screen=None):
5146 try:
5147 dn = os.path.dirname(path)
b25d6cb9
AI
5148 if dn:
5149 os.makedirs(dn, exist_ok=True)
0202b52a 5150 return True
86e5f3ed 5151 except OSError as err:
0202b52a 5152 if callable(to_screen):
69bec673 5153 to_screen(f'unable to create directory: {err}')
0202b52a 5154 return False
f74980cb 5155
5156
5157def get_executable_path():
69bec673 5158 from ..update import _get_variant_and_executable_path
c487cf00 5159
b5899f4f 5160 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
f74980cb 5161
5162
8e40b9d1 5163def get_user_config_dirs(package_name):
8e40b9d1
M
5164 # .config (e.g. ~/.config/package_name)
5165 xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
773c272d 5166 yield os.path.join(xdg_config_home, package_name)
8e40b9d1
M
5167
5168 # appdata (%APPDATA%/package_name)
5169 appdata_dir = os.getenv('appdata')
5170 if appdata_dir:
773c272d 5171 yield os.path.join(appdata_dir, package_name)
8e40b9d1
M
5172
5173 # home (~/.package_name)
773c272d 5174 yield os.path.join(compat_expanduser('~'), f'.{package_name}')
8e40b9d1
M
5175
5176
5177def get_system_config_dirs(package_name):
8e40b9d1 5178 # /etc/package_name
773c272d 5179 yield os.path.join('/etc', package_name)
06167fbb 5180
5181
3e9b66d7 5182def time_seconds(**kwargs):
83c4970e
L
5183 """
5184 Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
5185 """
5186 return time.time() + datetime.timedelta(**kwargs).total_seconds()
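# Illustrative usage (added sketch): an epoch timestamp one hour in the future,
# e.g. for computing token expiry times. The difference from time.time() is only
# approximately 3600 because of the time taken between the two calls.
# >>> expires_at = time_seconds(hours=1)
# >>> round(expires_at - time.time())
# 3600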
3e9b66d7
LNO
5187
5188
49fa4d9a
N
5189# create a JSON Web Signature (jws) with HS256 algorithm
5190# the resulting format is in JWS Compact Serialization
5191# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5192# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5193def jwt_encode_hs256(payload_data, key, headers={}):
5194 header_data = {
5195 'alg': 'HS256',
5196 'typ': 'JWT',
5197 }
5198 if headers:
5199 header_data.update(headers)
0f06bcd7 5200 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5201 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5202 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
49fa4d9a
N
5203 signature_b64 = base64.b64encode(h.digest())
5204 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5205 return token
819e0531 5206
5207
16b0d7e6 5208# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5209def jwt_decode_hs256(jwt):
5210 header_b64, payload_b64, signature_b64 = jwt.split('.')
2c98d998 5211 # add trailing ='s that may have been stripped; superfluous ='s are ignored
5212 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
16b0d7e6 5213 return payload_data
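# Illustrative round trip (added sketch) for the two JWT helpers above; the
# payload and key are made up.
# >>> token = jwt_encode_hs256({'uid': 123}, 'secret-key')
# >>> token.count(b'.')  # header.payload.signature
# 2
# >>> jwt_decode_hs256(token.decode())
# {'uid': 123}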
5214
5215
53973b4d 5216WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5217
5218
7a32c70d 5219@functools.cache
819e0531 5220def supports_terminal_sequences(stream):
5221 if compat_os_name == 'nt':
8a82af35 5222 if not WINDOWS_VT_MODE:
819e0531 5223 return False
5224 elif not os.getenv('TERM'):
5225 return False
5226 try:
5227 return stream.isatty()
5228 except BaseException:
5229 return False
5230
5231
c53a18f0 5232def windows_enable_vt_mode():
5233 """Ref: https://bugs.python.org/issue30075 """
8a82af35 5234 if get_windows_version() < (10, 0, 10586):
53973b4d 5235 return
53973b4d 5236
c53a18f0 5237 import ctypes
5238 import ctypes.wintypes
5239 import msvcrt
5240
5241 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
5242
5243 dll = ctypes.WinDLL('kernel32', use_last_error=False)
5244 handle = os.open('CONOUT$', os.O_RDWR)
c53a18f0 5245 try:
5246 h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
5247 dw_original_mode = ctypes.wintypes.DWORD()
5248 success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
5249 if not success:
5250 raise Exception('GetConsoleMode failed')
5251
5252 success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
5253 dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
5254 if not success:
5255 raise Exception('SetConsoleMode failed')
c53a18f0 5256 finally:
5257 os.close(handle)
53973b4d 5258
f0795149 5259 global WINDOWS_VT_MODE
5260 WINDOWS_VT_MODE = True
5261 supports_terminal_sequences.cache_clear()
5262
53973b4d 5263
ec11a9f4 5264_terminal_sequences_re = re.compile('\033\\[[^m]+m')
5265
5266
5267def remove_terminal_sequences(string):
5268 return _terminal_sequences_re.sub('', string)
5269
5270
5271def number_of_digits(number):
5272 return len('%d' % number)
34921b43 5273
5274
5275def join_nonempty(*values, delim='-', from_dict=None):
5276 if from_dict is not None:
69bec673 5277 values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
34921b43 5278 return delim.join(map(str, filter(None, values)))
06e57990 5279
5280
27231526
ZM
5281def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5282 """
5283 Find the largest format dimensions in terms of video width and, for each thumbnail:
5284 * Modify the URL: Match the width with the provided regex and replace with the former width
5285 * Update dimensions
5286
5287 This function is useful with video services that scale the provided thumbnails on demand
5288 """
5289 _keys = ('width', 'height')
5290 max_dimensions = max(
86e5f3ed 5291 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
27231526
ZM
5292 default=(0, 0))
5293 if not max_dimensions[0]:
5294 return thumbnails
5295 return [
5296 merge_dicts(
5297 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5298 dict(zip(_keys, max_dimensions)), thumbnail)
5299 for thumbnail in thumbnails
5300 ]
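# Illustrative usage (added sketch, made-up data): the widest format is 1280px,
# so the thumbnail URL and dimensions are scaled up to match. The exact dict
# ordering follows merge_dicts, which keeps the first non-empty value per key.
# >>> fmts = [{'width': 640, 'height': 360}, {'width': 1280, 'height': 720}]
# >>> thumbs = [{'url': 'https://example.com/thumb_640.jpg', 'width': 640, 'height': 360}]
# >>> scale_thumbnails_to_max_format_width(fmts, thumbs, r'\d+')
# [{'url': 'https://example.com/thumb_1280.jpg', 'width': 1280, 'height': 720}]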
5301
5302
93c8410d
LNO
5303def parse_http_range(range):
5304 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5305 if not range:
5306 return None, None, None
5307 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5308 if not crg:
5309 return None, None, None
5310 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
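# Illustrative usage (added sketch): both "Range" and "Content-Range" style values parse.
# >>> parse_http_range('bytes=0-499')
# (0, 499, None)
# >>> parse_http_range('bytes 500-999/1234')
# (500, 999, 1234)
# >>> parse_http_range(None)
# (None, None, None)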
5311
5312
6b9e832d 5313def read_stdin(what):
5314 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5315 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5316 return sys.stdin
5317
5318
a904a7f8
L
5319def determine_file_encoding(data):
5320 """
88f60feb 5321 Detect the text encoding used
a904a7f8
L
5322 @returns (encoding, bytes to skip)
5323 """
5324
88f60feb 5325 # BOM marks are given priority over declarations
a904a7f8 5326 for bom, enc in BOMS:
a904a7f8
L
5327 if data.startswith(bom):
5328 return enc, len(bom)
5329
88f60feb 5330 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5331 # We ignore the endianness to get a good enough match
a904a7f8 5332 data = data.replace(b'\0', b'')
88f60feb 5333 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5334 return mobj.group(1).decode() if mobj else None, 0
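# Illustrative usage (added sketch): a PEP 263-style coding declaration is
# picked up when no BOM is present; unrecognizable input yields (None, 0).
# >>> determine_file_encoding(b'# coding: utf-8\n--some-option')
# ('utf-8', 0)
# >>> determine_file_encoding(b'--some-option')
# (None, 0)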
a904a7f8
L
5335
5336
06e57990 5337class Config:
5338 own_args = None
9e491463 5339 parsed_args = None
06e57990 5340 filename = None
5341 __initialized = False
5342
5343 def __init__(self, parser, label=None):
9e491463 5344 self.parser, self.label = parser, label
06e57990 5345 self._loaded_paths, self.configs = set(), []
5346
5347 def init(self, args=None, filename=None):
5348 assert not self.__initialized
284a60c5 5349 self.own_args, self.filename = args, filename
5350 return self.load_configs()
5351
5352 def load_configs(self):
65662dff 5353 directory = ''
284a60c5 5354 if self.filename:
5355 location = os.path.realpath(self.filename)
65662dff 5356 directory = os.path.dirname(location)
06e57990 5357 if location in self._loaded_paths:
5358 return False
5359 self._loaded_paths.add(location)
5360
284a60c5 5361 self.__initialized = True
5362 opts, _ = self.parser.parse_known_args(self.own_args)
5363 self.parsed_args = self.own_args
9e491463 5364 for location in opts.config_locations or []:
6b9e832d 5365 if location == '-':
1060f82f 5366 if location in self._loaded_paths:
5367 continue
5368 self._loaded_paths.add(location)
6b9e832d 5369 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5370 continue
65662dff 5371 location = os.path.join(directory, expand_path(location))
06e57990 5372 if os.path.isdir(location):
5373 location = os.path.join(location, 'yt-dlp.conf')
5374 if not os.path.exists(location):
9e491463 5375 self.parser.error(f'config location {location} does not exist')
06e57990 5376 self.append_config(self.read_file(location), location)
5377 return True
5378
5379 def __str__(self):
5380 label = join_nonempty(
5381 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5382 delim=' ')
5383 return join_nonempty(
5384 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5385 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5386 delim='\n')
5387
7a32c70d 5388 @staticmethod
06e57990 5389 def read_file(filename, default=[]):
5390 try:
a904a7f8 5391 optionf = open(filename, 'rb')
86e5f3ed 5392 except OSError:
06e57990 5393 return default # silently skip if file is not present
a904a7f8
L
5394 try:
5395 enc, skip = determine_file_encoding(optionf.read(512))
5396 optionf.seek(skip, io.SEEK_SET)
5397 except OSError:
5398 enc = None # silently skip read errors
06e57990 5399 try:
5400 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
a904a7f8 5401 contents = optionf.read().decode(enc or preferredencoding())
f9934b96 5402 res = shlex.split(contents, comments=True)
44a6fcff 5403 except Exception as err:
5404 raise ValueError(f'Unable to parse "{filename}": {err}')
06e57990 5405 finally:
5406 optionf.close()
5407 return res
5408
7a32c70d 5409 @staticmethod
06e57990 5410 def hide_login_info(opts):
86e5f3ed 5411 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
06e57990 5412 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5413
5414 def _scrub_eq(o):
5415 m = eqre.match(o)
5416 if m:
5417 return m.group('key') + '=PRIVATE'
5418 else:
5419 return o
5420
5421 opts = list(map(_scrub_eq, opts))
5422 for idx, opt in enumerate(opts):
5423 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5424 opts[idx + 1] = 'PRIVATE'
5425 return opts
5426
5427 def append_config(self, *args, label=None):
9e491463 5428 config = type(self)(self.parser, label)
06e57990 5429 config._loaded_paths = self._loaded_paths
5430 if config.init(*args):
5431 self.configs.append(config)
5432
7a32c70d 5433 @property
06e57990 5434 def all_args(self):
5435 for config in reversed(self.configs):
5436 yield from config.all_args
9e491463 5437 yield from self.parsed_args or []
5438
5439 def parse_known_args(self, **kwargs):
5440 return self.parser.parse_known_args(self.all_args, **kwargs)
06e57990 5441
5442 def parse_args(self):
9e491463 5443 return self.parser.parse_args(self.all_args)
da42679b
LNO
5444
5445
d5d1df8a 5446class WebSocketsWrapper:
da42679b 5447 """Wraps websockets module to use in non-async scopes"""
abfecb7b 5448 pool = None
da42679b 5449
3cea3edd 5450 def __init__(self, url, headers=None, connect=True):
059bc4db 5451 self.loop = asyncio.new_event_loop()
9cd08050 5452 # XXX: "loop" is deprecated
5453 self.conn = websockets.connect(
5454 url, extra_headers=headers, ping_interval=None,
5455 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
3cea3edd
LNO
5456 if connect:
5457 self.__enter__()
15dfb392 5458 atexit.register(self.__exit__, None, None, None)
da42679b
LNO
5459
5460 def __enter__(self):
3cea3edd 5461 if not self.pool:
9cd08050 5462 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
da42679b
LNO
5463 return self
5464
5465 def send(self, *args):
5466 self.run_with_loop(self.pool.send(*args), self.loop)
5467
5468 def recv(self, *args):
5469 return self.run_with_loop(self.pool.recv(*args), self.loop)
5470
5471 def __exit__(self, type, value, traceback):
5472 try:
5473 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5474 finally:
5475 self.loop.close()
15dfb392 5476 self._cancel_all_tasks(self.loop)
da42679b
LNO
5477
5478 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5479 # for contributors: If any new library that uses asyncio needs to run in non-async code, move these functions out of this class
7a32c70d 5480 @staticmethod
da42679b 5481 def run_with_loop(main, loop):
059bc4db 5482 if not asyncio.iscoroutine(main):
da42679b
LNO
5483 raise ValueError(f'a coroutine was expected, got {main!r}')
5484
5485 try:
5486 return loop.run_until_complete(main)
5487 finally:
5488 loop.run_until_complete(loop.shutdown_asyncgens())
5489 if hasattr(loop, 'shutdown_default_executor'):
5490 loop.run_until_complete(loop.shutdown_default_executor())
5491
7a32c70d 5492 @staticmethod
da42679b 5493 def _cancel_all_tasks(loop):
059bc4db 5494 to_cancel = asyncio.all_tasks(loop)
da42679b
LNO
5495
5496 if not to_cancel:
5497 return
5498
5499 for task in to_cancel:
5500 task.cancel()
5501
9cd08050 5502 # XXX: "loop" is removed in python 3.10+
da42679b 5503 loop.run_until_complete(
059bc4db 5504 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
da42679b
LNO
5505
5506 for task in to_cancel:
5507 if task.cancelled():
5508 continue
5509 if task.exception() is not None:
5510 loop.call_exception_handler({
5511 'message': 'unhandled exception during asyncio.run() shutdown',
5512 'exception': task.exception(),
5513 'task': task,
5514 })
5515
5516
8b7539d2 5517def merge_headers(*dicts):
08d30158 5518 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
76aa9913 5519 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
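# Illustrative usage (added sketch): keys are title-cased so duplicates collapse,
# with later dicts winning.
# >>> merge_headers({'user-agent': 'A', 'Accept': '*/*'}, {'USER-AGENT': 'B'})
# {'User-Agent': 'B', 'Accept': '*/*'}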
28787f16 5520
5521
b1f94422 5522def cached_method(f):
5523 """Cache a method"""
5524 signature = inspect.signature(f)
5525
7a32c70d 5526 @functools.wraps(f)
b1f94422 5527 def wrapper(self, *args, **kwargs):
5528 bound_args = signature.bind(self, *args, **kwargs)
5529 bound_args.apply_defaults()
d5d1df8a 5530 key = tuple(bound_args.arguments.values())[1:]
b1f94422 5531
6368e2e6 5532 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
b1f94422 5533 if key not in cache:
5534 cache[key] = f(self, *args, **kwargs)
5535 return cache[key]
5536 return wrapper
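# Illustrative usage (added sketch; `Fetcher` and `fetch` are made-up names):
# results are cached per instance and per argument tuple.
#
# class Fetcher:
#     @cached_method
#     def fetch(self, url):
#         print('fetching', url)
#         return len(url)
#
# f = Fetcher()
# f.fetch('https://example.com')  # prints "fetching ...", returns 19
# f.fetch('https://example.com')  # cached: no print, returns 19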
5537
5538
28787f16 5539class classproperty:
83cc7b8a 5540 """property access for class methods with optional caching"""
5541 def __new__(cls, func=None, *args, **kwargs):
5542 if not func:
5543 return functools.partial(cls, *args, **kwargs)
5544 return super().__new__(cls)
c487cf00 5545
83cc7b8a 5546 def __init__(self, func, *, cache=False):
c487cf00 5547 functools.update_wrapper(self, func)
5548 self.func = func
83cc7b8a 5549 self._cache = {} if cache else None
28787f16 5550
5551 def __get__(self, _, cls):
83cc7b8a 5552 if self._cache is None:
5553 return self.func(cls)
5554 elif cls not in self._cache:
5555 self._cache[cls] = self.func(cls)
5556 return self._cache[cls]
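# Illustrative usage (added sketch; `Example` is a made-up class): the wrapped
# function receives the class, and `cache=True` memoizes the result per class.
#
# class Example:
#     @classproperty
#     def name(cls):
#         return cls.__name__
#
#     @classproperty(cache=True)
#     def name_cached(cls):
#         return cls.__name__.lower()
#
# Example.name         # -> 'Example', computed on every access
# Example.name_cached  # -> 'example', computed once per class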
19a03940 5557
5558
a5387729 5559class function_with_repr:
b2e0343b 5560 def __init__(self, func, repr_=None):
a5387729 5561 functools.update_wrapper(self, func)
b2e0343b 5562 self.func, self.__repr = func, repr_
a5387729 5563
5564 def __call__(self, *args, **kwargs):
5565 return self.func(*args, **kwargs)
5566
5567 def __repr__(self):
b2e0343b 5568 if self.__repr:
5569 return self.__repr
a5387729 5570 return f'{self.func.__module__}.{self.func.__qualname__}'
5571
5572
64fa820c 5573class Namespace(types.SimpleNamespace):
591bb9d3 5574 """Immutable namespace"""
591bb9d3 5575
7896214c 5576 def __iter__(self):
64fa820c 5577 return iter(self.__dict__.values())
7896214c 5578
7a32c70d 5579 @property
64fa820c 5580 def items_(self):
5581 return self.__dict__.items()
9b8ee23b 5582
5583
8dc59305 5584MEDIA_EXTENSIONS = Namespace(
5585 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5586 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5587 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
fbb73833 5588 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
8dc59305 5589 thumbnails=('jpg', 'png', 'webp'),
5590 storyboards=('mhtml', ),
5591 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5592 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5593)
5594MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5595MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5596
5597KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5598
5599
be5c1ae8 5600class RetryManager:
5601 """Usage:
5602 for retry in RetryManager(...):
5603 try:
5604 ...
5605 except SomeException as err:
5606 retry.error = err
5607 continue
5608 """
5609 attempt, _error = 0, None
5610
5611 def __init__(self, _retries, _error_callback, **kwargs):
5612 self.retries = _retries or 0
5613 self.error_callback = functools.partial(_error_callback, **kwargs)
5614
5615 def _should_retry(self):
5616 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5617
7a32c70d 5618 @property
be5c1ae8 5619 def error(self):
5620 if self._error is NO_DEFAULT:
5621 return None
5622 return self._error
5623
7a32c70d 5624 @error.setter
be5c1ae8 5625 def error(self, value):
5626 self._error = value
5627
5628 def __iter__(self):
5629 while self._should_retry():
5630 self.error = NO_DEFAULT
5631 self.attempt += 1
5632 yield self
5633 if self.error:
5634 self.error_callback(self.error, self.attempt, self.retries)
5635
7a32c70d 5636 @staticmethod
be5c1ae8 5637 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5638 """Utility function for reporting retries"""
5639 if count > retries:
5640 if error:
5641 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5642 raise e
5643
5644 if not count:
5645 return warn(e)
5646 elif isinstance(e, ExtractorError):
3ce29336 5647 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
be5c1ae8 5648 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5649
5650 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5651 if delay:
5652 info(f'Sleeping {delay:.2f} seconds ...')
5653 time.sleep(delay)
5654
5655
0647d925 5656def make_archive_id(ie, video_id):
5657 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5658 return f'{ie_key.lower()} {video_id}'
5659
5660
a1c5bd82 5661def truncate_string(s, left, right=0):
5662 assert left > 3 and right >= 0
5663 if s is None or len(s) <= left + right:
5664 return s
71df9b7f 5665 return f'{s[:left-3]}...{s[-right:] if right else ""}'
a1c5bd82 5666
5667
5314b521 5668def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
5669 assert 'all' in alias_dict, '"all" alias is required'
5670 requested = list(start or [])
5671 for val in options:
5672 discard = val.startswith('-')
5673 if discard:
5674 val = val[1:]
5675
5676 if val in alias_dict:
5677 val = alias_dict[val] if not discard else [
5678 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
5679 # NB: Do not allow regex in aliases for performance
5680 requested = orderedSet_from_options(val, alias_dict, start=requested)
5681 continue
5682
5683 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
5684 else [val] if val in alias_dict['all'] else None)
5685 if current is None:
5686 raise ValueError(val)
5687
5688 if discard:
5689 for item in current:
5690 while item in requested:
5691 requested.remove(item)
5692 else:
5693 requested.extend(current)
5694
5695 return orderedSet(requested)
5696
5697
eedda525 5698# TODO: Rewrite
d0d74b71 5699class FormatSorter:
5700 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
5701
5702 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
5703 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
5704 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
5705 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
5706 'height', 'width', 'proto', 'vext', 'abr', 'aext',
5707 'fps', 'fs_approx', 'source', 'id')
5708
5709 settings = {
5710 'vcodec': {'type': 'ordered', 'regex': True,
5711 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
5712 'acodec': {'type': 'ordered', 'regex': True,
71082216 5713 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
d0d74b71 5714 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
5715 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
5716 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
5717 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
5718 'vext': {'type': 'ordered', 'field': 'video_ext',
29ca4082 5719 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
5720 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
fbb73833 5721 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
5722 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
5723 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
d0d74b71 5724 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
5725 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
5726 'field': ('vcodec', 'acodec'),
5727 'function': lambda it: int(any(v != 'none' for v in it))},
5728 'ie_pref': {'priority': True, 'type': 'extractor'},
5729 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
5730 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
5731 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
5732 'quality': {'convert': 'float', 'default': -1},
5733 'filesize': {'convert': 'bytes'},
5734 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
5735 'id': {'convert': 'string', 'field': 'format_id'},
5736 'height': {'convert': 'float_none'},
5737 'width': {'convert': 'float_none'},
5738 'fps': {'convert': 'float_none'},
5739 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
5740 'tbr': {'convert': 'float_none'},
5741 'vbr': {'convert': 'float_none'},
5742 'abr': {'convert': 'float_none'},
5743 'asr': {'convert': 'float_none'},
5744 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
5745
5746 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
812cdfa0 5747 'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
eedda525 5748 'function': lambda it: next(filter(None, it), None)},
812cdfa0 5749 'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
eedda525 5750 'function': lambda it: next(filter(None, it), None)},
d0d74b71 5751 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
5752 'res': {'type': 'multiple', 'field': ('height', 'width'),
5753 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
5754
5755 # Actual field names
5756 'format_id': {'type': 'alias', 'field': 'id'},
5757 'preference': {'type': 'alias', 'field': 'ie_pref'},
5758 'language_preference': {'type': 'alias', 'field': 'lang'},
5759 'source_preference': {'type': 'alias', 'field': 'source'},
5760 'protocol': {'type': 'alias', 'field': 'proto'},
5761 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
5762 'audio_channels': {'type': 'alias', 'field': 'channels'},
5763
5764 # Deprecated
5765 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
5766 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
5767 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
5768 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
5769 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
5770 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
5771 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
5772 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
5773 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
5774 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
5775 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
5776 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
5777 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
5778 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
5779 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5780 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5781 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5782 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5783 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5784 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5785 }
5786
5787 def __init__(self, ydl, field_preference):
5788 self.ydl = ydl
5789 self._order = []
5790 self.evaluate_params(self.ydl.params, field_preference)
5791 if ydl.params.get('verbose'):
5792 self.print_verbose_info(self.ydl.write_debug)
5793
5794 def _get_field_setting(self, field, key):
5795 if field not in self.settings:
5796 if key in ('forced', 'priority'):
5797 return False
5798 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
5799 'deprecated and may be removed in a future version')
5800 self.settings[field] = {}
5801 propObj = self.settings[field]
5802 if key not in propObj:
5803 type = propObj.get('type')
5804 if key == 'field':
5805 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
5806 elif key == 'convert':
5807 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
5808 else:
5809 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
5810 propObj[key] = default
5811 return propObj[key]
5812
5813 def _resolve_field_value(self, field, value, convertNone=False):
5814 if value is None:
5815 if not convertNone:
5816 return None
5817 else:
5818 value = value.lower()
5819 conversion = self._get_field_setting(field, 'convert')
5820 if conversion == 'ignore':
5821 return None
5822 if conversion == 'string':
5823 return value
5824 elif conversion == 'float_none':
5825 return float_or_none(value)
5826 elif conversion == 'bytes':
5827 return parse_bytes(value)
5828 elif conversion == 'order':
5829 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
5830 use_regex = self._get_field_setting(field, 'regex')
5831 list_length = len(order_list)
5832 empty_pos = order_list.index('') if '' in order_list else list_length + 1
5833 if use_regex and value is not None:
5834 for i, regex in enumerate(order_list):
5835 if regex and re.match(regex, value):
5836 return list_length - i
5837 return list_length - empty_pos # not in list
5838 else: # not regex or value = None
5839 return list_length - (order_list.index(value) if value in order_list else empty_pos)
5840 else:
5841 if value.isnumeric():
5842 return float(value)
5843 else:
5844 self.settings[field]['convert'] = 'string'
5845 return value
5846
5847 def evaluate_params(self, params, sort_extractor):
5848 self._use_free_order = params.get('prefer_free_formats', False)
5849 self._sort_user = params.get('format_sort', [])
5850 self._sort_extractor = sort_extractor
5851
5852 def add_item(field, reverse, closest, limit_text):
5853 field = field.lower()
5854 if field in self._order:
5855 return
5856 self._order.append(field)
5857 limit = self._resolve_field_value(field, limit_text)
5858 data = {
5859 'reverse': reverse,
5860 'closest': False if limit is None else closest,
5861 'limit_text': limit_text,
5862 'limit': limit}
5863 if field in self.settings:
5864 self.settings[field].update(data)
5865 else:
5866 self.settings[field] = data
5867
5868 sort_list = (
5869 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5870 + (tuple() if params.get('format_sort_force', False)
5871 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5872 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5873
5874 for item in sort_list:
5875 match = re.match(self.regex, item)
5876 if match is None:
5877 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
5878 field = match.group('field')
5879 if field is None:
5880 continue
5881 if self._get_field_setting(field, 'type') == 'alias':
5882 alias, field = field, self._get_field_setting(field, 'field')
5883 if self._get_field_setting(alias, 'deprecated'):
5884 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5885 f'be removed in a future version. Please use {field} instead')
5886 reverse = match.group('reverse') is not None
5887 closest = match.group('separator') == '~'
5888 limit_text = match.group('limit')
5889
5890 has_limit = limit_text is not None
5891 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5892 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5893
5894 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
5895 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
5896 limit_count = len(limits)
5897 for (i, f) in enumerate(fields):
5898 add_item(f, reverse, closest,
5899 limits[i] if i < limit_count
5900 else limits[0] if has_limit and not has_multiple_limits
5901 else None)
5902
5903 def print_verbose_info(self, write_debug):
5904 if self._sort_user:
5905 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
5906 if self._sort_extractor:
5907 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
5908 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
5909 '+' if self._get_field_setting(field, 'reverse') else '', field,
5910 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
5911 self._get_field_setting(field, 'limit_text'),
5912 self._get_field_setting(field, 'limit'))
5913 if self._get_field_setting(field, 'limit_text') is not None else '')
5914 for field in self._order if self._get_field_setting(field, 'visible')]))
5915
5916 def _calculate_field_preference_from_value(self, format, field, type, value):
5917 reverse = self._get_field_setting(field, 'reverse')
5918 closest = self._get_field_setting(field, 'closest')
5919 limit = self._get_field_setting(field, 'limit')
5920
5921 if type == 'extractor':
5922 maximum = self._get_field_setting(field, 'max')
5923 if value is None or (maximum is not None and value >= maximum):
5924 value = -1
5925 elif type == 'boolean':
5926 in_list = self._get_field_setting(field, 'in_list')
5927 not_in_list = self._get_field_setting(field, 'not_in_list')
5928 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
5929 elif type == 'ordered':
5930 value = self._resolve_field_value(field, value, True)
5931
5932 # try to convert to number
5933 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
5934 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
5935 if is_num:
5936 value = val_num
5937
5938 return ((-10, 0) if value is None
5939 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
5940 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
5941 else (0, value, 0) if not reverse and (limit is None or value <= limit)
5942 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
5943 else (-1, value, 0))
5944
5945 def _calculate_field_preference(self, format, field):
5946 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
5947 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
5948 if type == 'multiple':
5949 type = 'field' # Only 'field' is allowed in multiple for now
5950 actual_fields = self._get_field_setting(field, 'field')
5951
5952 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
5953 else:
5954 value = get_value(field)
5955 return self._calculate_field_preference_from_value(format, field, type, value)
5956
5957 def calculate_preference(self, format):
5958 # Determine missing protocol
5959 if not format.get('protocol'):
5960 format['protocol'] = determine_protocol(format)
5961
5962 # Determine missing ext
5963 if not format.get('ext') and 'url' in format:
5964 format['ext'] = determine_ext(format['url'])
5965 if format.get('vcodec') == 'none':
5966 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
5967 format['video_ext'] = 'none'
5968 else:
5969 format['video_ext'] = format['ext']
5970 format['audio_ext'] = 'none'
5971 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
5972 # format['preference'] = -1000
5973
5424dbaf
L
5974 if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
5975 # HEVC-over-FLV is out of spec per FLV's original specification
5976 # ref. https://trac.ffmpeg.org/ticket/6389
5977 # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
5978 format['preference'] = -100
5979
d0d74b71 5980 # Determine missing bitrates
eedda525 5981 if format.get('vcodec') == 'none':
5982 format['vbr'] = 0
5983 if format.get('acodec') == 'none':
5984 format['abr'] = 0
5985 if not format.get('vbr') and format.get('vcodec') != 'none':
5986 format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
5987 if not format.get('abr') and format.get('acodec') != 'none':
5988 format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
5989 if not format.get('tbr'):
5990 format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None
d0d74b71 5991
5992 return tuple(self._calculate_field_preference(format, field) for field in self._order)