# yt_dlp/utils/_utils.py
6929b41a 1import asyncio
15dfb392 2import atexit
1e399778 3import base64
5bc880b9 4import binascii
912b38b4 5import calendar
676eb3f2 6import codecs
c380cc28 7import collections
ab029d7e 8import collections.abc
62e609ab 9import contextlib
c496ca96 10import datetime
0c265486 11import email.header
f8271158 12import email.utils
f45c185f 13import errno
d77c3dfd 14import gzip
15import hashlib
16import hmac
ac668111 17import html.entities
18import html.parser
54007a45 19import http.client
20import http.cookiejar
b1f94422 21import inspect
03f9daab 22import io
79a2e94e 23import itertools
f4bfd65f 24import json
d77c3dfd 25import locale
02dbf93f 26import math
f8271158 27import mimetypes
347de493 28import operator
d77c3dfd 29import os
c496ca96 30import platform
773f291d 31import random
d77c3dfd 32import re
f8271158 33import shlex
c496ca96 34import socket
79a2e94e 35import ssl
ac668111 36import struct
1c088fa8 37import subprocess
d77c3dfd 38import sys
181c8655 39import tempfile
c380cc28 40import time
01951dda 41import traceback
64fa820c 42import types
989a01c2 43import unicodedata
14f25df2 44import urllib.error
f8271158 45import urllib.parse
ac668111 46import urllib.request
bcf89ce6 47import xml.etree.ElementTree
d77c3dfd 48import zlib
d77c3dfd 49
69bec673 50from . import traversal
51
52from ..compat import functools # isort: split
53from ..compat import (
36e6f62c 54 compat_etree_fromstring,
51098426 55 compat_expanduser,
f8271158 56 compat_HTMLParseError,
efa97bdc 57 compat_os_name,
702ccf2d 58 compat_shlex_quote,
8c25f81b 59)
69bec673 60from ..dependencies import brotli, certifi, websockets, xattr
61from ..socks import ProxyType, sockssocket
51fb4995 62
46f1370e 63__name__ = __name__.rsplit('.', 1)[0] # Pretend to be the parent module
64
65# This is not clearly defined otherwise
66compiled_regex_type = type(re.compile(''))
67
68
69def random_user_agent():
70 _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
71 _CHROME_VERSIONS = (
19b4c74d 72 '90.0.4430.212',
73 '90.0.4430.24',
74 '90.0.4430.70',
75 '90.0.4430.72',
76 '90.0.4430.85',
77 '90.0.4430.93',
78 '91.0.4472.101',
79 '91.0.4472.106',
80 '91.0.4472.114',
81 '91.0.4472.124',
82 '91.0.4472.164',
83 '91.0.4472.19',
84 '91.0.4472.77',
85 '92.0.4515.107',
86 '92.0.4515.115',
87 '92.0.4515.131',
88 '92.0.4515.159',
89 '92.0.4515.43',
90 '93.0.4556.0',
91 '93.0.4577.15',
92 '93.0.4577.63',
93 '93.0.4577.82',
94 '94.0.4606.41',
95 '94.0.4606.54',
96 '94.0.4606.61',
97 '94.0.4606.71',
98 '94.0.4606.81',
99 '94.0.4606.85',
100 '95.0.4638.17',
101 '95.0.4638.50',
102 '95.0.4638.54',
103 '95.0.4638.69',
104 '95.0.4638.74',
105 '96.0.4664.18',
106 '96.0.4664.45',
107 '96.0.4664.55',
108 '96.0.4664.93',
109 '97.0.4692.20',
110 )
111 return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
112
113
4390d5ec 114SUPPORTED_ENCODINGS = [
115 'gzip', 'deflate'
116]
9b8ee23b 117if brotli:
4390d5ec 118 SUPPORTED_ENCODINGS.append('br')
119
3e669f36 120std_headers = {
f7a147e3 121 'User-Agent': random_user_agent(),
59ae15a5 122 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
59ae15a5 123 'Accept-Language': 'en-us,en;q=0.5',
b1156c1e 124 'Sec-Fetch-Mode': 'navigate',
3e669f36 125}
f427df17 126
5f6a1245 127
128USER_AGENTS = {
129 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
130}
131
132
4823ec9f 133class NO_DEFAULT:
134 pass
135
136
137def IDENTITY(x):
138 return x
139
bf42a990 140
141ENGLISH_MONTH_NAMES = [
142 'January', 'February', 'March', 'April', 'May', 'June',
143 'July', 'August', 'September', 'October', 'November', 'December']
144
145MONTH_NAMES = {
146 'en': ENGLISH_MONTH_NAMES,
147 'fr': [
148 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
149 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
78545664 150 # these follow the genitive grammatical case (dopełniacz)
151 # some websites might be using nominative, which will require another month list
152 # https://en.wikibooks.org/wiki/Polish/Noun_cases
153 'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
154 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
f6717dec 155}
a942d6cb 156
8f53dc44 157# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
158TIMEZONE_NAMES = {
159 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
160 'AST': -4, 'ADT': -3, # Atlantic (used in Canada)
161 'EST': -5, 'EDT': -4, # Eastern
162 'CST': -6, 'CDT': -5, # Central
163 'MST': -7, 'MDT': -6, # Mountain
164 'PST': -8, 'PDT': -7 # Pacific
165}
166
c587cbb7 167# needed for sanitizing filenames in restricted mode
c8827027 168ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
169 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
170 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 171
172DATE_FORMATS = (
173 '%d %B %Y',
174 '%d %b %Y',
175 '%B %d %Y',
176 '%B %dst %Y',
177 '%B %dnd %Y',
9d30c213 178 '%B %drd %Y',
cb655f34 179 '%B %dth %Y',
46f59e89 180 '%b %d %Y',
181 '%b %dst %Y',
182 '%b %dnd %Y',
9d30c213 183 '%b %drd %Y',
cb655f34 184 '%b %dth %Y',
185 '%b %dst %Y %I:%M',
186 '%b %dnd %Y %I:%M',
9d30c213 187 '%b %drd %Y %I:%M',
188 '%b %dth %Y %I:%M',
189 '%Y %m %d',
190 '%Y-%m-%d',
bccdbd22 191 '%Y.%m.%d.',
46f59e89 192 '%Y/%m/%d',
81c13222 193 '%Y/%m/%d %H:%M',
46f59e89 194 '%Y/%m/%d %H:%M:%S',
195 '%Y%m%d%H%M',
196 '%Y%m%d%H%M%S',
4f3fa23e 197 '%Y%m%d',
0c1c6f4b 198 '%Y-%m-%d %H:%M',
199 '%Y-%m-%d %H:%M:%S',
200 '%Y-%m-%d %H:%M:%S.%f',
5014558a 201 '%Y-%m-%d %H:%M:%S:%f',
202 '%d.%m.%Y %H:%M',
203 '%d.%m.%Y %H.%M',
204 '%Y-%m-%dT%H:%M:%SZ',
205 '%Y-%m-%dT%H:%M:%S.%fZ',
206 '%Y-%m-%dT%H:%M:%S.%f0Z',
207 '%Y-%m-%dT%H:%M:%S',
208 '%Y-%m-%dT%H:%M:%S.%f',
209 '%Y-%m-%dT%H:%M',
210 '%b %d %Y at %H:%M',
211 '%b %d %Y at %H:%M:%S',
212 '%B %d %Y at %H:%M',
213 '%B %d %Y at %H:%M:%S',
a63d9bd0 214 '%H:%M %d-%b-%Y',
215)
216
217DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
218DATE_FORMATS_DAY_FIRST.extend([
219 '%d-%m-%Y',
220 '%d.%m.%Y',
221 '%d.%m.%y',
222 '%d/%m/%Y',
223 '%d/%m/%y',
224 '%d/%m/%Y %H:%M:%S',
47304e07 225 '%d-%m-%Y %H:%M',
226])
227
228DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
229DATE_FORMATS_MONTH_FIRST.extend([
230 '%m-%d-%Y',
231 '%m.%d.%Y',
232 '%m/%d/%Y',
233 '%m/%d/%y',
234 '%m/%d/%Y %H:%M:%S',
235])
236
06b3fe29 237PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
0f60ba6e 238JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
06b3fe29 239
1d485a1a 240NUMBER_RE = r'\d+(?:\.\d+)?'
241
7105440c 242
0b9c08b4 243@functools.cache
d77c3dfd 244def preferredencoding():
59ae15a5 245 """Get preferred encoding.
d77c3dfd 246
247 Returns the best encoding scheme for the system, based on
248 locale.getpreferredencoding() and some further tweaks.
249 """
250 try:
251 pref = locale.getpreferredencoding()
28e614de 252 'TEST'.encode(pref)
70a1165b 253 except Exception:
59ae15a5 254 pref = 'UTF-8'
bae611f2 255
59ae15a5 256 return pref
d77c3dfd 257
f4bfd65f 258
181c8655 259def write_json_file(obj, fn):
1394646a 260 """ Encode obj as JSON and write it to fn, atomically if possible """
181c8655 261
cfb0511d 262 tf = tempfile.NamedTemporaryFile(
263 prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
264 suffix='.tmp', delete=False, mode='w', encoding='utf-8')
265
266 try:
267 with tf:
45d86abe 268 json.dump(obj, tf, ensure_ascii=False)
269 if sys.platform == 'win32':
270 # Need to remove existing file on Windows, else os.rename raises
271 # WindowsError or FileExistsError.
19a03940 272 with contextlib.suppress(OSError):
1394646a 273 os.unlink(fn)
19a03940 274 with contextlib.suppress(OSError):
275 mask = os.umask(0)
276 os.umask(mask)
277 os.chmod(tf.name, 0o666 & ~mask)
181c8655 278 os.rename(tf.name, fn)
70a1165b 279 except Exception:
19a03940 280 with contextlib.suppress(OSError):
181c8655 281 os.remove(tf.name)
282 raise
283
284
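# Illustrative usage sketch (the path below is hypothetical): the object is
# serialized into a temporary file beside the destination and then renamed
# over it, so readers never observe a partially written file.
#   write_json_file({'id': 'abc', 'title': 'Example'}, 'example.info.json')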
cfb0511d 285def find_xpath_attr(node, xpath, key, val=None):
286 """ Find the xpath xpath[@key=val] """
287 assert re.match(r'^[a-zA-Z_-]+$', key)
86e5f3ed 288 expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
cfb0511d 289 return node.find(expr)
59ae56fa 290
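# Illustrative sketch (element and attribute names are made up):
#   >>> doc = xml.etree.ElementTree.fromstring('<root><a id="x">hi</a></root>')
#   >>> find_xpath_attr(doc, './/a', 'id', 'x').text
#   'hi'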
291# On python2.6 the xml.etree.ElementTree.Element methods don't support
292# the namespace parameter
293
294
295def xpath_with_ns(path, ns_map):
296 components = [c.split(':') for c in path.split('/')]
297 replaced = []
298 for c in components:
299 if len(c) == 1:
300 replaced.append(c[0])
301 else:
302 ns, tag = c
303 replaced.append('{%s}%s' % (ns_map[ns], tag))
304 return '/'.join(replaced)
305
d77c3dfd 306
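# Illustrative sketch: 'prefix:tag' components are expanded with the supplied
# namespace map (the URI below is a placeholder):
#   >>> xpath_with_ns('media:song/media:title', {'media': 'http://example.com/ns'})
#   '{http://example.com/ns}song/{http://example.com/ns}title'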
a41fb80c 307def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
578c0745 308 def _find_xpath(xpath):
f9934b96 309 return node.find(xpath)
578c0745 310
14f25df2 311 if isinstance(xpath, str):
312 n = _find_xpath(xpath)
313 else:
314 for xp in xpath:
315 n = _find_xpath(xp)
316 if n is not None:
317 break
d74bebd5 318
8e636da4 319 if n is None:
320 if default is not NO_DEFAULT:
321 return default
322 elif fatal:
323 name = xpath if name is None else name
324 raise ExtractorError('Could not find XML element %s' % name)
325 else:
326 return None
327 return n
328
329
330def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
331 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
332 if n is None or n == default:
333 return n
334 if n.text is None:
335 if default is not NO_DEFAULT:
336 return default
337 elif fatal:
338 name = xpath if name is None else name
339 raise ExtractorError('Could not find XML element\'s text %s' % name)
340 else:
341 return None
342 return n.text
343
344
345def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
346 n = find_xpath_attr(node, xpath, key)
347 if n is None:
348 if default is not NO_DEFAULT:
349 return default
350 elif fatal:
86e5f3ed 351 name = f'{xpath}[@{key}]' if name is None else name
352 raise ExtractorError('Could not find XML attribute %s' % name)
353 else:
354 return None
355 return n.attrib[key]
356
357
c487cf00 358def get_element_by_id(id, html, **kwargs):
43e8fafd 359 """Return the content of the tag with the specified ID in the passed HTML document"""
c487cf00 360 return get_element_by_attribute('id', id, html, **kwargs)
43e8fafd 361
12ea2f30 362
c487cf00 363def get_element_html_by_id(id, html, **kwargs):
6f32a0b5 364 """Return the html of the tag with the specified ID in the passed HTML document"""
c487cf00 365 return get_element_html_by_attribute('id', id, html, **kwargs)
366
367
84c237fb 368def get_element_by_class(class_name, html):
369 """Return the content of the first tag with the specified class in the passed HTML document"""
370 retval = get_elements_by_class(class_name, html)
371 return retval[0] if retval else None
372
373
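# Illustrative sketch (markup is made up): the class helpers match a whole
# class name inside the attribute value and return the decoded content or the
# complete element, respectively:
#   >>> get_element_by_class('title', '<div class="title">Foo</div>')
#   'Foo'
#   >>> get_element_html_by_class('title', '<div class="title">Foo</div>')
#   '<div class="title">Foo</div>'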
374def get_element_html_by_class(class_name, html):
375 """Return the html of the first tag with the specified class in the passed HTML document"""
376 retval = get_elements_html_by_class(class_name, html)
377 return retval[0] if retval else None
378
379
c487cf00 380def get_element_by_attribute(attribute, value, html, **kwargs):
381 retval = get_elements_by_attribute(attribute, value, html, **kwargs)
382 return retval[0] if retval else None
383
384
c487cf00 385def get_element_html_by_attribute(attribute, value, html, **kargs):
386 retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
387 return retval[0] if retval else None
388
389
c487cf00 390def get_elements_by_class(class_name, html, **kargs):
391 """Return the content of all tags with the specified class in the passed HTML document as a list"""
392 return get_elements_by_attribute(
64fa820c 393 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
394 html, escape_value=False)
395
396
397def get_elements_html_by_class(class_name, html):
398 """Return the html of all tags with the specified class in the passed HTML document as a list"""
399 return get_elements_html_by_attribute(
64fa820c 400 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
401 html, escape_value=False)
402
403
404def get_elements_by_attribute(*args, **kwargs):
43e8fafd 405 """Return the content of all tags with the specified attribute in the passed HTML document as a list"""
406 return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
407
408
409def get_elements_html_by_attribute(*args, **kwargs):
410 """Return the html of the tag with the specified attribute in the passed HTML document"""
411 return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
412
413
4c9a1a3b 414def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
415 """
416 Return the text (content) and the html (whole) of the tag with the specified
417 attribute in the passed HTML document
418 """
419 if not value:
420 return
9e6dd238 421
86e5f3ed 422 quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
0254f162 423
424 value = re.escape(value) if escape_value else value
425
86e5f3ed 426 partial_element_re = rf'''(?x)
4c9a1a3b 427 <(?P<tag>{tag})
0254f162 428 (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
86e5f3ed 429 \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
430 '''
38285056 431
432 for m in re.finditer(partial_element_re, html):
433 content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
a921f407 434
435 yield (
436 unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
437 whole
438 )
a921f407 439
c5229f39 440
ac668111 441class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
442 """
443 HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
444 closing tag for the first opening tag it has encountered, and can be used
445 as a context manager
446 """
447
448 class HTMLBreakOnClosingTagException(Exception):
449 pass
450
451 def __init__(self):
452 self.tagstack = collections.deque()
ac668111 453 html.parser.HTMLParser.__init__(self)
454
455 def __enter__(self):
456 return self
457
458 def __exit__(self, *_):
459 self.close()
460
461 def close(self):
462 # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
463 # so data remains buffered; we no longer have any interest in it, thus
464 # override this method to discard it
465 pass
466
467 def handle_starttag(self, tag, _):
468 self.tagstack.append(tag)
469
470 def handle_endtag(self, tag):
471 if not self.tagstack:
472 raise compat_HTMLParseError('no tags in the stack')
473 while self.tagstack:
474 inner_tag = self.tagstack.pop()
475 if inner_tag == tag:
476 break
477 else:
478 raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
479 if not self.tagstack:
480 raise self.HTMLBreakOnClosingTagException()
481
482
46d09f87 483# XXX: This should be far less strict
484def get_element_text_and_html_by_tag(tag, html):
485 """
486 For the first element with the specified tag in the passed HTML document
487 return its content (text) and the whole element (html)
488 """
489 def find_or_raise(haystack, needle, exc):
490 try:
491 return haystack.index(needle)
492 except ValueError:
493 raise exc
494 closing_tag = f'</{tag}>'
495 whole_start = find_or_raise(
496 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
497 content_start = find_or_raise(
498 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
499 content_start += whole_start + 1
500 with HTMLBreakOnClosingTagParser() as parser:
501 parser.feed(html[whole_start:content_start])
502 if not parser.tagstack or parser.tagstack[0] != tag:
503 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
504 offset = content_start
505 while offset < len(html):
506 next_closing_tag_start = find_or_raise(
507 html[offset:], closing_tag,
508 compat_HTMLParseError(f'closing {tag} tag not found'))
509 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
510 try:
511 parser.feed(html[offset:offset + next_closing_tag_end])
512 offset += next_closing_tag_end
513 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
514 return html[content_start:offset + next_closing_tag_start], \
515 html[whole_start:offset + next_closing_tag_end]
516 raise compat_HTMLParseError('unexpected end of html')
517
518
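# Illustrative sketch (markup is made up): the parser counts nested tags of the
# same name, so only the matching closing tag terminates the element:
#   >>> get_element_text_and_html_by_tag('span', '<p><span>a</span></p>')
#   ('a', '<span>a</span>')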
ac668111 519class HTMLAttributeParser(html.parser.HTMLParser):
8bb56eee 520 """Trivial HTML parser to gather the attributes for a single element"""
b6e0c7d2 521
8bb56eee 522 def __init__(self):
c5229f39 523 self.attrs = {}
ac668111 524 html.parser.HTMLParser.__init__(self)
525
526 def handle_starttag(self, tag, attrs):
527 self.attrs = dict(attrs)
7053aa3a 528 raise compat_HTMLParseError('done')
8bb56eee 529
c5229f39 530
ac668111 531class HTMLListAttrsParser(html.parser.HTMLParser):
532 """HTML parser to gather the attributes for the elements of a list"""
533
534 def __init__(self):
ac668111 535 html.parser.HTMLParser.__init__(self)
536 self.items = []
537 self._level = 0
538
539 def handle_starttag(self, tag, attrs):
540 if tag == 'li' and self._level == 0:
541 self.items.append(dict(attrs))
542 self._level += 1
543
544 def handle_endtag(self, tag):
545 self._level -= 1
546
547
548def extract_attributes(html_element):
549 """Given a string for an HTML element such as
550 <el
551 a="foo" B="bar" c="&#98;az" d=boz
552 empty= noval entity="&amp;"
553 sq='"' dq="'"
554 >
555 Decode and return a dictionary of attributes.
556 {
557 'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
558 'empty': '', 'noval': None, 'entity': '&',
559 'sq': '"', 'dq': '\''
560 }.
561 """
562 parser = HTMLAttributeParser()
19a03940 563 with contextlib.suppress(compat_HTMLParseError):
564 parser.feed(html_element)
565 parser.close()
8bb56eee 566 return parser.attrs
9e6dd238 567
c5229f39 568
569def parse_list(webpage):
570 """Given a string for an series of HTML <li> elements,
571 return a dictionary of their attributes"""
572 parser = HTMLListAttrsParser()
573 parser.feed(webpage)
574 parser.close()
575 return parser.items
576
577
9e6dd238 578def clean_html(html):
59ae15a5 579 """Clean an HTML snippet into a readable string"""
580
581 if html is None: # Convenience for sanitizing descriptions etc.
582 return html
583
49185227 584 html = re.sub(r'\s+', ' ', html)
585 html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
586 html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
587 # Strip html tags
588 html = re.sub('<.*?>', '', html)
589 # Replace html entities
590 html = unescapeHTML(html)
7decf895 591 return html.strip()
592
593
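# Illustrative sketch (markup is made up): whitespace is collapsed, <br> and
# </p><p> become newlines, remaining tags are stripped and entities decoded:
#   >>> clean_html('<b>foo</b> <br/>bar &amp; baz')
#   'foo\nbar & baz'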
b7c47b74 594class LenientJSONDecoder(json.JSONDecoder):
cc090836 595 # TODO: Write tests
596 def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
b7c47b74 597 self.transform_source, self.ignore_extra = transform_source, ignore_extra
cc090836 598 self._close_attempts = 2 * close_objects
b7c47b74 599 super().__init__(*args, **kwargs)
600
cc090836 601 @staticmethod
602 def _close_object(err):
603 doc = err.doc[:err.pos]
604 # We need to add comma first to get the correct error message
605 if err.msg.startswith('Expecting \',\''):
606 return doc + ','
607 elif not doc.endswith(','):
608 return
609
610 if err.msg.startswith('Expecting property name'):
611 return doc[:-1] + '}'
612 elif err.msg.startswith('Expecting value'):
613 return doc[:-1] + ']'
614
b7c47b74 615 def decode(self, s):
616 if self.transform_source:
617 s = self.transform_source(s)
cc090836 618 for attempt in range(self._close_attempts + 1):
619 try:
620 if self.ignore_extra:
621 return self.raw_decode(s.lstrip())[0]
622 return super().decode(s)
623 except json.JSONDecodeError as e:
624 if e.pos is None:
625 raise
626 elif attempt < self._close_attempts:
627 s = self._close_object(e)
628 if s is not None:
629 continue
2fa669f7 630 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
cc090836 631 assert False, 'Too many attempts to decode JSON'
b7c47b74 632
633
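# Illustrative sketch (input is made up): with ignore_extra=True anything after
# the first JSON value is ignored; close_objects=N additionally lets the
# decoder try to close up to N unterminated objects/arrays before giving up.
#   >>> LenientJSONDecoder(ignore_extra=True).decode('{"a": 1} trailing garbage')
#   {'a': 1}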
d77c3dfd 634def sanitize_open(filename, open_mode):
635 """Try to open the given filename, and slightly tweak it if this fails.
636
637 Attempts to open the given filename. If this fails, it tries to change
638 the filename slightly, step by step, until it's either able to open it
639 or it fails and raises a final exception, like the standard open()
640 function.
641
642 It returns the tuple (stream, definitive_file_name).
643 """
0edb3e33 644 if filename == '-':
645 if sys.platform == 'win32':
646 import msvcrt
be5c1ae8 647
62b58c09 648 # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
daef7911 649 with contextlib.suppress(io.UnsupportedOperation):
650 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
0edb3e33 651 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
59ae15a5 652
0edb3e33 653 for attempt in range(2):
654 try:
655 try:
89737671 656 if sys.platform == 'win32':
b506289f 657 # FIXME: An exclusive lock also locks the file from being read.
658 # Since Windows locks are mandatory, don't lock the file on Windows (for now).
659 # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
89737671 660 raise LockingUnsupportedError()
0edb3e33 661 stream = locked_file(filename, open_mode, block=False).__enter__()
8a82af35 662 except OSError:
0edb3e33 663 stream = open(filename, open_mode)
8a82af35 664 return stream, filename
86e5f3ed 665 except OSError as err:
0edb3e33 666 if attempt or err.errno in (errno.EACCES,):
667 raise
668 old_filename, filename = filename, sanitize_path(filename)
669 if old_filename == filename:
670 raise
671
672
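# Illustrative usage sketch (the filename is hypothetical): '-' maps to stdout;
# any other name is opened as-is first and only sanitized if that attempt fails.
#   stream, actual_filename = sanitize_open('output.mp4.part', 'wb')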
673def timeconvert(timestr):
674 """Convert RFC 2822 defined time string into system timestamp"""
675 timestamp = None
676 timetuple = email.utils.parsedate_tz(timestr)
677 if timetuple is not None:
678 timestamp = email.utils.mktime_tz(timetuple)
679 return timestamp
1c469a94 680
5f6a1245 681
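# Illustrative sketch: an RFC 2822 date becomes a POSIX timestamp, and None is
# returned when the string cannot be parsed:
#   >>> timeconvert('Mon, 05 Jun 2023 10:00:00 +0000')
#   1685959200
#   >>> timeconvert('not a date') is None
#   True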
5c3895ff 682def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
59ae15a5 683 """Sanitizes a string so it could be used as part of a filename.
5c3895ff 684 @param restricted Use a stricter subset of allowed characters
685 @param is_id Whether this is an ID that should be kept unchanged if possible.
686 If unset, yt-dlp's new sanitization rules are in effect
59ae15a5 687 """
5c3895ff 688 if s == '':
689 return ''
690
59ae15a5 691 def replace_insane(char):
692 if restricted and char in ACCENT_CHARS:
693 return ACCENT_CHARS[char]
91dd88b9 694 elif not restricted and char == '\n':
5c3895ff 695 return '\0 '
989a01c2 696 elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
697 # Replace with their full-width unicode counterparts
698 return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
91dd88b9 699 elif char == '?' or ord(char) < 32 or ord(char) == 127:
700 return ''
701 elif char == '"':
702 return '' if restricted else '\''
703 elif char == ':':
5c3895ff 704 return '\0_\0-' if restricted else '\0 \0-'
59ae15a5 705 elif char in '\\/|*<>':
5c3895ff 706 return '\0_'
707 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
708 return '\0_'
709 return char
710
db4678e4 711 # Replace look-alike Unicode glyphs
712 if restricted and (is_id is NO_DEFAULT or not is_id):
989a01c2 713 s = unicodedata.normalize('NFKC', s)
5c3895ff 714 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
28e614de 715 result = ''.join(map(replace_insane, s))
5c3895ff 716 if is_id is NO_DEFAULT:
ae61d108 717 result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result) # Remove repeated substitute chars
718 STRIP_RE = r'(?:\0.|[ _-])*'
5c3895ff 719 result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
720 result = result.replace('\0', '') or '_'
721
796173d0
PH
722 if not is_id:
723 while '__' in result:
724 result = result.replace('__', '_')
725 result = result.strip('_')
726 # Common case of "Foreign band name - English song title"
727 if restricted and result.startswith('-_'):
728 result = result[2:]
729 if result.startswith('-'):
730 result = '_' + result[len('-'):]
a7440261 731 result = result.lstrip('.')
732 if not result:
733 result = '_'
59ae15a5 734 return result
d77c3dfd 735
5f6a1245 736
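# Illustrative sketch (inputs are made up): in restricted mode problematic
# characters are replaced with ASCII-safe substitutes:
#   >>> sanitize_filename('foo: bar', restricted=True)
#   'foo_-_bar'
#   >>> sanitize_filename('foo/bar', restricted=True)
#   'foo_bar'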
c2934512 737def sanitize_path(s, force=False):
a2aaf4db 738 """Sanitizes and normalizes path on Windows"""
c2934512 739 if sys.platform == 'win32':
c4218ac3 740 force = False
c2934512 741 drive_or_unc, _ = os.path.splitdrive(s)
c2934512 742 elif force:
743 drive_or_unc = ''
744 else:
a2aaf4db 745 return s
c2934512 746
747 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
748 if drive_or_unc:
749 norm_path.pop(0)
750 sanitized_path = [
ec85ded8 751 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
a2aaf4db 752 for path_part in norm_path]
753 if drive_or_unc:
754 sanitized_path.insert(0, drive_or_unc + os.path.sep)
4abea8ca 755 elif force and s and s[0] == os.path.sep:
c4218ac3 756 sanitized_path.insert(0, os.path.sep)
757 return os.path.join(*sanitized_path)
758
759
8f97a15d 760def sanitize_url(url, *, scheme='http'):
761 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
762 # the number of unwanted failures due to missing protocol
21633673 763 if url is None:
764 return
765 elif url.startswith('//'):
8f97a15d 766 return f'{scheme}:{url}'
767 # Fix some common typos seen so far
768 COMMON_TYPOS = (
067aa17e 769 # https://github.com/ytdl-org/youtube-dl/issues/15649
770 (r'^httpss://', r'https://'),
771 # https://bx1.be/lives/direct-tv/
772 (r'^rmtp([es]?)://', r'rtmp\1://'),
773 )
774 for mistake, fixup in COMMON_TYPOS:
775 if re.match(mistake, url):
776 return re.sub(mistake, fixup, url)
bc6b9bcd 777 return url
778
779
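# Illustrative sketch (URLs are made up): protocol-less URLs get a scheme
# prepended and a few common scheme typos are corrected:
#   >>> sanitize_url('//example.com/watch')
#   'http://example.com/watch'
#   >>> sanitize_url('httpss://example.com/watch')
#   'https://example.com/watch'
#   >>> sanitize_url('rmtp://example.com/live')
#   'rtmp://example.com/live'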
5435dcf9 780def extract_basic_auth(url):
14f25df2 781 parts = urllib.parse.urlsplit(url)
782 if parts.username is None:
783 return url, None
14f25df2 784 url = urllib.parse.urlunsplit(parts._replace(netloc=(
785 parts.hostname if parts.port is None
786 else '%s:%d' % (parts.hostname, parts.port))))
787 auth_payload = base64.b64encode(
0f06bcd7 788 ('%s:%s' % (parts.username, parts.password or '')).encode())
789 return url, f'Basic {auth_payload.decode()}'
790
791
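# Illustrative sketch (credentials are made up): userinfo embedded in the URL
# is stripped out and returned as a ready-made Authorization header value:
#   >>> extract_basic_auth('http://user:pass@example.com/feed')
#   ('http://example.com/feed', 'Basic dXNlcjpwYXNz')
#   >>> extract_basic_auth('http://example.com/feed')
#   ('http://example.com/feed', None)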
67dda517 792def sanitized_Request(url, *args, **kwargs):
bc6b9bcd 793 url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
794 if auth_header is not None:
795 headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
796 headers['Authorization'] = auth_header
ac668111 797 return urllib.request.Request(url, *args, **kwargs)
798
799
51098426 800def expand_path(s):
2fa669f7 801 """Expand shell variables and ~"""
802 return os.path.expandvars(compat_expanduser(s))
803
804
7e9a6125 805def orderedSet(iterable, *, lazy=False):
806 """Remove all duplicates from the input iterable"""
807 def _iter():
808 seen = [] # Do not use set since the items can be unhashable
809 for x in iterable:
810 if x not in seen:
811 seen.append(x)
812 yield x
813
814 return _iter() if lazy else list(_iter())
d77c3dfd 815
912b38b4 816
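# Illustrative sketch: the first occurrence of each item is kept and order is
# preserved; with lazy=True a generator is returned instead of a list:
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]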
55b2f099 817def _htmlentity_transform(entity_with_semicolon):
4e408e47 818 """Transforms an HTML entity to a character."""
819 entity = entity_with_semicolon[:-1]
820
4e408e47 821 # Known non-numeric HTML entity
ac668111 822 if entity in html.entities.name2codepoint:
823 return chr(html.entities.name2codepoint[entity])
4e408e47 824
825 # TODO: HTML5 allows entities without a semicolon.
826 # E.g. '&Eacuteric' should be decoded as 'Éric'.
ac668111 827 if entity_with_semicolon in html.entities.html5:
828 return html.entities.html5[entity_with_semicolon]
55b2f099 829
91757b0f 830 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
831 if mobj is not None:
832 numstr = mobj.group(1)
28e614de 833 if numstr.startswith('x'):
4e408e47 834 base = 16
28e614de 835 numstr = '0%s' % numstr
836 else:
837 base = 10
067aa17e 838 # See https://github.com/ytdl-org/youtube-dl/issues/7518
19a03940 839 with contextlib.suppress(ValueError):
ac668111 840 return chr(int(numstr, base))
841
842 # Unknown entity in name, return its literal representation
7a3f0c00 843 return '&%s;' % entity
844
845
d77c3dfd 846def unescapeHTML(s):
847 if s is None:
848 return None
19a03940 849 assert isinstance(s, str)
d77c3dfd 850
4e408e47 851 return re.sub(
95f3f7c2 852 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 853
8bf48f23 854
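# Illustrative sketch: named and numeric character references are decoded,
# while unknown entities are left untouched:
#   >>> unescapeHTML('&eacute;l&#232;ve &amp; &unknownentity;')
#   'élève & &unknownentity;'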
cdb19aa4 855def escapeHTML(text):
856 return (
857 text
858 .replace('&', '&amp;')
859 .replace('<', '&lt;')
860 .replace('>', '&gt;')
861 .replace('"', '&quot;')
862 .replace("'", '&#39;')
863 )
864
865
f5b1bca9 866def process_communicate_or_kill(p, *args, **kwargs):
da4db748 867 deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
868 f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
8a82af35 869 return Popen.communicate_or_kill(p, *args, **kwargs)
f5b1bca9 870
871
d3c93ec2 872class Popen(subprocess.Popen):
873 if sys.platform == 'win32':
874 _startupinfo = subprocess.STARTUPINFO()
875 _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
876 else:
877 _startupinfo = None
878
879 @staticmethod
880 def _fix_pyinstaller_ld_path(env):
881 """Restore LD_LIBRARY_PATH when using PyInstaller
882 Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
883 https://github.com/yt-dlp/yt-dlp/issues/4573
884 """
885 if not hasattr(sys, '_MEIPASS'):
886 return
887
888 def _fix(key):
889 orig = env.get(f'{key}_ORIG')
890 if orig is None:
891 env.pop(key, None)
892 else:
893 env[key] = orig
894
895 _fix('LD_LIBRARY_PATH') # Linux
896 _fix('DYLD_LIBRARY_PATH') # macOS
897
898 def __init__(self, *args, env=None, text=False, **kwargs):
899 if env is None:
900 env = os.environ.copy()
901 self._fix_pyinstaller_ld_path(env)
902
da8e2912 903 self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
f0c9fb96 904 if text is True:
905 kwargs['universal_newlines'] = True # For 3.6 compatibility
906 kwargs.setdefault('encoding', 'utf-8')
907 kwargs.setdefault('errors', 'replace')
82ea226c 908 super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
d3c93ec2 909
910 def communicate_or_kill(self, *args, **kwargs):
8a82af35 911 try:
912 return self.communicate(*args, **kwargs)
913 except BaseException: # Including KeyboardInterrupt
f0c9fb96 914 self.kill(timeout=None)
8a82af35 915 raise
d3c93ec2 916
f0c9fb96 917 def kill(self, *, timeout=0):
918 super().kill()
919 if timeout != 0:
920 self.wait(timeout=timeout)
921
922 @classmethod
992dc6b4 923 def run(cls, *args, timeout=None, **kwargs):
f0c9fb96 924 with cls(*args, **kwargs) as proc:
da8e2912 925 default = '' if proc.__text_mode else b''
992dc6b4 926 stdout, stderr = proc.communicate_or_kill(timeout=timeout)
914491b8 927 return stdout or default, stderr or default, proc.returncode
f0c9fb96 928
d3c93ec2 929
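# Illustrative usage sketch (the command is made up): Popen.run() wraps
# subprocess with the window-hiding and PyInstaller library-path fixes above
# and returns (stdout, stderr, returncode), killing the process if interrupted.
#   stdout, stderr, returncode = Popen.run(
#       ['ffmpeg', '-version'], text=True,
#       stdout=subprocess.PIPE, stderr=subprocess.PIPE)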
f07b74fc 930def encodeArgument(s):
cfb0511d 931 # Legacy code that uses byte strings
932 # Uncomment the following line after fixing all post processors
14f25df2 933 # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
cfb0511d 934 return s if isinstance(s, str) else s.decode('ascii')
935
936
aa7785f8 937_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
938
939
940def timetuple_from_msec(msec):
941 secs, msec = divmod(msec, 1000)
942 mins, secs = divmod(secs, 60)
943 hrs, mins = divmod(mins, 60)
944 return _timetuple(hrs, mins, secs, msec)
945
946
cdb19aa4 947def formatSeconds(secs, delim=':', msec=False):
aa7785f8 948 time = timetuple_from_msec(secs * 1000)
949 if time.hours:
950 ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
951 elif time.minutes:
952 ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
4539dd30 953 else:
aa7785f8 954 ret = '%d' % time.seconds
955 return '%s.%03d' % (ret, time.milliseconds) if msec else ret
4539dd30 956
a0ddb8a2 957
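# Illustrative sketch: milliseconds are split into an (hours, minutes, seconds,
# milliseconds) tuple, which formatSeconds() then renders:
#   >>> timetuple_from_msec(3723500)
#   Time(hours=1, minutes=2, seconds=3, milliseconds=500)
#   >>> formatSeconds(3723, msec=True)
#   '1:02:03.000'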
77562778 958def _ssl_load_windows_store_certs(ssl_context, storename):
959 # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
960 try:
961 certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
962 if encoding == 'x509_asn' and (
963 trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
964 except PermissionError:
965 return
966 for cert in certs:
19a03940 967 with contextlib.suppress(ssl.SSLError):
77562778 968 ssl_context.load_verify_locations(cadata=cert)
a2366922 969
77562778 970
971def make_HTTPS_handler(params, **kwargs):
972 opts_check_certificate = not params.get('nocheckcertificate')
973 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
974 context.check_hostname = opts_check_certificate
f81c62a6 975 if params.get('legacyserverconnect'):
976 context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT
4f28b537 977 # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
978 context.set_ciphers('DEFAULT')
979 elif (
980 sys.version_info < (3, 10)
981 and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
982 and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
983 ):
984 # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
985 # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
986 # in some situations [2][3].
987 # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
988 # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
ac8e69dd 989 # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
990 # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
991 # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
992 # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
993 # 4. https://peps.python.org/pep-0644/
994 # 5. https://peps.python.org/pep-0644/#libressl-support
995 # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
996 context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
997 context.minimum_version = ssl.TLSVersion.TLSv1_2
8a82af35 998
77562778 999 context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
1000 if opts_check_certificate:
69bec673 1001 if certifi and 'no-certifi' not in params.get('compat_opts', []):
d5820461 1002 context.load_verify_locations(cafile=certifi.where())
168bbc4f 1003 else:
1004 try:
1005 context.load_default_certs()
1006 # Work around the issue in load_default_certs when there are bad certificates. See:
1007 # https://github.com/yt-dlp/yt-dlp/issues/1060,
1008 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
1009 except ssl.SSLError:
1010 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
1011 if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
1012 for storename in ('CA', 'ROOT'):
1013 _ssl_load_windows_store_certs(context, storename)
1014 context.set_default_verify_paths()
8a82af35 1015
bb58c9ed 1016 client_certfile = params.get('client_certificate')
1017 if client_certfile:
1018 try:
1019 context.load_cert_chain(
1020 client_certfile, keyfile=params.get('client_certificate_key'),
1021 password=params.get('client_certificate_password'))
1022 except ssl.SSLError:
1023 raise YoutubeDLError('Unable to load client certificate')
2c6dcb65 1024
1025 # Some servers may reject requests if ALPN extension is not sent. See:
1026 # https://github.com/python/cpython/issues/85140
1027 # https://github.com/yt-dlp/yt-dlp/issues/3878
1028 with contextlib.suppress(NotImplementedError):
1029 context.set_alpn_protocols(['http/1.1'])
1030
77562778 1031 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 1032
732ea2f0 1033
5873d4cc 1034def bug_reports_message(before=';'):
69bec673 1035 from ..update import REPOSITORY
57e0f077 1036
1037 msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
1038 'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')
1039
1040 before = before.rstrip()
1041 if not before or before.endswith(('.', '!', '?')):
1042 msg = msg[0].title() + msg[1:]
1043
1044 return (before + ' ' if before else '') + msg
1045
1046
1047class YoutubeDLError(Exception):
1048 """Base exception for YoutubeDL errors."""
aa9369a2 1049 msg = None
1050
1051 def __init__(self, msg=None):
1052 if msg is not None:
1053 self.msg = msg
1054 elif self.msg is None:
1055 self.msg = type(self).__name__
1056 super().__init__(self.msg)
1057
1058
ac668111 1059network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
3158150c 1060if hasattr(ssl, 'CertificateError'):
1061 network_exceptions.append(ssl.CertificateError)
1062network_exceptions = tuple(network_exceptions)
1063
1064
bf5b9d85 1065class ExtractorError(YoutubeDLError):
1c256f70 1066 """Error during info extraction."""
5f6a1245 1067
1151c407 1068 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
9a82b238 1069 """ tb, if given, is the original traceback (so that it can be printed out).
7a5c1cfe 1070 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
9a82b238 1071 """
3158150c 1072 if sys.exc_info()[0] in network_exceptions:
9a82b238 1073 expected = True
d5979c5d 1074
7265a219 1075 self.orig_msg = str(msg)
1c256f70 1076 self.traceback = tb
1151c407 1077 self.expected = expected
2eabb802 1078 self.cause = cause
d11271dd 1079 self.video_id = video_id
1151c407 1080 self.ie = ie
1081 self.exc_info = sys.exc_info() # preserve original exception
5df14442 1082 if isinstance(self.exc_info[1], ExtractorError):
1083 self.exc_info = self.exc_info[1].exc_info
9bcfe33b 1084 super().__init__(self.__msg)
1151c407 1085
9bcfe33b 1086 @property
1087 def __msg(self):
1088 return ''.join((
1089 format_field(self.ie, None, '[%s] '),
1090 format_field(self.video_id, None, '%s: '),
1091 self.orig_msg,
1092 format_field(self.cause, None, ' (caused by %r)'),
1093 '' if self.expected else bug_reports_message()))
1c256f70 1094
01951dda 1095 def format_traceback(self):
497d2fab 1096 return join_nonempty(
1097 self.traceback and ''.join(traceback.format_tb(self.traceback)),
e491d06d 1098 self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
497d2fab 1099 delim='\n') or None
01951dda 1100
9bcfe33b 1101 def __setattr__(self, name, value):
1102 super().__setattr__(name, value)
1103 if getattr(self, 'msg', None) and name not in ('msg', 'args'):
1104 self.msg = self.__msg or type(self).__name__
1105 self.args = (self.msg, ) # Cannot be property
1106
1c256f70 1107
1108class UnsupportedError(ExtractorError):
1109 def __init__(self, url):
86e5f3ed 1110 super().__init__(
1111 'Unsupported URL: %s' % url, expected=True)
1112 self.url = url
1113
1114
1115class RegexNotFoundError(ExtractorError):
1116 """Error when a regex didn't match"""
1117 pass
1118
1119
1120class GeoRestrictedError(ExtractorError):
1121 """Geographic restriction Error exception.
1122
1123 This exception may be thrown when a video is not available from your
1124 geographic location due to geographic restrictions imposed by a website.
1125 """
b6e0c7d2 1126
0db3bae8 1127 def __init__(self, msg, countries=None, **kwargs):
1128 kwargs['expected'] = True
86e5f3ed 1129 super().__init__(msg, **kwargs)
1130 self.countries = countries
1131
1132
693f0600 1133class UserNotLive(ExtractorError):
1134 """Error when a channel/user is not live"""
1135
1136 def __init__(self, msg=None, **kwargs):
1137 kwargs['expected'] = True
1138 super().__init__(msg or 'The channel is not currently live', **kwargs)
1139
1140
bf5b9d85 1141class DownloadError(YoutubeDLError):
59ae15a5 1142 """Download Error exception.
d77c3dfd 1143
1144 This exception may be thrown by FileDownloader objects if they are not
1145 configured to continue on errors. They will contain the appropriate
1146 error message.
1147 """
5f6a1245 1148
1149 def __init__(self, msg, exc_info=None):
1150 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
86e5f3ed 1151 super().__init__(msg)
8cc83b8d 1152 self.exc_info = exc_info
1153
1154
498f5606 1155class EntryNotInPlaylist(YoutubeDLError):
1156 """Entry not in playlist exception.
1157
1158 This exception will be thrown by YoutubeDL when a requested entry
1159 is not found in the playlist info_dict
1160 """
aa9369a2 1161 msg = 'Entry not found in info'
498f5606 1162
1163
bf5b9d85 1164class SameFileError(YoutubeDLError):
59ae15a5 1165 """Same File exception.
d77c3dfd 1166
1167 This exception will be thrown by FileDownloader objects if they detect
1168 multiple files would have to be downloaded to the same file on disk.
1169 """
aa9369a2 1170 msg = 'Fixed output name but more than one file to download'
1171
1172 def __init__(self, filename=None):
1173 if filename is not None:
1174 self.msg += f': {filename}'
1175 super().__init__(self.msg)
1176
1177
bf5b9d85 1178class PostProcessingError(YoutubeDLError):
59ae15a5 1179 """Post Processing exception.
d77c3dfd 1180
1181 This exception may be raised by PostProcessor's .run() method to
1182 indicate an error in the postprocessing task.
1183 """
5f6a1245 1184
5f6a1245 1185
48f79687 1186class DownloadCancelled(YoutubeDLError):
1187 """ Exception raised when the download queue should be interrupted """
1188 msg = 'The download was cancelled'
8b0d7497 1189
8b0d7497 1190
48f79687 1191class ExistingVideoReached(DownloadCancelled):
1192 """ --break-on-existing triggered """
1193 msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1194
48f79687 1195
1196class RejectedVideoReached(DownloadCancelled):
fe2ce85a 1197 """ --break-match-filter triggered """
1198 msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
51d9739f 1199
1200
48f79687 1201class MaxDownloadsReached(DownloadCancelled):
59ae15a5 1202 """ --max-downloads limit has been reached. """
48f79687 1203 msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1204
1205
f2ebc5c7 1206class ReExtractInfo(YoutubeDLError):
1207 """ Video info needs to be re-extracted. """
1208
1209 def __init__(self, msg, expected=False):
1210 super().__init__(msg)
1211 self.expected = expected
1212
1213
1214class ThrottledDownload(ReExtractInfo):
48f79687 1215 """ Download speed below --throttled-rate. """
aa9369a2 1216 msg = 'The download speed is below throttle limit'
d77c3dfd 1217
43b22906 1218 def __init__(self):
1219 super().__init__(self.msg, expected=False)
f2ebc5c7 1220
d77c3dfd 1221
bf5b9d85 1222class UnavailableVideoError(YoutubeDLError):
59ae15a5 1223 """Unavailable Format exception.
d77c3dfd 1224
1225 This exception will be thrown when a video is requested
1226 in a format that is not available for that video.
1227 """
aa9369a2 1228 msg = 'Unable to download video'
1229
1230 def __init__(self, err=None):
1231 if err is not None:
1232 self.msg += f': {err}'
1233 super().__init__(self.msg)
1234
1235
bf5b9d85 1236class ContentTooShortError(YoutubeDLError):
59ae15a5 1237 """Content Too Short exception.
d77c3dfd 1238
1239 This exception may be raised by FileDownloader objects when a file they
1240 download is too small for what the server announced first, indicating
1241 the connection was probably interrupted.
1242 """
d77c3dfd 1243
59ae15a5 1244 def __init__(self, downloaded, expected):
86e5f3ed 1245 super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
2c7ed247 1246 # Both in bytes
1247 self.downloaded = downloaded
1248 self.expected = expected
d77c3dfd 1249
5f6a1245 1250
bf5b9d85 1251class XAttrMetadataError(YoutubeDLError):
efa97bdc 1252 def __init__(self, code=None, msg='Unknown error'):
86e5f3ed 1253 super().__init__(msg)
efa97bdc 1254 self.code = code
bd264412 1255 self.msg = msg
1256
1257 # Parsing code and msg
3089bc74 1258 if (self.code in (errno.ENOSPC, errno.EDQUOT)
a0566bbf 1259 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
1260 self.reason = 'NO_SPACE'
1261 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1262 self.reason = 'VALUE_TOO_LONG'
1263 else:
1264 self.reason = 'NOT_SUPPORTED'
1265
1266
bf5b9d85 1267class XAttrUnavailableError(YoutubeDLError):
1268 pass
1269
1270
c5a59d93 1271def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
f9934b96 1272 hc = http_class(*args, **kwargs)
be4a824d 1273 source_address = ydl_handler._params.get('source_address')
8959018a 1274
be4a824d 1275 if source_address is not None:
1276 # This is to work around _create_connection() from socket, which will try all
1277 # address data from getaddrinfo() including IPv6. This filters the result from
1278 # getaddrinfo() based on the source_address value.
1279 # This is based on the cpython socket.create_connection() function.
1280 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1281 def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1282 host, port = address
1283 err = None
1284 addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
1285 af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1286 ip_addrs = [addr for addr in addrs if addr[0] == af]
1287 if addrs and not ip_addrs:
1288 ip_version = 'v4' if af == socket.AF_INET else 'v6'
86e5f3ed 1289 raise OSError(
1290 "No remote IP%s addresses available for connect, can't use '%s' as source address"
1291 % (ip_version, source_address[0]))
1292 for res in ip_addrs:
1293 af, socktype, proto, canonname, sa = res
1294 sock = None
1295 try:
1296 sock = socket.socket(af, socktype, proto)
1297 if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1298 sock.settimeout(timeout)
1299 sock.bind(source_address)
1300 sock.connect(sa)
1301 err = None # Explicitly break reference cycle
1302 return sock
86e5f3ed 1303 except OSError as _:
1304 err = _
1305 if sock is not None:
1306 sock.close()
1307 if err is not None:
1308 raise err
1309 else:
86e5f3ed 1310 raise OSError('getaddrinfo returns an empty list')
1311 if hasattr(hc, '_create_connection'):
1312 hc._create_connection = _create_connection
cfb0511d 1313 hc.source_address = (source_address, 0)
1314
1315 return hc
1316
1317
ac668111 1318class YoutubeDLHandler(urllib.request.HTTPHandler):
1319 """Handler for HTTP requests and responses.
1320
1321 This class, when installed with an OpenerDirector, automatically adds
955c8958 1322 the standard headers to every HTTP request and handles gzipped, deflated and
1323 brotli responses from web servers.
59ae15a5
PH
1324
1325 Part of this code was copied from:
1326
1327 http://techknack.net/python-urllib2-handlers/
1328
1329 Andrew Rowls, the author of that code, agreed to release it to the
1330 public domain.
1331 """
1332
be4a824d 1333 def __init__(self, params, *args, **kwargs):
ac668111 1334 urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
1335 self._params = params
1336
1337 def http_open(self, req):
ac668111 1338 conn_class = http.client.HTTPConnection
1339
1340 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1341 if socks_proxy:
1342 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1343 del req.headers['Ytdl-socks-proxy']
1344
be4a824d 1345 return self.do_open(functools.partial(
71aff188 1346 _create_http_connection, self, conn_class, False),
be4a824d
PH
1347 req)
1348
1349 @staticmethod
1350 def deflate(data):
fc2119f2 1351 if not data:
1352 return data
1353 try:
1354 return zlib.decompress(data, -zlib.MAX_WBITS)
1355 except zlib.error:
1356 return zlib.decompress(data)
1357
4390d5ec 1358 @staticmethod
1359 def brotli(data):
1360 if not data:
1361 return data
9b8ee23b 1362 return brotli.decompress(data)
4390d5ec 1363
acebc9cd 1364 def http_request(self, req):
1365 # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
1366 # always respected by websites, some tend to give out URLs with non percent-encoded
1367 # non-ASCII characters (see telemb.py, ard.py [#3412])
1368 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1369 # To work around aforementioned issue we will replace request's original URL with
1370 # percent-encoded one
1371 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1372 # the code of this workaround has been moved here from YoutubeDL.urlopen()
1373 url = req.get_full_url()
1374 url_escaped = escape_url(url)
1375
1376 # Substitute URL if any change after escaping
1377 if url != url_escaped:
15d260eb 1378 req = update_Request(req, url=url_escaped)
51f267d9 1379
8b7539d2 1380 for h, v in self._params.get('http_headers', std_headers).items():
3d5f7a39
JK
1381 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1382 # The dict keys are capitalized by urllib because of this bug
1383 if h.capitalize() not in req.headers:
33ac271b 1384 req.add_header(h, v)
87f0e62d 1385
955c8958 1386 if 'Youtubedl-no-compression' in req.headers: # deprecated
1387 req.headers.pop('Youtubedl-no-compression', None)
1388 req.add_header('Accept-encoding', 'identity')
1389
af14914b 1390 if 'Accept-encoding' not in req.headers:
1391 req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
1392
379a4f16 1393 return super().do_request_(req)
59ae15a5 1394
acebc9cd 1395 def http_response(self, req, resp):
1396 old_resp = resp
1397 # gzip
1398 if resp.headers.get('Content-encoding', '') == 'gzip':
1399 content = resp.read()
1400 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1401 try:
1402 uncompressed = io.BytesIO(gz.read())
86e5f3ed 1403 except OSError as original_ioerror:
1404 # There may be junk at the end of the file
1405 # See http://stackoverflow.com/q/4928560/35070 for details
1406 for i in range(1, 1024):
1407 try:
1408 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1409 uncompressed = io.BytesIO(gz.read())
86e5f3ed 1410 except OSError:
1411 continue
1412 break
1413 else:
1414 raise original_ioerror
ac668111 1415 resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
1416 resp.msg = old_resp.msg
1417 # deflate
1418 if resp.headers.get('Content-encoding', '') == 'deflate':
1419 gz = io.BytesIO(self.deflate(resp.read()))
ac668111 1420 resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 1421 resp.msg = old_resp.msg
4390d5ec 1422 # brotli
1423 if resp.headers.get('Content-encoding', '') == 'br':
ac668111 1424 resp = urllib.request.addinfourl(
4390d5ec 1425 io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
1426 resp.msg = old_resp.msg
ad729172 1427 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
067aa17e 1428 # https://github.com/ytdl-org/youtube-dl/issues/6457).
1429 if 300 <= resp.code < 400:
1430 location = resp.headers.get('Location')
1431 if location:
1432 # Per RFC 2616, the default charset is iso-8859-1, which is respected by Python 3
0f06bcd7 1433 location = location.encode('iso-8859-1').decode()
1434 location_escaped = escape_url(location)
1435 if location != location_escaped:
1436 del resp.headers['Location']
1437 resp.headers['Location'] = location_escaped
59ae15a5 1438 return resp
0f8d03f8 1439
1440 https_request = http_request
1441 https_response = http_response
bf50b038 1442
5de90176 1443
1444def make_socks_conn_class(base_class, socks_proxy):
1445 assert issubclass(base_class, (
ac668111 1446 http.client.HTTPConnection, http.client.HTTPSConnection))
71aff188 1447
14f25df2 1448 url_components = urllib.parse.urlparse(socks_proxy)
1449 if url_components.scheme.lower() == 'socks5':
1450 socks_type = ProxyType.SOCKS5
1451 elif url_components.scheme.lower() in ('socks', 'socks4'):
1452 socks_type = ProxyType.SOCKS4
1453 elif url_components.scheme.lower() == 'socks4a':
1454 socks_type = ProxyType.SOCKS4A
71aff188 1455
1456 def unquote_if_non_empty(s):
1457 if not s:
1458 return s
ac668111 1459 return urllib.parse.unquote_plus(s)
cdd94c2e 1460
1461 proxy_args = (
1462 socks_type,
1463 url_components.hostname, url_components.port or 1080,
1464 True, # Remote DNS
1465 unquote_if_non_empty(url_components.username),
1466 unquote_if_non_empty(url_components.password),
1467 )
1468
1469 class SocksConnection(base_class):
1470 def connect(self):
1471 self.sock = sockssocket()
1472 self.sock.setproxy(*proxy_args)
19a03940 1473 if isinstance(self.timeout, (int, float)):
1474 self.sock.settimeout(self.timeout)
1475 self.sock.connect((self.host, self.port))
1476
ac668111 1477 if isinstance(self, http.client.HTTPSConnection):
1478 if hasattr(self, '_context'): # Python > 2.6
1479 self.sock = self._context.wrap_socket(
1480 self.sock, server_hostname=self.host)
1481 else:
1482 self.sock = ssl.wrap_socket(self.sock)
1483
1484 return SocksConnection
1485
1486
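# Illustrative sketch (the proxy URL is made up): the URL scheme selects the
# SOCKS protocol version and the credentials, host and port are taken from the
# URL itself, e.g. 'socks5://user:pass@127.0.0.1:1080' produces a connection
# class that tunnels through a SOCKS5 proxy at 127.0.0.1:1080 with those
# credentials (the port defaults to 1080 when omitted).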
ac668111 1487class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
be4a824d 1488 def __init__(self, params, https_conn_class=None, *args, **kwargs):
ac668111 1489 urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
1490 self._https_conn_class = https_conn_class or http.client.HTTPSConnection
1491 self._params = params
1492
1493 def https_open(self, req):
4f264c02 1494 kwargs = {}
1495 conn_class = self._https_conn_class
1496
1497 if hasattr(self, '_context'): # python > 2.6
1498 kwargs['context'] = self._context
1499 if hasattr(self, '_check_hostname'): # python 3.x
1500 kwargs['check_hostname'] = self._check_hostname
1501
1502 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1503 if socks_proxy:
1504 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1505 del req.headers['Ytdl-socks-proxy']
1506
4f28b537 1507 try:
1508 return self.do_open(
1509 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1510 except urllib.error.URLError as e:
1511 if (isinstance(e.reason, ssl.SSLError)
1512 and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1513 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1514 raise
1515
1516
941e881e 1517def is_path_like(f):
1518 return isinstance(f, (str, bytes, os.PathLike))
1519
1520
ac668111 1521class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
f1a8511f
S
1522 """
1523 See [1] for cookie file format.
1524
1525 1. https://curl.haxx.se/docs/http-cookies.html
1526 """
e7e62441 1527 _HTTPONLY_PREFIX = '#HttpOnly_'
c380cc28
S
1528 _ENTRY_LEN = 7
1529 _HEADER = '''# Netscape HTTP Cookie File
7a5c1cfe 1530# This file is generated by yt-dlp. Do not edit.
c380cc28
S
1531
1532'''
1533 _CookieFileEntry = collections.namedtuple(
1534 'CookieFileEntry',
1535 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
e7e62441 1536
d76fa1f3 1537 def __init__(self, filename=None, *args, **kwargs):
1538 super().__init__(None, *args, **kwargs)
941e881e 1539 if is_path_like(filename):
d76fa1f3 1540 filename = os.fspath(filename)
1541 self.filename = filename
1542
24146491 1543 @staticmethod
1544 def _true_or_false(cndn):
1545 return 'TRUE' if cndn else 'FALSE'
1546
d76fa1f3 1547 @contextlib.contextmanager
1548 def open(self, file, *, write=False):
941e881e 1549 if is_path_like(file):
d76fa1f3 1550 with open(file, 'w' if write else 'r', encoding='utf-8') as f:
1551 yield f
1552 else:
1553 if write:
1554 file.truncate(0)
1555 yield file
1556
24146491 1557 def _really_save(self, f, ignore_discard=False, ignore_expires=False):
1558 now = time.time()
1559 for cookie in self:
1560 if (not ignore_discard and cookie.discard
1561 or not ignore_expires and cookie.is_expired(now)):
1562 continue
1563 name, value = cookie.name, cookie.value
1564 if value is None:
1565 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1566 # with no name, whereas http.cookiejar regards it as a
1567 # cookie with no value.
1568 name, value = '', name
1569 f.write('%s\n' % '\t'.join((
1570 cookie.domain,
1571 self._true_or_false(cookie.domain.startswith('.')),
1572 cookie.path,
1573 self._true_or_false(cookie.secure),
1574 str_or_none(cookie.expires, default=''),
1575 name, value
1576 )))
1577
1578 def save(self, filename=None, *args, **kwargs):
c380cc28
S
1579 """
1580 Save cookies to a file.
24146491 1581 Code is taken from CPython 3.6
1582 https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
c380cc28 1583
c380cc28
S
1584 if filename is None:
1585 if self.filename is not None:
1586 filename = self.filename
1587 else:
ac668111 1588 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
c380cc28 1589
24146491 1590 # Store session cookies with `expires` set to 0 instead of an empty string
1bab3437
S
1591 for cookie in self:
1592 if cookie.expires is None:
1593 cookie.expires = 0
c380cc28 1594
d76fa1f3 1595 with self.open(filename, write=True) as f:
c380cc28 1596 f.write(self._HEADER)
24146491 1597 self._really_save(f, *args, **kwargs)
1bab3437
S
1598
1599 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
e7e62441 1600 """Load cookies from a file."""
1601 if filename is None:
1602 if self.filename is not None:
1603 filename = self.filename
1604 else:
ac668111 1605 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
e7e62441 1606
c380cc28
S
1607 def prepare_line(line):
1608 if line.startswith(self._HTTPONLY_PREFIX):
1609 line = line[len(self._HTTPONLY_PREFIX):]
1610 # comments and empty lines are fine
1611 if line.startswith('#') or not line.strip():
1612 return line
1613 cookie_list = line.split('\t')
1614 if len(cookie_list) != self._ENTRY_LEN:
ac668111 1615 raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
c380cc28
S
1616 cookie = self._CookieFileEntry(*cookie_list)
1617 if cookie.expires_at and not cookie.expires_at.isdigit():
ac668111 1618 raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
c380cc28
S
1619 return line
1620
e7e62441 1621 cf = io.StringIO()
d76fa1f3 1622 with self.open(filename) as f:
e7e62441 1623 for line in f:
c380cc28
S
1624 try:
1625 cf.write(prepare_line(line))
ac668111 1626 except http.cookiejar.LoadError as e:
94aa0644 1627 if f'{line.strip()} '[0] in '[{"':
ac668111 1628 raise http.cookiejar.LoadError(
94aa0644 1629 'Cookies file must be Netscape formatted, not JSON. See '
17ffed18 1630 'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
19a03940 1631 write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
c380cc28 1632 continue
e7e62441 1633 cf.seek(0)
1634 self._really_load(cf, filename, ignore_discard, ignore_expires)
1bab3437
S
1635 # Session cookies are denoted by the `expires` field being set to either
1636 # an empty string or 0. MozillaCookieJar only recognizes the former
1637 # (see [1]), so we need to force the latter to be recognized as session
1638 # cookies on our own.
1639 # Session cookies may be important for cookies-based authentication,
1640 # e.g. usually, when a user does not check the 'Remember me' check box while
1641 # logging in on a site, some important cookies are stored as session
1642 # cookies, so failing to recognize them would result in a failed login.
1643 # 1. https://bugs.python.org/issue17164
1644 for cookie in self:
1645 # Treat `expires=0` cookies as session cookies
1646 if cookie.expires == 0:
1647 cookie.expires = None
1648 cookie.discard = True
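# Illustrative usage sketch (the file name is a placeholder):
#
#   jar = YoutubeDLCookieJar('cookies.txt')
#   jar.load(ignore_discard=True, ignore_expires=True)
#   # ... use the jar with an opener / YoutubeDLCookieProcessor ...
#   jar.save(ignore_discard=True, ignore_expires=True)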
1649
1650
ac668111 1651class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
a6420bf5 1652 def __init__(self, cookiejar=None):
ac668111 1653 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
a6420bf5
S
1654
1655 def http_response(self, request, response):
ac668111 1656 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
a6420bf5 1657
ac668111 1658 https_request = urllib.request.HTTPCookieProcessor.http_request
a6420bf5
S
1659 https_response = http_response
1660
1661
ac668111 1662class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
201c1459 1663 """YoutubeDL redirect handler
1664
1665 The code is based on HTTPRedirectHandler implementation from CPython [1].
1666
08916a49 1667 This redirect handler fixes and improves the logic to better align with RFC 7231
1668 and what browsers tend to do [2][3]
201c1459 1669
1670 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
08916a49 1671 2. https://datatracker.ietf.org/doc/html/rfc7231
1672 3. https://github.com/python/cpython/issues/91306
201c1459 1673 """
1674
ac668111 1675 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
201c1459 1676
1677 def redirect_request(self, req, fp, code, msg, headers, newurl):
08916a49 1678 if code not in (301, 302, 303, 307, 308):
14f25df2 1679 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
afac4caa 1680
08916a49 1681 new_method = req.get_method()
1682 new_data = req.data
1683 remove_headers = []
afac4caa 1684 # A 303 must either use GET or HEAD for subsequent request
1685 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
08916a49 1686 if code == 303 and req.get_method() != 'HEAD':
1687 new_method = 'GET'
afac4caa 1688 # 301 and 302 redirects are commonly turned into a GET from a POST
1689 # for subsequent requests by browsers, so we'll do the same.
1690 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1691 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
08916a49 1692 elif code in (301, 302) and req.get_method() == 'POST':
1693 new_method = 'GET'
1694
1695 # only remove payload if method changed (e.g. POST to GET)
1696 if new_method != req.get_method():
1697 new_data = None
1698 remove_headers.extend(['Content-Length', 'Content-Type'])
1699
1700 new_headers = {k: v for k, v in req.headers.items() if k.lower() not in remove_headers}
afac4caa 1701
ac668111 1702 return urllib.request.Request(
08916a49 1703 newurl, headers=new_headers, origin_req_host=req.origin_req_host,
1704 unverifiable=True, method=new_method, data=new_data)
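# Summary of the behaviour implemented above:
#   * 303: the follow-up request uses GET, unless the original request was HEAD
#   * 301/302: a POST is turned into a GET (matching common browser behaviour)
#   * 307/308: the original method and body are preserved
# Whenever the method changes, the payload and its Content-Length/Content-Type
# headers are dropped from the follow-up request.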
fca6dba8
S
1705
1706
46f59e89
S
1707def extract_timezone(date_str):
1708 m = re.search(
f137e4c2 1709 r'''(?x)
1710 ^.{8,}? # >=8 char non-TZ prefix, if present
1711 (?P<tz>Z| # just the UTC Z, or
1712 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1713 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1714 [ ]? # optional space
1715 (?P<sign>\+|-) # +/-
1716 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1717 $)
1718 ''', date_str)
46f59e89 1719 if not m:
8f53dc44 1720 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1721 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1722 if timezone is not None:
1723 date_str = date_str[:-len(m.group('tz'))]
1724 timezone = datetime.timedelta(hours=timezone or 0)
46f59e89
S
1725 else:
1726 date_str = date_str[:-len(m.group('tz'))]
1727 if not m.group('sign'):
1728 timezone = datetime.timedelta()
1729 else:
1730 sign = 1 if m.group('sign') == '+' else -1
1731 timezone = datetime.timedelta(
1732 hours=sign * int(m.group('hours')),
1733 minutes=sign * int(m.group('minutes')))
1734 return timezone, date_str
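# Illustrative example (the input string is a placeholder):
#   >>> extract_timezone('2023-01-01T12:00:00+05:30')
#   (datetime.timedelta(seconds=19800), '2023-01-01T12:00:00')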
1735
1736
08b38d54 1737def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1738 """ Return a UNIX timestamp from the given date """
1739
1740 if date_str is None:
1741 return None
1742
52c3a6e4
S
1743 date_str = re.sub(r'\.[0-9]+', '', date_str)
1744
08b38d54 1745 if timezone is None:
46f59e89
S
1746 timezone, date_str = extract_timezone(date_str)
1747
19a03940 1748 with contextlib.suppress(ValueError):
86e5f3ed 1749 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
52c3a6e4
S
1750 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1751 return calendar.timegm(dt.timetuple())
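# Illustrative examples:
#   >>> parse_iso8601('1970-01-01T00:00:00Z')
#   0
#   >>> parse_iso8601('2014-03-23T23:04:26+0100')
#   1395612266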
912b38b4
PH
1752
1753
46f59e89
S
1754def date_formats(day_first=True):
1755 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1756
1757
42bdd9d0 1758def unified_strdate(date_str, day_first=True):
bf50b038 1759 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1760
1761 if date_str is None:
1762 return None
bf50b038 1763 upload_date = None
5f6a1245 1764 # Replace commas
026fcc04 1765 date_str = date_str.replace(',', ' ')
42bdd9d0 1766 # Remove AM/PM + timezone
9bb8e0a3 1767 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1768 _, date_str = extract_timezone(date_str)
42bdd9d0 1769
46f59e89 1770 for expression in date_formats(day_first):
19a03940 1771 with contextlib.suppress(ValueError):
bf50b038 1772 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
42393ce2
PH
1773 if upload_date is None:
1774 timetuple = email.utils.parsedate_tz(date_str)
1775 if timetuple:
19a03940 1776 with contextlib.suppress(ValueError):
c6b9cf05 1777 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
6a750402 1778 if upload_date is not None:
14f25df2 1779 return str(upload_date)
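# Illustrative examples (these rely on the DATE_FORMATS tables defined
# elsewhere in this module):
#   >>> unified_strdate('December 21, 2010')
#   '20101221'
#   >>> unified_strdate('8/7/2009')   # day_first=True by default
#   '20090708'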
bf50b038 1780
5f6a1245 1781
46f59e89
S
1782def unified_timestamp(date_str, day_first=True):
1783 if date_str is None:
1784 return None
1785
8f53dc44 1786 date_str = re.sub(r'\s+', ' ', re.sub(
1787 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
46f59e89 1788
7dc2a74e 1789 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1790 timezone, date_str = extract_timezone(date_str)
1791
1792 # Remove AM/PM + timezone
1793 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1794
deef3195
S
1795 # Remove unrecognized timezones from ISO 8601-like timestamps
1796 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1797 if m:
1798 date_str = date_str[:-len(m.group('tz'))]
1799
f226880c
PH
1800 # Python only supports microseconds, so remove nanoseconds
1801 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1802 if m:
1803 date_str = m.group(1)
1804
46f59e89 1805 for expression in date_formats(day_first):
19a03940 1806 with contextlib.suppress(ValueError):
7dc2a74e 1807 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89 1808 return calendar.timegm(dt.timetuple())
8f53dc44 1809
46f59e89
S
1810 timetuple = email.utils.parsedate_tz(date_str)
1811 if timetuple:
8f53dc44 1812 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
46f59e89
S
1813
1814
28e614de 1815def determine_ext(url, default_ext='unknown_video'):
85750f89 1816 if url is None or '.' not in url:
f4776371 1817 return default_ext
9cb9a5df 1818 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1819 if re.match(r'^[A-Za-z0-9]+$', guess):
1820 return guess
a7aaa398
S
1821 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1822 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1823 return guess.rstrip('/')
73e79f2a 1824 else:
cbdbb766 1825 return default_ext
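# Illustrative examples (URLs are placeholders):
#   >>> determine_ext('http://example.com/video.mp4?dl=1')
#   'mp4'
#   >>> determine_ext('http://example.com/foo/bar.mp4/?download')
#   'mp4'
#   >>> determine_ext('http://example.com/page')
#   'unknown_video'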
73e79f2a 1826
5f6a1245 1827
824fa511
S
1828def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1829 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
d4051a8e 1830
5f6a1245 1831
9e62f283 1832def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
3d38b2d6 1833 R"""
1834 Return a datetime object from a string.
1835 Supported format:
1836 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1837
1838 @param format strftime format of DATE
1839 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1840 auto: round to the unit provided in date_str (if applicable).
9e62f283 1841 """
1842 auto_precision = False
1843 if precision == 'auto':
1844 auto_precision = True
1845 precision = 'microsecond'
396a76f7 1846 today = datetime_round(datetime.datetime.utcnow(), precision)
f8795e10 1847 if date_str in ('now', 'today'):
37254abc 1848 return today
f8795e10
PH
1849 if date_str == 'yesterday':
1850 return today - datetime.timedelta(days=1)
9e62f283 1851 match = re.match(
3d38b2d6 1852 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
9e62f283 1853 date_str)
37254abc 1854 if match is not None:
9e62f283 1855 start_time = datetime_from_str(match.group('start'), precision, format)
1856 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
37254abc 1857 unit = match.group('unit')
9e62f283 1858 if unit == 'month' or unit == 'year':
1859 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
37254abc 1860 unit = 'day'
9e62f283 1861 else:
1862 if unit == 'week':
1863 unit = 'day'
1864 time *= 7
1865 delta = datetime.timedelta(**{unit + 's': time})
1866 new_date = start_time + delta
1867 if auto_precision:
1868 return datetime_round(new_date, unit)
1869 return new_date
1870
1871 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1872
1873
d49f8db3 1874def date_from_str(date_str, format='%Y%m%d', strict=False):
3d38b2d6 1875 R"""
1876 Return a date object from a string using datetime_from_str
9e62f283 1877
3d38b2d6 1878 @param strict Restrict allowed patterns to "YYYYMMDD" and
1879 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
9e62f283 1880 """
3d38b2d6 1881 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1882 raise ValueError(f'Invalid date format "{date_str}"')
9e62f283 1883 return datetime_from_str(date_str, precision='microsecond', format=format).date()
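# Illustrative examples (results for the relative forms depend on the current date):
#   >>> date_from_str('now-1week')                 # the date one week ago
#   >>> date_from_str('20230115')                  # a literal date
#   >>> date_from_str('today-2days', strict=True)  # also allowed under strict mode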
1884
1885
1886def datetime_add_months(dt, months):
1887 """Increment/Decrement a datetime object by months."""
1888 month = dt.month + months - 1
1889 year = dt.year + month // 12
1890 month = month % 12 + 1
1891 day = min(dt.day, calendar.monthrange(year, month)[1])
1892 return dt.replace(year, month, day)
1893
1894
1895def datetime_round(dt, precision='day'):
1896 """
1897 Round a datetime object's time to a specific precision
1898 """
1899 if precision == 'microsecond':
1900 return dt
1901
1902 unit_seconds = {
1903 'day': 86400,
1904 'hour': 3600,
1905 'minute': 60,
1906 'second': 1,
1907 }
1908 roundto = lambda x, n: ((x + n / 2) // n) * n
1909 timestamp = calendar.timegm(dt.timetuple())
1910 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
5f6a1245
JW
1911
1912
e63fc1be 1913def hyphenate_date(date_str):
1914 """
1915 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1916 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1917 if match is not None:
1918 return '-'.join(match.groups())
1919 else:
1920 return date_str
1921
5f6a1245 1922
86e5f3ed 1923class DateRange:
bd558525 1924 """Represents a time interval between two dates"""
5f6a1245 1925
bd558525
JMF
1926 def __init__(self, start=None, end=None):
1927 """start and end must be strings in the format accepted by date"""
1928 if start is not None:
d49f8db3 1929 self.start = date_from_str(start, strict=True)
bd558525
JMF
1930 else:
1931 self.start = datetime.datetime.min.date()
1932 if end is not None:
d49f8db3 1933 self.end = date_from_str(end, strict=True)
bd558525
JMF
1934 else:
1935 self.end = datetime.datetime.max.date()
37254abc 1936 if self.start > self.end:
bd558525 1937 raise ValueError('Date range: "%s", the start date must be before the end date' % self)
5f6a1245 1938
bd558525
JMF
1939 @classmethod
1940 def day(cls, day):
1941 """Returns a range that only contains the given day"""
5f6a1245
JW
1942 return cls(day, day)
1943
bd558525
JMF
1944 def __contains__(self, date):
1945 """Check if the date is in the range"""
37254abc
JMF
1946 if not isinstance(date, datetime.date):
1947 date = date_from_str(date)
1948 return self.start <= date <= self.end
5f6a1245 1949
46f1370e 1950 def __repr__(self):
1951 return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'
c496ca96 1952
f2df4071 1953 def __eq__(self, other):
1954 return (isinstance(other, DateRange)
1955 and self.start == other.start and self.end == other.end)
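# Illustrative examples:
#   >>> '20200615' in DateRange('20200101', '20201231')
#   True
#   >>> DateRange.day('20200101') == DateRange('20200101', '20200101')
#   True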
1956
c496ca96 1957
b1f94422 1958@functools.cache
1959def system_identifier():
1960 python_implementation = platform.python_implementation()
1961 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1962 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
dab284f8 1963 libc_ver = []
1964 with contextlib.suppress(OSError): # We may not have access to the executable
1965 libc_ver = platform.libc_ver()
b1f94422 1966
17fc3dc4 1967 return 'Python %s (%s %s %s) - %s (%s%s)' % (
b1f94422 1968 platform.python_version(),
1969 python_implementation,
17fc3dc4 1970 platform.machine(),
b1f94422 1971 platform.architecture()[0],
1972 platform.platform(),
5b9f253f
M
1973 ssl.OPENSSL_VERSION,
1974 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
b1f94422 1975 )
c257baff
PH
1976
1977
0b9c08b4 1978@functools.cache
49fa4d9a 1979def get_windows_version():
8a82af35 1980 ''' Get Windows version. Returns () if it's not running on Windows '''
49fa4d9a
N
1981 if compat_os_name == 'nt':
1982 return version_tuple(platform.win32_ver()[1])
1983 else:
8a82af35 1984 return ()
49fa4d9a
N
1985
1986
734f90bb 1987def write_string(s, out=None, encoding=None):
19a03940 1988 assert isinstance(s, str)
1989 out = out or sys.stderr
3b479100
SS
1990 # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
1991 if not out:
1992 return
7459e3a2 1993
fe1daad3 1994 if compat_os_name == 'nt' and supports_terminal_sequences(out):
3fe75fdc 1995 s = re.sub(r'([\r\n]+)', r' \1', s)
59f943cd 1996
8a82af35 1997 enc, buffer = None, out
cfb0511d 1998 if 'b' in getattr(out, 'mode', ''):
c487cf00 1999 enc = encoding or preferredencoding()
104aa738 2000 elif hasattr(out, 'buffer'):
8a82af35 2001 buffer = out.buffer
104aa738 2002 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
c487cf00 2003
8a82af35 2004 buffer.write(s.encode(enc, 'ignore') if enc else s)
7459e3a2
PH
2005 out.flush()
2006
2007
da4db748 2008def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
69bec673 2009 from .. import _IN_CLI
da4db748 2010 if _IN_CLI:
2011 if msg in deprecation_warning._cache:
2012 return
2013 deprecation_warning._cache.add(msg)
2014 if printer:
2015 return printer(f'{msg}{bug_reports_message()}', **kwargs)
2016 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
2017 else:
2018 import warnings
2019 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
2020
2021
2022deprecation_warning._cache = set()
2023
2024
48ea9cea
PH
2025def bytes_to_intlist(bs):
2026 if not bs:
2027 return []
2028 if isinstance(bs[0], int): # Python 3
2029 return list(bs)
2030 else:
2031 return [ord(c) for c in bs]
2032
c257baff 2033
cba892fa 2034def intlist_to_bytes(xs):
2035 if not xs:
2036 return b''
ac668111 2037 return struct.pack('%dB' % len(xs), *xs)
c38b1e77
PH
2038
2039
8a82af35 2040class LockingUnsupportedError(OSError):
1890fc63 2041 msg = 'File locking is not supported'
0edb3e33 2042
2043 def __init__(self):
2044 super().__init__(self.msg)
2045
2046
c1c9a79c
PH
2047# Cross-platform file locking
2048if sys.platform == 'win32':
fe0918bb 2049 import ctypes
c1c9a79c
PH
2050 import ctypes.wintypes
2051 import msvcrt
2052
2053 class OVERLAPPED(ctypes.Structure):
2054 _fields_ = [
2055 ('Internal', ctypes.wintypes.LPVOID),
2056 ('InternalHigh', ctypes.wintypes.LPVOID),
2057 ('Offset', ctypes.wintypes.DWORD),
2058 ('OffsetHigh', ctypes.wintypes.DWORD),
2059 ('hEvent', ctypes.wintypes.HANDLE),
2060 ]
2061
37e325b9 2062 kernel32 = ctypes.WinDLL('kernel32')
c1c9a79c
PH
2063 LockFileEx = kernel32.LockFileEx
2064 LockFileEx.argtypes = [
2065 ctypes.wintypes.HANDLE, # hFile
2066 ctypes.wintypes.DWORD, # dwFlags
2067 ctypes.wintypes.DWORD, # dwReserved
2068 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2069 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2070 ctypes.POINTER(OVERLAPPED) # Overlapped
2071 ]
2072 LockFileEx.restype = ctypes.wintypes.BOOL
2073 UnlockFileEx = kernel32.UnlockFileEx
2074 UnlockFileEx.argtypes = [
2075 ctypes.wintypes.HANDLE, # hFile
2076 ctypes.wintypes.DWORD, # dwReserved
2077 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2078 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2079 ctypes.POINTER(OVERLAPPED) # Overlapped
2080 ]
2081 UnlockFileEx.restype = ctypes.wintypes.BOOL
2082 whole_low = 0xffffffff
2083 whole_high = 0x7fffffff
2084
747c0bd1 2085 def _lock_file(f, exclusive, block):
c1c9a79c
PH
2086 overlapped = OVERLAPPED()
2087 overlapped.Offset = 0
2088 overlapped.OffsetHigh = 0
2089 overlapped.hEvent = 0
2090 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
747c0bd1 2091
2092 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2093 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2094 0, whole_low, whole_high, f._lock_file_overlapped_p):
2cb19820 2095 # NB: The no-argument form of "ctypes.FormatError" does not work on PyPy
2096 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
c1c9a79c
PH
2097
2098 def _unlock_file(f):
2099 assert f._lock_file_overlapped_p
2100 handle = msvcrt.get_osfhandle(f.fileno())
747c0bd1 2101 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
c1c9a79c
PH
2102 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2103
2104else:
399a76e6
YCH
2105 try:
2106 import fcntl
c1c9a79c 2107
a3125791 2108 def _lock_file(f, exclusive, block):
b63837bc 2109 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2110 if not block:
2111 flags |= fcntl.LOCK_NB
acea8d7c 2112 try:
b63837bc 2113 fcntl.flock(f, flags)
acea8d7c
JK
2114 except BlockingIOError:
2115 raise
2116 except OSError: # AOSP does not have flock()
b63837bc 2117 fcntl.lockf(f, flags)
c1c9a79c 2118
399a76e6 2119 def _unlock_file(f):
45998b3e
E
2120 with contextlib.suppress(OSError):
2121 return fcntl.flock(f, fcntl.LOCK_UN)
2122 with contextlib.suppress(OSError):
2123 return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock()
2124 return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking
a3125791 2125
399a76e6 2126 except ImportError:
399a76e6 2127
a3125791 2128 def _lock_file(f, exclusive, block):
0edb3e33 2129 raise LockingUnsupportedError()
399a76e6
YCH
2130
2131 def _unlock_file(f):
0edb3e33 2132 raise LockingUnsupportedError()
c1c9a79c
PH
2133
2134
86e5f3ed 2135class locked_file:
0edb3e33 2136 locked = False
747c0bd1 2137
a3125791 2138 def __init__(self, filename, mode, block=True, encoding=None):
fcfa8853
JK
2139 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2140 raise NotImplementedError(mode)
2141 self.mode, self.block = mode, block
2142
2143 writable = any(f in mode for f in 'wax+')
2144 readable = any(f in mode for f in 'r+')
2145 flags = functools.reduce(operator.ior, (
2146 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2147 getattr(os, 'O_BINARY', 0), # Windows only
2148 getattr(os, 'O_NOINHERIT', 0), # Windows only
2149 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2150 os.O_APPEND if 'a' in mode else 0,
2151 os.O_EXCL if 'x' in mode else 0,
2152 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2153 ))
2154
98804d03 2155 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
c1c9a79c
PH
2156
2157 def __enter__(self):
a3125791 2158 exclusive = 'r' not in self.mode
c1c9a79c 2159 try:
a3125791 2160 _lock_file(self.f, exclusive, self.block)
0edb3e33 2161 self.locked = True
86e5f3ed 2162 except OSError:
c1c9a79c
PH
2163 self.f.close()
2164 raise
fcfa8853 2165 if 'w' in self.mode:
131e14dc
JK
2166 try:
2167 self.f.truncate()
2168 except OSError as e:
1890fc63 2169 if e.errno not in (
2170 errno.ESPIPE, # Illegal seek - expected for FIFO
2171 errno.EINVAL, # Invalid argument - expected for /dev/null
2172 ):
2173 raise
c1c9a79c
PH
2174 return self
2175
0edb3e33 2176 def unlock(self):
2177 if not self.locked:
2178 return
c1c9a79c 2179 try:
0edb3e33 2180 _unlock_file(self.f)
c1c9a79c 2181 finally:
0edb3e33 2182 self.locked = False
c1c9a79c 2183
0edb3e33 2184 def __exit__(self, *_):
2185 try:
2186 self.unlock()
2187 finally:
2188 self.f.close()
4eb7f1d1 2189
0edb3e33 2190 open = __enter__
2191 close = __exit__
a3125791 2192
0edb3e33 2193 def __getattr__(self, attr):
2194 return getattr(self.f, attr)
a3125791 2195
0edb3e33 2196 def __iter__(self):
2197 return iter(self.f)
a3125791 2198
4eb7f1d1 2199
0b9c08b4 2200@functools.cache
4644ac55
S
2201def get_filesystem_encoding():
2202 encoding = sys.getfilesystemencoding()
2203 return encoding if encoding is not None else 'utf-8'
2204
2205
4eb7f1d1 2206def shell_quote(args):
a6a173c2 2207 quoted_args = []
4644ac55 2208 encoding = get_filesystem_encoding()
a6a173c2
JMF
2209 for a in args:
2210 if isinstance(a, bytes):
2211 # We may get a filename encoded with 'encodeFilename'
2212 a = a.decode(encoding)
aefce8e6 2213 quoted_args.append(compat_shlex_quote(a))
28e614de 2214 return ' '.join(quoted_args)
9d4660ca
PH
2215
2216
2217def smuggle_url(url, data):
2218 """ Pass additional data in a URL for internal use. """
2219
81953d1a
RA
2220 url, idata = unsmuggle_url(url, {})
2221 data.update(idata)
14f25df2 2222 sdata = urllib.parse.urlencode(
28e614de
PH
2223 {'__youtubedl_smuggle': json.dumps(data)})
2224 return url + '#' + sdata
9d4660ca
PH
2225
2226
79f82953 2227def unsmuggle_url(smug_url, default=None):
83e865a3 2228 if '#__youtubedl_smuggle' not in smug_url:
79f82953 2229 return smug_url, default
28e614de 2230 url, _, sdata = smug_url.rpartition('#')
14f25df2 2231 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
2232 data = json.loads(jsond)
2233 return url, data
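# Illustrative round-trip (URLs are placeholders):
#   >>> smuggled = smuggle_url('https://example.com/watch', {'referer': 'https://example.org'})
#   >>> unsmuggle_url(smuggled)
#   ('https://example.com/watch', {'referer': 'https://example.org'})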
02dbf93f
PH
2234
2235
e0fd9573 2236def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2237 """ Formats numbers with decimal suffixes like K, M, etc """
2238 num, factor = float_or_none(num), float(factor)
4c3f8c3f 2239 if num is None or num < 0:
e0fd9573 2240 return None
eeb2a770 2241 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2242 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2243 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
abbeeebc 2244 if factor == 1024:
2245 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 2246 converted = num / (factor ** exponent)
abbeeebc 2247 return fmt % (converted, suffix)
e0fd9573 2248
2249
02dbf93f 2250def format_bytes(bytes):
f02d24d8 2251 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
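# Illustrative examples:
#   >>> format_bytes(2048)
#   '2.00KiB'
#   >>> format_decimal_suffix(1500, '%d%s')
#   '1k'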
f53c966a 2252
1c088fa8 2253
64c464a1 2254def lookup_unit_table(unit_table, s, strict=False):
2255 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
fb47597b 2256 units_re = '|'.join(re.escape(u) for u in unit_table)
64c464a1 2257 m = (re.fullmatch if strict else re.match)(
2258 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
fb47597b
S
2259 if not m:
2260 return None
64c464a1 2261
2262 num = float(m.group('num').replace(',', '.'))
fb47597b 2263 mult = unit_table[m.group('unit')]
64c464a1 2264 return round(num * mult)
2265
2266
2267def parse_bytes(s):
2268 """Parse a string indicating a byte quantity into an integer"""
2269 return lookup_unit_table(
2270 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
2271 s.upper(), strict=True)
fb47597b
S
2272
2273
be64b5b0
PH
2274def parse_filesize(s):
2275 if s is None:
2276 return None
2277
dfb1b146 2278 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
2279 # but we support those too
2280 _UNIT_TABLE = {
2281 'B': 1,
2282 'b': 1,
70852b47 2283 'bytes': 1,
be64b5b0
PH
2284 'KiB': 1024,
2285 'KB': 1000,
2286 'kB': 1024,
2287 'Kb': 1000,
13585d76 2288 'kb': 1000,
70852b47
YCH
2289 'kilobytes': 1000,
2290 'kibibytes': 1024,
be64b5b0
PH
2291 'MiB': 1024 ** 2,
2292 'MB': 1000 ** 2,
2293 'mB': 1024 ** 2,
2294 'Mb': 1000 ** 2,
13585d76 2295 'mb': 1000 ** 2,
70852b47
YCH
2296 'megabytes': 1000 ** 2,
2297 'mebibytes': 1024 ** 2,
be64b5b0
PH
2298 'GiB': 1024 ** 3,
2299 'GB': 1000 ** 3,
2300 'gB': 1024 ** 3,
2301 'Gb': 1000 ** 3,
13585d76 2302 'gb': 1000 ** 3,
70852b47
YCH
2303 'gigabytes': 1000 ** 3,
2304 'gibibytes': 1024 ** 3,
be64b5b0
PH
2305 'TiB': 1024 ** 4,
2306 'TB': 1000 ** 4,
2307 'tB': 1024 ** 4,
2308 'Tb': 1000 ** 4,
13585d76 2309 'tb': 1000 ** 4,
70852b47
YCH
2310 'terabytes': 1000 ** 4,
2311 'tebibytes': 1024 ** 4,
be64b5b0
PH
2312 'PiB': 1024 ** 5,
2313 'PB': 1000 ** 5,
2314 'pB': 1024 ** 5,
2315 'Pb': 1000 ** 5,
13585d76 2316 'pb': 1000 ** 5,
70852b47
YCH
2317 'petabytes': 1000 ** 5,
2318 'pebibytes': 1024 ** 5,
be64b5b0
PH
2319 'EiB': 1024 ** 6,
2320 'EB': 1000 ** 6,
2321 'eB': 1024 ** 6,
2322 'Eb': 1000 ** 6,
13585d76 2323 'eb': 1000 ** 6,
70852b47
YCH
2324 'exabytes': 1000 ** 6,
2325 'exbibytes': 1024 ** 6,
be64b5b0
PH
2326 'ZiB': 1024 ** 7,
2327 'ZB': 1000 ** 7,
2328 'zB': 1024 ** 7,
2329 'Zb': 1000 ** 7,
13585d76 2330 'zb': 1000 ** 7,
70852b47
YCH
2331 'zettabytes': 1000 ** 7,
2332 'zebibytes': 1024 ** 7,
be64b5b0
PH
2333 'YiB': 1024 ** 8,
2334 'YB': 1000 ** 8,
2335 'yB': 1024 ** 8,
2336 'Yb': 1000 ** 8,
13585d76 2337 'yb': 1000 ** 8,
70852b47
YCH
2338 'yottabytes': 1000 ** 8,
2339 'yobibytes': 1024 ** 8,
be64b5b0
PH
2340 }
2341
fb47597b
S
2342 return lookup_unit_table(_UNIT_TABLE, s)
2343
2344
2345def parse_count(s):
2346 if s is None:
be64b5b0
PH
2347 return None
2348
352d5da8 2349 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
2350
2351 if re.match(r'^[\d,.]+$', s):
2352 return str_to_int(s)
2353
2354 _UNIT_TABLE = {
2355 'k': 1000,
2356 'K': 1000,
2357 'm': 1000 ** 2,
2358 'M': 1000 ** 2,
2359 'kk': 1000 ** 2,
2360 'KK': 1000 ** 2,
352d5da8 2361 'b': 1000 ** 3,
2362 'B': 1000 ** 3,
fb47597b 2363 }
be64b5b0 2364
352d5da8 2365 ret = lookup_unit_table(_UNIT_TABLE, s)
2366 if ret is not None:
2367 return ret
2368
2369 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2370 if mobj:
2371 return str_to_int(mobj.group(1))
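# Illustrative examples:
#   >>> parse_count('1.23M')
#   1230000
#   >>> parse_count('213 views')
#   213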
be64b5b0 2372
2f7ae819 2373
5d45484c 2374def parse_resolution(s, *, lenient=False):
b871d7e9
S
2375 if s is None:
2376 return {}
2377
5d45484c
LNO
2378 if lenient:
2379 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2380 else:
2381 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
2382 if mobj:
2383 return {
2384 'width': int(mobj.group('w')),
2385 'height': int(mobj.group('h')),
2386 }
2387
17ec8bcf 2388 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
2389 if mobj:
2390 return {'height': int(mobj.group(1))}
2391
2392 mobj = re.search(r'\b([48])[kK]\b', s)
2393 if mobj:
2394 return {'height': int(mobj.group(1)) * 540}
2395
2396 return {}
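# Illustrative examples:
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('720p')
#   {'height': 720}
#   >>> parse_resolution('4k')
#   {'height': 2160}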
2397
2398
0dc41787 2399def parse_bitrate(s):
14f25df2 2400 if not isinstance(s, str):
0dc41787
S
2401 return
2402 mobj = re.search(r'\b(\d+)\s*kbps', s)
2403 if mobj:
2404 return int(mobj.group(1))
2405
2406
a942d6cb 2407def month_by_name(name, lang='en'):
caefb1de
PH
2408 """ Return the number of a month by (locale-independently) English name """
2409
f6717dec 2410 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 2411
caefb1de 2412 try:
f6717dec 2413 return month_names.index(name) + 1
7105440c
YCH
2414 except ValueError:
2415 return None
2416
2417
2418def month_by_abbreviation(abbrev):
2419 """ Return the number of a month by (locale-independently) English
2420 abbreviations """
2421
2422 try:
2423 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
2424 except ValueError:
2425 return None
18258362
JMF
2426
2427
5aafe895 2428def fix_xml_ampersands(xml_str):
18258362 2429 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
2430 return re.sub(
2431 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 2432 '&amp;',
5aafe895 2433 xml_str)
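# Illustrative example: only bare ampersands are escaped, existing entities are kept.
#   >>> fix_xml_ampersands('<a href="?x=1&y=2&amp;z=3"/>')
#   '<a href="?x=1&amp;y=2&amp;z=3"/>'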
e3946f98
PH
2434
2435
2436def setproctitle(title):
14f25df2 2437 assert isinstance(title, str)
c1c05c67 2438
fe0918bb 2439 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2440 try:
2441 import ctypes
2442 except ImportError:
c1c05c67
YCH
2443 return
2444
e3946f98 2445 try:
611c1dd9 2446 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
2447 except OSError:
2448 return
2f49bcd6
RC
2449 except TypeError:
2450 # LoadLibrary in Windows Python 2.7.13 only expects
2451 # a bytestring, but since unicode_literals turns
2452 # every string into a unicode string, it fails.
2453 return
0f06bcd7 2454 title_bytes = title.encode()
6eefe533
PH
2455 buf = ctypes.create_string_buffer(len(title_bytes))
2456 buf.value = title_bytes
e3946f98 2457 try:
6eefe533 2458 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
2459 except AttributeError:
2460 return # Strange libc, just skip this
d7dda168
PH
2461
2462
2463def remove_start(s, start):
46bc9b7d 2464 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
2465
2466
2b9faf55 2467def remove_end(s, end):
46bc9b7d 2468 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
2469
2470
31b2051e
S
2471def remove_quotes(s):
2472 if s is None or len(s) < 2:
2473 return s
2474 for quote in ('"', "'", ):
2475 if s[0] == quote and s[-1] == quote:
2476 return s[1:-1]
2477 return s
2478
2479
b6e0c7d2 2480def get_domain(url):
ebf99aaf 2481 """
2482 This implementation is inconsistent, but is kept for compatibility.
2483 Use this only for "webpage_url_domain"
2484 """
2485 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
b6e0c7d2
U
2486
2487
29eb5174 2488def url_basename(url):
14f25df2 2489 path = urllib.parse.urlparse(url).path
28e614de 2490 return path.strip('/').split('/')[-1]
aa94a6d3
PH
2491
2492
02dc0a36 2493def base_url(url):
7657ec7e 2494 return re.match(r'https?://[^?#]+/', url).group()
02dc0a36
S
2495
2496
e34c3361 2497def urljoin(base, path):
4b5de77b 2498 if isinstance(path, bytes):
0f06bcd7 2499 path = path.decode()
14f25df2 2500 if not isinstance(path, str) or not path:
e34c3361 2501 return None
fad4ceb5 2502 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 2503 return path
4b5de77b 2504 if isinstance(base, bytes):
0f06bcd7 2505 base = base.decode()
14f25df2 2506 if not isinstance(base, str) or not re.match(
4b5de77b 2507 r'^(?:https?:)?//', base):
e34c3361 2508 return None
14f25df2 2509 return urllib.parse.urljoin(base, path)
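# Illustrative examples for the URL helpers above (URLs are placeholders):
#   >>> url_basename('https://example.com/foo/bar.mp4?x=1')
#   'bar.mp4'
#   >>> base_url('https://example.com/foo/bar.mp4?x=1')
#   'https://example.com/foo/'
#   >>> urljoin('https://example.com/a/b/', 'c.txt')
#   'https://example.com/a/b/c.txt'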
e34c3361
S
2510
2511
ac668111 2512class HEADRequest(urllib.request.Request):
aa94a6d3 2513 def get_method(self):
611c1dd9 2514 return 'HEAD'
7217e148
PH
2515
2516
ac668111 2517class PUTRequest(urllib.request.Request):
95cf60e8
S
2518 def get_method(self):
2519 return 'PUT'
2520
2521
9732d77e 2522def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
9e907ebd 2523 if get_attr and v is not None:
2524 v = getattr(v, get_attr, None)
1812afb7
S
2525 try:
2526 return int(v) * invscale // scale
31c49255 2527 except (ValueError, TypeError, OverflowError):
af98f8ff 2528 return default
9732d77e 2529
9572013d 2530
40a90862 2531def str_or_none(v, default=None):
14f25df2 2532 return default if v is None else str(v)
40a90862 2533
9732d77e
PH
2534
2535def str_to_int(int_str):
48d4681e 2536 """ A more relaxed version of int_or_none """
f9934b96 2537 if isinstance(int_str, int):
348c6bf1 2538 return int_str
14f25df2 2539 elif isinstance(int_str, str):
42db58ec
S
2540 int_str = re.sub(r'[,\.\+]', '', int_str)
2541 return int_or_none(int_str)
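# Illustrative examples for the conversion helpers above:
#   >>> int_or_none('42')
#   42
#   >>> int_or_none('n/a', default=0)
#   0
#   >>> str_to_int('123,456')
#   123456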
608d11f5
PH
2542
2543
9732d77e 2544def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
2545 if v is None:
2546 return default
2547 try:
2548 return float(v) * invscale / scale
5e1271c5 2549 except (ValueError, TypeError):
caf80631 2550 return default
43f775e4
PH
2551
2552
c7e327c4
S
2553def bool_or_none(v, default=None):
2554 return v if isinstance(v, bool) else default
2555
2556
53cd37ba 2557def strip_or_none(v, default=None):
14f25df2 2558 return v.strip() if isinstance(v, str) else default
b72b4431
S
2559
2560
af03000a 2561def url_or_none(url):
14f25df2 2562 if not url or not isinstance(url, str):
af03000a
S
2563 return None
2564 url = url.strip()
29f7c58a 2565 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2566
2567
3e9b66d7 2568def request_to_url(req):
ac668111 2569 if isinstance(req, urllib.request.Request):
3e9b66d7
LNO
2570 return req.get_full_url()
2571 else:
2572 return req
2573
2574
e29663c6 2575def strftime_or_none(timestamp, date_format, default=None):
2576 datetime_object = None
2577 try:
f9934b96 2578 if isinstance(timestamp, (int, float)): # unix timestamp
d509c1f5 2579 # Using naive datetime here can break timestamp() in Windows
2580 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2581 datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
14f25df2 2582 elif isinstance(timestamp, str): # assume YYYYMMDD
e29663c6 2583 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
9665f15a 2584 date_format = re.sub( # Support %s on windows
2585 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
e29663c6 2586 return datetime_object.strftime(date_format)
2587 except (ValueError, TypeError, AttributeError):
2588 return default
2589
2590
608d11f5 2591def parse_duration(s):
f9934b96 2592 if not isinstance(s, str):
608d11f5 2593 return None
ca7b3246 2594 s = s.strip()
38d79fd1 2595 if not s:
2596 return None
ca7b3246 2597
acaff495 2598 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2599 m = re.match(r'''(?x)
2600 (?P<before_secs>
2601 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2602 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2603 (?P<ms>[.:][0-9]+)?Z?$
2604 ''', s)
acaff495 2605 if m:
8bd1c00b 2606 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2607 else:
2608 m = re.match(
056653bb
S
2609 r'''(?ix)(?:P?
2610 (?:
1c1b2f96 2611 [0-9]+\s*y(?:ears?)?,?\s*
056653bb
S
2612 )?
2613 (?:
1c1b2f96 2614 [0-9]+\s*m(?:onths?)?,?\s*
056653bb
S
2615 )?
2616 (?:
1c1b2f96 2617 [0-9]+\s*w(?:eeks?)?,?\s*
056653bb 2618 )?
8f4b58d7 2619 (?:
1c1b2f96 2620 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
8f4b58d7 2621 )?
056653bb 2622 T)?
acaff495 2623 (?:
1c1b2f96 2624 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
acaff495 2625 )?
2626 (?:
1c1b2f96 2627 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
acaff495 2628 )?
2629 (?:
2630 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2631 )?Z?$''', s)
acaff495 2632 if m:
2633 days, hours, mins, secs, ms = m.groups()
2634 else:
15846398 2635 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2636 if m:
2637 hours, mins = m.groups()
2638 else:
2639 return None
2640
acaff495 2641 if ms:
19a03940 2642 ms = ms.replace(':', '.')
2643 return sum(float(part or 0) * mult for part, mult in (
2644 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
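# Illustrative examples (the return value is always a float):
#   >>> parse_duration('1:02:03')
#   3723.0
#   >>> parse_duration('3 min')
#   180.0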
91d7d0b3
JMF
2645
2646
e65e4c88 2647def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2648 name, real_ext = os.path.splitext(filename)
e65e4c88 2649 return (
86e5f3ed 2650 f'{name}.{ext}{real_ext}'
e65e4c88 2651 if not expected_real_ext or real_ext[1:] == expected_real_ext
86e5f3ed 2652 else f'{filename}.{ext}')
d70ad093
PH
2653
2654
b3ed15b7
S
2655def replace_extension(filename, ext, expected_real_ext=None):
2656 name, real_ext = os.path.splitext(filename)
86e5f3ed 2657 return '{}.{}'.format(
b3ed15b7
S
2658 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2659 ext)
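# Illustrative examples:
#   >>> prepend_extension('video.mp4', 'temp')
#   'video.temp.mp4'
#   >>> replace_extension('video.mp4', 'mkv')
#   'video.mkv'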
2660
2661
d70ad093
PH
2662def check_executable(exe, args=[]):
2663 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2664 args can be a list of arguments for a short output (like -version) """
2665 try:
f0c9fb96 2666 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
d70ad093
PH
2667 except OSError:
2668 return False
2669 return exe
b7ab0590
PH
2670
2671
7aaf4cd2 2672def _get_exe_version_output(exe, args):
95807118 2673 try:
b64d04c1 2674 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2675 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2676 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
1cdda329 2677 stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
2678 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2679 if ret:
2680 return None
95807118
PH
2681 except OSError:
2682 return False
f0c9fb96 2683 return stdout
cae97f65
PH
2684
2685
2686def detect_exe_version(output, version_re=None, unrecognized='present'):
14f25df2 2687 assert isinstance(output, str)
cae97f65
PH
2688 if version_re is None:
2689 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2690 m = re.search(version_re, output)
95807118
PH
2691 if m:
2692 return m.group(1)
2693 else:
2694 return unrecognized
2695
2696
9af98e17 2697def get_exe_version(exe, args=['--version'],
1cdda329 2698 version_re=None, unrecognized=('present', 'broken')):
9af98e17 2699 """ Returns the version of the specified executable,
2700 or False if the executable is not present """
1cdda329 2701 unrecognized = variadic(unrecognized)
2702 assert len(unrecognized) in (1, 2)
9af98e17 2703 out = _get_exe_version_output(exe, args)
1cdda329 2704 if out is None:
2705 return unrecognized[-1]
2706 return out and detect_exe_version(out, version_re, unrecognized[0])
9af98e17 2707
2708
7e88d7d7 2709def frange(start=0, stop=None, step=1):
2710 """Float range"""
2711 if stop is None:
2712 start, stop = 0, start
2713 sign = [-1, 1][step > 0] if step else 0
2714 while sign * start < sign * stop:
2715 yield start
2716 start += step
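# Illustrative examples:
#   >>> list(frange(3))
#   [0, 1, 2]
#   >>> list(frange(0, 1, 0.25))
#   [0, 0.25, 0.5, 0.75]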
2717
2718
cb89cfc1 2719class LazyList(collections.abc.Sequence):
0f06bcd7 2720 """Lazy immutable list from an iterable
2721 Note that slices of a LazyList are lists and not LazyList"""
483336e7 2722
8e5fecc8 2723 class IndexError(IndexError):
2724 pass
2725
282f5709 2726 def __init__(self, iterable, *, reverse=False, _cache=None):
0f06bcd7 2727 self._iterable = iter(iterable)
2728 self._cache = [] if _cache is None else _cache
2729 self._reversed = reverse
483336e7 2730
2731 def __iter__(self):
0f06bcd7 2732 if self._reversed:
28419ca2 2733 # We need to consume the entire iterable to iterate in reverse
981052c9 2734 yield from self.exhaust()
28419ca2 2735 return
0f06bcd7 2736 yield from self._cache
2737 for item in self._iterable:
2738 self._cache.append(item)
483336e7 2739 yield item
2740
0f06bcd7 2741 def _exhaust(self):
2742 self._cache.extend(self._iterable)
2743 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2744 return self._cache
28419ca2 2745
981052c9 2746 def exhaust(self):
0f06bcd7 2747 """Evaluate the entire iterable"""
2748 return self._exhaust()[::-1 if self._reversed else 1]
981052c9 2749
28419ca2 2750 @staticmethod
0f06bcd7 2751 def _reverse_index(x):
f2df4071 2752 return None if x is None else ~x
483336e7 2753
2754 def __getitem__(self, idx):
2755 if isinstance(idx, slice):
0f06bcd7 2756 if self._reversed:
2757 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
e0f2b4b4 2758 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2759 elif isinstance(idx, int):
0f06bcd7 2760 if self._reversed:
2761 idx = self._reverse_index(idx)
e0f2b4b4 2762 start, stop, step = idx, idx, 0
483336e7 2763 else:
2764 raise TypeError('indices must be integers or slices')
e0f2b4b4 2765 if ((start or 0) < 0 or (stop or 0) < 0
2766 or (start is None and step < 0)
2767 or (stop is None and step > 0)):
483336e7 2768 # We need to consume the entire iterable to be able to slice from the end
2769 # Obviously, never use this with infinite iterables
0f06bcd7 2770 self._exhaust()
8e5fecc8 2771 try:
0f06bcd7 2772 return self._cache[idx]
8e5fecc8 2773 except IndexError as e:
2774 raise self.IndexError(e) from e
0f06bcd7 2775 n = max(start or 0, stop or 0) - len(self._cache) + 1
28419ca2 2776 if n > 0:
0f06bcd7 2777 self._cache.extend(itertools.islice(self._iterable, n))
8e5fecc8 2778 try:
0f06bcd7 2779 return self._cache[idx]
8e5fecc8 2780 except IndexError as e:
2781 raise self.IndexError(e) from e
483336e7 2782
2783 def __bool__(self):
2784 try:
0f06bcd7 2785 self[-1] if self._reversed else self[0]
8e5fecc8 2786 except self.IndexError:
483336e7 2787 return False
2788 return True
2789
2790 def __len__(self):
0f06bcd7 2791 self._exhaust()
2792 return len(self._cache)
483336e7 2793
282f5709 2794 def __reversed__(self):
0f06bcd7 2795 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
282f5709 2796
2797 def __copy__(self):
0f06bcd7 2798 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
282f5709 2799
28419ca2 2800 def __repr__(self):
2801 # repr and str should mimic a list. So we exhaust the iterable
2802 return repr(self.exhaust())
2803
2804 def __str__(self):
2805 return repr(self.exhaust())
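# Illustrative sketch: items are pulled from the underlying iterator only when
# first needed, and slicing returns a plain list.
#   >>> lazy = LazyList(map(str, range(10)))
#   >>> lazy[3]
#   '3'
#   >>> lazy[:2]
#   ['0', '1']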
2806
483336e7 2807
7be9ccff 2808class PagedList:
c07a39ae 2809
2810 class IndexError(IndexError):
2811 pass
2812
dd26ced1
PH
2813 def __len__(self):
2814 # This is only useful for tests
2815 return len(self.getslice())
2816
7be9ccff 2817 def __init__(self, pagefunc, pagesize, use_cache=True):
2818 self._pagefunc = pagefunc
2819 self._pagesize = pagesize
f1d13090 2820 self._pagecount = float('inf')
7be9ccff 2821 self._use_cache = use_cache
2822 self._cache = {}
2823
2824 def getpage(self, pagenum):
d8cf8d97 2825 page_results = self._cache.get(pagenum)
2826 if page_results is None:
f1d13090 2827 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
7be9ccff 2828 if self._use_cache:
2829 self._cache[pagenum] = page_results
2830 return page_results
2831
2832 def getslice(self, start=0, end=None):
2833 return list(self._getslice(start, end))
2834
2835 def _getslice(self, start, end):
55575225 2836 raise NotImplementedError('This method must be implemented by subclasses')
2837
2838 def __getitem__(self, idx):
f1d13090 2839 assert self._use_cache, 'Indexing PagedList requires cache'
55575225 2840 if not isinstance(idx, int) or idx < 0:
2841 raise TypeError('indices must be non-negative integers')
2842 entries = self.getslice(idx, idx + 1)
d8cf8d97 2843 if not entries:
c07a39ae 2844 raise self.IndexError()
d8cf8d97 2845 return entries[0]
55575225 2846
9c44d242
PH
2847
2848class OnDemandPagedList(PagedList):
a44ca5a4 2849 """Download pages until a page with fewer than the maximum number of results"""
86e5f3ed 2850
7be9ccff 2851 def _getslice(self, start, end):
b7ab0590
PH
2852 for pagenum in itertools.count(start // self._pagesize):
2853 firstid = pagenum * self._pagesize
2854 nextfirstid = pagenum * self._pagesize + self._pagesize
2855 if start >= nextfirstid:
2856 continue
2857
b7ab0590
PH
2858 startv = (
2859 start % self._pagesize
2860 if firstid <= start < nextfirstid
2861 else 0)
b7ab0590
PH
2862 endv = (
2863 ((end - 1) % self._pagesize) + 1
2864 if (end is not None and firstid <= end <= nextfirstid)
2865 else None)
2866
f1d13090 2867 try:
2868 page_results = self.getpage(pagenum)
2869 except Exception:
2870 self._pagecount = pagenum - 1
2871 raise
b7ab0590
PH
2872 if startv != 0 or endv is not None:
2873 page_results = page_results[startv:endv]
7be9ccff 2874 yield from page_results
b7ab0590
PH
2875
2876 # A little optimization - if the current page is not "full", i.e. does
2877 # not contain page_size videos, then we can assume that this page
2878 # is the last one - there are no more ids on further pages -
2879 # i.e. no need to query again.
2880 if len(page_results) + startv < self._pagesize:
2881 break
2882
2883 # If we got the whole page, but the next page is not interesting,
2884 # break out early as well
2885 if end == nextfirstid:
2886 break
81c2f20b
PH
2887
2888
9c44d242 2889class InAdvancePagedList(PagedList):
a44ca5a4 2890 """PagedList with total number of pages known in advance"""
86e5f3ed 2891
9c44d242 2892 def __init__(self, pagefunc, pagecount, pagesize):
7be9ccff 2893 PagedList.__init__(self, pagefunc, pagesize, True)
f1d13090 2894 self._pagecount = pagecount
9c44d242 2895
7be9ccff 2896 def _getslice(self, start, end):
9c44d242 2897 start_page = start // self._pagesize
d37707bd 2898 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
9c44d242
PH
2899 skip_elems = start - start_page * self._pagesize
2900 only_more = None if end is None else end - start
2901 for pagenum in range(start_page, end_page):
7be9ccff 2902 page_results = self.getpage(pagenum)
9c44d242 2903 if skip_elems:
7be9ccff 2904 page_results = page_results[skip_elems:]
9c44d242
PH
2905 skip_elems = None
2906 if only_more is not None:
7be9ccff 2907 if len(page_results) < only_more:
2908 only_more -= len(page_results)
9c44d242 2909 else:
7be9ccff 2910 yield from page_results[:only_more]
9c44d242 2911 break
7be9ccff 2912 yield from page_results
9c44d242
PH
2913
2914
7e88d7d7 2915class PlaylistEntries:
2916 MissingEntry = object()
2917 is_exhausted = False
2918
2919 def __init__(self, ydl, info_dict):
7e9a6125 2920 self.ydl = ydl
2921
2922 # _entries must be assigned now since infodict can change during iteration
2923 entries = info_dict.get('entries')
2924 if entries is None:
2925 raise EntryNotInPlaylist('There are no entries')
2926 elif isinstance(entries, list):
2927 self.is_exhausted = True
2928
2929 requested_entries = info_dict.get('requested_entries')
bc5c2f8a 2930 self.is_incomplete = requested_entries is not None
7e9a6125 2931 if self.is_incomplete:
2932 assert self.is_exhausted
bc5c2f8a 2933 self._entries = [self.MissingEntry] * max(requested_entries or [0])
7e9a6125 2934 for i, entry in zip(requested_entries, entries):
2935 self._entries[i - 1] = entry
2936 elif isinstance(entries, (list, PagedList, LazyList)):
2937 self._entries = entries
2938 else:
2939 self._entries = LazyList(entries)
7e88d7d7 2940
2941 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2942 (?P<start>[+-]?\d+)?
2943 (?P<range>[:-]
2944 (?P<end>[+-]?\d+|inf(?:inite)?)?
2945 (?::(?P<step>[+-]?\d+))?
2946 )?''')
2947
2948 @classmethod
2949 def parse_playlist_items(cls, string):
2950 for segment in string.split(','):
2951 if not segment:
2952 raise ValueError('There are two or more consecutive commas')
2953 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2954 if not mobj:
2955 raise ValueError(f'{segment!r} is not a valid specification')
2956 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2957 if int_or_none(step) == 0:
2958 raise ValueError(f'Step in {segment!r} cannot be zero')
2959 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2960
2961 def get_requested_items(self):
2962 playlist_items = self.ydl.params.get('playlist_items')
2963 playlist_start = self.ydl.params.get('playliststart', 1)
2964 playlist_end = self.ydl.params.get('playlistend')
2965 # For backwards compatibility, interpret -1 as whole list
2966 if playlist_end in (-1, None):
2967 playlist_end = ''
2968 if not playlist_items:
2969 playlist_items = f'{playlist_start}:{playlist_end}'
2970 elif playlist_start != 1 or playlist_end:
2971 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2972
2973 for index in self.parse_playlist_items(playlist_items):
2974 for i, entry in self[index]:
2975 yield i, entry
1ac4fd80 2976 if not entry:
2977 continue
7e88d7d7 2978 try:
d21056f4 2979 # The item may have just been added to archive. Don't break due to it
2980 if not self.ydl.params.get('lazy_playlist'):
2981 # TODO: Add auto-generated fields
2982 self.ydl._match_entry(entry, incomplete=True, silent=True)
7e88d7d7 2983 except (ExistingVideoReached, RejectedVideoReached):
2984 return
2985
7e9a6125 2986 def get_full_count(self):
2987 if self.is_exhausted and not self.is_incomplete:
7e88d7d7 2988 return len(self)
2989 elif isinstance(self._entries, InAdvancePagedList):
2990 if self._entries._pagesize == 1:
2991 return self._entries._pagecount
2992
7e88d7d7 2993 @functools.cached_property
2994 def _getter(self):
2995 if isinstance(self._entries, list):
2996 def get_entry(i):
2997 try:
2998 entry = self._entries[i]
2999 except IndexError:
3000 entry = self.MissingEntry
3001 if not self.is_incomplete:
3002 raise self.IndexError()
3003 if entry is self.MissingEntry:
bc5c2f8a 3004 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
7e88d7d7 3005 return entry
3006 else:
3007 def get_entry(i):
3008 try:
3009 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
3010 except (LazyList.IndexError, PagedList.IndexError):
3011 raise self.IndexError()
3012 return get_entry
3013
3014 def __getitem__(self, idx):
3015 if isinstance(idx, int):
3016 idx = slice(idx, idx)
3017
3018 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
3019 step = 1 if idx.step is None else idx.step
3020 if idx.start is None:
3021 start = 0 if step > 0 else len(self) - 1
3022 else:
3023 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
3024
3025 # NB: Do not call len(self) when idx == [:]
3026 if idx.stop is None:
3027 stop = 0 if step < 0 else float('inf')
3028 else:
3029 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
3030 stop += [-1, 1][step > 0]
3031
3032 for i in frange(start, stop, step):
3033 if i < 0:
3034 continue
3035 try:
7e9a6125 3036 entry = self._getter(i)
3037 except self.IndexError:
3038 self.is_exhausted = True
3039 if step > 0:
7e88d7d7 3040 break
7e9a6125 3041 continue
7e88d7d7 3042 yield i + 1, entry
3043
3044 def __len__(self):
3045 return len(tuple(self[:]))
3046
3047 class IndexError(IndexError):
3048 pass
3049
3050
81c2f20b 3051def uppercase_escape(s):
676eb3f2 3052 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 3053 return re.sub(
a612753d 3054 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
3055 lambda m: unicode_escape(m.group(0))[0],
3056 s)
0fe2ff78
YCH
3057
3058
3059def lowercase_escape(s):
3060 unicode_escape = codecs.getdecoder('unicode_escape')
3061 return re.sub(
3062 r'\\u[0-9a-fA-F]{4}',
3063 lambda m: unicode_escape(m.group(0))[0],
3064 s)
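# Illustrative examples for the two escape helpers above:
#   >>> uppercase_escape('\\U0001F600')
#   '😀'
#   >>> lowercase_escape('\\u0026')
#   '&'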
b53466e1 3065
d05cfe06
S
3066
3067def escape_rfc3986(s):
3068 """Escape non-ASCII characters as suggested by RFC 3986"""
f9934b96 3069 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
3070
3071
3072def escape_url(url):
3073 """Escape URL as suggested by RFC 3986"""
14f25df2 3074 url_parsed = urllib.parse.urlparse(url)
d05cfe06 3075 return url_parsed._replace(
efbed08d 3076 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
3077 path=escape_rfc3986(url_parsed.path),
3078 params=escape_rfc3986(url_parsed.params),
3079 query=escape_rfc3986(url_parsed.query),
3080 fragment=escape_rfc3986(url_parsed.fragment)
3081 ).geturl()
3082
62e609ab 3083
96b9e9cf 3084def parse_qs(url, **kwargs):
3085 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
4dfbf869 3086
3087
62e609ab
PH
3088def read_batch_urls(batch_fd):
3089 def fixup(url):
14f25df2 3090 if not isinstance(url, str):
62e609ab 3091 url = url.decode('utf-8', 'replace')
8c04f0be 3092 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3093 for bom in BOM_UTF8:
3094 if url.startswith(bom):
3095 url = url[len(bom):]
3096 url = url.lstrip()
3097 if not url or url.startswith(('#', ';', ']')):
62e609ab 3098 return False
8c04f0be 3099 # "#" cannot be stripped out since it is part of the URI
962ffcf8 3100 # However, it can be safely stripped out if it follows a whitespace
8c04f0be 3101 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
3102
3103 with contextlib.closing(batch_fd) as fd:
3104 return [url for url in map(fixup, fd) if url]
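# Illustrative usage (hypothetical batch file contents): comment lines and a
# UTF-8 BOM are skipped, and trailing " # ..." comments are stripped:
#   read_batch_urls(io.StringIO('# a comment\nhttps://example.com/v # note\n'))
#   -> ['https://example.com/v']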
b74fa8cd
JMF
3105
3106
3107def urlencode_postdata(*args, **kargs):
14f25df2 3108 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
3109
3110
45b2ee6f 3111def update_url(url, *, query_update=None, **kwargs):
3112 """Replace URL components specified by kwargs
3113 @param url str or parse url tuple
3114 @param query_update update query
3115 @returns str
3116 """
3117 if isinstance(url, str):
3118 if not kwargs and not query_update:
3119 return url
3120 else:
3121 url = urllib.parse.urlparse(url)
3122 if query_update:
3123 assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
3124 kwargs['query'] = urllib.parse.urlencode({
3125 **urllib.parse.parse_qs(url.query),
3126 **query_update
3127 }, True)
3128 return urllib.parse.urlunparse(url._replace(**kwargs))
3129
3130
38f9ef31 3131def update_url_query(url, query):
45b2ee6f 3132 return update_url(url, query_update=query)
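# Illustrative usage (hypothetical URL); existing query parameters are kept and
# the given ones are merged in:
#   update_url_query('https://example.com/a?x=1', {'y': '2'})
#   -> 'https://example.com/a?x=1&y=2'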
16392824 3133
8e60dc75 3134
c043c246 3135def update_Request(req, url=None, data=None, headers=None, query=None):
ed0291d1 3136 req_headers = req.headers.copy()
c043c246 3137 req_headers.update(headers or {})
ed0291d1
S
3138 req_data = data or req.data
3139 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
3140 req_get_method = req.get_method()
3141 if req_get_method == 'HEAD':
3142 req_type = HEADRequest
3143 elif req_get_method == 'PUT':
3144 req_type = PUTRequest
3145 else:
ac668111 3146 req_type = urllib.request.Request
ed0291d1
S
3147 new_req = req_type(
3148 req_url, data=req_data, headers=req_headers,
3149 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3150 if hasattr(req, 'timeout'):
3151 new_req.timeout = req.timeout
3152 return new_req
3153
3154
10c87c15 3155def _multipart_encode_impl(data, boundary):
0c265486
YCH
3156 content_type = 'multipart/form-data; boundary=%s' % boundary
3157
3158 out = b''
3159 for k, v in data.items():
3160 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 3161 if isinstance(k, str):
0f06bcd7 3162 k = k.encode()
14f25df2 3163 if isinstance(v, str):
0f06bcd7 3164 v = v.encode()
0c265486
YCH
3165 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3166 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 3167 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
3168 if boundary.encode('ascii') in content:
3169 raise ValueError('Boundary overlaps with data')
3170 out += content
3171
3172 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3173
3174 return out, content_type
3175
3176
3177def multipart_encode(data, boundary=None):
3178 '''
3179 Encode a dict to RFC 7578-compliant form-data
3180
3181 data:
3182 A dict where keys and values can be either Unicode or bytes-like
3183 objects.
3184 boundary:
3185 If specified, it must be a Unicode object and is used as the boundary.
3186 Otherwise, a random boundary is generated.
3187
3188 Reference: https://tools.ietf.org/html/rfc7578
3189 '''
3190 has_specified_boundary = boundary is not None
3191
3192 while True:
3193 if boundary is None:
3194 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3195
3196 try:
10c87c15 3197 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
3198 break
3199 except ValueError:
3200 if has_specified_boundary:
3201 raise
3202 boundary = None
3203
3204 return out, content_type
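# Illustrative usage (hypothetical field names); `body` is the multipart payload
# (bytes) and `content_type` carries the generated boundary, e.g.
# 'multipart/form-data; boundary=---------------1234567890':
#   body, content_type = multipart_encode({'username': 'x', 'file': b'\x00\x01'})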
3205
3206
b079c26f
SS
3207def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
3208 if blocked_types is NO_DEFAULT:
3209 blocked_types = (str, bytes, collections.abc.Mapping)
3210 return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
3211
3212
3213def variadic(x, allowed_types=NO_DEFAULT):
4823ec9f 3214 if not isinstance(allowed_types, (tuple, type)):
3215 deprecation_warning('allowed_types should be a tuple or a type')
3216 allowed_types = tuple(allowed_types)
6f2287cb 3217 return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
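# Illustrative behaviour of variadic(): strings, bytes and mappings are treated
# as scalars, other iterables are passed through unchanged:
#   variadic('spam')    -> ('spam',)
#   variadic(['spam'])  -> ['spam']
#   variadic(None)      -> (None,)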
304ad45a 3218
3219
c4f60dd7 3220def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3221 for f in funcs:
a32a9a7e 3222 try:
c4f60dd7 3223 val = f(*args, **kwargs)
ab029d7e 3224 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
a32a9a7e
S
3225 pass
3226 else:
c4f60dd7 3227 if expected_type is None or isinstance(val, expected_type):
3228 return val
3229
3230
3231def try_get(src, getter, expected_type=None):
3232 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
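# Illustrative usage (hypothetical data): exceptions raised by the getter are
# swallowed and None is returned instead:
#   try_get({'a': {'b': 1}}, lambda x: x['a']['b'], int)  -> 1
#   try_get({'a': {}}, lambda x: x['a']['b'], int)        -> None (KeyError swallowed)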
329ca3be
S
3233
3234
90137ca4 3235def filter_dict(dct, cndn=lambda _, v: v is not None):
3236 return {k: v for k, v in dct.items() if cndn(k, v)}
3237
3238
6cc62232
S
3239def merge_dicts(*dicts):
3240 merged = {}
3241 for a_dict in dicts:
3242 for k, v in a_dict.items():
90137ca4 3243 if (v is not None and k not in merged
3244 or isinstance(v, str) and merged[k] == ''):
6cc62232
S
3245 merged[k] = v
3246 return merged
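# Illustrative behaviour (hypothetical values): filter_dict() drops None values
# and merge_dicts() gives precedence to earlier dicts:
#   filter_dict({'id': 'x', 'title': None})              -> {'id': 'x'}
#   merge_dicts({'id': 'x'}, {'id': 'y', 'ext': 'mp4'})  -> {'id': 'x', 'ext': 'mp4'}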
3247
3248
8e60dc75 3249def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
14f25df2 3250 return string if isinstance(string, str) else str(string, encoding, errors)
8e60dc75 3251
16392824 3252
a1a530b0
PH
3253US_RATINGS = {
3254 'G': 0,
3255 'PG': 10,
3256 'PG-13': 13,
3257 'R': 16,
3258 'NC': 18,
3259}
fac55558
PH
3260
3261
a8795327 3262TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
3263 'TV-Y': 0,
3264 'TV-Y7': 7,
3265 'TV-G': 0,
3266 'TV-PG': 0,
3267 'TV-14': 14,
3268 'TV-MA': 17,
a8795327
S
3269}
3270
3271
146c80e2 3272def parse_age_limit(s):
19a03940 3273 # isinstance(False, int) is True. So type() must be used instead
c487cf00 3274 if type(s) is int: # noqa: E721
a8795327 3275 return s if 0 <= s <= 21 else None
19a03940 3276 elif not isinstance(s, str):
d838b1bd 3277 return None
146c80e2 3278 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
3279 if m:
3280 return int(m.group('age'))
5c5fae6d 3281 s = s.upper()
a8795327
S
3282 if s in US_RATINGS:
3283 return US_RATINGS[s]
5a16c9d9 3284 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 3285 if m:
5a16c9d9 3286 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 3287 return None
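# Illustrative results of parse_age_limit() for a few rating formats:
#   parse_age_limit('18+')    -> 18
#   parse_age_limit('PG-13')  -> 13   (US_RATINGS)
#   parse_age_limit('TV-MA')  -> 17   (TV_PARENTAL_GUIDELINES)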
146c80e2
S
3288
3289
fac55558 3290def strip_jsonp(code):
609a61e3 3291 return re.sub(
5552c9eb 3292 r'''(?sx)^
e9c671d5 3293 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
3294 (?:\s*&&\s*(?P=func_name))?
3295 \s*\(\s*(?P<callback_data>.*)\);?
3296 \s*?(?://[^\n]*)*$''',
3297 r'\g<callback_data>', code)
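# Illustrative usage (hypothetical JSONP callback): only the JSON payload is kept:
#   strip_jsonp('cb42({"status": "ok"});')  -> '{"status": "ok"}'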
478c2c61
PH
3298
3299
8f53dc44 3300def js_to_json(code, vars={}, *, strict=False):
5c610515 3301 # vars is a dict of var, val pairs to substitute
0898c5c8 3302 STRING_QUOTES = '\'"`'
a71b812f 3303 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
c843e685 3304 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
86e5f3ed 3305 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
4195096e 3306 INTEGER_TABLE = (
86e5f3ed 3307 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3308 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
4195096e
S
3309 )
3310
a71b812f
SS
3311 def process_escape(match):
3312 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3313 escape = match.group(1) or match.group(2)
3314
3315 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3316 else R'\u00' if escape == 'x'
3317 else '' if escape == '\n'
3318 else escape)
3319
0898c5c8
SS
3320 def template_substitute(match):
3321 evaluated = js_to_json(match.group(1), vars, strict=strict)
3322 if evaluated[0] == '"':
3323 return json.loads(evaluated)
3324 return evaluated
3325
e05f6939 3326 def fix_kv(m):
e7b6d122
PH
3327 v = m.group(0)
3328 if v in ('true', 'false', 'null'):
3329 return v
421ddcb8
C
3330 elif v in ('undefined', 'void 0'):
3331 return 'null'
8bdd16b4 3332 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
a71b812f
SS
3333 return ''
3334
3335 if v[0] in STRING_QUOTES:
0898c5c8
SS
3336 v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
3337 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
a71b812f
SS
3338 return f'"{escaped}"'
3339
3340 for regex, base in INTEGER_TABLE:
3341 im = re.match(regex, v)
3342 if im:
3343 i = int(im.group(1), base)
3344 return f'"{i}":' if v.endswith(':') else str(i)
3345
3346 if v in vars:
d5f043d1
C
3347 try:
3348 if not strict:
3349 json.loads(vars[v])
08e29b9f 3350 except json.JSONDecodeError:
d5f043d1
C
3351 return json.dumps(vars[v])
3352 else:
3353 return vars[v]
89ac4a19 3354
a71b812f
SS
3355 if not strict:
3356 return f'"{v}"'
5c610515 3357
a71b812f 3358 raise ValueError(f'Unknown value: {v}')
e05f6939 3359
8072ef2b 3360 def create_map(mobj):
3361 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3362
8072ef2b 3363 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
8f53dc44 3364 if not strict:
3365 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
f55523cf 3366 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
389896df 3367 code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
3368 code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
febff4c1 3369
a71b812f
SS
3370 return re.sub(rf'''(?sx)
3371 {STRING_RE}|
3372 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
421ddcb8 3373 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
a71b812f
SS
3374 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3375 [0-9]+(?={SKIP_RE}:)|
8bdd16b4 3376 !+
a71b812f 3377 ''', fix_kv, code)
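# Illustrative conversion (hypothetical snippet): unquoted keys are quoted, hex
# literals become decimal, `undefined` becomes null and trailing commas are dropped:
#   js_to_json("{a: 'b', n: 0x1A, u: undefined,}")
#   -> '{"a": "b", "n": 26, "u": null}'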
e05f6939
PH
3378
3379
478c2c61
PH
3380def qualities(quality_ids):
3381 """ Get a numeric quality value out of a list of possible values """
3382 def q(qid):
3383 try:
3384 return quality_ids.index(qid)
3385 except ValueError:
3386 return -1
3387 return q
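# Illustrative usage (hypothetical quality list); unknown ids sort lowest:
#   q = qualities(['small', 'medium', 'hd720'])
#   q('hd720') -> 2,  q('small') -> 0,  q('unknown') -> -1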
3388
acd69589 3389
119e40ef 3390POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
1e43a6f7 3391
3392
de6000d9 3393DEFAULT_OUTTMPL = {
3394 'default': '%(title)s [%(id)s].%(ext)s',
72755351 3395 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 3396}
3397OUTTMPL_TYPES = {
72755351 3398 'chapter': None,
de6000d9 3399 'subtitle': None,
3400 'thumbnail': None,
3401 'description': 'description',
3402 'annotation': 'annotations.xml',
3403 'infojson': 'info.json',
08438d2c 3404 'link': None,
3b603dbd 3405 'pl_video': None,
5112f26a 3406 'pl_thumbnail': None,
de6000d9 3407 'pl_description': 'description',
3408 'pl_infojson': 'info.json',
3409}
0a871f68 3410
143db31d 3411# As of [1] format syntax is:
3412# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3413# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 3414STR_FORMAT_RE_TMPL = r'''(?x)
3415 (?<!%)(?P<prefix>(?:%%)*)
143db31d 3416 %
524e2e4f 3417 (?P<has_key>\((?P<key>{0})\))?
752cda38 3418 (?P<format>
524e2e4f 3419 (?P<conversion>[#0\-+ ]+)?
3420 (?P<min_width>\d+)?
3421 (?P<precision>\.\d+)?
3422 (?P<len_mod>[hlL])? # unused in python
901130bb 3423 {1} # conversion type
752cda38 3424 )
143db31d 3425'''
3426
7d1eb38a 3427
901130bb 3428STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3429
7d1eb38a 3430
a020a0dc
PH
3431def limit_length(s, length):
3432 """ Add ellipses to overly long strings """
3433 if s is None:
3434 return None
3435 ELLIPSES = '...'
3436 if len(s) > length:
3437 return s[:length - len(ELLIPSES)] + ELLIPSES
3438 return s
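# Illustrative behaviour (hypothetical string): the ellipsis counts towards the limit:
#   limit_length('The quick brown fox', 10)  -> 'The qui...'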
48844745
PH
3439
3440
3441def version_tuple(v):
5f9b8394 3442 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
3443
3444
3445def is_outdated_version(version, limit, assume_new=True):
3446 if not version:
3447 return not assume_new
3448 try:
3449 return version_tuple(version) < version_tuple(limit)
3450 except ValueError:
3451 return not assume_new
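# Illustrative usage (hypothetical version strings):
#   version_tuple('2023.07.06')                      -> (2023, 7, 6)
#   is_outdated_version('2023.03.04', '2023.07.06')  -> True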
732ea2f0
PH
3452
3453
3454def ytdl_is_updateable():
7a5c1cfe 3455 """ Returns if yt-dlp can be updated with -U """
735d865e 3456
69bec673 3457 from ..update import is_non_updateable
732ea2f0 3458
5d535b4a 3459 return not is_non_updateable()
7d4111ed
PH
3460
3461
3462def args_to_str(args):
3463 # Get a short string representation for a subprocess command
702ccf2d 3464 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
3465
3466
a44ca5a4 3467def error_to_str(err):
3468 return f'{type(err).__name__}: {err}'
3469
3470
2647c933 3471def mimetype2ext(mt, default=NO_DEFAULT):
3472 if not isinstance(mt, str):
3473 if default is not NO_DEFAULT:
3474 return default
eb9ee194
S
3475 return None
3476
2647c933 3477 MAP = {
3478 # video
f6861ec9 3479 '3gpp': '3gp',
2647c933 3480 'mp2t': 'ts',
3481 'mp4': 'mp4',
3482 'mpeg': 'mpeg',
3483 'mpegurl': 'm3u8',
3484 'quicktime': 'mov',
3485 'webm': 'webm',
3486 'vp9': 'vp9',
f6861ec9 3487 'x-flv': 'flv',
2647c933 3488 'x-m4v': 'm4v',
3489 'x-matroska': 'mkv',
3490 'x-mng': 'mng',
a0d8d704 3491 'x-mp4-fragmented': 'mp4',
2647c933 3492 'x-ms-asf': 'asf',
a0d8d704 3493 'x-ms-wmv': 'wmv',
2647c933 3494 'x-msvideo': 'avi',
3495
3496 # application (streaming playlists)
b4173f15 3497 'dash+xml': 'mpd',
b4173f15 3498 'f4m+xml': 'f4m',
f164b971 3499 'hds+xml': 'f4m',
2647c933 3500 'vnd.apple.mpegurl': 'm3u8',
e910fe2f 3501 'vnd.ms-sstr+xml': 'ism',
2647c933 3502 'x-mpegurl': 'm3u8',
3503
3504 # audio
3505 'audio/mp4': 'm4a',
3506 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
3507 # Using .mp3 as it's the most popular one
3508 'audio/mpeg': 'mp3',
d80ca5de 3509 'audio/webm': 'webm',
2647c933 3510 'audio/x-matroska': 'mka',
3511 'audio/x-mpegurl': 'm3u',
3512 'midi': 'mid',
3513 'ogg': 'ogg',
3514 'wav': 'wav',
3515 'wave': 'wav',
3516 'x-aac': 'aac',
3517 'x-flac': 'flac',
3518 'x-m4a': 'm4a',
3519 'x-realaudio': 'ra',
39e7107d 3520 'x-wav': 'wav',
9359f3d4 3521
2647c933 3522 # image
3523 'avif': 'avif',
3524 'bmp': 'bmp',
3525 'gif': 'gif',
3526 'jpeg': 'jpg',
3527 'png': 'png',
3528 'svg+xml': 'svg',
3529 'tiff': 'tif',
3530 'vnd.wap.wbmp': 'wbmp',
3531 'webp': 'webp',
3532 'x-icon': 'ico',
3533 'x-jng': 'jng',
3534 'x-ms-bmp': 'bmp',
3535
3536 # caption
3537 'filmstrip+json': 'fs',
3538 'smptett+xml': 'tt',
3539 'ttaf+xml': 'dfxp',
3540 'ttml+xml': 'ttml',
3541 'x-ms-sami': 'sami',
9359f3d4 3542
2647c933 3543 # misc
3544 'gzip': 'gz',
9359f3d4
F
3545 'json': 'json',
3546 'xml': 'xml',
3547 'zip': 'zip',
9359f3d4
F
3548 }
3549
2647c933 3550 mimetype = mt.partition(';')[0].strip().lower()
3551 _, _, subtype = mimetype.rpartition('/')
9359f3d4 3552
69bec673 3553 ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
2647c933 3554 if ext:
3555 return ext
3556 elif default is not NO_DEFAULT:
3557 return default
9359f3d4 3558 return subtype.replace('+', '.')
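# Illustrative lookups (hypothetical MIME types); parameters after ';' are ignored:
#   mimetype2ext('video/mp4; codecs="avc1.42E01E"')  -> 'mp4'
#   mimetype2ext('application/x-mpegURL')            -> 'm3u8'
#   mimetype2ext('audio/mpeg')                       -> 'mp3'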
c460bdd5
PH
3559
3560
2814f12b
THD
3561def ext2mimetype(ext_or_url):
3562 if not ext_or_url:
3563 return None
3564 if '.' not in ext_or_url:
3565 ext_or_url = f'file.{ext_or_url}'
3566 return mimetypes.guess_type(ext_or_url)[0]
3567
3568
4f3c5e06 3569def parse_codecs(codecs_str):
3570 # http://tools.ietf.org/html/rfc6381
3571 if not codecs_str:
3572 return {}
a0566bbf 3573 split_codecs = list(filter(None, map(
dbf5416a 3574 str.strip, codecs_str.strip().strip(',').split(','))))
3fe75fdc 3575 vcodec, acodec, scodec, hdr = None, None, None, None
a0566bbf 3576 for full_codec in split_codecs:
d816f61f 3577 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3578 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3579 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3580 if vcodec:
3581 continue
3582 vcodec = full_codec
3583 if parts[0] in ('dvh1', 'dvhe'):
3584 hdr = 'DV'
69bec673 3585 elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
d816f61f 3586 hdr = 'HDR10'
3587 elif parts[:2] == ['vp9', '2']:
3588 hdr = 'HDR10'
71082216 3589 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
d816f61f 3590 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3591 acodec = acodec or full_codec
3592 elif parts[0] in ('stpp', 'wvtt'):
3593 scodec = scodec or full_codec
4f3c5e06 3594 else:
19a03940 3595 write_string(f'WARNING: Unknown codec {full_codec}\n')
3fe75fdc 3596 if vcodec or acodec or scodec:
4f3c5e06 3597 return {
3598 'vcodec': vcodec or 'none',
3599 'acodec': acodec or 'none',
176f1866 3600 'dynamic_range': hdr,
3fe75fdc 3601 **({'scodec': scodec} if scodec is not None else {}),
4f3c5e06 3602 }
b69fd25c 3603 elif len(split_codecs) == 2:
3604 return {
3605 'vcodec': split_codecs[0],
3606 'acodec': split_codecs[1],
3607 }
4f3c5e06 3608 return {}
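# Illustrative parse (hypothetical codecs string from an HLS/DASH manifest):
#   parse_codecs('avc1.64001F, mp4a.40.2')
#   -> {'vcodec': 'avc1.64001F', 'acodec': 'mp4a.40.2', 'dynamic_range': None}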
3609
3610
fc61aff4
LL
3611def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3612 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3613
3614 allow_mkv = not preferences or 'mkv' in preferences
3615
3616 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3617 return 'mkv' # TODO: does any other format allow this?
3618
3619 # TODO: Not all codecs supported by parse_codecs are handled here
3620 COMPATIBLE_CODECS = {
3621 'mp4': {
71082216 3622 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
81b6102d 3623 'h264', 'aacl', 'ec-3', # Set in ISM
fc61aff4
LL
3624 },
3625 'webm': {
3626 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3627 'vp9x', 'vp8x', # in the webm spec
3628 },
3629 }
3630
69bec673 3631 sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
8f84770a 3632 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
fc61aff4
LL
3633
3634 for ext in preferences or COMPATIBLE_CODECS.keys():
3635 codec_set = COMPATIBLE_CODECS.get(ext, set())
3636 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3637 return ext
3638
3639 COMPATIBLE_EXTS = (
3640 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
fbb73833 3641 {'webm', 'weba'},
fc61aff4
LL
3642 )
3643 for ext in preferences or vexts:
3644 current_exts = {ext, *vexts, *aexts}
3645 if ext == 'mkv' or current_exts == {ext} or any(
3646 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3647 return ext
3648 return 'mkv' if allow_mkv else preferences[-1]
3649
3650
2647c933 3651def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
79298173 3652 getheader = url_handle.headers.get
2ccd1b10 3653
b55ee18f
PH
3654 cd = getheader('Content-Disposition')
3655 if cd:
3656 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3657 if m:
3658 e = determine_ext(m.group('filename'), default_ext=None)
3659 if e:
3660 return e
3661
2647c933 3662 meta_ext = getheader('x-amz-meta-name')
3663 if meta_ext:
3664 e = meta_ext.rpartition('.')[2]
3665 if e:
3666 return e
3667
3668 return mimetype2ext(getheader('Content-Type'), default=default)
05900629
PH
3669
3670
1e399778
YCH
3671def encode_data_uri(data, mime_type):
3672 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3673
3674
05900629 3675def age_restricted(content_limit, age_limit):
6ec6cb4e 3676 """ Returns True iff the content should be blocked """
05900629
PH
3677
3678 if age_limit is None: # No limit set
3679 return False
3680 if content_limit is None:
3681 return False # Content available for everyone
3682 return age_limit < content_limit
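# Illustrative usage: True means the content's rating exceeds the configured age limit:
#   age_restricted(content_limit=18, age_limit=16)  -> True
#   age_restricted(None, 16)                        -> False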
61ca9a80
PH
3683
3684
88f60feb 3685# List of known byte-order-marks (BOM)
a904a7f8
L
3686BOMS = [
3687 (b'\xef\xbb\xbf', 'utf-8'),
3688 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3689 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3690 (b'\xff\xfe', 'utf-16-le'),
3691 (b'\xfe\xff', 'utf-16-be'),
3692]
a904a7f8
L
3693
3694
61ca9a80
PH
3695def is_html(first_bytes):
3696 """ Detect whether a file contains HTML by examining its first bytes. """
3697
80e8493e 3698 encoding = 'utf-8'
61ca9a80 3699 for bom, enc in BOMS:
80e8493e 3700 while first_bytes.startswith(bom):
3701 encoding, first_bytes = enc, first_bytes[len(bom):]
61ca9a80 3702
80e8493e 3703 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
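# Illustrative checks (hypothetical byte strings); a leading BOM is stripped
# before looking for an opening '<':
#   is_html(b'\xef\xbb\xbf  <!DOCTYPE html>')  -> truthy (match object)
#   is_html(b'\x00\x01binary')                 -> None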
a055469f
PH
3704
3705
3706def determine_protocol(info_dict):
3707 protocol = info_dict.get('protocol')
3708 if protocol is not None:
3709 return protocol
3710
7de837a5 3711 url = sanitize_url(info_dict['url'])
a055469f
PH
3712 if url.startswith('rtmp'):
3713 return 'rtmp'
3714 elif url.startswith('mms'):
3715 return 'mms'
3716 elif url.startswith('rtsp'):
3717 return 'rtsp'
3718
3719 ext = determine_ext(url)
3720 if ext == 'm3u8':
deae7c17 3721 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
a055469f
PH
3722 elif ext == 'f4m':
3723 return 'f4m'
3724
14f25df2 3725 return urllib.parse.urlparse(url).scheme
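# Illustrative results (hypothetical info dicts):
#   determine_protocol({'url': 'https://example.com/live.m3u8', 'is_live': True})  -> 'm3u8'
#   determine_protocol({'url': 'https://example.com/video.mp4'})                   -> 'https'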
cfb56d1a
PH
3726
3727
c5e3f849 3728def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3729 """ Render a list of rows, each as a list of values.
3730 Text after a \t will be right aligned """
ec11a9f4 3731 def width(string):
c5e3f849 3732 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3733
3734 def get_max_lens(table):
ec11a9f4 3735 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3736
3737 def filter_using_list(row, filterArray):
d16df59d 3738 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
76d321f6 3739
d16df59d 3740 max_lens = get_max_lens(data) if hide_empty else []
3741 header_row = filter_using_list(header_row, max_lens)
3742 data = [filter_using_list(row, max_lens) for row in data]
76d321f6 3743
cfb56d1a 3744 table = [header_row] + data
76d321f6 3745 max_lens = get_max_lens(table)
c5e3f849 3746 extra_gap += 1
76d321f6 3747 if delim:
c5e3f849 3748 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
1ed7953a 3749 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
ec11a9f4 3750 for row in table:
3751 for pos, text in enumerate(map(str, row)):
c5e3f849 3752 if '\t' in text:
3753 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3754 else:
3755 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3756 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3757 return ret
347de493
PH
3758
3759
8f18aca8 3760def _match_one(filter_part, dct, incomplete):
77b87f05 3761 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3762 STRING_OPERATORS = {
3763 '*=': operator.contains,
3764 '^=': lambda attr, value: attr.startswith(value),
3765 '$=': lambda attr, value: attr.endswith(value),
3766 '~=': lambda attr, value: re.search(value, attr),
3767 }
347de493 3768 COMPARISON_OPERATORS = {
a047eeb6 3769 **STRING_OPERATORS,
3770 '<=': operator.le, # "<=" must be defined above "<"
347de493 3771 '<': operator.lt,
347de493 3772 '>=': operator.ge,
a047eeb6 3773 '>': operator.gt,
347de493 3774 '=': operator.eq,
347de493 3775 }
a047eeb6 3776
6db9c4d5 3777 if isinstance(incomplete, bool):
3778 is_incomplete = lambda _: incomplete
3779 else:
3780 is_incomplete = lambda k: k in incomplete
3781
64fa820c 3782 operator_rex = re.compile(r'''(?x)
347de493 3783 (?P<key>[a-z_]+)
77b87f05 3784 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3785 (?:
a047eeb6 3786 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3787 (?P<strval>.+?)
347de493 3788 )
347de493 3789 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3790 m = operator_rex.fullmatch(filter_part.strip())
347de493 3791 if m:
18f96d12 3792 m = m.groupdict()
3793 unnegated_op = COMPARISON_OPERATORS[m['op']]
3794 if m['negation']:
77b87f05
MT
3795 op = lambda attr, value: not unnegated_op(attr, value)
3796 else:
3797 op = unnegated_op
18f96d12 3798 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3799 if m['quote']:
3800 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3801 actual_value = dct.get(m['key'])
3802 numeric_comparison = None
f9934b96 3803 if isinstance(actual_value, (int, float)):
e5a088dc
S
3804 # If the original field is a string and the matching comparison value is
3805 # a number, we should respect the origin of the original field
3806 # and process the comparison value as a string (see
18f96d12 3807 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3808 try:
18f96d12 3809 numeric_comparison = int(comparison_value)
347de493 3810 except ValueError:
18f96d12 3811 numeric_comparison = parse_filesize(comparison_value)
3812 if numeric_comparison is None:
3813 numeric_comparison = parse_filesize(f'{comparison_value}B')
3814 if numeric_comparison is None:
3815 numeric_comparison = parse_duration(comparison_value)
3816 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3817 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3818 if actual_value is None:
6db9c4d5 3819 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3820 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3821
3822 UNARY_OPERATORS = {
1cc47c66
S
3823 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3824 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3825 }
64fa820c 3826 operator_rex = re.compile(r'''(?x)
347de493 3827 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3828 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3829 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3830 if m:
3831 op = UNARY_OPERATORS[m.group('op')]
3832 actual_value = dct.get(m.group('key'))
6db9c4d5 3833 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3834 return True
347de493
PH
3835 return op(actual_value)
3836
3837 raise ValueError('Invalid filter part %r' % filter_part)
3838
3839
8f18aca8 3840def match_str(filter_str, dct, incomplete=False):
6db9c4d5 3841 """ Filter a dictionary with a simple string syntax.
3842 @returns Whether the filter passes
3843 @param incomplete Set of keys that is expected to be missing from dct.
3844 Can be True/False to indicate all/none of the keys may be missing.
3845 All conditions on incomplete keys pass if the key is missing
8f18aca8 3846 """
347de493 3847 return all(
8f18aca8 3848 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3849 for filter_part in re.split(r'(?<!\\)&', filter_str))
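# Illustrative usage (hypothetical filter): '&' separates conditions and '>?'
# lets a condition pass when the field is missing:
#   match_str('duration > 60 & like_count >? 100', {'duration': 120})  -> True
#   match_str('duration > 60', {'duration': 30})                       -> False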
347de493
PH
3850
3851
fe2ce85a 3852def match_filter_func(filters, breaking_filters=None):
3853 if not filters and not breaking_filters:
d1b5f70b 3854 return None
fe2ce85a 3855 breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
3856 filters = set(variadic(filters or []))
d1b5f70b 3857
492272fe 3858 interactive = '-' in filters
3859 if interactive:
3860 filters.remove('-')
3861
3862 def _match_func(info_dict, incomplete=False):
fe2ce85a 3863 ret = breaking_filters(info_dict, incomplete)
3864 if ret is not None:
3865 raise RejectedVideoReached(ret)
3866
492272fe 3867 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3868 return NO_DEFAULT if interactive and not incomplete else None
347de493 3869 else:
3bec830a 3870 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
b1a7cd05 3871 filter_str = ') | ('.join(map(str.strip, filters))
3872 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
347de493 3873 return _match_func
91410c9b
PH
3874
3875
f2df4071 3876class download_range_func:
3877 def __init__(self, chapters, ranges):
3878 self.chapters, self.ranges = chapters, ranges
3879
3880 def __call__(self, info_dict, ydl):
0500ee3d 3881 if not self.ranges and not self.chapters:
3882 yield {}
3883
5ec1b6b7 3884 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
56ba69e4 3885 else 'Cannot match chapters since chapter information is unavailable')
f2df4071 3886 for regex in self.chapters or []:
5ec1b6b7 3887 for i, chapter in enumerate(info_dict.get('chapters') or []):
3888 if re.search(regex, chapter['title']):
3889 warning = None
3890 yield {**chapter, 'index': i}
f2df4071 3891 if self.chapters and warning:
5ec1b6b7 3892 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3893
f2df4071 3894 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
5ec1b6b7 3895
f2df4071 3896 def __eq__(self, other):
3897 return (isinstance(other, download_range_func)
3898 and self.chapters == other.chapters and self.ranges == other.ranges)
5ec1b6b7 3899
71df9b7f 3900 def __repr__(self):
a5387729 3901 return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
71df9b7f 3902
5ec1b6b7 3903
bf6427d2
YCH
3904def parse_dfxp_time_expr(time_expr):
3905 if not time_expr:
d631d5f9 3906 return
bf6427d2 3907
1d485a1a 3908 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
bf6427d2
YCH
3909 if mobj:
3910 return float(mobj.group('time_offset'))
3911
db2fe38b 3912 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3913 if mobj:
db2fe38b 3914 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
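# Illustrative parses (hypothetical DFXP/TTML time expressions):
#   parse_dfxp_time_expr('12.5s')       -> 12.5
#   parse_dfxp_time_expr('00:01:30.5')  -> 90.5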
bf6427d2
YCH
3915
3916
c1c924ab 3917def srt_subtitles_timecode(seconds):
aa7785f8 3918 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3919
3920
3921def ass_subtitles_timecode(seconds):
3922 time = timetuple_from_msec(seconds * 1000)
3923 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
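# Illustrative timecodes for the same offset (3661.5 seconds):
#   srt_subtitles_timecode(3661.5)  -> '01:01:01,500'
#   ass_subtitles_timecode(3661.5)  -> '1:01:01.50'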
bf6427d2
YCH
3924
3925
3926def dfxp2srt(dfxp_data):
3869028f
YCH
3927 '''
3928 @param dfxp_data A bytes-like object containing DFXP data
3929 @returns A unicode object containing converted SRT data
3930 '''
5b995f71 3931 LEGACY_NAMESPACES = (
3869028f
YCH
3932 (b'http://www.w3.org/ns/ttml', [
3933 b'http://www.w3.org/2004/11/ttaf1',
3934 b'http://www.w3.org/2006/04/ttaf1',
3935 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3936 ]),
3869028f
YCH
3937 (b'http://www.w3.org/ns/ttml#styling', [
3938 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
3939 ]),
3940 )
3941
3942 SUPPORTED_STYLING = [
3943 'color',
3944 'fontFamily',
3945 'fontSize',
3946 'fontStyle',
3947 'fontWeight',
3948 'textDecoration'
3949 ]
3950
4e335771 3951 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 3952 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 3953 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 3954 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 3955 })
bf6427d2 3956
5b995f71
RA
3957 styles = {}
3958 default_style = {}
3959
86e5f3ed 3960 class TTMLPElementParser:
5b995f71
RA
3961 _out = ''
3962 _unclosed_elements = []
3963 _applied_styles = []
bf6427d2 3964
2b14cb56 3965 def start(self, tag, attrib):
5b995f71
RA
3966 if tag in (_x('ttml:br'), 'br'):
3967 self._out += '\n'
3968 else:
3969 unclosed_elements = []
3970 style = {}
3971 element_style_id = attrib.get('style')
3972 if default_style:
3973 style.update(default_style)
3974 if element_style_id:
3975 style.update(styles.get(element_style_id, {}))
3976 for prop in SUPPORTED_STYLING:
3977 prop_val = attrib.get(_x('tts:' + prop))
3978 if prop_val:
3979 style[prop] = prop_val
3980 if style:
3981 font = ''
3982 for k, v in sorted(style.items()):
3983 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3984 continue
3985 if k == 'color':
3986 font += ' color="%s"' % v
3987 elif k == 'fontSize':
3988 font += ' size="%s"' % v
3989 elif k == 'fontFamily':
3990 font += ' face="%s"' % v
3991 elif k == 'fontWeight' and v == 'bold':
3992 self._out += '<b>'
3993 unclosed_elements.append('b')
3994 elif k == 'fontStyle' and v == 'italic':
3995 self._out += '<i>'
3996 unclosed_elements.append('i')
3997 elif k == 'textDecoration' and v == 'underline':
3998 self._out += '<u>'
3999 unclosed_elements.append('u')
4000 if font:
4001 self._out += '<font' + font + '>'
4002 unclosed_elements.append('font')
4003 applied_style = {}
4004 if self._applied_styles:
4005 applied_style.update(self._applied_styles[-1])
4006 applied_style.update(style)
4007 self._applied_styles.append(applied_style)
4008 self._unclosed_elements.append(unclosed_elements)
bf6427d2 4009
2b14cb56 4010 def end(self, tag):
5b995f71
RA
4011 if tag not in (_x('ttml:br'), 'br'):
4012 unclosed_elements = self._unclosed_elements.pop()
4013 for element in reversed(unclosed_elements):
4014 self._out += '</%s>' % element
4015 if unclosed_elements and self._applied_styles:
4016 self._applied_styles.pop()
bf6427d2 4017
2b14cb56 4018 def data(self, data):
5b995f71 4019 self._out += data
2b14cb56 4020
4021 def close(self):
5b995f71 4022 return self._out.strip()
2b14cb56 4023
6a765f13 4024 # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
4025 # This will not trigger false positives since only UTF-8 text is being replaced
4026 dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
4027
2b14cb56 4028 def parse_node(node):
4029 target = TTMLPElementParser()
4030 parser = xml.etree.ElementTree.XMLParser(target=target)
4031 parser.feed(xml.etree.ElementTree.tostring(node))
4032 return parser.close()
bf6427d2 4033
5b995f71
RA
4034 for k, v in LEGACY_NAMESPACES:
4035 for ns in v:
4036 dfxp_data = dfxp_data.replace(ns, k)
4037
3869028f 4038 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 4039 out = []
5b995f71 4040 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
4041
4042 if not paras:
4043 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 4044
5b995f71
RA
4045 repeat = False
4046 while True:
4047 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
4048 style_id = style.get('id') or style.get(_x('xml:id'))
4049 if not style_id:
4050 continue
5b995f71
RA
4051 parent_style_id = style.get('style')
4052 if parent_style_id:
4053 if parent_style_id not in styles:
4054 repeat = True
4055 continue
4056 styles[style_id] = styles[parent_style_id].copy()
4057 for prop in SUPPORTED_STYLING:
4058 prop_val = style.get(_x('tts:' + prop))
4059 if prop_val:
4060 styles.setdefault(style_id, {})[prop] = prop_val
4061 if repeat:
4062 repeat = False
4063 else:
4064 break
4065
4066 for p in ('body', 'div'):
4067 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4068 if ele is None:
4069 continue
4070 style = styles.get(ele.get('style'))
4071 if not style:
4072 continue
4073 default_style.update(style)
4074
bf6427d2 4075 for para, index in zip(paras, itertools.count(1)):
d631d5f9 4076 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 4077 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
4078 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4079 if begin_time is None:
4080 continue
7dff0363 4081 if not end_time:
d631d5f9
YCH
4082 if not dur:
4083 continue
4084 end_time = begin_time + dur
bf6427d2
YCH
4085 out.append('%d\n%s --> %s\n%s\n\n' % (
4086 index,
c1c924ab
YCH
4087 srt_subtitles_timecode(begin_time),
4088 srt_subtitles_timecode(end_time),
bf6427d2
YCH
4089 parse_node(para)))
4090
4091 return ''.join(out)
4092
4093
c487cf00 4094def cli_option(params, command_option, param, separator=None):
66e289ba 4095 param = params.get(param)
c487cf00 4096 return ([] if param is None
4097 else [command_option, str(param)] if separator is None
4098 else [f'{command_option}{separator}{param}'])
66e289ba
S
4099
4100
4101def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4102 param = params.get(param)
c487cf00 4103 assert param in (True, False, None)
4104 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
66e289ba
S
4105
4106
4107def cli_valueless_option(params, command_option, param, expected_value=True):
c487cf00 4108 return [command_option] if params.get(param) == expected_value else []
66e289ba
S
4109
4110
e92caff5 4111def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 4112 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 4113 if use_compat:
5b1ecbb3 4114 return argdict
4115 else:
4116 argdict = None
eab9b2bc 4117 if argdict is None:
5b1ecbb3 4118 return default
eab9b2bc 4119 assert isinstance(argdict, dict)
4120
e92caff5 4121 assert isinstance(keys, (list, tuple))
4122 for key_list in keys:
e92caff5 4123 arg_list = list(filter(
4124 lambda x: x is not None,
6606817a 4125 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 4126 if arg_list:
4127 return [arg for args in arg_list for arg in args]
4128 return default
66e289ba 4129
6251555f 4130
330690a2 4131def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4132 main_key, exe = main_key.lower(), exe.lower()
4133 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4134 keys = [f'{root_key}{k}' for k in (keys or [''])]
4135 if root_key in keys:
4136 if main_key != exe:
4137 keys.append((main_key, exe))
4138 keys.append('default')
4139 else:
4140 use_compat = False
4141 return cli_configuration_args(argdict, keys, default, use_compat)
4142
66e289ba 4143
86e5f3ed 4144class ISO639Utils:
39672624
YCH
4145 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4146 _lang_map = {
4147 'aa': 'aar',
4148 'ab': 'abk',
4149 'ae': 'ave',
4150 'af': 'afr',
4151 'ak': 'aka',
4152 'am': 'amh',
4153 'an': 'arg',
4154 'ar': 'ara',
4155 'as': 'asm',
4156 'av': 'ava',
4157 'ay': 'aym',
4158 'az': 'aze',
4159 'ba': 'bak',
4160 'be': 'bel',
4161 'bg': 'bul',
4162 'bh': 'bih',
4163 'bi': 'bis',
4164 'bm': 'bam',
4165 'bn': 'ben',
4166 'bo': 'bod',
4167 'br': 'bre',
4168 'bs': 'bos',
4169 'ca': 'cat',
4170 'ce': 'che',
4171 'ch': 'cha',
4172 'co': 'cos',
4173 'cr': 'cre',
4174 'cs': 'ces',
4175 'cu': 'chu',
4176 'cv': 'chv',
4177 'cy': 'cym',
4178 'da': 'dan',
4179 'de': 'deu',
4180 'dv': 'div',
4181 'dz': 'dzo',
4182 'ee': 'ewe',
4183 'el': 'ell',
4184 'en': 'eng',
4185 'eo': 'epo',
4186 'es': 'spa',
4187 'et': 'est',
4188 'eu': 'eus',
4189 'fa': 'fas',
4190 'ff': 'ful',
4191 'fi': 'fin',
4192 'fj': 'fij',
4193 'fo': 'fao',
4194 'fr': 'fra',
4195 'fy': 'fry',
4196 'ga': 'gle',
4197 'gd': 'gla',
4198 'gl': 'glg',
4199 'gn': 'grn',
4200 'gu': 'guj',
4201 'gv': 'glv',
4202 'ha': 'hau',
4203 'he': 'heb',
b7acc835 4204 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
4205 'hi': 'hin',
4206 'ho': 'hmo',
4207 'hr': 'hrv',
4208 'ht': 'hat',
4209 'hu': 'hun',
4210 'hy': 'hye',
4211 'hz': 'her',
4212 'ia': 'ina',
4213 'id': 'ind',
b7acc835 4214 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
4215 'ie': 'ile',
4216 'ig': 'ibo',
4217 'ii': 'iii',
4218 'ik': 'ipk',
4219 'io': 'ido',
4220 'is': 'isl',
4221 'it': 'ita',
4222 'iu': 'iku',
4223 'ja': 'jpn',
4224 'jv': 'jav',
4225 'ka': 'kat',
4226 'kg': 'kon',
4227 'ki': 'kik',
4228 'kj': 'kua',
4229 'kk': 'kaz',
4230 'kl': 'kal',
4231 'km': 'khm',
4232 'kn': 'kan',
4233 'ko': 'kor',
4234 'kr': 'kau',
4235 'ks': 'kas',
4236 'ku': 'kur',
4237 'kv': 'kom',
4238 'kw': 'cor',
4239 'ky': 'kir',
4240 'la': 'lat',
4241 'lb': 'ltz',
4242 'lg': 'lug',
4243 'li': 'lim',
4244 'ln': 'lin',
4245 'lo': 'lao',
4246 'lt': 'lit',
4247 'lu': 'lub',
4248 'lv': 'lav',
4249 'mg': 'mlg',
4250 'mh': 'mah',
4251 'mi': 'mri',
4252 'mk': 'mkd',
4253 'ml': 'mal',
4254 'mn': 'mon',
4255 'mr': 'mar',
4256 'ms': 'msa',
4257 'mt': 'mlt',
4258 'my': 'mya',
4259 'na': 'nau',
4260 'nb': 'nob',
4261 'nd': 'nde',
4262 'ne': 'nep',
4263 'ng': 'ndo',
4264 'nl': 'nld',
4265 'nn': 'nno',
4266 'no': 'nor',
4267 'nr': 'nbl',
4268 'nv': 'nav',
4269 'ny': 'nya',
4270 'oc': 'oci',
4271 'oj': 'oji',
4272 'om': 'orm',
4273 'or': 'ori',
4274 'os': 'oss',
4275 'pa': 'pan',
4276 'pi': 'pli',
4277 'pl': 'pol',
4278 'ps': 'pus',
4279 'pt': 'por',
4280 'qu': 'que',
4281 'rm': 'roh',
4282 'rn': 'run',
4283 'ro': 'ron',
4284 'ru': 'rus',
4285 'rw': 'kin',
4286 'sa': 'san',
4287 'sc': 'srd',
4288 'sd': 'snd',
4289 'se': 'sme',
4290 'sg': 'sag',
4291 'si': 'sin',
4292 'sk': 'slk',
4293 'sl': 'slv',
4294 'sm': 'smo',
4295 'sn': 'sna',
4296 'so': 'som',
4297 'sq': 'sqi',
4298 'sr': 'srp',
4299 'ss': 'ssw',
4300 'st': 'sot',
4301 'su': 'sun',
4302 'sv': 'swe',
4303 'sw': 'swa',
4304 'ta': 'tam',
4305 'te': 'tel',
4306 'tg': 'tgk',
4307 'th': 'tha',
4308 'ti': 'tir',
4309 'tk': 'tuk',
4310 'tl': 'tgl',
4311 'tn': 'tsn',
4312 'to': 'ton',
4313 'tr': 'tur',
4314 'ts': 'tso',
4315 'tt': 'tat',
4316 'tw': 'twi',
4317 'ty': 'tah',
4318 'ug': 'uig',
4319 'uk': 'ukr',
4320 'ur': 'urd',
4321 'uz': 'uzb',
4322 've': 'ven',
4323 'vi': 'vie',
4324 'vo': 'vol',
4325 'wa': 'wln',
4326 'wo': 'wol',
4327 'xh': 'xho',
4328 'yi': 'yid',
e9a50fba 4329 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
4330 'yo': 'yor',
4331 'za': 'zha',
4332 'zh': 'zho',
4333 'zu': 'zul',
4334 }
4335
4336 @classmethod
4337 def short2long(cls, code):
4338 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4339 return cls._lang_map.get(code[:2])
4340
4341 @classmethod
4342 def long2short(cls, code):
4343 """Convert language code from ISO 639-2/T to ISO 639-1"""
4344 for short_name, long_name in cls._lang_map.items():
4345 if long_name == code:
4346 return short_name
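# Illustrative conversions:
#   ISO639Utils.short2long('en')   -> 'eng'
#   ISO639Utils.long2short('deu')  -> 'de'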
4347
4348
86e5f3ed 4349class ISO3166Utils:
4eb10f66
YCH
4350 # From http://data.okfn.org/data/core/country-list
4351 _country_map = {
4352 'AF': 'Afghanistan',
4353 'AX': 'Åland Islands',
4354 'AL': 'Albania',
4355 'DZ': 'Algeria',
4356 'AS': 'American Samoa',
4357 'AD': 'Andorra',
4358 'AO': 'Angola',
4359 'AI': 'Anguilla',
4360 'AQ': 'Antarctica',
4361 'AG': 'Antigua and Barbuda',
4362 'AR': 'Argentina',
4363 'AM': 'Armenia',
4364 'AW': 'Aruba',
4365 'AU': 'Australia',
4366 'AT': 'Austria',
4367 'AZ': 'Azerbaijan',
4368 'BS': 'Bahamas',
4369 'BH': 'Bahrain',
4370 'BD': 'Bangladesh',
4371 'BB': 'Barbados',
4372 'BY': 'Belarus',
4373 'BE': 'Belgium',
4374 'BZ': 'Belize',
4375 'BJ': 'Benin',
4376 'BM': 'Bermuda',
4377 'BT': 'Bhutan',
4378 'BO': 'Bolivia, Plurinational State of',
4379 'BQ': 'Bonaire, Sint Eustatius and Saba',
4380 'BA': 'Bosnia and Herzegovina',
4381 'BW': 'Botswana',
4382 'BV': 'Bouvet Island',
4383 'BR': 'Brazil',
4384 'IO': 'British Indian Ocean Territory',
4385 'BN': 'Brunei Darussalam',
4386 'BG': 'Bulgaria',
4387 'BF': 'Burkina Faso',
4388 'BI': 'Burundi',
4389 'KH': 'Cambodia',
4390 'CM': 'Cameroon',
4391 'CA': 'Canada',
4392 'CV': 'Cape Verde',
4393 'KY': 'Cayman Islands',
4394 'CF': 'Central African Republic',
4395 'TD': 'Chad',
4396 'CL': 'Chile',
4397 'CN': 'China',
4398 'CX': 'Christmas Island',
4399 'CC': 'Cocos (Keeling) Islands',
4400 'CO': 'Colombia',
4401 'KM': 'Comoros',
4402 'CG': 'Congo',
4403 'CD': 'Congo, the Democratic Republic of the',
4404 'CK': 'Cook Islands',
4405 'CR': 'Costa Rica',
4406 'CI': 'Côte d\'Ivoire',
4407 'HR': 'Croatia',
4408 'CU': 'Cuba',
4409 'CW': 'Curaçao',
4410 'CY': 'Cyprus',
4411 'CZ': 'Czech Republic',
4412 'DK': 'Denmark',
4413 'DJ': 'Djibouti',
4414 'DM': 'Dominica',
4415 'DO': 'Dominican Republic',
4416 'EC': 'Ecuador',
4417 'EG': 'Egypt',
4418 'SV': 'El Salvador',
4419 'GQ': 'Equatorial Guinea',
4420 'ER': 'Eritrea',
4421 'EE': 'Estonia',
4422 'ET': 'Ethiopia',
4423 'FK': 'Falkland Islands (Malvinas)',
4424 'FO': 'Faroe Islands',
4425 'FJ': 'Fiji',
4426 'FI': 'Finland',
4427 'FR': 'France',
4428 'GF': 'French Guiana',
4429 'PF': 'French Polynesia',
4430 'TF': 'French Southern Territories',
4431 'GA': 'Gabon',
4432 'GM': 'Gambia',
4433 'GE': 'Georgia',
4434 'DE': 'Germany',
4435 'GH': 'Ghana',
4436 'GI': 'Gibraltar',
4437 'GR': 'Greece',
4438 'GL': 'Greenland',
4439 'GD': 'Grenada',
4440 'GP': 'Guadeloupe',
4441 'GU': 'Guam',
4442 'GT': 'Guatemala',
4443 'GG': 'Guernsey',
4444 'GN': 'Guinea',
4445 'GW': 'Guinea-Bissau',
4446 'GY': 'Guyana',
4447 'HT': 'Haiti',
4448 'HM': 'Heard Island and McDonald Islands',
4449 'VA': 'Holy See (Vatican City State)',
4450 'HN': 'Honduras',
4451 'HK': 'Hong Kong',
4452 'HU': 'Hungary',
4453 'IS': 'Iceland',
4454 'IN': 'India',
4455 'ID': 'Indonesia',
4456 'IR': 'Iran, Islamic Republic of',
4457 'IQ': 'Iraq',
4458 'IE': 'Ireland',
4459 'IM': 'Isle of Man',
4460 'IL': 'Israel',
4461 'IT': 'Italy',
4462 'JM': 'Jamaica',
4463 'JP': 'Japan',
4464 'JE': 'Jersey',
4465 'JO': 'Jordan',
4466 'KZ': 'Kazakhstan',
4467 'KE': 'Kenya',
4468 'KI': 'Kiribati',
4469 'KP': 'Korea, Democratic People\'s Republic of',
4470 'KR': 'Korea, Republic of',
4471 'KW': 'Kuwait',
4472 'KG': 'Kyrgyzstan',
4473 'LA': 'Lao People\'s Democratic Republic',
4474 'LV': 'Latvia',
4475 'LB': 'Lebanon',
4476 'LS': 'Lesotho',
4477 'LR': 'Liberia',
4478 'LY': 'Libya',
4479 'LI': 'Liechtenstein',
4480 'LT': 'Lithuania',
4481 'LU': 'Luxembourg',
4482 'MO': 'Macao',
4483 'MK': 'Macedonia, the Former Yugoslav Republic of',
4484 'MG': 'Madagascar',
4485 'MW': 'Malawi',
4486 'MY': 'Malaysia',
4487 'MV': 'Maldives',
4488 'ML': 'Mali',
4489 'MT': 'Malta',
4490 'MH': 'Marshall Islands',
4491 'MQ': 'Martinique',
4492 'MR': 'Mauritania',
4493 'MU': 'Mauritius',
4494 'YT': 'Mayotte',
4495 'MX': 'Mexico',
4496 'FM': 'Micronesia, Federated States of',
4497 'MD': 'Moldova, Republic of',
4498 'MC': 'Monaco',
4499 'MN': 'Mongolia',
4500 'ME': 'Montenegro',
4501 'MS': 'Montserrat',
4502 'MA': 'Morocco',
4503 'MZ': 'Mozambique',
4504 'MM': 'Myanmar',
4505 'NA': 'Namibia',
4506 'NR': 'Nauru',
4507 'NP': 'Nepal',
4508 'NL': 'Netherlands',
4509 'NC': 'New Caledonia',
4510 'NZ': 'New Zealand',
4511 'NI': 'Nicaragua',
4512 'NE': 'Niger',
4513 'NG': 'Nigeria',
4514 'NU': 'Niue',
4515 'NF': 'Norfolk Island',
4516 'MP': 'Northern Mariana Islands',
4517 'NO': 'Norway',
4518 'OM': 'Oman',
4519 'PK': 'Pakistan',
4520 'PW': 'Palau',
4521 'PS': 'Palestine, State of',
4522 'PA': 'Panama',
4523 'PG': 'Papua New Guinea',
4524 'PY': 'Paraguay',
4525 'PE': 'Peru',
4526 'PH': 'Philippines',
4527 'PN': 'Pitcairn',
4528 'PL': 'Poland',
4529 'PT': 'Portugal',
4530 'PR': 'Puerto Rico',
4531 'QA': 'Qatar',
4532 'RE': 'Réunion',
4533 'RO': 'Romania',
4534 'RU': 'Russian Federation',
4535 'RW': 'Rwanda',
4536 'BL': 'Saint Barthélemy',
4537 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4538 'KN': 'Saint Kitts and Nevis',
4539 'LC': 'Saint Lucia',
4540 'MF': 'Saint Martin (French part)',
4541 'PM': 'Saint Pierre and Miquelon',
4542 'VC': 'Saint Vincent and the Grenadines',
4543 'WS': 'Samoa',
4544 'SM': 'San Marino',
4545 'ST': 'Sao Tome and Principe',
4546 'SA': 'Saudi Arabia',
4547 'SN': 'Senegal',
4548 'RS': 'Serbia',
4549 'SC': 'Seychelles',
4550 'SL': 'Sierra Leone',
4551 'SG': 'Singapore',
4552 'SX': 'Sint Maarten (Dutch part)',
4553 'SK': 'Slovakia',
4554 'SI': 'Slovenia',
4555 'SB': 'Solomon Islands',
4556 'SO': 'Somalia',
4557 'ZA': 'South Africa',
4558 'GS': 'South Georgia and the South Sandwich Islands',
4559 'SS': 'South Sudan',
4560 'ES': 'Spain',
4561 'LK': 'Sri Lanka',
4562 'SD': 'Sudan',
4563 'SR': 'Suriname',
4564 'SJ': 'Svalbard and Jan Mayen',
4565 'SZ': 'Swaziland',
4566 'SE': 'Sweden',
4567 'CH': 'Switzerland',
4568 'SY': 'Syrian Arab Republic',
4569 'TW': 'Taiwan, Province of China',
4570 'TJ': 'Tajikistan',
4571 'TZ': 'Tanzania, United Republic of',
4572 'TH': 'Thailand',
4573 'TL': 'Timor-Leste',
4574 'TG': 'Togo',
4575 'TK': 'Tokelau',
4576 'TO': 'Tonga',
4577 'TT': 'Trinidad and Tobago',
4578 'TN': 'Tunisia',
4579 'TR': 'Turkey',
4580 'TM': 'Turkmenistan',
4581 'TC': 'Turks and Caicos Islands',
4582 'TV': 'Tuvalu',
4583 'UG': 'Uganda',
4584 'UA': 'Ukraine',
4585 'AE': 'United Arab Emirates',
4586 'GB': 'United Kingdom',
4587 'US': 'United States',
4588 'UM': 'United States Minor Outlying Islands',
4589 'UY': 'Uruguay',
4590 'UZ': 'Uzbekistan',
4591 'VU': 'Vanuatu',
4592 'VE': 'Venezuela, Bolivarian Republic of',
4593 'VN': 'Viet Nam',
4594 'VG': 'Virgin Islands, British',
4595 'VI': 'Virgin Islands, U.S.',
4596 'WF': 'Wallis and Futuna',
4597 'EH': 'Western Sahara',
4598 'YE': 'Yemen',
4599 'ZM': 'Zambia',
4600 'ZW': 'Zimbabwe',
2f97cc61 4601 # Not ISO 3166 codes, but used for IP blocks
4602 'AP': 'Asia/Pacific Region',
4603 'EU': 'Europe',
4eb10f66
YCH
4604 }
4605
4606 @classmethod
4607 def short2full(cls, code):
4608 """Convert an ISO 3166-2 country code to the corresponding full name"""
4609 return cls._country_map.get(code.upper())
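# Illustrative lookup (case-insensitive):
#   ISO3166Utils.short2full('de')  -> 'Germany'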
4610
4611
86e5f3ed 4612class GeoUtils:
773f291d
S
4613 # Major IPv4 address blocks per country
4614 _country_ip_map = {
53896ca5 4615 'AD': '46.172.224.0/19',
773f291d
S
4616 'AE': '94.200.0.0/13',
4617 'AF': '149.54.0.0/17',
4618 'AG': '209.59.64.0/18',
4619 'AI': '204.14.248.0/21',
4620 'AL': '46.99.0.0/16',
4621 'AM': '46.70.0.0/15',
4622 'AO': '105.168.0.0/13',
53896ca5
S
4623 'AP': '182.50.184.0/21',
4624 'AQ': '23.154.160.0/24',
773f291d
S
4625 'AR': '181.0.0.0/12',
4626 'AS': '202.70.112.0/20',
53896ca5 4627 'AT': '77.116.0.0/14',
773f291d
S
4628 'AU': '1.128.0.0/11',
4629 'AW': '181.41.0.0/18',
53896ca5
S
4630 'AX': '185.217.4.0/22',
4631 'AZ': '5.197.0.0/16',
773f291d
S
4632 'BA': '31.176.128.0/17',
4633 'BB': '65.48.128.0/17',
4634 'BD': '114.130.0.0/16',
4635 'BE': '57.0.0.0/8',
53896ca5 4636 'BF': '102.178.0.0/15',
773f291d
S
4637 'BG': '95.42.0.0/15',
4638 'BH': '37.131.0.0/17',
4639 'BI': '154.117.192.0/18',
4640 'BJ': '137.255.0.0/16',
53896ca5 4641 'BL': '185.212.72.0/23',
773f291d
S
4642 'BM': '196.12.64.0/18',
4643 'BN': '156.31.0.0/16',
4644 'BO': '161.56.0.0/16',
4645 'BQ': '161.0.80.0/20',
53896ca5 4646 'BR': '191.128.0.0/12',
773f291d
S
4647 'BS': '24.51.64.0/18',
4648 'BT': '119.2.96.0/19',
4649 'BW': '168.167.0.0/16',
4650 'BY': '178.120.0.0/13',
4651 'BZ': '179.42.192.0/18',
4652 'CA': '99.224.0.0/11',
4653 'CD': '41.243.0.0/16',
53896ca5
S
4654 'CF': '197.242.176.0/21',
4655 'CG': '160.113.0.0/16',
773f291d 4656 'CH': '85.0.0.0/13',
53896ca5 4657 'CI': '102.136.0.0/14',
773f291d
S
4658 'CK': '202.65.32.0/19',
4659 'CL': '152.172.0.0/14',
53896ca5 4660 'CM': '102.244.0.0/14',
773f291d
S
4661 'CN': '36.128.0.0/10',
4662 'CO': '181.240.0.0/12',
4663 'CR': '201.192.0.0/12',
4664 'CU': '152.206.0.0/15',
4665 'CV': '165.90.96.0/19',
4666 'CW': '190.88.128.0/17',
53896ca5 4667 'CY': '31.153.0.0/16',
773f291d
S
4668 'CZ': '88.100.0.0/14',
4669 'DE': '53.0.0.0/8',
4670 'DJ': '197.241.0.0/17',
4671 'DK': '87.48.0.0/12',
4672 'DM': '192.243.48.0/20',
4673 'DO': '152.166.0.0/15',
4674 'DZ': '41.96.0.0/12',
4675 'EC': '186.68.0.0/15',
4676 'EE': '90.190.0.0/15',
4677 'EG': '156.160.0.0/11',
4678 'ER': '196.200.96.0/20',
4679 'ES': '88.0.0.0/11',
4680 'ET': '196.188.0.0/14',
4681 'EU': '2.16.0.0/13',
4682 'FI': '91.152.0.0/13',
4683 'FJ': '144.120.0.0/16',
53896ca5 4684 'FK': '80.73.208.0/21',
773f291d
S
4685 'FM': '119.252.112.0/20',
4686 'FO': '88.85.32.0/19',
4687 'FR': '90.0.0.0/9',
4688 'GA': '41.158.0.0/15',
4689 'GB': '25.0.0.0/8',
4690 'GD': '74.122.88.0/21',
4691 'GE': '31.146.0.0/16',
4692 'GF': '161.22.64.0/18',
4693 'GG': '62.68.160.0/19',
53896ca5
S
4694 'GH': '154.160.0.0/12',
4695 'GI': '95.164.0.0/16',
773f291d
S
4696 'GL': '88.83.0.0/19',
4697 'GM': '160.182.0.0/15',
4698 'GN': '197.149.192.0/18',
4699 'GP': '104.250.0.0/19',
4700 'GQ': '105.235.224.0/20',
4701 'GR': '94.64.0.0/13',
4702 'GT': '168.234.0.0/16',
4703 'GU': '168.123.0.0/16',
4704 'GW': '197.214.80.0/20',
4705 'GY': '181.41.64.0/18',
4706 'HK': '113.252.0.0/14',
4707 'HN': '181.210.0.0/16',
4708 'HR': '93.136.0.0/13',
4709 'HT': '148.102.128.0/17',
4710 'HU': '84.0.0.0/14',
4711 'ID': '39.192.0.0/10',
4712 'IE': '87.32.0.0/12',
4713 'IL': '79.176.0.0/13',
4714 'IM': '5.62.80.0/20',
4715 'IN': '117.192.0.0/10',
4716 'IO': '203.83.48.0/21',
4717 'IQ': '37.236.0.0/14',
4718 'IR': '2.176.0.0/12',
4719 'IS': '82.221.0.0/16',
4720 'IT': '79.0.0.0/10',
4721 'JE': '87.244.64.0/18',
4722 'JM': '72.27.0.0/17',
4723 'JO': '176.29.0.0/16',
53896ca5 4724 'JP': '133.0.0.0/8',
773f291d
S
4725 'KE': '105.48.0.0/12',
4726 'KG': '158.181.128.0/17',
4727 'KH': '36.37.128.0/17',
4728 'KI': '103.25.140.0/22',
4729 'KM': '197.255.224.0/20',
53896ca5 4730 'KN': '198.167.192.0/19',
773f291d
S
4731 'KP': '175.45.176.0/22',
4732 'KR': '175.192.0.0/10',
4733 'KW': '37.36.0.0/14',
4734 'KY': '64.96.0.0/15',
4735 'KZ': '2.72.0.0/13',
4736 'LA': '115.84.64.0/18',
4737 'LB': '178.135.0.0/16',
53896ca5 4738 'LC': '24.92.144.0/20',
773f291d
S
4739 'LI': '82.117.0.0/19',
4740 'LK': '112.134.0.0/15',
53896ca5 4741 'LR': '102.183.0.0/16',
773f291d
S
4742 'LS': '129.232.0.0/17',
4743 'LT': '78.56.0.0/13',
4744 'LU': '188.42.0.0/16',
4745 'LV': '46.109.0.0/16',
4746 'LY': '41.252.0.0/14',
4747 'MA': '105.128.0.0/11',
4748 'MC': '88.209.64.0/18',
4749 'MD': '37.246.0.0/16',
4750 'ME': '178.175.0.0/17',
4751 'MF': '74.112.232.0/21',
4752 'MG': '154.126.0.0/17',
4753 'MH': '117.103.88.0/21',
4754 'MK': '77.28.0.0/15',
4755 'ML': '154.118.128.0/18',
4756 'MM': '37.111.0.0/17',
4757 'MN': '49.0.128.0/17',
4758 'MO': '60.246.0.0/16',
4759 'MP': '202.88.64.0/20',
4760 'MQ': '109.203.224.0/19',
4761 'MR': '41.188.64.0/18',
4762 'MS': '208.90.112.0/22',
4763 'MT': '46.11.0.0/16',
4764 'MU': '105.16.0.0/12',
4765 'MV': '27.114.128.0/18',
53896ca5 4766 'MW': '102.70.0.0/15',
773f291d
S
4767 'MX': '187.192.0.0/11',
4768 'MY': '175.136.0.0/13',
4769 'MZ': '197.218.0.0/15',
4770 'NA': '41.182.0.0/16',
4771 'NC': '101.101.0.0/18',
4772 'NE': '197.214.0.0/18',
4773 'NF': '203.17.240.0/22',
4774 'NG': '105.112.0.0/12',
4775 'NI': '186.76.0.0/15',
4776 'NL': '145.96.0.0/11',
4777 'NO': '84.208.0.0/13',
4778 'NP': '36.252.0.0/15',
4779 'NR': '203.98.224.0/19',
4780 'NU': '49.156.48.0/22',
4781 'NZ': '49.224.0.0/14',
4782 'OM': '5.36.0.0/15',
4783 'PA': '186.72.0.0/15',
4784 'PE': '186.160.0.0/14',
4785 'PF': '123.50.64.0/18',
4786 'PG': '124.240.192.0/19',
4787 'PH': '49.144.0.0/13',
4788 'PK': '39.32.0.0/11',
4789 'PL': '83.0.0.0/11',
4790 'PM': '70.36.0.0/20',
4791 'PR': '66.50.0.0/16',
4792 'PS': '188.161.0.0/16',
4793 'PT': '85.240.0.0/13',
4794 'PW': '202.124.224.0/20',
4795 'PY': '181.120.0.0/14',
4796 'QA': '37.210.0.0/15',
53896ca5 4797 'RE': '102.35.0.0/16',
773f291d 4798 'RO': '79.112.0.0/13',
53896ca5 4799 'RS': '93.86.0.0/15',
773f291d 4800 'RU': '5.136.0.0/13',
53896ca5 4801 'RW': '41.186.0.0/16',
773f291d
S
4802 'SA': '188.48.0.0/13',
4803 'SB': '202.1.160.0/19',
4804 'SC': '154.192.0.0/11',
53896ca5 4805 'SD': '102.120.0.0/13',
773f291d 4806 'SE': '78.64.0.0/12',
53896ca5 4807 'SG': '8.128.0.0/10',
773f291d
S
4808 'SI': '188.196.0.0/14',
4809 'SK': '78.98.0.0/15',
53896ca5 4810 'SL': '102.143.0.0/17',
773f291d
S
4811 'SM': '89.186.32.0/19',
4812 'SN': '41.82.0.0/15',
53896ca5 4813 'SO': '154.115.192.0/18',
773f291d
S
4814 'SR': '186.179.128.0/17',
4815 'SS': '105.235.208.0/21',
4816 'ST': '197.159.160.0/19',
4817 'SV': '168.243.0.0/16',
4818 'SX': '190.102.0.0/20',
4819 'SY': '5.0.0.0/16',
4820 'SZ': '41.84.224.0/19',
4821 'TC': '65.255.48.0/20',
4822 'TD': '154.68.128.0/19',
4823 'TG': '196.168.0.0/14',
4824 'TH': '171.96.0.0/13',
4825 'TJ': '85.9.128.0/18',
4826 'TK': '27.96.24.0/21',
4827 'TL': '180.189.160.0/20',
4828 'TM': '95.85.96.0/19',
4829 'TN': '197.0.0.0/11',
4830 'TO': '175.176.144.0/21',
4831 'TR': '78.160.0.0/11',
4832 'TT': '186.44.0.0/15',
4833 'TV': '202.2.96.0/19',
4834 'TW': '120.96.0.0/11',
4835 'TZ': '156.156.0.0/14',
53896ca5
S
4836 'UA': '37.52.0.0/14',
4837 'UG': '102.80.0.0/13',
4838 'US': '6.0.0.0/8',
773f291d 4839 'UY': '167.56.0.0/13',
53896ca5 4840 'UZ': '84.54.64.0/18',
773f291d 4841 'VA': '212.77.0.0/19',
53896ca5 4842 'VC': '207.191.240.0/21',
773f291d 4843 'VE': '186.88.0.0/13',
53896ca5 4844 'VG': '66.81.192.0/20',
773f291d
S
4845 'VI': '146.226.0.0/16',
4846 'VN': '14.160.0.0/11',
4847 'VU': '202.80.32.0/20',
4848 'WF': '117.20.32.0/21',
4849 'WS': '202.4.32.0/19',
4850 'YE': '134.35.0.0/16',
4851 'YT': '41.242.116.0/22',
4852 'ZA': '41.0.0.0/11',
53896ca5
S
4853 'ZM': '102.144.0.0/13',
4854 'ZW': '102.177.192.0/18',
773f291d
S
4855 }
4856
4857 @classmethod
5f95927a
S
4858 def random_ipv4(cls, code_or_block):
4859 if len(code_or_block) == 2:
4860 block = cls._country_ip_map.get(code_or_block.upper())
4861 if not block:
4862 return None
4863 else:
4864 block = code_or_block
773f291d 4865 addr, preflen = block.split('/')
ac668111 4866 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4867 addr_max = addr_min | (0xffffffff >> int(preflen))
14f25df2 4868 return str(socket.inet_ntoa(
ac668111 4869 struct.pack('!L', random.randint(addr_min, addr_max))))
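# Illustrative usage: a 2-letter country code picks its block from the map above,
# anything else is treated as a CIDR block directly (hypothetical examples):
#   GeoUtils.random_ipv4('DE')              -> some address inside 53.0.0.0/8
#   GeoUtils.random_ipv4('203.0.113.0/24')  -> some address inside 203.0.113.0/24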
773f291d
S
4870
4871
ac668111 4872class PerRequestProxyHandler(urllib.request.ProxyHandler):
2461f79d
PH
4873 def __init__(self, proxies=None):
4874 # Set default handlers
4875 for type in ('http', 'https'):
4876 setattr(self, '%s_open' % type,
4877 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4878 meth(r, proxy, type))
ac668111 4879 urllib.request.ProxyHandler.__init__(self, proxies)
2461f79d 4880
91410c9b 4881 def proxy_open(self, req, proxy, type):
2461f79d 4882 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
4883 if req_proxy is not None:
4884 proxy = req_proxy
2461f79d
PH
4885 del req.headers['Ytdl-request-proxy']
4886
4887 if proxy == '__noproxy__':
4888 return None # No Proxy
14f25df2 4889 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188 4890 req.add_header('Ytdl-socks-proxy', proxy)
7a5c1cfe 4891 # yt-dlp's http/https handlers do wrapping the socket with socks
71aff188 4892 return None
ac668111 4893 return urllib.request.ProxyHandler.proxy_open(
91410c9b 4894 self, req, proxy, type)
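# Usage note (illustrative, not part of the original module): a request can carry a
# 'Ytdl-request-proxy' header to override the handler-level proxy for that request
# only; the sentinel value '__noproxy__' disables proxying, and socks/socks4/
# socks4a/socks5 URLs are forwarded via the 'Ytdl-socks-proxy' header so that the
# http/https handlers can wrap the socket themselves.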
5bc880b9
YCH
4895
4896
0a5445dd
YCH
4897# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4898# released into Public Domain
4899# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4900
4901def long_to_bytes(n, blocksize=0):
4902 """long_to_bytes(n:long, blocksize:int) : string
4903 Convert a long integer to a byte string.
4904
4905 If optional blocksize is given and greater than zero, pad the front of the
4906 byte string with binary zeros so that the length is a multiple of
4907 blocksize.
4908 """
4909 # after much testing, this algorithm was deemed to be the fastest
4910 s = b''
4911 n = int(n)
4912 while n > 0:
ac668111 4913 s = struct.pack('>I', n & 0xffffffff) + s
0a5445dd
YCH
4914 n = n >> 32
4915 # strip off leading zeros
4916 for i in range(len(s)):
4917 if s[i] != b'\000'[0]:
4918 break
4919 else:
4920 # only happens when n == 0
4921 s = b'\000'
4922 i = 0
4923 s = s[i:]
4924 # add back some pad bytes. this could be done more efficiently w.r.t. the
4925 # de-padding being done above, but sigh...
4926 if blocksize > 0 and len(s) % blocksize:
4927 s = (blocksize - len(s) % blocksize) * b'\000' + s
4928 return s
4929
4930
4931def bytes_to_long(s):
4932 """bytes_to_long(string) : long
4933 Convert a byte string to a long integer.
4934
4935 This is (essentially) the inverse of long_to_bytes().
4936 """
4937 acc = 0
4938 length = len(s)
4939 if length % 4:
4940 extra = (4 - length % 4)
4941 s = b'\000' * extra + s
4942 length = length + extra
4943 for i in range(0, length, 4):
ac668111 4944 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
0a5445dd
YCH
4945 return acc
4946
4947
5bc880b9
YCH
4948def ohdave_rsa_encrypt(data, exponent, modulus):
4949 '''
4950 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4951
4952 Input:
4953 data: data to encrypt, bytes-like object
4954 exponent, modulus: parameter e and N of RSA algorithm, both integer
4955 Output: hex string of encrypted data
4956
4957 Limitation: supports one block encryption only
4958 '''
4959
4960 payload = int(binascii.hexlify(data[::-1]), 16)
4961 encrypted = pow(payload, exponent, modulus)
4962 return '%x' % encrypted
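# Illustrative sketch (not part of the original module) with toy RSA parameters
# (e=17, N=3233=61*53); real sites supply far larger exponent/modulus values:
# >>> ohdave_rsa_encrypt(b'\x02', 17, 3233)   # pow(2, 17, 3233) == 1752
# '6d8'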
81bdc8fd
YCH
4963
4964
f48409c7
YCH
4965def pkcs1pad(data, length):
4966 """
4967 Padding input data with PKCS#1 scheme
4968
4969 @param {int[]} data input data
4970 @param {int} length target length
4971 @returns {int[]} padded data
4972 """
4973 if len(data) > length - 11:
4974 raise ValueError('Input data too long for PKCS#1 padding')
4975
4976 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4977 return [0, 2] + pseudo_random + [0] + data
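# Illustrative sketch (not part of the original module): the output is a list of
# ints of the requested length, starting with [0, 2], followed by random padding
# bytes, a 0 separator and then the input data. The names below are only for the
# example.
# >>> padded = pkcs1pad([1, 2, 3], 16)
# >>> len(padded), padded[:2], padded[-4:]
# (16, [0, 2], [0, 1, 2, 3])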
4978
4979
7b2c3f47 4980def _base_n_table(n, table):
4981 if not table and not n:
4982 raise ValueError('Either table or n must be specified')
612f2be5 4983 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4984
44f14eb4 4985 if n and n != len(table):
612f2be5 4986 raise ValueError(f'base {n} exceeds table length {len(table)}')
4987 return table
59f898b7 4988
5eb6bdce 4989
7b2c3f47 4990def encode_base_n(num, n=None, table=None):
4991 """Convert given int to a base-n string"""
612f2be5 4992 table = _base_n_table(n, table)
7b2c3f47 4993 if not num:
5eb6bdce
YCH
4994 return table[0]
4995
7b2c3f47 4996 result, base = '', len(table)
81bdc8fd 4997 while num:
7b2c3f47 4998 result = table[num % base] + result
612f2be5 4999 num = num // base
7b2c3f47 5000 return result
5001
5002
5003def decode_base_n(string, n=None, table=None):
5004 """Convert given base-n string to int"""
5005 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
5006 result, base = 0, len(table)
5007 for char in string:
5008 result = result * base + table[char]
5009 return result
5010
5011
f52354a8 5012def decode_packed_codes(code):
06b3fe29 5013 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 5014 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
5015 base = int(base)
5016 count = int(count)
5017 symbols = symbols.split('|')
5018 symbol_table = {}
5019
5020 while count:
5021 count -= 1
5eb6bdce 5022 base_n_count = encode_base_n(count, base)
f52354a8
YCH
5023 symbol_table[base_n_count] = symbols[count] or base_n_count
5024
5025 return re.sub(
5026 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 5027 obfuscated_code)
e154c651 5028
5029
1ced2221
S
5030def caesar(s, alphabet, shift):
5031 if shift == 0:
5032 return s
5033 l = len(alphabet)
5034 return ''.join(
5035 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
5036 for c in s)
5037
5038
5039def rot47(s):
5040 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
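# Illustrative sketch (not part of the original module): characters outside the
# alphabet pass through unchanged, and rot47 is its own inverse.
# >>> caesar('abc xyz', 'abcdefghijklmnopqrstuvwxyz', 1)
# 'bcd yza'
# >>> rot47(rot47('some obfuscated string')) == 'some obfuscated string'
# True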
5041
5042
e154c651 5043def parse_m3u8_attributes(attrib):
5044 info = {}
5045 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
5046 if val.startswith('"'):
5047 val = val[1:-1]
5048 info[key] = val
5049 return info
1143535d
YCH
5050
5051
5052def urshift(val, n):
5053 return val >> n if val >= 0 else (val + 0x100000000) >> n
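# Illustrative sketch (not part of the original module): emulates an unsigned
# 32-bit right shift (JavaScript's '>>>') for negative Python ints.
# >>> urshift(16, 2)
# 4
# >>> urshift(-16, 2)   # (-16 + 0x100000000) >> 2
# 1073741820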
d3f8e038
YCH
5054
5055
efa97bdc 5056def write_xattr(path, key, value):
6f7563be 5057 # Windows: Write xattrs to NTFS Alternate Data Streams:
5058 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5059 if compat_os_name == 'nt':
5060 assert ':' not in key
5061 assert os.path.exists(path)
efa97bdc
YCH
5062
5063 try:
6f7563be 5064 with open(f'{path}:{key}', 'wb') as f:
5065 f.write(value)
86e5f3ed 5066 except OSError as e:
efa97bdc 5067 raise XAttrMetadataError(e.errno, e.strerror)
6f7563be 5068 return
efa97bdc 5069
6f7563be 5070 # UNIX Method 1. Use xattrs/pyxattrs modules
efa97bdc 5071
6f7563be 5072 setxattr = None
5073 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5074 # Unicode arguments are not supported in pyxattr until version 0.5.0
5075 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5076 if version_tuple(xattr.__version__) >= (0, 5, 0):
5077 setxattr = xattr.set
5078 elif xattr:
5079 setxattr = xattr.setxattr
efa97bdc 5080
6f7563be 5081 if setxattr:
5082 try:
5083 setxattr(path, key, value)
5084 except OSError as e:
5085 raise XAttrMetadataError(e.errno, e.strerror)
5086 return
efa97bdc 5087
6f7563be 5088 # UNIX Method 2. Use setfattr/xattr executables
5089 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5090 else 'xattr' if check_executable('xattr', ['-h']) else None)
5091 if not exe:
5092 raise XAttrUnavailableError(
5093 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5094 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
efa97bdc 5095
0f06bcd7 5096 value = value.decode()
6f7563be 5097 try:
f0c9fb96 5098 _, stderr, returncode = Popen.run(
6f7563be 5099 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
e121e3ce 5100 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
6f7563be 5101 except OSError as e:
5102 raise XAttrMetadataError(e.errno, e.strerror)
f0c9fb96 5103 if returncode:
5104 raise XAttrMetadataError(returncode, stderr)
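# Illustrative usage sketch (not part of the original module); the path, key and
# URL below are assumptions for the example, and the value must be bytes:
# write_xattr('video.mp4', 'user.xdg.referrer.url', b'https://example.com/watch')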
0c265486
YCH
5105
5106
5107def random_birthday(year_field, month_field, day_field):
aa374bc7
AS
5108 start_date = datetime.date(1950, 1, 1)
5109 end_date = datetime.date(1995, 12, 31)
5110 offset = random.randint(0, (end_date - start_date).days)
5111 random_date = start_date + datetime.timedelta(offset)
0c265486 5112 return {
aa374bc7
AS
5113 year_field: str(random_date.year),
5114 month_field: str(random_date.month),
5115 day_field: str(random_date.day),
0c265486 5116 }
732044af 5117
c76eb41b 5118
8c53322c
L
5119def find_available_port(interface=''):
5120 try:
5121 with socket.socket() as sock:
5122 sock.bind((interface, 0))
5123 return sock.getsockname()[1]
5124 except OSError:
5125 return None
5126
5127
732044af 5128# Templates for internet shortcut files, which are plain text files.
e5a998f3 5129DOT_URL_LINK_TEMPLATE = '''\
732044af 5130[InternetShortcut]
5131URL=%(url)s
e5a998f3 5132'''
732044af 5133
e5a998f3 5134DOT_WEBLOC_LINK_TEMPLATE = '''\
732044af 5135<?xml version="1.0" encoding="UTF-8"?>
5136<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5137<plist version="1.0">
5138<dict>
5139\t<key>URL</key>
5140\t<string>%(url)s</string>
5141</dict>
5142</plist>
e5a998f3 5143'''
732044af 5144
e5a998f3 5145DOT_DESKTOP_LINK_TEMPLATE = '''\
732044af 5146[Desktop Entry]
5147Encoding=UTF-8
5148Name=%(filename)s
5149Type=Link
5150URL=%(url)s
5151Icon=text-html
e5a998f3 5152'''
732044af 5153
08438d2c 5154LINK_TEMPLATES = {
5155 'url': DOT_URL_LINK_TEMPLATE,
5156 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5157 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5158}
5159
732044af 5160
5161def iri_to_uri(iri):
5162 """
5163 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5164
5165 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5166 """
5167
14f25df2 5168 iri_parts = urllib.parse.urlparse(iri)
732044af 5169
5170 if '[' in iri_parts.netloc:
 5171 raise ValueError('IPv6 URIs are not yet supported.')
5172 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5173
5174 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5175
5176 net_location = ''
5177 if iri_parts.username:
f9934b96 5178 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
732044af 5179 if iri_parts.password is not None:
f9934b96 5180 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
732044af 5181 net_location += '@'
5182
0f06bcd7 5183 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
732044af 5184 # The 'idna' encoding produces ASCII text.
5185 if iri_parts.port is not None and iri_parts.port != 80:
5186 net_location += ':' + str(iri_parts.port)
5187
f9934b96 5188 return urllib.parse.urlunparse(
732044af 5189 (iri_parts.scheme,
5190 net_location,
5191
f9934b96 5192 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5193
5194 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
f9934b96 5195 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5196
5197 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
f9934b96 5198 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
732044af 5199
f9934b96 5200 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
732044af 5201
5202 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
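# Illustrative sketch (not part of the original module); the URL is a made-up
# example showing the percent-encoding of non-ASCII path characters:
# >>> iri_to_uri('http://example.com/ünïcödé')
# 'http://example.com/%C3%BCn%C3%AFc%C3%B6d%C3%A9'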
5203
5204
5205def to_high_limit_path(path):
5206 if sys.platform in ['win32', 'cygwin']:
5207 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
e5a998f3 5208 return '\\\\?\\' + os.path.abspath(path)
732044af 5209
5210 return path
76d321f6 5211
c76eb41b 5212
7b2c3f47 5213def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
69bec673 5214 val = traversal.traverse_obj(obj, *variadic(field))
6f2287cb 5215 if not val if ignore is NO_DEFAULT else val in variadic(ignore):
e0ddbd02 5216 return default
7b2c3f47 5217 return template % func(val)
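# Illustrative sketch (not part of the original module); the dict and field names
# are assumptions for the example:
# >>> format_field({'height': 1080}, 'height', '%dp')
# '1080p'
# >>> format_field({}, 'height', '%dp', default='unknown')
# 'unknown'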
00dd0cd5 5218
5219
5220def clean_podcast_url(url):
5221 return re.sub(r'''(?x)
5222 (?:
5223 (?:
5224 chtbl\.com/track|
5225 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5226 play\.podtrac\.com
5227 )/[^/]+|
5228 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5229 flex\.acast\.com|
5230 pd(?:
5231 cn\.co| # https://podcorn.com/analytics-prefix/
5232 st\.fm # https://podsights.com/docs/
5233 )/e
5234 )/''', '', url)
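# Illustrative sketch (not part of the original module); the URL is a made-up
# example of a tracking-prefixed podcast enclosure:
# >>> clean_podcast_url('https://chtbl.com/track/12345/example.com/episode.mp3')
# 'https://example.com/episode.mp3'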
ffcb8191
THD
5235
5236
5237_HEX_TABLE = '0123456789abcdef'
5238
5239
5240def random_uuidv4():
5241 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 5242
5243
5244def make_dir(path, to_screen=None):
5245 try:
5246 dn = os.path.dirname(path)
b25d6cb9
AI
5247 if dn:
5248 os.makedirs(dn, exist_ok=True)
0202b52a 5249 return True
86e5f3ed 5250 except OSError as err:
0202b52a 5251 if callable(to_screen):
69bec673 5252 to_screen(f'unable to create directory: {err}')
0202b52a 5253 return False
f74980cb 5254
5255
5256def get_executable_path():
69bec673 5257 from ..update import _get_variant_and_executable_path
c487cf00 5258
b5899f4f 5259 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
f74980cb 5260
5261
8e40b9d1 5262def get_user_config_dirs(package_name):
8e40b9d1
M
5263 # .config (e.g. ~/.config/package_name)
5264 xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
773c272d 5265 yield os.path.join(xdg_config_home, package_name)
8e40b9d1
M
5266
5267 # appdata (%APPDATA%/package_name)
5268 appdata_dir = os.getenv('appdata')
5269 if appdata_dir:
773c272d 5270 yield os.path.join(appdata_dir, package_name)
8e40b9d1
M
5271
5272 # home (~/.package_name)
773c272d 5273 yield os.path.join(compat_expanduser('~'), f'.{package_name}')
8e40b9d1
M
5274
5275
5276def get_system_config_dirs(package_name):
8e40b9d1 5277 # /etc/package_name
773c272d 5278 yield os.path.join('/etc', package_name)
06167fbb 5279
5280
3e9b66d7 5281def time_seconds(**kwargs):
83c4970e
L
5282 """
5283 Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
5284 """
5285 return time.time() + datetime.timedelta(**kwargs).total_seconds()
3e9b66d7
LNO
5286
5287
49fa4d9a
N
5288# create a JSON Web Signature (jws) with HS256 algorithm
5289# the resulting format is in JWS Compact Serialization
5290# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5291# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5292def jwt_encode_hs256(payload_data, key, headers={}):
5293 header_data = {
5294 'alg': 'HS256',
5295 'typ': 'JWT',
5296 }
5297 if headers:
5298 header_data.update(headers)
0f06bcd7 5299 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5300 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5301 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
49fa4d9a
N
5302 signature_b64 = base64.b64encode(h.digest())
5303 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5304 return token
819e0531 5305
5306
16b0d7e6 5307# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5308def jwt_decode_hs256(jwt):
5309 header_b64, payload_b64, signature_b64 = jwt.split('.')
2c98d998 5310 # add back trailing '='s that may have been stripped; superfluous '='s are ignored
5311 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
16b0d7e6 5312 return payload_data
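# Illustrative sketch (not part of the original module); the key and payload are
# assumptions for the example. jwt_decode_hs256 only extracts the payload and does
# not verify the signature:
# >>> token = jwt_encode_hs256({'sub': 'example'}, 'secret-key')
# >>> token.count(b'.')   # header.payload.signature
# 2
# >>> jwt_decode_hs256(token.decode())
# {'sub': 'example'}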
5313
5314
53973b4d 5315WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5316
5317
7a32c70d 5318@functools.cache
819e0531 5319def supports_terminal_sequences(stream):
5320 if compat_os_name == 'nt':
8a82af35 5321 if not WINDOWS_VT_MODE:
819e0531 5322 return False
5323 elif not os.getenv('TERM'):
5324 return False
5325 try:
5326 return stream.isatty()
5327 except BaseException:
5328 return False
5329
5330
c53a18f0 5331def windows_enable_vt_mode():
5332 """Ref: https://bugs.python.org/issue30075 """
8a82af35 5333 if get_windows_version() < (10, 0, 10586):
53973b4d 5334 return
53973b4d 5335
c53a18f0 5336 import ctypes
5337 import ctypes.wintypes
5338 import msvcrt
5339
5340 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
5341
5342 dll = ctypes.WinDLL('kernel32', use_last_error=False)
5343 handle = os.open('CONOUT$', os.O_RDWR)
c53a18f0 5344 try:
5345 h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
5346 dw_original_mode = ctypes.wintypes.DWORD()
5347 success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
5348 if not success:
5349 raise Exception('GetConsoleMode failed')
5350
5351 success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
5352 dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
5353 if not success:
5354 raise Exception('SetConsoleMode failed')
c53a18f0 5355 finally:
5356 os.close(handle)
53973b4d 5357
f0795149 5358 global WINDOWS_VT_MODE
5359 WINDOWS_VT_MODE = True
5360 supports_terminal_sequences.cache_clear()
5361
53973b4d 5362
ec11a9f4 5363_terminal_sequences_re = re.compile('\033\\[[^m]+m')
5364
5365
5366def remove_terminal_sequences(string):
5367 return _terminal_sequences_re.sub('', string)
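# Illustrative sketch (not part of the original module): strips ANSI color codes.
# >>> remove_terminal_sequences('\033[0;31mred\033[0m text')
# 'red text'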
5368
5369
5370def number_of_digits(number):
5371 return len('%d' % number)
34921b43 5372
5373
5374def join_nonempty(*values, delim='-', from_dict=None):
5375 if from_dict is not None:
69bec673 5376 values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
34921b43 5377 return delim.join(map(str, filter(None, values)))
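# Illustrative sketch (not part of the original module); falsy values are dropped
# before joining:
# >>> join_nonempty('mp4', None, 1080, '', delim='-')
# 'mp4-1080'
# >>> join_nonempty('en', 'US', delim='_')
# 'en_US'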
06e57990 5378
5379
27231526
ZM
5380def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5381 """
5382 Find the largest format dimensions in terms of video width and, for each thumbnail:
5383 * Modify the URL: Match the width with the provided regex and replace with the former width
5384 * Update dimensions
5385
5386 This function is useful with video services that scale the provided thumbnails on demand
5387 """
5388 _keys = ('width', 'height')
5389 max_dimensions = max(
86e5f3ed 5390 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
27231526
ZM
5391 default=(0, 0))
5392 if not max_dimensions[0]:
5393 return thumbnails
5394 return [
5395 merge_dicts(
5396 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5397 dict(zip(_keys, max_dimensions)), thumbnail)
5398 for thumbnail in thumbnails
5399 ]
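# Illustrative sketch (not part of the original module); the format/thumbnail
# dicts and the width regex are assumptions for the example:
# >>> scale_thumbnails_to_max_format_width(
# ...     [{'width': 1920, 'height': 1080}],
# ...     [{'url': 'https://example.com/320/thumb.jpg'}], r'\d+(?=/thumb)')
# [{'url': 'https://example.com/1920/thumb.jpg', 'width': 1920, 'height': 1080}]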
5400
5401
93c8410d
LNO
5402def parse_http_range(range):
5403 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5404 if not range:
5405 return None, None, None
5406 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5407 if not crg:
5408 return None, None, None
5409 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
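# Illustrative sketch (not part of the original module):
# >>> parse_http_range('bytes 0-499/1234')
# (0, 499, 1234)
# >>> parse_http_range('bytes=500-')
# (500, None, None)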
5410
5411
6b9e832d 5412def read_stdin(what):
5413 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5414 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5415 return sys.stdin
5416
5417
a904a7f8
L
5418def determine_file_encoding(data):
5419 """
88f60feb 5420 Detect the text encoding used
a904a7f8
L
5421 @returns (encoding, bytes to skip)
5422 """
5423
88f60feb 5424 # BOMs are given priority over coding declarations
a904a7f8 5425 for bom, enc in BOMS:
a904a7f8
L
5426 if data.startswith(bom):
5427 return enc, len(bom)
5428
88f60feb 5429 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5430 # We ignore the endianness to get a good enough match
a904a7f8 5431 data = data.replace(b'\0', b'')
88f60feb 5432 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5433 return mobj.group(1).decode() if mobj else None, 0
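# Illustrative sketch (not part of the original module); a config file may start
# with a BOM or with a Python-style coding declaration:
# >>> determine_file_encoding(b'# coding: utf-8\n--verbose')
# ('utf-8', 0)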
a904a7f8
L
5434
5435
06e57990 5436class Config:
5437 own_args = None
9e491463 5438 parsed_args = None
06e57990 5439 filename = None
5440 __initialized = False
5441
5442 def __init__(self, parser, label=None):
9e491463 5443 self.parser, self.label = parser, label
06e57990 5444 self._loaded_paths, self.configs = set(), []
5445
5446 def init(self, args=None, filename=None):
5447 assert not self.__initialized
284a60c5 5448 self.own_args, self.filename = args, filename
5449 return self.load_configs()
5450
5451 def load_configs(self):
65662dff 5452 directory = ''
284a60c5 5453 if self.filename:
5454 location = os.path.realpath(self.filename)
65662dff 5455 directory = os.path.dirname(location)
06e57990 5456 if location in self._loaded_paths:
5457 return False
5458 self._loaded_paths.add(location)
5459
284a60c5 5460 self.__initialized = True
5461 opts, _ = self.parser.parse_known_args(self.own_args)
5462 self.parsed_args = self.own_args
9e491463 5463 for location in opts.config_locations or []:
6b9e832d 5464 if location == '-':
1060f82f 5465 if location in self._loaded_paths:
5466 continue
5467 self._loaded_paths.add(location)
6b9e832d 5468 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5469 continue
65662dff 5470 location = os.path.join(directory, expand_path(location))
06e57990 5471 if os.path.isdir(location):
5472 location = os.path.join(location, 'yt-dlp.conf')
5473 if not os.path.exists(location):
9e491463 5474 self.parser.error(f'config location {location} does not exist')
06e57990 5475 self.append_config(self.read_file(location), location)
5476 return True
5477
5478 def __str__(self):
5479 label = join_nonempty(
5480 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5481 delim=' ')
5482 return join_nonempty(
5483 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5484 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5485 delim='\n')
5486
7a32c70d 5487 @staticmethod
06e57990 5488 def read_file(filename, default=[]):
5489 try:
a904a7f8 5490 optionf = open(filename, 'rb')
86e5f3ed 5491 except OSError:
06e57990 5492 return default # silently skip if file is not present
a904a7f8
L
5493 try:
5494 enc, skip = determine_file_encoding(optionf.read(512))
5495 optionf.seek(skip, io.SEEK_SET)
5496 except OSError:
5497 enc = None # silently skip read errors
06e57990 5498 try:
5499 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
a904a7f8 5500 contents = optionf.read().decode(enc or preferredencoding())
f9934b96 5501 res = shlex.split(contents, comments=True)
44a6fcff 5502 except Exception as err:
5503 raise ValueError(f'Unable to parse "{filename}": {err}')
06e57990 5504 finally:
5505 optionf.close()
5506 return res
5507
7a32c70d 5508 @staticmethod
06e57990 5509 def hide_login_info(opts):
86e5f3ed 5510 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
06e57990 5511 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5512
5513 def _scrub_eq(o):
5514 m = eqre.match(o)
5515 if m:
5516 return m.group('key') + '=PRIVATE'
5517 else:
5518 return o
5519
5520 opts = list(map(_scrub_eq, opts))
5521 for idx, opt in enumerate(opts):
5522 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5523 opts[idx + 1] = 'PRIVATE'
5524 return opts
5525
5526 def append_config(self, *args, label=None):
9e491463 5527 config = type(self)(self.parser, label)
06e57990 5528 config._loaded_paths = self._loaded_paths
5529 if config.init(*args):
5530 self.configs.append(config)
5531
7a32c70d 5532 @property
06e57990 5533 def all_args(self):
5534 for config in reversed(self.configs):
5535 yield from config.all_args
9e491463 5536 yield from self.parsed_args or []
5537
5538 def parse_known_args(self, **kwargs):
5539 return self.parser.parse_known_args(self.all_args, **kwargs)
06e57990 5540
5541 def parse_args(self):
9e491463 5542 return self.parser.parse_args(self.all_args)
da42679b
LNO
5543
5544
d5d1df8a 5545class WebSocketsWrapper:
da42679b 5546 """Wraps websockets module to use in non-async scopes"""
abfecb7b 5547 pool = None
da42679b 5548
3cea3edd 5549 def __init__(self, url, headers=None, connect=True):
059bc4db 5550 self.loop = asyncio.new_event_loop()
9cd08050 5551 # XXX: "loop" is deprecated
5552 self.conn = websockets.connect(
5553 url, extra_headers=headers, ping_interval=None,
5554 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
3cea3edd
LNO
5555 if connect:
5556 self.__enter__()
15dfb392 5557 atexit.register(self.__exit__, None, None, None)
da42679b
LNO
5558
5559 def __enter__(self):
3cea3edd 5560 if not self.pool:
9cd08050 5561 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
da42679b
LNO
5562 return self
5563
5564 def send(self, *args):
5565 self.run_with_loop(self.pool.send(*args), self.loop)
5566
5567 def recv(self, *args):
5568 return self.run_with_loop(self.pool.recv(*args), self.loop)
5569
5570 def __exit__(self, type, value, traceback):
5571 try:
5572 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5573 finally:
5574 self.loop.close()
15dfb392 5575 self._cancel_all_tasks(self.loop)
da42679b
LNO
5576
5577 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
 5578 # for contributors: if any new library that uses asyncio needs to be run in non-async code, move these functions out of this class
7a32c70d 5579 @staticmethod
da42679b 5580 def run_with_loop(main, loop):
059bc4db 5581 if not asyncio.iscoroutine(main):
da42679b
LNO
5582 raise ValueError(f'a coroutine was expected, got {main!r}')
5583
5584 try:
5585 return loop.run_until_complete(main)
5586 finally:
5587 loop.run_until_complete(loop.shutdown_asyncgens())
5588 if hasattr(loop, 'shutdown_default_executor'):
5589 loop.run_until_complete(loop.shutdown_default_executor())
5590
7a32c70d 5591 @staticmethod
da42679b 5592 def _cancel_all_tasks(loop):
059bc4db 5593 to_cancel = asyncio.all_tasks(loop)
da42679b
LNO
5594
5595 if not to_cancel:
5596 return
5597
5598 for task in to_cancel:
5599 task.cancel()
5600
9cd08050 5601 # XXX: "loop" is removed in python 3.10+
da42679b 5602 loop.run_until_complete(
059bc4db 5603 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
da42679b
LNO
5604
5605 for task in to_cancel:
5606 if task.cancelled():
5607 continue
5608 if task.exception() is not None:
5609 loop.call_exception_handler({
5610 'message': 'unhandled exception during asyncio.run() shutdown',
5611 'exception': task.exception(),
5612 'task': task,
5613 })
5614
5615
8b7539d2 5616def merge_headers(*dicts):
08d30158 5617 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
76aa9913 5618 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
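# Illustrative sketch (not part of the original module); keys are title-cased so
# later dicts overwrite earlier ones case-insensitively:
# >>> merge_headers({'user-agent': 'UA1', 'Accept': '*/*'}, {'User-Agent': 'UA2'})
# {'User-Agent': 'UA2', 'Accept': '*/*'}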
28787f16 5619
5620
b1f94422 5621def cached_method(f):
5622 """Cache a method"""
5623 signature = inspect.signature(f)
5624
7a32c70d 5625 @functools.wraps(f)
b1f94422 5626 def wrapper(self, *args, **kwargs):
5627 bound_args = signature.bind(self, *args, **kwargs)
5628 bound_args.apply_defaults()
d5d1df8a 5629 key = tuple(bound_args.arguments.values())[1:]
b1f94422 5630
6368e2e6 5631 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
b1f94422 5632 if key not in cache:
5633 cache[key] = f(self, *args, **kwargs)
5634 return cache[key]
5635 return wrapper
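# Illustrative sketch (not part of the original module); the class below is a
# made-up example showing that repeated calls with the same arguments hit the
# per-instance cache instead of re-running the method:
#
#   class Fetcher:
#       @cached_method
#       def fetch(self, url):
#           print('fetching', url)
#           return url.upper()
#
#   f = Fetcher()
#   f.fetch('a')   # prints 'fetching a', returns 'A'
#   f.fetch('a')   # cached: returns 'A' without printing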
5636
5637
28787f16 5638class classproperty:
83cc7b8a 5639 """property access for class methods with optional caching"""
5640 def __new__(cls, func=None, *args, **kwargs):
5641 if not func:
5642 return functools.partial(cls, *args, **kwargs)
5643 return super().__new__(cls)
c487cf00 5644
83cc7b8a 5645 def __init__(self, func, *, cache=False):
c487cf00 5646 functools.update_wrapper(self, func)
5647 self.func = func
83cc7b8a 5648 self._cache = {} if cache else None
28787f16 5649
5650 def __get__(self, _, cls):
83cc7b8a 5651 if self._cache is None:
5652 return self.func(cls)
5653 elif cls not in self._cache:
5654 self._cache[cls] = self.func(cls)
5655 return self._cache[cls]
19a03940 5656
5657
a5387729 5658class function_with_repr:
b2e0343b 5659 def __init__(self, func, repr_=None):
a5387729 5660 functools.update_wrapper(self, func)
b2e0343b 5661 self.func, self.__repr = func, repr_
a5387729 5662
5663 def __call__(self, *args, **kwargs):
5664 return self.func(*args, **kwargs)
5665
5666 def __repr__(self):
b2e0343b 5667 if self.__repr:
5668 return self.__repr
a5387729 5669 return f'{self.func.__module__}.{self.func.__qualname__}'
5670
5671
64fa820c 5672class Namespace(types.SimpleNamespace):
591bb9d3 5673 """Immutable namespace"""
591bb9d3 5674
7896214c 5675 def __iter__(self):
64fa820c 5676 return iter(self.__dict__.values())
7896214c 5677
7a32c70d 5678 @property
64fa820c 5679 def items_(self):
5680 return self.__dict__.items()
9b8ee23b 5681
5682
8dc59305 5683MEDIA_EXTENSIONS = Namespace(
5684 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5685 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5686 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
fbb73833 5687 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
8dc59305 5688 thumbnails=('jpg', 'png', 'webp'),
5689 storyboards=('mhtml', ),
5690 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5691 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5692)
5693MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5694MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5695
5696KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5697
5698
be5c1ae8 5699class RetryManager:
5700 """Usage:
5701 for retry in RetryManager(...):
5702 try:
5703 ...
5704 except SomeException as err:
5705 retry.error = err
5706 continue
5707 """
5708 attempt, _error = 0, None
5709
5710 def __init__(self, _retries, _error_callback, **kwargs):
5711 self.retries = _retries or 0
5712 self.error_callback = functools.partial(_error_callback, **kwargs)
5713
5714 def _should_retry(self):
5715 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5716
7a32c70d 5717 @property
be5c1ae8 5718 def error(self):
5719 if self._error is NO_DEFAULT:
5720 return None
5721 return self._error
5722
7a32c70d 5723 @error.setter
be5c1ae8 5724 def error(self, value):
5725 self._error = value
5726
5727 def __iter__(self):
5728 while self._should_retry():
5729 self.error = NO_DEFAULT
5730 self.attempt += 1
5731 yield self
5732 if self.error:
5733 self.error_callback(self.error, self.attempt, self.retries)
5734
7a32c70d 5735 @staticmethod
be5c1ae8 5736 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5737 """Utility function for reporting retries"""
5738 if count > retries:
5739 if error:
5740 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5741 raise e
5742
5743 if not count:
5744 return warn(e)
5745 elif isinstance(e, ExtractorError):
3ce29336 5746 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
be5c1ae8 5747 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5748
5749 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5750 if delay:
5751 info(f'Sleeping {delay:.2f} seconds ...')
5752 time.sleep(delay)
5753
5754
0647d925 5755def make_archive_id(ie, video_id):
5756 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5757 return f'{ie_key.lower()} {video_id}'
5758
5759
a1c5bd82 5760def truncate_string(s, left, right=0):
5761 assert left > 3 and right >= 0
5762 if s is None or len(s) <= left + right:
5763 return s
71df9b7f 5764 return f'{s[:left-3]}...{s[-right:] if right else ""}'
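# Illustrative sketch (not part of the original module): 'left' includes the three
# characters of the ellipsis.
# >>> truncate_string('abcdefghij', 5)
# 'ab...'
# >>> truncate_string('abcdefghij', 5, 2)
# 'ab...ij'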
a1c5bd82 5765
5766
5314b521 5767def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
5768 assert 'all' in alias_dict, '"all" alias is required'
5769 requested = list(start or [])
5770 for val in options:
5771 discard = val.startswith('-')
5772 if discard:
5773 val = val[1:]
5774
5775 if val in alias_dict:
5776 val = alias_dict[val] if not discard else [
5777 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
5778 # NB: Do not allow regex in aliases for performance
5779 requested = orderedSet_from_options(val, alias_dict, start=requested)
5780 continue
5781
5782 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
5783 else [val] if val in alias_dict['all'] else None)
5784 if current is None:
5785 raise ValueError(val)
5786
5787 if discard:
5788 for item in current:
5789 while item in requested:
5790 requested.remove(item)
5791 else:
5792 requested.extend(current)
5793
5794 return orderedSet(requested)
5795
5796
d0d74b71 5797class FormatSorter:
5798 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
5799
5800 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
5801 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
5802 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
5803 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
5804 'height', 'width', 'proto', 'vext', 'abr', 'aext',
5805 'fps', 'fs_approx', 'source', 'id')
5806
5807 settings = {
5808 'vcodec': {'type': 'ordered', 'regex': True,
5809 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
5810 'acodec': {'type': 'ordered', 'regex': True,
71082216 5811 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
d0d74b71 5812 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
5813 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
5814 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
5815 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
5816 'vext': {'type': 'ordered', 'field': 'video_ext',
29ca4082 5817 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
5818 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
fbb73833 5819 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
5820 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
5821 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
d0d74b71 5822 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
5823 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
5824 'field': ('vcodec', 'acodec'),
5825 'function': lambda it: int(any(v != 'none' for v in it))},
5826 'ie_pref': {'priority': True, 'type': 'extractor'},
5827 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
5828 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
5829 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
5830 'quality': {'convert': 'float', 'default': -1},
5831 'filesize': {'convert': 'bytes'},
5832 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
5833 'id': {'convert': 'string', 'field': 'format_id'},
5834 'height': {'convert': 'float_none'},
5835 'width': {'convert': 'float_none'},
5836 'fps': {'convert': 'float_none'},
5837 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
5838 'tbr': {'convert': 'float_none'},
5839 'vbr': {'convert': 'float_none'},
5840 'abr': {'convert': 'float_none'},
5841 'asr': {'convert': 'float_none'},
5842 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
5843
5844 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
5845 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
5846 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
5847 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
5848 'res': {'type': 'multiple', 'field': ('height', 'width'),
5849 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
5850
5851 # Actual field names
5852 'format_id': {'type': 'alias', 'field': 'id'},
5853 'preference': {'type': 'alias', 'field': 'ie_pref'},
5854 'language_preference': {'type': 'alias', 'field': 'lang'},
5855 'source_preference': {'type': 'alias', 'field': 'source'},
5856 'protocol': {'type': 'alias', 'field': 'proto'},
5857 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
5858 'audio_channels': {'type': 'alias', 'field': 'channels'},
5859
5860 # Deprecated
5861 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
5862 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
5863 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
5864 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
5865 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
5866 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
5867 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
5868 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
5869 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
5870 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
5871 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
5872 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
5873 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
5874 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
5875 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5876 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5877 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5878 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5879 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5880 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5881 }
5882
5883 def __init__(self, ydl, field_preference):
5884 self.ydl = ydl
5885 self._order = []
5886 self.evaluate_params(self.ydl.params, field_preference)
5887 if ydl.params.get('verbose'):
5888 self.print_verbose_info(self.ydl.write_debug)
5889
5890 def _get_field_setting(self, field, key):
5891 if field not in self.settings:
5892 if key in ('forced', 'priority'):
5893 return False
5894 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
5895 'deprecated and may be removed in a future version')
5896 self.settings[field] = {}
5897 propObj = self.settings[field]
5898 if key not in propObj:
5899 type = propObj.get('type')
5900 if key == 'field':
5901 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
5902 elif key == 'convert':
5903 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
5904 else:
5905 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
5906 propObj[key] = default
5907 return propObj[key]
5908
5909 def _resolve_field_value(self, field, value, convertNone=False):
5910 if value is None:
5911 if not convertNone:
5912 return None
5913 else:
5914 value = value.lower()
5915 conversion = self._get_field_setting(field, 'convert')
5916 if conversion == 'ignore':
5917 return None
5918 if conversion == 'string':
5919 return value
5920 elif conversion == 'float_none':
5921 return float_or_none(value)
5922 elif conversion == 'bytes':
5923 return parse_bytes(value)
5924 elif conversion == 'order':
5925 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
5926 use_regex = self._get_field_setting(field, 'regex')
5927 list_length = len(order_list)
5928 empty_pos = order_list.index('') if '' in order_list else list_length + 1
5929 if use_regex and value is not None:
5930 for i, regex in enumerate(order_list):
5931 if regex and re.match(regex, value):
5932 return list_length - i
5933 return list_length - empty_pos # not in list
5934 else: # not regex or value = None
5935 return list_length - (order_list.index(value) if value in order_list else empty_pos)
5936 else:
5937 if value.isnumeric():
5938 return float(value)
5939 else:
5940 self.settings[field]['convert'] = 'string'
5941 return value
5942
5943 def evaluate_params(self, params, sort_extractor):
5944 self._use_free_order = params.get('prefer_free_formats', False)
5945 self._sort_user = params.get('format_sort', [])
5946 self._sort_extractor = sort_extractor
5947
5948 def add_item(field, reverse, closest, limit_text):
5949 field = field.lower()
5950 if field in self._order:
5951 return
5952 self._order.append(field)
5953 limit = self._resolve_field_value(field, limit_text)
5954 data = {
5955 'reverse': reverse,
5956 'closest': False if limit is None else closest,
5957 'limit_text': limit_text,
5958 'limit': limit}
5959 if field in self.settings:
5960 self.settings[field].update(data)
5961 else:
5962 self.settings[field] = data
5963
5964 sort_list = (
5965 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5966 + (tuple() if params.get('format_sort_force', False)
5967 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5968 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5969
5970 for item in sort_list:
5971 match = re.match(self.regex, item)
5972 if match is None:
5973 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
5974 field = match.group('field')
5975 if field is None:
5976 continue
5977 if self._get_field_setting(field, 'type') == 'alias':
5978 alias, field = field, self._get_field_setting(field, 'field')
5979 if self._get_field_setting(alias, 'deprecated'):
5980 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5981 f'be removed in a future version. Please use {field} instead')
5982 reverse = match.group('reverse') is not None
5983 closest = match.group('separator') == '~'
5984 limit_text = match.group('limit')
5985
5986 has_limit = limit_text is not None
5987 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5988 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5989
5990 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
5991 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
5992 limit_count = len(limits)
5993 for (i, f) in enumerate(fields):
5994 add_item(f, reverse, closest,
5995 limits[i] if i < limit_count
5996 else limits[0] if has_limit and not has_multiple_limits
5997 else None)
5998
5999 def print_verbose_info(self, write_debug):
6000 if self._sort_user:
6001 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
6002 if self._sort_extractor:
6003 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
6004 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
6005 '+' if self._get_field_setting(field, 'reverse') else '', field,
6006 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
6007 self._get_field_setting(field, 'limit_text'),
6008 self._get_field_setting(field, 'limit'))
6009 if self._get_field_setting(field, 'limit_text') is not None else '')
6010 for field in self._order if self._get_field_setting(field, 'visible')]))
6011
6012 def _calculate_field_preference_from_value(self, format, field, type, value):
6013 reverse = self._get_field_setting(field, 'reverse')
6014 closest = self._get_field_setting(field, 'closest')
6015 limit = self._get_field_setting(field, 'limit')
6016
6017 if type == 'extractor':
6018 maximum = self._get_field_setting(field, 'max')
6019 if value is None or (maximum is not None and value >= maximum):
6020 value = -1
6021 elif type == 'boolean':
6022 in_list = self._get_field_setting(field, 'in_list')
6023 not_in_list = self._get_field_setting(field, 'not_in_list')
6024 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
6025 elif type == 'ordered':
6026 value = self._resolve_field_value(field, value, True)
6027
6028 # try to convert to number
6029 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
6030 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
6031 if is_num:
6032 value = val_num
6033
6034 return ((-10, 0) if value is None
6035 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
6036 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
6037 else (0, value, 0) if not reverse and (limit is None or value <= limit)
6038 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
6039 else (-1, value, 0))
6040
6041 def _calculate_field_preference(self, format, field):
6042 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
6043 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
6044 if type == 'multiple':
6045 type = 'field' # Only 'field' is allowed in multiple for now
6046 actual_fields = self._get_field_setting(field, 'field')
6047
6048 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
6049 else:
6050 value = get_value(field)
6051 return self._calculate_field_preference_from_value(format, field, type, value)
6052
6053 def calculate_preference(self, format):
6054 # Determine missing protocol
6055 if not format.get('protocol'):
6056 format['protocol'] = determine_protocol(format)
6057
6058 # Determine missing ext
6059 if not format.get('ext') and 'url' in format:
6060 format['ext'] = determine_ext(format['url'])
6061 if format.get('vcodec') == 'none':
6062 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
6063 format['video_ext'] = 'none'
6064 else:
6065 format['video_ext'] = format['ext']
6066 format['audio_ext'] = 'none'
6067 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
6068 # format['preference'] = -1000
6069
5424dbaf
L
6070 if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
 6071 # HEVC-over-FLV is not allowed by FLV's original specification
6072 # ref. https://trac.ffmpeg.org/ticket/6389
6073 # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
6074 format['preference'] = -100
6075
d0d74b71 6076 # Determine missing bitrates
6077 if format.get('tbr') is None:
6078 if format.get('vbr') is not None and format.get('abr') is not None:
6079 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
6080 else:
6081 if format.get('vcodec') != 'none' and format.get('vbr') is None:
6082 format['vbr'] = format.get('tbr') - format.get('abr', 0)
6083 if format.get('acodec') != 'none' and format.get('abr') is None:
6084 format['abr'] = format.get('tbr') - format.get('vbr', 0)
6085
6086 return tuple(self._calculate_field_preference(format, field) for field in self._order)