6929b41a 1import asyncio
15dfb392 2import atexit
1e399778 3import base64
5bc880b9 4import binascii
912b38b4 5import calendar
676eb3f2 6import codecs
c380cc28 7import collections
ab029d7e 8import collections.abc
62e609ab 9import contextlib
c496ca96 10import datetime
0c265486 11import email.header
f8271158 12import email.utils
f45c185f 13import errno
d77c3dfd 14import gzip
15import hashlib
16import hmac
ac668111 17import html.entities
18import html.parser
54007a45 19import http.client
20import http.cookiejar
b1f94422 21import inspect
03f9daab 22import io
79a2e94e 23import itertools
f4bfd65f 24import json
d77c3dfd 25import locale
02dbf93f 26import math
f8271158 27import mimetypes
347de493 28import operator
d77c3dfd 29import os
c496ca96 30import platform
773f291d 31import random
d77c3dfd 32import re
f8271158 33import shlex
c496ca96 34import socket
79a2e94e 35import ssl
ac668111 36import struct
1c088fa8 37import subprocess
d77c3dfd 38import sys
181c8655 39import tempfile
c380cc28 40import time
01951dda 41import traceback
64fa820c 42import types
989a01c2 43import unicodedata
14f25df2 44import urllib.error
f8271158 45import urllib.parse
ac668111 46import urllib.request
bcf89ce6 47import xml.etree.ElementTree
d77c3dfd 48import zlib
d77c3dfd 49
69bec673 50from . import traversal
51
52from ..compat import functools # isort: split
53from ..compat import (
36e6f62c 54 compat_etree_fromstring,
51098426 55 compat_expanduser,
f8271158 56 compat_HTMLParseError,
efa97bdc 57 compat_os_name,
702ccf2d 58 compat_shlex_quote,
8c25f81b 59)
69bec673 60from ..dependencies import brotli, certifi, websockets, xattr
61from ..socks import ProxyType, sockssocket
51fb4995 62
63# This is not clearly defined otherwise
64compiled_regex_type = type(re.compile(''))
65
66
67def random_user_agent():
68 _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
69 _CHROME_VERSIONS = (
19b4c74d 70 '90.0.4430.212',
71 '90.0.4430.24',
72 '90.0.4430.70',
73 '90.0.4430.72',
74 '90.0.4430.85',
75 '90.0.4430.93',
76 '91.0.4472.101',
77 '91.0.4472.106',
78 '91.0.4472.114',
79 '91.0.4472.124',
80 '91.0.4472.164',
81 '91.0.4472.19',
82 '91.0.4472.77',
83 '92.0.4515.107',
84 '92.0.4515.115',
85 '92.0.4515.131',
86 '92.0.4515.159',
87 '92.0.4515.43',
88 '93.0.4556.0',
89 '93.0.4577.15',
90 '93.0.4577.63',
91 '93.0.4577.82',
92 '94.0.4606.41',
93 '94.0.4606.54',
94 '94.0.4606.61',
95 '94.0.4606.71',
96 '94.0.4606.81',
97 '94.0.4606.85',
98 '95.0.4638.17',
99 '95.0.4638.50',
100 '95.0.4638.54',
101 '95.0.4638.69',
102 '95.0.4638.74',
103 '96.0.4664.18',
104 '96.0.4664.45',
105 '96.0.4664.55',
106 '96.0.4664.93',
107 '97.0.4692.20',
108 )
109 return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
110
111
4390d5ec 112SUPPORTED_ENCODINGS = [
113 'gzip', 'deflate'
114]
9b8ee23b 115if brotli:
4390d5ec 116 SUPPORTED_ENCODINGS.append('br')
117
3e669f36 118std_headers = {
f7a147e3 119 'User-Agent': random_user_agent(),
59ae15a5 120 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
59ae15a5 121 'Accept-Language': 'en-us,en;q=0.5',
b1156c1e 122 'Sec-Fetch-Mode': 'navigate',
3e669f36 123}
f427df17 124
5f6a1245 125
126USER_AGENTS = {
127 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
128}
129
130
bf42a990 131NO_DEFAULT = object()
7b2c3f47 132IDENTITY = lambda x: x
bf42a990 133
134ENGLISH_MONTH_NAMES = [
135 'January', 'February', 'March', 'April', 'May', 'June',
136 'July', 'August', 'September', 'October', 'November', 'December']
137
138MONTH_NAMES = {
139 'en': ENGLISH_MONTH_NAMES,
140 'fr': [
141 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
142 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
78545664 143 # these follow the genitive grammatical case (dopełniacz)
144 # some websites might be using nominative, which will require another month list
145 # https://en.wikibooks.org/wiki/Polish/Noun_cases
146 'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
147 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
f6717dec 148}
a942d6cb 149
8f53dc44 150# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
151TIMEZONE_NAMES = {
152 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
153 'AST': -4, 'ADT': -3, # Atlantic (used in Canada)
154 'EST': -5, 'EDT': -4, # Eastern
155 'CST': -6, 'CDT': -5, # Central
156 'MST': -7, 'MDT': -6, # Mountain
157 'PST': -8, 'PDT': -7 # Pacific
158}
159
c587cbb7 160# needed for sanitizing filenames in restricted mode
c8827027 161ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
162 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
163 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 164
165DATE_FORMATS = (
166 '%d %B %Y',
167 '%d %b %Y',
168 '%B %d %Y',
169 '%B %dst %Y',
170 '%B %dnd %Y',
9d30c213 171 '%B %drd %Y',
cb655f34 172 '%B %dth %Y',
46f59e89 173 '%b %d %Y',
174 '%b %dst %Y',
175 '%b %dnd %Y',
9d30c213 176 '%b %drd %Y',
cb655f34 177 '%b %dth %Y',
178 '%b %dst %Y %I:%M',
179 '%b %dnd %Y %I:%M',
9d30c213 180 '%b %drd %Y %I:%M',
181 '%b %dth %Y %I:%M',
182 '%Y %m %d',
183 '%Y-%m-%d',
bccdbd22 184 '%Y.%m.%d.',
46f59e89 185 '%Y/%m/%d',
81c13222 186 '%Y/%m/%d %H:%M',
46f59e89 187 '%Y/%m/%d %H:%M:%S',
188 '%Y%m%d%H%M',
189 '%Y%m%d%H%M%S',
4f3fa23e 190 '%Y%m%d',
0c1c6f4b 191 '%Y-%m-%d %H:%M',
192 '%Y-%m-%d %H:%M:%S',
193 '%Y-%m-%d %H:%M:%S.%f',
5014558a 194 '%Y-%m-%d %H:%M:%S:%f',
195 '%d.%m.%Y %H:%M',
196 '%d.%m.%Y %H.%M',
197 '%Y-%m-%dT%H:%M:%SZ',
198 '%Y-%m-%dT%H:%M:%S.%fZ',
199 '%Y-%m-%dT%H:%M:%S.%f0Z',
200 '%Y-%m-%dT%H:%M:%S',
201 '%Y-%m-%dT%H:%M:%S.%f',
202 '%Y-%m-%dT%H:%M',
203 '%b %d %Y at %H:%M',
204 '%b %d %Y at %H:%M:%S',
205 '%B %d %Y at %H:%M',
206 '%B %d %Y at %H:%M:%S',
a63d9bd0 207 '%H:%M %d-%b-%Y',
208)
209
210DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
211DATE_FORMATS_DAY_FIRST.extend([
212 '%d-%m-%Y',
213 '%d.%m.%Y',
214 '%d.%m.%y',
215 '%d/%m/%Y',
216 '%d/%m/%y',
217 '%d/%m/%Y %H:%M:%S',
47304e07 218 '%d-%m-%Y %H:%M',
219])
220
221DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
222DATE_FORMATS_MONTH_FIRST.extend([
223 '%m-%d-%Y',
224 '%m.%d.%Y',
225 '%m/%d/%Y',
226 '%m/%d/%y',
227 '%m/%d/%Y %H:%M:%S',
228])
229
06b3fe29 230PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
0f60ba6e 231JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
06b3fe29 232
1d485a1a 233NUMBER_RE = r'\d+(?:\.\d+)?'
234
7105440c 235
0b9c08b4 236@functools.cache
d77c3dfd 237def preferredencoding():
59ae15a5 238 """Get preferred encoding.
d77c3dfd 239
240 Returns the best encoding scheme for the system, based on
241 locale.getpreferredencoding() and some further tweaks.
242 """
243 try:
244 pref = locale.getpreferredencoding()
28e614de 245 'TEST'.encode(pref)
70a1165b 246 except Exception:
59ae15a5 247 pref = 'UTF-8'
bae611f2 248
59ae15a5 249 return pref
d77c3dfd 250
f4bfd65f 251
181c8655 252def write_json_file(obj, fn):
1394646a 253 """ Encode obj as JSON and write it to fn, atomically if possible """
181c8655 254
cfb0511d 255 tf = tempfile.NamedTemporaryFile(
256 prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
257 suffix='.tmp', delete=False, mode='w', encoding='utf-8')
258
259 try:
260 with tf:
45d86abe 261 json.dump(obj, tf, ensure_ascii=False)
262 if sys.platform == 'win32':
263 # Need to remove existing file on Windows, else os.rename raises
264 # WindowsError or FileExistsError.
19a03940 265 with contextlib.suppress(OSError):
1394646a 266 os.unlink(fn)
19a03940 267 with contextlib.suppress(OSError):
268 mask = os.umask(0)
269 os.umask(mask)
270 os.chmod(tf.name, 0o666 & ~mask)
181c8655 271 os.rename(tf.name, fn)
70a1165b 272 except Exception:
19a03940 273 with contextlib.suppress(OSError):
181c8655 274 os.remove(tf.name)
275 raise
276
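# A minimal usage sketch of write_json_file() (illustrative only, not part of the
# module API); the object is serialized to a temp file which is then renamed into place:
#
#   >>> write_json_file({'id': 'abc', 'title': 'Example'}, 'info.json')
#   >>> with open('info.json', encoding='utf-8') as f:
#   ...     json.load(f)
#   {'id': 'abc', 'title': 'Example'}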
277
cfb0511d 278def find_xpath_attr(node, xpath, key, val=None):
279 """ Find the xpath xpath[@key=val] """
280 assert re.match(r'^[a-zA-Z_-]+$', key)
86e5f3ed 281 expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
cfb0511d 282 return node.find(expr)
59ae56fa 283
284# On python2.6 the xml.etree.ElementTree.Element methods don't support
285# the namespace parameter
286
287
288def xpath_with_ns(path, ns_map):
289 components = [c.split(':') for c in path.split('/')]
290 replaced = []
291 for c in components:
292 if len(c) == 1:
293 replaced.append(c[0])
294 else:
295 ns, tag = c
296 replaced.append('{%s}%s' % (ns_map[ns], tag))
297 return '/'.join(replaced)
298
d77c3dfd 299
a41fb80c 300def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
578c0745 301 def _find_xpath(xpath):
f9934b96 302 return node.find(xpath)
578c0745 303
14f25df2 304 if isinstance(xpath, str):
305 n = _find_xpath(xpath)
306 else:
307 for xp in xpath:
308 n = _find_xpath(xp)
309 if n is not None:
310 break
d74bebd5 311
8e636da4 312 if n is None:
313 if default is not NO_DEFAULT:
314 return default
315 elif fatal:
316 name = xpath if name is None else name
317 raise ExtractorError('Could not find XML element %s' % name)
318 else:
319 return None
320 return n
321
322
323def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
324 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
325 if n is None or n == default:
326 return n
327 if n.text is None:
328 if default is not NO_DEFAULT:
329 return default
330 elif fatal:
331 name = xpath if name is None else name
332 raise ExtractorError('Could not find XML element\'s text %s' % name)
333 else:
334 return None
335 return n.text
336
337
338def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
339 n = find_xpath_attr(node, xpath, key)
340 if n is None:
341 if default is not NO_DEFAULT:
342 return default
343 elif fatal:
86e5f3ed 344 name = f'{xpath}[@{key}]' if name is None else name
345 raise ExtractorError('Could not find XML attribute %s' % name)
346 else:
347 return None
348 return n.attrib[key]
349
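# Illustrative sketch of the xpath helpers above (inputs and outputs are assumed
# examples, not part of the module API):
#
#   >>> doc = compat_etree_fromstring(
#   ...     '<root xmlns="urn:example"><media id="1"><title>Foo</title></media></root>')
#   >>> ns = {'x': 'urn:example'}
#   >>> xpath_text(doc, xpath_with_ns('./x:media/x:title', ns))
#   'Foo'
#   >>> xpath_attr(doc, xpath_with_ns('./x:media', ns), 'id')
#   '1'
#   >>> xpath_text(doc, 'missing', default=None) is None
#   True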
350
c487cf00 351def get_element_by_id(id, html, **kwargs):
43e8fafd 352 """Return the content of the tag with the specified ID in the passed HTML document"""
c487cf00 353 return get_element_by_attribute('id', id, html, **kwargs)
43e8fafd 354
12ea2f30 355
c487cf00 356def get_element_html_by_id(id, html, **kwargs):
6f32a0b5 357 """Return the html of the tag with the specified ID in the passed HTML document"""
c487cf00 358 return get_element_html_by_attribute('id', id, html, **kwargs)
359
360
84c237fb 361def get_element_by_class(class_name, html):
362 """Return the content of the first tag with the specified class in the passed HTML document"""
363 retval = get_elements_by_class(class_name, html)
364 return retval[0] if retval else None
365
366
367def get_element_html_by_class(class_name, html):
368 """Return the html of the first tag with the specified class in the passed HTML document"""
369 retval = get_elements_html_by_class(class_name, html)
370 return retval[0] if retval else None
371
372
c487cf00 373def get_element_by_attribute(attribute, value, html, **kwargs):
374 retval = get_elements_by_attribute(attribute, value, html, **kwargs)
375 return retval[0] if retval else None
376
377
c487cf00 378def get_element_html_by_attribute(attribute, value, html, **kargs):
379 retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
380 return retval[0] if retval else None
381
382
c487cf00 383def get_elements_by_class(class_name, html, **kargs):
384 """Return the content of all tags with the specified class in the passed HTML document as a list"""
385 return get_elements_by_attribute(
64fa820c 386 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
387 html, escape_value=False)
388
389
390def get_elements_html_by_class(class_name, html):
391 """Return the html of all tags with the specified class in the passed HTML document as a list"""
392 return get_elements_html_by_attribute(
64fa820c 393 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
394 html, escape_value=False)
395
396
397def get_elements_by_attribute(*args, **kwargs):
43e8fafd 398 """Return the content of all tags with the specified attribute in the passed HTML document, as a list"""
399 return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
400
401
402def get_elements_html_by_attribute(*args, **kwargs):
403 """Return the html of the tag with the specified attribute in the passed HTML document"""
404 return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
405
406
4c9a1a3b 407def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
408 """
409 Return the text (content) and the html (whole) of the tag with the specified
410 attribute in the passed HTML document
411 """
412 if not value:
413 return
9e6dd238 414
86e5f3ed 415 quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
0254f162 416
417 value = re.escape(value) if escape_value else value
418
86e5f3ed 419 partial_element_re = rf'''(?x)
4c9a1a3b 420 <(?P<tag>{tag})
0254f162 421 (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
86e5f3ed 422 \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
423 '''
38285056 424
425 for m in re.finditer(partial_element_re, html):
426 content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
a921f407 427
428 yield (
429 unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
430 whole
431 )
a921f407 432
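# Quick illustration of the class/attribute getters above (assumed snippet; the
# *_html_* variants return the whole element, the others only its content):
#
#   >>> snippet = '<div class="title main">Foo</div><div class="title">Bar</div>'
#   >>> get_element_by_class('title', snippet)
#   'Foo'
#   >>> get_elements_by_class('title', snippet)
#   ['Foo', 'Bar']
#   >>> get_element_html_by_class('title', snippet)
#   '<div class="title main">Foo</div>'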
c5229f39 433
ac668111 434class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
435 """
436 HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
437 closing tag for the first opening tag it has encountered, and can be used
438 as a context manager
439 """
440
441 class HTMLBreakOnClosingTagException(Exception):
442 pass
443
444 def __init__(self):
445 self.tagstack = collections.deque()
ac668111 446 html.parser.HTMLParser.__init__(self)
447
448 def __enter__(self):
449 return self
450
451 def __exit__(self, *_):
452 self.close()
453
454 def close(self):
455 # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
456 # so data remains buffered; we no longer have any interest in it, thus
457 # override this method to discard it
458 pass
459
460 def handle_starttag(self, tag, _):
461 self.tagstack.append(tag)
462
463 def handle_endtag(self, tag):
464 if not self.tagstack:
465 raise compat_HTMLParseError('no tags in the stack')
466 while self.tagstack:
467 inner_tag = self.tagstack.pop()
468 if inner_tag == tag:
469 break
470 else:
471 raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
472 if not self.tagstack:
473 raise self.HTMLBreakOnClosingTagException()
474
475
46d09f87 476# XXX: This should be far less strict
477def get_element_text_and_html_by_tag(tag, html):
478 """
479 For the first element with the specified tag in the passed HTML document
480 return its content (text) and the whole element (html)
481 """
482 def find_or_raise(haystack, needle, exc):
483 try:
484 return haystack.index(needle)
485 except ValueError:
486 raise exc
487 closing_tag = f'</{tag}>'
488 whole_start = find_or_raise(
489 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
490 content_start = find_or_raise(
491 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
492 content_start += whole_start + 1
493 with HTMLBreakOnClosingTagParser() as parser:
494 parser.feed(html[whole_start:content_start])
495 if not parser.tagstack or parser.tagstack[0] != tag:
496 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
497 offset = content_start
498 while offset < len(html):
499 next_closing_tag_start = find_or_raise(
500 html[offset:], closing_tag,
501 compat_HTMLParseError(f'closing {tag} tag not found'))
502 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
503 try:
504 parser.feed(html[offset:offset + next_closing_tag_end])
505 offset += next_closing_tag_end
506 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
507 return html[content_start:offset + next_closing_tag_start], \
508 html[whole_start:offset + next_closing_tag_end]
509 raise compat_HTMLParseError('unexpected end of html')
510
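# Sketch of how the parser-based extraction above copes with nested tags of the
# same name (assumed input, illustrative only):
#
#   >>> get_element_text_and_html_by_tag('div', '<div id="a"><div>x</div>y</div>')
#   ('<div>x</div>y', '<div id="a"><div>x</div>y</div>')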
511
ac668111 512class HTMLAttributeParser(html.parser.HTMLParser):
8bb56eee 513 """Trivial HTML parser to gather the attributes for a single element"""
b6e0c7d2 514
8bb56eee 515 def __init__(self):
c5229f39 516 self.attrs = {}
ac668111 517 html.parser.HTMLParser.__init__(self)
518
519 def handle_starttag(self, tag, attrs):
520 self.attrs = dict(attrs)
7053aa3a 521 raise compat_HTMLParseError('done')
8bb56eee 522
c5229f39 523
ac668111 524class HTMLListAttrsParser(html.parser.HTMLParser):
525 """HTML parser to gather the attributes for the elements of a list"""
526
527 def __init__(self):
ac668111 528 html.parser.HTMLParser.__init__(self)
529 self.items = []
530 self._level = 0
531
532 def handle_starttag(self, tag, attrs):
533 if tag == 'li' and self._level == 0:
534 self.items.append(dict(attrs))
535 self._level += 1
536
537 def handle_endtag(self, tag):
538 self._level -= 1
539
540
541def extract_attributes(html_element):
542 """Given a string for an HTML element such as
543 <el
544 a="foo" B="bar" c="&98;az" d=boz
545 empty= noval entity="&amp;"
546 sq='"' dq="'"
547 >
548 Decode and return a dictionary of attributes.
549 {
550 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
551 'empty': '', 'noval': None, 'entity': '&',
552 'sq': '"', 'dq': '\''
553 }.
554 """
555 parser = HTMLAttributeParser()
19a03940 556 with contextlib.suppress(compat_HTMLParseError):
557 parser.feed(html_element)
558 parser.close()
8bb56eee 559 return parser.attrs
9e6dd238 560
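# Concrete example of the decoding described in the docstring above (illustrative;
# note that attribute names are lowercased and entities are resolved):
#
#   >>> extract_attributes('<a href="/watch?v=abc" class="Link" data-id=42 disabled>')
#   {'href': '/watch?v=abc', 'class': 'Link', 'data-id': '42', 'disabled': None}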
c5229f39 561
562def parse_list(webpage):
563 """Given a string for an series of HTML <li> elements,
564 return a dictionary of their attributes"""
565 parser = HTMLListAttrsParser()
566 parser.feed(webpage)
567 parser.close()
568 return parser.items
569
570
9e6dd238 571def clean_html(html):
59ae15a5 572 """Clean an HTML snippet into a readable string"""
573
574 if html is None: # Convenience for sanitizing descriptions etc.
575 return html
576
49185227 577 html = re.sub(r'\s+', ' ', html)
578 html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
579 html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
580 # Strip html tags
581 html = re.sub('<.*?>', '', html)
582 # Replace html entities
583 html = unescapeHTML(html)
7decf895 584 return html.strip()
585
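# Typical cleanup performed by clean_html() (illustrative):
#
#   >>> clean_html('<p>Hello<br/>world &amp; friends</p> ')
#   'Hello\nworld & friends'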
586
b7c47b74 587class LenientJSONDecoder(json.JSONDecoder):
cc090836 588 # TODO: Write tests
589 def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
b7c47b74 590 self.transform_source, self.ignore_extra = transform_source, ignore_extra
cc090836 591 self._close_attempts = 2 * close_objects
b7c47b74 592 super().__init__(*args, **kwargs)
593
cc090836 594 @staticmethod
595 def _close_object(err):
596 doc = err.doc[:err.pos]
597 # We need to add comma first to get the correct error message
598 if err.msg.startswith('Expecting \',\''):
599 return doc + ','
600 elif not doc.endswith(','):
601 return
602
603 if err.msg.startswith('Expecting property name'):
604 return doc[:-1] + '}'
605 elif err.msg.startswith('Expecting value'):
606 return doc[:-1] + ']'
607
b7c47b74 608 def decode(self, s):
609 if self.transform_source:
610 s = self.transform_source(s)
cc090836 611 for attempt in range(self._close_attempts + 1):
612 try:
613 if self.ignore_extra:
614 return self.raw_decode(s.lstrip())[0]
615 return super().decode(s)
616 except json.JSONDecodeError as e:
617 if e.pos is None:
618 raise
619 elif attempt < self._close_attempts:
620 s = self._close_object(e)
621 if s is not None:
622 continue
2fa669f7 623 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
cc090836 624 assert False, 'Too many attempts to decode JSON'
b7c47b74 625
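# A small sketch of the lenient decoding above (assumed inputs, not part of the
# module API); `ignore_extra` drops trailing garbage, `close_objects` lets the
# decoder append missing closing braces/brackets:
#
#   >>> LenientJSONDecoder(ignore_extra=True).decode('{"status": "ok"} trailing junk')
#   {'status': 'ok'}
#   >>> LenientJSONDecoder(close_objects=2).decode('{"a": {"b": 1')
#   {'a': {'b': 1}}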
626
d77c3dfd 627def sanitize_open(filename, open_mode):
628 """Try to open the given filename, and slightly tweak it if this fails.
629
630 Attempts to open the given filename. If this fails, it tries to change
631 the filename slightly, step by step, until it's either able to open it
632 or it fails and raises a final exception, like the standard open()
633 function.
634
635 It returns the tuple (stream, definitive_file_name).
636 """
0edb3e33 637 if filename == '-':
638 if sys.platform == 'win32':
639 import msvcrt
be5c1ae8 640
62b58c09 641 # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
daef7911 642 with contextlib.suppress(io.UnsupportedOperation):
643 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
0edb3e33 644 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
59ae15a5 645
0edb3e33 646 for attempt in range(2):
647 try:
648 try:
89737671 649 if sys.platform == 'win32':
b506289f 650 # FIXME: An exclusive lock also locks the file from being read.
651 # Since windows locks are mandatory, don't lock the file on windows (for now).
652 # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
89737671 653 raise LockingUnsupportedError()
0edb3e33 654 stream = locked_file(filename, open_mode, block=False).__enter__()
8a82af35 655 except OSError:
0edb3e33 656 stream = open(filename, open_mode)
8a82af35 657 return stream, filename
86e5f3ed 658 except OSError as err:
0edb3e33 659 if attempt or err.errno in (errno.EACCES,):
660 raise
661 old_filename, filename = filename, sanitize_path(filename)
662 if old_filename == filename:
663 raise
664
665
666def timeconvert(timestr):
667 """Convert RFC 2822 defined time string into system timestamp"""
668 timestamp = None
669 timetuple = email.utils.parsedate_tz(timestr)
670 if timetuple is not None:
671 timestamp = email.utils.mktime_tz(timetuple)
672 return timestamp
1c469a94 673
5f6a1245 674
5c3895ff 675def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
59ae15a5 676 """Sanitizes a string so it could be used as part of a filename.
5c3895ff 677 @param restricted Use a stricter subset of allowed characters
678 @param is_id Whether this is an ID that should be kept unchanged if possible.
679 If unset, yt-dlp's new sanitization rules are in effect
59ae15a5 680 """
5c3895ff 681 if s == '':
682 return ''
683
59ae15a5 684 def replace_insane(char):
685 if restricted and char in ACCENT_CHARS:
686 return ACCENT_CHARS[char]
91dd88b9 687 elif not restricted and char == '\n':
5c3895ff 688 return '\0 '
989a01c2 689 elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
690 # Replace with their full-width unicode counterparts
691 return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
91dd88b9 692 elif char == '?' or ord(char) < 32 or ord(char) == 127:
693 return ''
694 elif char == '"':
695 return '' if restricted else '\''
696 elif char == ':':
5c3895ff 697 return '\0_\0-' if restricted else '\0 \0-'
59ae15a5 698 elif char in '\\/|*<>':
5c3895ff 699 return '\0_'
700 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
701 return '\0_'
702 return char
703
db4678e4 704 # Replace look-alike Unicode glyphs
705 if restricted and (is_id is NO_DEFAULT or not is_id):
989a01c2 706 s = unicodedata.normalize('NFKC', s)
5c3895ff 707 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
28e614de 708 result = ''.join(map(replace_insane, s))
5c3895ff 709 if is_id is NO_DEFAULT:
ae61d108 710 result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result) # Remove repeated substitute chars
711 STRIP_RE = r'(?:\0.|[ _-])*'
5c3895ff 712 result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
713 result = result.replace('\0', '') or '_'
714
715 if not is_id:
716 while '__' in result:
717 result = result.replace('__', '_')
718 result = result.strip('_')
719 # Common case of "Foreign band name - English song title"
720 if restricted and result.startswith('-_'):
721 result = result[2:]
722 if result.startswith('-'):
723 result = '_' + result[len('-'):]
a7440261 724 result = result.lstrip('.')
725 if not result:
726 result = '_'
59ae15a5 727 return result
d77c3dfd 728
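# Rough illustration of the two modes described in the docstring above (outputs
# are indicative and depend on the replace_insane() rules):
#
#   >>> sanitize_filename('Title: A/B?')              # default: full-width look-alikes
#   'Title： A⧸B？'
#   >>> sanitize_filename('Title: A/B?', restricted=True)
#   'Title_-_A_B'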
5f6a1245 729
c2934512 730def sanitize_path(s, force=False):
a2aaf4db 731 """Sanitizes and normalizes path on Windows"""
c2934512 732 if sys.platform == 'win32':
c4218ac3 733 force = False
c2934512 734 drive_or_unc, _ = os.path.splitdrive(s)
c2934512 735 elif force:
736 drive_or_unc = ''
737 else:
a2aaf4db 738 return s
c2934512 739
740 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
741 if drive_or_unc:
742 norm_path.pop(0)
743 sanitized_path = [
ec85ded8 744 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
a2aaf4db 745 for path_part in norm_path]
746 if drive_or_unc:
747 sanitized_path.insert(0, drive_or_unc + os.path.sep)
4abea8ca 748 elif force and s and s[0] == os.path.sep:
c4218ac3 749 sanitized_path.insert(0, os.path.sep)
750 return os.path.join(*sanitized_path)
751
752
8f97a15d 753def sanitize_url(url, *, scheme='http'):
754 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
755 # the number of unwanted failures due to missing protocol
21633673 756 if url is None:
757 return
758 elif url.startswith('//'):
8f97a15d 759 return f'{scheme}:{url}'
760 # Fix some common typos seen so far
761 COMMON_TYPOS = (
067aa17e 762 # https://github.com/ytdl-org/youtube-dl/issues/15649
763 (r'^httpss://', r'https://'),
764 # https://bx1.be/lives/direct-tv/
765 (r'^rmtp([es]?)://', r'rtmp\1://'),
766 )
767 for mistake, fixup in COMMON_TYPOS:
768 if re.match(mistake, url):
769 return re.sub(mistake, fixup, url)
bc6b9bcd 770 return url
771
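# Examples of the fix-ups above (illustrative):
#
#   >>> sanitize_url('//example.com/clip')
#   'http://example.com/clip'
#   >>> sanitize_url('httpss://example.com/clip')
#   'https://example.com/clip'
#   >>> sanitize_url('rmtpe://example.com/live')
#   'rtmpe://example.com/live'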
772
5435dcf9 773def extract_basic_auth(url):
14f25df2 774 parts = urllib.parse.urlsplit(url)
775 if parts.username is None:
776 return url, None
14f25df2 777 url = urllib.parse.urlunsplit(parts._replace(netloc=(
778 parts.hostname if parts.port is None
779 else '%s:%d' % (parts.hostname, parts.port))))
780 auth_payload = base64.b64encode(
0f06bcd7 781 ('%s:%s' % (parts.username, parts.password or '')).encode())
782 return url, f'Basic {auth_payload.decode()}'
783
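# Sketch (illustrative): credentials embedded in the URL are stripped and turned
# into a Basic authorization header value:
#
#   >>> extract_basic_auth('https://user:pass@example.com/feed')
#   ('https://example.com/feed', 'Basic dXNlcjpwYXNz')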
784
67dda517 785def sanitized_Request(url, *args, **kwargs):
bc6b9bcd 786 url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
787 if auth_header is not None:
788 headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
789 headers['Authorization'] = auth_header
ac668111 790 return urllib.request.Request(url, *args, **kwargs)
791
792
51098426 793def expand_path(s):
2fa669f7 794 """Expand shell variables and ~"""
795 return os.path.expandvars(compat_expanduser(s))
796
797
7e9a6125 798def orderedSet(iterable, *, lazy=False):
799 """Remove all duplicates from the input iterable"""
800 def _iter():
801 seen = [] # Do not use set since the items can be unhashable
802 for x in iterable:
803 if x not in seen:
804 seen.append(x)
805 yield x
806
807 return _iter() if lazy else list(_iter())
d77c3dfd 808
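# Illustrative examples; unhashable items are allowed because membership is
# checked against a list rather than a set:
#
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]
#   >>> list(orderedSet([{'a': 1}, {'a': 1}, {'b': 2}], lazy=True))
#   [{'a': 1}, {'b': 2}]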
912b38b4 809
55b2f099 810def _htmlentity_transform(entity_with_semicolon):
4e408e47 811 """Transforms an HTML entity to a character."""
812 entity = entity_with_semicolon[:-1]
813
4e408e47 814 # Known non-numeric HTML entity
ac668111 815 if entity in html.entities.name2codepoint:
816 return chr(html.entities.name2codepoint[entity])
4e408e47 817
818 # TODO: HTML5 allows entities without a semicolon.
819 # E.g. '&Eacuteric' should be decoded as 'Éric'.
ac668111 820 if entity_with_semicolon in html.entities.html5:
821 return html.entities.html5[entity_with_semicolon]
55b2f099 822
91757b0f 823 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
824 if mobj is not None:
825 numstr = mobj.group(1)
28e614de 826 if numstr.startswith('x'):
4e408e47 827 base = 16
28e614de 828 numstr = '0%s' % numstr
829 else:
830 base = 10
067aa17e 831 # See https://github.com/ytdl-org/youtube-dl/issues/7518
19a03940 832 with contextlib.suppress(ValueError):
ac668111 833 return chr(int(numstr, base))
834
835 # Unknown entity in name, return its literal representation
7a3f0c00 836 return '&%s;' % entity
837
838
d77c3dfd 839def unescapeHTML(s):
840 if s is None:
841 return None
19a03940 842 assert isinstance(s, str)
d77c3dfd 843
4e408e47 844 return re.sub(
95f3f7c2 845 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 846
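# Illustrative example covering named and numeric entities:
#
#   >>> unescapeHTML('Ben &amp; Jerry&#39;s &eacute;clair')
#   "Ben & Jerry's éclair"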
8bf48f23 847
cdb19aa4 848def escapeHTML(text):
849 return (
850 text
851 .replace('&', '&amp;')
852 .replace('<', '&lt;')
853 .replace('>', '&gt;')
854 .replace('"', '&quot;')
855 .replace("'", '&#39;')
856 )
857
858
f5b1bca9 859def process_communicate_or_kill(p, *args, **kwargs):
da4db748 860 deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
861 f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
8a82af35 862 return Popen.communicate_or_kill(p, *args, **kwargs)
f5b1bca9 863
864
d3c93ec2 865class Popen(subprocess.Popen):
866 if sys.platform == 'win32':
867 _startupinfo = subprocess.STARTUPINFO()
868 _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
869 else:
870 _startupinfo = None
871
872 @staticmethod
873 def _fix_pyinstaller_ld_path(env):
874 """Restore LD_LIBRARY_PATH when using PyInstaller
875 Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
876 https://github.com/yt-dlp/yt-dlp/issues/4573
877 """
878 if not hasattr(sys, '_MEIPASS'):
879 return
880
881 def _fix(key):
882 orig = env.get(f'{key}_ORIG')
883 if orig is None:
884 env.pop(key, None)
885 else:
886 env[key] = orig
887
888 _fix('LD_LIBRARY_PATH') # Linux
889 _fix('DYLD_LIBRARY_PATH') # macOS
890
891 def __init__(self, *args, env=None, text=False, **kwargs):
892 if env is None:
893 env = os.environ.copy()
894 self._fix_pyinstaller_ld_path(env)
895
da8e2912 896 self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
f0c9fb96 897 if text is True:
898 kwargs['universal_newlines'] = True # For 3.6 compatibility
899 kwargs.setdefault('encoding', 'utf-8')
900 kwargs.setdefault('errors', 'replace')
82ea226c 901 super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
d3c93ec2 902
903 def communicate_or_kill(self, *args, **kwargs):
8a82af35 904 try:
905 return self.communicate(*args, **kwargs)
906 except BaseException: # Including KeyboardInterrupt
f0c9fb96 907 self.kill(timeout=None)
8a82af35 908 raise
d3c93ec2 909
f0c9fb96 910 def kill(self, *, timeout=0):
911 super().kill()
912 if timeout != 0:
913 self.wait(timeout=timeout)
914
915 @classmethod
992dc6b4 916 def run(cls, *args, timeout=None, **kwargs):
f0c9fb96 917 with cls(*args, **kwargs) as proc:
da8e2912 918 default = '' if proc.__text_mode else b''
992dc6b4 919 stdout, stderr = proc.communicate_or_kill(timeout=timeout)
914491b8 920 return stdout or default, stderr or default, proc.returncode
f0c9fb96 921
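# A minimal sketch of Popen.run() (illustrative; the command shown is an assumed
# example, and sys/subprocess are already imported at the top of this module):
#
#   >>> stdout, stderr, retcode = Popen.run(
#   ...     [sys.executable, '-c', 'print("hi")'],
#   ...     text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#   >>> stdout, retcode
#   ('hi\n', 0)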
d3c93ec2 922
f07b74fc 923def encodeArgument(s):
cfb0511d 924 # Legacy code that uses byte strings
925 # Uncomment the following line after fixing all post processors
14f25df2 926 # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
cfb0511d 927 return s if isinstance(s, str) else s.decode('ascii')
928
929
aa7785f8 930_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
931
932
933def timetuple_from_msec(msec):
934 secs, msec = divmod(msec, 1000)
935 mins, secs = divmod(secs, 60)
936 hrs, mins = divmod(mins, 60)
937 return _timetuple(hrs, mins, secs, msec)
938
939
cdb19aa4 940def formatSeconds(secs, delim=':', msec=False):
aa7785f8 941 time = timetuple_from_msec(secs * 1000)
942 if time.hours:
943 ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
944 elif time.minutes:
945 ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
4539dd30 946 else:
aa7785f8 947 ret = '%d' % time.seconds
948 return '%s.%03d' % (ret, time.milliseconds) if msec else ret
4539dd30 949
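# Illustrative:
#
#   >>> formatSeconds(3661)
#   '1:01:01'
#   >>> formatSeconds(75.5, msec=True)
#   '1:15.500'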
a0ddb8a2 950
77562778 951def _ssl_load_windows_store_certs(ssl_context, storename):
952 # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
953 try:
954 certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
955 if encoding == 'x509_asn' and (
956 trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
957 except PermissionError:
958 return
959 for cert in certs:
19a03940 960 with contextlib.suppress(ssl.SSLError):
77562778 961 ssl_context.load_verify_locations(cadata=cert)
a2366922 962
77562778 963
964def make_HTTPS_handler(params, **kwargs):
965 opts_check_certificate = not params.get('nocheckcertificate')
966 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
967 context.check_hostname = opts_check_certificate
f81c62a6 968 if params.get('legacyserverconnect'):
969 context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT
4f28b537 970 # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
971 context.set_ciphers('DEFAULT')
972 elif (
973 sys.version_info < (3, 10)
974 and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
975 and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
976 ):
977 # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
978 # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
979 # in some situations [2][3].
980 # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
981 # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
ac8e69dd 982 # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
983 # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
984 # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
985 # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
986 # 4. https://peps.python.org/pep-0644/
987 # 5. https://peps.python.org/pep-0644/#libressl-support
988 # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
989 context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
990 context.minimum_version = ssl.TLSVersion.TLSv1_2
8a82af35 991
77562778 992 context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
993 if opts_check_certificate:
69bec673 994 if certifi and 'no-certifi' not in params.get('compat_opts', []):
d5820461 995 context.load_verify_locations(cafile=certifi.where())
168bbc4f 996 else:
997 try:
998 context.load_default_certs()
999 # Work around the issue in load_default_certs when there are bad certificates. See:
1000 # https://github.com/yt-dlp/yt-dlp/issues/1060,
1001 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
1002 except ssl.SSLError:
1003 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
1004 if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
1005 for storename in ('CA', 'ROOT'):
1006 _ssl_load_windows_store_certs(context, storename)
1007 context.set_default_verify_paths()
8a82af35 1008
bb58c9ed 1009 client_certfile = params.get('client_certificate')
1010 if client_certfile:
1011 try:
1012 context.load_cert_chain(
1013 client_certfile, keyfile=params.get('client_certificate_key'),
1014 password=params.get('client_certificate_password'))
1015 except ssl.SSLError:
1016 raise YoutubeDLError('Unable to load client certificate')
2c6dcb65 1017
1018 # Some servers may reject requests if ALPN extension is not sent. See:
1019 # https://github.com/python/cpython/issues/85140
1020 # https://github.com/yt-dlp/yt-dlp/issues/3878
1021 with contextlib.suppress(NotImplementedError):
1022 context.set_alpn_protocols(['http/1.1'])
1023
77562778 1024 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 1025
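# Hedged usage sketch: the handler built above is meant to be installed into a
# urllib opener (the params dict shown is a minimal assumed example):
#
#   >>> https_handler = make_HTTPS_handler({'nocheckcertificate': False})
#   >>> opener = urllib.request.build_opener(https_handler)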
732ea2f0 1026
5873d4cc 1027def bug_reports_message(before=';'):
69bec673 1028 from ..update import REPOSITORY
57e0f077 1029
1030 msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
1031 'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')
1032
1033 before = before.rstrip()
1034 if not before or before.endswith(('.', '!', '?')):
1035 msg = msg[0].title() + msg[1:]
1036
1037 return (before + ' ' if before else '') + msg
1038
1039
1040class YoutubeDLError(Exception):
1041 """Base exception for YoutubeDL errors."""
aa9369a2 1042 msg = None
1043
1044 def __init__(self, msg=None):
1045 if msg is not None:
1046 self.msg = msg
1047 elif self.msg is None:
1048 self.msg = type(self).__name__
1049 super().__init__(self.msg)
1050
1051
ac668111 1052network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
3158150c 1053if hasattr(ssl, 'CertificateError'):
1054 network_exceptions.append(ssl.CertificateError)
1055network_exceptions = tuple(network_exceptions)
1056
1057
bf5b9d85 1058class ExtractorError(YoutubeDLError):
1c256f70 1059 """Error during info extraction."""
5f6a1245 1060
1151c407 1061 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
9a82b238 1062 """ tb, if given, is the original traceback (so that it can be printed out).
7a5c1cfe 1063 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
9a82b238 1064 """
3158150c 1065 if sys.exc_info()[0] in network_exceptions:
9a82b238 1066 expected = True
d5979c5d 1067
7265a219 1068 self.orig_msg = str(msg)
1c256f70 1069 self.traceback = tb
1151c407 1070 self.expected = expected
2eabb802 1071 self.cause = cause
d11271dd 1072 self.video_id = video_id
1151c407 1073 self.ie = ie
1074 self.exc_info = sys.exc_info() # preserve original exception
5df14442 1075 if isinstance(self.exc_info[1], ExtractorError):
1076 self.exc_info = self.exc_info[1].exc_info
9bcfe33b 1077 super().__init__(self.__msg)
1151c407 1078
9bcfe33b 1079 @property
1080 def __msg(self):
1081 return ''.join((
1082 format_field(self.ie, None, '[%s] '),
1083 format_field(self.video_id, None, '%s: '),
1084 self.orig_msg,
1085 format_field(self.cause, None, ' (caused by %r)'),
1086 '' if self.expected else bug_reports_message()))
1c256f70 1087
01951dda 1088 def format_traceback(self):
497d2fab 1089 return join_nonempty(
1090 self.traceback and ''.join(traceback.format_tb(self.traceback)),
e491d06d 1091 self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
497d2fab 1092 delim='\n') or None
01951dda 1093
9bcfe33b 1094 def __setattr__(self, name, value):
1095 super().__setattr__(name, value)
1096 if getattr(self, 'msg', None) and name not in ('msg', 'args'):
1097 self.msg = self.__msg or type(self).__name__
1098 self.args = (self.msg, ) # Cannot be property
1099
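# Sketch of how the fields above combine into the final message (illustrative;
# `expected=True` suppresses the bug-report note):
#
#   >>> try:
#   ...     raise ExtractorError('This video is private', video_id='abc123', expected=True)
#   ... except ExtractorError as e:
#   ...     e.msg
#   'abc123: This video is private'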
1c256f70 1100
1101class UnsupportedError(ExtractorError):
1102 def __init__(self, url):
86e5f3ed 1103 super().__init__(
1104 'Unsupported URL: %s' % url, expected=True)
1105 self.url = url
1106
1107
1108class RegexNotFoundError(ExtractorError):
1109 """Error when a regex didn't match"""
1110 pass
1111
1112
1113class GeoRestrictedError(ExtractorError):
1114 """Geographic restriction Error exception.
1115
1116 This exception may be thrown when a video is not available from your
1117 geographic location due to geographic restrictions imposed by a website.
1118 """
b6e0c7d2 1119
0db3bae8 1120 def __init__(self, msg, countries=None, **kwargs):
1121 kwargs['expected'] = True
86e5f3ed 1122 super().__init__(msg, **kwargs)
1123 self.countries = countries
1124
1125
693f0600 1126class UserNotLive(ExtractorError):
1127 """Error when a channel/user is not live"""
1128
1129 def __init__(self, msg=None, **kwargs):
1130 kwargs['expected'] = True
1131 super().__init__(msg or 'The channel is not currently live', **kwargs)
1132
1133
bf5b9d85 1134class DownloadError(YoutubeDLError):
59ae15a5 1135 """Download Error exception.
d77c3dfd 1136
1137 This exception may be thrown by FileDownloader objects if they are not
1138 configured to continue on errors. They will contain the appropriate
1139 error message.
1140 """
5f6a1245 1141
1142 def __init__(self, msg, exc_info=None):
1143 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
86e5f3ed 1144 super().__init__(msg)
8cc83b8d 1145 self.exc_info = exc_info
1146
1147
498f5606 1148class EntryNotInPlaylist(YoutubeDLError):
1149 """Entry not in playlist exception.
1150
1151 This exception will be thrown by YoutubeDL when a requested entry
1152 is not found in the playlist info_dict
1153 """
aa9369a2 1154 msg = 'Entry not found in info'
498f5606 1155
1156
bf5b9d85 1157class SameFileError(YoutubeDLError):
59ae15a5 1158 """Same File exception.
d77c3dfd 1159
1160 This exception will be thrown by FileDownloader objects if they detect
1161 multiple files would have to be downloaded to the same file on disk.
1162 """
aa9369a2 1163 msg = 'Fixed output name but more than one file to download'
1164
1165 def __init__(self, filename=None):
1166 if filename is not None:
1167 self.msg += f': {filename}'
1168 super().__init__(self.msg)
1169
1170
bf5b9d85 1171class PostProcessingError(YoutubeDLError):
59ae15a5 1172 """Post Processing exception.
d77c3dfd 1173
1174 This exception may be raised by PostProcessor's .run() method to
1175 indicate an error in the postprocessing task.
1176 """
5f6a1245 1177
5f6a1245 1178
48f79687 1179class DownloadCancelled(YoutubeDLError):
1180 """ Exception raised when the download queue should be interrupted """
1181 msg = 'The download was cancelled'
8b0d7497 1182
8b0d7497 1183
48f79687 1184class ExistingVideoReached(DownloadCancelled):
1185 """ --break-on-existing triggered """
1186 msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1187
48f79687 1188
1189class RejectedVideoReached(DownloadCancelled):
fe2ce85a 1190 """ --break-match-filter triggered """
1191 msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
51d9739f 1192
1193
48f79687 1194class MaxDownloadsReached(DownloadCancelled):
59ae15a5 1195 """ --max-downloads limit has been reached. """
48f79687 1196 msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1197
1198
f2ebc5c7 1199class ReExtractInfo(YoutubeDLError):
1200 """ Video info needs to be re-extracted. """
1201
1202 def __init__(self, msg, expected=False):
1203 super().__init__(msg)
1204 self.expected = expected
1205
1206
1207class ThrottledDownload(ReExtractInfo):
48f79687 1208 """ Download speed below --throttled-rate. """
aa9369a2 1209 msg = 'The download speed is below throttle limit'
d77c3dfd 1210
43b22906 1211 def __init__(self):
1212 super().__init__(self.msg, expected=False)
f2ebc5c7 1213
d77c3dfd 1214
bf5b9d85 1215class UnavailableVideoError(YoutubeDLError):
59ae15a5 1216 """Unavailable Format exception.
d77c3dfd 1217
1218 This exception will be thrown when a video is requested
1219 in a format that is not available for that video.
1220 """
aa9369a2 1221 msg = 'Unable to download video'
1222
1223 def __init__(self, err=None):
1224 if err is not None:
1225 self.msg += f': {err}'
1226 super().__init__(self.msg)
1227
1228
bf5b9d85 1229class ContentTooShortError(YoutubeDLError):
59ae15a5 1230 """Content Too Short exception.
d77c3dfd 1231
1232 This exception may be raised by FileDownloader objects when a file they
1233 download is too small for what the server announced first, indicating
1234 the connection was probably interrupted.
1235 """
d77c3dfd 1236
59ae15a5 1237 def __init__(self, downloaded, expected):
86e5f3ed 1238 super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
2c7ed247 1239 # Both in bytes
1240 self.downloaded = downloaded
1241 self.expected = expected
d77c3dfd 1242
5f6a1245 1243
bf5b9d85 1244class XAttrMetadataError(YoutubeDLError):
efa97bdc 1245 def __init__(self, code=None, msg='Unknown error'):
86e5f3ed 1246 super().__init__(msg)
efa97bdc 1247 self.code = code
bd264412 1248 self.msg = msg
1249
1250 # Parsing code and msg
3089bc74 1251 if (self.code in (errno.ENOSPC, errno.EDQUOT)
a0566bbf 1252 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
1253 self.reason = 'NO_SPACE'
1254 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1255 self.reason = 'VALUE_TOO_LONG'
1256 else:
1257 self.reason = 'NOT_SUPPORTED'
1258
1259
bf5b9d85 1260class XAttrUnavailableError(YoutubeDLError):
1261 pass
1262
1263
c5a59d93 1264def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
f9934b96 1265 hc = http_class(*args, **kwargs)
be4a824d 1266 source_address = ydl_handler._params.get('source_address')
8959018a 1267
be4a824d 1268 if source_address is not None:
1269 # This is to work around _create_connection() from socket, where it will try all
1270 # address data from getaddrinfo() including IPv6. This filters the result from
1271 # getaddrinfo() based on the source_address value.
1272 # This is based on the cpython socket.create_connection() function.
1273 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1274 def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1275 host, port = address
1276 err = None
1277 addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
1278 af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1279 ip_addrs = [addr for addr in addrs if addr[0] == af]
1280 if addrs and not ip_addrs:
1281 ip_version = 'v4' if af == socket.AF_INET else 'v6'
86e5f3ed 1282 raise OSError(
1283 "No remote IP%s addresses available for connect, can't use '%s' as source address"
1284 % (ip_version, source_address[0]))
1285 for res in ip_addrs:
1286 af, socktype, proto, canonname, sa = res
1287 sock = None
1288 try:
1289 sock = socket.socket(af, socktype, proto)
1290 if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1291 sock.settimeout(timeout)
1292 sock.bind(source_address)
1293 sock.connect(sa)
1294 err = None # Explicitly break reference cycle
1295 return sock
86e5f3ed 1296 except OSError as _:
1297 err = _
1298 if sock is not None:
1299 sock.close()
1300 if err is not None:
1301 raise err
1302 else:
86e5f3ed 1303 raise OSError('getaddrinfo returns an empty list')
1304 if hasattr(hc, '_create_connection'):
1305 hc._create_connection = _create_connection
cfb0511d 1306 hc.source_address = (source_address, 0)
1307
1308 return hc
1309
1310
ac668111 1311class YoutubeDLHandler(urllib.request.HTTPHandler):
1312 """Handler for HTTP requests and responses.
1313
1314 This class, when installed with an OpenerDirector, automatically adds
955c8958 1315 the standard headers to every HTTP request and handles gzipped, deflated and
1316 brotli responses from web servers.
1317
1318 Part of this code was copied from:
1319
1320 http://techknack.net/python-urllib2-handlers/
1321
1322 Andrew Rowls, the author of that code, agreed to release it to the
1323 public domain.
1324 """
1325
be4a824d 1326 def __init__(self, params, *args, **kwargs):
ac668111 1327 urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
1328 self._params = params
1329
1330 def http_open(self, req):
ac668111 1331 conn_class = http.client.HTTPConnection
1332
1333 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1334 if socks_proxy:
1335 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1336 del req.headers['Ytdl-socks-proxy']
1337
be4a824d 1338 return self.do_open(functools.partial(
71aff188 1339 _create_http_connection, self, conn_class, False),
1340 req)
1341
1342 @staticmethod
1343 def deflate(data):
fc2119f2 1344 if not data:
1345 return data
1346 try:
1347 return zlib.decompress(data, -zlib.MAX_WBITS)
1348 except zlib.error:
1349 return zlib.decompress(data)
1350
4390d5ec 1351 @staticmethod
1352 def brotli(data):
1353 if not data:
1354 return data
9b8ee23b 1355 return brotli.decompress(data)
4390d5ec 1356
acebc9cd 1357 def http_request(self, req):
1358 # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
1359 # always respected by websites, some tend to give out URLs with non percent-encoded
1360 # non-ASCII characters (see telemb.py, ard.py [#3412])
1361 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1362 # To work around aforementioned issue we will replace request's original URL with
1363 # percent-encoded one
1364 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1365 # the code of this workaround has been moved here from YoutubeDL.urlopen()
1366 url = req.get_full_url()
1367 url_escaped = escape_url(url)
1368
1369 # Substitute URL if any change after escaping
1370 if url != url_escaped:
15d260eb 1371 req = update_Request(req, url=url_escaped)
51f267d9 1372
8b7539d2 1373 for h, v in self._params.get('http_headers', std_headers).items():
1374 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1375 # The dict keys are capitalized by urllib because of this bug
1376 if h.capitalize() not in req.headers:
33ac271b 1377 req.add_header(h, v)
87f0e62d 1378
955c8958 1379 if 'Youtubedl-no-compression' in req.headers: # deprecated
1380 req.headers.pop('Youtubedl-no-compression', None)
1381 req.add_header('Accept-encoding', 'identity')
1382
af14914b 1383 if 'Accept-encoding' not in req.headers:
1384 req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
1385
379a4f16 1386 return super().do_request_(req)
59ae15a5 1387
acebc9cd 1388 def http_response(self, req, resp):
1389 old_resp = resp
1390 # gzip
1391 if resp.headers.get('Content-encoding', '') == 'gzip':
1392 content = resp.read()
1393 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1394 try:
1395 uncompressed = io.BytesIO(gz.read())
86e5f3ed 1396 except OSError as original_ioerror:
1397 # There may be junk at the end of the file
1398 # See http://stackoverflow.com/q/4928560/35070 for details
1399 for i in range(1, 1024):
1400 try:
1401 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1402 uncompressed = io.BytesIO(gz.read())
86e5f3ed 1403 except OSError:
1404 continue
1405 break
1406 else:
1407 raise original_ioerror
ac668111 1408 resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
1409 resp.msg = old_resp.msg
1410 # deflate
1411 if resp.headers.get('Content-encoding', '') == 'deflate':
1412 gz = io.BytesIO(self.deflate(resp.read()))
ac668111 1413 resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 1414 resp.msg = old_resp.msg
4390d5ec 1415 # brotli
1416 if resp.headers.get('Content-encoding', '') == 'br':
ac668111 1417 resp = urllib.request.addinfourl(
4390d5ec 1418 io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
1419 resp.msg = old_resp.msg
ad729172 1420 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
067aa17e 1421 # https://github.com/ytdl-org/youtube-dl/issues/6457).
1422 if 300 <= resp.code < 400:
1423 location = resp.headers.get('Location')
1424 if location:
1425 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
0f06bcd7 1426 location = location.encode('iso-8859-1').decode()
1427 location_escaped = escape_url(location)
1428 if location != location_escaped:
1429 del resp.headers['Location']
1430 resp.headers['Location'] = location_escaped
59ae15a5 1431 return resp
0f8d03f8 1432
1433 https_request = http_request
1434 https_response = http_response
bf50b038 1435
5de90176 1436
1437def make_socks_conn_class(base_class, socks_proxy):
1438 assert issubclass(base_class, (
ac668111 1439 http.client.HTTPConnection, http.client.HTTPSConnection))
71aff188 1440
14f25df2 1441 url_components = urllib.parse.urlparse(socks_proxy)
1442 if url_components.scheme.lower() == 'socks5':
1443 socks_type = ProxyType.SOCKS5
1444 elif url_components.scheme.lower() in ('socks', 'socks4'):
1445 socks_type = ProxyType.SOCKS4
1446 elif url_components.scheme.lower() == 'socks4a':
1447 socks_type = ProxyType.SOCKS4A
71aff188 1448
1449 def unquote_if_non_empty(s):
1450 if not s:
1451 return s
ac668111 1452 return urllib.parse.unquote_plus(s)
cdd94c2e 1453
1454 proxy_args = (
1455 socks_type,
1456 url_components.hostname, url_components.port or 1080,
1457 True, # Remote DNS
1458 unquote_if_non_empty(url_components.username),
1459 unquote_if_non_empty(url_components.password),
1460 )
1461
1462 class SocksConnection(base_class):
1463 def connect(self):
1464 self.sock = sockssocket()
1465 self.sock.setproxy(*proxy_args)
19a03940 1466 if isinstance(self.timeout, (int, float)):
1467 self.sock.settimeout(self.timeout)
1468 self.sock.connect((self.host, self.port))
1469
ac668111 1470 if isinstance(self, http.client.HTTPSConnection):
1471 if hasattr(self, '_context'): # Python > 2.6
1472 self.sock = self._context.wrap_socket(
1473 self.sock, server_hostname=self.host)
1474 else:
1475 self.sock = ssl.wrap_socket(self.sock)
1476
1477 return SocksConnection
1478
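# Hedged sketch: wrapping a connection class so that it connects through a SOCKS
# proxy (the proxy URL is a made-up example):
#
#   >>> conn_class = make_socks_conn_class(
#   ...     http.client.HTTPSConnection, 'socks5://127.0.0.1:1080')
#   >>> issubclass(conn_class, http.client.HTTPSConnection)
#   True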
1479
ac668111 1480class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
be4a824d 1481 def __init__(self, params, https_conn_class=None, *args, **kwargs):
ac668111 1482 urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
1483 self._https_conn_class = https_conn_class or http.client.HTTPSConnection
1484 self._params = params
1485
1486 def https_open(self, req):
4f264c02 1487 kwargs = {}
1488 conn_class = self._https_conn_class
1489
1490 if hasattr(self, '_context'): # python > 2.6
1491 kwargs['context'] = self._context
1492 if hasattr(self, '_check_hostname'): # python 3.x
1493 kwargs['check_hostname'] = self._check_hostname
1494
1495 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1496 if socks_proxy:
1497 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1498 del req.headers['Ytdl-socks-proxy']
1499
4f28b537 1500 try:
1501 return self.do_open(
1502 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1503 except urllib.error.URLError as e:
1504 if (isinstance(e.reason, ssl.SSLError)
1505 and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1506 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1507 raise
1508
1509
941e881e 1510def is_path_like(f):
1511 return isinstance(f, (str, bytes, os.PathLike))
1512
1513
ac668111 1514class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
1515 """
1516 See [1] for cookie file format.
1517
1518 1. https://curl.haxx.se/docs/http-cookies.html
1519 """
e7e62441 1520 _HTTPONLY_PREFIX = '#HttpOnly_'
c380cc28
S
1521 _ENTRY_LEN = 7
1522 _HEADER = '''# Netscape HTTP Cookie File
7a5c1cfe 1523# This file is generated by yt-dlp. Do not edit.
c380cc28
S
1524
1525'''
1526 _CookieFileEntry = collections.namedtuple(
1527 'CookieFileEntry',
1528 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
e7e62441 1529
d76fa1f3 1530 def __init__(self, filename=None, *args, **kwargs):
1531 super().__init__(None, *args, **kwargs)
941e881e 1532 if is_path_like(filename):
d76fa1f3 1533 filename = os.fspath(filename)
1534 self.filename = filename
1535
24146491 1536 @staticmethod
1537 def _true_or_false(cndn):
1538 return 'TRUE' if cndn else 'FALSE'
1539
d76fa1f3 1540 @contextlib.contextmanager
1541 def open(self, file, *, write=False):
941e881e 1542 if is_path_like(file):
d76fa1f3 1543 with open(file, 'w' if write else 'r', encoding='utf-8') as f:
1544 yield f
1545 else:
1546 if write:
1547 file.truncate(0)
1548 yield file
1549
24146491 1550 def _really_save(self, f, ignore_discard=False, ignore_expires=False):
1551 now = time.time()
1552 for cookie in self:
1553 if (not ignore_discard and cookie.discard
1554 or not ignore_expires and cookie.is_expired(now)):
1555 continue
1556 name, value = cookie.name, cookie.value
1557 if value is None:
1558 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1559 # with no name, whereas http.cookiejar regards it as a
1560 # cookie with no value.
1561 name, value = '', name
1562 f.write('%s\n' % '\t'.join((
1563 cookie.domain,
1564 self._true_or_false(cookie.domain.startswith('.')),
1565 cookie.path,
1566 self._true_or_false(cookie.secure),
1567 str_or_none(cookie.expires, default=''),
1568 name, value
1569 )))
1570
1571 def save(self, filename=None, *args, **kwargs):
c380cc28
S
1572 """
1573 Save cookies to a file.
24146491 1574 Code is taken from CPython 3.6
1575 https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
c380cc28 1576
c380cc28
S
1577 if filename is None:
1578 if self.filename is not None:
1579 filename = self.filename
1580 else:
ac668111 1581 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
c380cc28 1582
24146491 1583 # Store session cookies with `expires` set to 0 instead of an empty string
1bab3437
S
1584 for cookie in self:
1585 if cookie.expires is None:
1586 cookie.expires = 0
c380cc28 1587
d76fa1f3 1588 with self.open(filename, write=True) as f:
c380cc28 1589 f.write(self._HEADER)
24146491 1590 self._really_save(f, *args, **kwargs)
1bab3437
S
1591
1592 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
e7e62441 1593 """Load cookies from a file."""
1594 if filename is None:
1595 if self.filename is not None:
1596 filename = self.filename
1597 else:
ac668111 1598 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
e7e62441 1599
c380cc28
S
1600 def prepare_line(line):
1601 if line.startswith(self._HTTPONLY_PREFIX):
1602 line = line[len(self._HTTPONLY_PREFIX):]
1603 # comments and empty lines are fine
1604 if line.startswith('#') or not line.strip():
1605 return line
1606 cookie_list = line.split('\t')
1607 if len(cookie_list) != self._ENTRY_LEN:
ac668111 1608 raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
c380cc28
S
1609 cookie = self._CookieFileEntry(*cookie_list)
1610 if cookie.expires_at and not cookie.expires_at.isdigit():
ac668111 1611 raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
c380cc28
S
1612 return line
1613
e7e62441 1614 cf = io.StringIO()
d76fa1f3 1615 with self.open(filename) as f:
e7e62441 1616 for line in f:
c380cc28
S
1617 try:
1618 cf.write(prepare_line(line))
ac668111 1619 except http.cookiejar.LoadError as e:
94aa0644 1620 if f'{line.strip()} '[0] in '[{"':
ac668111 1621 raise http.cookiejar.LoadError(
94aa0644 1622 'Cookies file must be Netscape formatted, not JSON. See '
17ffed18 1623 'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
19a03940 1624 write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
c380cc28 1625 continue
e7e62441 1626 cf.seek(0)
1627 self._really_load(cf, filename, ignore_discard, ignore_expires)
1bab3437
S
1628 # Session cookies are denoted by either `expires` field set to
1629 # an empty string or 0. MozillaCookieJar only recognizes the former
1630 # (see [1]). So we need to force the latter to be recognized as session
1631 # cookies on our own.
1632 # Session cookies may be important for cookies-based authentication,
1633 # e.g. usually, when user does not check 'Remember me' check box while
1634 # logging in on a site, some important cookies are stored as session
1635 # cookies so that not recognizing them will result in failed login.
1636 # 1. https://bugs.python.org/issue17164
1637 for cookie in self:
1638 # Treat `expires=0` cookies as session cookies
1639 if cookie.expires == 0:
1640 cookie.expires = None
1641 cookie.discard = True
1642
1643
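# Editorial note (not part of the upstream module): one cookies.txt entry written by
# _really_save above consists of seven tab-separated fields matching _CookieFileEntry:
#   domain_name  include_subdomains  path  https_only  expires_at  name  value
# e.g.  .example.com  TRUE  /  FALSE  1718928000  session_id  abc123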
ac668111 1644class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
a6420bf5 1645 def __init__(self, cookiejar=None):
ac668111 1646 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
a6420bf5
S
1647
1648 def http_response(self, request, response):
ac668111 1649 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
a6420bf5 1650
ac668111 1651 https_request = urllib.request.HTTPCookieProcessor.http_request
a6420bf5
S
1652 https_response = http_response
1653
1654
ac668111 1655class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
201c1459 1656 """YoutubeDL redirect handler
1657
1658 The code is based on HTTPRedirectHandler implementation from CPython [1].
1659
1660 This redirect handler solves two issues:
1661 - ensures redirect URL is always unicode under python 2
1662 - introduces support for experimental HTTP response status code
1663 308 Permanent Redirect [2] used by some sites [3]
1664
1665 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1666 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1667 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1668 """
1669
ac668111 1670 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
201c1459 1671
1672 def redirect_request(self, req, fp, code, msg, headers, newurl):
1673 """Return a Request or None in response to a redirect.
1674
1675 This is called by the http_error_30x methods when a
1676 redirection response is received. If a redirection should
1677 take place, return a new Request to allow http_error_30x to
1678 perform the redirect. Otherwise, raise HTTPError if no-one
1679 else should try to handle this url. Return None if you can't
1680 but another Handler might.
1681 """
1682 m = req.get_method()
1683 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1684 or code in (301, 302, 303) and m == "POST")):
14f25df2 1685 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
201c1459 1686 # Strictly (according to RFC 2616), 301 or 302 in response to
1687 # a POST MUST NOT cause a redirection without confirmation
1688 # from the user (of urllib.request, in this case). In practice,
1689 # essentially all clients do redirect in this case, so we do
1690 # the same.
1691
201c1459 1692 # Be conciliant with URIs containing a space. This is mainly
1693 # redundant with the more complete encoding done in http_error_302(),
1694 # but it is kept for compatibility with other callers.
1695 newurl = newurl.replace(' ', '%20')
1696
1697 CONTENT_HEADERS = ("content-length", "content-type")
1698 # NB: don't use dict comprehension for python 2.6 compatibility
86e5f3ed 1699 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
afac4caa 1700
1701 # A 303 must either use GET or HEAD for subsequent request
1702 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1703 if code == 303 and m != 'HEAD':
1704 m = 'GET'
1705 # 301 and 302 redirects are commonly turned into a GET from a POST
1706 # for subsequent requests by browsers, so we'll do the same.
1707 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1708 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1709 if code in (301, 302) and m == 'POST':
1710 m = 'GET'
1711
ac668111 1712 return urllib.request.Request(
201c1459 1713 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
afac4caa 1714 unverifiable=True, method=m)
fca6dba8
S
1715
1716
46f59e89
S
1717def extract_timezone(date_str):
1718 m = re.search(
f137e4c2 1719 r'''(?x)
1720 ^.{8,}? # >=8 char non-TZ prefix, if present
1721 (?P<tz>Z| # just the UTC Z, or
1722 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1723 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1724 [ ]? # optional space
1725 (?P<sign>\+|-) # +/-
1726 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1727 $)
1728 ''', date_str)
46f59e89 1729 if not m:
8f53dc44 1730 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1731 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1732 if timezone is not None:
1733 date_str = date_str[:-len(m.group('tz'))]
1734 timezone = datetime.timedelta(hours=timezone or 0)
46f59e89
S
1735 else:
1736 date_str = date_str[:-len(m.group('tz'))]
1737 if not m.group('sign'):
1738 timezone = datetime.timedelta()
1739 else:
1740 sign = 1 if m.group('sign') == '+' else -1
1741 timezone = datetime.timedelta(
1742 hours=sign * int(m.group('hours')),
1743 minutes=sign * int(m.group('minutes')))
1744 return timezone, date_str
1745
1746
08b38d54 1747def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1748 """ Return a UNIX timestamp from the given date """
1749
1750 if date_str is None:
1751 return None
1752
52c3a6e4
S
1753 date_str = re.sub(r'\.[0-9]+', '', date_str)
1754
08b38d54 1755 if timezone is None:
46f59e89
S
1756 timezone, date_str = extract_timezone(date_str)
1757
19a03940 1758 with contextlib.suppress(ValueError):
86e5f3ed 1759 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
52c3a6e4
S
1760 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1761 return calendar.timegm(dt.timetuple())
912b38b4
PH
1762
1763
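# Editorial illustration of parse_iso8601 (not part of the upstream module); both inputs
# denote the same instant, so they map to the same UNIX timestamp.
def _example_parse_iso8601():
    assert parse_iso8601('2014-03-23T22:04:26Z') == 1395612266
    assert parse_iso8601('2014-03-23T23:04:26+0100') == 1395612266
    assert parse_iso8601(None) is None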
46f59e89
S
1764def date_formats(day_first=True):
1765 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1766
1767
42bdd9d0 1768def unified_strdate(date_str, day_first=True):
bf50b038 1769 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1770
1771 if date_str is None:
1772 return None
bf50b038 1773 upload_date = None
5f6a1245 1774 # Replace commas
026fcc04 1775 date_str = date_str.replace(',', ' ')
42bdd9d0 1776 # Remove AM/PM + timezone
9bb8e0a3 1777 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1778 _, date_str = extract_timezone(date_str)
42bdd9d0 1779
46f59e89 1780 for expression in date_formats(day_first):
19a03940 1781 with contextlib.suppress(ValueError):
bf50b038 1782 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
42393ce2
PH
1783 if upload_date is None:
1784 timetuple = email.utils.parsedate_tz(date_str)
1785 if timetuple:
19a03940 1786 with contextlib.suppress(ValueError):
c6b9cf05 1787 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
6a750402 1788 if upload_date is not None:
14f25df2 1789 return str(upload_date)
bf50b038 1790
5f6a1245 1791
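# Editorial illustration of unified_strdate (not part of the upstream module):
def _example_unified_strdate():
    assert unified_strdate('December 21, 2010') == '20101221'
    assert unified_strdate('8/7/2009') == '20090708'  # day-first parsing by default
    assert unified_strdate(None) is None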
46f59e89
S
1792def unified_timestamp(date_str, day_first=True):
1793 if date_str is None:
1794 return None
1795
8f53dc44 1796 date_str = re.sub(r'\s+', ' ', re.sub(
1797 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
46f59e89 1798
7dc2a74e 1799 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1800 timezone, date_str = extract_timezone(date_str)
1801
1802 # Remove AM/PM + timezone
1803 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1804
deef3195
S
1805 # Remove unrecognized timezones from ISO 8601-like timestamps
1806 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1807 if m:
1808 date_str = date_str[:-len(m.group('tz'))]
1809
f226880c
PH
1810 # Python only supports microseconds, so remove nanoseconds
1811 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1812 if m:
1813 date_str = m.group(1)
1814
46f59e89 1815 for expression in date_formats(day_first):
19a03940 1816 with contextlib.suppress(ValueError):
7dc2a74e 1817 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89 1818 return calendar.timegm(dt.timetuple())
8f53dc44 1819
46f59e89
S
1820 timetuple = email.utils.parsedate_tz(date_str)
1821 if timetuple:
8f53dc44 1822 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
46f59e89
S
1823
1824
28e614de 1825def determine_ext(url, default_ext='unknown_video'):
85750f89 1826 if url is None or '.' not in url:
f4776371 1827 return default_ext
9cb9a5df 1828 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1829 if re.match(r'^[A-Za-z0-9]+$', guess):
1830 return guess
a7aaa398
S
1831 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1832 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1833 return guess.rstrip('/')
73e79f2a 1834 else:
cbdbb766 1835 return default_ext
73e79f2a 1836
5f6a1245 1837
824fa511
S
1838def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1839 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
d4051a8e 1840
5f6a1245 1841
9e62f283 1842def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
3d38b2d6 1843 R"""
1844 Return a datetime object from a string.
1845 Supported format:
1846 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1847
1848 @param format strftime format of DATE
1849 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1850 auto: round to the unit provided in date_str (if applicable).
9e62f283 1851 """
1852 auto_precision = False
1853 if precision == 'auto':
1854 auto_precision = True
1855 precision = 'microsecond'
396a76f7 1856 today = datetime_round(datetime.datetime.utcnow(), precision)
f8795e10 1857 if date_str in ('now', 'today'):
37254abc 1858 return today
f8795e10
PH
1859 if date_str == 'yesterday':
1860 return today - datetime.timedelta(days=1)
9e62f283 1861 match = re.match(
3d38b2d6 1862 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
9e62f283 1863 date_str)
37254abc 1864 if match is not None:
9e62f283 1865 start_time = datetime_from_str(match.group('start'), precision, format)
1866 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
37254abc 1867 unit = match.group('unit')
9e62f283 1868 if unit == 'month' or unit == 'year':
1869 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
37254abc 1870 unit = 'day'
9e62f283 1871 else:
1872 if unit == 'week':
1873 unit = 'day'
1874 time *= 7
1875 delta = datetime.timedelta(**{unit + 's': time})
1876 new_date = start_time + delta
1877 if auto_precision:
1878 return datetime_round(new_date, unit)
1879 return new_date
1880
1881 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1882
1883
d49f8db3 1884def date_from_str(date_str, format='%Y%m%d', strict=False):
3d38b2d6 1885 R"""
1886 Return a date object from a string using datetime_from_str
9e62f283 1887
3d38b2d6 1888 @param strict Restrict allowed patterns to "YYYYMMDD" and
1889 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
9e62f283 1890 """
3d38b2d6 1891 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1892 raise ValueError(f'Invalid date format "{date_str}"')
9e62f283 1893 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1894
1895
1896def datetime_add_months(dt, months):
1897 """Increment/Decrement a datetime object by months."""
1898 month = dt.month + months - 1
1899 year = dt.year + month // 12
1900 month = month % 12 + 1
1901 day = min(dt.day, calendar.monthrange(year, month)[1])
1902 return dt.replace(year, month, day)
1903
1904
1905def datetime_round(dt, precision='day'):
1906 """
1907 Round a datetime object's time to a specific precision
1908 """
1909 if precision == 'microsecond':
1910 return dt
1911
1912 unit_seconds = {
1913 'day': 86400,
1914 'hour': 3600,
1915 'minute': 60,
1916 'second': 1,
1917 }
1918 roundto = lambda x, n: ((x + n / 2) // n) * n
1919 timestamp = calendar.timegm(dt.timetuple())
1920 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
5f6a1245
JW
1921
1922
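# Editorial illustration of the relative-date helpers above (not part of the upstream module);
# the first assertion can only fail if the two calls straddle a UTC midnight rollover.
def _example_relative_dates():
    assert date_from_str('now-1week') == date_from_str('today') - datetime.timedelta(days=7)
    assert datetime_add_months(datetime.datetime(2020, 1, 31), 1) == datetime.datetime(2020, 2, 29)
    assert datetime_round(datetime.datetime(2020, 1, 1, 13, 40), 'hour') == datetime.datetime(2020, 1, 1, 14)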
e63fc1be 1923def hyphenate_date(date_str):
1924 """
1925 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1926 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1927 if match is not None:
1928 return '-'.join(match.groups())
1929 else:
1930 return date_str
1931
5f6a1245 1932
86e5f3ed 1933class DateRange:
bd558525 1934 """Represents a time interval between two dates"""
5f6a1245 1935
bd558525
JMF
1936 def __init__(self, start=None, end=None):
1937 """start and end must be strings in the format accepted by date"""
1938 if start is not None:
d49f8db3 1939 self.start = date_from_str(start, strict=True)
bd558525
JMF
1940 else:
1941 self.start = datetime.datetime.min.date()
1942 if end is not None:
d49f8db3 1943 self.end = date_from_str(end, strict=True)
bd558525
JMF
1944 else:
1945 self.end = datetime.datetime.max.date()
37254abc 1946 if self.start > self.end:
bd558525 1947 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
5f6a1245 1948
bd558525
JMF
1949 @classmethod
1950 def day(cls, day):
1951 """Returns a range that only contains the given day"""
5f6a1245
JW
1952 return cls(day, day)
1953
bd558525
JMF
1954 def __contains__(self, date):
1955 """Check if the date is in the range"""
37254abc
JMF
1956 if not isinstance(date, datetime.date):
1957 date = date_from_str(date)
1958 return self.start <= date <= self.end
5f6a1245 1959
bd558525 1960 def __str__(self):
86e5f3ed 1961 return f'{self.start.isoformat()} - {self.end.isoformat()}'
c496ca96 1962
f2df4071 1963 def __eq__(self, other):
1964 return (isinstance(other, DateRange)
1965 and self.start == other.start and self.end == other.end)
1966
c496ca96 1967
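# Editorial illustration of DateRange (not part of the upstream module); membership accepts
# both datetime.date objects and strings understood by date_from_str.
def _example_date_range():
    date_range = DateRange('20200101', '20201231')
    assert '20200615' in date_range
    assert datetime.date(2021, 1, 1) not in date_range
    assert DateRange.day('20200101') == DateRange('20200101', '20200101')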
b1f94422 1968@functools.cache
1969def system_identifier():
1970 python_implementation = platform.python_implementation()
1971 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1972 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
dab284f8 1973 libc_ver = []
1974 with contextlib.suppress(OSError): # We may not have access to the executable
1975 libc_ver = platform.libc_ver()
b1f94422 1976
17fc3dc4 1977 return 'Python %s (%s %s %s) - %s (%s%s)' % (
b1f94422 1978 platform.python_version(),
1979 python_implementation,
17fc3dc4 1980 platform.machine(),
b1f94422 1981 platform.architecture()[0],
1982 platform.platform(),
5b9f253f
M
1983 ssl.OPENSSL_VERSION,
1984 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
b1f94422 1985 )
c257baff
PH
1986
1987
0b9c08b4 1988@functools.cache
49fa4d9a 1989def get_windows_version():
8a82af35 1990 ''' Get Windows version. Returns () if not running on Windows '''
49fa4d9a
N
1991 if compat_os_name == 'nt':
1992 return version_tuple(platform.win32_ver()[1])
1993 else:
8a82af35 1994 return ()
49fa4d9a
N
1995
1996
734f90bb 1997def write_string(s, out=None, encoding=None):
19a03940 1998 assert isinstance(s, str)
1999 out = out or sys.stderr
3b479100
SS
2000 # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
2001 if not out:
2002 return
7459e3a2 2003
fe1daad3 2004 if compat_os_name == 'nt' and supports_terminal_sequences(out):
3fe75fdc 2005 s = re.sub(r'([\r\n]+)', r' \1', s)
59f943cd 2006
8a82af35 2007 enc, buffer = None, out
cfb0511d 2008 if 'b' in getattr(out, 'mode', ''):
c487cf00 2009 enc = encoding or preferredencoding()
104aa738 2010 elif hasattr(out, 'buffer'):
8a82af35 2011 buffer = out.buffer
104aa738 2012 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
c487cf00 2013
8a82af35 2014 buffer.write(s.encode(enc, 'ignore') if enc else s)
7459e3a2
PH
2015 out.flush()
2016
2017
da4db748 2018def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
69bec673 2019 from .. import _IN_CLI
da4db748 2020 if _IN_CLI:
2021 if msg in deprecation_warning._cache:
2022 return
2023 deprecation_warning._cache.add(msg)
2024 if printer:
2025 return printer(f'{msg}{bug_reports_message()}', **kwargs)
2026 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
2027 else:
2028 import warnings
2029 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
2030
2031
2032deprecation_warning._cache = set()
2033
2034
48ea9cea
PH
2035def bytes_to_intlist(bs):
2036 if not bs:
2037 return []
2038 if isinstance(bs[0], int): # Python 3
2039 return list(bs)
2040 else:
2041 return [ord(c) for c in bs]
2042
c257baff 2043
cba892fa 2044def intlist_to_bytes(xs):
2045 if not xs:
2046 return b''
ac668111 2047 return struct.pack('%dB' % len(xs), *xs)
c38b1e77
PH
2048
2049
8a82af35 2050class LockingUnsupportedError(OSError):
1890fc63 2051 msg = 'File locking is not supported'
0edb3e33 2052
2053 def __init__(self):
2054 super().__init__(self.msg)
2055
2056
c1c9a79c
PH
2057# Cross-platform file locking
2058if sys.platform == 'win32':
fe0918bb 2059 import ctypes
c1c9a79c
PH
2060 import ctypes.wintypes
2061 import msvcrt
2062
2063 class OVERLAPPED(ctypes.Structure):
2064 _fields_ = [
2065 ('Internal', ctypes.wintypes.LPVOID),
2066 ('InternalHigh', ctypes.wintypes.LPVOID),
2067 ('Offset', ctypes.wintypes.DWORD),
2068 ('OffsetHigh', ctypes.wintypes.DWORD),
2069 ('hEvent', ctypes.wintypes.HANDLE),
2070 ]
2071
37e325b9 2072 kernel32 = ctypes.WinDLL('kernel32')
c1c9a79c
PH
2073 LockFileEx = kernel32.LockFileEx
2074 LockFileEx.argtypes = [
2075 ctypes.wintypes.HANDLE, # hFile
2076 ctypes.wintypes.DWORD, # dwFlags
2077 ctypes.wintypes.DWORD, # dwReserved
2078 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2079 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2080 ctypes.POINTER(OVERLAPPED) # Overlapped
2081 ]
2082 LockFileEx.restype = ctypes.wintypes.BOOL
2083 UnlockFileEx = kernel32.UnlockFileEx
2084 UnlockFileEx.argtypes = [
2085 ctypes.wintypes.HANDLE, # hFile
2086 ctypes.wintypes.DWORD, # dwReserved
2087 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2088 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2089 ctypes.POINTER(OVERLAPPED) # Overlapped
2090 ]
2091 UnlockFileEx.restype = ctypes.wintypes.BOOL
2092 whole_low = 0xffffffff
2093 whole_high = 0x7fffffff
2094
747c0bd1 2095 def _lock_file(f, exclusive, block):
c1c9a79c
PH
2096 overlapped = OVERLAPPED()
2097 overlapped.Offset = 0
2098 overlapped.OffsetHigh = 0
2099 overlapped.hEvent = 0
2100 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
747c0bd1 2101
2102 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2103 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2104 0, whole_low, whole_high, f._lock_file_overlapped_p):
2cb19820 2105 # NB: The no-argument form of "ctypes.FormatError" does not work on PyPy
2106 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
c1c9a79c
PH
2107
2108 def _unlock_file(f):
2109 assert f._lock_file_overlapped_p
2110 handle = msvcrt.get_osfhandle(f.fileno())
747c0bd1 2111 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
c1c9a79c
PH
2112 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2113
2114else:
399a76e6
YCH
2115 try:
2116 import fcntl
c1c9a79c 2117
a3125791 2118 def _lock_file(f, exclusive, block):
b63837bc 2119 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2120 if not block:
2121 flags |= fcntl.LOCK_NB
acea8d7c 2122 try:
b63837bc 2123 fcntl.flock(f, flags)
acea8d7c
JK
2124 except BlockingIOError:
2125 raise
2126 except OSError: # AOSP does not have flock()
b63837bc 2127 fcntl.lockf(f, flags)
c1c9a79c 2128
399a76e6 2129 def _unlock_file(f):
45998b3e
E
2130 with contextlib.suppress(OSError):
2131 return fcntl.flock(f, fcntl.LOCK_UN)
2132 with contextlib.suppress(OSError):
2133 return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock()
2134 return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking
a3125791 2135
399a76e6 2136 except ImportError:
399a76e6 2137
a3125791 2138 def _lock_file(f, exclusive, block):
0edb3e33 2139 raise LockingUnsupportedError()
399a76e6
YCH
2140
2141 def _unlock_file(f):
0edb3e33 2142 raise LockingUnsupportedError()
c1c9a79c
PH
2143
2144
86e5f3ed 2145class locked_file:
0edb3e33 2146 locked = False
747c0bd1 2147
a3125791 2148 def __init__(self, filename, mode, block=True, encoding=None):
fcfa8853
JK
2149 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2150 raise NotImplementedError(mode)
2151 self.mode, self.block = mode, block
2152
2153 writable = any(f in mode for f in 'wax+')
2154 readable = any(f in mode for f in 'r+')
2155 flags = functools.reduce(operator.ior, (
2156 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2157 getattr(os, 'O_BINARY', 0), # Windows only
2158 getattr(os, 'O_NOINHERIT', 0), # Windows only
2159 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2160 os.O_APPEND if 'a' in mode else 0,
2161 os.O_EXCL if 'x' in mode else 0,
2162 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2163 ))
2164
98804d03 2165 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
c1c9a79c
PH
2166
2167 def __enter__(self):
a3125791 2168 exclusive = 'r' not in self.mode
c1c9a79c 2169 try:
a3125791 2170 _lock_file(self.f, exclusive, self.block)
0edb3e33 2171 self.locked = True
86e5f3ed 2172 except OSError:
c1c9a79c
PH
2173 self.f.close()
2174 raise
fcfa8853 2175 if 'w' in self.mode:
131e14dc
JK
2176 try:
2177 self.f.truncate()
2178 except OSError as e:
1890fc63 2179 if e.errno not in (
2180 errno.ESPIPE, # Illegal seek - expected for FIFO
2181 errno.EINVAL, # Invalid argument - expected for /dev/null
2182 ):
2183 raise
c1c9a79c
PH
2184 return self
2185
0edb3e33 2186 def unlock(self):
2187 if not self.locked:
2188 return
c1c9a79c 2189 try:
0edb3e33 2190 _unlock_file(self.f)
c1c9a79c 2191 finally:
0edb3e33 2192 self.locked = False
c1c9a79c 2193
0edb3e33 2194 def __exit__(self, *_):
2195 try:
2196 self.unlock()
2197 finally:
2198 self.f.close()
4eb7f1d1 2199
0edb3e33 2200 open = __enter__
2201 close = __exit__
a3125791 2202
0edb3e33 2203 def __getattr__(self, attr):
2204 return getattr(self.f, attr)
a3125791 2205
0edb3e33 2206 def __iter__(self):
2207 return iter(self.f)
a3125791 2208
4eb7f1d1 2209
0b9c08b4 2210@functools.cache
4644ac55
S
2211def get_filesystem_encoding():
2212 encoding = sys.getfilesystemencoding()
2213 return encoding if encoding is not None else 'utf-8'
2214
2215
4eb7f1d1 2216def shell_quote(args):
a6a173c2 2217 quoted_args = []
4644ac55 2218 encoding = get_filesystem_encoding()
a6a173c2
JMF
2219 for a in args:
2220 if isinstance(a, bytes):
2221 # We may get a filename encoded with 'encodeFilename'
2222 a = a.decode(encoding)
aefce8e6 2223 quoted_args.append(compat_shlex_quote(a))
28e614de 2224 return ' '.join(quoted_args)
9d4660ca
PH
2225
2226
2227def smuggle_url(url, data):
2228 """ Pass additional data in a URL for internal use. """
2229
81953d1a
RA
2230 url, idata = unsmuggle_url(url, {})
2231 data.update(idata)
14f25df2 2232 sdata = urllib.parse.urlencode(
28e614de
PH
2233 {'__youtubedl_smuggle': json.dumps(data)})
2234 return url + '#' + sdata
9d4660ca
PH
2235
2236
79f82953 2237def unsmuggle_url(smug_url, default=None):
83e865a3 2238 if '#__youtubedl_smuggle' not in smug_url:
79f82953 2239 return smug_url, default
28e614de 2240 url, _, sdata = smug_url.rpartition('#')
14f25df2 2241 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
2242 data = json.loads(jsond)
2243 return url, data
02dbf93f
PH
2244
2245
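# Editorial illustration of the smuggle_url/unsmuggle_url round trip (not part of the upstream module):
def _example_smuggle_url():
    url = smuggle_url('https://example.com/video', {'referer': 'https://example.com/'})
    assert '#__youtubedl_smuggle' in url
    assert unsmuggle_url(url) == ('https://example.com/video', {'referer': 'https://example.com/'})
    assert unsmuggle_url('https://example.com/video') == ('https://example.com/video', None)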
e0fd9573 2246def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2247 """ Formats numbers with decimal sufixes like K, M, etc """
2248 num, factor = float_or_none(num), float(factor)
4c3f8c3f 2249 if num is None or num < 0:
e0fd9573 2250 return None
eeb2a770 2251 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2252 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2253 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
abbeeebc 2254 if factor == 1024:
2255 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 2256 converted = num / (factor ** exponent)
abbeeebc 2257 return fmt % (converted, suffix)
e0fd9573 2258
2259
02dbf93f 2260def format_bytes(bytes):
f02d24d8 2261 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
f53c966a 2262
1c088fa8 2263
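# Editorial illustration of format_decimal_suffix/format_bytes (not part of the upstream module);
# the expected strings follow from factor=1000 vs. factor=1024 as implemented above.
def _example_format_bytes():
    assert format_decimal_suffix(12500, '%.1f%s') == '12.5k'
    assert format_bytes(1500) == '1.46KiB'  # 1500 / 1024, with 'k' promoted to 'Ki' for factor 1024
    assert format_bytes(None) == 'N/A'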
64c464a1 2264def lookup_unit_table(unit_table, s, strict=False):
2265 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
fb47597b 2266 units_re = '|'.join(re.escape(u) for u in unit_table)
64c464a1 2267 m = (re.fullmatch if strict else re.match)(
2268 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
fb47597b
S
2269 if not m:
2270 return None
64c464a1 2271
2272 num = float(m.group('num').replace(',', '.'))
fb47597b 2273 mult = unit_table[m.group('unit')]
64c464a1 2274 return round(num * mult)
2275
2276
2277def parse_bytes(s):
2278 """Parse a string indicating a byte quantity into an integer"""
2279 return lookup_unit_table(
2280 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
2281 s.upper(), strict=True)
fb47597b
S
2282
2283
be64b5b0
PH
2284def parse_filesize(s):
2285 if s is None:
2286 return None
2287
dfb1b146 2288 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
2289 # but we support those too
2290 _UNIT_TABLE = {
2291 'B': 1,
2292 'b': 1,
70852b47 2293 'bytes': 1,
be64b5b0
PH
2294 'KiB': 1024,
2295 'KB': 1000,
2296 'kB': 1024,
2297 'Kb': 1000,
13585d76 2298 'kb': 1000,
70852b47
YCH
2299 'kilobytes': 1000,
2300 'kibibytes': 1024,
be64b5b0
PH
2301 'MiB': 1024 ** 2,
2302 'MB': 1000 ** 2,
2303 'mB': 1024 ** 2,
2304 'Mb': 1000 ** 2,
13585d76 2305 'mb': 1000 ** 2,
70852b47
YCH
2306 'megabytes': 1000 ** 2,
2307 'mebibytes': 1024 ** 2,
be64b5b0
PH
2308 'GiB': 1024 ** 3,
2309 'GB': 1000 ** 3,
2310 'gB': 1024 ** 3,
2311 'Gb': 1000 ** 3,
13585d76 2312 'gb': 1000 ** 3,
70852b47
YCH
2313 'gigabytes': 1000 ** 3,
2314 'gibibytes': 1024 ** 3,
be64b5b0
PH
2315 'TiB': 1024 ** 4,
2316 'TB': 1000 ** 4,
2317 'tB': 1024 ** 4,
2318 'Tb': 1000 ** 4,
13585d76 2319 'tb': 1000 ** 4,
70852b47
YCH
2320 'terabytes': 1000 ** 4,
2321 'tebibytes': 1024 ** 4,
be64b5b0
PH
2322 'PiB': 1024 ** 5,
2323 'PB': 1000 ** 5,
2324 'pB': 1024 ** 5,
2325 'Pb': 1000 ** 5,
13585d76 2326 'pb': 1000 ** 5,
70852b47
YCH
2327 'petabytes': 1000 ** 5,
2328 'pebibytes': 1024 ** 5,
be64b5b0
PH
2329 'EiB': 1024 ** 6,
2330 'EB': 1000 ** 6,
2331 'eB': 1024 ** 6,
2332 'Eb': 1000 ** 6,
13585d76 2333 'eb': 1000 ** 6,
70852b47
YCH
2334 'exabytes': 1000 ** 6,
2335 'exbibytes': 1024 ** 6,
be64b5b0
PH
2336 'ZiB': 1024 ** 7,
2337 'ZB': 1000 ** 7,
2338 'zB': 1024 ** 7,
2339 'Zb': 1000 ** 7,
13585d76 2340 'zb': 1000 ** 7,
70852b47
YCH
2341 'zettabytes': 1000 ** 7,
2342 'zebibytes': 1024 ** 7,
be64b5b0
PH
2343 'YiB': 1024 ** 8,
2344 'YB': 1000 ** 8,
2345 'yB': 1024 ** 8,
2346 'Yb': 1000 ** 8,
13585d76 2347 'yb': 1000 ** 8,
70852b47
YCH
2348 'yottabytes': 1000 ** 8,
2349 'yobibytes': 1024 ** 8,
be64b5b0
PH
2350 }
2351
fb47597b
S
2352 return lookup_unit_table(_UNIT_TABLE, s)
2353
2354
2355def parse_count(s):
2356 if s is None:
be64b5b0
PH
2357 return None
2358
352d5da8 2359 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
2360
2361 if re.match(r'^[\d,.]+$', s):
2362 return str_to_int(s)
2363
2364 _UNIT_TABLE = {
2365 'k': 1000,
2366 'K': 1000,
2367 'm': 1000 ** 2,
2368 'M': 1000 ** 2,
2369 'kk': 1000 ** 2,
2370 'KK': 1000 ** 2,
352d5da8 2371 'b': 1000 ** 3,
2372 'B': 1000 ** 3,
fb47597b 2373 }
be64b5b0 2374
352d5da8 2375 ret = lookup_unit_table(_UNIT_TABLE, s)
2376 if ret is not None:
2377 return ret
2378
2379 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2380 if mobj:
2381 return str_to_int(mobj.group(1))
be64b5b0 2382
2f7ae819 2383
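# Editorial illustration of the size/count parsers above (not part of the upstream module);
# expected values follow the unit tables of parse_bytes, parse_filesize and parse_count.
def _example_size_and_count_parsing():
    assert parse_bytes('10M') == 10 * 1024 ** 2  # parse_bytes always uses binary units
    assert parse_filesize('1.5GiB') == 1610612736
    assert parse_filesize('123') is None  # a bare number carries no unit
    assert parse_count('1.2M') == 1200000
    assert parse_count('12,345') == 12345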
5d45484c 2384def parse_resolution(s, *, lenient=False):
b871d7e9
S
2385 if s is None:
2386 return {}
2387
5d45484c
LNO
2388 if lenient:
2389 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2390 else:
2391 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
2392 if mobj:
2393 return {
2394 'width': int(mobj.group('w')),
2395 'height': int(mobj.group('h')),
2396 }
2397
17ec8bcf 2398 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
2399 if mobj:
2400 return {'height': int(mobj.group(1))}
2401
2402 mobj = re.search(r'\b([48])[kK]\b', s)
2403 if mobj:
2404 return {'height': int(mobj.group(1)) * 540}
2405
2406 return {}
2407
2408
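# Editorial illustration of parse_resolution (not part of the upstream module):
def _example_parse_resolution():
    assert parse_resolution('1920x1080') == {'width': 1920, 'height': 1080}
    assert parse_resolution('720p') == {'height': 720}
    assert parse_resolution('4k') == {'height': 2160}
    assert parse_resolution(None) == {}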
0dc41787 2409def parse_bitrate(s):
14f25df2 2410 if not isinstance(s, str):
0dc41787
S
2411 return
2412 mobj = re.search(r'\b(\d+)\s*kbps', s)
2413 if mobj:
2414 return int(mobj.group(1))
2415
2416
a942d6cb 2417def month_by_name(name, lang='en'):
caefb1de
PH
2418 """ Return the number of a month by (locale-independently) English name """
2419
f6717dec 2420 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 2421
caefb1de 2422 try:
f6717dec 2423 return month_names.index(name) + 1
7105440c
YCH
2424 except ValueError:
2425 return None
2426
2427
2428def month_by_abbreviation(abbrev):
2429 """ Return the number of a month by (locale-independently) English
2430 abbreviations """
2431
2432 try:
2433 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
2434 except ValueError:
2435 return None
18258362
JMF
2436
2437
5aafe895 2438def fix_xml_ampersands(xml_str):
18258362 2439 """Replace every '&' with '&amp;' in XML"""
5aafe895
PH
2440 return re.sub(
2441 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 2442 '&amp;',
5aafe895 2443 xml_str)
e3946f98
PH
2444
2445
2446def setproctitle(title):
14f25df2 2447 assert isinstance(title, str)
c1c05c67 2448
fe0918bb 2449 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2450 try:
2451 import ctypes
2452 except ImportError:
c1c05c67
YCH
2453 return
2454
e3946f98 2455 try:
611c1dd9 2456 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
2457 except OSError:
2458 return
2f49bcd6
RC
2459 except TypeError:
2460 # LoadLibrary in Windows Python 2.7.13 only expects
2461 # a bytestring, but since unicode_literals turns
2462 # every string into a unicode string, it fails.
2463 return
0f06bcd7 2464 title_bytes = title.encode()
6eefe533
PH
2465 buf = ctypes.create_string_buffer(len(title_bytes))
2466 buf.value = title_bytes
e3946f98 2467 try:
6eefe533 2468 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
2469 except AttributeError:
2470 return # Strange libc, just skip this
d7dda168
PH
2471
2472
2473def remove_start(s, start):
46bc9b7d 2474 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
2475
2476
2b9faf55 2477def remove_end(s, end):
46bc9b7d 2478 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
2479
2480
31b2051e
S
2481def remove_quotes(s):
2482 if s is None or len(s) < 2:
2483 return s
2484 for quote in ('"', "'", ):
2485 if s[0] == quote and s[-1] == quote:
2486 return s[1:-1]
2487 return s
2488
2489
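# Editorial illustration of the string-trimming helpers above (not part of the upstream module):
def _example_string_trimming():
    assert remove_start('www.example.com', 'www.') == 'example.com'
    assert remove_end('video.mp4', '.mp4') == 'video'
    assert remove_quotes('"quoted"') == 'quoted'
    assert remove_quotes("'a") == "'a"  # unmatched quotes are left untouched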
b6e0c7d2 2490def get_domain(url):
ebf99aaf 2491 """
2492 This implementation is inconsistent, but is kept for compatibility.
2493 Use this only for "webpage_url_domain"
2494 """
2495 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
b6e0c7d2
U
2496
2497
29eb5174 2498def url_basename(url):
14f25df2 2499 path = urllib.parse.urlparse(url).path
28e614de 2500 return path.strip('/').split('/')[-1]
aa94a6d3
PH
2501
2502
02dc0a36 2503def base_url(url):
7657ec7e 2504 return re.match(r'https?://[^?#]+/', url).group()
02dc0a36
S
2505
2506
e34c3361 2507def urljoin(base, path):
4b5de77b 2508 if isinstance(path, bytes):
0f06bcd7 2509 path = path.decode()
14f25df2 2510 if not isinstance(path, str) or not path:
e34c3361 2511 return None
fad4ceb5 2512 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 2513 return path
4b5de77b 2514 if isinstance(base, bytes):
0f06bcd7 2515 base = base.decode()
14f25df2 2516 if not isinstance(base, str) or not re.match(
4b5de77b 2517 r'^(?:https?:)?//', base):
e34c3361 2518 return None
14f25df2 2519 return urllib.parse.urljoin(base, path)
e34c3361
S
2520
2521
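# Editorial illustration of url_basename/base_url/urljoin (not part of the upstream module):
def _example_url_helpers():
    assert url_basename('https://example.com/dir/file.mp4?x=1') == 'file.mp4'
    assert base_url('https://example.com/dir/file.mp4?x=1') == 'https://example.com/dir/'
    assert urljoin('https://example.com/a/', 'b/c.txt') == 'https://example.com/a/b/c.txt'
    assert urljoin('https://example.com/', None) is None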
ac668111 2522class HEADRequest(urllib.request.Request):
aa94a6d3 2523 def get_method(self):
611c1dd9 2524 return 'HEAD'
7217e148
PH
2525
2526
ac668111 2527class PUTRequest(urllib.request.Request):
95cf60e8
S
2528 def get_method(self):
2529 return 'PUT'
2530
2531
9732d77e 2532def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
9e907ebd 2533 if get_attr and v is not None:
2534 v = getattr(v, get_attr, None)
1812afb7
S
2535 try:
2536 return int(v) * invscale // scale
31c49255 2537 except (ValueError, TypeError, OverflowError):
af98f8ff 2538 return default
9732d77e 2539
9572013d 2540
40a90862 2541def str_or_none(v, default=None):
14f25df2 2542 return default if v is None else str(v)
40a90862 2543
9732d77e
PH
2544
2545def str_to_int(int_str):
48d4681e 2546 """ A more relaxed version of int_or_none """
f9934b96 2547 if isinstance(int_str, int):
348c6bf1 2548 return int_str
14f25df2 2549 elif isinstance(int_str, str):
42db58ec
S
2550 int_str = re.sub(r'[,\.\+]', '', int_str)
2551 return int_or_none(int_str)
608d11f5
PH
2552
2553
9732d77e 2554def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
2555 if v is None:
2556 return default
2557 try:
2558 return float(v) * invscale / scale
5e1271c5 2559 except (ValueError, TypeError):
caf80631 2560 return default
43f775e4
PH
2561
2562
c7e327c4
S
2563def bool_or_none(v, default=None):
2564 return v if isinstance(v, bool) else default
2565
2566
53cd37ba 2567def strip_or_none(v, default=None):
14f25df2 2568 return v.strip() if isinstance(v, str) else default
b72b4431
S
2569
2570
af03000a 2571def url_or_none(url):
14f25df2 2572 if not url or not isinstance(url, str):
af03000a
S
2573 return None
2574 url = url.strip()
29f7c58a 2575 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2576
2577
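# Editorial illustration of the small coercion helpers above (not part of the upstream module):
def _example_coercion_helpers():
    assert int_or_none('42') == 42
    assert int_or_none('oops', default=0) == 0
    assert str_to_int('123,456') == 123456
    assert float_or_none('1.5', invscale=2) == 3.0
    assert url_or_none('example.com/video') is None  # scheme-less URLs without // are rejected
    assert url_or_none('//example.com/video') == '//example.com/video'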
3e9b66d7 2578def request_to_url(req):
ac668111 2579 if isinstance(req, urllib.request.Request):
3e9b66d7
LNO
2580 return req.get_full_url()
2581 else:
2582 return req
2583
2584
e29663c6 2585def strftime_or_none(timestamp, date_format, default=None):
2586 datetime_object = None
2587 try:
f9934b96 2588 if isinstance(timestamp, (int, float)): # unix timestamp
d509c1f5 2589 # Using naive datetime here can break timestamp() in Windows
2590 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2591 datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
14f25df2 2592 elif isinstance(timestamp, str): # assume YYYYMMDD
e29663c6 2593 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
9665f15a 2594 date_format = re.sub( # Support %s on windows
2595 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
e29663c6 2596 return datetime_object.strftime(date_format)
2597 except (ValueError, TypeError, AttributeError):
2598 return default
2599
2600
608d11f5 2601def parse_duration(s):
f9934b96 2602 if not isinstance(s, str):
608d11f5 2603 return None
ca7b3246 2604 s = s.strip()
38d79fd1 2605 if not s:
2606 return None
ca7b3246 2607
acaff495 2608 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2609 m = re.match(r'''(?x)
2610 (?P<before_secs>
2611 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2612 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2613 (?P<ms>[.:][0-9]+)?Z?$
2614 ''', s)
acaff495 2615 if m:
8bd1c00b 2616 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2617 else:
2618 m = re.match(
056653bb
S
2619 r'''(?ix)(?:P?
2620 (?:
1c1b2f96 2621 [0-9]+\s*y(?:ears?)?,?\s*
056653bb
S
2622 )?
2623 (?:
1c1b2f96 2624 [0-9]+\s*m(?:onths?)?,?\s*
056653bb
S
2625 )?
2626 (?:
1c1b2f96 2627 [0-9]+\s*w(?:eeks?)?,?\s*
056653bb 2628 )?
8f4b58d7 2629 (?:
1c1b2f96 2630 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
8f4b58d7 2631 )?
056653bb 2632 T)?
acaff495 2633 (?:
1c1b2f96 2634 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
acaff495 2635 )?
2636 (?:
1c1b2f96 2637 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
acaff495 2638 )?
2639 (?:
2640 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2641 )?Z?$''', s)
acaff495 2642 if m:
2643 days, hours, mins, secs, ms = m.groups()
2644 else:
15846398 2645 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2646 if m:
2647 hours, mins = m.groups()
2648 else:
2649 return None
2650
acaff495 2651 if ms:
19a03940 2652 ms = ms.replace(':', '.')
2653 return sum(float(part or 0) * mult for part, mult in (
2654 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2655
2656
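# Editorial illustration of parse_duration (not part of the upstream module):
def _example_parse_duration():
    assert parse_duration('1:23:45') == 5025.0
    assert parse_duration('2h 30m') == 9000
    assert parse_duration('3 min') == 180
    assert parse_duration('invalid') is None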
e65e4c88 2657def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2658 name, real_ext = os.path.splitext(filename)
e65e4c88 2659 return (
86e5f3ed 2660 f'{name}.{ext}{real_ext}'
e65e4c88 2661 if not expected_real_ext or real_ext[1:] == expected_real_ext
86e5f3ed 2662 else f'{filename}.{ext}')
d70ad093
PH
2663
2664
b3ed15b7
S
2665def replace_extension(filename, ext, expected_real_ext=None):
2666 name, real_ext = os.path.splitext(filename)
86e5f3ed 2667 return '{}.{}'.format(
b3ed15b7
S
2668 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2669 ext)
2670
2671
d70ad093
PH
2672def check_executable(exe, args=[]):
2673 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2674 args can be a list of arguments for a short output (like -version) """
2675 try:
f0c9fb96 2676 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
d70ad093
PH
2677 except OSError:
2678 return False
2679 return exe
b7ab0590
PH
2680
2681
7aaf4cd2 2682def _get_exe_version_output(exe, args):
95807118 2683 try:
b64d04c1 2684 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2685 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2686 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
1cdda329 2687 stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
2688 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2689 if ret:
2690 return None
95807118
PH
2691 except OSError:
2692 return False
f0c9fb96 2693 return stdout
cae97f65
PH
2694
2695
2696def detect_exe_version(output, version_re=None, unrecognized='present'):
14f25df2 2697 assert isinstance(output, str)
cae97f65
PH
2698 if version_re is None:
2699 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2700 m = re.search(version_re, output)
95807118
PH
2701 if m:
2702 return m.group(1)
2703 else:
2704 return unrecognized
2705
2706
9af98e17 2707def get_exe_version(exe, args=['--version'],
1cdda329 2708 version_re=None, unrecognized=('present', 'broken')):
9af98e17 2709 """ Returns the version of the specified executable,
2710 or False if the executable is not present """
1cdda329 2711 unrecognized = variadic(unrecognized)
2712 assert len(unrecognized) in (1, 2)
9af98e17 2713 out = _get_exe_version_output(exe, args)
1cdda329 2714 if out is None:
2715 return unrecognized[-1]
2716 return out and detect_exe_version(out, version_re, unrecognized[0])
9af98e17 2717
2718
7e88d7d7 2719def frange(start=0, stop=None, step=1):
2720 """Float range"""
2721 if stop is None:
2722 start, stop = 0, start
2723 sign = [-1, 1][step > 0] if step else 0
2724 while sign * start < sign * stop:
2725 yield start
2726 start += step
2727
2728
cb89cfc1 2729class LazyList(collections.abc.Sequence):
0f06bcd7 2730 """Lazy immutable list from an iterable
2731 Note that slices of a LazyList are lists and not LazyList"""
483336e7 2732
8e5fecc8 2733 class IndexError(IndexError):
2734 pass
2735
282f5709 2736 def __init__(self, iterable, *, reverse=False, _cache=None):
0f06bcd7 2737 self._iterable = iter(iterable)
2738 self._cache = [] if _cache is None else _cache
2739 self._reversed = reverse
483336e7 2740
2741 def __iter__(self):
0f06bcd7 2742 if self._reversed:
28419ca2 2743 # We need to consume the entire iterable to iterate in reverse
981052c9 2744 yield from self.exhaust()
28419ca2 2745 return
0f06bcd7 2746 yield from self._cache
2747 for item in self._iterable:
2748 self._cache.append(item)
483336e7 2749 yield item
2750
0f06bcd7 2751 def _exhaust(self):
2752 self._cache.extend(self._iterable)
2753 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2754 return self._cache
28419ca2 2755
981052c9 2756 def exhaust(self):
0f06bcd7 2757 """Evaluate the entire iterable"""
2758 return self._exhaust()[::-1 if self._reversed else 1]
981052c9 2759
28419ca2 2760 @staticmethod
0f06bcd7 2761 def _reverse_index(x):
f2df4071 2762 return None if x is None else ~x
483336e7 2763
2764 def __getitem__(self, idx):
2765 if isinstance(idx, slice):
0f06bcd7 2766 if self._reversed:
2767 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
e0f2b4b4 2768 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2769 elif isinstance(idx, int):
0f06bcd7 2770 if self._reversed:
2771 idx = self._reverse_index(idx)
e0f2b4b4 2772 start, stop, step = idx, idx, 0
483336e7 2773 else:
2774 raise TypeError('indices must be integers or slices')
e0f2b4b4 2775 if ((start or 0) < 0 or (stop or 0) < 0
2776 or (start is None and step < 0)
2777 or (stop is None and step > 0)):
483336e7 2778 # We need to consume the entire iterable to be able to slice from the end
2779 # Obviously, never use this with infinite iterables
0f06bcd7 2780 self._exhaust()
8e5fecc8 2781 try:
0f06bcd7 2782 return self._cache[idx]
8e5fecc8 2783 except IndexError as e:
2784 raise self.IndexError(e) from e
0f06bcd7 2785 n = max(start or 0, stop or 0) - len(self._cache) + 1
28419ca2 2786 if n > 0:
0f06bcd7 2787 self._cache.extend(itertools.islice(self._iterable, n))
8e5fecc8 2788 try:
0f06bcd7 2789 return self._cache[idx]
8e5fecc8 2790 except IndexError as e:
2791 raise self.IndexError(e) from e
483336e7 2792
2793 def __bool__(self):
2794 try:
0f06bcd7 2795 self[-1] if self._reversed else self[0]
8e5fecc8 2796 except self.IndexError:
483336e7 2797 return False
2798 return True
2799
2800 def __len__(self):
0f06bcd7 2801 self._exhaust()
2802 return len(self._cache)
483336e7 2803
282f5709 2804 def __reversed__(self):
0f06bcd7 2805 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
282f5709 2806
2807 def __copy__(self):
0f06bcd7 2808 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
282f5709 2809
28419ca2 2810 def __repr__(self):
2811 # repr and str should mimic a list. So we exhaust the iterable
2812 return repr(self.exhaust())
2813
2814 def __str__(self):
2815 return repr(self.exhaust())
2816
483336e7 2817
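# Editorial illustration of LazyList (not part of the upstream module); itertools is already
# imported at the top of this file.
def _example_lazy_list():
    lazy = LazyList(itertools.count())  # an infinite iterator, consumed on demand
    assert lazy[:5] == [0, 1, 2, 3, 4]  # slices are plain lists
    assert lazy[10] == 10  # only the first eleven items have been materialized so far
    assert list(LazyList(range(3), reverse=True)) == [2, 1, 0]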
7be9ccff 2818class PagedList:
c07a39ae 2819
2820 class IndexError(IndexError):
2821 pass
2822
dd26ced1
PH
2823 def __len__(self):
2824 # This is only useful for tests
2825 return len(self.getslice())
2826
7be9ccff 2827 def __init__(self, pagefunc, pagesize, use_cache=True):
2828 self._pagefunc = pagefunc
2829 self._pagesize = pagesize
f1d13090 2830 self._pagecount = float('inf')
7be9ccff 2831 self._use_cache = use_cache
2832 self._cache = {}
2833
2834 def getpage(self, pagenum):
d8cf8d97 2835 page_results = self._cache.get(pagenum)
2836 if page_results is None:
f1d13090 2837 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
7be9ccff 2838 if self._use_cache:
2839 self._cache[pagenum] = page_results
2840 return page_results
2841
2842 def getslice(self, start=0, end=None):
2843 return list(self._getslice(start, end))
2844
2845 def _getslice(self, start, end):
55575225 2846 raise NotImplementedError('This method must be implemented by subclasses')
2847
2848 def __getitem__(self, idx):
f1d13090 2849 assert self._use_cache, 'Indexing PagedList requires cache'
55575225 2850 if not isinstance(idx, int) or idx < 0:
2851 raise TypeError('indices must be non-negative integers')
2852 entries = self.getslice(idx, idx + 1)
d8cf8d97 2853 if not entries:
c07a39ae 2854 raise self.IndexError()
d8cf8d97 2855 return entries[0]
55575225 2856
9c44d242
PH
2857
2858class OnDemandPagedList(PagedList):
a44ca5a4 2859 """Download pages until a page with fewer than the maximum number of results"""
86e5f3ed 2860
7be9ccff 2861 def _getslice(self, start, end):
b7ab0590
PH
2862 for pagenum in itertools.count(start // self._pagesize):
2863 firstid = pagenum * self._pagesize
2864 nextfirstid = pagenum * self._pagesize + self._pagesize
2865 if start >= nextfirstid:
2866 continue
2867
b7ab0590
PH
2868 startv = (
2869 start % self._pagesize
2870 if firstid <= start < nextfirstid
2871 else 0)
b7ab0590
PH
2872 endv = (
2873 ((end - 1) % self._pagesize) + 1
2874 if (end is not None and firstid <= end <= nextfirstid)
2875 else None)
2876
f1d13090 2877 try:
2878 page_results = self.getpage(pagenum)
2879 except Exception:
2880 self._pagecount = pagenum - 1
2881 raise
b7ab0590
PH
2882 if startv != 0 or endv is not None:
2883 page_results = page_results[startv:endv]
7be9ccff 2884 yield from page_results
b7ab0590
PH
2885
2886 # A little optimization - if current page is not "full", ie. does
2887 # not contain page_size videos then we can assume that this page
2888 # is the last one - there are no more ids on further pages -
2889 # i.e. no need to query again.
2890 if len(page_results) + startv < self._pagesize:
2891 break
2892
2893 # If we got the whole page, but the next page is not interesting,
2894 # break out early as well
2895 if end == nextfirstid:
2896 break
81c2f20b
PH
2897
2898
9c44d242 2899class InAdvancePagedList(PagedList):
a44ca5a4 2900 """PagedList with total number of pages known in advance"""
86e5f3ed 2901
9c44d242 2902 def __init__(self, pagefunc, pagecount, pagesize):
7be9ccff 2903 PagedList.__init__(self, pagefunc, pagesize, True)
f1d13090 2904 self._pagecount = pagecount
9c44d242 2905
7be9ccff 2906 def _getslice(self, start, end):
9c44d242 2907 start_page = start // self._pagesize
d37707bd 2908 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
9c44d242
PH
2909 skip_elems = start - start_page * self._pagesize
2910 only_more = None if end is None else end - start
2911 for pagenum in range(start_page, end_page):
7be9ccff 2912 page_results = self.getpage(pagenum)
9c44d242 2913 if skip_elems:
7be9ccff 2914 page_results = page_results[skip_elems:]
9c44d242
PH
2915 skip_elems = None
2916 if only_more is not None:
7be9ccff 2917 if len(page_results) < only_more:
2918 only_more -= len(page_results)
9c44d242 2919 else:
7be9ccff 2920 yield from page_results[:only_more]
9c44d242 2921 break
7be9ccff 2922 yield from page_results
9c44d242
PH
2923
2924
7e88d7d7 2925class PlaylistEntries:
2926 MissingEntry = object()
2927 is_exhausted = False
2928
2929 def __init__(self, ydl, info_dict):
7e9a6125 2930 self.ydl = ydl
2931
2932 # _entries must be assigned now since infodict can change during iteration
2933 entries = info_dict.get('entries')
2934 if entries is None:
2935 raise EntryNotInPlaylist('There are no entries')
2936 elif isinstance(entries, list):
2937 self.is_exhausted = True
2938
2939 requested_entries = info_dict.get('requested_entries')
bc5c2f8a 2940 self.is_incomplete = requested_entries is not None
7e9a6125 2941 if self.is_incomplete:
2942 assert self.is_exhausted
bc5c2f8a 2943 self._entries = [self.MissingEntry] * max(requested_entries or [0])
7e9a6125 2944 for i, entry in zip(requested_entries, entries):
2945 self._entries[i - 1] = entry
2946 elif isinstance(entries, (list, PagedList, LazyList)):
2947 self._entries = entries
2948 else:
2949 self._entries = LazyList(entries)
7e88d7d7 2950
2951 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2952 (?P<start>[+-]?\d+)?
2953 (?P<range>[:-]
2954 (?P<end>[+-]?\d+|inf(?:inite)?)?
2955 (?::(?P<step>[+-]?\d+))?
2956 )?''')
2957
2958 @classmethod
2959 def parse_playlist_items(cls, string):
2960 for segment in string.split(','):
2961 if not segment:
2962 raise ValueError('There is two or more consecutive commas')
2963 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2964 if not mobj:
2965 raise ValueError(f'{segment!r} is not a valid specification')
2966 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2967 if int_or_none(step) == 0:
2968 raise ValueError(f'Step in {segment!r} cannot be zero')
2969 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2970
2971 def get_requested_items(self):
2972 playlist_items = self.ydl.params.get('playlist_items')
2973 playlist_start = self.ydl.params.get('playliststart', 1)
2974 playlist_end = self.ydl.params.get('playlistend')
2975 # For backwards compatibility, interpret -1 as whole list
2976 if playlist_end in (-1, None):
2977 playlist_end = ''
2978 if not playlist_items:
2979 playlist_items = f'{playlist_start}:{playlist_end}'
2980 elif playlist_start != 1 or playlist_end:
2981 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2982
2983 for index in self.parse_playlist_items(playlist_items):
2984 for i, entry in self[index]:
2985 yield i, entry
1ac4fd80 2986 if not entry:
2987 continue
7e88d7d7 2988 try:
d21056f4 2989 # The item may have just been added to archive. Don't break due to it
2990 if not self.ydl.params.get('lazy_playlist'):
2991 # TODO: Add auto-generated fields
2992 self.ydl._match_entry(entry, incomplete=True, silent=True)
7e88d7d7 2993 except (ExistingVideoReached, RejectedVideoReached):
2994 return
2995
7e9a6125 2996 def get_full_count(self):
2997 if self.is_exhausted and not self.is_incomplete:
7e88d7d7 2998 return len(self)
2999 elif isinstance(self._entries, InAdvancePagedList):
3000 if self._entries._pagesize == 1:
3001 return self._entries._pagecount
3002
7e88d7d7 3003 @functools.cached_property
3004 def _getter(self):
3005 if isinstance(self._entries, list):
3006 def get_entry(i):
3007 try:
3008 entry = self._entries[i]
3009 except IndexError:
3010 entry = self.MissingEntry
3011 if not self.is_incomplete:
3012 raise self.IndexError()
3013 if entry is self.MissingEntry:
bc5c2f8a 3014 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
7e88d7d7 3015 return entry
3016 else:
3017 def get_entry(i):
3018 try:
3019 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
3020 except (LazyList.IndexError, PagedList.IndexError):
3021 raise self.IndexError()
3022 return get_entry
3023
3024 def __getitem__(self, idx):
3025 if isinstance(idx, int):
3026 idx = slice(idx, idx)
3027
3028 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
3029 step = 1 if idx.step is None else idx.step
3030 if idx.start is None:
3031 start = 0 if step > 0 else len(self) - 1
3032 else:
3033 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
3034
3035 # NB: Do not call len(self) when idx == [:]
3036 if idx.stop is None:
3037 stop = 0 if step < 0 else float('inf')
3038 else:
3039 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
3040 stop += [-1, 1][step > 0]
3041
3042 for i in frange(start, stop, step):
3043 if i < 0:
3044 continue
3045 try:
7e9a6125 3046 entry = self._getter(i)
3047 except self.IndexError:
3048 self.is_exhausted = True
3049 if step > 0:
7e88d7d7 3050 break
7e9a6125 3051 continue
7e88d7d7 3052 yield i + 1, entry
3053
3054 def __len__(self):
3055 return len(tuple(self[:]))
3056
3057 class IndexError(IndexError):
3058 pass
3059
3060
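# Editorial illustration of the --playlist-items specification parser (not part of the upstream module):
def _example_parse_playlist_items():
    assert list(PlaylistEntries.parse_playlist_items('2:5,7')) == [slice(2, 5.0, None), 7]
    assert list(PlaylistEntries.parse_playlist_items('-1')) == [-1]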
81c2f20b 3061def uppercase_escape(s):
676eb3f2 3062 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 3063 return re.sub(
a612753d 3064 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
3065 lambda m: unicode_escape(m.group(0))[0],
3066 s)
0fe2ff78
YCH
3067
3068
3069def lowercase_escape(s):
3070 unicode_escape = codecs.getdecoder('unicode_escape')
3071 return re.sub(
3072 r'\\u[0-9a-fA-F]{4}',
3073 lambda m: unicode_escape(m.group(0))[0],
3074 s)
b53466e1 3075
d05cfe06
S
3076
3077def escape_rfc3986(s):
3078 """Escape non-ASCII characters as suggested by RFC 3986"""
f9934b96 3079 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
3080
3081
3082def escape_url(url):
3083 """Escape URL as suggested by RFC 3986"""
14f25df2 3084 url_parsed = urllib.parse.urlparse(url)
d05cfe06 3085 return url_parsed._replace(
efbed08d 3086 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
3087 path=escape_rfc3986(url_parsed.path),
3088 params=escape_rfc3986(url_parsed.params),
3089 query=escape_rfc3986(url_parsed.query),
3090 fragment=escape_rfc3986(url_parsed.fragment)
3091 ).geturl()
3092
62e609ab 3093
96b9e9cf 3094def parse_qs(url, **kwargs):
3095 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
4dfbf869 3096
3097
62e609ab
PH
3098def read_batch_urls(batch_fd):
3099 def fixup(url):
14f25df2 3100 if not isinstance(url, str):
62e609ab 3101 url = url.decode('utf-8', 'replace')
8c04f0be 3102 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3103 for bom in BOM_UTF8:
3104 if url.startswith(bom):
3105 url = url[len(bom):]
3106 url = url.lstrip()
3107 if not url or url.startswith(('#', ';', ']')):
62e609ab 3108 return False
8c04f0be 3109 # "#" cannot be stripped out since it is part of the URI
962ffcf8 3110 # However, it can be safely stripped out when it follows a whitespace character
8c04f0be 3111 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
3112
3113 with contextlib.closing(batch_fd) as fd:
3114 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
3115
3116
3117def urlencode_postdata(*args, **kargs):
14f25df2 3118 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
3119
3120
45b2ee6f 3121def update_url(url, *, query_update=None, **kwargs):
3122 """Replace URL components specified by kwargs
3123 @param url str or parse url tuple
3124 @param query_update update query
3125 @returns str
3126 """
3127 if isinstance(url, str):
3128 if not kwargs and not query_update:
3129 return url
3130 else:
3131 url = urllib.parse.urlparse(url)
3132 if query_update:
3133 assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
3134 kwargs['query'] = urllib.parse.urlencode({
3135 **urllib.parse.parse_qs(url.query),
3136 **query_update
3137 }, True)
3138 return urllib.parse.urlunparse(url._replace(**kwargs))
3139
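# Editor's illustrative sketch (not part of the upstream file): components are
# replaced via urllib.parse field names, while query_update merges into the
# existing query string.
assert update_url('https://example.com/a?x=1', query_update={'y': '2'}) == 'https://example.com/a?x=1&y=2'
assert update_url('https://example.com/a?x=1', path='/b', query='') == 'https://example.com/b'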
3140
38f9ef31 3141def update_url_query(url, query):
45b2ee6f 3142 return update_url(url, query_update=query)
16392824 3143
8e60dc75 3144
c043c246 3145def update_Request(req, url=None, data=None, headers=None, query=None):
ed0291d1 3146 req_headers = req.headers.copy()
c043c246 3147 req_headers.update(headers or {})
ed0291d1
S
3148 req_data = data or req.data
3149 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
3150 req_get_method = req.get_method()
3151 if req_get_method == 'HEAD':
3152 req_type = HEADRequest
3153 elif req_get_method == 'PUT':
3154 req_type = PUTRequest
3155 else:
ac668111 3156 req_type = urllib.request.Request
ed0291d1
S
3157 new_req = req_type(
3158 req_url, data=req_data, headers=req_headers,
3159 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3160 if hasattr(req, 'timeout'):
3161 new_req.timeout = req.timeout
3162 return new_req
3163
3164
10c87c15 3165def _multipart_encode_impl(data, boundary):
0c265486
YCH
3166 content_type = 'multipart/form-data; boundary=%s' % boundary
3167
3168 out = b''
3169 for k, v in data.items():
3170 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 3171 if isinstance(k, str):
0f06bcd7 3172 k = k.encode()
14f25df2 3173 if isinstance(v, str):
0f06bcd7 3174 v = v.encode()
0c265486
YCH
3175 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3176 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 3177 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
3178 if boundary.encode('ascii') in content:
3179 raise ValueError('Boundary overlaps with data')
3180 out += content
3181
3182 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3183
3184 return out, content_type
3185
3186
3187def multipart_encode(data, boundary=None):
3188 '''
3189 Encode a dict to RFC 7578-compliant form-data
3190
3191 data:
3192 A dict where keys and values can be either Unicode or bytes-like
3193 objects.
3194 boundary:
 3195 If specified, it must be a Unicode object and is used as the boundary. Otherwise
3196 a random boundary is generated.
3197
3198 Reference: https://tools.ietf.org/html/rfc7578
3199 '''
3200 has_specified_boundary = boundary is not None
3201
3202 while True:
3203 if boundary is None:
3204 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3205
3206 try:
10c87c15 3207 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
3208 break
3209 except ValueError:
3210 if has_specified_boundary:
3211 raise
3212 boundary = None
3213
3214 return out, content_type
3215
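# Editor's illustrative sketch (not part of the upstream file; mirrors a case from
# the project's test suite). The _-prefixed names exist only for this example.
_body, _ctype = multipart_encode({b'field': b'value'}, boundary='AAAAAAAAAAAAAAAAAAAA')
assert _ctype == 'multipart/form-data; boundary=AAAAAAAAAAAAAAAAAAAA'
assert _body == (b'--AAAAAAAAAAAAAAAAAAAA\r\n'
                 b'Content-Disposition: form-data; name="field"\r\n\r\n'
                 b'value\r\n'
                 b'--AAAAAAAAAAAAAAAAAAAA--\r\n')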
3216
b079c26f
SS
3217def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
3218 if blocked_types is NO_DEFAULT:
3219 blocked_types = (str, bytes, collections.abc.Mapping)
3220 return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
3221
3222
3223def variadic(x, allowed_types=NO_DEFAULT):
6f2287cb 3224 return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
304ad45a 3225
3226
c4f60dd7 3227def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3228 for f in funcs:
a32a9a7e 3229 try:
c4f60dd7 3230 val = f(*args, **kwargs)
ab029d7e 3231 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
a32a9a7e
S
3232 pass
3233 else:
c4f60dd7 3234 if expected_type is None or isinstance(val, expected_type):
3235 return val
3236
3237
3238def try_get(src, getter, expected_type=None):
3239 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
329ca3be
S
3240
3241
90137ca4 3242def filter_dict(dct, cndn=lambda _, v: v is not None):
3243 return {k: v for k, v in dct.items() if cndn(k, v)}
3244
3245
6cc62232
S
3246def merge_dicts(*dicts):
3247 merged = {}
3248 for a_dict in dicts:
3249 for k, v in a_dict.items():
90137ca4 3250 if (v is not None and k not in merged
3251 or isinstance(v, str) and merged[k] == ''):
6cc62232
S
3252 merged[k] = v
3253 return merged
3254
3255
8e60dc75 3256def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
14f25df2 3257 return string if isinstance(string, str) else str(string, encoding, errors)
8e60dc75 3258
16392824 3259
a1a530b0
PH
3260US_RATINGS = {
3261 'G': 0,
3262 'PG': 10,
3263 'PG-13': 13,
3264 'R': 16,
3265 'NC': 18,
3266}
fac55558
PH
3267
3268
a8795327 3269TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
3270 'TV-Y': 0,
3271 'TV-Y7': 7,
3272 'TV-G': 0,
3273 'TV-PG': 0,
3274 'TV-14': 14,
3275 'TV-MA': 17,
a8795327
S
3276}
3277
3278
146c80e2 3279def parse_age_limit(s):
19a03940 3280 # isinstance(False, int) is True. So type() must be used instead
c487cf00 3281 if type(s) is int: # noqa: E721
a8795327 3282 return s if 0 <= s <= 21 else None
19a03940 3283 elif not isinstance(s, str):
d838b1bd 3284 return None
146c80e2 3285 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
3286 if m:
3287 return int(m.group('age'))
5c5fae6d 3288 s = s.upper()
a8795327
S
3289 if s in US_RATINGS:
3290 return US_RATINGS[s]
5a16c9d9 3291 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 3292 if m:
5a16c9d9 3293 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 3294 return None
146c80e2
S
3295
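# Editor's illustrative sketch (not part of the upstream file).
assert parse_age_limit('PG-13') == 13   # US_RATINGS lookup
assert parse_age_limit('TV-MA') == 17   # TV_PARENTAL_GUIDELINES lookup
assert parse_age_limit('18+') == 18     # bare "NN+" form
assert parse_age_limit(18) == 18        # ints between 0 and 21 pass through unchanged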
3296
fac55558 3297def strip_jsonp(code):
609a61e3 3298 return re.sub(
5552c9eb 3299 r'''(?sx)^
e9c671d5 3300 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
3301 (?:\s*&&\s*(?P=func_name))?
3302 \s*\(\s*(?P<callback_data>.*)\);?
3303 \s*?(?://[^\n]*)*$''',
3304 r'\g<callback_data>', code)
478c2c61
PH
3305
3306
8f53dc44 3307def js_to_json(code, vars={}, *, strict=False):
5c610515 3308 # vars is a dict of var, val pairs to substitute
0898c5c8 3309 STRING_QUOTES = '\'"`'
a71b812f 3310 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
c843e685 3311 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
86e5f3ed 3312 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
4195096e 3313 INTEGER_TABLE = (
86e5f3ed 3314 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3315 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
4195096e
S
3316 )
3317
a71b812f
SS
3318 def process_escape(match):
3319 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3320 escape = match.group(1) or match.group(2)
3321
3322 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3323 else R'\u00' if escape == 'x'
3324 else '' if escape == '\n'
3325 else escape)
3326
0898c5c8
SS
3327 def template_substitute(match):
3328 evaluated = js_to_json(match.group(1), vars, strict=strict)
3329 if evaluated[0] == '"':
3330 return json.loads(evaluated)
3331 return evaluated
3332
e05f6939 3333 def fix_kv(m):
e7b6d122
PH
3334 v = m.group(0)
3335 if v in ('true', 'false', 'null'):
3336 return v
421ddcb8
C
3337 elif v in ('undefined', 'void 0'):
3338 return 'null'
8bdd16b4 3339 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
a71b812f
SS
3340 return ''
3341
3342 if v[0] in STRING_QUOTES:
0898c5c8
SS
3343 v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
3344 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
a71b812f
SS
3345 return f'"{escaped}"'
3346
3347 for regex, base in INTEGER_TABLE:
3348 im = re.match(regex, v)
3349 if im:
3350 i = int(im.group(1), base)
3351 return f'"{i}":' if v.endswith(':') else str(i)
3352
3353 if v in vars:
d5f043d1
C
3354 try:
3355 if not strict:
3356 json.loads(vars[v])
08e29b9f 3357 except json.JSONDecodeError:
d5f043d1
C
3358 return json.dumps(vars[v])
3359 else:
3360 return vars[v]
89ac4a19 3361
a71b812f
SS
3362 if not strict:
3363 return f'"{v}"'
5c610515 3364
a71b812f 3365 raise ValueError(f'Unknown value: {v}')
e05f6939 3366
8072ef2b 3367 def create_map(mobj):
3368 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3369
8072ef2b 3370 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
8f53dc44 3371 if not strict:
3372 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
f55523cf 3373 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
389896df 3374 code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
3375 code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
febff4c1 3376
a71b812f
SS
3377 return re.sub(rf'''(?sx)
3378 {STRING_RE}|
3379 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
421ddcb8 3380 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
a71b812f
SS
3381 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3382 [0-9]+(?={SKIP_RE}:)|
8bdd16b4 3383 !+
a71b812f 3384 ''', fix_kv, code)
e05f6939
PH
3385
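# Editor's illustrative sketch (not part of the upstream file): js_to_json() turns
# loose JavaScript object/array literals into strict JSON that json.loads accepts.
assert json.loads(js_to_json("{abc_def: 'val'}")) == {'abc_def': 'val'}
assert json.loads(js_to_json('["a", "b",]')) == ['a', 'b']  # trailing comma is dropped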
3386
478c2c61
PH
3387def qualities(quality_ids):
3388 """ Get a numeric quality value out of a list of possible values """
3389 def q(qid):
3390 try:
3391 return quality_ids.index(qid)
3392 except ValueError:
3393 return -1
3394 return q
3395
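# Editor's illustrative sketch (not part of the upstream file): a higher index in
# the preference list means a higher quality; unknown values map to -1. The
# _-prefixed name exists only for this example.
_q = qualities(['144p', '360p', '720p'])
assert _q('720p') == 2 and _q('144p') == 0 and _q('480p') == -1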
acd69589 3396
119e40ef 3397POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
1e43a6f7 3398
3399
de6000d9 3400DEFAULT_OUTTMPL = {
3401 'default': '%(title)s [%(id)s].%(ext)s',
72755351 3402 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 3403}
3404OUTTMPL_TYPES = {
72755351 3405 'chapter': None,
de6000d9 3406 'subtitle': None,
3407 'thumbnail': None,
3408 'description': 'description',
3409 'annotation': 'annotations.xml',
3410 'infojson': 'info.json',
08438d2c 3411 'link': None,
3b603dbd 3412 'pl_video': None,
5112f26a 3413 'pl_thumbnail': None,
de6000d9 3414 'pl_description': 'description',
3415 'pl_infojson': 'info.json',
3416}
0a871f68 3417
143db31d 3418# As of [1] format syntax is:
3419# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3420# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 3421STR_FORMAT_RE_TMPL = r'''(?x)
3422 (?<!%)(?P<prefix>(?:%%)*)
143db31d 3423 %
524e2e4f 3424 (?P<has_key>\((?P<key>{0})\))?
752cda38 3425 (?P<format>
524e2e4f 3426 (?P<conversion>[#0\-+ ]+)?
3427 (?P<min_width>\d+)?
3428 (?P<precision>\.\d+)?
3429 (?P<len_mod>[hlL])? # unused in python
901130bb 3430 {1} # conversion type
752cda38 3431 )
143db31d 3432'''
3433
7d1eb38a 3434
901130bb 3435STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3436
7d1eb38a 3437
a020a0dc
PH
3438def limit_length(s, length):
3439 """ Add ellipses to overly long strings """
3440 if s is None:
3441 return None
3442 ELLIPSES = '...'
3443 if len(s) > length:
3444 return s[:length - len(ELLIPSES)] + ELLIPSES
3445 return s
48844745
PH
3446
3447
3448def version_tuple(v):
5f9b8394 3449 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
3450
3451
3452def is_outdated_version(version, limit, assume_new=True):
3453 if not version:
3454 return not assume_new
3455 try:
3456 return version_tuple(version) < version_tuple(limit)
3457 except ValueError:
3458 return not assume_new
732ea2f0
PH
3459
3460
3461def ytdl_is_updateable():
7a5c1cfe 3462 """ Returns if yt-dlp can be updated with -U """
735d865e 3463
69bec673 3464 from ..update import is_non_updateable
732ea2f0 3465
5d535b4a 3466 return not is_non_updateable()
7d4111ed
PH
3467
3468
3469def args_to_str(args):
3470 # Get a short string representation for a subprocess command
702ccf2d 3471 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
3472
3473
a44ca5a4 3474def error_to_str(err):
3475 return f'{type(err).__name__}: {err}'
3476
3477
2647c933 3478def mimetype2ext(mt, default=NO_DEFAULT):
3479 if not isinstance(mt, str):
3480 if default is not NO_DEFAULT:
3481 return default
eb9ee194
S
3482 return None
3483
2647c933 3484 MAP = {
3485 # video
f6861ec9 3486 '3gpp': '3gp',
2647c933 3487 'mp2t': 'ts',
3488 'mp4': 'mp4',
3489 'mpeg': 'mpeg',
3490 'mpegurl': 'm3u8',
3491 'quicktime': 'mov',
3492 'webm': 'webm',
3493 'vp9': 'vp9',
f6861ec9 3494 'x-flv': 'flv',
2647c933 3495 'x-m4v': 'm4v',
3496 'x-matroska': 'mkv',
3497 'x-mng': 'mng',
a0d8d704 3498 'x-mp4-fragmented': 'mp4',
2647c933 3499 'x-ms-asf': 'asf',
a0d8d704 3500 'x-ms-wmv': 'wmv',
2647c933 3501 'x-msvideo': 'avi',
3502
3503 # application (streaming playlists)
b4173f15 3504 'dash+xml': 'mpd',
b4173f15 3505 'f4m+xml': 'f4m',
f164b971 3506 'hds+xml': 'f4m',
2647c933 3507 'vnd.apple.mpegurl': 'm3u8',
e910fe2f 3508 'vnd.ms-sstr+xml': 'ism',
2647c933 3509 'x-mpegurl': 'm3u8',
3510
3511 # audio
3512 'audio/mp4': 'm4a',
3513 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
3514 # Using .mp3 as it's the most popular one
3515 'audio/mpeg': 'mp3',
d80ca5de 3516 'audio/webm': 'webm',
2647c933 3517 'audio/x-matroska': 'mka',
3518 'audio/x-mpegurl': 'm3u',
3519 'midi': 'mid',
3520 'ogg': 'ogg',
3521 'wav': 'wav',
3522 'wave': 'wav',
3523 'x-aac': 'aac',
3524 'x-flac': 'flac',
3525 'x-m4a': 'm4a',
3526 'x-realaudio': 'ra',
39e7107d 3527 'x-wav': 'wav',
9359f3d4 3528
2647c933 3529 # image
3530 'avif': 'avif',
3531 'bmp': 'bmp',
3532 'gif': 'gif',
3533 'jpeg': 'jpg',
3534 'png': 'png',
3535 'svg+xml': 'svg',
3536 'tiff': 'tif',
3537 'vnd.wap.wbmp': 'wbmp',
3538 'webp': 'webp',
3539 'x-icon': 'ico',
3540 'x-jng': 'jng',
3541 'x-ms-bmp': 'bmp',
3542
3543 # caption
3544 'filmstrip+json': 'fs',
3545 'smptett+xml': 'tt',
3546 'ttaf+xml': 'dfxp',
3547 'ttml+xml': 'ttml',
3548 'x-ms-sami': 'sami',
9359f3d4 3549
2647c933 3550 # misc
3551 'gzip': 'gz',
9359f3d4
F
3552 'json': 'json',
3553 'xml': 'xml',
3554 'zip': 'zip',
9359f3d4
F
3555 }
3556
2647c933 3557 mimetype = mt.partition(';')[0].strip().lower()
3558 _, _, subtype = mimetype.rpartition('/')
9359f3d4 3559
69bec673 3560 ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
2647c933 3561 if ext:
3562 return ext
3563 elif default is not NO_DEFAULT:
3564 return default
9359f3d4 3565 return subtype.replace('+', '.')
c460bdd5
PH
3566
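# Editor's illustrative sketch (not part of the upstream file): parameters after
# ';' are ignored and unknown subtypes fall back to the subtype itself.
assert mimetype2ext('video/mp4') == 'mp4'
assert mimetype2ext('audio/x-wav;codec=pcm') == 'wav'
assert mimetype2ext('text/vtt') == 'vtt'                        # not in MAP, subtype passthrough
assert mimetype2ext(None, default='unknown_video') == 'unknown_video'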
3567
2814f12b
THD
3568def ext2mimetype(ext_or_url):
3569 if not ext_or_url:
3570 return None
3571 if '.' not in ext_or_url:
3572 ext_or_url = f'file.{ext_or_url}'
3573 return mimetypes.guess_type(ext_or_url)[0]
3574
3575
4f3c5e06 3576def parse_codecs(codecs_str):
3577 # http://tools.ietf.org/html/rfc6381
3578 if not codecs_str:
3579 return {}
a0566bbf 3580 split_codecs = list(filter(None, map(
dbf5416a 3581 str.strip, codecs_str.strip().strip(',').split(','))))
3fe75fdc 3582 vcodec, acodec, scodec, hdr = None, None, None, None
a0566bbf 3583 for full_codec in split_codecs:
d816f61f 3584 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3585 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3586 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3587 if vcodec:
3588 continue
3589 vcodec = full_codec
3590 if parts[0] in ('dvh1', 'dvhe'):
3591 hdr = 'DV'
69bec673 3592 elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
d816f61f 3593 hdr = 'HDR10'
3594 elif parts[:2] == ['vp9', '2']:
3595 hdr = 'HDR10'
71082216 3596 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
d816f61f 3597 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3598 acodec = acodec or full_codec
3599 elif parts[0] in ('stpp', 'wvtt'):
3600 scodec = scodec or full_codec
4f3c5e06 3601 else:
19a03940 3602 write_string(f'WARNING: Unknown codec {full_codec}\n')
3fe75fdc 3603 if vcodec or acodec or scodec:
4f3c5e06 3604 return {
3605 'vcodec': vcodec or 'none',
3606 'acodec': acodec or 'none',
176f1866 3607 'dynamic_range': hdr,
3fe75fdc 3608 **({'scodec': scodec} if scodec is not None else {}),
4f3c5e06 3609 }
b69fd25c 3610 elif len(split_codecs) == 2:
3611 return {
3612 'vcodec': split_codecs[0],
3613 'acodec': split_codecs[1],
3614 }
4f3c5e06 3615 return {}
3616
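# Editor's illustrative sketch (not part of the upstream file; the expected
# values mirror cases from the project's test suite).
assert parse_codecs('avc1.77.30, mp4a.40.2') == {
    'vcodec': 'avc1.77.30', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
assert parse_codecs('dvh1.05.01') == {
    'vcodec': 'dvh1.05.01', 'acodec': 'none', 'dynamic_range': 'DV'}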
3617
fc61aff4
LL
3618def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3619 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3620
3621 allow_mkv = not preferences or 'mkv' in preferences
3622
3623 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
 3624 return 'mkv' # TODO: does any other format allow this?
3625
 3626 # TODO: Not all codecs supported by parse_codecs are handled here
3627 COMPATIBLE_CODECS = {
3628 'mp4': {
71082216 3629 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
81b6102d 3630 'h264', 'aacl', 'ec-3', # Set in ISM
fc61aff4
LL
3631 },
3632 'webm': {
3633 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3634 'vp9x', 'vp8x', # in the webm spec
3635 },
3636 }
3637
69bec673 3638 sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
8f84770a 3639 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
fc61aff4
LL
3640
3641 for ext in preferences or COMPATIBLE_CODECS.keys():
3642 codec_set = COMPATIBLE_CODECS.get(ext, set())
3643 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3644 return ext
3645
3646 COMPATIBLE_EXTS = (
3647 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
fbb73833 3648 {'webm', 'weba'},
fc61aff4
LL
3649 )
3650 for ext in preferences or vexts:
3651 current_exts = {ext, *vexts, *aexts}
3652 if ext == 'mkv' or current_exts == {ext} or any(
3653 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3654 return ext
3655 return 'mkv' if allow_mkv else preferences[-1]
3656
3657
2647c933 3658def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
79298173 3659 getheader = url_handle.headers.get
2ccd1b10 3660
b55ee18f
PH
3661 cd = getheader('Content-Disposition')
3662 if cd:
3663 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3664 if m:
3665 e = determine_ext(m.group('filename'), default_ext=None)
3666 if e:
3667 return e
3668
2647c933 3669 meta_ext = getheader('x-amz-meta-name')
3670 if meta_ext:
3671 e = meta_ext.rpartition('.')[2]
3672 if e:
3673 return e
3674
3675 return mimetype2ext(getheader('Content-Type'), default=default)
05900629
PH
3676
3677
1e399778
YCH
3678def encode_data_uri(data, mime_type):
3679 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3680
3681
05900629 3682def age_restricted(content_limit, age_limit):
6ec6cb4e 3683 """ Returns True iff the content should be blocked """
05900629
PH
3684
3685 if age_limit is None: # No limit set
3686 return False
3687 if content_limit is None:
3688 return False # Content available for everyone
3689 return age_limit < content_limit
61ca9a80
PH
3690
3691
88f60feb 3692# List of known byte-order-marks (BOM)
a904a7f8
L
3693BOMS = [
3694 (b'\xef\xbb\xbf', 'utf-8'),
3695 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3696 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3697 (b'\xff\xfe', 'utf-16-le'),
3698 (b'\xfe\xff', 'utf-16-be'),
3699]
a904a7f8
L
3700
3701
61ca9a80
PH
3702def is_html(first_bytes):
3703 """ Detect whether a file contains HTML by examining its first bytes. """
3704
80e8493e 3705 encoding = 'utf-8'
61ca9a80 3706 for bom, enc in BOMS:
80e8493e 3707 while first_bytes.startswith(bom):
3708 encoding, first_bytes = enc, first_bytes[len(bom):]
61ca9a80 3709
80e8493e 3710 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
a055469f
PH
3711
3712
3713def determine_protocol(info_dict):
3714 protocol = info_dict.get('protocol')
3715 if protocol is not None:
3716 return protocol
3717
7de837a5 3718 url = sanitize_url(info_dict['url'])
a055469f
PH
3719 if url.startswith('rtmp'):
3720 return 'rtmp'
3721 elif url.startswith('mms'):
3722 return 'mms'
3723 elif url.startswith('rtsp'):
3724 return 'rtsp'
3725
3726 ext = determine_ext(url)
3727 if ext == 'm3u8':
deae7c17 3728 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
a055469f
PH
3729 elif ext == 'f4m':
3730 return 'f4m'
3731
14f25df2 3732 return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3733
3734
c5e3f849 3735def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3736 """ Render a list of rows, each as a list of values.
3737 Text after a \t will be right aligned """
ec11a9f4 3738 def width(string):
c5e3f849 3739 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3740
3741 def get_max_lens(table):
ec11a9f4 3742 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3743
3744 def filter_using_list(row, filterArray):
d16df59d 3745 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
76d321f6 3746
d16df59d 3747 max_lens = get_max_lens(data) if hide_empty else []
3748 header_row = filter_using_list(header_row, max_lens)
3749 data = [filter_using_list(row, max_lens) for row in data]
76d321f6 3750
cfb56d1a 3751 table = [header_row] + data
76d321f6 3752 max_lens = get_max_lens(table)
c5e3f849 3753 extra_gap += 1
76d321f6 3754 if delim:
c5e3f849 3755 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
1ed7953a 3756 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
ec11a9f4 3757 for row in table:
3758 for pos, text in enumerate(map(str, row)):
c5e3f849 3759 if '\t' in text:
3760 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3761 else:
3762 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3763 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3764 return ret
347de493
PH
3765
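# Editor's illustrative sketch (not part of the upstream file): each column is
# padded to its widest cell plus one space, and trailing whitespace is stripped.
assert render_table(['a', 'bcd'], [[123, 4], [9999, 51]]) == (
    'a    bcd\n'
    '123  4\n'
    '9999 51')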
3766
8f18aca8 3767def _match_one(filter_part, dct, incomplete):
77b87f05 3768 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3769 STRING_OPERATORS = {
3770 '*=': operator.contains,
3771 '^=': lambda attr, value: attr.startswith(value),
3772 '$=': lambda attr, value: attr.endswith(value),
3773 '~=': lambda attr, value: re.search(value, attr),
3774 }
347de493 3775 COMPARISON_OPERATORS = {
a047eeb6 3776 **STRING_OPERATORS,
3777 '<=': operator.le, # "<=" must be defined above "<"
347de493 3778 '<': operator.lt,
347de493 3779 '>=': operator.ge,
a047eeb6 3780 '>': operator.gt,
347de493 3781 '=': operator.eq,
347de493 3782 }
a047eeb6 3783
6db9c4d5 3784 if isinstance(incomplete, bool):
3785 is_incomplete = lambda _: incomplete
3786 else:
3787 is_incomplete = lambda k: k in incomplete
3788
64fa820c 3789 operator_rex = re.compile(r'''(?x)
347de493 3790 (?P<key>[a-z_]+)
77b87f05 3791 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3792 (?:
a047eeb6 3793 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3794 (?P<strval>.+?)
347de493 3795 )
347de493 3796 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3797 m = operator_rex.fullmatch(filter_part.strip())
347de493 3798 if m:
18f96d12 3799 m = m.groupdict()
3800 unnegated_op = COMPARISON_OPERATORS[m['op']]
3801 if m['negation']:
77b87f05
MT
3802 op = lambda attr, value: not unnegated_op(attr, value)
3803 else:
3804 op = unnegated_op
18f96d12 3805 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3806 if m['quote']:
3807 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3808 actual_value = dct.get(m['key'])
3809 numeric_comparison = None
f9934b96 3810 if isinstance(actual_value, (int, float)):
e5a088dc
S
3811 # If the original field is a string and matching comparisonvalue is
3812 # a number we should respect the origin of the original field
3813 # and process comparison value as a string (see
18f96d12 3814 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3815 try:
18f96d12 3816 numeric_comparison = int(comparison_value)
347de493 3817 except ValueError:
18f96d12 3818 numeric_comparison = parse_filesize(comparison_value)
3819 if numeric_comparison is None:
3820 numeric_comparison = parse_filesize(f'{comparison_value}B')
3821 if numeric_comparison is None:
3822 numeric_comparison = parse_duration(comparison_value)
3823 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3824 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3825 if actual_value is None:
6db9c4d5 3826 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3827 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3828
3829 UNARY_OPERATORS = {
1cc47c66
S
3830 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3831 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3832 }
64fa820c 3833 operator_rex = re.compile(r'''(?x)
347de493 3834 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3835 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3836 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3837 if m:
3838 op = UNARY_OPERATORS[m.group('op')]
3839 actual_value = dct.get(m.group('key'))
6db9c4d5 3840 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3841 return True
347de493
PH
3842 return op(actual_value)
3843
3844 raise ValueError('Invalid filter part %r' % filter_part)
3845
3846
8f18aca8 3847def match_str(filter_str, dct, incomplete=False):
6db9c4d5 3848 """ Filter a dictionary with a simple string syntax.
3849 @returns Whether the filter passes
3850 @param incomplete Set of keys that is expected to be missing from dct.
3851 Can be True/False to indicate all/none of the keys may be missing.
3852 All conditions on incomplete keys pass if the key is missing
8f18aca8 3853 """
347de493 3854 return all(
8f18aca8 3855 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3856 for filter_part in re.split(r'(?<!\\)&', filter_str))
347de493
PH
3857
3858
fe2ce85a 3859def match_filter_func(filters, breaking_filters=None):
3860 if not filters and not breaking_filters:
d1b5f70b 3861 return None
fe2ce85a 3862 breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
3863 filters = set(variadic(filters or []))
d1b5f70b 3864
492272fe 3865 interactive = '-' in filters
3866 if interactive:
3867 filters.remove('-')
3868
3869 def _match_func(info_dict, incomplete=False):
fe2ce85a 3870 ret = breaking_filters(info_dict, incomplete)
3871 if ret is not None:
3872 raise RejectedVideoReached(ret)
3873
492272fe 3874 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3875 return NO_DEFAULT if interactive and not incomplete else None
347de493 3876 else:
3bec830a 3877 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
b1a7cd05 3878 filter_str = ') | ('.join(map(str.strip, filters))
3879 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
347de493 3880 return _match_func
91410c9b
PH
3881
3882
f2df4071 3883class download_range_func:
3884 def __init__(self, chapters, ranges):
3885 self.chapters, self.ranges = chapters, ranges
3886
3887 def __call__(self, info_dict, ydl):
0500ee3d 3888 if not self.ranges and not self.chapters:
3889 yield {}
3890
5ec1b6b7 3891 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
56ba69e4 3892 else 'Cannot match chapters since chapter information is unavailable')
f2df4071 3893 for regex in self.chapters or []:
5ec1b6b7 3894 for i, chapter in enumerate(info_dict.get('chapters') or []):
3895 if re.search(regex, chapter['title']):
3896 warning = None
3897 yield {**chapter, 'index': i}
f2df4071 3898 if self.chapters and warning:
5ec1b6b7 3899 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3900
f2df4071 3901 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
5ec1b6b7 3902
f2df4071 3903 def __eq__(self, other):
3904 return (isinstance(other, download_range_func)
3905 and self.chapters == other.chapters and self.ranges == other.ranges)
5ec1b6b7 3906
71df9b7f 3907 def __repr__(self):
a5387729 3908 return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
71df9b7f 3909
5ec1b6b7 3910
bf6427d2
YCH
3911def parse_dfxp_time_expr(time_expr):
3912 if not time_expr:
d631d5f9 3913 return
bf6427d2 3914
1d485a1a 3915 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
bf6427d2
YCH
3916 if mobj:
3917 return float(mobj.group('time_offset'))
3918
db2fe38b 3919 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3920 if mobj:
db2fe38b 3921 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
3922
3923
c1c924ab 3924def srt_subtitles_timecode(seconds):
aa7785f8 3925 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3926
3927
3928def ass_subtitles_timecode(seconds):
3929 time = timetuple_from_msec(seconds * 1000)
3930 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
bf6427d2
YCH
3931
3932
3933def dfxp2srt(dfxp_data):
3869028f
YCH
3934 '''
3935 @param dfxp_data A bytes-like object containing DFXP data
3936 @returns A unicode object containing converted SRT data
3937 '''
5b995f71 3938 LEGACY_NAMESPACES = (
3869028f
YCH
3939 (b'http://www.w3.org/ns/ttml', [
3940 b'http://www.w3.org/2004/11/ttaf1',
3941 b'http://www.w3.org/2006/04/ttaf1',
3942 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3943 ]),
3869028f
YCH
3944 (b'http://www.w3.org/ns/ttml#styling', [
3945 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
3946 ]),
3947 )
3948
3949 SUPPORTED_STYLING = [
3950 'color',
3951 'fontFamily',
3952 'fontSize',
3953 'fontStyle',
3954 'fontWeight',
3955 'textDecoration'
3956 ]
3957
4e335771 3958 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 3959 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 3960 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 3961 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 3962 })
bf6427d2 3963
5b995f71
RA
3964 styles = {}
3965 default_style = {}
3966
86e5f3ed 3967 class TTMLPElementParser:
5b995f71
RA
3968 _out = ''
3969 _unclosed_elements = []
3970 _applied_styles = []
bf6427d2 3971
2b14cb56 3972 def start(self, tag, attrib):
5b995f71
RA
3973 if tag in (_x('ttml:br'), 'br'):
3974 self._out += '\n'
3975 else:
3976 unclosed_elements = []
3977 style = {}
3978 element_style_id = attrib.get('style')
3979 if default_style:
3980 style.update(default_style)
3981 if element_style_id:
3982 style.update(styles.get(element_style_id, {}))
3983 for prop in SUPPORTED_STYLING:
3984 prop_val = attrib.get(_x('tts:' + prop))
3985 if prop_val:
3986 style[prop] = prop_val
3987 if style:
3988 font = ''
3989 for k, v in sorted(style.items()):
3990 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3991 continue
3992 if k == 'color':
3993 font += ' color="%s"' % v
3994 elif k == 'fontSize':
3995 font += ' size="%s"' % v
3996 elif k == 'fontFamily':
3997 font += ' face="%s"' % v
3998 elif k == 'fontWeight' and v == 'bold':
3999 self._out += '<b>'
4000 unclosed_elements.append('b')
4001 elif k == 'fontStyle' and v == 'italic':
4002 self._out += '<i>'
4003 unclosed_elements.append('i')
4004 elif k == 'textDecoration' and v == 'underline':
4005 self._out += '<u>'
4006 unclosed_elements.append('u')
4007 if font:
4008 self._out += '<font' + font + '>'
4009 unclosed_elements.append('font')
4010 applied_style = {}
4011 if self._applied_styles:
4012 applied_style.update(self._applied_styles[-1])
4013 applied_style.update(style)
4014 self._applied_styles.append(applied_style)
4015 self._unclosed_elements.append(unclosed_elements)
bf6427d2 4016
2b14cb56 4017 def end(self, tag):
5b995f71
RA
4018 if tag not in (_x('ttml:br'), 'br'):
4019 unclosed_elements = self._unclosed_elements.pop()
4020 for element in reversed(unclosed_elements):
4021 self._out += '</%s>' % element
4022 if unclosed_elements and self._applied_styles:
4023 self._applied_styles.pop()
bf6427d2 4024
2b14cb56 4025 def data(self, data):
5b995f71 4026 self._out += data
2b14cb56 4027
4028 def close(self):
5b995f71 4029 return self._out.strip()
2b14cb56 4030
6a765f13 4031 # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
4032 # This will not trigger false positives since only UTF-8 text is being replaced
4033 dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
4034
2b14cb56 4035 def parse_node(node):
4036 target = TTMLPElementParser()
4037 parser = xml.etree.ElementTree.XMLParser(target=target)
4038 parser.feed(xml.etree.ElementTree.tostring(node))
4039 return parser.close()
bf6427d2 4040
5b995f71
RA
4041 for k, v in LEGACY_NAMESPACES:
4042 for ns in v:
4043 dfxp_data = dfxp_data.replace(ns, k)
4044
3869028f 4045 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 4046 out = []
5b995f71 4047 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
4048
4049 if not paras:
4050 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 4051
5b995f71
RA
4052 repeat = False
4053 while True:
4054 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
4055 style_id = style.get('id') or style.get(_x('xml:id'))
4056 if not style_id:
4057 continue
5b995f71
RA
4058 parent_style_id = style.get('style')
4059 if parent_style_id:
4060 if parent_style_id not in styles:
4061 repeat = True
4062 continue
4063 styles[style_id] = styles[parent_style_id].copy()
4064 for prop in SUPPORTED_STYLING:
4065 prop_val = style.get(_x('tts:' + prop))
4066 if prop_val:
4067 styles.setdefault(style_id, {})[prop] = prop_val
4068 if repeat:
4069 repeat = False
4070 else:
4071 break
4072
4073 for p in ('body', 'div'):
4074 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4075 if ele is None:
4076 continue
4077 style = styles.get(ele.get('style'))
4078 if not style:
4079 continue
4080 default_style.update(style)
4081
bf6427d2 4082 for para, index in zip(paras, itertools.count(1)):
d631d5f9 4083 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 4084 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
4085 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4086 if begin_time is None:
4087 continue
7dff0363 4088 if not end_time:
d631d5f9
YCH
4089 if not dur:
4090 continue
4091 end_time = begin_time + dur
bf6427d2
YCH
4092 out.append('%d\n%s --> %s\n%s\n\n' % (
4093 index,
c1c924ab
YCH
4094 srt_subtitles_timecode(begin_time),
4095 srt_subtitles_timecode(end_time),
bf6427d2
YCH
4096 parse_node(para)))
4097
4098 return ''.join(out)
4099
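# Editor's illustrative sketch (not part of the upstream file): a minimal TTML
# document converted to SRT. The expected string was worked out by hand from the
# implementation above; the _-prefixed name exists only for this example.
_dfxp = b'''<?xml version="1.0" encoding="utf-8"?>
<tt xmlns="http://www.w3.org/ns/ttml">
  <body><div>
    <p begin="1" end="2.5">Hello</p>
  </div></body>
</tt>'''
assert dfxp2srt(_dfxp) == '1\n00:00:01,000 --> 00:00:02,500\nHello\n\n'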
4100
c487cf00 4101def cli_option(params, command_option, param, separator=None):
66e289ba 4102 param = params.get(param)
c487cf00 4103 return ([] if param is None
4104 else [command_option, str(param)] if separator is None
4105 else [f'{command_option}{separator}{param}'])
66e289ba
S
4106
4107
4108def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4109 param = params.get(param)
c487cf00 4110 assert param in (True, False, None)
4111 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
66e289ba
S
4112
4113
4114def cli_valueless_option(params, command_option, param, expected_value=True):
c487cf00 4115 return [command_option] if params.get(param) == expected_value else []
66e289ba
S
4116
4117
e92caff5 4118def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 4119 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 4120 if use_compat:
5b1ecbb3 4121 return argdict
4122 else:
4123 argdict = None
eab9b2bc 4124 if argdict is None:
5b1ecbb3 4125 return default
eab9b2bc 4126 assert isinstance(argdict, dict)
4127
e92caff5 4128 assert isinstance(keys, (list, tuple))
4129 for key_list in keys:
e92caff5 4130 arg_list = list(filter(
4131 lambda x: x is not None,
6606817a 4132 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 4133 if arg_list:
4134 return [arg for args in arg_list for arg in args]
4135 return default
66e289ba 4136
6251555f 4137
330690a2 4138def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4139 main_key, exe = main_key.lower(), exe.lower()
4140 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4141 keys = [f'{root_key}{k}' for k in (keys or [''])]
4142 if root_key in keys:
4143 if main_key != exe:
4144 keys.append((main_key, exe))
4145 keys.append('default')
4146 else:
4147 use_compat = False
4148 return cli_configuration_args(argdict, keys, default, use_compat)
4149
66e289ba 4150
86e5f3ed 4151class ISO639Utils:
39672624
YCH
4152 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4153 _lang_map = {
4154 'aa': 'aar',
4155 'ab': 'abk',
4156 'ae': 'ave',
4157 'af': 'afr',
4158 'ak': 'aka',
4159 'am': 'amh',
4160 'an': 'arg',
4161 'ar': 'ara',
4162 'as': 'asm',
4163 'av': 'ava',
4164 'ay': 'aym',
4165 'az': 'aze',
4166 'ba': 'bak',
4167 'be': 'bel',
4168 'bg': 'bul',
4169 'bh': 'bih',
4170 'bi': 'bis',
4171 'bm': 'bam',
4172 'bn': 'ben',
4173 'bo': 'bod',
4174 'br': 'bre',
4175 'bs': 'bos',
4176 'ca': 'cat',
4177 'ce': 'che',
4178 'ch': 'cha',
4179 'co': 'cos',
4180 'cr': 'cre',
4181 'cs': 'ces',
4182 'cu': 'chu',
4183 'cv': 'chv',
4184 'cy': 'cym',
4185 'da': 'dan',
4186 'de': 'deu',
4187 'dv': 'div',
4188 'dz': 'dzo',
4189 'ee': 'ewe',
4190 'el': 'ell',
4191 'en': 'eng',
4192 'eo': 'epo',
4193 'es': 'spa',
4194 'et': 'est',
4195 'eu': 'eus',
4196 'fa': 'fas',
4197 'ff': 'ful',
4198 'fi': 'fin',
4199 'fj': 'fij',
4200 'fo': 'fao',
4201 'fr': 'fra',
4202 'fy': 'fry',
4203 'ga': 'gle',
4204 'gd': 'gla',
4205 'gl': 'glg',
4206 'gn': 'grn',
4207 'gu': 'guj',
4208 'gv': 'glv',
4209 'ha': 'hau',
4210 'he': 'heb',
b7acc835 4211 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
4212 'hi': 'hin',
4213 'ho': 'hmo',
4214 'hr': 'hrv',
4215 'ht': 'hat',
4216 'hu': 'hun',
4217 'hy': 'hye',
4218 'hz': 'her',
4219 'ia': 'ina',
4220 'id': 'ind',
b7acc835 4221 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
4222 'ie': 'ile',
4223 'ig': 'ibo',
4224 'ii': 'iii',
4225 'ik': 'ipk',
4226 'io': 'ido',
4227 'is': 'isl',
4228 'it': 'ita',
4229 'iu': 'iku',
4230 'ja': 'jpn',
4231 'jv': 'jav',
4232 'ka': 'kat',
4233 'kg': 'kon',
4234 'ki': 'kik',
4235 'kj': 'kua',
4236 'kk': 'kaz',
4237 'kl': 'kal',
4238 'km': 'khm',
4239 'kn': 'kan',
4240 'ko': 'kor',
4241 'kr': 'kau',
4242 'ks': 'kas',
4243 'ku': 'kur',
4244 'kv': 'kom',
4245 'kw': 'cor',
4246 'ky': 'kir',
4247 'la': 'lat',
4248 'lb': 'ltz',
4249 'lg': 'lug',
4250 'li': 'lim',
4251 'ln': 'lin',
4252 'lo': 'lao',
4253 'lt': 'lit',
4254 'lu': 'lub',
4255 'lv': 'lav',
4256 'mg': 'mlg',
4257 'mh': 'mah',
4258 'mi': 'mri',
4259 'mk': 'mkd',
4260 'ml': 'mal',
4261 'mn': 'mon',
4262 'mr': 'mar',
4263 'ms': 'msa',
4264 'mt': 'mlt',
4265 'my': 'mya',
4266 'na': 'nau',
4267 'nb': 'nob',
4268 'nd': 'nde',
4269 'ne': 'nep',
4270 'ng': 'ndo',
4271 'nl': 'nld',
4272 'nn': 'nno',
4273 'no': 'nor',
4274 'nr': 'nbl',
4275 'nv': 'nav',
4276 'ny': 'nya',
4277 'oc': 'oci',
4278 'oj': 'oji',
4279 'om': 'orm',
4280 'or': 'ori',
4281 'os': 'oss',
4282 'pa': 'pan',
4283 'pi': 'pli',
4284 'pl': 'pol',
4285 'ps': 'pus',
4286 'pt': 'por',
4287 'qu': 'que',
4288 'rm': 'roh',
4289 'rn': 'run',
4290 'ro': 'ron',
4291 'ru': 'rus',
4292 'rw': 'kin',
4293 'sa': 'san',
4294 'sc': 'srd',
4295 'sd': 'snd',
4296 'se': 'sme',
4297 'sg': 'sag',
4298 'si': 'sin',
4299 'sk': 'slk',
4300 'sl': 'slv',
4301 'sm': 'smo',
4302 'sn': 'sna',
4303 'so': 'som',
4304 'sq': 'sqi',
4305 'sr': 'srp',
4306 'ss': 'ssw',
4307 'st': 'sot',
4308 'su': 'sun',
4309 'sv': 'swe',
4310 'sw': 'swa',
4311 'ta': 'tam',
4312 'te': 'tel',
4313 'tg': 'tgk',
4314 'th': 'tha',
4315 'ti': 'tir',
4316 'tk': 'tuk',
4317 'tl': 'tgl',
4318 'tn': 'tsn',
4319 'to': 'ton',
4320 'tr': 'tur',
4321 'ts': 'tso',
4322 'tt': 'tat',
4323 'tw': 'twi',
4324 'ty': 'tah',
4325 'ug': 'uig',
4326 'uk': 'ukr',
4327 'ur': 'urd',
4328 'uz': 'uzb',
4329 've': 'ven',
4330 'vi': 'vie',
4331 'vo': 'vol',
4332 'wa': 'wln',
4333 'wo': 'wol',
4334 'xh': 'xho',
4335 'yi': 'yid',
e9a50fba 4336 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
4337 'yo': 'yor',
4338 'za': 'zha',
4339 'zh': 'zho',
4340 'zu': 'zul',
4341 }
4342
4343 @classmethod
4344 def short2long(cls, code):
4345 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4346 return cls._lang_map.get(code[:2])
4347
4348 @classmethod
4349 def long2short(cls, code):
4350 """Convert language code from ISO 639-2/T to ISO 639-1"""
4351 for short_name, long_name in cls._lang_map.items():
4352 if long_name == code:
4353 return short_name
4354
4355
86e5f3ed 4356class ISO3166Utils:
4eb10f66
YCH
4357 # From http://data.okfn.org/data/core/country-list
4358 _country_map = {
4359 'AF': 'Afghanistan',
4360 'AX': 'Åland Islands',
4361 'AL': 'Albania',
4362 'DZ': 'Algeria',
4363 'AS': 'American Samoa',
4364 'AD': 'Andorra',
4365 'AO': 'Angola',
4366 'AI': 'Anguilla',
4367 'AQ': 'Antarctica',
4368 'AG': 'Antigua and Barbuda',
4369 'AR': 'Argentina',
4370 'AM': 'Armenia',
4371 'AW': 'Aruba',
4372 'AU': 'Australia',
4373 'AT': 'Austria',
4374 'AZ': 'Azerbaijan',
4375 'BS': 'Bahamas',
4376 'BH': 'Bahrain',
4377 'BD': 'Bangladesh',
4378 'BB': 'Barbados',
4379 'BY': 'Belarus',
4380 'BE': 'Belgium',
4381 'BZ': 'Belize',
4382 'BJ': 'Benin',
4383 'BM': 'Bermuda',
4384 'BT': 'Bhutan',
4385 'BO': 'Bolivia, Plurinational State of',
4386 'BQ': 'Bonaire, Sint Eustatius and Saba',
4387 'BA': 'Bosnia and Herzegovina',
4388 'BW': 'Botswana',
4389 'BV': 'Bouvet Island',
4390 'BR': 'Brazil',
4391 'IO': 'British Indian Ocean Territory',
4392 'BN': 'Brunei Darussalam',
4393 'BG': 'Bulgaria',
4394 'BF': 'Burkina Faso',
4395 'BI': 'Burundi',
4396 'KH': 'Cambodia',
4397 'CM': 'Cameroon',
4398 'CA': 'Canada',
4399 'CV': 'Cape Verde',
4400 'KY': 'Cayman Islands',
4401 'CF': 'Central African Republic',
4402 'TD': 'Chad',
4403 'CL': 'Chile',
4404 'CN': 'China',
4405 'CX': 'Christmas Island',
4406 'CC': 'Cocos (Keeling) Islands',
4407 'CO': 'Colombia',
4408 'KM': 'Comoros',
4409 'CG': 'Congo',
4410 'CD': 'Congo, the Democratic Republic of the',
4411 'CK': 'Cook Islands',
4412 'CR': 'Costa Rica',
4413 'CI': 'Côte d\'Ivoire',
4414 'HR': 'Croatia',
4415 'CU': 'Cuba',
4416 'CW': 'Curaçao',
4417 'CY': 'Cyprus',
4418 'CZ': 'Czech Republic',
4419 'DK': 'Denmark',
4420 'DJ': 'Djibouti',
4421 'DM': 'Dominica',
4422 'DO': 'Dominican Republic',
4423 'EC': 'Ecuador',
4424 'EG': 'Egypt',
4425 'SV': 'El Salvador',
4426 'GQ': 'Equatorial Guinea',
4427 'ER': 'Eritrea',
4428 'EE': 'Estonia',
4429 'ET': 'Ethiopia',
4430 'FK': 'Falkland Islands (Malvinas)',
4431 'FO': 'Faroe Islands',
4432 'FJ': 'Fiji',
4433 'FI': 'Finland',
4434 'FR': 'France',
4435 'GF': 'French Guiana',
4436 'PF': 'French Polynesia',
4437 'TF': 'French Southern Territories',
4438 'GA': 'Gabon',
4439 'GM': 'Gambia',
4440 'GE': 'Georgia',
4441 'DE': 'Germany',
4442 'GH': 'Ghana',
4443 'GI': 'Gibraltar',
4444 'GR': 'Greece',
4445 'GL': 'Greenland',
4446 'GD': 'Grenada',
4447 'GP': 'Guadeloupe',
4448 'GU': 'Guam',
4449 'GT': 'Guatemala',
4450 'GG': 'Guernsey',
4451 'GN': 'Guinea',
4452 'GW': 'Guinea-Bissau',
4453 'GY': 'Guyana',
4454 'HT': 'Haiti',
4455 'HM': 'Heard Island and McDonald Islands',
4456 'VA': 'Holy See (Vatican City State)',
4457 'HN': 'Honduras',
4458 'HK': 'Hong Kong',
4459 'HU': 'Hungary',
4460 'IS': 'Iceland',
4461 'IN': 'India',
4462 'ID': 'Indonesia',
4463 'IR': 'Iran, Islamic Republic of',
4464 'IQ': 'Iraq',
4465 'IE': 'Ireland',
4466 'IM': 'Isle of Man',
4467 'IL': 'Israel',
4468 'IT': 'Italy',
4469 'JM': 'Jamaica',
4470 'JP': 'Japan',
4471 'JE': 'Jersey',
4472 'JO': 'Jordan',
4473 'KZ': 'Kazakhstan',
4474 'KE': 'Kenya',
4475 'KI': 'Kiribati',
4476 'KP': 'Korea, Democratic People\'s Republic of',
4477 'KR': 'Korea, Republic of',
4478 'KW': 'Kuwait',
4479 'KG': 'Kyrgyzstan',
4480 'LA': 'Lao People\'s Democratic Republic',
4481 'LV': 'Latvia',
4482 'LB': 'Lebanon',
4483 'LS': 'Lesotho',
4484 'LR': 'Liberia',
4485 'LY': 'Libya',
4486 'LI': 'Liechtenstein',
4487 'LT': 'Lithuania',
4488 'LU': 'Luxembourg',
4489 'MO': 'Macao',
4490 'MK': 'Macedonia, the Former Yugoslav Republic of',
4491 'MG': 'Madagascar',
4492 'MW': 'Malawi',
4493 'MY': 'Malaysia',
4494 'MV': 'Maldives',
4495 'ML': 'Mali',
4496 'MT': 'Malta',
4497 'MH': 'Marshall Islands',
4498 'MQ': 'Martinique',
4499 'MR': 'Mauritania',
4500 'MU': 'Mauritius',
4501 'YT': 'Mayotte',
4502 'MX': 'Mexico',
4503 'FM': 'Micronesia, Federated States of',
4504 'MD': 'Moldova, Republic of',
4505 'MC': 'Monaco',
4506 'MN': 'Mongolia',
4507 'ME': 'Montenegro',
4508 'MS': 'Montserrat',
4509 'MA': 'Morocco',
4510 'MZ': 'Mozambique',
4511 'MM': 'Myanmar',
4512 'NA': 'Namibia',
4513 'NR': 'Nauru',
4514 'NP': 'Nepal',
4515 'NL': 'Netherlands',
4516 'NC': 'New Caledonia',
4517 'NZ': 'New Zealand',
4518 'NI': 'Nicaragua',
4519 'NE': 'Niger',
4520 'NG': 'Nigeria',
4521 'NU': 'Niue',
4522 'NF': 'Norfolk Island',
4523 'MP': 'Northern Mariana Islands',
4524 'NO': 'Norway',
4525 'OM': 'Oman',
4526 'PK': 'Pakistan',
4527 'PW': 'Palau',
4528 'PS': 'Palestine, State of',
4529 'PA': 'Panama',
4530 'PG': 'Papua New Guinea',
4531 'PY': 'Paraguay',
4532 'PE': 'Peru',
4533 'PH': 'Philippines',
4534 'PN': 'Pitcairn',
4535 'PL': 'Poland',
4536 'PT': 'Portugal',
4537 'PR': 'Puerto Rico',
4538 'QA': 'Qatar',
4539 'RE': 'Réunion',
4540 'RO': 'Romania',
4541 'RU': 'Russian Federation',
4542 'RW': 'Rwanda',
4543 'BL': 'Saint Barthélemy',
4544 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4545 'KN': 'Saint Kitts and Nevis',
4546 'LC': 'Saint Lucia',
4547 'MF': 'Saint Martin (French part)',
4548 'PM': 'Saint Pierre and Miquelon',
4549 'VC': 'Saint Vincent and the Grenadines',
4550 'WS': 'Samoa',
4551 'SM': 'San Marino',
4552 'ST': 'Sao Tome and Principe',
4553 'SA': 'Saudi Arabia',
4554 'SN': 'Senegal',
4555 'RS': 'Serbia',
4556 'SC': 'Seychelles',
4557 'SL': 'Sierra Leone',
4558 'SG': 'Singapore',
4559 'SX': 'Sint Maarten (Dutch part)',
4560 'SK': 'Slovakia',
4561 'SI': 'Slovenia',
4562 'SB': 'Solomon Islands',
4563 'SO': 'Somalia',
4564 'ZA': 'South Africa',
4565 'GS': 'South Georgia and the South Sandwich Islands',
4566 'SS': 'South Sudan',
4567 'ES': 'Spain',
4568 'LK': 'Sri Lanka',
4569 'SD': 'Sudan',
4570 'SR': 'Suriname',
4571 'SJ': 'Svalbard and Jan Mayen',
4572 'SZ': 'Swaziland',
4573 'SE': 'Sweden',
4574 'CH': 'Switzerland',
4575 'SY': 'Syrian Arab Republic',
4576 'TW': 'Taiwan, Province of China',
4577 'TJ': 'Tajikistan',
4578 'TZ': 'Tanzania, United Republic of',
4579 'TH': 'Thailand',
4580 'TL': 'Timor-Leste',
4581 'TG': 'Togo',
4582 'TK': 'Tokelau',
4583 'TO': 'Tonga',
4584 'TT': 'Trinidad and Tobago',
4585 'TN': 'Tunisia',
4586 'TR': 'Turkey',
4587 'TM': 'Turkmenistan',
4588 'TC': 'Turks and Caicos Islands',
4589 'TV': 'Tuvalu',
4590 'UG': 'Uganda',
4591 'UA': 'Ukraine',
4592 'AE': 'United Arab Emirates',
4593 'GB': 'United Kingdom',
4594 'US': 'United States',
4595 'UM': 'United States Minor Outlying Islands',
4596 'UY': 'Uruguay',
4597 'UZ': 'Uzbekistan',
4598 'VU': 'Vanuatu',
4599 'VE': 'Venezuela, Bolivarian Republic of',
4600 'VN': 'Viet Nam',
4601 'VG': 'Virgin Islands, British',
4602 'VI': 'Virgin Islands, U.S.',
4603 'WF': 'Wallis and Futuna',
4604 'EH': 'Western Sahara',
4605 'YE': 'Yemen',
4606 'ZM': 'Zambia',
4607 'ZW': 'Zimbabwe',
2f97cc61 4608 # Not ISO 3166 codes, but used for IP blocks
4609 'AP': 'Asia/Pacific Region',
4610 'EU': 'Europe',
4eb10f66
YCH
4611 }
4612
4613 @classmethod
4614 def short2full(cls, code):
4615 """Convert an ISO 3166-2 country code to the corresponding full name"""
4616 return cls._country_map.get(code.upper())
4617
4618
86e5f3ed 4619class GeoUtils:
773f291d
S
4620 # Major IPv4 address blocks per country
4621 _country_ip_map = {
53896ca5 4622 'AD': '46.172.224.0/19',
773f291d
S
4623 'AE': '94.200.0.0/13',
4624 'AF': '149.54.0.0/17',
4625 'AG': '209.59.64.0/18',
4626 'AI': '204.14.248.0/21',
4627 'AL': '46.99.0.0/16',
4628 'AM': '46.70.0.0/15',
4629 'AO': '105.168.0.0/13',
53896ca5
S
4630 'AP': '182.50.184.0/21',
4631 'AQ': '23.154.160.0/24',
773f291d
S
4632 'AR': '181.0.0.0/12',
4633 'AS': '202.70.112.0/20',
53896ca5 4634 'AT': '77.116.0.0/14',
773f291d
S
4635 'AU': '1.128.0.0/11',
4636 'AW': '181.41.0.0/18',
53896ca5
S
4637 'AX': '185.217.4.0/22',
4638 'AZ': '5.197.0.0/16',
773f291d
S
4639 'BA': '31.176.128.0/17',
4640 'BB': '65.48.128.0/17',
4641 'BD': '114.130.0.0/16',
4642 'BE': '57.0.0.0/8',
53896ca5 4643 'BF': '102.178.0.0/15',
773f291d
S
4644 'BG': '95.42.0.0/15',
4645 'BH': '37.131.0.0/17',
4646 'BI': '154.117.192.0/18',
4647 'BJ': '137.255.0.0/16',
53896ca5 4648 'BL': '185.212.72.0/23',
773f291d
S
4649 'BM': '196.12.64.0/18',
4650 'BN': '156.31.0.0/16',
4651 'BO': '161.56.0.0/16',
4652 'BQ': '161.0.80.0/20',
53896ca5 4653 'BR': '191.128.0.0/12',
773f291d
S
4654 'BS': '24.51.64.0/18',
4655 'BT': '119.2.96.0/19',
4656 'BW': '168.167.0.0/16',
4657 'BY': '178.120.0.0/13',
4658 'BZ': '179.42.192.0/18',
4659 'CA': '99.224.0.0/11',
4660 'CD': '41.243.0.0/16',
53896ca5
S
4661 'CF': '197.242.176.0/21',
4662 'CG': '160.113.0.0/16',
773f291d 4663 'CH': '85.0.0.0/13',
53896ca5 4664 'CI': '102.136.0.0/14',
773f291d
S
4665 'CK': '202.65.32.0/19',
4666 'CL': '152.172.0.0/14',
53896ca5 4667 'CM': '102.244.0.0/14',
773f291d
S
4668 'CN': '36.128.0.0/10',
4669 'CO': '181.240.0.0/12',
4670 'CR': '201.192.0.0/12',
4671 'CU': '152.206.0.0/15',
4672 'CV': '165.90.96.0/19',
4673 'CW': '190.88.128.0/17',
53896ca5 4674 'CY': '31.153.0.0/16',
773f291d
S
4675 'CZ': '88.100.0.0/14',
4676 'DE': '53.0.0.0/8',
4677 'DJ': '197.241.0.0/17',
4678 'DK': '87.48.0.0/12',
4679 'DM': '192.243.48.0/20',
4680 'DO': '152.166.0.0/15',
4681 'DZ': '41.96.0.0/12',
4682 'EC': '186.68.0.0/15',
4683 'EE': '90.190.0.0/15',
4684 'EG': '156.160.0.0/11',
4685 'ER': '196.200.96.0/20',
4686 'ES': '88.0.0.0/11',
4687 'ET': '196.188.0.0/14',
4688 'EU': '2.16.0.0/13',
4689 'FI': '91.152.0.0/13',
4690 'FJ': '144.120.0.0/16',
53896ca5 4691 'FK': '80.73.208.0/21',
773f291d
S
4692 'FM': '119.252.112.0/20',
4693 'FO': '88.85.32.0/19',
4694 'FR': '90.0.0.0/9',
4695 'GA': '41.158.0.0/15',
4696 'GB': '25.0.0.0/8',
4697 'GD': '74.122.88.0/21',
4698 'GE': '31.146.0.0/16',
4699 'GF': '161.22.64.0/18',
4700 'GG': '62.68.160.0/19',
53896ca5
S
4701 'GH': '154.160.0.0/12',
4702 'GI': '95.164.0.0/16',
773f291d
S
4703 'GL': '88.83.0.0/19',
4704 'GM': '160.182.0.0/15',
4705 'GN': '197.149.192.0/18',
4706 'GP': '104.250.0.0/19',
4707 'GQ': '105.235.224.0/20',
4708 'GR': '94.64.0.0/13',
4709 'GT': '168.234.0.0/16',
4710 'GU': '168.123.0.0/16',
4711 'GW': '197.214.80.0/20',
4712 'GY': '181.41.64.0/18',
4713 'HK': '113.252.0.0/14',
4714 'HN': '181.210.0.0/16',
4715 'HR': '93.136.0.0/13',
4716 'HT': '148.102.128.0/17',
4717 'HU': '84.0.0.0/14',
4718 'ID': '39.192.0.0/10',
4719 'IE': '87.32.0.0/12',
4720 'IL': '79.176.0.0/13',
4721 'IM': '5.62.80.0/20',
4722 'IN': '117.192.0.0/10',
4723 'IO': '203.83.48.0/21',
4724 'IQ': '37.236.0.0/14',
4725 'IR': '2.176.0.0/12',
4726 'IS': '82.221.0.0/16',
4727 'IT': '79.0.0.0/10',
4728 'JE': '87.244.64.0/18',
4729 'JM': '72.27.0.0/17',
4730 'JO': '176.29.0.0/16',
53896ca5 4731 'JP': '133.0.0.0/8',
773f291d
S
4732 'KE': '105.48.0.0/12',
4733 'KG': '158.181.128.0/17',
4734 'KH': '36.37.128.0/17',
4735 'KI': '103.25.140.0/22',
4736 'KM': '197.255.224.0/20',
53896ca5 4737 'KN': '198.167.192.0/19',
773f291d
S
4738 'KP': '175.45.176.0/22',
4739 'KR': '175.192.0.0/10',
4740 'KW': '37.36.0.0/14',
4741 'KY': '64.96.0.0/15',
4742 'KZ': '2.72.0.0/13',
4743 'LA': '115.84.64.0/18',
4744 'LB': '178.135.0.0/16',
53896ca5 4745 'LC': '24.92.144.0/20',
773f291d
S
4746 'LI': '82.117.0.0/19',
4747 'LK': '112.134.0.0/15',
53896ca5 4748 'LR': '102.183.0.0/16',
773f291d
S
4749 'LS': '129.232.0.0/17',
4750 'LT': '78.56.0.0/13',
4751 'LU': '188.42.0.0/16',
4752 'LV': '46.109.0.0/16',
4753 'LY': '41.252.0.0/14',
4754 'MA': '105.128.0.0/11',
4755 'MC': '88.209.64.0/18',
4756 'MD': '37.246.0.0/16',
4757 'ME': '178.175.0.0/17',
4758 'MF': '74.112.232.0/21',
4759 'MG': '154.126.0.0/17',
4760 'MH': '117.103.88.0/21',
4761 'MK': '77.28.0.0/15',
4762 'ML': '154.118.128.0/18',
4763 'MM': '37.111.0.0/17',
4764 'MN': '49.0.128.0/17',
4765 'MO': '60.246.0.0/16',
4766 'MP': '202.88.64.0/20',
4767 'MQ': '109.203.224.0/19',
4768 'MR': '41.188.64.0/18',
4769 'MS': '208.90.112.0/22',
4770 'MT': '46.11.0.0/16',
4771 'MU': '105.16.0.0/12',
4772 'MV': '27.114.128.0/18',
53896ca5 4773 'MW': '102.70.0.0/15',
773f291d
S
4774 'MX': '187.192.0.0/11',
4775 'MY': '175.136.0.0/13',
4776 'MZ': '197.218.0.0/15',
4777 'NA': '41.182.0.0/16',
4778 'NC': '101.101.0.0/18',
4779 'NE': '197.214.0.0/18',
4780 'NF': '203.17.240.0/22',
4781 'NG': '105.112.0.0/12',
4782 'NI': '186.76.0.0/15',
4783 'NL': '145.96.0.0/11',
4784 'NO': '84.208.0.0/13',
4785 'NP': '36.252.0.0/15',
4786 'NR': '203.98.224.0/19',
4787 'NU': '49.156.48.0/22',
4788 'NZ': '49.224.0.0/14',
4789 'OM': '5.36.0.0/15',
4790 'PA': '186.72.0.0/15',
4791 'PE': '186.160.0.0/14',
4792 'PF': '123.50.64.0/18',
4793 'PG': '124.240.192.0/19',
4794 'PH': '49.144.0.0/13',
4795 'PK': '39.32.0.0/11',
4796 'PL': '83.0.0.0/11',
4797 'PM': '70.36.0.0/20',
4798 'PR': '66.50.0.0/16',
4799 'PS': '188.161.0.0/16',
4800 'PT': '85.240.0.0/13',
4801 'PW': '202.124.224.0/20',
4802 'PY': '181.120.0.0/14',
4803 'QA': '37.210.0.0/15',
53896ca5 4804 'RE': '102.35.0.0/16',
773f291d 4805 'RO': '79.112.0.0/13',
53896ca5 4806 'RS': '93.86.0.0/15',
773f291d 4807 'RU': '5.136.0.0/13',
53896ca5 4808 'RW': '41.186.0.0/16',
773f291d
S
4809 'SA': '188.48.0.0/13',
4810 'SB': '202.1.160.0/19',
4811 'SC': '154.192.0.0/11',
53896ca5 4812 'SD': '102.120.0.0/13',
773f291d 4813 'SE': '78.64.0.0/12',
53896ca5 4814 'SG': '8.128.0.0/10',
773f291d
S
4815 'SI': '188.196.0.0/14',
4816 'SK': '78.98.0.0/15',
53896ca5 4817 'SL': '102.143.0.0/17',
773f291d
S
4818 'SM': '89.186.32.0/19',
4819 'SN': '41.82.0.0/15',
53896ca5 4820 'SO': '154.115.192.0/18',
773f291d
S
4821 'SR': '186.179.128.0/17',
4822 'SS': '105.235.208.0/21',
4823 'ST': '197.159.160.0/19',
4824 'SV': '168.243.0.0/16',
4825 'SX': '190.102.0.0/20',
4826 'SY': '5.0.0.0/16',
4827 'SZ': '41.84.224.0/19',
4828 'TC': '65.255.48.0/20',
4829 'TD': '154.68.128.0/19',
4830 'TG': '196.168.0.0/14',
4831 'TH': '171.96.0.0/13',
4832 'TJ': '85.9.128.0/18',
4833 'TK': '27.96.24.0/21',
4834 'TL': '180.189.160.0/20',
4835 'TM': '95.85.96.0/19',
4836 'TN': '197.0.0.0/11',
4837 'TO': '175.176.144.0/21',
4838 'TR': '78.160.0.0/11',
4839 'TT': '186.44.0.0/15',
4840 'TV': '202.2.96.0/19',
4841 'TW': '120.96.0.0/11',
4842 'TZ': '156.156.0.0/14',
53896ca5
S
4843 'UA': '37.52.0.0/14',
4844 'UG': '102.80.0.0/13',
4845 'US': '6.0.0.0/8',
773f291d 4846 'UY': '167.56.0.0/13',
53896ca5 4847 'UZ': '84.54.64.0/18',
773f291d 4848 'VA': '212.77.0.0/19',
53896ca5 4849 'VC': '207.191.240.0/21',
773f291d 4850 'VE': '186.88.0.0/13',
53896ca5 4851 'VG': '66.81.192.0/20',
773f291d
S
4852 'VI': '146.226.0.0/16',
4853 'VN': '14.160.0.0/11',
4854 'VU': '202.80.32.0/20',
4855 'WF': '117.20.32.0/21',
4856 'WS': '202.4.32.0/19',
4857 'YE': '134.35.0.0/16',
4858 'YT': '41.242.116.0/22',
4859 'ZA': '41.0.0.0/11',
53896ca5
S
4860 'ZM': '102.144.0.0/13',
4861 'ZW': '102.177.192.0/18',
773f291d
S
4862 }
4863
4864 @classmethod
5f95927a
S
4865 def random_ipv4(cls, code_or_block):
4866 if len(code_or_block) == 2:
4867 block = cls._country_ip_map.get(code_or_block.upper())
4868 if not block:
4869 return None
4870 else:
4871 block = code_or_block
773f291d 4872 addr, preflen = block.split('/')
ac668111 4873 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4874 addr_max = addr_min | (0xffffffff >> int(preflen))
14f25df2 4875 return str(socket.inet_ntoa(
ac668111 4876 struct.pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4877
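# Illustrative sketch (not part of the module): how a CIDR block from the map
# above is turned into the numeric range that random_ipv4() samples. The 'US'
# block '6.0.0.0/8' is used; the module-level random/socket/struct imports are assumed.
_addr, _preflen = '6.0.0.0/8'.split('/')
_addr_min = struct.unpack('!L', socket.inet_aton(_addr))[0]  # 0x06000000
_addr_max = _addr_min | (0xffffffff >> int(_preflen))  # 0x06ffffff
assert socket.inet_ntoa(struct.pack('!L', random.randint(_addr_min, _addr_max))).startswith('6.')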
4878
ac668111 4879class PerRequestProxyHandler(urllib.request.ProxyHandler):
2461f79d
PH
4880 def __init__(self, proxies=None):
4881 # Set default handlers
4882 for type in ('http', 'https'):
4883 setattr(self, '%s_open' % type,
4884 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4885 meth(r, proxy, type))
ac668111 4886 urllib.request.ProxyHandler.__init__(self, proxies)
2461f79d 4887
91410c9b 4888 def proxy_open(self, req, proxy, type):
2461f79d 4889 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
4890 if req_proxy is not None:
4891 proxy = req_proxy
2461f79d
PH
4892 del req.headers['Ytdl-request-proxy']
4893
4894 if proxy == '__noproxy__':
4895 return None # No Proxy
14f25df2 4896 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188 4897 req.add_header('Ytdl-socks-proxy', proxy)
7a5c1cfe 4898 # yt-dlp's http/https handlers take care of wrapping the socket with SOCKS

71aff188 4899 return None
ac668111 4900 return urllib.request.ProxyHandler.proxy_open(
91410c9b 4901 self, req, proxy, type)
5bc880b9
YCH
4902
4903
0a5445dd
YCH
4904# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4905# released into Public Domain
4906# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4907
4908def long_to_bytes(n, blocksize=0):
4909 """long_to_bytes(n:long, blocksize:int) : string
4910 Convert a long integer to a byte string.
4911
4912 If optional blocksize is given and greater than zero, pad the front of the
4913 byte string with binary zeros so that the length is a multiple of
4914 blocksize.
4915 """
4916 # after much testing, this algorithm was deemed to be the fastest
4917 s = b''
4918 n = int(n)
4919 while n > 0:
ac668111 4920 s = struct.pack('>I', n & 0xffffffff) + s
0a5445dd
YCH
4921 n = n >> 32
4922 # strip off leading zeros
4923 for i in range(len(s)):
4924 if s[i] != b'\000'[0]:
4925 break
4926 else:
4927 # only happens when n == 0
4928 s = b'\000'
4929 i = 0
4930 s = s[i:]
4931 # add back some pad bytes. this could be done more efficiently w.r.t. the
4932 # de-padding being done above, but sigh...
4933 if blocksize > 0 and len(s) % blocksize:
4934 s = (blocksize - len(s) % blocksize) * b'\000' + s
4935 return s
4936
4937
4938def bytes_to_long(s):
4939 """bytes_to_long(string) : long
4940 Convert a byte string to a long integer.
4941
4942 This is (essentially) the inverse of long_to_bytes().
4943 """
4944 acc = 0
4945 length = len(s)
4946 if length % 4:
4947 extra = (4 - length % 4)
4948 s = b'\000' * extra + s
4949 length = length + extra
4950 for i in range(0, length, 4):
ac668111 4951 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
0a5445dd
YCH
4952 return acc
4953
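# Quick round-trip check of the two helpers above (illustrative only):
# 65537 == 0x010001, so the big-endian byte string is b'\x01\x00\x01'.
assert long_to_bytes(65537) == b'\x01\x00\x01'
assert long_to_bytes(65537, blocksize=4) == b'\x00\x01\x00\x01'  # front-padded to a multiple of 4
assert bytes_to_long(b'\x01\x00\x01') == 65537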
4954
5bc880b9
YCH
4955def ohdave_rsa_encrypt(data, exponent, modulus):
4956 '''
4957 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4958
4959 Input:
4960 data: data to encrypt, bytes-like object
4961 exponent, modulus: parameter e and N of RSA algorithm, both integer
4962 Output: hex string of encrypted data
4963
4964 Limitation: supports one block encryption only
4965 '''
4966
4967 payload = int(binascii.hexlify(data[::-1]), 16)
4968 encrypted = pow(payload, exponent, modulus)
4969 return '%x' % encrypted
81bdc8fd
YCH
4970
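# Toy example for the helper above with made-up, insecure parameters (e=3, N=33),
# purely to show the little-endian interpretation of the input bytes:
# int(hexlify(b'\x02'[::-1]), 16) == 2 and pow(2, 3, 33) == 8.
assert ohdave_rsa_encrypt(b'\x02', exponent=3, modulus=33) == '8'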
4971
f48409c7
YCH
4972def pkcs1pad(data, length):
4973 """
4974 Pad input data with the PKCS#1 scheme
4975
4976 @param {int[]} data input data
4977 @param {int} length target length
4978 @returns {int[]} padded data
4979 """
4980 if len(data) > length - 11:
4981 raise ValueError('Input data too long for PKCS#1 padding')
4982
4983 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4984 return [0, 2] + pseudo_random + [0] + data
4985
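# Illustrative check of the PKCS#1 v1.5 layout produced above: a 2-byte payload
# padded to a 16-byte block starts with [0, 2] and ends with a zero separator
# followed by the payload.
_padded = pkcs1pad(list(b'hi'), 16)
assert len(_padded) == 16 and _padded[:2] == [0, 2] and _padded[-3:] == [0, *b'hi']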
4986
7b2c3f47 4987def _base_n_table(n, table):
4988 if not table and not n:
4989 raise ValueError('Either table or n must be specified')
612f2be5 4990 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4991
44f14eb4 4992 if n and n != len(table):
612f2be5 4993 raise ValueError(f'base {n} exceeds table length {len(table)}')
4994 return table
59f898b7 4995
5eb6bdce 4996
7b2c3f47 4997def encode_base_n(num, n=None, table=None):
4998 """Convert given int to a base-n string"""
612f2be5 4999 table = _base_n_table(n, table)
7b2c3f47 5000 if not num:
5eb6bdce
YCH
5001 return table[0]
5002
7b2c3f47 5003 result, base = '', len(table)
81bdc8fd 5004 while num:
7b2c3f47 5005 result = table[num % base] + result
612f2be5 5006 num = num // base
7b2c3f47 5007 return result
5008
5009
5010def decode_base_n(string, n=None, table=None):
5011 """Convert given base-n string to int"""
5012 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
5013 result, base = 0, len(table)
5014 for char in string:
5015 result = result * base + table[char]
5016 return result
5017
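# A few sanity checks for the base-n helpers above (illustrative only):
assert encode_base_n(255, 16) == 'ff'
assert decode_base_n('ff', n=16) == 255
assert encode_base_n(0, 2) == '0'
assert encode_base_n(5, table='ab') == 'bab'  # a custom alphabet may be passed instead of n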
5018
f52354a8 5019def decode_packed_codes(code):
06b3fe29 5020 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 5021 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
5022 base = int(base)
5023 count = int(count)
5024 symbols = symbols.split('|')
5025 symbol_table = {}
5026
5027 while count:
5028 count -= 1
5eb6bdce 5029 base_n_count = encode_base_n(count, base)
f52354a8
YCH
5030 symbol_table[base_n_count] = symbols[count] or base_n_count
5031
5032 return re.sub(
5033 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 5034 obfuscated_code)
e154c651 5035
5036
1ced2221
S
5037def caesar(s, alphabet, shift):
5038 if shift == 0:
5039 return s
5040 l = len(alphabet)
5041 return ''.join(
5042 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
5043 for c in s)
5044
5045
5046def rot47(s):
5047 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
5048
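# Illustrative checks: caesar() shifts only characters inside the given alphabet,
# and rot47() is its own inverse for printable ASCII.
assert caesar('ab', 'abc', 1) == 'bc'
assert rot47(rot47('yt-dlp 2023')) == 'yt-dlp 2023'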
5049
e154c651 5050def parse_m3u8_attributes(attrib):
5051 info = {}
5052 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
5053 if val.startswith('"'):
5054 val = val[1:-1]
5055 info[key] = val
5056 return info
1143535d
YCH
5057
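# Example of the attribute parsing above (illustrative): quoted values may
# contain commas and are returned without the surrounding quotes.
assert parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2"') == {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}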
5058
5059def urshift(val, n):
5060 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
5061
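# urshift() behaves like JavaScript's unsigned right shift (>>>) for 32-bit signed inputs:
assert urshift(8, 2) == 2
assert urshift(-1, 1) == 0x7fffffff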
5062
efa97bdc 5063def write_xattr(path, key, value):
6f7563be 5064 # Windows: Write xattrs to NTFS Alternate Data Streams:
5065 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5066 if compat_os_name == 'nt':
5067 assert ':' not in key
5068 assert os.path.exists(path)
efa97bdc
YCH
5069
5070 try:
6f7563be 5071 with open(f'{path}:{key}', 'wb') as f:
5072 f.write(value)
86e5f3ed 5073 except OSError as e:
efa97bdc 5074 raise XAttrMetadataError(e.errno, e.strerror)
6f7563be 5075 return
efa97bdc 5076
6f7563be 5077 # UNIX Method 1. Use xattrs/pyxattrs modules
efa97bdc 5078
6f7563be 5079 setxattr = None
5080 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5081 # Unicode arguments are not supported in pyxattr until version 0.5.0
5082 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5083 if version_tuple(xattr.__version__) >= (0, 5, 0):
5084 setxattr = xattr.set
5085 elif xattr:
5086 setxattr = xattr.setxattr
efa97bdc 5087
6f7563be 5088 if setxattr:
5089 try:
5090 setxattr(path, key, value)
5091 except OSError as e:
5092 raise XAttrMetadataError(e.errno, e.strerror)
5093 return
efa97bdc 5094
6f7563be 5095 # UNIX Method 2. Use setfattr/xattr executables
5096 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5097 else 'xattr' if check_executable('xattr', ['-h']) else None)
5098 if not exe:
5099 raise XAttrUnavailableError(
5100 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5101 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
efa97bdc 5102
0f06bcd7 5103 value = value.decode()
6f7563be 5104 try:
f0c9fb96 5105 _, stderr, returncode = Popen.run(
6f7563be 5106 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
e121e3ce 5107 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
6f7563be 5108 except OSError as e:
5109 raise XAttrMetadataError(e.errno, e.strerror)
f0c9fb96 5110 if returncode:
5111 raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
5112
5113
5114def random_birthday(year_field, month_field, day_field):
aa374bc7
AS
5115 start_date = datetime.date(1950, 1, 1)
5116 end_date = datetime.date(1995, 12, 31)
5117 offset = random.randint(0, (end_date - start_date).days)
5118 random_date = start_date + datetime.timedelta(offset)
0c265486 5119 return {
aa374bc7
AS
5120 year_field: str(random_date.year),
5121 month_field: str(random_date.month),
5122 day_field: str(random_date.day),
0c265486 5123 }
732044af 5124
c76eb41b 5125
8c53322c
L
5126def find_available_port(interface=''):
5127 try:
5128 with socket.socket() as sock:
5129 sock.bind((interface, 0))
5130 return sock.getsockname()[1]
5131 except OSError:
5132 return None
5133
5134
732044af 5135# Templates for internet shortcut files, which are plain text files.
e5a998f3 5136DOT_URL_LINK_TEMPLATE = '''\
732044af 5137[InternetShortcut]
5138URL=%(url)s
e5a998f3 5139'''
732044af 5140
e5a998f3 5141DOT_WEBLOC_LINK_TEMPLATE = '''\
732044af 5142<?xml version="1.0" encoding="UTF-8"?>
5143<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5144<plist version="1.0">
5145<dict>
5146\t<key>URL</key>
5147\t<string>%(url)s</string>
5148</dict>
5149</plist>
e5a998f3 5150'''
732044af 5151
e5a998f3 5152DOT_DESKTOP_LINK_TEMPLATE = '''\
732044af 5153[Desktop Entry]
5154Encoding=UTF-8
5155Name=%(filename)s
5156Type=Link
5157URL=%(url)s
5158Icon=text-html
e5a998f3 5159'''
732044af 5160
08438d2c 5161LINK_TEMPLATES = {
5162 'url': DOT_URL_LINK_TEMPLATE,
5163 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5164 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5165}
5166
732044af 5167
5168def iri_to_uri(iri):
5169 """
5170 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5171
5172 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5173 """
5174
14f25df2 5175 iri_parts = urllib.parse.urlparse(iri)
732044af 5176
5177 if '[' in iri_parts.netloc:
5178 raise ValueError('IPv6 URIs are not, yet, supported.')
5179 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5180
5181 # The `safe` argument values used by the following code contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5182
5183 net_location = ''
5184 if iri_parts.username:
f9934b96 5185 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
732044af 5186 if iri_parts.password is not None:
f9934b96 5187 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
732044af 5188 net_location += '@'
5189
0f06bcd7 5190 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
732044af 5191 # The 'idna' encoding produces ASCII text.
5192 if iri_parts.port is not None and iri_parts.port != 80:
5193 net_location += ':' + str(iri_parts.port)
5194
f9934b96 5195 return urllib.parse.urlunparse(
732044af 5196 (iri_parts.scheme,
5197 net_location,
5198
f9934b96 5199 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5200
5201 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
f9934b96 5202 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5203
5204 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
f9934b96 5205 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
732044af 5206
f9934b96 5207 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
732044af 5208
5209 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5210
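# Illustrative examples of the behaviour described in the docstring above
# (the URLs are made up): raw Unicode is percent-encoded as UTF-8, while
# already-escaped sequences are left untouched.
assert iri_to_uri('http://example.com/ä?q=ä') == 'http://example.com/%C3%A4?q=%C3%A4'
assert iri_to_uri('http://example.com/%C3%A4') == 'http://example.com/%C3%A4'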
5211
5212def to_high_limit_path(path):
5213 if sys.platform in ['win32', 'cygwin']:
5214 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
e5a998f3 5215 return '\\\\?\\' + os.path.abspath(path)
732044af 5216
5217 return path
76d321f6 5218
c76eb41b 5219
7b2c3f47 5220def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
69bec673 5221 val = traversal.traverse_obj(obj, *variadic(field))
6f2287cb 5222 if not val if ignore is NO_DEFAULT else val in variadic(ignore):
e0ddbd02 5223 return default
7b2c3f47 5224 return template % func(val)
00dd0cd5 5225
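# Small usage sketch for format_field() with illustrative values:
assert format_field({'height': 1080}, 'height', '%sp') == '1080p'
assert format_field({}, 'height', '%sp') == ''  # missing values fall back to `default`
assert format_field({'fps': 30.0}, 'fps', '%d fps', func=int) == '30 fps'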
5226
5227def clean_podcast_url(url):
5228 return re.sub(r'''(?x)
5229 (?:
5230 (?:
5231 chtbl\.com/track|
5232 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5233 play\.podtrac\.com
5234 )/[^/]+|
5235 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5236 flex\.acast\.com|
5237 pd(?:
5238 cn\.co| # https://podcorn.com/analytics-prefix/
5239 st\.fm # https://podsights.com/docs/
5240 )/e
5241 )/''', '', url)
ffcb8191
THD
5242
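# Illustrative example with a made-up URL: a known tracking/analytics prefix is
# stripped so that only the real media host remains.
assert clean_podcast_url('https://chtbl.com/track/ABC123/traffic.example.com/episode.mp3') == 'https://traffic.example.com/episode.mp3'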
5243
5244_HEX_TABLE = '0123456789abcdef'
5245
5246
5247def random_uuidv4():
5248 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 5249
5250
5251def make_dir(path, to_screen=None):
5252 try:
5253 dn = os.path.dirname(path)
b25d6cb9
AI
5254 if dn:
5255 os.makedirs(dn, exist_ok=True)
0202b52a 5256 return True
86e5f3ed 5257 except OSError as err:
0202b52a 5258 if callable(to_screen):
69bec673 5259 to_screen(f'unable to create directory {err}')
0202b52a 5260 return False
f74980cb 5261
5262
5263def get_executable_path():
69bec673 5264 from ..update import _get_variant_and_executable_path
c487cf00 5265
b5899f4f 5266 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
f74980cb 5267
5268
8e40b9d1 5269def get_user_config_dirs(package_name):
8e40b9d1
M
5270 # .config (e.g. ~/.config/package_name)
5271 xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
773c272d 5272 yield os.path.join(xdg_config_home, package_name)
8e40b9d1
M
5273
5274 # appdata (%APPDATA%/package_name)
5275 appdata_dir = os.getenv('appdata')
5276 if appdata_dir:
773c272d 5277 yield os.path.join(appdata_dir, package_name)
8e40b9d1
M
5278
5279 # home (~/.package_name)
773c272d 5280 yield os.path.join(compat_expanduser('~'), f'.{package_name}')
8e40b9d1
M
5281
5282
5283def get_system_config_dirs(package_name):
8e40b9d1 5284 # /etc/package_name
773c272d 5285 yield os.path.join('/etc', package_name)
06167fbb 5286
5287
3e9b66d7 5288def time_seconds(**kwargs):
83c4970e
L
5289 """
5290 Returns the current time, in seconds since the epoch (1970-01-01T00:00:00Z), offset by the given datetime.timedelta keyword arguments
5291 """
5292 return time.time() + datetime.timedelta(**kwargs).total_seconds()
3e9b66d7
LNO
5293
5294
49fa4d9a
N
5295# create a JSON Web Signature (jws) with HS256 algorithm
5296# the resulting format is in JWS Compact Serialization
5297# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5298# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5299def jwt_encode_hs256(payload_data, key, headers={}):
5300 header_data = {
5301 'alg': 'HS256',
5302 'typ': 'JWT',
5303 }
5304 if headers:
5305 header_data.update(headers)
0f06bcd7 5306 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5307 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5308 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
49fa4d9a
N
5309 signature_b64 = base64.b64encode(h.digest())
5310 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5311 return token
819e0531 5312
5313
16b0d7e6 5314 # can be extended in the future to verify the signature, parse the header and return the algorithm used if it's not HS256
5315def jwt_decode_hs256(jwt):
5316 header_b64, payload_b64, signature_b64 = jwt.split('.')
2c98d998 5317 # add trailing ='s that may have been stripped, superfluous ='s are ignored
5318 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
16b0d7e6 5319 return payload_data
5320
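# Round-trip sketch for the two JWT helpers above, with a made-up key: the token
# is the usual three dot-separated base64 segments, and decoding returns only the
# payload (the signature is not verified).
_token = jwt_encode_hs256({'user': 'test'}, 'not-a-real-secret')
assert isinstance(_token, bytes) and _token.count(b'.') == 2
assert jwt_decode_hs256(_token.decode()) == {'user': 'test'}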
5321
53973b4d 5322WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5323
5324
7a32c70d 5325@functools.cache
819e0531 5326def supports_terminal_sequences(stream):
5327 if compat_os_name == 'nt':
8a82af35 5328 if not WINDOWS_VT_MODE:
819e0531 5329 return False
5330 elif not os.getenv('TERM'):
5331 return False
5332 try:
5333 return stream.isatty()
5334 except BaseException:
5335 return False
5336
5337
c53a18f0 5338def windows_enable_vt_mode():
5339 """Ref: https://bugs.python.org/issue30075 """
8a82af35 5340 if get_windows_version() < (10, 0, 10586):
53973b4d 5341 return
53973b4d 5342
c53a18f0 5343 import ctypes
5344 import ctypes.wintypes
5345 import msvcrt
5346
5347 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
5348
5349 dll = ctypes.WinDLL('kernel32', use_last_error=False)
5350 handle = os.open('CONOUT$', os.O_RDWR)
c53a18f0 5351 try:
5352 h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
5353 dw_original_mode = ctypes.wintypes.DWORD()
5354 success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
5355 if not success:
5356 raise Exception('GetConsoleMode failed')
5357
5358 success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
5359 dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
5360 if not success:
5361 raise Exception('SetConsoleMode failed')
c53a18f0 5362 finally:
5363 os.close(handle)
53973b4d 5364
f0795149 5365 global WINDOWS_VT_MODE
5366 WINDOWS_VT_MODE = True
5367 supports_terminal_sequences.cache_clear()
5368
53973b4d 5369
ec11a9f4 5370_terminal_sequences_re = re.compile('\033\\[[^m]+m')
5371
5372
5373def remove_terminal_sequences(string):
5374 return _terminal_sequences_re.sub('', string)
5375
5376
5377def number_of_digits(number):
5378 return len('%d' % number)
34921b43 5379
5380
5381def join_nonempty(*values, delim='-', from_dict=None):
5382 if from_dict is not None:
69bec673 5383 values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
34921b43 5384 return delim.join(map(str, filter(None, values)))
06e57990 5385
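# join_nonempty() drops falsy values before joining (illustrative):
assert join_nonempty('a', None, '', 0, 'b') == 'a-b'
assert join_nonempty(1080, 'p', delim='') == '1080p'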
5386
27231526
ZM
5387def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5388 """
5389 Find the largest format dimensions in terms of video width and, for each thumbnail:
5390 * Modify the URL: Match the width with the provided regex and replace with the former width
5391 * Update dimensions
5392
5393 This function is useful with video services that scale the provided thumbnails on demand
5394 """
5395 _keys = ('width', 'height')
5396 max_dimensions = max(
86e5f3ed 5397 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
27231526
ZM
5398 default=(0, 0))
5399 if not max_dimensions[0]:
5400 return thumbnails
5401 return [
5402 merge_dicts(
5403 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5404 dict(zip(_keys, max_dimensions)), thumbnail)
5405 for thumbnail in thumbnails
5406 ]
5407
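# Illustrative sketch with made-up data: the 640px thumbnail URL is rewritten to
# the widest format and its dimensions are updated (merge_dicts is defined earlier
# in this module).
_formats = [{'width': 1280, 'height': 720}, {'width': 640, 'height': 360}]
_thumbs = [{'url': 'https://example.com/640/thumb.jpg', 'width': 640, 'height': 360}]
assert scale_thumbnails_to_max_format_width(_formats, _thumbs, r'\d+(?=/thumb)') == [{'url': 'https://example.com/1280/thumb.jpg', 'width': 1280, 'height': 720}]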
5408
93c8410d
LNO
5409def parse_http_range(range):
5410 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5411 if not range:
5412 return None, None, None
5413 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5414 if not crg:
5415 return None, None, None
5416 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5417
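# Examples of the header forms accepted above (illustrative):
assert parse_http_range('bytes 0-499/1234') == (0, 499, 1234)
assert parse_http_range('bytes=500-') == (500, None, None)
assert parse_http_range(None) == (None, None, None)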
5418
6b9e832d 5419def read_stdin(what):
5420 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5421 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5422 return sys.stdin
5423
5424
a904a7f8
L
5425def determine_file_encoding(data):
5426 """
88f60feb 5427 Detect the text encoding used
a904a7f8
L
5428 @returns (encoding, bytes to skip)
5429 """
5430
88f60feb 5431 # BOM marks are given priority over declarations
a904a7f8 5432 for bom, enc in BOMS:
a904a7f8
L
5433 if data.startswith(bom):
5434 return enc, len(bom)
5435
88f60feb 5436 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5437 # We ignore the endianness to get a good enough match
a904a7f8 5438 data = data.replace(b'\0', b'')
88f60feb 5439 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5440 return mobj.group(1).decode() if mobj else None, 0
a904a7f8
L
5441
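# Illustrative checks for determine_file_encoding(): a leading coding declaration
# is honoured, and plain data yields (None, 0).
assert determine_file_encoding(b'# coding: utf-8\n--flag value') == ('utf-8', 0)
assert determine_file_encoding(b'--flag value') == (None, 0)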
5442
06e57990 5443class Config:
5444 own_args = None
9e491463 5445 parsed_args = None
06e57990 5446 filename = None
5447 __initialized = False
5448
5449 def __init__(self, parser, label=None):
9e491463 5450 self.parser, self.label = parser, label
06e57990 5451 self._loaded_paths, self.configs = set(), []
5452
5453 def init(self, args=None, filename=None):
5454 assert not self.__initialized
284a60c5 5455 self.own_args, self.filename = args, filename
5456 return self.load_configs()
5457
5458 def load_configs(self):
65662dff 5459 directory = ''
284a60c5 5460 if self.filename:
5461 location = os.path.realpath(self.filename)
65662dff 5462 directory = os.path.dirname(location)
06e57990 5463 if location in self._loaded_paths:
5464 return False
5465 self._loaded_paths.add(location)
5466
284a60c5 5467 self.__initialized = True
5468 opts, _ = self.parser.parse_known_args(self.own_args)
5469 self.parsed_args = self.own_args
9e491463 5470 for location in opts.config_locations or []:
6b9e832d 5471 if location == '-':
1060f82f 5472 if location in self._loaded_paths:
5473 continue
5474 self._loaded_paths.add(location)
6b9e832d 5475 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5476 continue
65662dff 5477 location = os.path.join(directory, expand_path(location))
06e57990 5478 if os.path.isdir(location):
5479 location = os.path.join(location, 'yt-dlp.conf')
5480 if not os.path.exists(location):
9e491463 5481 self.parser.error(f'config location {location} does not exist')
06e57990 5482 self.append_config(self.read_file(location), location)
5483 return True
5484
5485 def __str__(self):
5486 label = join_nonempty(
5487 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5488 delim=' ')
5489 return join_nonempty(
5490 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5491 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5492 delim='\n')
5493
7a32c70d 5494 @staticmethod
06e57990 5495 def read_file(filename, default=[]):
5496 try:
a904a7f8 5497 optionf = open(filename, 'rb')
86e5f3ed 5498 except OSError:
06e57990 5499 return default # silently skip if file is not present
a904a7f8
L
5500 try:
5501 enc, skip = determine_file_encoding(optionf.read(512))
5502 optionf.seek(skip, io.SEEK_SET)
5503 except OSError:
5504 enc = None # silently skip read errors
06e57990 5505 try:
5506 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
a904a7f8 5507 contents = optionf.read().decode(enc or preferredencoding())
f9934b96 5508 res = shlex.split(contents, comments=True)
44a6fcff 5509 except Exception as err:
5510 raise ValueError(f'Unable to parse "{filename}": {err}')
06e57990 5511 finally:
5512 optionf.close()
5513 return res
5514
7a32c70d 5515 @staticmethod
06e57990 5516 def hide_login_info(opts):
86e5f3ed 5517 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
06e57990 5518 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5519
5520 def _scrub_eq(o):
5521 m = eqre.match(o)
5522 if m:
5523 return m.group('key') + '=PRIVATE'
5524 else:
5525 return o
5526
5527 opts = list(map(_scrub_eq, opts))
5528 for idx, opt in enumerate(opts):
5529 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5530 opts[idx + 1] = 'PRIVATE'
5531 return opts
5532
5533 def append_config(self, *args, label=None):
9e491463 5534 config = type(self)(self.parser, label)
06e57990 5535 config._loaded_paths = self._loaded_paths
5536 if config.init(*args):
5537 self.configs.append(config)
5538
7a32c70d 5539 @property
06e57990 5540 def all_args(self):
5541 for config in reversed(self.configs):
5542 yield from config.all_args
9e491463 5543 yield from self.parsed_args or []
5544
5545 def parse_known_args(self, **kwargs):
5546 return self.parser.parse_known_args(self.all_args, **kwargs)
06e57990 5547
5548 def parse_args(self):
9e491463 5549 return self.parser.parse_args(self.all_args)
da42679b
LNO
5550
5551
d5d1df8a 5552class WebSocketsWrapper:
da42679b 5553 """Wraps websockets module to use in non-async scopes"""
abfecb7b 5554 pool = None
da42679b 5555
3cea3edd 5556 def __init__(self, url, headers=None, connect=True):
059bc4db 5557 self.loop = asyncio.new_event_loop()
9cd08050 5558 # XXX: "loop" is deprecated
5559 self.conn = websockets.connect(
5560 url, extra_headers=headers, ping_interval=None,
5561 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
3cea3edd
LNO
5562 if connect:
5563 self.__enter__()
15dfb392 5564 atexit.register(self.__exit__, None, None, None)
da42679b
LNO
5565
5566 def __enter__(self):
3cea3edd 5567 if not self.pool:
9cd08050 5568 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
da42679b
LNO
5569 return self
5570
5571 def send(self, *args):
5572 self.run_with_loop(self.pool.send(*args), self.loop)
5573
5574 def recv(self, *args):
5575 return self.run_with_loop(self.pool.recv(*args), self.loop)
5576
5577 def __exit__(self, type, value, traceback):
5578 try:
5579 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5580 finally:
5581 self.loop.close()
15dfb392 5582 self._cancel_all_tasks(self.loop)
da42679b
LNO
5583
5584 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5585 # for contributors: if any new library that uses asyncio needs to be run in non-async code, move these functions out of this class
7a32c70d 5586 @staticmethod
da42679b 5587 def run_with_loop(main, loop):
059bc4db 5588 if not asyncio.iscoroutine(main):
da42679b
LNO
5589 raise ValueError(f'a coroutine was expected, got {main!r}')
5590
5591 try:
5592 return loop.run_until_complete(main)
5593 finally:
5594 loop.run_until_complete(loop.shutdown_asyncgens())
5595 if hasattr(loop, 'shutdown_default_executor'):
5596 loop.run_until_complete(loop.shutdown_default_executor())
5597
7a32c70d 5598 @staticmethod
da42679b 5599 def _cancel_all_tasks(loop):
059bc4db 5600 to_cancel = asyncio.all_tasks(loop)
da42679b
LNO
5601
5602 if not to_cancel:
5603 return
5604
5605 for task in to_cancel:
5606 task.cancel()
5607
9cd08050 5608 # XXX: "loop" is removed in python 3.10+
da42679b 5609 loop.run_until_complete(
059bc4db 5610 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
da42679b
LNO
5611
5612 for task in to_cancel:
5613 if task.cancelled():
5614 continue
5615 if task.exception() is not None:
5616 loop.call_exception_handler({
5617 'message': 'unhandled exception during asyncio.run() shutdown',
5618 'exception': task.exception(),
5619 'task': task,
5620 })
5621
5622
8b7539d2 5623def merge_headers(*dicts):
08d30158 5624 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
76aa9913 5625 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
28787f16 5626
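# merge_headers() title-cases the keys and lets later dicts win (illustrative):
assert merge_headers({'user-agent': 'UA/1.0', 'accept': '*/*'}, {'User-Agent': 'UA/2.0'}) == {'User-Agent': 'UA/2.0', 'Accept': '*/*'}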
5627
b1f94422 5628def cached_method(f):
5629 """Cache a method"""
5630 signature = inspect.signature(f)
5631
7a32c70d 5632 @functools.wraps(f)
b1f94422 5633 def wrapper(self, *args, **kwargs):
5634 bound_args = signature.bind(self, *args, **kwargs)
5635 bound_args.apply_defaults()
d5d1df8a 5636 key = tuple(bound_args.arguments.values())[1:]
b1f94422 5637
6368e2e6 5638 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
b1f94422 5639 if key not in cache:
5640 cache[key] = f(self, *args, **kwargs)
5641 return cache[key]
5642 return wrapper
5643
5644
28787f16 5645class classproperty:
83cc7b8a 5646 """property access for class methods with optional caching"""
5647 def __new__(cls, func=None, *args, **kwargs):
5648 if not func:
5649 return functools.partial(cls, *args, **kwargs)
5650 return super().__new__(cls)
c487cf00 5651
83cc7b8a 5652 def __init__(self, func, *, cache=False):
c487cf00 5653 functools.update_wrapper(self, func)
5654 self.func = func
83cc7b8a 5655 self._cache = {} if cache else None
28787f16 5656
5657 def __get__(self, _, cls):
83cc7b8a 5658 if self._cache is None:
5659 return self.func(cls)
5660 elif cls not in self._cache:
5661 self._cache[cls] = self.func(cls)
5662 return self._cache[cls]
19a03940 5663
5664
a5387729 5665class function_with_repr:
b2e0343b 5666 def __init__(self, func, repr_=None):
a5387729 5667 functools.update_wrapper(self, func)
b2e0343b 5668 self.func, self.__repr = func, repr_
a5387729 5669
5670 def __call__(self, *args, **kwargs):
5671 return self.func(*args, **kwargs)
5672
5673 def __repr__(self):
b2e0343b 5674 if self.__repr:
5675 return self.__repr
a5387729 5676 return f'{self.func.__module__}.{self.func.__qualname__}'
5677
5678
64fa820c 5679class Namespace(types.SimpleNamespace):
591bb9d3 5680 """Immutable namespace"""
591bb9d3 5681
7896214c 5682 def __iter__(self):
64fa820c 5683 return iter(self.__dict__.values())
7896214c 5684
7a32c70d 5685 @property
64fa820c 5686 def items_(self):
5687 return self.__dict__.items()
9b8ee23b 5688
5689
8dc59305 5690MEDIA_EXTENSIONS = Namespace(
5691 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5692 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5693 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
fbb73833 5694 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
8dc59305 5695 thumbnails=('jpg', 'png', 'webp'),
5696 storyboards=('mhtml', ),
5697 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5698 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5699)
5700MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5701MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5702
5703KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5704
5705
be5c1ae8 5706class RetryManager:
5707 """Usage:
5708 for retry in RetryManager(...):
5709 try:
5710 ...
5711 except SomeException as err:
5712 retry.error = err
5713 continue
5714 """
5715 attempt, _error = 0, None
5716
5717 def __init__(self, _retries, _error_callback, **kwargs):
5718 self.retries = _retries or 0
5719 self.error_callback = functools.partial(_error_callback, **kwargs)
5720
5721 def _should_retry(self):
5722 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5723
7a32c70d 5724 @property
be5c1ae8 5725 def error(self):
5726 if self._error is NO_DEFAULT:
5727 return None
5728 return self._error
5729
7a32c70d 5730 @error.setter
be5c1ae8 5731 def error(self, value):
5732 self._error = value
5733
5734 def __iter__(self):
5735 while self._should_retry():
5736 self.error = NO_DEFAULT
5737 self.attempt += 1
5738 yield self
5739 if self.error:
5740 self.error_callback(self.error, self.attempt, self.retries)
5741
7a32c70d 5742 @staticmethod
be5c1ae8 5743 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5744 """Utility function for reporting retries"""
5745 if count > retries:
5746 if error:
5747 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5748 raise e
5749
5750 if not count:
5751 return warn(e)
5752 elif isinstance(e, ExtractorError):
3ce29336 5753 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
be5c1ae8 5754 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5755
5756 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5757 if delay:
5758 info(f'Sleeping {delay:.2f} seconds ...')
5759 time.sleep(delay)
5760
5761
0647d925 5762def make_archive_id(ie, video_id):
5763 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5764 return f'{ie_key.lower()} {video_id}'
5765
5766
a1c5bd82 5767def truncate_string(s, left, right=0):
5768 assert left > 3 and right >= 0
5769 if s is None or len(s) <= left + right:
5770 return s
71df9b7f 5771 return f'{s[:left-3]}...{s[-right:] if right else ""}'
a1c5bd82 5772
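# truncate_string() shortens strings longer than `left` + `right` characters,
# keeping an ellipsis (illustrative):
assert truncate_string('abcdefghij', 6) == 'abc...'
assert truncate_string('abcdefghij', 6, 2) == 'abc...ij'
assert truncate_string('short', 6) == 'short'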
5773
5314b521 5774def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
5775 assert 'all' in alias_dict, '"all" alias is required'
5776 requested = list(start or [])
5777 for val in options:
5778 discard = val.startswith('-')
5779 if discard:
5780 val = val[1:]
5781
5782 if val in alias_dict:
5783 val = alias_dict[val] if not discard else [
5784 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
5785 # NB: Do not allow regex in aliases for performance
5786 requested = orderedSet_from_options(val, alias_dict, start=requested)
5787 continue
5788
5789 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
5790 else [val] if val in alias_dict['all'] else None)
5791 if current is None:
5792 raise ValueError(val)
5793
5794 if discard:
5795 for item in current:
5796 while item in requested:
5797 requested.remove(item)
5798 else:
5799 requested.extend(current)
5800
5801 return orderedSet(requested)
5802
5803
d0d74b71 5804class FormatSorter:
5805 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
5806
5807 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
5808 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
5809 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
5810 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
5811 'height', 'width', 'proto', 'vext', 'abr', 'aext',
5812 'fps', 'fs_approx', 'source', 'id')
5813
5814 settings = {
5815 'vcodec': {'type': 'ordered', 'regex': True,
5816 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
5817 'acodec': {'type': 'ordered', 'regex': True,
71082216 5818 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
d0d74b71 5819 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
5820 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
5821 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
5822 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
5823 'vext': {'type': 'ordered', 'field': 'video_ext',
29ca4082 5824 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
5825 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
fbb73833 5826 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
5827 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
5828 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
d0d74b71 5829 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
5830 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
5831 'field': ('vcodec', 'acodec'),
5832 'function': lambda it: int(any(v != 'none' for v in it))},
5833 'ie_pref': {'priority': True, 'type': 'extractor'},
5834 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
5835 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
5836 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
5837 'quality': {'convert': 'float', 'default': -1},
5838 'filesize': {'convert': 'bytes'},
5839 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
5840 'id': {'convert': 'string', 'field': 'format_id'},
5841 'height': {'convert': 'float_none'},
5842 'width': {'convert': 'float_none'},
5843 'fps': {'convert': 'float_none'},
5844 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
5845 'tbr': {'convert': 'float_none'},
5846 'vbr': {'convert': 'float_none'},
5847 'abr': {'convert': 'float_none'},
5848 'asr': {'convert': 'float_none'},
5849 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
5850
5851 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
5852 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
5853 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
5854 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
5855 'res': {'type': 'multiple', 'field': ('height', 'width'),
5856 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
5857
5858 # Actual field names
5859 'format_id': {'type': 'alias', 'field': 'id'},
5860 'preference': {'type': 'alias', 'field': 'ie_pref'},
5861 'language_preference': {'type': 'alias', 'field': 'lang'},
5862 'source_preference': {'type': 'alias', 'field': 'source'},
5863 'protocol': {'type': 'alias', 'field': 'proto'},
5864 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
5865 'audio_channels': {'type': 'alias', 'field': 'channels'},
5866
5867 # Deprecated
5868 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
5869 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
5870 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
5871 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
5872 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
5873 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
5874 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
5875 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
5876 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
5877 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
5878 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
5879 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
5880 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
5881 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
5882 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5883 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5884 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5885 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5886 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5887 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5888 }
5889
5890 def __init__(self, ydl, field_preference):
5891 self.ydl = ydl
5892 self._order = []
5893 self.evaluate_params(self.ydl.params, field_preference)
5894 if ydl.params.get('verbose'):
5895 self.print_verbose_info(self.ydl.write_debug)
5896
5897 def _get_field_setting(self, field, key):
5898 if field not in self.settings:
5899 if key in ('forced', 'priority'):
5900 return False
5901 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
5902 'deprecated and may be removed in a future version')
5903 self.settings[field] = {}
5904 propObj = self.settings[field]
5905 if key not in propObj:
5906 type = propObj.get('type')
5907 if key == 'field':
5908 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
5909 elif key == 'convert':
5910 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
5911 else:
5912 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
5913 propObj[key] = default
5914 return propObj[key]
5915
5916 def _resolve_field_value(self, field, value, convertNone=False):
5917 if value is None:
5918 if not convertNone:
5919 return None
5920 else:
5921 value = value.lower()
5922 conversion = self._get_field_setting(field, 'convert')
5923 if conversion == 'ignore':
5924 return None
5925 if conversion == 'string':
5926 return value
5927 elif conversion == 'float_none':
5928 return float_or_none(value)
5929 elif conversion == 'bytes':
5930 return parse_bytes(value)
5931 elif conversion == 'order':
5932 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
5933 use_regex = self._get_field_setting(field, 'regex')
5934 list_length = len(order_list)
5935 empty_pos = order_list.index('') if '' in order_list else list_length + 1
5936 if use_regex and value is not None:
5937 for i, regex in enumerate(order_list):
5938 if regex and re.match(regex, value):
5939 return list_length - i
5940 return list_length - empty_pos # not in list
5941 else: # not regex or value = None
5942 return list_length - (order_list.index(value) if value in order_list else empty_pos)
5943 else:
5944 if value.isnumeric():
5945 return float(value)
5946 else:
5947 self.settings[field]['convert'] = 'string'
5948 return value
5949
5950 def evaluate_params(self, params, sort_extractor):
5951 self._use_free_order = params.get('prefer_free_formats', False)
5952 self._sort_user = params.get('format_sort', [])
5953 self._sort_extractor = sort_extractor
5954
5955 def add_item(field, reverse, closest, limit_text):
5956 field = field.lower()
5957 if field in self._order:
5958 return
5959 self._order.append(field)
5960 limit = self._resolve_field_value(field, limit_text)
5961 data = {
5962 'reverse': reverse,
5963 'closest': False if limit is None else closest,
5964 'limit_text': limit_text,
5965 'limit': limit}
5966 if field in self.settings:
5967 self.settings[field].update(data)
5968 else:
5969 self.settings[field] = data
5970
5971 sort_list = (
5972 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5973 + (tuple() if params.get('format_sort_force', False)
5974 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5975 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5976
5977 for item in sort_list:
5978 match = re.match(self.regex, item)
5979 if match is None:
5980 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
5981 field = match.group('field')
5982 if field is None:
5983 continue
5984 if self._get_field_setting(field, 'type') == 'alias':
5985 alias, field = field, self._get_field_setting(field, 'field')
5986 if self._get_field_setting(alias, 'deprecated'):
5987 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5988 f'be removed in a future version. Please use {field} instead')
5989 reverse = match.group('reverse') is not None
5990 closest = match.group('separator') == '~'
5991 limit_text = match.group('limit')
5992
5993 has_limit = limit_text is not None
5994 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5995 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5996
5997 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
5998 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
5999 limit_count = len(limits)
6000 for (i, f) in enumerate(fields):
6001 add_item(f, reverse, closest,
6002 limits[i] if i < limit_count
6003 else limits[0] if has_limit and not has_multiple_limits
6004 else None)
6005
6006 def print_verbose_info(self, write_debug):
6007 if self._sort_user:
6008 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
6009 if self._sort_extractor:
6010 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
6011 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
6012 '+' if self._get_field_setting(field, 'reverse') else '', field,
6013 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
6014 self._get_field_setting(field, 'limit_text'),
6015 self._get_field_setting(field, 'limit'))
6016 if self._get_field_setting(field, 'limit_text') is not None else '')
6017 for field in self._order if self._get_field_setting(field, 'visible')]))
6018
6019 def _calculate_field_preference_from_value(self, format, field, type, value):
6020 reverse = self._get_field_setting(field, 'reverse')
6021 closest = self._get_field_setting(field, 'closest')
6022 limit = self._get_field_setting(field, 'limit')
6023
6024 if type == 'extractor':
6025 maximum = self._get_field_setting(field, 'max')
6026 if value is None or (maximum is not None and value >= maximum):
6027 value = -1
6028 elif type == 'boolean':
6029 in_list = self._get_field_setting(field, 'in_list')
6030 not_in_list = self._get_field_setting(field, 'not_in_list')
6031 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
6032 elif type == 'ordered':
6033 value = self._resolve_field_value(field, value, True)
6034
6035 # try to convert to number
6036 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
6037 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
6038 if is_num:
6039 value = val_num
6040
6041 return ((-10, 0) if value is None
6042 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
6043 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
6044 else (0, value, 0) if not reverse and (limit is None or value <= limit)
6045 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
6046 else (-1, value, 0))
6047
6048 def _calculate_field_preference(self, format, field):
6049 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
6050 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
6051 if type == 'multiple':
6052 type = 'field' # Only 'field' is allowed in multiple for now
6053 actual_fields = self._get_field_setting(field, 'field')
6054
6055 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
6056 else:
6057 value = get_value(field)
6058 return self._calculate_field_preference_from_value(format, field, type, value)
6059
6060 def calculate_preference(self, format):
6061 # Determine missing protocol
6062 if not format.get('protocol'):
6063 format['protocol'] = determine_protocol(format)
6064
6065 # Determine missing ext
6066 if not format.get('ext') and 'url' in format:
6067 format['ext'] = determine_ext(format['url'])
6068 if format.get('vcodec') == 'none':
6069 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
6070 format['video_ext'] = 'none'
6071 else:
6072 format['video_ext'] = format['ext']
6073 format['audio_ext'] = 'none'
6074 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
6075 # format['preference'] = -1000
6076
5424dbaf
L
6077 if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
6078 # HEVC-over-FLV is out of spec per FLV's original specification
6079 # ref. https://trac.ffmpeg.org/ticket/6389
6080 # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
6081 format['preference'] = -100
6082
d0d74b71 6083 # Determine missing bitrates
6084 if format.get('tbr') is None:
6085 if format.get('vbr') is not None and format.get('abr') is not None:
6086 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
6087 else:
6088 if format.get('vcodec') != 'none' and format.get('vbr') is None:
6089 format['vbr'] = format.get('tbr') - format.get('abr', 0)
6090 if format.get('acodec') != 'none' and format.get('abr') is None:
6091 format['abr'] = format.get('tbr') - format.get('vbr', 0)
6092
6093 return tuple(self._calculate_field_preference(format, field) for field in self._order)