import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import collections.abc
import contextlib
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import importlib.util
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import unicodedata
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)

SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'

@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref

def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise

def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)


# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

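# NOTE (illustrative sketch, not part of the upstream file): xpath_with_ns expands a
# prefix-qualified path using the supplied namespace map; the URI below is made up.
#   xpath_with_ns('ns:videos/ns:video', {'ns': 'http://example.com/ns'})
#       -> '{http://example.com/ns}videos/{http://example.com/ns}video'
# The result is the Clark-notation form accepted by xml.etree.ElementTree's find()/findall().
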
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]

def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document as a list"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document as a list"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    if not value:
        return

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )

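# NOTE (illustrative sketch, not part of the upstream file): for markup such as
# '<span class="foo bar">Hello</span>', get_element_by_class('foo', html) is expected
# to return 'Hello', while get_element_html_by_class('foo', html) returns the whole
# '<span class="foo bar">Hello</span>' element; the *_by_attribute variants behave the
# same way for arbitrary attributes.
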
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()

# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')

class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        raise compat_HTMLParseError('done')


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items

def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()


class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        try:
            if self.ignore_extra:
                return self.raw_decode(s.lstrip())[0]
            return super().decode(s)
        except json.JSONDecodeError as e:
            if e.pos is not None:
                raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
            raise

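# NOTE (illustrative sketch, not part of the upstream file): LenientJSONDecoder is
# meant to be passed to json.loads() via its `cls` argument, e.g.
#   json.loads('{"a": 1} trailing garbage', cls=LenientJSONDecoder, ignore_extra=True)
# should return {'a': 1}, since ignore_extra decodes only the leading JSON value,
# while transform_source can pre-process the string before decoding.
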
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp

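# NOTE (illustrative sketch, not part of the upstream file): timeconvert parses
# RFC 2822 dates, e.g.
#   timeconvert('Sun, 06 Nov 1994 08:49:37 GMT')  ->  784111777
# and returns None when the string cannot be parsed.
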
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

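# NOTE (illustrative sketch, not part of the upstream file): in restricted mode,
# path separators and whitespace collapse to underscores, e.g.
#   sanitize_filename('abc/de', restricted=True)  ->  'abc_de'
# whereas the default mode substitutes look-alike full-width characters instead.
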
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)

def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url

def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'

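# NOTE (illustrative sketch, not part of the upstream file): extract_basic_auth moves
# inline credentials into an Authorization header value, e.g.
#   extract_basic_auth('https://user:pass@example.com/feed')
#       -> ('https://example.com/feed', 'Basic dXNlcjpwYXNz')
# where 'dXNlcjpwYXNz' is base64('user:pass'); URLs without credentials pass through
# unchanged with a None header.
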
def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())

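# NOTE (illustrative sketch, not part of the upstream file):
#   orderedSet([1, 2, 1, 3, 2])  ->  [1, 2, 3]
# With lazy=True a generator is returned instead of a list; first-seen order is kept,
# and a list (not a set) is used internally so unhashable items are also supported.
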
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )

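# NOTE (illustrative sketch, not part of the upstream file):
#   unescapeHTML('&amp;')   ->  '&'
#   unescapeHTML('&#233;')  ->  'é'
#   escapeHTML('<a href="x">')  ->  '&lt;a href=&quot;x&quot;&gt;'
# Unknown entities are left as their literal '&name;' representation.
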
def process_communicate_or_kill(p, *args, **kwargs):
    deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        with cls(*args, **kwargs) as proc:
            default = '' if proc.text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode

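# NOTE (illustrative sketch, not part of the upstream file): Popen.run is the usual
# entry point, e.g.
#   stdout, stderr, retcode = Popen.run(
#       ['ffmpeg', '-version'], text=True,
#       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# The process is killed if communicate() is interrupted, and empty output defaults
# to '' (text mode) or b'' (binary mode).
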
def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret

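# NOTE (illustrative sketch, not part of the upstream file):
#   timetuple_from_msec(3750123)  ->  Time(hours=1, minutes=2, seconds=30, milliseconds=123)
#   formatSeconds(3661)           ->  '1:01:01'
#   formatSeconds(75, msec=True)  ->  '1:15.000'
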
def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)

def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg

class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property

class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)

class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass

def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc

def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers

class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response

def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection

class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


def is_path_like(f):
    return isinstance(f, (str, bytes, os.PathLike))

ac668111 1549class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
f1a8511f
S
1550 """
1551 See [1] for cookie file format.
1552
1553 1. https://curl.haxx.se/docs/http-cookies.html
1554 """
e7e62441 1555 _HTTPONLY_PREFIX = '#HttpOnly_'
c380cc28
S
1556 _ENTRY_LEN = 7
1557 _HEADER = '''# Netscape HTTP Cookie File
7a5c1cfe 1558# This file is generated by yt-dlp. Do not edit.
c380cc28
S
1559
1560'''
1561 _CookieFileEntry = collections.namedtuple(
1562 'CookieFileEntry',
1563 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
e7e62441 1564
d76fa1f3 1565 def __init__(self, filename=None, *args, **kwargs):
1566 super().__init__(None, *args, **kwargs)
941e881e 1567 if is_path_like(filename):
d76fa1f3 1568 filename = os.fspath(filename)
1569 self.filename = filename
1570
24146491 1571 @staticmethod
1572 def _true_or_false(cndn):
1573 return 'TRUE' if cndn else 'FALSE'
1574
d76fa1f3 1575 @contextlib.contextmanager
1576 def open(self, file, *, write=False):
941e881e 1577 if is_path_like(file):
d76fa1f3 1578 with open(file, 'w' if write else 'r', encoding='utf-8') as f:
1579 yield f
1580 else:
1581 if write:
1582 file.truncate(0)
1583 yield file
1584
24146491 1585 def _really_save(self, f, ignore_discard=False, ignore_expires=False):
1586 now = time.time()
1587 for cookie in self:
1588 if (not ignore_discard and cookie.discard
1589 or not ignore_expires and cookie.is_expired(now)):
1590 continue
1591 name, value = cookie.name, cookie.value
1592 if value is None:
1593 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1594 # with no name, whereas http.cookiejar regards it as a
1595 # cookie with no value.
1596 name, value = '', name
1597 f.write('%s\n' % '\t'.join((
1598 cookie.domain,
1599 self._true_or_false(cookie.domain.startswith('.')),
1600 cookie.path,
1601 self._true_or_false(cookie.secure),
1602 str_or_none(cookie.expires, default=''),
1603 name, value
1604 )))
1605
1606 def save(self, filename=None, *args, **kwargs):
c380cc28
S
1607 """
1608 Save cookies to a file.
24146491 1609 Code is taken from CPython 3.6
1610 https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
c380cc28 1611
c380cc28
S
1612 if filename is None:
1613 if self.filename is not None:
1614 filename = self.filename
1615 else:
ac668111 1616 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
c380cc28 1617
24146491 1618 # Store session cookies with `expires` set to 0 instead of an empty string
1bab3437
S
1619 for cookie in self:
1620 if cookie.expires is None:
1621 cookie.expires = 0
c380cc28 1622
d76fa1f3 1623 with self.open(filename, write=True) as f:
c380cc28 1624 f.write(self._HEADER)
24146491 1625 self._really_save(f, *args, **kwargs)
1bab3437
S
1626
1627 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
e7e62441 1628 """Load cookies from a file."""
1629 if filename is None:
1630 if self.filename is not None:
1631 filename = self.filename
1632 else:
ac668111 1633 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
e7e62441 1634
c380cc28
S
1635 def prepare_line(line):
1636 if line.startswith(self._HTTPONLY_PREFIX):
1637 line = line[len(self._HTTPONLY_PREFIX):]
1638 # comments and empty lines are fine
1639 if line.startswith('#') or not line.strip():
1640 return line
1641 cookie_list = line.split('\t')
1642 if len(cookie_list) != self._ENTRY_LEN:
ac668111 1643 raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
c380cc28
S
1644 cookie = self._CookieFileEntry(*cookie_list)
1645 if cookie.expires_at and not cookie.expires_at.isdigit():
ac668111 1646 raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
c380cc28
S
1647 return line
1648
e7e62441 1649 cf = io.StringIO()
d76fa1f3 1650 with self.open(filename) as f:
e7e62441 1651 for line in f:
c380cc28
S
1652 try:
1653 cf.write(prepare_line(line))
ac668111 1654 except http.cookiejar.LoadError as e:
94aa0644 1655 if f'{line.strip()} '[0] in '[{"':
ac668111 1656 raise http.cookiejar.LoadError(
94aa0644 1657 'Cookies file must be Netscape formatted, not JSON. See '
17ffed18 1658 'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
19a03940 1659 write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
c380cc28 1660 continue
e7e62441 1661 cf.seek(0)
1662 self._really_load(cf, filename, ignore_discard, ignore_expires)
1bab3437
S
1663 # Session cookies are denoted by either `expires` field set to
1664 # an empty string or 0. MozillaCookieJar only recognizes the former
1665 # (see [1]). So we need to force the latter to be recognized as session
1666 # cookies on our own.
1667 # Session cookies may be important for cookies-based authentication,
1668 # e.g. usually, when the user does not check the 'Remember me' check box while
1669 # logging in on a site, some important cookies are stored as session
1670 # cookies, so failing to recognize them will result in a failed login.
1671 # 1. https://bugs.python.org/issue17164
1672 for cookie in self:
1673 # Treat `expires=0` cookies as session cookies
1674 if cookie.expires == 0:
1675 cookie.expires = None
1676 cookie.discard = True
1677
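# Illustrative usage (a minimal sketch; 'cookies.txt' is a hypothetical path):
#   >>> jar = YoutubeDLCookieJar('cookies.txt')
#   >>> jar.load(ignore_discard=True, ignore_expires=True)   # read Netscape-format cookies
#   >>> jar.save(ignore_discard=True, ignore_expires=True)   # write them back, header included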
1678
ac668111 1679class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
a6420bf5 1680 def __init__(self, cookiejar=None):
ac668111 1681 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
a6420bf5
S
1682
1683 def http_response(self, request, response):
ac668111 1684 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
a6420bf5 1685
ac668111 1686 https_request = urllib.request.HTTPCookieProcessor.http_request
a6420bf5
S
1687 https_response = http_response
1688
1689
ac668111 1690class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
201c1459 1691 """YoutubeDL redirect handler
1692
1693 The code is based on HTTPRedirectHandler implementation from CPython [1].
1694
1695 This redirect handler solves two issues:
1696 - ensures redirect URL is always unicode under python 2
1697 - introduces support for experimental HTTP response status code
1698 308 Permanent Redirect [2] used by some sites [3]
1699
1700 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1701 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1702 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1703 """
1704
ac668111 1705 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
201c1459 1706
1707 def redirect_request(self, req, fp, code, msg, headers, newurl):
1708 """Return a Request or None in response to a redirect.
1709
1710 This is called by the http_error_30x methods when a
1711 redirection response is received. If a redirection should
1712 take place, return a new Request to allow http_error_30x to
1713 perform the redirect. Otherwise, raise HTTPError if no-one
1714 else should try to handle this url. Return None if you can't
1715 but another Handler might.
1716 """
1717 m = req.get_method()
1718 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1719 or code in (301, 302, 303) and m == "POST")):
14f25df2 1720 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
201c1459 1721 # Strictly (according to RFC 2616), 301 or 302 in response to
1722 # a POST MUST NOT cause a redirection without confirmation
1723 # from the user (of urllib.request, in this case). In practice,
1724 # essentially all clients do redirect in this case, so we do
1725 # the same.
1726
201c1459 1727 # Be conciliant with URIs containing a space. This is mainly
1728 # redundant with the more complete encoding done in http_error_302(),
1729 # but it is kept for compatibility with other callers.
1730 newurl = newurl.replace(' ', '%20')
1731
1732 CONTENT_HEADERS = ("content-length", "content-type")
1733 # Strip Content-Length/Content-Type so they are not carried over to the new request
86e5f3ed 1734 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
afac4caa 1735
1736 # A 303 must either use GET or HEAD for subsequent request
1737 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1738 if code == 303 and m != 'HEAD':
1739 m = 'GET'
1740 # 301 and 302 redirects are commonly turned into a GET from a POST
1741 # for subsequent requests by browsers, so we'll do the same.
1742 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1743 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1744 if code in (301, 302) and m == 'POST':
1745 m = 'GET'
1746
ac668111 1747 return urllib.request.Request(
201c1459 1748 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
afac4caa 1749 unverifiable=True, method=m)
fca6dba8
S
1750
1751
46f59e89
S
1752def extract_timezone(date_str):
1753 m = re.search(
f137e4c2 1754 r'''(?x)
1755 ^.{8,}? # >=8 char non-TZ prefix, if present
1756 (?P<tz>Z| # just the UTC Z, or
1757 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1758 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1759 [ ]? # optional space
1760 (?P<sign>\+|-) # +/-
1761 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1762 $)
1763 ''', date_str)
46f59e89 1764 if not m:
8f53dc44 1765 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1766 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1767 if timezone is not None:
1768 date_str = date_str[:-len(m.group('tz'))]
1769 timezone = datetime.timedelta(hours=timezone or 0)
46f59e89
S
1770 else:
1771 date_str = date_str[:-len(m.group('tz'))]
1772 if not m.group('sign'):
1773 timezone = datetime.timedelta()
1774 else:
1775 sign = 1 if m.group('sign') == '+' else -1
1776 timezone = datetime.timedelta(
1777 hours=sign * int(m.group('hours')),
1778 minutes=sign * int(m.group('minutes')))
1779 return timezone, date_str
1780
1781
08b38d54 1782def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1783 """ Return a UNIX timestamp from the given date """
1784
1785 if date_str is None:
1786 return None
1787
52c3a6e4
S
1788 date_str = re.sub(r'\.[0-9]+', '', date_str)
1789
08b38d54 1790 if timezone is None:
46f59e89
S
1791 timezone, date_str = extract_timezone(date_str)
1792
19a03940 1793 with contextlib.suppress(ValueError):
86e5f3ed 1794 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
52c3a6e4
S
1795 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1796 return calendar.timegm(dt.timetuple())
912b38b4
PH
1797
1798
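# Illustrative usage (a quick sketch; both timestamps denote the same instant):
#   >>> parse_iso8601('2014-03-23T23:04:26+0100')
#   1395612266
#   >>> parse_iso8601('2014-03-23T22:04:26Z')
#   1395612266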
46f59e89
S
1799def date_formats(day_first=True):
1800 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1801
1802
42bdd9d0 1803def unified_strdate(date_str, day_first=True):
bf50b038 1804 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1805
1806 if date_str is None:
1807 return None
bf50b038 1808 upload_date = None
5f6a1245 1809 # Replace commas
026fcc04 1810 date_str = date_str.replace(',', ' ')
42bdd9d0 1811 # Remove AM/PM + timezone
9bb8e0a3 1812 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1813 _, date_str = extract_timezone(date_str)
42bdd9d0 1814
46f59e89 1815 for expression in date_formats(day_first):
19a03940 1816 with contextlib.suppress(ValueError):
bf50b038 1817 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
42393ce2
PH
1818 if upload_date is None:
1819 timetuple = email.utils.parsedate_tz(date_str)
1820 if timetuple:
19a03940 1821 with contextlib.suppress(ValueError):
c6b9cf05 1822 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
6a750402 1823 if upload_date is not None:
14f25df2 1824 return str(upload_date)
bf50b038 1825
5f6a1245 1826
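# Illustrative usage (a quick sketch of the normalization performed above):
#   >>> unified_strdate('December 21, 2010')
#   '20101221'
#   >>> unified_strdate('8/7/2009')          # day_first=True by default
#   '20090708'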
46f59e89
S
1827def unified_timestamp(date_str, day_first=True):
1828 if date_str is None:
1829 return None
1830
8f53dc44 1831 date_str = re.sub(r'\s+', ' ', re.sub(
1832 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
46f59e89 1833
7dc2a74e 1834 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1835 timezone, date_str = extract_timezone(date_str)
1836
1837 # Remove AM/PM + timezone
1838 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1839
deef3195
S
1840 # Remove unrecognized timezones from ISO 8601-like timestamps
1841 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1842 if m:
1843 date_str = date_str[:-len(m.group('tz'))]
1844
f226880c
PH
1845 # Python only supports microseconds, so remove nanoseconds
1846 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1847 if m:
1848 date_str = m.group(1)
1849
46f59e89 1850 for expression in date_formats(day_first):
19a03940 1851 with contextlib.suppress(ValueError):
7dc2a74e 1852 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89 1853 return calendar.timegm(dt.timetuple())
8f53dc44 1854
46f59e89
S
1855 timetuple = email.utils.parsedate_tz(date_str)
1856 if timetuple:
8f53dc44 1857 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
46f59e89
S
1858
1859
28e614de 1860def determine_ext(url, default_ext='unknown_video'):
85750f89 1861 if url is None or '.' not in url:
f4776371 1862 return default_ext
9cb9a5df 1863 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1864 if re.match(r'^[A-Za-z0-9]+$', guess):
1865 return guess
a7aaa398
S
1866 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1867 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1868 return guess.rstrip('/')
73e79f2a 1869 else:
cbdbb766 1870 return default_ext
73e79f2a 1871
5f6a1245 1872
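# Illustrative usage (a quick sketch; the URLs are hypothetical):
#   >>> determine_ext('http://example.com/video.webm')
#   'webm'
#   >>> determine_ext('http://example.com/foo/bar.mp4/?download')
#   'mp4'
#   >>> determine_ext('http://example.com/stream')
#   'unknown_video'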
824fa511
S
1873def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1874 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
d4051a8e 1875
5f6a1245 1876
9e62f283 1877def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
3d38b2d6 1878 R"""
1879 Return a datetime object from a string.
1880 Supported format:
1881 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1882
1883 @param format strftime format of DATE
1884 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1885 auto: round to the unit provided in date_str (if applicable).
9e62f283 1886 """
1887 auto_precision = False
1888 if precision == 'auto':
1889 auto_precision = True
1890 precision = 'microsecond'
396a76f7 1891 today = datetime_round(datetime.datetime.utcnow(), precision)
f8795e10 1892 if date_str in ('now', 'today'):
37254abc 1893 return today
f8795e10
PH
1894 if date_str == 'yesterday':
1895 return today - datetime.timedelta(days=1)
9e62f283 1896 match = re.match(
3d38b2d6 1897 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
9e62f283 1898 date_str)
37254abc 1899 if match is not None:
9e62f283 1900 start_time = datetime_from_str(match.group('start'), precision, format)
1901 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
37254abc 1902 unit = match.group('unit')
9e62f283 1903 if unit == 'month' or unit == 'year':
1904 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
37254abc 1905 unit = 'day'
9e62f283 1906 else:
1907 if unit == 'week':
1908 unit = 'day'
1909 time *= 7
1910 delta = datetime.timedelta(**{unit + 's': time})
1911 new_date = start_time + delta
1912 if auto_precision:
1913 return datetime_round(new_date, unit)
1914 return new_date
1915
1916 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1917
1918
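# Illustrative usage (a quick sketch of the relative-date grammar described above):
#   >>> datetime_from_str('now-1week')                  # one week ago, rounded to day precision
#   >>> datetime_from_str('today+2months', precision='day')
#   >>> datetime_from_str('20230115', format='%Y%m%d')  # plain DATE in the given strftime format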
d49f8db3 1919def date_from_str(date_str, format='%Y%m%d', strict=False):
3d38b2d6 1920 R"""
1921 Return a date object from a string using datetime_from_str
9e62f283 1922
3d38b2d6 1923 @param strict Restrict allowed patterns to "YYYYMMDD" and
1924 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
9e62f283 1925 """
3d38b2d6 1926 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1927 raise ValueError(f'Invalid date format "{date_str}"')
9e62f283 1928 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1929
1930
1931def datetime_add_months(dt, months):
1932 """Increment/Decrement a datetime object by months."""
1933 month = dt.month + months - 1
1934 year = dt.year + month // 12
1935 month = month % 12 + 1
1936 day = min(dt.day, calendar.monthrange(year, month)[1])
1937 return dt.replace(year, month, day)
1938
1939
1940def datetime_round(dt, precision='day'):
1941 """
1942 Round a datetime object's time to a specific precision
1943 """
1944 if precision == 'microsecond':
1945 return dt
1946
1947 unit_seconds = {
1948 'day': 86400,
1949 'hour': 3600,
1950 'minute': 60,
1951 'second': 1,
1952 }
1953 roundto = lambda x, n: ((x + n / 2) // n) * n
1954 timestamp = calendar.timegm(dt.timetuple())
1955 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
5f6a1245
JW
1956
1957
e63fc1be 1958def hyphenate_date(date_str):
1959 """
1960 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1961 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1962 if match is not None:
1963 return '-'.join(match.groups())
1964 else:
1965 return date_str
1966
5f6a1245 1967
86e5f3ed 1968class DateRange:
bd558525 1969 """Represents a time interval between two dates"""
5f6a1245 1970
bd558525
JMF
1971 def __init__(self, start=None, end=None):
1972 """start and end must be strings in the format accepted by date"""
1973 if start is not None:
d49f8db3 1974 self.start = date_from_str(start, strict=True)
bd558525
JMF
1975 else:
1976 self.start = datetime.datetime.min.date()
1977 if end is not None:
d49f8db3 1978 self.end = date_from_str(end, strict=True)
bd558525
JMF
1979 else:
1980 self.end = datetime.datetime.max.date()
37254abc 1981 if self.start > self.end:
bd558525 1982 raise ValueError('Date range: "%s", the start date must be before the end date' % self)
5f6a1245 1983
bd558525
JMF
1984 @classmethod
1985 def day(cls, day):
1986 """Returns a range that only contains the given day"""
5f6a1245
JW
1987 return cls(day, day)
1988
bd558525
JMF
1989 def __contains__(self, date):
1990 """Check if the date is in the range"""
37254abc
JMF
1991 if not isinstance(date, datetime.date):
1992 date = date_from_str(date)
1993 return self.start <= date <= self.end
5f6a1245 1994
bd558525 1995 def __str__(self):
86e5f3ed 1996 return f'{self.start.isoformat()} - {self.end.isoformat()}'
c496ca96 1997
f2df4071 1998 def __eq__(self, other):
1999 return (isinstance(other, DateRange)
2000 and self.start == other.start and self.end == other.end)
2001
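# Illustrative usage (a quick sketch; the dates are arbitrary):
#   >>> '20200315' in DateRange('20200101', '20200630')
#   True
#   >>> '20210101' in DateRange(start='20200701')       # open-ended ranges are allowed
#   True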
c496ca96
PH
2002
2003def platform_name():
14f25df2 2004 """ Returns the platform name as a str """
da4db748 2005 deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead')
b1f94422 2006 return platform.platform()
c496ca96 2007
b1f94422 2008
2009@functools.cache
2010def system_identifier():
2011 python_implementation = platform.python_implementation()
2012 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
2013 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
dab284f8 2014 libc_ver = []
2015 with contextlib.suppress(OSError): # We may not have access to the executable
2016 libc_ver = platform.libc_ver()
b1f94422 2017
17fc3dc4 2018 return 'Python %s (%s %s %s) - %s (%s%s)' % (
b1f94422 2019 platform.python_version(),
2020 python_implementation,
17fc3dc4 2021 platform.machine(),
b1f94422 2022 platform.architecture()[0],
2023 platform.platform(),
5b9f253f
M
2024 ssl.OPENSSL_VERSION,
2025 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
b1f94422 2026 )
c257baff
PH
2027
2028
0b9c08b4 2029@functools.cache
49fa4d9a 2030def get_windows_version():
8a82af35 2031 ''' Get Windows version. Returns () if it's not running on Windows '''
49fa4d9a
N
2032 if compat_os_name == 'nt':
2033 return version_tuple(platform.win32_ver()[1])
2034 else:
8a82af35 2035 return ()
49fa4d9a
N
2036
2037
734f90bb 2038def write_string(s, out=None, encoding=None):
19a03940 2039 assert isinstance(s, str)
2040 out = out or sys.stderr
7459e3a2 2041
fe1daad3 2042 if compat_os_name == 'nt' and supports_terminal_sequences(out):
3fe75fdc 2043 s = re.sub(r'([\r\n]+)', r' \1', s)
59f943cd 2044
8a82af35 2045 enc, buffer = None, out
cfb0511d 2046 if 'b' in getattr(out, 'mode', ''):
c487cf00 2047 enc = encoding or preferredencoding()
104aa738 2048 elif hasattr(out, 'buffer'):
8a82af35 2049 buffer = out.buffer
104aa738 2050 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
c487cf00 2051
8a82af35 2052 buffer.write(s.encode(enc, 'ignore') if enc else s)
7459e3a2
PH
2053 out.flush()
2054
2055
da4db748 2056def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
2057 from . import _IN_CLI
2058 if _IN_CLI:
2059 if msg in deprecation_warning._cache:
2060 return
2061 deprecation_warning._cache.add(msg)
2062 if printer:
2063 return printer(f'{msg}{bug_reports_message()}', **kwargs)
2064 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
2065 else:
2066 import warnings
2067 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
2068
2069
2070deprecation_warning._cache = set()
2071
2072
48ea9cea
PH
2073def bytes_to_intlist(bs):
2074 if not bs:
2075 return []
2076 if isinstance(bs[0], int): # Python 3
2077 return list(bs)
2078 else:
2079 return [ord(c) for c in bs]
2080
c257baff 2081
cba892fa 2082def intlist_to_bytes(xs):
2083 if not xs:
2084 return b''
ac668111 2085 return struct.pack('%dB' % len(xs), *xs)
c38b1e77
PH
2086
2087
8a82af35 2088class LockingUnsupportedError(OSError):
1890fc63 2089 msg = 'File locking is not supported'
0edb3e33 2090
2091 def __init__(self):
2092 super().__init__(self.msg)
2093
2094
c1c9a79c
PH
2095# Cross-platform file locking
2096if sys.platform == 'win32':
fe0918bb 2097 import ctypes
c1c9a79c
PH
2098 import ctypes.wintypes
2099 import msvcrt
2100
2101 class OVERLAPPED(ctypes.Structure):
2102 _fields_ = [
2103 ('Internal', ctypes.wintypes.LPVOID),
2104 ('InternalHigh', ctypes.wintypes.LPVOID),
2105 ('Offset', ctypes.wintypes.DWORD),
2106 ('OffsetHigh', ctypes.wintypes.DWORD),
2107 ('hEvent', ctypes.wintypes.HANDLE),
2108 ]
2109
2110 kernel32 = ctypes.windll.kernel32
2111 LockFileEx = kernel32.LockFileEx
2112 LockFileEx.argtypes = [
2113 ctypes.wintypes.HANDLE, # hFile
2114 ctypes.wintypes.DWORD, # dwFlags
2115 ctypes.wintypes.DWORD, # dwReserved
2116 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2117 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2118 ctypes.POINTER(OVERLAPPED) # Overlapped
2119 ]
2120 LockFileEx.restype = ctypes.wintypes.BOOL
2121 UnlockFileEx = kernel32.UnlockFileEx
2122 UnlockFileEx.argtypes = [
2123 ctypes.wintypes.HANDLE, # hFile
2124 ctypes.wintypes.DWORD, # dwReserved
2125 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2126 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2127 ctypes.POINTER(OVERLAPPED) # Overlapped
2128 ]
2129 UnlockFileEx.restype = ctypes.wintypes.BOOL
2130 whole_low = 0xffffffff
2131 whole_high = 0x7fffffff
2132
747c0bd1 2133 def _lock_file(f, exclusive, block):
c1c9a79c
PH
2134 overlapped = OVERLAPPED()
2135 overlapped.Offset = 0
2136 overlapped.OffsetHigh = 0
2137 overlapped.hEvent = 0
2138 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
747c0bd1 2139
2140 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2141 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2142 0, whole_low, whole_high, f._lock_file_overlapped_p):
2cb19820 2143 # NB: The no-argument form of "ctypes.FormatError" does not work on PyPy
2144 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
c1c9a79c
PH
2145
2146 def _unlock_file(f):
2147 assert f._lock_file_overlapped_p
2148 handle = msvcrt.get_osfhandle(f.fileno())
747c0bd1 2149 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
c1c9a79c
PH
2150 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2151
2152else:
399a76e6
YCH
2153 try:
2154 import fcntl
c1c9a79c 2155
a3125791 2156 def _lock_file(f, exclusive, block):
b63837bc 2157 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2158 if not block:
2159 flags |= fcntl.LOCK_NB
acea8d7c 2160 try:
b63837bc 2161 fcntl.flock(f, flags)
acea8d7c
JK
2162 except BlockingIOError:
2163 raise
2164 except OSError: # AOSP does not have flock()
b63837bc 2165 fcntl.lockf(f, flags)
c1c9a79c 2166
399a76e6 2167 def _unlock_file(f):
acea8d7c
JK
2168 try:
2169 fcntl.flock(f, fcntl.LOCK_UN)
2170 except OSError:
2171 fcntl.lockf(f, fcntl.LOCK_UN)
a3125791 2172
399a76e6 2173 except ImportError:
399a76e6 2174
a3125791 2175 def _lock_file(f, exclusive, block):
0edb3e33 2176 raise LockingUnsupportedError()
399a76e6
YCH
2177
2178 def _unlock_file(f):
0edb3e33 2179 raise LockingUnsupportedError()
c1c9a79c
PH
2180
2181
86e5f3ed 2182class locked_file:
0edb3e33 2183 locked = False
747c0bd1 2184
a3125791 2185 def __init__(self, filename, mode, block=True, encoding=None):
fcfa8853
JK
2186 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2187 raise NotImplementedError(mode)
2188 self.mode, self.block = mode, block
2189
2190 writable = any(f in mode for f in 'wax+')
2191 readable = any(f in mode for f in 'r+')
2192 flags = functools.reduce(operator.ior, (
2193 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2194 getattr(os, 'O_BINARY', 0), # Windows only
2195 getattr(os, 'O_NOINHERIT', 0), # Windows only
2196 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2197 os.O_APPEND if 'a' in mode else 0,
2198 os.O_EXCL if 'x' in mode else 0,
2199 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2200 ))
2201
98804d03 2202 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
c1c9a79c
PH
2203
2204 def __enter__(self):
a3125791 2205 exclusive = 'r' not in self.mode
c1c9a79c 2206 try:
a3125791 2207 _lock_file(self.f, exclusive, self.block)
0edb3e33 2208 self.locked = True
86e5f3ed 2209 except OSError:
c1c9a79c
PH
2210 self.f.close()
2211 raise
fcfa8853 2212 if 'w' in self.mode:
131e14dc
JK
2213 try:
2214 self.f.truncate()
2215 except OSError as e:
1890fc63 2216 if e.errno not in (
2217 errno.ESPIPE, # Illegal seek - expected for FIFO
2218 errno.EINVAL, # Invalid argument - expected for /dev/null
2219 ):
2220 raise
c1c9a79c
PH
2221 return self
2222
0edb3e33 2223 def unlock(self):
2224 if not self.locked:
2225 return
c1c9a79c 2226 try:
0edb3e33 2227 _unlock_file(self.f)
c1c9a79c 2228 finally:
0edb3e33 2229 self.locked = False
c1c9a79c 2230
0edb3e33 2231 def __exit__(self, *_):
2232 try:
2233 self.unlock()
2234 finally:
2235 self.f.close()
4eb7f1d1 2236
0edb3e33 2237 open = __enter__
2238 close = __exit__
a3125791 2239
0edb3e33 2240 def __getattr__(self, attr):
2241 return getattr(self.f, attr)
a3125791 2242
0edb3e33 2243 def __iter__(self):
2244 return iter(self.f)
a3125791 2245
4eb7f1d1 2246
0b9c08b4 2247@functools.cache
4644ac55
S
2248def get_filesystem_encoding():
2249 encoding = sys.getfilesystemencoding()
2250 return encoding if encoding is not None else 'utf-8'
2251
2252
4eb7f1d1 2253def shell_quote(args):
a6a173c2 2254 quoted_args = []
4644ac55 2255 encoding = get_filesystem_encoding()
a6a173c2
JMF
2256 for a in args:
2257 if isinstance(a, bytes):
2258 # We may get a filename encoded with 'encodeFilename'
2259 a = a.decode(encoding)
aefce8e6 2260 quoted_args.append(compat_shlex_quote(a))
28e614de 2261 return ' '.join(quoted_args)
9d4660ca
PH
2262
2263
2264def smuggle_url(url, data):
2265 """ Pass additional data in a URL for internal use. """
2266
81953d1a
RA
2267 url, idata = unsmuggle_url(url, {})
2268 data.update(idata)
14f25df2 2269 sdata = urllib.parse.urlencode(
28e614de
PH
2270 {'__youtubedl_smuggle': json.dumps(data)})
2271 return url + '#' + sdata
9d4660ca
PH
2272
2273
79f82953 2274def unsmuggle_url(smug_url, default=None):
83e865a3 2275 if '#__youtubedl_smuggle' not in smug_url:
79f82953 2276 return smug_url, default
28e614de 2277 url, _, sdata = smug_url.rpartition('#')
14f25df2 2278 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
2279 data = json.loads(jsond)
2280 return url, data
02dbf93f
PH
2281
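# Illustrative round trip (a quick sketch; the URL and payload are hypothetical):
#   >>> url = smuggle_url('https://example.com/video', {'referrer': 'https://example.com/'})
#   >>> unsmuggle_url(url)
#   ('https://example.com/video', {'referrer': 'https://example.com/'})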
2282
e0fd9573 2283def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2284 """ Formats numbers with decimal sufixes like K, M, etc """
2285 num, factor = float_or_none(num), float(factor)
4c3f8c3f 2286 if num is None or num < 0:
e0fd9573 2287 return None
eeb2a770 2288 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2289 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2290 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
abbeeebc 2291 if factor == 1024:
2292 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 2293 converted = num / (factor ** exponent)
abbeeebc 2294 return fmt % (converted, suffix)
e0fd9573 2295
2296
02dbf93f 2297def format_bytes(bytes):
f02d24d8 2298 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
f53c966a 2299
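# Illustrative usage (a quick sketch):
#   >>> format_decimal_suffix(1500, '%.1f%s')
#   '1.5k'
#   >>> format_bytes(1536)
#   '1.50KiB'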
1c088fa8 2300
64c464a1 2301def lookup_unit_table(unit_table, s, strict=False):
2302 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
fb47597b 2303 units_re = '|'.join(re.escape(u) for u in unit_table)
64c464a1 2304 m = (re.fullmatch if strict else re.match)(
2305 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
fb47597b
S
2306 if not m:
2307 return None
64c464a1 2308
2309 num = float(m.group('num').replace(',', '.'))
fb47597b 2310 mult = unit_table[m.group('unit')]
64c464a1 2311 return round(num * mult)
2312
2313
2314def parse_bytes(s):
2315 """Parse a string indicating a byte quantity into an integer"""
2316 return lookup_unit_table(
2317 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
2318 s.upper(), strict=True)
fb47597b
S
2319
2320
be64b5b0
PH
2321def parse_filesize(s):
2322 if s is None:
2323 return None
2324
dfb1b146 2325 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
2326 # but we support those too
2327 _UNIT_TABLE = {
2328 'B': 1,
2329 'b': 1,
70852b47 2330 'bytes': 1,
be64b5b0
PH
2331 'KiB': 1024,
2332 'KB': 1000,
2333 'kB': 1024,
2334 'Kb': 1000,
13585d76 2335 'kb': 1000,
70852b47
YCH
2336 'kilobytes': 1000,
2337 'kibibytes': 1024,
be64b5b0
PH
2338 'MiB': 1024 ** 2,
2339 'MB': 1000 ** 2,
2340 'mB': 1024 ** 2,
2341 'Mb': 1000 ** 2,
13585d76 2342 'mb': 1000 ** 2,
70852b47
YCH
2343 'megabytes': 1000 ** 2,
2344 'mebibytes': 1024 ** 2,
be64b5b0
PH
2345 'GiB': 1024 ** 3,
2346 'GB': 1000 ** 3,
2347 'gB': 1024 ** 3,
2348 'Gb': 1000 ** 3,
13585d76 2349 'gb': 1000 ** 3,
70852b47
YCH
2350 'gigabytes': 1000 ** 3,
2351 'gibibytes': 1024 ** 3,
be64b5b0
PH
2352 'TiB': 1024 ** 4,
2353 'TB': 1000 ** 4,
2354 'tB': 1024 ** 4,
2355 'Tb': 1000 ** 4,
13585d76 2356 'tb': 1000 ** 4,
70852b47
YCH
2357 'terabytes': 1000 ** 4,
2358 'tebibytes': 1024 ** 4,
be64b5b0
PH
2359 'PiB': 1024 ** 5,
2360 'PB': 1000 ** 5,
2361 'pB': 1024 ** 5,
2362 'Pb': 1000 ** 5,
13585d76 2363 'pb': 1000 ** 5,
70852b47
YCH
2364 'petabytes': 1000 ** 5,
2365 'pebibytes': 1024 ** 5,
be64b5b0
PH
2366 'EiB': 1024 ** 6,
2367 'EB': 1000 ** 6,
2368 'eB': 1024 ** 6,
2369 'Eb': 1000 ** 6,
13585d76 2370 'eb': 1000 ** 6,
70852b47
YCH
2371 'exabytes': 1000 ** 6,
2372 'exbibytes': 1024 ** 6,
be64b5b0
PH
2373 'ZiB': 1024 ** 7,
2374 'ZB': 1000 ** 7,
2375 'zB': 1024 ** 7,
2376 'Zb': 1000 ** 7,
13585d76 2377 'zb': 1000 ** 7,
70852b47
YCH
2378 'zettabytes': 1000 ** 7,
2379 'zebibytes': 1024 ** 7,
be64b5b0
PH
2380 'YiB': 1024 ** 8,
2381 'YB': 1000 ** 8,
2382 'yB': 1024 ** 8,
2383 'Yb': 1000 ** 8,
13585d76 2384 'yb': 1000 ** 8,
70852b47
YCH
2385 'yottabytes': 1000 ** 8,
2386 'yobibytes': 1024 ** 8,
be64b5b0
PH
2387 }
2388
fb47597b
S
2389 return lookup_unit_table(_UNIT_TABLE, s)
2390
2391
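# Illustrative usage (a quick sketch):
#   >>> parse_filesize('2 MiB')
#   2097152
#   >>> parse_filesize('1.2Tb')
#   1200000000000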
2392def parse_count(s):
2393 if s is None:
be64b5b0
PH
2394 return None
2395
352d5da8 2396 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
2397
2398 if re.match(r'^[\d,.]+$', s):
2399 return str_to_int(s)
2400
2401 _UNIT_TABLE = {
2402 'k': 1000,
2403 'K': 1000,
2404 'm': 1000 ** 2,
2405 'M': 1000 ** 2,
2406 'kk': 1000 ** 2,
2407 'KK': 1000 ** 2,
352d5da8 2408 'b': 1000 ** 3,
2409 'B': 1000 ** 3,
fb47597b 2410 }
be64b5b0 2411
352d5da8 2412 ret = lookup_unit_table(_UNIT_TABLE, s)
2413 if ret is not None:
2414 return ret
2415
2416 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2417 if mobj:
2418 return str_to_int(mobj.group(1))
be64b5b0 2419
2f7ae819 2420
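# Illustrative usage (a quick sketch; the strings mimic typical view counters):
#   >>> parse_count('1.2M')
#   1200000
#   >>> parse_count('123,456 views')
#   123456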
5d45484c 2421def parse_resolution(s, *, lenient=False):
b871d7e9
S
2422 if s is None:
2423 return {}
2424
5d45484c
LNO
2425 if lenient:
2426 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2427 else:
2428 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
2429 if mobj:
2430 return {
2431 'width': int(mobj.group('w')),
2432 'height': int(mobj.group('h')),
2433 }
2434
17ec8bcf 2435 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
2436 if mobj:
2437 return {'height': int(mobj.group(1))}
2438
2439 mobj = re.search(r'\b([48])[kK]\b', s)
2440 if mobj:
2441 return {'height': int(mobj.group(1)) * 540}
2442
2443 return {}
2444
2445
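# Illustrative usage (a quick sketch):
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('720p')
#   {'height': 720}
#   >>> parse_resolution('4k')
#   {'height': 2160}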
0dc41787 2446def parse_bitrate(s):
14f25df2 2447 if not isinstance(s, str):
0dc41787
S
2448 return
2449 mobj = re.search(r'\b(\d+)\s*kbps', s)
2450 if mobj:
2451 return int(mobj.group(1))
2452
2453
a942d6cb 2454def month_by_name(name, lang='en'):
caefb1de
PH
2455 """ Return the number of a month by (locale-independently) English name """
2456
f6717dec 2457 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 2458
caefb1de 2459 try:
f6717dec 2460 return month_names.index(name) + 1
7105440c
YCH
2461 except ValueError:
2462 return None
2463
2464
2465def month_by_abbreviation(abbrev):
2466 """ Return the number of a month by (locale-independently) English
2467 abbreviations """
2468
2469 try:
2470 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
2471 except ValueError:
2472 return None
18258362
JMF
2473
2474
5aafe895 2475def fix_xml_ampersands(xml_str):
18258362 2476 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
2477 return re.sub(
2478 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 2479 '&amp;',
5aafe895 2480 xml_str)
e3946f98
PH
2481
2482
2483def setproctitle(title):
14f25df2 2484 assert isinstance(title, str)
c1c05c67 2485
fe0918bb 2486 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2487 try:
2488 import ctypes
2489 except ImportError:
c1c05c67
YCH
2490 return
2491
e3946f98 2492 try:
611c1dd9 2493 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
2494 except OSError:
2495 return
2f49bcd6
RC
2496 except TypeError:
2497 # LoadLibrary in Windows Python 2.7.13 only expects
2498 # a bytestring, but since unicode_literals turns
2499 # every string into a unicode string, it fails.
2500 return
0f06bcd7 2501 title_bytes = title.encode()
6eefe533
PH
2502 buf = ctypes.create_string_buffer(len(title_bytes))
2503 buf.value = title_bytes
e3946f98 2504 try:
6eefe533 2505 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
2506 except AttributeError:
2507 return # Strange libc, just skip this
d7dda168
PH
2508
2509
2510def remove_start(s, start):
46bc9b7d 2511 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
2512
2513
2b9faf55 2514def remove_end(s, end):
46bc9b7d 2515 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
2516
2517
31b2051e
S
2518def remove_quotes(s):
2519 if s is None or len(s) < 2:
2520 return s
2521 for quote in ('"', "'", ):
2522 if s[0] == quote and s[-1] == quote:
2523 return s[1:-1]
2524 return s
2525
2526
b6e0c7d2 2527def get_domain(url):
ebf99aaf 2528 """
2529 This implementation is inconsistent, but is kept for compatibility.
2530 Use this only for "webpage_url_domain"
2531 """
2532 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
b6e0c7d2
U
2533
2534
29eb5174 2535def url_basename(url):
14f25df2 2536 path = urllib.parse.urlparse(url).path
28e614de 2537 return path.strip('/').split('/')[-1]
aa94a6d3
PH
2538
2539
02dc0a36 2540def base_url(url):
7657ec7e 2541 return re.match(r'https?://[^?#]+/', url).group()
02dc0a36
S
2542
2543
e34c3361 2544def urljoin(base, path):
4b5de77b 2545 if isinstance(path, bytes):
0f06bcd7 2546 path = path.decode()
14f25df2 2547 if not isinstance(path, str) or not path:
e34c3361 2548 return None
fad4ceb5 2549 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 2550 return path
4b5de77b 2551 if isinstance(base, bytes):
0f06bcd7 2552 base = base.decode()
14f25df2 2553 if not isinstance(base, str) or not re.match(
4b5de77b 2554 r'^(?:https?:)?//', base):
e34c3361 2555 return None
14f25df2 2556 return urllib.parse.urljoin(base, path)
e34c3361
S
2557
2558
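# Illustrative usage of the URL helpers above (a quick sketch; URLs are hypothetical):
#   >>> url_basename('https://example.com/path/to/video.mp4?x=1')
#   'video.mp4'
#   >>> base_url('https://example.com/path/to/video.mp4')
#   'https://example.com/path/to/'
#   >>> urljoin('https://example.com/path/', 'video.mp4')
#   'https://example.com/path/video.mp4'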
ac668111 2559class HEADRequest(urllib.request.Request):
aa94a6d3 2560 def get_method(self):
611c1dd9 2561 return 'HEAD'
7217e148
PH
2562
2563
ac668111 2564class PUTRequest(urllib.request.Request):
95cf60e8
S
2565 def get_method(self):
2566 return 'PUT'
2567
2568
9732d77e 2569def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
9e907ebd 2570 if get_attr and v is not None:
2571 v = getattr(v, get_attr, None)
1812afb7
S
2572 try:
2573 return int(v) * invscale // scale
31c49255 2574 except (ValueError, TypeError, OverflowError):
af98f8ff 2575 return default
9732d77e 2576
9572013d 2577
40a90862 2578def str_or_none(v, default=None):
14f25df2 2579 return default if v is None else str(v)
40a90862 2580
9732d77e
PH
2581
2582def str_to_int(int_str):
48d4681e 2583 """ A more relaxed version of int_or_none """
f9934b96 2584 if isinstance(int_str, int):
348c6bf1 2585 return int_str
14f25df2 2586 elif isinstance(int_str, str):
42db58ec
S
2587 int_str = re.sub(r'[,\.\+]', '', int_str)
2588 return int_or_none(int_str)
608d11f5
PH
2589
2590
9732d77e 2591def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
2592 if v is None:
2593 return default
2594 try:
2595 return float(v) * invscale / scale
5e1271c5 2596 except (ValueError, TypeError):
caf80631 2597 return default
43f775e4
PH
2598
2599
c7e327c4
S
2600def bool_or_none(v, default=None):
2601 return v if isinstance(v, bool) else default
2602
2603
53cd37ba 2604def strip_or_none(v, default=None):
14f25df2 2605 return v.strip() if isinstance(v, str) else default
b72b4431
S
2606
2607
af03000a 2608def url_or_none(url):
14f25df2 2609 if not url or not isinstance(url, str):
af03000a
S
2610 return None
2611 url = url.strip()
29f7c58a 2612 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2613
2614
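# Illustrative behaviour of the *_or_none coercion helpers (a quick sketch):
#   >>> int_or_none('42'), int_or_none('n/a'), int_or_none(None, default=0)
#   (42, None, 0)
#   >>> float_or_none('2.5', invscale=1000)
#   2500.0
#   >>> url_or_none('example.com/x'), url_or_none('https://example.com/x')
#   (None, 'https://example.com/x')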
3e9b66d7 2615def request_to_url(req):
ac668111 2616 if isinstance(req, urllib.request.Request):
3e9b66d7
LNO
2617 return req.get_full_url()
2618 else:
2619 return req
2620
2621
e29663c6 2622def strftime_or_none(timestamp, date_format, default=None):
2623 datetime_object = None
2624 try:
f9934b96 2625 if isinstance(timestamp, (int, float)): # unix timestamp
d509c1f5 2626 # Using naive datetime here can break timestamp() in Windows
2627 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2628 datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
14f25df2 2629 elif isinstance(timestamp, str): # assume YYYYMMDD
e29663c6 2630 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
9665f15a 2631 date_format = re.sub( # Support %s on windows
2632 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
e29663c6 2633 return datetime_object.strftime(date_format)
2634 except (ValueError, TypeError, AttributeError):
2635 return default
2636
2637
608d11f5 2638def parse_duration(s):
f9934b96 2639 if not isinstance(s, str):
608d11f5 2640 return None
ca7b3246 2641 s = s.strip()
38d79fd1 2642 if not s:
2643 return None
ca7b3246 2644
acaff495 2645 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2646 m = re.match(r'''(?x)
2647 (?P<before_secs>
2648 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2649 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2650 (?P<ms>[.:][0-9]+)?Z?$
2651 ''', s)
acaff495 2652 if m:
8bd1c00b 2653 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2654 else:
2655 m = re.match(
056653bb
S
2656 r'''(?ix)(?:P?
2657 (?:
1c1b2f96 2658 [0-9]+\s*y(?:ears?)?,?\s*
056653bb
S
2659 )?
2660 (?:
1c1b2f96 2661 [0-9]+\s*m(?:onths?)?,?\s*
056653bb
S
2662 )?
2663 (?:
1c1b2f96 2664 [0-9]+\s*w(?:eeks?)?,?\s*
056653bb 2665 )?
8f4b58d7 2666 (?:
1c1b2f96 2667 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
8f4b58d7 2668 )?
056653bb 2669 T)?
acaff495 2670 (?:
1c1b2f96 2671 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
acaff495 2672 )?
2673 (?:
1c1b2f96 2674 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
acaff495 2675 )?
2676 (?:
2677 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2678 )?Z?$''', s)
acaff495 2679 if m:
2680 days, hours, mins, secs, ms = m.groups()
2681 else:
15846398 2682 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2683 if m:
2684 hours, mins = m.groups()
2685 else:
2686 return None
2687
acaff495 2688 if ms:
19a03940 2689 ms = ms.replace(':', '.')
2690 return sum(float(part or 0) * mult for part, mult in (
2691 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2692
2693
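# Illustrative usage (a quick sketch of the duration formats handled above):
#   >>> parse_duration('1:30')
#   90.0
#   >>> parse_duration('01:02:03.05')
#   3723.05
#   >>> parse_duration('PT1H30M')
#   5400.0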
e65e4c88 2694def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2695 name, real_ext = os.path.splitext(filename)
e65e4c88 2696 return (
86e5f3ed 2697 f'{name}.{ext}{real_ext}'
e65e4c88 2698 if not expected_real_ext or real_ext[1:] == expected_real_ext
86e5f3ed 2699 else f'{filename}.{ext}')
d70ad093
PH
2700
2701
b3ed15b7
S
2702def replace_extension(filename, ext, expected_real_ext=None):
2703 name, real_ext = os.path.splitext(filename)
86e5f3ed 2704 return '{}.{}'.format(
b3ed15b7
S
2705 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2706 ext)
2707
2708
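# Illustrative usage (a quick sketch; the filenames are hypothetical):
#   >>> prepend_extension('video.mp4', 'temp')
#   'video.temp.mp4'
#   >>> replace_extension('video.mp4', 'mkv')
#   'video.mkv'
#   >>> replace_extension('video.unknown_video', 'mp4', expected_real_ext='mp4')
#   'video.unknown_video.mp4'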
d70ad093
PH
2709def check_executable(exe, args=[]):
2710 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2711 args can be a list of arguments for a short output (like -version) """
2712 try:
f0c9fb96 2713 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
d70ad093
PH
2714 except OSError:
2715 return False
2716 return exe
b7ab0590
PH
2717
2718
7aaf4cd2 2719def _get_exe_version_output(exe, args):
95807118 2720 try:
b64d04c1 2721 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2722 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2723 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
f0c9fb96 2724 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2725 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
95807118
PH
2726 except OSError:
2727 return False
f0c9fb96 2728 return stdout
cae97f65
PH
2729
2730
2731def detect_exe_version(output, version_re=None, unrecognized='present'):
14f25df2 2732 assert isinstance(output, str)
cae97f65
PH
2733 if version_re is None:
2734 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2735 m = re.search(version_re, output)
95807118
PH
2736 if m:
2737 return m.group(1)
2738 else:
2739 return unrecognized
2740
2741
9af98e17 2742def get_exe_version(exe, args=['--version'],
2743 version_re=None, unrecognized='present'):
2744 """ Returns the version of the specified executable,
2745 or False if the executable is not present """
2746 out = _get_exe_version_output(exe, args)
2747 return detect_exe_version(out, version_re, unrecognized) if out else False
2748
2749
7e88d7d7 2750def frange(start=0, stop=None, step=1):
2751 """Float range"""
2752 if stop is None:
2753 start, stop = 0, start
2754 sign = [-1, 1][step > 0] if step else 0
2755 while sign * start < sign * stop:
2756 yield start
2757 start += step
2758
2759
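# Illustrative usage (a quick sketch):
#   >>> list(frange(5))
#   [0, 1, 2, 3, 4]
#   >>> list(frange(0, 1, 0.25))
#   [0, 0.25, 0.5, 0.75]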
cb89cfc1 2760class LazyList(collections.abc.Sequence):
0f06bcd7 2761 """Lazy immutable list from an iterable
2762 Note that slices of a LazyList are lists and not LazyList"""
483336e7 2763
8e5fecc8 2764 class IndexError(IndexError):
2765 pass
2766
282f5709 2767 def __init__(self, iterable, *, reverse=False, _cache=None):
0f06bcd7 2768 self._iterable = iter(iterable)
2769 self._cache = [] if _cache is None else _cache
2770 self._reversed = reverse
483336e7 2771
2772 def __iter__(self):
0f06bcd7 2773 if self._reversed:
28419ca2 2774 # We need to consume the entire iterable to iterate in reverse
981052c9 2775 yield from self.exhaust()
28419ca2 2776 return
0f06bcd7 2777 yield from self._cache
2778 for item in self._iterable:
2779 self._cache.append(item)
483336e7 2780 yield item
2781
0f06bcd7 2782 def _exhaust(self):
2783 self._cache.extend(self._iterable)
2784 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2785 return self._cache
28419ca2 2786
981052c9 2787 def exhaust(self):
0f06bcd7 2788 """Evaluate the entire iterable"""
2789 return self._exhaust()[::-1 if self._reversed else 1]
981052c9 2790
28419ca2 2791 @staticmethod
0f06bcd7 2792 def _reverse_index(x):
f2df4071 2793 return None if x is None else ~x
483336e7 2794
2795 def __getitem__(self, idx):
2796 if isinstance(idx, slice):
0f06bcd7 2797 if self._reversed:
2798 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
e0f2b4b4 2799 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2800 elif isinstance(idx, int):
0f06bcd7 2801 if self._reversed:
2802 idx = self._reverse_index(idx)
e0f2b4b4 2803 start, stop, step = idx, idx, 0
483336e7 2804 else:
2805 raise TypeError('indices must be integers or slices')
e0f2b4b4 2806 if ((start or 0) < 0 or (stop or 0) < 0
2807 or (start is None and step < 0)
2808 or (stop is None and step > 0)):
483336e7 2809 # We need to consume the entire iterable to be able to slice from the end
2810 # Obviously, never use this with infinite iterables
0f06bcd7 2811 self._exhaust()
8e5fecc8 2812 try:
0f06bcd7 2813 return self._cache[idx]
8e5fecc8 2814 except IndexError as e:
2815 raise self.IndexError(e) from e
0f06bcd7 2816 n = max(start or 0, stop or 0) - len(self._cache) + 1
28419ca2 2817 if n > 0:
0f06bcd7 2818 self._cache.extend(itertools.islice(self._iterable, n))
8e5fecc8 2819 try:
0f06bcd7 2820 return self._cache[idx]
8e5fecc8 2821 except IndexError as e:
2822 raise self.IndexError(e) from e
483336e7 2823
2824 def __bool__(self):
2825 try:
0f06bcd7 2826 self[-1] if self._reversed else self[0]
8e5fecc8 2827 except self.IndexError:
483336e7 2828 return False
2829 return True
2830
2831 def __len__(self):
0f06bcd7 2832 self._exhaust()
2833 return len(self._cache)
483336e7 2834
282f5709 2835 def __reversed__(self):
0f06bcd7 2836 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
282f5709 2837
2838 def __copy__(self):
0f06bcd7 2839 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
282f5709 2840
28419ca2 2841 def __repr__(self):
2842 # repr and str should mimic a list. So we exhaust the iterable
2843 return repr(self.exhaust())
2844
2845 def __str__(self):
2846 return repr(self.exhaust())
2847
483336e7 2848
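# Illustrative usage (a quick sketch; itertools.count() stands in for any generator):
#   >>> lazy = LazyList(itertools.count())
#   >>> lazy[3]          # consumes only the first four items
#   3
#   >>> lazy[:5]         # slices are evaluated into plain lists
#   [0, 1, 2, 3, 4]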
7be9ccff 2849class PagedList:
c07a39ae 2850
2851 class IndexError(IndexError):
2852 pass
2853
dd26ced1
PH
2854 def __len__(self):
2855 # This is only useful for tests
2856 return len(self.getslice())
2857
7be9ccff 2858 def __init__(self, pagefunc, pagesize, use_cache=True):
2859 self._pagefunc = pagefunc
2860 self._pagesize = pagesize
f1d13090 2861 self._pagecount = float('inf')
7be9ccff 2862 self._use_cache = use_cache
2863 self._cache = {}
2864
2865 def getpage(self, pagenum):
d8cf8d97 2866 page_results = self._cache.get(pagenum)
2867 if page_results is None:
f1d13090 2868 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
7be9ccff 2869 if self._use_cache:
2870 self._cache[pagenum] = page_results
2871 return page_results
2872
2873 def getslice(self, start=0, end=None):
2874 return list(self._getslice(start, end))
2875
2876 def _getslice(self, start, end):
55575225 2877 raise NotImplementedError('This method must be implemented by subclasses')
2878
2879 def __getitem__(self, idx):
f1d13090 2880 assert self._use_cache, 'Indexing PagedList requires cache'
55575225 2881 if not isinstance(idx, int) or idx < 0:
2882 raise TypeError('indices must be non-negative integers')
2883 entries = self.getslice(idx, idx + 1)
d8cf8d97 2884 if not entries:
c07a39ae 2885 raise self.IndexError()
d8cf8d97 2886 return entries[0]
55575225 2887
9c44d242
PH
2888
2889class OnDemandPagedList(PagedList):
a44ca5a4 2890 """Download pages until a page with less than maximum results"""
86e5f3ed 2891
7be9ccff 2892 def _getslice(self, start, end):
b7ab0590
PH
2893 for pagenum in itertools.count(start // self._pagesize):
2894 firstid = pagenum * self._pagesize
2895 nextfirstid = pagenum * self._pagesize + self._pagesize
2896 if start >= nextfirstid:
2897 continue
2898
b7ab0590
PH
2899 startv = (
2900 start % self._pagesize
2901 if firstid <= start < nextfirstid
2902 else 0)
b7ab0590
PH
2903 endv = (
2904 ((end - 1) % self._pagesize) + 1
2905 if (end is not None and firstid <= end <= nextfirstid)
2906 else None)
2907
f1d13090 2908 try:
2909 page_results = self.getpage(pagenum)
2910 except Exception:
2911 self._pagecount = pagenum - 1
2912 raise
b7ab0590
PH
2913 if startv != 0 or endv is not None:
2914 page_results = page_results[startv:endv]
7be9ccff 2915 yield from page_results
b7ab0590
PH
2916
2917 # A little optimization - if the current page is not "full", i.e. does
2918 # not contain page_size videos, then we can assume that this page
2919 # is the last one - there are no more ids on further pages -
2920 # i.e. no need to query again.
2921 if len(page_results) + startv < self._pagesize:
2922 break
2923
2924 # If we got the whole page, but the next page is not interesting,
2925 # break out early as well
2926 if end == nextfirstid:
2927 break
81c2f20b
PH
2928
2929
9c44d242 2930class InAdvancePagedList(PagedList):
a44ca5a4 2931 """PagedList with total number of pages known in advance"""
86e5f3ed 2932
9c44d242 2933 def __init__(self, pagefunc, pagecount, pagesize):
7be9ccff 2934 PagedList.__init__(self, pagefunc, pagesize, True)
f1d13090 2935 self._pagecount = pagecount
9c44d242 2936
7be9ccff 2937 def _getslice(self, start, end):
9c44d242 2938 start_page = start // self._pagesize
d37707bd 2939 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
9c44d242
PH
2940 skip_elems = start - start_page * self._pagesize
2941 only_more = None if end is None else end - start
2942 for pagenum in range(start_page, end_page):
7be9ccff 2943 page_results = self.getpage(pagenum)
9c44d242 2944 if skip_elems:
7be9ccff 2945 page_results = page_results[skip_elems:]
9c44d242
PH
2946 skip_elems = None
2947 if only_more is not None:
7be9ccff 2948 if len(page_results) < only_more:
2949 only_more -= len(page_results)
9c44d242 2950 else:
7be9ccff 2951 yield from page_results[:only_more]
9c44d242 2952 break
7be9ccff 2953 yield from page_results
9c44d242
PH
2954
2955
7e88d7d7 2956class PlaylistEntries:
2957 MissingEntry = object()
2958 is_exhausted = False
2959
2960 def __init__(self, ydl, info_dict):
7e9a6125 2961 self.ydl = ydl
2962
2963 # _entries must be assigned now since infodict can change during iteration
2964 entries = info_dict.get('entries')
2965 if entries is None:
2966 raise EntryNotInPlaylist('There are no entries')
2967 elif isinstance(entries, list):
2968 self.is_exhausted = True
2969
2970 requested_entries = info_dict.get('requested_entries')
bc5c2f8a 2971 self.is_incomplete = requested_entries is not None
7e9a6125 2972 if self.is_incomplete:
2973 assert self.is_exhausted
bc5c2f8a 2974 self._entries = [self.MissingEntry] * max(requested_entries or [0])
7e9a6125 2975 for i, entry in zip(requested_entries, entries):
2976 self._entries[i - 1] = entry
2977 elif isinstance(entries, (list, PagedList, LazyList)):
2978 self._entries = entries
2979 else:
2980 self._entries = LazyList(entries)
7e88d7d7 2981
2982 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2983 (?P<start>[+-]?\d+)?
2984 (?P<range>[:-]
2985 (?P<end>[+-]?\d+|inf(?:inite)?)?
2986 (?::(?P<step>[+-]?\d+))?
2987 )?''')
2988
2989 @classmethod
2990 def parse_playlist_items(cls, string):
2991 for segment in string.split(','):
2992 if not segment:
2993 raise ValueError('There are two or more consecutive commas')
2994 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2995 if not mobj:
2996 raise ValueError(f'{segment!r} is not a valid specification')
2997 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2998 if int_or_none(step) == 0:
2999 raise ValueError(f'Step in {segment!r} cannot be zero')
3000 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
3001
3002 def get_requested_items(self):
3003 playlist_items = self.ydl.params.get('playlist_items')
3004 playlist_start = self.ydl.params.get('playliststart', 1)
3005 playlist_end = self.ydl.params.get('playlistend')
3006 # For backwards compatibility, interpret -1 as whole list
3007 if playlist_end in (-1, None):
3008 playlist_end = ''
3009 if not playlist_items:
3010 playlist_items = f'{playlist_start}:{playlist_end}'
3011 elif playlist_start != 1 or playlist_end:
3012 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
3013
3014 for index in self.parse_playlist_items(playlist_items):
3015 for i, entry in self[index]:
3016 yield i, entry
1ac4fd80 3017 if not entry:
3018 continue
7e88d7d7 3019 try:
3020 # TODO: Add auto-generated fields
3021 self.ydl._match_entry(entry, incomplete=True, silent=True)
3022 except (ExistingVideoReached, RejectedVideoReached):
3023 return
3024
7e9a6125 3025 def get_full_count(self):
3026 if self.is_exhausted and not self.is_incomplete:
7e88d7d7 3027 return len(self)
3028 elif isinstance(self._entries, InAdvancePagedList):
3029 if self._entries._pagesize == 1:
3030 return self._entries._pagecount
3031
7e88d7d7 3032 @functools.cached_property
3033 def _getter(self):
3034 if isinstance(self._entries, list):
3035 def get_entry(i):
3036 try:
3037 entry = self._entries[i]
3038 except IndexError:
3039 entry = self.MissingEntry
3040 if not self.is_incomplete:
3041 raise self.IndexError()
3042 if entry is self.MissingEntry:
bc5c2f8a 3043 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
7e88d7d7 3044 return entry
3045 else:
3046 def get_entry(i):
3047 try:
3048 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
3049 except (LazyList.IndexError, PagedList.IndexError):
3050 raise self.IndexError()
3051 return get_entry
3052
3053 def __getitem__(self, idx):
3054 if isinstance(idx, int):
3055 idx = slice(idx, idx)
3056
3057 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
3058 step = 1 if idx.step is None else idx.step
3059 if idx.start is None:
3060 start = 0 if step > 0 else len(self) - 1
3061 else:
3062 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
3063
3064 # NB: Do not call len(self) when idx == [:]
3065 if idx.stop is None:
3066 stop = 0 if step < 0 else float('inf')
3067 else:
3068 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
3069 stop += [-1, 1][step > 0]
3070
3071 for i in frange(start, stop, step):
3072 if i < 0:
3073 continue
3074 try:
7e9a6125 3075 entry = self._getter(i)
3076 except self.IndexError:
3077 self.is_exhausted = True
3078 if step > 0:
7e88d7d7 3079 break
7e9a6125 3080 continue
7e88d7d7 3081 yield i + 1, entry
3082
3083 def __len__(self):
3084 return len(tuple(self[:]))
3085
3086 class IndexError(IndexError):
3087 pass
3088
3089
81c2f20b 3090def uppercase_escape(s):
676eb3f2 3091 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 3092 return re.sub(
a612753d 3093 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
3094 lambda m: unicode_escape(m.group(0))[0],
3095 s)
0fe2ff78
YCH
3096
3097
3098def lowercase_escape(s):
3099 unicode_escape = codecs.getdecoder('unicode_escape')
3100 return re.sub(
3101 r'\\u[0-9a-fA-F]{4}',
3102 lambda m: unicode_escape(m.group(0))[0],
3103 s)
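# Illustrative examples (added comments, not part of the original source):
#   >>> uppercase_escape(r'Fran\U000000e7ois')
#   'François'
#   >>> lowercase_escape(r'Fran\u00e7ois')
#   'François'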
b53466e1 3104
d05cfe06
S
3105
3106def escape_rfc3986(s):
3107 """Escape non-ASCII characters as suggested by RFC 3986"""
f9934b96 3108 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
3109
3110
3111def escape_url(url):
3112 """Escape URL as suggested by RFC 3986"""
14f25df2 3113 url_parsed = urllib.parse.urlparse(url)
d05cfe06 3114 return url_parsed._replace(
efbed08d 3115 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
3116 path=escape_rfc3986(url_parsed.path),
3117 params=escape_rfc3986(url_parsed.params),
3118 query=escape_rfc3986(url_parsed.query),
3119 fragment=escape_rfc3986(url_parsed.fragment)
3120 ).geturl()
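# Illustrative example (added comment, not part of the original source);
# escape_url() keeps the URL structure and only percent-encodes unsafe characters:
#   >>> escape_url('https://example.com/a b?q=ä')
#   'https://example.com/a%20b?q=%C3%A4'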
3121
62e609ab 3122
96b9e9cf 3123def parse_qs(url, **kwargs):
3124 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
4dfbf869 3125
3126
62e609ab
PH
3127def read_batch_urls(batch_fd):
3128 def fixup(url):
14f25df2 3129 if not isinstance(url, str):
62e609ab 3130 url = url.decode('utf-8', 'replace')
8c04f0be 3131 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3132 for bom in BOM_UTF8:
3133 if url.startswith(bom):
3134 url = url[len(bom):]
3135 url = url.lstrip()
3136 if not url or url.startswith(('#', ';', ']')):
62e609ab 3137 return False
8c04f0be 3138 # "#" cannot be stripped out since it is part of the URI
962ffcf8 3139 # However, it can be safely stripped out when it follows whitespace
8c04f0be 3140 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
3141
3142 with contextlib.closing(batch_fd) as fd:
3143 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
3144
3145
3146def urlencode_postdata(*args, **kargs):
14f25df2 3147 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
3148
3149
38f9ef31 3150def update_url_query(url, query):
cacd9966
YCH
3151 if not query:
3152 return url
14f25df2 3153 parsed_url = urllib.parse.urlparse(url)
3154 qs = urllib.parse.parse_qs(parsed_url.query)
38f9ef31 3155 qs.update(query)
14f25df2 3156 return urllib.parse.urlunparse(parsed_url._replace(
3157 query=urllib.parse.urlencode(qs, True)))
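# Illustrative example (added comment, not part of the original source):
#   >>> update_url_query('https://example.com/path?a=1', {'b': 'c'})
#   'https://example.com/path?a=1&b=c'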
16392824 3158
8e60dc75 3159
c043c246 3160def update_Request(req, url=None, data=None, headers=None, query=None):
ed0291d1 3161 req_headers = req.headers.copy()
c043c246 3162 req_headers.update(headers or {})
ed0291d1
S
3163 req_data = data or req.data
3164 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
3165 req_get_method = req.get_method()
3166 if req_get_method == 'HEAD':
3167 req_type = HEADRequest
3168 elif req_get_method == 'PUT':
3169 req_type = PUTRequest
3170 else:
ac668111 3171 req_type = urllib.request.Request
ed0291d1
S
3172 new_req = req_type(
3173 req_url, data=req_data, headers=req_headers,
3174 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3175 if hasattr(req, 'timeout'):
3176 new_req.timeout = req.timeout
3177 return new_req
3178
3179
10c87c15 3180def _multipart_encode_impl(data, boundary):
0c265486
YCH
3181 content_type = 'multipart/form-data; boundary=%s' % boundary
3182
3183 out = b''
3184 for k, v in data.items():
3185 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 3186 if isinstance(k, str):
0f06bcd7 3187 k = k.encode()
14f25df2 3188 if isinstance(v, str):
0f06bcd7 3189 v = v.encode()
0c265486
YCH
3190 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3191 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 3192 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
3193 if boundary.encode('ascii') in content:
3194 raise ValueError('Boundary overlaps with data')
3195 out += content
3196
3197 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3198
3199 return out, content_type
3200
3201
3202def multipart_encode(data, boundary=None):
3203 '''
3204 Encode a dict to RFC 7578-compliant form-data
3205
3206 data:
3207 A dict where keys and values can be either Unicode or bytes-like
3208 objects.
3209 boundary:
3210 If specified, it must be a Unicode object and is used as the boundary. Otherwise
3211 a random boundary is generated.
3212
3213 Reference: https://tools.ietf.org/html/rfc7578
3214 '''
3215 has_specified_boundary = boundary is not None
3216
3217 while True:
3218 if boundary is None:
3219 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3220
3221 try:
10c87c15 3222 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
3223 break
3224 except ValueError:
3225 if has_specified_boundary:
3226 raise
3227 boundary = None
3228
3229 return out, content_type
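# Illustrative example (added comment, not part of the original source); with a
# fixed boundary the output is deterministic:
#   >>> multipart_encode({'field': 'value'}, boundary='AaB03x')
#   (b'--AaB03x\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--AaB03x--\r\n',
#    'multipart/form-data; boundary=AaB03x')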
3230
3231
304ad45a 3232def variadic(x, allowed_types=(str, bytes, dict)):
3233 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
3234
3235
86296ad2 3236def dict_get(d, key_or_keys, default=None, skip_false_values=True):
a44ca5a4 3237 for val in map(d.get, variadic(key_or_keys)):
3238 if val is not None and (val or not skip_false_values):
3239 return val
3240 return default
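# Illustrative example (added comment, not part of the original source): falsy
# values are skipped unless skip_false_values is disabled:
#   >>> dict_get({'a': '', 'b': 'x'}, ('a', 'b'))
#   'x'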
cbecc9b9
S
3241
3242
c4f60dd7 3243def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3244 for f in funcs:
a32a9a7e 3245 try:
c4f60dd7 3246 val = f(*args, **kwargs)
ab029d7e 3247 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
a32a9a7e
S
3248 pass
3249 else:
c4f60dd7 3250 if expected_type is None or isinstance(val, expected_type):
3251 return val
3252
3253
3254def try_get(src, getter, expected_type=None):
3255 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
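# Illustrative examples (added comments, not part of the original source):
#   >>> try_get({'a': {'b': 42}}, lambda x: x['a']['b'], int)
#   42
#   >>> try_get({}, lambda x: x['missing']) is None  # the KeyError is swallowed
#   True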
329ca3be
S
3256
3257
90137ca4 3258def filter_dict(dct, cndn=lambda _, v: v is not None):
3259 return {k: v for k, v in dct.items() if cndn(k, v)}
3260
3261
6cc62232
S
3262def merge_dicts(*dicts):
3263 merged = {}
3264 for a_dict in dicts:
3265 for k, v in a_dict.items():
90137ca4 3266 if (v is not None and k not in merged
3267 or isinstance(v, str) and merged[k] == ''):
6cc62232
S
3268 merged[k] = v
3269 return merged
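# Illustrative example (added comment, not part of the original source): earlier
# dicts win, except that an empty string may be overwritten by a later non-empty string:
#   >>> merge_dicts({'a': 1, 'b': ''}, {'a': 2, 'b': 'x', 'c': None})
#   {'a': 1, 'b': 'x'}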
3270
3271
8e60dc75 3272def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
14f25df2 3273 return string if isinstance(string, str) else str(string, encoding, errors)
8e60dc75 3274
16392824 3275
a1a530b0
PH
3276US_RATINGS = {
3277 'G': 0,
3278 'PG': 10,
3279 'PG-13': 13,
3280 'R': 16,
3281 'NC': 18,
3282}
fac55558
PH
3283
3284
a8795327 3285TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
3286 'TV-Y': 0,
3287 'TV-Y7': 7,
3288 'TV-G': 0,
3289 'TV-PG': 0,
3290 'TV-14': 14,
3291 'TV-MA': 17,
a8795327
S
3292}
3293
3294
146c80e2 3295def parse_age_limit(s):
19a03940 3296 # isinstance(False, int) is True. So type() must be used instead
c487cf00 3297 if type(s) is int: # noqa: E721
a8795327 3298 return s if 0 <= s <= 21 else None
19a03940 3299 elif not isinstance(s, str):
d838b1bd 3300 return None
146c80e2 3301 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
3302 if m:
3303 return int(m.group('age'))
5c5fae6d 3304 s = s.upper()
a8795327
S
3305 if s in US_RATINGS:
3306 return US_RATINGS[s]
5a16c9d9 3307 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 3308 if m:
5a16c9d9 3309 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 3310 return None
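# Illustrative examples (added comments, not part of the original source):
#   >>> parse_age_limit('PG-13'), parse_age_limit('TV-MA'), parse_age_limit('18+')
#   (13, 17, 18)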
146c80e2
S
3311
3312
fac55558 3313def strip_jsonp(code):
609a61e3 3314 return re.sub(
5552c9eb 3315 r'''(?sx)^
e9c671d5 3316 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
3317 (?:\s*&&\s*(?P=func_name))?
3318 \s*\(\s*(?P<callback_data>.*)\);?
3319 \s*?(?://[^\n]*)*$''',
3320 r'\g<callback_data>', code)
478c2c61
PH
3321
3322
8f53dc44 3323def js_to_json(code, vars={}, *, strict=False):
5c610515 3324 # vars is a dict of var, val pairs to substitute
a71b812f
SS
3325 STRING_QUOTES = '\'"'
3326 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
c843e685 3327 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
86e5f3ed 3328 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
4195096e 3329 INTEGER_TABLE = (
86e5f3ed 3330 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3331 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
4195096e
S
3332 )
3333
a71b812f
SS
3334 def process_escape(match):
3335 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3336 escape = match.group(1) or match.group(2)
3337
3338 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3339 else R'\u00' if escape == 'x'
3340 else '' if escape == '\n'
3341 else escape)
3342
e05f6939 3343 def fix_kv(m):
e7b6d122
PH
3344 v = m.group(0)
3345 if v in ('true', 'false', 'null'):
3346 return v
421ddcb8
C
3347 elif v in ('undefined', 'void 0'):
3348 return 'null'
8bdd16b4 3349 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
a71b812f
SS
3350 return ''
3351
3352 if v[0] in STRING_QUOTES:
3353 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v[1:-1])
3354 return f'"{escaped}"'
3355
3356 for regex, base in INTEGER_TABLE:
3357 im = re.match(regex, v)
3358 if im:
3359 i = int(im.group(1), base)
3360 return f'"{i}":' if v.endswith(':') else str(i)
3361
3362 if v in vars:
3363 return json.dumps(vars[v])
89ac4a19 3364
a71b812f
SS
3365 if not strict:
3366 return f'"{v}"'
5c610515 3367
a71b812f 3368 raise ValueError(f'Unknown value: {v}')
e05f6939 3369
8072ef2b 3370 def create_map(mobj):
3371 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3372
8072ef2b 3373 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
8f53dc44 3374 if not strict:
3375 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
f55523cf 3376 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
febff4c1 3377
a71b812f
SS
3378 return re.sub(rf'''(?sx)
3379 {STRING_RE}|
3380 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
421ddcb8 3381 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
a71b812f
SS
3382 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3383 [0-9]+(?={SKIP_RE}:)|
8bdd16b4 3384 !+
a71b812f 3385 ''', fix_kv, code)
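# Illustrative example (added comment, not part of the original source): unquoted
# keys, single quotes, hex literals and trailing commas are converted to valid JSON:
#   >>> js_to_json("{key: 'value', num: 0x1A,}")
#   '{"key": "value", "num": 26}'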
e05f6939
PH
3386
3387
478c2c61
PH
3388def qualities(quality_ids):
3389 """ Get a numeric quality value out of a list of possible values """
3390 def q(qid):
3391 try:
3392 return quality_ids.index(qid)
3393 except ValueError:
3394 return -1
3395 return q
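# Illustrative example (added comment, not part of the original source):
#   >>> q = qualities(['240p', '480p', '720p'])
#   >>> q('480p'), q('4320p')
#   (1, -1)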
3396
acd69589 3397
119e40ef 3398POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
1e43a6f7 3399
3400
de6000d9 3401DEFAULT_OUTTMPL = {
3402 'default': '%(title)s [%(id)s].%(ext)s',
72755351 3403 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 3404}
3405OUTTMPL_TYPES = {
72755351 3406 'chapter': None,
de6000d9 3407 'subtitle': None,
3408 'thumbnail': None,
3409 'description': 'description',
3410 'annotation': 'annotations.xml',
3411 'infojson': 'info.json',
08438d2c 3412 'link': None,
3b603dbd 3413 'pl_video': None,
5112f26a 3414 'pl_thumbnail': None,
de6000d9 3415 'pl_description': 'description',
3416 'pl_infojson': 'info.json',
3417}
0a871f68 3418
143db31d 3419# As of [1] format syntax is:
3420# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3421# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 3422STR_FORMAT_RE_TMPL = r'''(?x)
3423 (?<!%)(?P<prefix>(?:%%)*)
143db31d 3424 %
524e2e4f 3425 (?P<has_key>\((?P<key>{0})\))?
752cda38 3426 (?P<format>
524e2e4f 3427 (?P<conversion>[#0\-+ ]+)?
3428 (?P<min_width>\d+)?
3429 (?P<precision>\.\d+)?
3430 (?P<len_mod>[hlL])? # unused in python
901130bb 3431 {1} # conversion type
752cda38 3432 )
143db31d 3433'''
3434
7d1eb38a 3435
901130bb 3436STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3437
7d1eb38a 3438
a020a0dc
PH
3439def limit_length(s, length):
3440 """ Add ellipses to overly long strings """
3441 if s is None:
3442 return None
3443 ELLIPSES = '...'
3444 if len(s) > length:
3445 return s[:length - len(ELLIPSES)] + ELLIPSES
3446 return s
48844745
PH
3447
3448
3449def version_tuple(v):
5f9b8394 3450 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
3451
3452
3453def is_outdated_version(version, limit, assume_new=True):
3454 if not version:
3455 return not assume_new
3456 try:
3457 return version_tuple(version) < version_tuple(limit)
3458 except ValueError:
3459 return not assume_new
732ea2f0
PH
3460
3461
3462def ytdl_is_updateable():
7a5c1cfe 3463 """ Returns if yt-dlp can be updated with -U """
735d865e 3464
5d535b4a 3465 from .update import is_non_updateable
732ea2f0 3466
5d535b4a 3467 return not is_non_updateable()
7d4111ed
PH
3468
3469
3470def args_to_str(args):
3471 # Get a short string representation for a subprocess command
702ccf2d 3472 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
3473
3474
9b9c5355 3475def error_to_compat_str(err):
cfb0511d 3476 return str(err)
fdae2358
S
3477
3478
a44ca5a4 3479def error_to_str(err):
3480 return f'{type(err).__name__}: {err}'
3481
3482
2647c933 3483def mimetype2ext(mt, default=NO_DEFAULT):
3484 if not isinstance(mt, str):
3485 if default is not NO_DEFAULT:
3486 return default
eb9ee194
S
3487 return None
3488
2647c933 3489 MAP = {
3490 # video
f6861ec9 3491 '3gpp': '3gp',
2647c933 3492 'mp2t': 'ts',
3493 'mp4': 'mp4',
3494 'mpeg': 'mpeg',
3495 'mpegurl': 'm3u8',
3496 'quicktime': 'mov',
3497 'webm': 'webm',
3498 'vp9': 'vp9',
f6861ec9 3499 'x-flv': 'flv',
2647c933 3500 'x-m4v': 'm4v',
3501 'x-matroska': 'mkv',
3502 'x-mng': 'mng',
a0d8d704 3503 'x-mp4-fragmented': 'mp4',
2647c933 3504 'x-ms-asf': 'asf',
a0d8d704 3505 'x-ms-wmv': 'wmv',
2647c933 3506 'x-msvideo': 'avi',
3507
3508 # application (streaming playlists)
b4173f15 3509 'dash+xml': 'mpd',
b4173f15 3510 'f4m+xml': 'f4m',
f164b971 3511 'hds+xml': 'f4m',
2647c933 3512 'vnd.apple.mpegurl': 'm3u8',
e910fe2f 3513 'vnd.ms-sstr+xml': 'ism',
2647c933 3514 'x-mpegurl': 'm3u8',
3515
3516 # audio
3517 'audio/mp4': 'm4a',
3518 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
3519 # Using .mp3 as it's the most popular one
3520 'audio/mpeg': 'mp3',
3521 'audio/webm': 'weba',
3522 'audio/x-matroska': 'mka',
3523 'audio/x-mpegurl': 'm3u',
3524 'midi': 'mid',
3525 'ogg': 'ogg',
3526 'wav': 'wav',
3527 'wave': 'wav',
3528 'x-aac': 'aac',
3529 'x-flac': 'flac',
3530 'x-m4a': 'm4a',
3531 'x-realaudio': 'ra',
39e7107d 3532 'x-wav': 'wav',
9359f3d4 3533
2647c933 3534 # image
3535 'avif': 'avif',
3536 'bmp': 'bmp',
3537 'gif': 'gif',
3538 'jpeg': 'jpg',
3539 'png': 'png',
3540 'svg+xml': 'svg',
3541 'tiff': 'tif',
3542 'vnd.wap.wbmp': 'wbmp',
3543 'webp': 'webp',
3544 'x-icon': 'ico',
3545 'x-jng': 'jng',
3546 'x-ms-bmp': 'bmp',
3547
3548 # caption
3549 'filmstrip+json': 'fs',
3550 'smptett+xml': 'tt',
3551 'ttaf+xml': 'dfxp',
3552 'ttml+xml': 'ttml',
3553 'x-ms-sami': 'sami',
9359f3d4 3554
2647c933 3555 # misc
3556 'gzip': 'gz',
9359f3d4
F
3557 'json': 'json',
3558 'xml': 'xml',
3559 'zip': 'zip',
9359f3d4
F
3560 }
3561
2647c933 3562 mimetype = mt.partition(';')[0].strip().lower()
3563 _, _, subtype = mimetype.rpartition('/')
9359f3d4 3564
2647c933 3565 ext = traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
3566 if ext:
3567 return ext
3568 elif default is not NO_DEFAULT:
3569 return default
9359f3d4 3570 return subtype.replace('+', '.')
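# Illustrative examples (added comments, not part of the original source):
#   >>> mimetype2ext('video/mp4; codecs="avc1.42E01E"')
#   'mp4'
#   >>> mimetype2ext('audio/x-wav')
#   'wav'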
c460bdd5
PH
3571
3572
2814f12b
THD
3573def ext2mimetype(ext_or_url):
3574 if not ext_or_url:
3575 return None
3576 if '.' not in ext_or_url:
3577 ext_or_url = f'file.{ext_or_url}'
3578 return mimetypes.guess_type(ext_or_url)[0]
3579
3580
4f3c5e06 3581def parse_codecs(codecs_str):
3582 # http://tools.ietf.org/html/rfc6381
3583 if not codecs_str:
3584 return {}
a0566bbf 3585 split_codecs = list(filter(None, map(
dbf5416a 3586 str.strip, codecs_str.strip().strip(',').split(','))))
3fe75fdc 3587 vcodec, acodec, scodec, hdr = None, None, None, None
a0566bbf 3588 for full_codec in split_codecs:
d816f61f 3589 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3590 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3591 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3592 if vcodec:
3593 continue
3594 vcodec = full_codec
3595 if parts[0] in ('dvh1', 'dvhe'):
3596 hdr = 'DV'
3597 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3598 hdr = 'HDR10'
3599 elif parts[:2] == ['vp9', '2']:
3600 hdr = 'HDR10'
71082216 3601 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
d816f61f 3602 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3603 acodec = acodec or full_codec
3604 elif parts[0] in ('stpp', 'wvtt'):
3605 scodec = scodec or full_codec
4f3c5e06 3606 else:
19a03940 3607 write_string(f'WARNING: Unknown codec {full_codec}\n')
3fe75fdc 3608 if vcodec or acodec or scodec:
4f3c5e06 3609 return {
3610 'vcodec': vcodec or 'none',
3611 'acodec': acodec or 'none',
176f1866 3612 'dynamic_range': hdr,
3fe75fdc 3613 **({'scodec': scodec} if scodec is not None else {}),
4f3c5e06 3614 }
b69fd25c 3615 elif len(split_codecs) == 2:
3616 return {
3617 'vcodec': split_codecs[0],
3618 'acodec': split_codecs[1],
3619 }
4f3c5e06 3620 return {}
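# Illustrative example (added comment, not part of the original source):
#   >>> parse_codecs('avc1.64001F, mp4a.40.2')
#   {'vcodec': 'avc1.64001F', 'acodec': 'mp4a.40.2', 'dynamic_range': None}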
3621
3622
fc61aff4
LL
3623def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3624 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3625
3626 allow_mkv = not preferences or 'mkv' in preferences
3627
3628 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3629 return 'mkv' # TODO: any other format allows this?
3630
3631 # TODO: Not all codecs supported by parse_codecs are handled here
3632 COMPATIBLE_CODECS = {
3633 'mp4': {
71082216 3634 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
81b6102d 3635 'h264', 'aacl', 'ec-3', # Set in ISM
fc61aff4
LL
3636 },
3637 'webm': {
3638 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3639 'vp9x', 'vp8x', # in the webm spec
3640 },
3641 }
3642
8f84770a 3643 sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
3644 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
fc61aff4
LL
3645
3646 for ext in preferences or COMPATIBLE_CODECS.keys():
3647 codec_set = COMPATIBLE_CODECS.get(ext, set())
3648 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3649 return ext
3650
3651 COMPATIBLE_EXTS = (
3652 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3653 {'webm'},
3654 )
3655 for ext in preferences or vexts:
3656 current_exts = {ext, *vexts, *aexts}
3657 if ext == 'mkv' or current_exts == {ext} or any(
3658 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3659 return ext
3660 return 'mkv' if allow_mkv else preferences[-1]
3661
3662
2647c933 3663def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
79298173 3664 getheader = url_handle.headers.get
2ccd1b10 3665
b55ee18f
PH
3666 cd = getheader('Content-Disposition')
3667 if cd:
3668 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3669 if m:
3670 e = determine_ext(m.group('filename'), default_ext=None)
3671 if e:
3672 return e
3673
2647c933 3674 meta_ext = getheader('x-amz-meta-name')
3675 if meta_ext:
3676 e = meta_ext.rpartition('.')[2]
3677 if e:
3678 return e
3679
3680 return mimetype2ext(getheader('Content-Type'), default=default)
05900629
PH
3681
3682
1e399778
YCH
3683def encode_data_uri(data, mime_type):
3684 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3685
3686
05900629 3687def age_restricted(content_limit, age_limit):
6ec6cb4e 3688 """ Returns True iff the content should be blocked """
05900629
PH
3689
3690 if age_limit is None: # No limit set
3691 return False
3692 if content_limit is None:
3693 return False # Content available for everyone
3694 return age_limit < content_limit
61ca9a80
PH
3695
3696
88f60feb 3697# List of known byte-order-marks (BOM)
a904a7f8
L
3698BOMS = [
3699 (b'\xef\xbb\xbf', 'utf-8'),
3700 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3701 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3702 (b'\xff\xfe', 'utf-16-le'),
3703 (b'\xfe\xff', 'utf-16-be'),
3704]
a904a7f8
L
3705
3706
61ca9a80
PH
3707def is_html(first_bytes):
3708 """ Detect whether a file contains HTML by examining its first bytes. """
3709
80e8493e 3710 encoding = 'utf-8'
61ca9a80 3711 for bom, enc in BOMS:
80e8493e 3712 while first_bytes.startswith(bom):
3713 encoding, first_bytes = enc, first_bytes[len(bom):]
61ca9a80 3714
80e8493e 3715 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
a055469f
PH
3716
3717
3718def determine_protocol(info_dict):
3719 protocol = info_dict.get('protocol')
3720 if protocol is not None:
3721 return protocol
3722
7de837a5 3723 url = sanitize_url(info_dict['url'])
a055469f
PH
3724 if url.startswith('rtmp'):
3725 return 'rtmp'
3726 elif url.startswith('mms'):
3727 return 'mms'
3728 elif url.startswith('rtsp'):
3729 return 'rtsp'
3730
3731 ext = determine_ext(url)
3732 if ext == 'm3u8':
deae7c17 3733 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
a055469f
PH
3734 elif ext == 'f4m':
3735 return 'f4m'
3736
14f25df2 3737 return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3738
3739
c5e3f849 3740def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3741 """ Render a list of rows, each as a list of values.
3742 Text after a \t will be right aligned """
ec11a9f4 3743 def width(string):
c5e3f849 3744 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3745
3746 def get_max_lens(table):
ec11a9f4 3747 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3748
3749 def filter_using_list(row, filterArray):
d16df59d 3750 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
76d321f6 3751
d16df59d 3752 max_lens = get_max_lens(data) if hide_empty else []
3753 header_row = filter_using_list(header_row, max_lens)
3754 data = [filter_using_list(row, max_lens) for row in data]
76d321f6 3755
cfb56d1a 3756 table = [header_row] + data
76d321f6 3757 max_lens = get_max_lens(table)
c5e3f849 3758 extra_gap += 1
76d321f6 3759 if delim:
c5e3f849 3760 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
1ed7953a 3761 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
ec11a9f4 3762 for row in table:
3763 for pos, text in enumerate(map(str, row)):
c5e3f849 3764 if '\t' in text:
3765 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3766 else:
3767 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3768 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3769 return ret
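# Illustrative example (added comment, not part of the original source):
#   >>> print(render_table(['ID', 'NAME'], [['1', 'foo'], ['22', 'bar']]))
#   ID NAME
#   1  foo
#   22 bar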
347de493
PH
3770
3771
8f18aca8 3772def _match_one(filter_part, dct, incomplete):
77b87f05 3773 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3774 STRING_OPERATORS = {
3775 '*=': operator.contains,
3776 '^=': lambda attr, value: attr.startswith(value),
3777 '$=': lambda attr, value: attr.endswith(value),
3778 '~=': lambda attr, value: re.search(value, attr),
3779 }
347de493 3780 COMPARISON_OPERATORS = {
a047eeb6 3781 **STRING_OPERATORS,
3782 '<=': operator.le, # "<=" must be defined above "<"
347de493 3783 '<': operator.lt,
347de493 3784 '>=': operator.ge,
a047eeb6 3785 '>': operator.gt,
347de493 3786 '=': operator.eq,
347de493 3787 }
a047eeb6 3788
6db9c4d5 3789 if isinstance(incomplete, bool):
3790 is_incomplete = lambda _: incomplete
3791 else:
3792 is_incomplete = lambda k: k in incomplete
3793
64fa820c 3794 operator_rex = re.compile(r'''(?x)
347de493 3795 (?P<key>[a-z_]+)
77b87f05 3796 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3797 (?:
a047eeb6 3798 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3799 (?P<strval>.+?)
347de493 3800 )
347de493 3801 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3802 m = operator_rex.fullmatch(filter_part.strip())
347de493 3803 if m:
18f96d12 3804 m = m.groupdict()
3805 unnegated_op = COMPARISON_OPERATORS[m['op']]
3806 if m['negation']:
77b87f05
MT
3807 op = lambda attr, value: not unnegated_op(attr, value)
3808 else:
3809 op = unnegated_op
18f96d12 3810 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3811 if m['quote']:
3812 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3813 actual_value = dct.get(m['key'])
3814 numeric_comparison = None
f9934b96 3815 if isinstance(actual_value, (int, float)):
e5a088dc
S
3816 # If the original field is a string and the matching comparison value is
3817 # a number, we should respect the origin of the original field
3818 # and process comparison value as a string (see
18f96d12 3819 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3820 try:
18f96d12 3821 numeric_comparison = int(comparison_value)
347de493 3822 except ValueError:
18f96d12 3823 numeric_comparison = parse_filesize(comparison_value)
3824 if numeric_comparison is None:
3825 numeric_comparison = parse_filesize(f'{comparison_value}B')
3826 if numeric_comparison is None:
3827 numeric_comparison = parse_duration(comparison_value)
3828 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3829 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3830 if actual_value is None:
6db9c4d5 3831 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3832 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3833
3834 UNARY_OPERATORS = {
1cc47c66
S
3835 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3836 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3837 }
64fa820c 3838 operator_rex = re.compile(r'''(?x)
347de493 3839 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3840 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3841 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3842 if m:
3843 op = UNARY_OPERATORS[m.group('op')]
3844 actual_value = dct.get(m.group('key'))
6db9c4d5 3845 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3846 return True
347de493
PH
3847 return op(actual_value)
3848
3849 raise ValueError('Invalid filter part %r' % filter_part)
3850
3851
8f18aca8 3852def match_str(filter_str, dct, incomplete=False):
6db9c4d5 3853 """ Filter a dictionary with a simple string syntax.
3854 @returns Whether the filter passes
3855 @param incomplete Set of keys that are expected to be missing from dct.
3856 Can be True/False to indicate all/none of the keys may be missing.
3857 All conditions on incomplete keys pass if the key is missing
8f18aca8 3858 """
347de493 3859 return all(
8f18aca8 3860 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3861 for filter_part in re.split(r'(?<!\\)&', filter_str))
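# Illustrative example (added comment, not part of the original source), using
# the same filter syntax as --match-filter:
#   >>> match_str('like_count > 100 & description', {'like_count': 190, 'description': 'foo'})
#   True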
347de493
PH
3862
3863
b1a7cd05 3864def match_filter_func(filters):
3865 if not filters:
d1b5f70b 3866 return None
492272fe 3867 filters = set(variadic(filters))
d1b5f70b 3868
492272fe 3869 interactive = '-' in filters
3870 if interactive:
3871 filters.remove('-')
3872
3873 def _match_func(info_dict, incomplete=False):
3874 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3875 return NO_DEFAULT if interactive and not incomplete else None
347de493 3876 else:
3bec830a 3877 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
b1a7cd05 3878 filter_str = ') | ('.join(map(str.strip, filters))
3879 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
347de493 3880 return _match_func
91410c9b
PH
3881
3882
f2df4071 3883class download_range_func:
3884 def __init__(self, chapters, ranges):
3885 self.chapters, self.ranges = chapters, ranges
3886
3887 def __call__(self, info_dict, ydl):
0500ee3d 3888 if not self.ranges and not self.chapters:
3889 yield {}
3890
5ec1b6b7 3891 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
56ba69e4 3892 else 'Cannot match chapters since chapter information is unavailable')
f2df4071 3893 for regex in self.chapters or []:
5ec1b6b7 3894 for i, chapter in enumerate(info_dict.get('chapters') or []):
3895 if re.search(regex, chapter['title']):
3896 warning = None
3897 yield {**chapter, 'index': i}
f2df4071 3898 if self.chapters and warning:
5ec1b6b7 3899 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3900
f2df4071 3901 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
5ec1b6b7 3902
f2df4071 3903 def __eq__(self, other):
3904 return (isinstance(other, download_range_func)
3905 and self.chapters == other.chapters and self.ranges == other.ranges)
5ec1b6b7 3906
71df9b7f 3907 def __repr__(self):
3908 return f'{type(self).__name__}({self.chapters}, {self.ranges})'
3909
5ec1b6b7 3910
bf6427d2
YCH
3911def parse_dfxp_time_expr(time_expr):
3912 if not time_expr:
d631d5f9 3913 return
bf6427d2 3914
1d485a1a 3915 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
bf6427d2
YCH
3916 if mobj:
3917 return float(mobj.group('time_offset'))
3918
db2fe38b 3919 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3920 if mobj:
db2fe38b 3921 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
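# Illustrative examples (added comments, not part of the original source):
#   >>> parse_dfxp_time_expr('5.2s'), parse_dfxp_time_expr('00:01:30.5')
#   (5.2, 90.5)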
bf6427d2
YCH
3922
3923
c1c924ab 3924def srt_subtitles_timecode(seconds):
aa7785f8 3925 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3926
3927
3928def ass_subtitles_timecode(seconds):
3929 time = timetuple_from_msec(seconds * 1000)
3930 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
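# Illustrative examples (added comments, not part of the original source):
#   >>> srt_subtitles_timecode(3661.5)
#   '01:01:01,500'
#   >>> ass_subtitles_timecode(3661.5)
#   '1:01:01.50'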
bf6427d2
YCH
3931
3932
3933def dfxp2srt(dfxp_data):
3869028f
YCH
3934 '''
3935 @param dfxp_data A bytes-like object containing DFXP data
3936 @returns A unicode object containing converted SRT data
3937 '''
5b995f71 3938 LEGACY_NAMESPACES = (
3869028f
YCH
3939 (b'http://www.w3.org/ns/ttml', [
3940 b'http://www.w3.org/2004/11/ttaf1',
3941 b'http://www.w3.org/2006/04/ttaf1',
3942 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3943 ]),
3869028f
YCH
3944 (b'http://www.w3.org/ns/ttml#styling', [
3945 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
3946 ]),
3947 )
3948
3949 SUPPORTED_STYLING = [
3950 'color',
3951 'fontFamily',
3952 'fontSize',
3953 'fontStyle',
3954 'fontWeight',
3955 'textDecoration'
3956 ]
3957
4e335771 3958 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 3959 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 3960 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 3961 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 3962 })
bf6427d2 3963
5b995f71
RA
3964 styles = {}
3965 default_style = {}
3966
86e5f3ed 3967 class TTMLPElementParser:
5b995f71
RA
3968 _out = ''
3969 _unclosed_elements = []
3970 _applied_styles = []
bf6427d2 3971
2b14cb56 3972 def start(self, tag, attrib):
5b995f71
RA
3973 if tag in (_x('ttml:br'), 'br'):
3974 self._out += '\n'
3975 else:
3976 unclosed_elements = []
3977 style = {}
3978 element_style_id = attrib.get('style')
3979 if default_style:
3980 style.update(default_style)
3981 if element_style_id:
3982 style.update(styles.get(element_style_id, {}))
3983 for prop in SUPPORTED_STYLING:
3984 prop_val = attrib.get(_x('tts:' + prop))
3985 if prop_val:
3986 style[prop] = prop_val
3987 if style:
3988 font = ''
3989 for k, v in sorted(style.items()):
3990 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3991 continue
3992 if k == 'color':
3993 font += ' color="%s"' % v
3994 elif k == 'fontSize':
3995 font += ' size="%s"' % v
3996 elif k == 'fontFamily':
3997 font += ' face="%s"' % v
3998 elif k == 'fontWeight' and v == 'bold':
3999 self._out += '<b>'
4000 unclosed_elements.append('b')
4001 elif k == 'fontStyle' and v == 'italic':
4002 self._out += '<i>'
4003 unclosed_elements.append('i')
4004 elif k == 'textDecoration' and v == 'underline':
4005 self._out += '<u>'
4006 unclosed_elements.append('u')
4007 if font:
4008 self._out += '<font' + font + '>'
4009 unclosed_elements.append('font')
4010 applied_style = {}
4011 if self._applied_styles:
4012 applied_style.update(self._applied_styles[-1])
4013 applied_style.update(style)
4014 self._applied_styles.append(applied_style)
4015 self._unclosed_elements.append(unclosed_elements)
bf6427d2 4016
2b14cb56 4017 def end(self, tag):
5b995f71
RA
4018 if tag not in (_x('ttml:br'), 'br'):
4019 unclosed_elements = self._unclosed_elements.pop()
4020 for element in reversed(unclosed_elements):
4021 self._out += '</%s>' % element
4022 if unclosed_elements and self._applied_styles:
4023 self._applied_styles.pop()
bf6427d2 4024
2b14cb56 4025 def data(self, data):
5b995f71 4026 self._out += data
2b14cb56 4027
4028 def close(self):
5b995f71 4029 return self._out.strip()
2b14cb56 4030
4031 def parse_node(node):
4032 target = TTMLPElementParser()
4033 parser = xml.etree.ElementTree.XMLParser(target=target)
4034 parser.feed(xml.etree.ElementTree.tostring(node))
4035 return parser.close()
bf6427d2 4036
5b995f71
RA
4037 for k, v in LEGACY_NAMESPACES:
4038 for ns in v:
4039 dfxp_data = dfxp_data.replace(ns, k)
4040
3869028f 4041 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 4042 out = []
5b995f71 4043 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
4044
4045 if not paras:
4046 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 4047
5b995f71
RA
4048 repeat = False
4049 while True:
4050 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
4051 style_id = style.get('id') or style.get(_x('xml:id'))
4052 if not style_id:
4053 continue
5b995f71
RA
4054 parent_style_id = style.get('style')
4055 if parent_style_id:
4056 if parent_style_id not in styles:
4057 repeat = True
4058 continue
4059 styles[style_id] = styles[parent_style_id].copy()
4060 for prop in SUPPORTED_STYLING:
4061 prop_val = style.get(_x('tts:' + prop))
4062 if prop_val:
4063 styles.setdefault(style_id, {})[prop] = prop_val
4064 if repeat:
4065 repeat = False
4066 else:
4067 break
4068
4069 for p in ('body', 'div'):
4070 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4071 if ele is None:
4072 continue
4073 style = styles.get(ele.get('style'))
4074 if not style:
4075 continue
4076 default_style.update(style)
4077
bf6427d2 4078 for para, index in zip(paras, itertools.count(1)):
d631d5f9 4079 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 4080 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
4081 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4082 if begin_time is None:
4083 continue
7dff0363 4084 if not end_time:
d631d5f9
YCH
4085 if not dur:
4086 continue
4087 end_time = begin_time + dur
bf6427d2
YCH
4088 out.append('%d\n%s --> %s\n%s\n\n' % (
4089 index,
c1c924ab
YCH
4090 srt_subtitles_timecode(begin_time),
4091 srt_subtitles_timecode(end_time),
bf6427d2
YCH
4092 parse_node(para)))
4093
4094 return ''.join(out)
4095
4096
c487cf00 4097def cli_option(params, command_option, param, separator=None):
66e289ba 4098 param = params.get(param)
c487cf00 4099 return ([] if param is None
4100 else [command_option, str(param)] if separator is None
4101 else [f'{command_option}{separator}{param}'])
66e289ba
S
4102
4103
4104def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4105 param = params.get(param)
c487cf00 4106 assert param in (True, False, None)
4107 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
66e289ba
S
4108
4109
4110def cli_valueless_option(params, command_option, param, expected_value=True):
c487cf00 4111 return [command_option] if params.get(param) == expected_value else []
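# Illustrative examples (added comments, not part of the original source):
#   >>> cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy')
#   ['--proxy', '127.0.0.1:3128']
#   >>> cli_valueless_option({'quiet': True}, '--quiet', 'quiet')
#   ['--quiet']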
66e289ba
S
4112
4113
e92caff5 4114def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 4115 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 4116 if use_compat:
5b1ecbb3 4117 return argdict
4118 else:
4119 argdict = None
eab9b2bc 4120 if argdict is None:
5b1ecbb3 4121 return default
eab9b2bc 4122 assert isinstance(argdict, dict)
4123
e92caff5 4124 assert isinstance(keys, (list, tuple))
4125 for key_list in keys:
e92caff5 4126 arg_list = list(filter(
4127 lambda x: x is not None,
6606817a 4128 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 4129 if arg_list:
4130 return [arg for args in arg_list for arg in args]
4131 return default
66e289ba 4132
6251555f 4133
330690a2 4134def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4135 main_key, exe = main_key.lower(), exe.lower()
4136 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4137 keys = [f'{root_key}{k}' for k in (keys or [''])]
4138 if root_key in keys:
4139 if main_key != exe:
4140 keys.append((main_key, exe))
4141 keys.append('default')
4142 else:
4143 use_compat = False
4144 return cli_configuration_args(argdict, keys, default, use_compat)
4145
66e289ba 4146
86e5f3ed 4147class ISO639Utils:
39672624
YCH
4148 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4149 _lang_map = {
4150 'aa': 'aar',
4151 'ab': 'abk',
4152 'ae': 'ave',
4153 'af': 'afr',
4154 'ak': 'aka',
4155 'am': 'amh',
4156 'an': 'arg',
4157 'ar': 'ara',
4158 'as': 'asm',
4159 'av': 'ava',
4160 'ay': 'aym',
4161 'az': 'aze',
4162 'ba': 'bak',
4163 'be': 'bel',
4164 'bg': 'bul',
4165 'bh': 'bih',
4166 'bi': 'bis',
4167 'bm': 'bam',
4168 'bn': 'ben',
4169 'bo': 'bod',
4170 'br': 'bre',
4171 'bs': 'bos',
4172 'ca': 'cat',
4173 'ce': 'che',
4174 'ch': 'cha',
4175 'co': 'cos',
4176 'cr': 'cre',
4177 'cs': 'ces',
4178 'cu': 'chu',
4179 'cv': 'chv',
4180 'cy': 'cym',
4181 'da': 'dan',
4182 'de': 'deu',
4183 'dv': 'div',
4184 'dz': 'dzo',
4185 'ee': 'ewe',
4186 'el': 'ell',
4187 'en': 'eng',
4188 'eo': 'epo',
4189 'es': 'spa',
4190 'et': 'est',
4191 'eu': 'eus',
4192 'fa': 'fas',
4193 'ff': 'ful',
4194 'fi': 'fin',
4195 'fj': 'fij',
4196 'fo': 'fao',
4197 'fr': 'fra',
4198 'fy': 'fry',
4199 'ga': 'gle',
4200 'gd': 'gla',
4201 'gl': 'glg',
4202 'gn': 'grn',
4203 'gu': 'guj',
4204 'gv': 'glv',
4205 'ha': 'hau',
4206 'he': 'heb',
b7acc835 4207 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
4208 'hi': 'hin',
4209 'ho': 'hmo',
4210 'hr': 'hrv',
4211 'ht': 'hat',
4212 'hu': 'hun',
4213 'hy': 'hye',
4214 'hz': 'her',
4215 'ia': 'ina',
4216 'id': 'ind',
b7acc835 4217 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
4218 'ie': 'ile',
4219 'ig': 'ibo',
4220 'ii': 'iii',
4221 'ik': 'ipk',
4222 'io': 'ido',
4223 'is': 'isl',
4224 'it': 'ita',
4225 'iu': 'iku',
4226 'ja': 'jpn',
4227 'jv': 'jav',
4228 'ka': 'kat',
4229 'kg': 'kon',
4230 'ki': 'kik',
4231 'kj': 'kua',
4232 'kk': 'kaz',
4233 'kl': 'kal',
4234 'km': 'khm',
4235 'kn': 'kan',
4236 'ko': 'kor',
4237 'kr': 'kau',
4238 'ks': 'kas',
4239 'ku': 'kur',
4240 'kv': 'kom',
4241 'kw': 'cor',
4242 'ky': 'kir',
4243 'la': 'lat',
4244 'lb': 'ltz',
4245 'lg': 'lug',
4246 'li': 'lim',
4247 'ln': 'lin',
4248 'lo': 'lao',
4249 'lt': 'lit',
4250 'lu': 'lub',
4251 'lv': 'lav',
4252 'mg': 'mlg',
4253 'mh': 'mah',
4254 'mi': 'mri',
4255 'mk': 'mkd',
4256 'ml': 'mal',
4257 'mn': 'mon',
4258 'mr': 'mar',
4259 'ms': 'msa',
4260 'mt': 'mlt',
4261 'my': 'mya',
4262 'na': 'nau',
4263 'nb': 'nob',
4264 'nd': 'nde',
4265 'ne': 'nep',
4266 'ng': 'ndo',
4267 'nl': 'nld',
4268 'nn': 'nno',
4269 'no': 'nor',
4270 'nr': 'nbl',
4271 'nv': 'nav',
4272 'ny': 'nya',
4273 'oc': 'oci',
4274 'oj': 'oji',
4275 'om': 'orm',
4276 'or': 'ori',
4277 'os': 'oss',
4278 'pa': 'pan',
4279 'pi': 'pli',
4280 'pl': 'pol',
4281 'ps': 'pus',
4282 'pt': 'por',
4283 'qu': 'que',
4284 'rm': 'roh',
4285 'rn': 'run',
4286 'ro': 'ron',
4287 'ru': 'rus',
4288 'rw': 'kin',
4289 'sa': 'san',
4290 'sc': 'srd',
4291 'sd': 'snd',
4292 'se': 'sme',
4293 'sg': 'sag',
4294 'si': 'sin',
4295 'sk': 'slk',
4296 'sl': 'slv',
4297 'sm': 'smo',
4298 'sn': 'sna',
4299 'so': 'som',
4300 'sq': 'sqi',
4301 'sr': 'srp',
4302 'ss': 'ssw',
4303 'st': 'sot',
4304 'su': 'sun',
4305 'sv': 'swe',
4306 'sw': 'swa',
4307 'ta': 'tam',
4308 'te': 'tel',
4309 'tg': 'tgk',
4310 'th': 'tha',
4311 'ti': 'tir',
4312 'tk': 'tuk',
4313 'tl': 'tgl',
4314 'tn': 'tsn',
4315 'to': 'ton',
4316 'tr': 'tur',
4317 'ts': 'tso',
4318 'tt': 'tat',
4319 'tw': 'twi',
4320 'ty': 'tah',
4321 'ug': 'uig',
4322 'uk': 'ukr',
4323 'ur': 'urd',
4324 'uz': 'uzb',
4325 've': 'ven',
4326 'vi': 'vie',
4327 'vo': 'vol',
4328 'wa': 'wln',
4329 'wo': 'wol',
4330 'xh': 'xho',
4331 'yi': 'yid',
e9a50fba 4332 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
4333 'yo': 'yor',
4334 'za': 'zha',
4335 'zh': 'zho',
4336 'zu': 'zul',
4337 }
4338
4339 @classmethod
4340 def short2long(cls, code):
4341 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4342 return cls._lang_map.get(code[:2])
4343
4344 @classmethod
4345 def long2short(cls, code):
4346 """Convert language code from ISO 639-2/T to ISO 639-1"""
4347 for short_name, long_name in cls._lang_map.items():
4348 if long_name == code:
4349 return short_name
4350
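    # Illustrative examples (added comments, not part of the original source):
    #   >>> ISO639Utils.short2long('en'), ISO639Utils.long2short('fra')
    #   ('eng', 'fr')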
4351
86e5f3ed 4352class ISO3166Utils:
4eb10f66
YCH
4353 # From http://data.okfn.org/data/core/country-list
4354 _country_map = {
4355 'AF': 'Afghanistan',
4356 'AX': 'Åland Islands',
4357 'AL': 'Albania',
4358 'DZ': 'Algeria',
4359 'AS': 'American Samoa',
4360 'AD': 'Andorra',
4361 'AO': 'Angola',
4362 'AI': 'Anguilla',
4363 'AQ': 'Antarctica',
4364 'AG': 'Antigua and Barbuda',
4365 'AR': 'Argentina',
4366 'AM': 'Armenia',
4367 'AW': 'Aruba',
4368 'AU': 'Australia',
4369 'AT': 'Austria',
4370 'AZ': 'Azerbaijan',
4371 'BS': 'Bahamas',
4372 'BH': 'Bahrain',
4373 'BD': 'Bangladesh',
4374 'BB': 'Barbados',
4375 'BY': 'Belarus',
4376 'BE': 'Belgium',
4377 'BZ': 'Belize',
4378 'BJ': 'Benin',
4379 'BM': 'Bermuda',
4380 'BT': 'Bhutan',
4381 'BO': 'Bolivia, Plurinational State of',
4382 'BQ': 'Bonaire, Sint Eustatius and Saba',
4383 'BA': 'Bosnia and Herzegovina',
4384 'BW': 'Botswana',
4385 'BV': 'Bouvet Island',
4386 'BR': 'Brazil',
4387 'IO': 'British Indian Ocean Territory',
4388 'BN': 'Brunei Darussalam',
4389 'BG': 'Bulgaria',
4390 'BF': 'Burkina Faso',
4391 'BI': 'Burundi',
4392 'KH': 'Cambodia',
4393 'CM': 'Cameroon',
4394 'CA': 'Canada',
4395 'CV': 'Cape Verde',
4396 'KY': 'Cayman Islands',
4397 'CF': 'Central African Republic',
4398 'TD': 'Chad',
4399 'CL': 'Chile',
4400 'CN': 'China',
4401 'CX': 'Christmas Island',
4402 'CC': 'Cocos (Keeling) Islands',
4403 'CO': 'Colombia',
4404 'KM': 'Comoros',
4405 'CG': 'Congo',
4406 'CD': 'Congo, the Democratic Republic of the',
4407 'CK': 'Cook Islands',
4408 'CR': 'Costa Rica',
4409 'CI': 'Côte d\'Ivoire',
4410 'HR': 'Croatia',
4411 'CU': 'Cuba',
4412 'CW': 'Curaçao',
4413 'CY': 'Cyprus',
4414 'CZ': 'Czech Republic',
4415 'DK': 'Denmark',
4416 'DJ': 'Djibouti',
4417 'DM': 'Dominica',
4418 'DO': 'Dominican Republic',
4419 'EC': 'Ecuador',
4420 'EG': 'Egypt',
4421 'SV': 'El Salvador',
4422 'GQ': 'Equatorial Guinea',
4423 'ER': 'Eritrea',
4424 'EE': 'Estonia',
4425 'ET': 'Ethiopia',
4426 'FK': 'Falkland Islands (Malvinas)',
4427 'FO': 'Faroe Islands',
4428 'FJ': 'Fiji',
4429 'FI': 'Finland',
4430 'FR': 'France',
4431 'GF': 'French Guiana',
4432 'PF': 'French Polynesia',
4433 'TF': 'French Southern Territories',
4434 'GA': 'Gabon',
4435 'GM': 'Gambia',
4436 'GE': 'Georgia',
4437 'DE': 'Germany',
4438 'GH': 'Ghana',
4439 'GI': 'Gibraltar',
4440 'GR': 'Greece',
4441 'GL': 'Greenland',
4442 'GD': 'Grenada',
4443 'GP': 'Guadeloupe',
4444 'GU': 'Guam',
4445 'GT': 'Guatemala',
4446 'GG': 'Guernsey',
4447 'GN': 'Guinea',
4448 'GW': 'Guinea-Bissau',
4449 'GY': 'Guyana',
4450 'HT': 'Haiti',
4451 'HM': 'Heard Island and McDonald Islands',
4452 'VA': 'Holy See (Vatican City State)',
4453 'HN': 'Honduras',
4454 'HK': 'Hong Kong',
4455 'HU': 'Hungary',
4456 'IS': 'Iceland',
4457 'IN': 'India',
4458 'ID': 'Indonesia',
4459 'IR': 'Iran, Islamic Republic of',
4460 'IQ': 'Iraq',
4461 'IE': 'Ireland',
4462 'IM': 'Isle of Man',
4463 'IL': 'Israel',
4464 'IT': 'Italy',
4465 'JM': 'Jamaica',
4466 'JP': 'Japan',
4467 'JE': 'Jersey',
4468 'JO': 'Jordan',
4469 'KZ': 'Kazakhstan',
4470 'KE': 'Kenya',
4471 'KI': 'Kiribati',
4472 'KP': 'Korea, Democratic People\'s Republic of',
4473 'KR': 'Korea, Republic of',
4474 'KW': 'Kuwait',
4475 'KG': 'Kyrgyzstan',
4476 'LA': 'Lao People\'s Democratic Republic',
4477 'LV': 'Latvia',
4478 'LB': 'Lebanon',
4479 'LS': 'Lesotho',
4480 'LR': 'Liberia',
4481 'LY': 'Libya',
4482 'LI': 'Liechtenstein',
4483 'LT': 'Lithuania',
4484 'LU': 'Luxembourg',
4485 'MO': 'Macao',
4486 'MK': 'Macedonia, the Former Yugoslav Republic of',
4487 'MG': 'Madagascar',
4488 'MW': 'Malawi',
4489 'MY': 'Malaysia',
4490 'MV': 'Maldives',
4491 'ML': 'Mali',
4492 'MT': 'Malta',
4493 'MH': 'Marshall Islands',
4494 'MQ': 'Martinique',
4495 'MR': 'Mauritania',
4496 'MU': 'Mauritius',
4497 'YT': 'Mayotte',
4498 'MX': 'Mexico',
4499 'FM': 'Micronesia, Federated States of',
4500 'MD': 'Moldova, Republic of',
4501 'MC': 'Monaco',
4502 'MN': 'Mongolia',
4503 'ME': 'Montenegro',
4504 'MS': 'Montserrat',
4505 'MA': 'Morocco',
4506 'MZ': 'Mozambique',
4507 'MM': 'Myanmar',
4508 'NA': 'Namibia',
4509 'NR': 'Nauru',
4510 'NP': 'Nepal',
4511 'NL': 'Netherlands',
4512 'NC': 'New Caledonia',
4513 'NZ': 'New Zealand',
4514 'NI': 'Nicaragua',
4515 'NE': 'Niger',
4516 'NG': 'Nigeria',
4517 'NU': 'Niue',
4518 'NF': 'Norfolk Island',
4519 'MP': 'Northern Mariana Islands',
4520 'NO': 'Norway',
4521 'OM': 'Oman',
4522 'PK': 'Pakistan',
4523 'PW': 'Palau',
4524 'PS': 'Palestine, State of',
4525 'PA': 'Panama',
4526 'PG': 'Papua New Guinea',
4527 'PY': 'Paraguay',
4528 'PE': 'Peru',
4529 'PH': 'Philippines',
4530 'PN': 'Pitcairn',
4531 'PL': 'Poland',
4532 'PT': 'Portugal',
4533 'PR': 'Puerto Rico',
4534 'QA': 'Qatar',
4535 'RE': 'Réunion',
4536 'RO': 'Romania',
4537 'RU': 'Russian Federation',
4538 'RW': 'Rwanda',
4539 'BL': 'Saint Barthélemy',
4540 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4541 'KN': 'Saint Kitts and Nevis',
4542 'LC': 'Saint Lucia',
4543 'MF': 'Saint Martin (French part)',
4544 'PM': 'Saint Pierre and Miquelon',
4545 'VC': 'Saint Vincent and the Grenadines',
4546 'WS': 'Samoa',
4547 'SM': 'San Marino',
4548 'ST': 'Sao Tome and Principe',
4549 'SA': 'Saudi Arabia',
4550 'SN': 'Senegal',
4551 'RS': 'Serbia',
4552 'SC': 'Seychelles',
4553 'SL': 'Sierra Leone',
4554 'SG': 'Singapore',
4555 'SX': 'Sint Maarten (Dutch part)',
4556 'SK': 'Slovakia',
4557 'SI': 'Slovenia',
4558 'SB': 'Solomon Islands',
4559 'SO': 'Somalia',
4560 'ZA': 'South Africa',
4561 'GS': 'South Georgia and the South Sandwich Islands',
4562 'SS': 'South Sudan',
4563 'ES': 'Spain',
4564 'LK': 'Sri Lanka',
4565 'SD': 'Sudan',
4566 'SR': 'Suriname',
4567 'SJ': 'Svalbard and Jan Mayen',
4568 'SZ': 'Swaziland',
4569 'SE': 'Sweden',
4570 'CH': 'Switzerland',
4571 'SY': 'Syrian Arab Republic',
4572 'TW': 'Taiwan, Province of China',
4573 'TJ': 'Tajikistan',
4574 'TZ': 'Tanzania, United Republic of',
4575 'TH': 'Thailand',
4576 'TL': 'Timor-Leste',
4577 'TG': 'Togo',
4578 'TK': 'Tokelau',
4579 'TO': 'Tonga',
4580 'TT': 'Trinidad and Tobago',
4581 'TN': 'Tunisia',
4582 'TR': 'Turkey',
4583 'TM': 'Turkmenistan',
4584 'TC': 'Turks and Caicos Islands',
4585 'TV': 'Tuvalu',
4586 'UG': 'Uganda',
4587 'UA': 'Ukraine',
4588 'AE': 'United Arab Emirates',
4589 'GB': 'United Kingdom',
4590 'US': 'United States',
4591 'UM': 'United States Minor Outlying Islands',
4592 'UY': 'Uruguay',
4593 'UZ': 'Uzbekistan',
4594 'VU': 'Vanuatu',
4595 'VE': 'Venezuela, Bolivarian Republic of',
4596 'VN': 'Viet Nam',
4597 'VG': 'Virgin Islands, British',
4598 'VI': 'Virgin Islands, U.S.',
4599 'WF': 'Wallis and Futuna',
4600 'EH': 'Western Sahara',
4601 'YE': 'Yemen',
4602 'ZM': 'Zambia',
4603 'ZW': 'Zimbabwe',
2f97cc61 4604 # Not ISO 3166 codes, but used for IP blocks
4605 'AP': 'Asia/Pacific Region',
4606 'EU': 'Europe',
4eb10f66
YCH
4607 }
4608
4609 @classmethod
4610 def short2full(cls, code):
4611 """Convert an ISO 3166-2 country code to the corresponding full name"""
4612 return cls._country_map.get(code.upper())
4613
4614
86e5f3ed 4615class GeoUtils:
773f291d
S
4616 # Major IPv4 address blocks per country
4617 _country_ip_map = {
53896ca5 4618 'AD': '46.172.224.0/19',
773f291d
S
4619 'AE': '94.200.0.0/13',
4620 'AF': '149.54.0.0/17',
4621 'AG': '209.59.64.0/18',
4622 'AI': '204.14.248.0/21',
4623 'AL': '46.99.0.0/16',
4624 'AM': '46.70.0.0/15',
4625 'AO': '105.168.0.0/13',
53896ca5
S
4626 'AP': '182.50.184.0/21',
4627 'AQ': '23.154.160.0/24',
773f291d
S
4628 'AR': '181.0.0.0/12',
4629 'AS': '202.70.112.0/20',
53896ca5 4630 'AT': '77.116.0.0/14',
773f291d
S
4631 'AU': '1.128.0.0/11',
4632 'AW': '181.41.0.0/18',
53896ca5
S
4633 'AX': '185.217.4.0/22',
4634 'AZ': '5.197.0.0/16',
773f291d
S
4635 'BA': '31.176.128.0/17',
4636 'BB': '65.48.128.0/17',
4637 'BD': '114.130.0.0/16',
4638 'BE': '57.0.0.0/8',
53896ca5 4639 'BF': '102.178.0.0/15',
773f291d
S
4640 'BG': '95.42.0.0/15',
4641 'BH': '37.131.0.0/17',
4642 'BI': '154.117.192.0/18',
4643 'BJ': '137.255.0.0/16',
53896ca5 4644 'BL': '185.212.72.0/23',
773f291d
S
4645 'BM': '196.12.64.0/18',
4646 'BN': '156.31.0.0/16',
4647 'BO': '161.56.0.0/16',
4648 'BQ': '161.0.80.0/20',
53896ca5 4649 'BR': '191.128.0.0/12',
773f291d
S
4650 'BS': '24.51.64.0/18',
4651 'BT': '119.2.96.0/19',
4652 'BW': '168.167.0.0/16',
4653 'BY': '178.120.0.0/13',
4654 'BZ': '179.42.192.0/18',
4655 'CA': '99.224.0.0/11',
4656 'CD': '41.243.0.0/16',
53896ca5
S
4657 'CF': '197.242.176.0/21',
4658 'CG': '160.113.0.0/16',
773f291d 4659 'CH': '85.0.0.0/13',
53896ca5 4660 'CI': '102.136.0.0/14',
773f291d
S
4661 'CK': '202.65.32.0/19',
4662 'CL': '152.172.0.0/14',
53896ca5 4663 'CM': '102.244.0.0/14',
773f291d
S
4664 'CN': '36.128.0.0/10',
4665 'CO': '181.240.0.0/12',
4666 'CR': '201.192.0.0/12',
4667 'CU': '152.206.0.0/15',
4668 'CV': '165.90.96.0/19',
4669 'CW': '190.88.128.0/17',
53896ca5 4670 'CY': '31.153.0.0/16',
773f291d
S
4671 'CZ': '88.100.0.0/14',
4672 'DE': '53.0.0.0/8',
4673 'DJ': '197.241.0.0/17',
4674 'DK': '87.48.0.0/12',
4675 'DM': '192.243.48.0/20',
4676 'DO': '152.166.0.0/15',
4677 'DZ': '41.96.0.0/12',
4678 'EC': '186.68.0.0/15',
4679 'EE': '90.190.0.0/15',
4680 'EG': '156.160.0.0/11',
4681 'ER': '196.200.96.0/20',
4682 'ES': '88.0.0.0/11',
4683 'ET': '196.188.0.0/14',
4684 'EU': '2.16.0.0/13',
4685 'FI': '91.152.0.0/13',
4686 'FJ': '144.120.0.0/16',
53896ca5 4687 'FK': '80.73.208.0/21',
773f291d
S
4688 'FM': '119.252.112.0/20',
4689 'FO': '88.85.32.0/19',
4690 'FR': '90.0.0.0/9',
4691 'GA': '41.158.0.0/15',
4692 'GB': '25.0.0.0/8',
4693 'GD': '74.122.88.0/21',
4694 'GE': '31.146.0.0/16',
4695 'GF': '161.22.64.0/18',
4696 'GG': '62.68.160.0/19',
53896ca5
S
4697 'GH': '154.160.0.0/12',
4698 'GI': '95.164.0.0/16',
773f291d
S
4699 'GL': '88.83.0.0/19',
4700 'GM': '160.182.0.0/15',
4701 'GN': '197.149.192.0/18',
4702 'GP': '104.250.0.0/19',
4703 'GQ': '105.235.224.0/20',
4704 'GR': '94.64.0.0/13',
4705 'GT': '168.234.0.0/16',
4706 'GU': '168.123.0.0/16',
4707 'GW': '197.214.80.0/20',
4708 'GY': '181.41.64.0/18',
4709 'HK': '113.252.0.0/14',
4710 'HN': '181.210.0.0/16',
4711 'HR': '93.136.0.0/13',
4712 'HT': '148.102.128.0/17',
4713 'HU': '84.0.0.0/14',
4714 'ID': '39.192.0.0/10',
4715 'IE': '87.32.0.0/12',
4716 'IL': '79.176.0.0/13',
4717 'IM': '5.62.80.0/20',
4718 'IN': '117.192.0.0/10',
4719 'IO': '203.83.48.0/21',
4720 'IQ': '37.236.0.0/14',
4721 'IR': '2.176.0.0/12',
4722 'IS': '82.221.0.0/16',
4723 'IT': '79.0.0.0/10',
4724 'JE': '87.244.64.0/18',
4725 'JM': '72.27.0.0/17',
4726 'JO': '176.29.0.0/16',
53896ca5 4727 'JP': '133.0.0.0/8',
773f291d
S
4728 'KE': '105.48.0.0/12',
4729 'KG': '158.181.128.0/17',
4730 'KH': '36.37.128.0/17',
4731 'KI': '103.25.140.0/22',
4732 'KM': '197.255.224.0/20',
53896ca5 4733 'KN': '198.167.192.0/19',
773f291d
S
4734 'KP': '175.45.176.0/22',
4735 'KR': '175.192.0.0/10',
4736 'KW': '37.36.0.0/14',
4737 'KY': '64.96.0.0/15',
4738 'KZ': '2.72.0.0/13',
4739 'LA': '115.84.64.0/18',
4740 'LB': '178.135.0.0/16',
53896ca5 4741 'LC': '24.92.144.0/20',
773f291d
S
4742 'LI': '82.117.0.0/19',
4743 'LK': '112.134.0.0/15',
53896ca5 4744 'LR': '102.183.0.0/16',
773f291d
S
4745 'LS': '129.232.0.0/17',
4746 'LT': '78.56.0.0/13',
4747 'LU': '188.42.0.0/16',
4748 'LV': '46.109.0.0/16',
4749 'LY': '41.252.0.0/14',
4750 'MA': '105.128.0.0/11',
4751 'MC': '88.209.64.0/18',
4752 'MD': '37.246.0.0/16',
4753 'ME': '178.175.0.0/17',
4754 'MF': '74.112.232.0/21',
4755 'MG': '154.126.0.0/17',
4756 'MH': '117.103.88.0/21',
4757 'MK': '77.28.0.0/15',
4758 'ML': '154.118.128.0/18',
4759 'MM': '37.111.0.0/17',
4760 'MN': '49.0.128.0/17',
4761 'MO': '60.246.0.0/16',
4762 'MP': '202.88.64.0/20',
4763 'MQ': '109.203.224.0/19',
4764 'MR': '41.188.64.0/18',
4765 'MS': '208.90.112.0/22',
4766 'MT': '46.11.0.0/16',
4767 'MU': '105.16.0.0/12',
4768 'MV': '27.114.128.0/18',
53896ca5 4769 'MW': '102.70.0.0/15',
773f291d
S
4770 'MX': '187.192.0.0/11',
4771 'MY': '175.136.0.0/13',
4772 'MZ': '197.218.0.0/15',
4773 'NA': '41.182.0.0/16',
4774 'NC': '101.101.0.0/18',
4775 'NE': '197.214.0.0/18',
4776 'NF': '203.17.240.0/22',
4777 'NG': '105.112.0.0/12',
4778 'NI': '186.76.0.0/15',
4779 'NL': '145.96.0.0/11',
4780 'NO': '84.208.0.0/13',
4781 'NP': '36.252.0.0/15',
4782 'NR': '203.98.224.0/19',
4783 'NU': '49.156.48.0/22',
4784 'NZ': '49.224.0.0/14',
4785 'OM': '5.36.0.0/15',
4786 'PA': '186.72.0.0/15',
4787 'PE': '186.160.0.0/14',
4788 'PF': '123.50.64.0/18',
4789 'PG': '124.240.192.0/19',
4790 'PH': '49.144.0.0/13',
4791 'PK': '39.32.0.0/11',
4792 'PL': '83.0.0.0/11',
4793 'PM': '70.36.0.0/20',
4794 'PR': '66.50.0.0/16',
4795 'PS': '188.161.0.0/16',
4796 'PT': '85.240.0.0/13',
4797 'PW': '202.124.224.0/20',
4798 'PY': '181.120.0.0/14',
4799 'QA': '37.210.0.0/15',
53896ca5 4800 'RE': '102.35.0.0/16',
773f291d 4801 'RO': '79.112.0.0/13',
53896ca5 4802 'RS': '93.86.0.0/15',
773f291d 4803 'RU': '5.136.0.0/13',
53896ca5 4804 'RW': '41.186.0.0/16',
773f291d
S
4805 'SA': '188.48.0.0/13',
4806 'SB': '202.1.160.0/19',
4807 'SC': '154.192.0.0/11',
53896ca5 4808 'SD': '102.120.0.0/13',
773f291d 4809 'SE': '78.64.0.0/12',
53896ca5 4810 'SG': '8.128.0.0/10',
773f291d
S
4811 'SI': '188.196.0.0/14',
4812 'SK': '78.98.0.0/15',
53896ca5 4813 'SL': '102.143.0.0/17',
773f291d
S
4814 'SM': '89.186.32.0/19',
4815 'SN': '41.82.0.0/15',
53896ca5 4816 'SO': '154.115.192.0/18',
773f291d
S
4817 'SR': '186.179.128.0/17',
4818 'SS': '105.235.208.0/21',
4819 'ST': '197.159.160.0/19',
4820 'SV': '168.243.0.0/16',
4821 'SX': '190.102.0.0/20',
4822 'SY': '5.0.0.0/16',
4823 'SZ': '41.84.224.0/19',
4824 'TC': '65.255.48.0/20',
4825 'TD': '154.68.128.0/19',
4826 'TG': '196.168.0.0/14',
4827 'TH': '171.96.0.0/13',
4828 'TJ': '85.9.128.0/18',
4829 'TK': '27.96.24.0/21',
4830 'TL': '180.189.160.0/20',
4831 'TM': '95.85.96.0/19',
4832 'TN': '197.0.0.0/11',
4833 'TO': '175.176.144.0/21',
4834 'TR': '78.160.0.0/11',
4835 'TT': '186.44.0.0/15',
4836 'TV': '202.2.96.0/19',
4837 'TW': '120.96.0.0/11',
4838 'TZ': '156.156.0.0/14',
53896ca5
S
4839 'UA': '37.52.0.0/14',
4840 'UG': '102.80.0.0/13',
4841 'US': '6.0.0.0/8',
773f291d 4842 'UY': '167.56.0.0/13',
53896ca5 4843 'UZ': '84.54.64.0/18',
773f291d 4844 'VA': '212.77.0.0/19',
53896ca5 4845 'VC': '207.191.240.0/21',
773f291d 4846 'VE': '186.88.0.0/13',
53896ca5 4847 'VG': '66.81.192.0/20',
773f291d
S
4848 'VI': '146.226.0.0/16',
4849 'VN': '14.160.0.0/11',
4850 'VU': '202.80.32.0/20',
4851 'WF': '117.20.32.0/21',
4852 'WS': '202.4.32.0/19',
4853 'YE': '134.35.0.0/16',
4854 'YT': '41.242.116.0/22',
4855 'ZA': '41.0.0.0/11',
53896ca5
S
4856 'ZM': '102.144.0.0/13',
4857 'ZW': '102.177.192.0/18',
773f291d
S
4858 }
4859
4860 @classmethod
5f95927a
S
4861 def random_ipv4(cls, code_or_block):
4862 if len(code_or_block) == 2:
4863 block = cls._country_ip_map.get(code_or_block.upper())
4864 if not block:
4865 return None
4866 else:
4867 block = code_or_block
773f291d 4868 addr, preflen = block.split('/')
ac668111 4869 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4870 addr_max = addr_min | (0xffffffff >> int(preflen))
14f25df2 4871 return str(socket.inet_ntoa(
ac668111 4872 struct.pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4873
4874
ac668111 4875class PerRequestProxyHandler(urllib.request.ProxyHandler):
2461f79d
PH
4876 def __init__(self, proxies=None):
4877 # Set default handlers
4878 for type in ('http', 'https'):
4879 setattr(self, '%s_open' % type,
4880 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4881 meth(r, proxy, type))
ac668111 4882 urllib.request.ProxyHandler.__init__(self, proxies)
2461f79d 4883
91410c9b 4884 def proxy_open(self, req, proxy, type):
2461f79d 4885 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
4886 if req_proxy is not None:
4887 proxy = req_proxy
2461f79d
PH
4888 del req.headers['Ytdl-request-proxy']
4889
4890 if proxy == '__noproxy__':
4891 return None # No Proxy
14f25df2 4892 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188 4893 req.add_header('Ytdl-socks-proxy', proxy)
7a5c1cfe 4894            # yt-dlp's http/https handlers do the actual wrapping of the socket with SOCKS
71aff188 4895 return None
ac668111 4896 return urllib.request.ProxyHandler.proxy_open(
91410c9b 4897 self, req, proxy, type)
5bc880b9
YCH
4898
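# Usage sketch (illustrative proxy addresses): the handler uses the proxies passed
# at construction time, but a single request can override them (or disable proxying
# with '__noproxy__') via the 'Ytdl-request-proxy' header:
#
#   opener = urllib.request.build_opener(PerRequestProxyHandler({'http': 'http://proxy.example:3128'}))
#   req = urllib.request.Request('http://example.com/')
#   req.add_header('Ytdl-request-proxy', 'http://127.0.0.1:8118')  # overrides the default for this request
#   opener.open(req)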
4899
0a5445dd
YCH
4900# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4901# released into Public Domain
4902# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4903
4904def long_to_bytes(n, blocksize=0):
4905 """long_to_bytes(n:long, blocksize:int) : string
4906 Convert a long integer to a byte string.
4907
4908 If optional blocksize is given and greater than zero, pad the front of the
4909 byte string with binary zeros so that the length is a multiple of
4910 blocksize.
4911 """
4912 # after much testing, this algorithm was deemed to be the fastest
4913 s = b''
4914 n = int(n)
4915 while n > 0:
ac668111 4916 s = struct.pack('>I', n & 0xffffffff) + s
0a5445dd
YCH
4917 n = n >> 32
4918 # strip off leading zeros
4919 for i in range(len(s)):
4920 if s[i] != b'\000'[0]:
4921 break
4922 else:
4923 # only happens when n == 0
4924 s = b'\000'
4925 i = 0
4926 s = s[i:]
4927 # add back some pad bytes. this could be done more efficiently w.r.t. the
4928 # de-padding being done above, but sigh...
4929 if blocksize > 0 and len(s) % blocksize:
4930 s = (blocksize - len(s) % blocksize) * b'\000' + s
4931 return s
4932
4933
4934def bytes_to_long(s):
4935 """bytes_to_long(string) : long
4936 Convert a byte string to a long integer.
4937
4938 This is (essentially) the inverse of long_to_bytes().
4939 """
4940 acc = 0
4941 length = len(s)
4942 if length % 4:
4943 extra = (4 - length % 4)
4944 s = b'\000' * extra + s
4945 length = length + extra
4946 for i in range(0, length, 4):
ac668111 4947 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
0a5445dd
YCH
4948 return acc
4949
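# Illustrative round-trip (big-endian byte order; leading zero bytes are only kept
# when `blocksize` forces padding):
#
#   >>> long_to_bytes(0xdeadbeef)
#   b'\xde\xad\xbe\xef'
#   >>> bytes_to_long(b'\xde\xad\xbe\xef') == 0xdeadbeef
#   True
#   >>> long_to_bytes(1, blocksize=4)
#   b'\x00\x00\x00\x01'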
4950
5bc880b9
YCH
4951def ohdave_rsa_encrypt(data, exponent, modulus):
4952 '''
4953 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4954
4955 Input:
4956 data: data to encrypt, bytes-like object
4957 exponent, modulus: parameter e and N of RSA algorithm, both integer
4958 Output: hex string of encrypted data
4959
4960 Limitation: supports one block encryption only
4961 '''
4962
4963 payload = int(binascii.hexlify(data[::-1]), 16)
4964 encrypted = pow(payload, exponent, modulus)
4965 return '%x' % encrypted
81bdc8fd
YCH
4966
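# Worked toy example (tiny, insecure numbers, only to show the arithmetic): the
# input bytes are interpreted as a little-endian integer and exponentiated modulo
# the modulus, so with e=3 and N=33 the single byte 0x02 encrypts to pow(2, 3, 33):
#
#   >>> ohdave_rsa_encrypt(b'\x02', 3, 33)
#   '8'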
4967
f48409c7
YCH
4968def pkcs1pad(data, length):
4969 """
4970 Padding input data with PKCS#1 scheme
4971
4972 @param {int[]} data input data
4973 @param {int} length target length
4974 @returns {int[]} padded data
4975 """
4976 if len(data) > length - 11:
4977 raise ValueError('Input data too long for PKCS#1 padding')
4978
4979 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4980 return [0, 2] + pseudo_random + [0] + data
4981
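# Illustrative layout of the result (PKCS#1 v1.5 type-2 style over lists of ints:
# a 0x00 0x02 prefix, random filler, a 0x00 separator, then the data):
#
#   >>> padded = pkcs1pad([1, 2, 3], 16)
#   >>> padded[:2], padded[-4:], len(padded)
#   ([0, 2], [0, 1, 2, 3], 16)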
4982
7b2c3f47 4983def _base_n_table(n, table):
4984 if not table and not n:
4985 raise ValueError('Either table or n must be specified')
612f2be5 4986 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4987
44f14eb4 4988 if n and n != len(table):
612f2be5 4989 raise ValueError(f'base {n} exceeds table length {len(table)}')
4990 return table
59f898b7 4991
5eb6bdce 4992
7b2c3f47 4993def encode_base_n(num, n=None, table=None):
4994 """Convert given int to a base-n string"""
612f2be5 4995 table = _base_n_table(n, table)
7b2c3f47 4996 if not num:
5eb6bdce
YCH
4997 return table[0]
4998
7b2c3f47 4999 result, base = '', len(table)
81bdc8fd 5000 while num:
7b2c3f47 5001 result = table[num % base] + result
612f2be5 5002 num = num // base
7b2c3f47 5003 return result
5004
5005
5006def decode_base_n(string, n=None, table=None):
5007 """Convert given base-n string to int"""
5008 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
5009 result, base = 0, len(table)
5010 for char in string:
5011 result = result * base + table[char]
5012 return result
5013
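# Illustrative examples: the default table is 0-9a-zA-Z truncated to the base,
# while a custom table implies the base:
#
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> decode_base_n('ff', 16)
#   255
#   >>> encode_base_n(5, table='01')
#   '101'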
5014
5015def decode_base(value, digits):
da4db748 5016 deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed '
5017 f'in a future version. Use {__name__}.decode_base_n instead')
7b2c3f47 5018 return decode_base_n(value, table=digits)
f52354a8
YCH
5019
5020
5021def decode_packed_codes(code):
06b3fe29 5022 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 5023 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
5024 base = int(base)
5025 count = int(count)
5026 symbols = symbols.split('|')
5027 symbol_table = {}
5028
5029 while count:
5030 count -= 1
5eb6bdce 5031 base_n_count = encode_base_n(count, base)
f52354a8
YCH
5032 symbol_table[base_n_count] = symbols[count] or base_n_count
5033
5034 return re.sub(
5035 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 5036 obfuscated_code)
e154c651 5037
5038
1ced2221
S
5039def caesar(s, alphabet, shift):
5040 if shift == 0:
5041 return s
5042 l = len(alphabet)
5043 return ''.join(
5044 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
5045 for c in s)
5046
5047
5048def rot47(s):
5049 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
5050
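# Illustrative examples: caesar() shifts only characters present in the given
# alphabet, and rot47() is a fixed shift of 47 over the printable ASCII range:
#
#   >>> caesar('abc', 'abcdefghijklmnopqrstuvwxyz', 3)
#   'def'
#   >>> rot47('Hello')
#   'w6==@'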
5051
e154c651 5052def parse_m3u8_attributes(attrib):
5053 info = {}
5054 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
5055 if val.startswith('"'):
5056 val = val[1:-1]
5057 info[key] = val
5058 return info
1143535d
YCH
5059
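# Illustrative example: quoted values keep embedded commas and the quotes are stripped:
#
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401f"')
#   {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401f'}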
5060
5061def urshift(val, n):
5062 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
5063
5064
5065# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 5066# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
5067def decode_png(png_data):
5068 # Reference: https://www.w3.org/TR/PNG/
5069 header = png_data[8:]
5070
5071 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
86e5f3ed 5072 raise OSError('Not a valid PNG file.')
d3f8e038
YCH
5073
5074 int_map = {1: '>B', 2: '>H', 4: '>I'}
ac668111 5075 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
d3f8e038
YCH
5076
5077 chunks = []
5078
5079 while header:
5080 length = unpack_integer(header[:4])
5081 header = header[4:]
5082
5083 chunk_type = header[:4]
5084 header = header[4:]
5085
5086 chunk_data = header[:length]
5087 header = header[length:]
5088
5089 header = header[4:] # Skip CRC
5090
5091 chunks.append({
5092 'type': chunk_type,
5093 'length': length,
5094 'data': chunk_data
5095 })
5096
5097 ihdr = chunks[0]['data']
5098
5099 width = unpack_integer(ihdr[:4])
5100 height = unpack_integer(ihdr[4:8])
5101
5102 idat = b''
5103
5104 for chunk in chunks:
5105 if chunk['type'] == b'IDAT':
5106 idat += chunk['data']
5107
5108 if not idat:
86e5f3ed 5109 raise OSError('Unable to read PNG data.')
d3f8e038
YCH
5110
5111 decompressed_data = bytearray(zlib.decompress(idat))
5112
5113 stride = width * 3
5114 pixels = []
5115
5116 def _get_pixel(idx):
5117 x = idx % stride
5118 y = idx // stride
5119 return pixels[y][x]
5120
5121 for y in range(height):
5122 basePos = y * (1 + stride)
5123 filter_type = decompressed_data[basePos]
5124
5125 current_row = []
5126
5127 pixels.append(current_row)
5128
5129 for x in range(stride):
5130 color = decompressed_data[1 + basePos + x]
5131 basex = y * stride + x
5132 left = 0
5133 up = 0
5134
5135 if x > 2:
5136 left = _get_pixel(basex - 3)
5137 if y > 0:
5138 up = _get_pixel(basex - stride)
5139
5140 if filter_type == 1: # Sub
5141 color = (color + left) & 0xff
5142 elif filter_type == 2: # Up
5143 color = (color + up) & 0xff
5144 elif filter_type == 3: # Average
5145 color = (color + ((left + up) >> 1)) & 0xff
5146 elif filter_type == 4: # Paeth
5147 a = left
5148 b = up
5149 c = 0
5150
5151 if x > 2 and y > 0:
5152 c = _get_pixel(basex - stride - 3)
5153
5154 p = a + b - c
5155
5156 pa = abs(p - a)
5157 pb = abs(p - b)
5158 pc = abs(p - c)
5159
5160 if pa <= pb and pa <= pc:
5161 color = (color + a) & 0xff
5162 elif pb <= pc:
5163 color = (color + b) & 0xff
5164 else:
5165 color = (color + c) & 0xff
5166
5167 current_row.append(color)
5168
5169 return width, height, pixels
efa97bdc
YCH
5170
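# Usage sketch (illustrative; `png_bytes` is assumed to hold the raw contents of a
# PNG file): the decoder returns raw 8-bit samples, three per pixel (stride is
# width * 3), addressable as pixels[y][x * 3 + channel]:
#
#   width, height, pixels = decode_png(png_bytes)
#   r, g, b = pixels[0][0:3]  # top-left pixel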
5171
5172def write_xattr(path, key, value):
6f7563be 5173 # Windows: Write xattrs to NTFS Alternate Data Streams:
5174 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5175 if compat_os_name == 'nt':
5176 assert ':' not in key
5177 assert os.path.exists(path)
efa97bdc
YCH
5178
5179 try:
6f7563be 5180 with open(f'{path}:{key}', 'wb') as f:
5181 f.write(value)
86e5f3ed 5182 except OSError as e:
efa97bdc 5183 raise XAttrMetadataError(e.errno, e.strerror)
6f7563be 5184 return
efa97bdc 5185
6f7563be 5186 # UNIX Method 1. Use xattrs/pyxattrs modules
efa97bdc 5187
6f7563be 5188 setxattr = None
5189 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5190 # Unicode arguments are not supported in pyxattr until version 0.5.0
5191 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5192 if version_tuple(xattr.__version__) >= (0, 5, 0):
5193 setxattr = xattr.set
5194 elif xattr:
5195 setxattr = xattr.setxattr
efa97bdc 5196
6f7563be 5197 if setxattr:
5198 try:
5199 setxattr(path, key, value)
5200 except OSError as e:
5201 raise XAttrMetadataError(e.errno, e.strerror)
5202 return
efa97bdc 5203
6f7563be 5204 # UNIX Method 2. Use setfattr/xattr executables
5205 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5206 else 'xattr' if check_executable('xattr', ['-h']) else None)
5207 if not exe:
5208 raise XAttrUnavailableError(
5209 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5210 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
efa97bdc 5211
0f06bcd7 5212 value = value.decode()
6f7563be 5213 try:
f0c9fb96 5214 _, stderr, returncode = Popen.run(
6f7563be 5215 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
e121e3ce 5216 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
6f7563be 5217 except OSError as e:
5218 raise XAttrMetadataError(e.errno, e.strerror)
f0c9fb96 5219 if returncode:
5220 raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
5221
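# Usage sketch (illustrative file name and key): keys conventionally use the
# "user." namespace on Linux, and the value must be bytes:
#
#   write_xattr('video.mp4', 'user.xdg.referrer.url', b'https://example.com/')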
5222
5223def random_birthday(year_field, month_field, day_field):
aa374bc7
AS
5224 start_date = datetime.date(1950, 1, 1)
5225 end_date = datetime.date(1995, 12, 31)
5226 offset = random.randint(0, (end_date - start_date).days)
5227 random_date = start_date + datetime.timedelta(offset)
0c265486 5228 return {
aa374bc7
AS
5229 year_field: str(random_date.year),
5230 month_field: str(random_date.month),
5231 day_field: str(random_date.day),
0c265486 5232 }
732044af 5233
c76eb41b 5234
732044af 5235# Templates for internet shortcut files, which are plain text files.
e5a998f3 5236DOT_URL_LINK_TEMPLATE = '''\
732044af 5237[InternetShortcut]
5238URL=%(url)s
e5a998f3 5239'''
732044af 5240
e5a998f3 5241DOT_WEBLOC_LINK_TEMPLATE = '''\
732044af 5242<?xml version="1.0" encoding="UTF-8"?>
5243<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5244<plist version="1.0">
5245<dict>
5246\t<key>URL</key>
5247\t<string>%(url)s</string>
5248</dict>
5249</plist>
e5a998f3 5250'''
732044af 5251
e5a998f3 5252DOT_DESKTOP_LINK_TEMPLATE = '''\
732044af 5253[Desktop Entry]
5254Encoding=UTF-8
5255Name=%(filename)s
5256Type=Link
5257URL=%(url)s
5258Icon=text-html
e5a998f3 5259'''
732044af 5260
08438d2c 5261LINK_TEMPLATES = {
5262 'url': DOT_URL_LINK_TEMPLATE,
5263 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5264 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5265}
5266
732044af 5267
5268def iri_to_uri(iri):
5269 """
5270 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5271
5272 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5273 """
5274
14f25df2 5275 iri_parts = urllib.parse.urlparse(iri)
732044af 5276
5277 if '[' in iri_parts.netloc:
 5278        raise ValueError('IPv6 URIs are not yet supported.')
5279 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5280
5281 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5282
5283 net_location = ''
5284 if iri_parts.username:
f9934b96 5285 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
732044af 5286 if iri_parts.password is not None:
f9934b96 5287 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
732044af 5288 net_location += '@'
5289
0f06bcd7 5290 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
732044af 5291 # The 'idna' encoding produces ASCII text.
5292 if iri_parts.port is not None and iri_parts.port != 80:
5293 net_location += ':' + str(iri_parts.port)
5294
f9934b96 5295 return urllib.parse.urlunparse(
732044af 5296 (iri_parts.scheme,
5297 net_location,
5298
f9934b96 5299 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5300
5301 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
f9934b96 5302 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5303
5304 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
f9934b96 5305 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
732044af 5306
f9934b96 5307 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
732044af 5308
5309 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5310
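# Illustrative example (assuming the IDNA and percent-encodings shown): Unicode in
# the host is punycoded, Unicode in the path is percent-escaped as UTF-8:
#
#   >>> iri_to_uri('https://müller.example/söße')
#   'https://xn--mller-kva.example/s%C3%B6%C3%9Fe'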
5311
5312def to_high_limit_path(path):
5313 if sys.platform in ['win32', 'cygwin']:
5314 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
e5a998f3 5315 return '\\\\?\\' + os.path.abspath(path)
732044af 5316
5317 return path
76d321f6 5318
c76eb41b 5319
7b2c3f47 5320def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
e0ddbd02 5321 val = traverse_obj(obj, *variadic(field))
7b2c3f47 5322 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
e0ddbd02 5323 return default
7b2c3f47 5324 return template % func(val)
00dd0cd5 5325
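# Illustrative examples: `template` is %-formatted with the extracted value, and
# `default` is returned when the field is missing or empty:
#
#   >>> format_field({'height': 1080}, 'height', '%sp')
#   '1080p'
#   >>> format_field({'height': None}, 'height', '%sp', default='unknown')
#   'unknown'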
5326
5327def clean_podcast_url(url):
5328 return re.sub(r'''(?x)
5329 (?:
5330 (?:
5331 chtbl\.com/track|
5332 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5333 play\.podtrac\.com
5334 )/[^/]+|
5335 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5336 flex\.acast\.com|
5337 pd(?:
5338 cn\.co| # https://podcorn.com/analytics-prefix/
5339 st\.fm # https://podsights.com/docs/
5340 )/e
5341 )/''', '', url)
ffcb8191
THD
5342
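# Illustrative example: a known analytics/redirect prefix is stripped from the URL:
#
#   >>> clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/traffic.megaphone.fm/xyz.mp3')
#   'https://traffic.megaphone.fm/xyz.mp3'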
5343
5344_HEX_TABLE = '0123456789abcdef'
5345
5346
5347def random_uuidv4():
5348 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 5349
5350
5351def make_dir(path, to_screen=None):
5352 try:
5353 dn = os.path.dirname(path)
5354 if dn and not os.path.exists(dn):
5355 os.makedirs(dn)
5356 return True
86e5f3ed 5357 except OSError as err:
0202b52a 5358        if callable(to_screen):
5359 to_screen('unable to create directory ' + error_to_compat_str(err))
5360 return False
f74980cb 5361
5362
5363def get_executable_path():
b5899f4f 5364 from .update import _get_variant_and_executable_path
c487cf00 5365
b5899f4f 5366 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
f74980cb 5367
5368
2f567473 5369def load_plugins(name, suffix, namespace):
3ae5e797 5370 classes = {}
19a03940 5371 with contextlib.suppress(FileNotFoundError):
019a94f7
ÁS
5372 plugins_spec = importlib.util.spec_from_file_location(
5373 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5374 plugins = importlib.util.module_from_spec(plugins_spec)
5375 sys.modules[plugins_spec.name] = plugins
5376 plugins_spec.loader.exec_module(plugins)
f74980cb 5377 for name in dir(plugins):
2f567473 5378 if name in namespace:
5379 continue
5380 if not name.endswith(suffix):
f74980cb 5381 continue
5382 klass = getattr(plugins, name)
3ae5e797 5383 classes[name] = namespace[name] = klass
f74980cb 5384 return classes
06167fbb 5385
5386
325ebc17 5387def traverse_obj(
f99bbfc9 5388 obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
325ebc17 5389 casesense=True, is_user_input=False, traverse_string=False):
ab029d7e
SS
5390 """
5391 Safely traverse nested `dict`s and `Sequence`s
5392
5393 >>> obj = [{}, {"key": "value"}]
5394 >>> traverse_obj(obj, (1, "key"))
5395 "value"
5396
5397 Each of the provided `paths` is tested and the first producing a valid result will be returned.
f99bbfc9 5398 The next path will also be tested if the path branched but no results could be found.
7b0127e1 5399 Supported values for traversal are `Mapping`, `Sequence` and `re.Match`.
ab029d7e
SS
5400 A value of None is treated as the absence of a value.
5401
5402 The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.
5403
5404 The keys in the path can be one of:
5405 - `None`: Return the current object.
7b0127e1 5406        - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`.
ab029d7e
SS
5407 - `slice`: Branch out and return all values in `obj[key]`.
5408 - `Ellipsis`: Branch out and return a list of all values.
5409 - `tuple`/`list`: Branch out and return a list of all matching values.
5410 Read as: `[traverse_obj(obj, branch) for branch in branches]`.
5411 - `function`: Branch out and return values filtered by the function.
5412 Read as: `[value for key, value in obj if function(key, value)]`.
5413 For `Sequence`s, `key` is the index of the value.
 5414        - `dict`: Transform the current object and return a matching dict.
5415 Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.
5416
7b0127e1 5417 `tuple`, `list`, and `dict` all support nested paths and branches.
ab029d7e
SS
5418
5419 @params paths Paths which to traverse by.
5420 @param default Value to return if the paths do not match.
5421 @param expected_type If a `type`, only accept final values of this type.
5422 If any other callable, try to call the function on each result.
5423 @param get_all If `False`, return the first matching result, otherwise all matching ones.
5424 @param casesense If `False`, consider string dictionary keys as case insensitive.
5425
5426 The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API
5427
5428 @param is_user_input Whether the keys are generated from user input.
5429 If `True` strings get converted to `int`/`slice` if needed.
5430 @param traverse_string Whether to traverse into objects as strings.
5431 If `True`, any non-compatible object will first be
5432 converted into a string and then traversed into.
5433
5434
5435 @returns The result of the object traversal.
5436 If successful, `get_all=True`, and the path branches at least once,
5437 then a list of results is returned instead.
f99bbfc9 5438 A list is always returned if the last path branches and no `default` is given.
ab029d7e
SS
5439 """
5440 is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes))
5441 casefold = lambda k: k.casefold() if isinstance(k, str) else k
325ebc17 5442
352d63fd 5443 if isinstance(expected_type, type):
5444 type_test = lambda val: val if isinstance(val, expected_type) else None
352d63fd 5445 else:
ab029d7e
SS
5446 type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,))
5447
5448 def apply_key(key, obj):
5449 if obj is None:
5450 return
5451
5452 elif key is None:
5453 yield obj
5454
5455 elif isinstance(key, (list, tuple)):
5456 for branch in key:
5457 _, result = apply_path(obj, branch)
5458 yield from result
5459
5460 elif key is ...:
5461 if isinstance(obj, collections.abc.Mapping):
5462 yield from obj.values()
5463 elif is_sequence(obj):
5464 yield from obj
7b0127e1
SS
5465 elif isinstance(obj, re.Match):
5466 yield from obj.groups()
ab029d7e
SS
5467 elif traverse_string:
5468 yield from str(obj)
5469
5470 elif callable(key):
5471 if is_sequence(obj):
5472 iter_obj = enumerate(obj)
5473 elif isinstance(obj, collections.abc.Mapping):
5474 iter_obj = obj.items()
7b0127e1
SS
5475 elif isinstance(obj, re.Match):
5476 iter_obj = enumerate((obj.group(), *obj.groups()))
ab029d7e
SS
5477 elif traverse_string:
5478 iter_obj = enumerate(str(obj))
352d63fd 5479 else:
ab029d7e
SS
5480 return
5481 yield from (v for k, v in iter_obj if try_call(key, args=(k, v)))
5482
5483 elif isinstance(key, dict):
5484 iter_obj = ((k, _traverse_obj(obj, v)) for k, v in key.items())
5485 yield {k: v if v is not None else default for k, v in iter_obj
f99bbfc9 5486 if v is not None or default is not NO_DEFAULT}
ab029d7e 5487
7b0127e1 5488 elif isinstance(obj, collections.abc.Mapping):
ab029d7e
SS
5489 yield (obj.get(key) if casesense or (key in obj)
5490 else next((v for k, v in obj.items() if casefold(k) == key), None))
5491
7b0127e1
SS
5492 elif isinstance(obj, re.Match):
5493 if isinstance(key, int) or casesense:
5494 with contextlib.suppress(IndexError):
5495 yield obj.group(key)
5496 return
5497
5498 if not isinstance(key, str):
5499 return
5500
5501 yield next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)
5502
ab029d7e
SS
5503 else:
5504 if is_user_input:
5505 key = (int_or_none(key) if ':' not in key
5506 else slice(*map(int_or_none, key.split(':'))))
5507
5508 if not isinstance(key, (int, slice)):
5509 return
5510
5511 if not is_sequence(obj):
5512 if not traverse_string:
5513 return
5514 obj = str(obj)
5515
5516 with contextlib.suppress(IndexError):
5517 yield obj[key]
5518
5519 def apply_path(start_obj, path):
5520 objs = (start_obj,)
5521 has_branched = False
5522
5523 for key in variadic(path):
5524 if is_user_input and key == ':':
5525 key = ...
5526
5527 if not casesense and isinstance(key, str):
5528 key = key.casefold()
5529
5530 if key is ... or isinstance(key, (list, tuple)) or callable(key):
5531 has_branched = True
5532
5533 key_func = functools.partial(apply_key, key)
5534 objs = itertools.chain.from_iterable(map(key_func, objs))
5535
5536 return has_branched, objs
5537
f99bbfc9 5538 def _traverse_obj(obj, path, use_list=True):
ab029d7e
SS
5539 has_branched, results = apply_path(obj, path)
5540 results = LazyList(x for x in map(type_test, results) if x is not None)
ab029d7e 5541
f99bbfc9
SS
5542 if get_all and has_branched:
5543 return results.exhaust() if results or use_list else None
5544
5545 return results[0] if results else None
5546
5547 for index, path in enumerate(paths, 1):
5548 use_list = default is NO_DEFAULT and index == len(paths)
5549 result = _traverse_obj(obj, path, use_list)
ab029d7e
SS
5550 if result is not None:
5551 return result
5552
f99bbfc9 5553 return None if default is NO_DEFAULT else default
324ad820 5554
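# A few more illustrative traversals, in addition to the docstring example:
#
#   >>> traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', ..., 'b'))  # branching with Ellipsis
#   [1, 2]
#   >>> traverse_obj({'a': {'B': 3}}, ('a', 'b'), casesense=False)
#   3
#   >>> traverse_obj({}, 'missing', default=0)
#   0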
5555
5556def traverse_dict(dictn, keys, casesense=True):
da4db748 5557 deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed '
5558 f'in a future version. Use "{__name__}.traverse_obj" instead')
ee8dd27a 5559 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5560
5561
ff91cf74 5562def get_first(obj, keys, **kwargs):
5563 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5564
5565
3e9b66d7
LNO
5566def time_seconds(**kwargs):
5567 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5568 return t.timestamp()
5569
5570
49fa4d9a
N
5571# create a JSON Web Signature (jws) with HS256 algorithm
5572# the resulting format is in JWS Compact Serialization
5573# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5574# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5575def jwt_encode_hs256(payload_data, key, headers={}):
5576 header_data = {
5577 'alg': 'HS256',
5578 'typ': 'JWT',
5579 }
5580 if headers:
5581 header_data.update(headers)
0f06bcd7 5582 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5583 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5584 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
49fa4d9a
N
5585 signature_b64 = base64.b64encode(h.digest())
5586 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5587 return token
819e0531 5588
5589
16b0d7e6 5590# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5591def jwt_decode_hs256(jwt):
5592 header_b64, payload_b64, signature_b64 = jwt.split('.')
2c98d998 5593    # add trailing ='s that may have been stripped; superfluous ='s are ignored
5594 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
16b0d7e6 5595 return payload_data
5596
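# Usage sketch (illustrative secret and claims): jwt_encode_hs256() returns the
# token as bytes, while jwt_decode_hs256() expects a str and returns only the
# payload without verifying the signature:
#
#   token = jwt_encode_hs256({'user': 'test'}, 'secret', headers={'kid': 'k1'})
#   jwt_decode_hs256(token.decode())  # -> {'user': 'test'}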
5597
53973b4d 5598WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5599
5600
7a32c70d 5601@functools.cache
819e0531 5602def supports_terminal_sequences(stream):
5603 if compat_os_name == 'nt':
8a82af35 5604 if not WINDOWS_VT_MODE:
819e0531 5605 return False
5606 elif not os.getenv('TERM'):
5607 return False
5608 try:
5609 return stream.isatty()
5610 except BaseException:
5611 return False
5612
5613
c53a18f0 5614def windows_enable_vt_mode():
5615 """Ref: https://bugs.python.org/issue30075 """
8a82af35 5616 if get_windows_version() < (10, 0, 10586):
53973b4d 5617 return
53973b4d 5618
c53a18f0 5619 import ctypes
5620 import ctypes.wintypes
5621 import msvcrt
5622
5623 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
5624
5625 dll = ctypes.WinDLL('kernel32', use_last_error=False)
5626 handle = os.open('CONOUT$', os.O_RDWR)
5627
5628 try:
5629 h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
5630 dw_original_mode = ctypes.wintypes.DWORD()
5631 success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
5632 if not success:
5633 raise Exception('GetConsoleMode failed')
5634
5635 success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
5636 dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
5637 if not success:
5638 raise Exception('SetConsoleMode failed')
5639 except Exception as e:
5640 write_string(f'WARNING: Cannot enable VT mode - {e}')
5641 else:
5642 global WINDOWS_VT_MODE
5643 WINDOWS_VT_MODE = True
5644 supports_terminal_sequences.cache_clear()
5645 finally:
5646 os.close(handle)
53973b4d 5647
5648
ec11a9f4 5649_terminal_sequences_re = re.compile('\033\\[[^m]+m')
5650
5651
5652def remove_terminal_sequences(string):
5653 return _terminal_sequences_re.sub('', string)
5654
5655
5656def number_of_digits(number):
5657 return len('%d' % number)
34921b43 5658
5659
5660def join_nonempty(*values, delim='-', from_dict=None):
5661 if from_dict is not None:
7b2c3f47 5662 values = (traverse_obj(from_dict, variadic(v)) for v in values)
34921b43 5663 return delim.join(map(str, filter(None, values)))
06e57990 5664
5665
27231526
ZM
5666def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5667 """
5668 Find the largest format dimensions in terms of video width and, for each thumbnail:
 5669    * Modify the URL: Match the width with the provided regex and replace it with the largest format width
5670 * Update dimensions
5671
5672 This function is useful with video services that scale the provided thumbnails on demand
5673 """
5674 _keys = ('width', 'height')
5675 max_dimensions = max(
86e5f3ed 5676 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
27231526
ZM
5677 default=(0, 0))
5678 if not max_dimensions[0]:
5679 return thumbnails
5680 return [
5681 merge_dicts(
5682 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5683 dict(zip(_keys, max_dimensions)), thumbnail)
5684 for thumbnail in thumbnails
5685 ]
5686
5687
93c8410d
LNO
5688def parse_http_range(range):
5689 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5690 if not range:
5691 return None, None, None
5692 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5693 if not crg:
5694 return None, None, None
5695 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5696
5697
6b9e832d 5698def read_stdin(what):
5699 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5700 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5701 return sys.stdin
5702
5703
a904a7f8
L
5704def determine_file_encoding(data):
5705 """
88f60feb 5706 Detect the text encoding used
a904a7f8
L
5707 @returns (encoding, bytes to skip)
5708 """
5709
88f60feb 5710 # BOM marks are given priority over declarations
a904a7f8 5711 for bom, enc in BOMS:
a904a7f8
L
5712 if data.startswith(bom):
5713 return enc, len(bom)
5714
88f60feb 5715 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5716 # We ignore the endianness to get a good enough match
a904a7f8 5717 data = data.replace(b'\0', b'')
88f60feb 5718 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5719 return mobj.group(1).decode() if mobj else None, 0
a904a7f8
L
5720
5721
06e57990 5722class Config:
5723 own_args = None
9e491463 5724 parsed_args = None
06e57990 5725 filename = None
5726 __initialized = False
5727
5728 def __init__(self, parser, label=None):
9e491463 5729 self.parser, self.label = parser, label
06e57990 5730 self._loaded_paths, self.configs = set(), []
5731
5732 def init(self, args=None, filename=None):
5733 assert not self.__initialized
284a60c5 5734 self.own_args, self.filename = args, filename
5735 return self.load_configs()
5736
5737 def load_configs(self):
65662dff 5738 directory = ''
284a60c5 5739 if self.filename:
5740 location = os.path.realpath(self.filename)
65662dff 5741 directory = os.path.dirname(location)
06e57990 5742 if location in self._loaded_paths:
5743 return False
5744 self._loaded_paths.add(location)
5745
284a60c5 5746 self.__initialized = True
5747 opts, _ = self.parser.parse_known_args(self.own_args)
5748 self.parsed_args = self.own_args
9e491463 5749 for location in opts.config_locations or []:
6b9e832d 5750 if location == '-':
1060f82f 5751 if location in self._loaded_paths:
5752 continue
5753 self._loaded_paths.add(location)
6b9e832d 5754 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5755 continue
65662dff 5756 location = os.path.join(directory, expand_path(location))
06e57990 5757 if os.path.isdir(location):
5758 location = os.path.join(location, 'yt-dlp.conf')
5759 if not os.path.exists(location):
9e491463 5760 self.parser.error(f'config location {location} does not exist')
06e57990 5761 self.append_config(self.read_file(location), location)
5762 return True
5763
5764 def __str__(self):
5765 label = join_nonempty(
5766 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5767 delim=' ')
5768 return join_nonempty(
5769 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5770 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5771 delim='\n')
5772
7a32c70d 5773 @staticmethod
06e57990 5774 def read_file(filename, default=[]):
5775 try:
a904a7f8 5776 optionf = open(filename, 'rb')
86e5f3ed 5777 except OSError:
06e57990 5778 return default # silently skip if file is not present
a904a7f8
L
5779 try:
5780 enc, skip = determine_file_encoding(optionf.read(512))
5781 optionf.seek(skip, io.SEEK_SET)
5782 except OSError:
5783 enc = None # silently skip read errors
06e57990 5784 try:
5785 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
a904a7f8 5786 contents = optionf.read().decode(enc or preferredencoding())
f9934b96 5787 res = shlex.split(contents, comments=True)
44a6fcff 5788 except Exception as err:
5789 raise ValueError(f'Unable to parse "{filename}": {err}')
06e57990 5790 finally:
5791 optionf.close()
5792 return res
5793
7a32c70d 5794 @staticmethod
06e57990 5795 def hide_login_info(opts):
86e5f3ed 5796 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
06e57990 5797 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5798
5799 def _scrub_eq(o):
5800 m = eqre.match(o)
5801 if m:
5802 return m.group('key') + '=PRIVATE'
5803 else:
5804 return o
5805
5806 opts = list(map(_scrub_eq, opts))
5807 for idx, opt in enumerate(opts):
5808 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5809 opts[idx + 1] = 'PRIVATE'
5810 return opts
5811
5812 def append_config(self, *args, label=None):
9e491463 5813 config = type(self)(self.parser, label)
06e57990 5814 config._loaded_paths = self._loaded_paths
5815 if config.init(*args):
5816 self.configs.append(config)
5817
7a32c70d 5818 @property
06e57990 5819 def all_args(self):
5820 for config in reversed(self.configs):
5821 yield from config.all_args
9e491463 5822 yield from self.parsed_args or []
5823
5824 def parse_known_args(self, **kwargs):
5825 return self.parser.parse_known_args(self.all_args, **kwargs)
06e57990 5826
5827 def parse_args(self):
9e491463 5828 return self.parser.parse_args(self.all_args)
da42679b
LNO
5829
5830
d5d1df8a 5831class WebSocketsWrapper:
da42679b 5832 """Wraps websockets module to use in non-async scopes"""
abfecb7b 5833 pool = None
da42679b 5834
3cea3edd 5835 def __init__(self, url, headers=None, connect=True):
059bc4db 5836 self.loop = asyncio.new_event_loop()
9cd08050 5837 # XXX: "loop" is deprecated
5838 self.conn = websockets.connect(
5839 url, extra_headers=headers, ping_interval=None,
5840 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
3cea3edd
LNO
5841 if connect:
5842 self.__enter__()
15dfb392 5843 atexit.register(self.__exit__, None, None, None)
da42679b
LNO
5844
5845 def __enter__(self):
3cea3edd 5846 if not self.pool:
9cd08050 5847 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
da42679b
LNO
5848 return self
5849
5850 def send(self, *args):
5851 self.run_with_loop(self.pool.send(*args), self.loop)
5852
5853 def recv(self, *args):
5854 return self.run_with_loop(self.pool.recv(*args), self.loop)
5855
5856 def __exit__(self, type, value, traceback):
5857 try:
5858 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5859 finally:
5860 self.loop.close()
15dfb392 5861 self._cancel_all_tasks(self.loop)
da42679b
LNO
5862
5863 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
 5864    # for contributors: if any new library that uses asyncio needs to run in a non-async scope, move these functions out of this class
7a32c70d 5865 @staticmethod
da42679b 5866 def run_with_loop(main, loop):
059bc4db 5867 if not asyncio.iscoroutine(main):
da42679b
LNO
5868 raise ValueError(f'a coroutine was expected, got {main!r}')
5869
5870 try:
5871 return loop.run_until_complete(main)
5872 finally:
5873 loop.run_until_complete(loop.shutdown_asyncgens())
5874 if hasattr(loop, 'shutdown_default_executor'):
5875 loop.run_until_complete(loop.shutdown_default_executor())
5876
7a32c70d 5877 @staticmethod
da42679b 5878 def _cancel_all_tasks(loop):
059bc4db 5879 to_cancel = asyncio.all_tasks(loop)
da42679b
LNO
5880
5881 if not to_cancel:
5882 return
5883
5884 for task in to_cancel:
5885 task.cancel()
5886
9cd08050 5887 # XXX: "loop" is removed in python 3.10+
da42679b 5888 loop.run_until_complete(
059bc4db 5889 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
da42679b
LNO
5890
5891 for task in to_cancel:
5892 if task.cancelled():
5893 continue
5894 if task.exception() is not None:
5895 loop.call_exception_handler({
5896 'message': 'unhandled exception during asyncio.run() shutdown',
5897 'exception': task.exception(),
5898 'task': task,
5899 })
5900
5901
8b7539d2 5902def merge_headers(*dicts):
08d30158 5903 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
76aa9913 5904 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
28787f16 5905
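# Illustrative example: keys are normalized with str.title() and later dicts win:
#
#   >>> merge_headers({'user-agent': 'A', 'Accept': '*/*'}, {'User-Agent': 'B'})
#   {'User-Agent': 'B', 'Accept': '*/*'}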
5906
b1f94422 5907def cached_method(f):
5908 """Cache a method"""
5909 signature = inspect.signature(f)
5910
7a32c70d 5911 @functools.wraps(f)
b1f94422 5912 def wrapper(self, *args, **kwargs):
5913 bound_args = signature.bind(self, *args, **kwargs)
5914 bound_args.apply_defaults()
d5d1df8a 5915 key = tuple(bound_args.arguments.values())[1:]
b1f94422 5916
6368e2e6 5917 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
b1f94422 5918 if key not in cache:
5919 cache[key] = f(self, *args, **kwargs)
5920 return cache[key]
5921 return wrapper
5922
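# Usage sketch (illustrative class and method names): results are cached per
# instance and per distinct argument tuple:
#
#   class Fetcher:
#       @cached_method
#       def fetch(self, url):
#           ...  # expensive work, executed only once for each distinct `url`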
5923
28787f16 5924class classproperty:
83cc7b8a 5925 """property access for class methods with optional caching"""
5926 def __new__(cls, func=None, *args, **kwargs):
5927 if not func:
5928 return functools.partial(cls, *args, **kwargs)
5929 return super().__new__(cls)
c487cf00 5930
83cc7b8a 5931 def __init__(self, func, *, cache=False):
c487cf00 5932 functools.update_wrapper(self, func)
5933 self.func = func
83cc7b8a 5934 self._cache = {} if cache else None
28787f16 5935
5936 def __get__(self, _, cls):
83cc7b8a 5937 if self._cache is None:
5938 return self.func(cls)
5939 elif cls not in self._cache:
5940 self._cache[cls] = self.func(cls)
5941 return self._cache[cls]
19a03940 5942
5943
64fa820c 5944class Namespace(types.SimpleNamespace):
591bb9d3 5945 """Immutable namespace"""
591bb9d3 5946
7896214c 5947 def __iter__(self):
64fa820c 5948 return iter(self.__dict__.values())
7896214c 5949
7a32c70d 5950 @property
64fa820c 5951 def items_(self):
5952 return self.__dict__.items()
9b8ee23b 5953
5954
8dc59305 5955MEDIA_EXTENSIONS = Namespace(
5956 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5957 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5958 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5959 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
5960 thumbnails=('jpg', 'png', 'webp'),
5961 storyboards=('mhtml', ),
5962 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5963 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5964)
5965MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5966MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5967
5968KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5969
5970
be5c1ae8 5971class RetryManager:
5972 """Usage:
5973 for retry in RetryManager(...):
5974 try:
5975 ...
5976 except SomeException as err:
5977 retry.error = err
5978 continue
5979 """
5980 attempt, _error = 0, None
5981
5982 def __init__(self, _retries, _error_callback, **kwargs):
5983 self.retries = _retries or 0
5984 self.error_callback = functools.partial(_error_callback, **kwargs)
5985
5986 def _should_retry(self):
5987 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5988
7a32c70d 5989 @property
be5c1ae8 5990 def error(self):
5991 if self._error is NO_DEFAULT:
5992 return None
5993 return self._error
5994
7a32c70d 5995 @error.setter
be5c1ae8 5996 def error(self, value):
5997 self._error = value
5998
5999 def __iter__(self):
6000 while self._should_retry():
6001 self.error = NO_DEFAULT
6002 self.attempt += 1
6003 yield self
6004 if self.error:
6005 self.error_callback(self.error, self.attempt, self.retries)
6006
7a32c70d 6007 @staticmethod
be5c1ae8 6008 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
6009 """Utility function for reporting retries"""
6010 if count > retries:
6011 if error:
6012 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
6013 raise e
6014
6015 if not count:
6016 return warn(e)
6017 elif isinstance(e, ExtractorError):
3ce29336 6018 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
be5c1ae8 6019 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
6020
6021 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
6022 if delay:
6023 info(f'Sleeping {delay:.2f} seconds ...')
6024 time.sleep(delay)
6025
6026
0647d925 6027def make_archive_id(ie, video_id):
6028 ie_key = ie if isinstance(ie, str) else ie.ie_key()
6029 return f'{ie_key.lower()} {video_id}'
6030
6031
a1c5bd82 6032def truncate_string(s, left, right=0):
6033 assert left > 3 and right >= 0
6034 if s is None or len(s) <= left + right:
6035 return s
71df9b7f 6036 return f'{s[:left-3]}...{s[-right:] if right else ""}'
a1c5bd82 6037
6038
5314b521 6039def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
6040 assert 'all' in alias_dict, '"all" alias is required'
6041 requested = list(start or [])
6042 for val in options:
6043 discard = val.startswith('-')
6044 if discard:
6045 val = val[1:]
6046
6047 if val in alias_dict:
6048 val = alias_dict[val] if not discard else [
6049 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
6050 # NB: Do not allow regex in aliases for performance
6051 requested = orderedSet_from_options(val, alias_dict, start=requested)
6052 continue
6053
6054 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
6055 else [val] if val in alias_dict['all'] else None)
6056 if current is None:
6057 raise ValueError(val)
6058
6059 if discard:
6060 for item in current:
6061 while item in requested:
6062 requested.remove(item)
6063 else:
6064 requested.extend(current)
6065
6066 return orderedSet(requested)
6067
6068
d0d74b71 6069class FormatSorter:
6070 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
6071
6072 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
6073 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
6074 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
6075 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
6076 'height', 'width', 'proto', 'vext', 'abr', 'aext',
6077 'fps', 'fs_approx', 'source', 'id')
6078
6079 settings = {
6080 'vcodec': {'type': 'ordered', 'regex': True,
6081 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
6082 'acodec': {'type': 'ordered', 'regex': True,
71082216 6083 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
d0d74b71 6084 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
6085 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
6086 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
6087 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
6088 'vext': {'type': 'ordered', 'field': 'video_ext',
29ca4082 6089 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
6090 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
d0d74b71 6091 'aext': {'type': 'ordered', 'field': 'audio_ext',
6092 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
6093 'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')},
6094 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
6095 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
6096 'field': ('vcodec', 'acodec'),
6097 'function': lambda it: int(any(v != 'none' for v in it))},
6098 'ie_pref': {'priority': True, 'type': 'extractor'},
6099 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
6100 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
6101 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
6102 'quality': {'convert': 'float', 'default': -1},
6103 'filesize': {'convert': 'bytes'},
6104 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
6105 'id': {'convert': 'string', 'field': 'format_id'},
6106 'height': {'convert': 'float_none'},
6107 'width': {'convert': 'float_none'},
6108 'fps': {'convert': 'float_none'},
6109 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
6110 'tbr': {'convert': 'float_none'},
6111 'vbr': {'convert': 'float_none'},
6112 'abr': {'convert': 'float_none'},
6113 'asr': {'convert': 'float_none'},
6114 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
6115
6116 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
6117 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
6118 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
6119 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
6120 'res': {'type': 'multiple', 'field': ('height', 'width'),
6121 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
6122
6123 # Actual field names
6124 'format_id': {'type': 'alias', 'field': 'id'},
6125 'preference': {'type': 'alias', 'field': 'ie_pref'},
6126 'language_preference': {'type': 'alias', 'field': 'lang'},
6127 'source_preference': {'type': 'alias', 'field': 'source'},
6128 'protocol': {'type': 'alias', 'field': 'proto'},
6129 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
6130 'audio_channels': {'type': 'alias', 'field': 'channels'},
6131
6132 # Deprecated
6133 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
6134 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
6135 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
6136 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
6137 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
6138 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
6139 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
6140 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
6141 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
6142 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
6143 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
6144 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
6145 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
6146 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
6147 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6148 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6149 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6150 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6151 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6152 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6153 }
6154
6155 def __init__(self, ydl, field_preference):
6156 self.ydl = ydl
6157 self._order = []
6158 self.evaluate_params(self.ydl.params, field_preference)
6159 if ydl.params.get('verbose'):
6160 self.print_verbose_info(self.ydl.write_debug)
6161
6162 def _get_field_setting(self, field, key):
6163 if field not in self.settings:
6164 if key in ('forced', 'priority'):
6165 return False
6166 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
6167 'deprecated and may be removed in a future version')
6168 self.settings[field] = {}
6169 propObj = self.settings[field]
6170 if key not in propObj:
6171 type = propObj.get('type')
6172 if key == 'field':
6173 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
6174 elif key == 'convert':
6175 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
6176 else:
6177 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
6178 propObj[key] = default
6179 return propObj[key]
6180
6181 def _resolve_field_value(self, field, value, convertNone=False):
6182 if value is None:
6183 if not convertNone:
6184 return None
6185 else:
6186 value = value.lower()
6187 conversion = self._get_field_setting(field, 'convert')
6188 if conversion == 'ignore':
6189 return None
6190 if conversion == 'string':
6191 return value
6192 elif conversion == 'float_none':
6193 return float_or_none(value)
6194 elif conversion == 'bytes':
6195 return parse_bytes(value)
6196 elif conversion == 'order':
6197 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
6198 use_regex = self._get_field_setting(field, 'regex')
6199 list_length = len(order_list)
6200 empty_pos = order_list.index('') if '' in order_list else list_length + 1
6201 if use_regex and value is not None:
6202 for i, regex in enumerate(order_list):
6203 if regex and re.match(regex, value):
6204 return list_length - i
6205 return list_length - empty_pos # not in list
6206 else: # not regex or value = None
6207 return list_length - (order_list.index(value) if value in order_list else empty_pos)
6208 else:
6209 if value.isnumeric():
6210 return float(value)
6211 else:
6212 self.settings[field]['convert'] = 'string'
6213 return value
6214
6215 def evaluate_params(self, params, sort_extractor):
6216 self._use_free_order = params.get('prefer_free_formats', False)
6217 self._sort_user = params.get('format_sort', [])
6218 self._sort_extractor = sort_extractor
6219
6220 def add_item(field, reverse, closest, limit_text):
6221 field = field.lower()
6222 if field in self._order:
6223 return
6224 self._order.append(field)
6225 limit = self._resolve_field_value(field, limit_text)
6226 data = {
6227 'reverse': reverse,
6228 'closest': False if limit is None else closest,
6229 'limit_text': limit_text,
6230 'limit': limit}
6231 if field in self.settings:
6232 self.settings[field].update(data)
6233 else:
6234 self.settings[field] = data
6235
6236 sort_list = (
6237 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
6238 + (tuple() if params.get('format_sort_force', False)
6239 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
6240 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
6241
6242 for item in sort_list:
6243 match = re.match(self.regex, item)
6244 if match is None:
6245 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
6246 field = match.group('field')
6247 if field is None:
6248 continue
6249 if self._get_field_setting(field, 'type') == 'alias':
6250 alias, field = field, self._get_field_setting(field, 'field')
6251 if self._get_field_setting(alias, 'deprecated'):
6252 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
6253 f'be removed in a future version. Please use {field} instead')
6254 reverse = match.group('reverse') is not None
6255 closest = match.group('separator') == '~'
6256 limit_text = match.group('limit')
6257
6258 has_limit = limit_text is not None
6259 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
6260 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
6261
6262 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
6263 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
6264 limit_count = len(limits)
6265 for (i, f) in enumerate(fields):
6266 add_item(f, reverse, closest,
6267 limits[i] if i < limit_count
6268 else limits[0] if has_limit and not has_multiple_limits
6269 else None)
6270
6271 def print_verbose_info(self, write_debug):
6272 if self._sort_user:
6273 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
6274 if self._sort_extractor:
6275 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
6276 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
6277 '+' if self._get_field_setting(field, 'reverse') else '', field,
6278 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
6279 self._get_field_setting(field, 'limit_text'),
6280 self._get_field_setting(field, 'limit'))
6281 if self._get_field_setting(field, 'limit_text') is not None else '')
6282 for field in self._order if self._get_field_setting(field, 'visible')]))
6283
6284 def _calculate_field_preference_from_value(self, format, field, type, value):
6285 reverse = self._get_field_setting(field, 'reverse')
6286 closest = self._get_field_setting(field, 'closest')
6287 limit = self._get_field_setting(field, 'limit')
6288
6289 if type == 'extractor':
6290 maximum = self._get_field_setting(field, 'max')
6291 if value is None or (maximum is not None and value >= maximum):
6292 value = -1
6293 elif type == 'boolean':
6294 in_list = self._get_field_setting(field, 'in_list')
6295 not_in_list = self._get_field_setting(field, 'not_in_list')
6296 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
6297 elif type == 'ordered':
6298 value = self._resolve_field_value(field, value, True)
6299
6300 # try to convert to number
6301 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
6302 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
6303 if is_num:
6304 value = val_num
6305
6306 return ((-10, 0) if value is None
6307 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
6308 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
6309 else (0, value, 0) if not reverse and (limit is None or value <= limit)
6310 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
6311 else (-1, value, 0))
6312
6313 def _calculate_field_preference(self, format, field):
6314 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
6315 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
6316 if type == 'multiple':
6317 type = 'field' # Only 'field' is allowed in multiple for now
6318 actual_fields = self._get_field_setting(field, 'field')
6319
6320 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
6321 else:
6322 value = get_value(field)
6323 return self._calculate_field_preference_from_value(format, field, type, value)
6324
6325 def calculate_preference(self, format):
6326 # Determine missing protocol
6327 if not format.get('protocol'):
6328 format['protocol'] = determine_protocol(format)
6329
6330 # Determine missing ext
6331 if not format.get('ext') and 'url' in format:
6332 format['ext'] = determine_ext(format['url'])
6333 if format.get('vcodec') == 'none':
6334 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
6335 format['video_ext'] = 'none'
6336 else:
6337 format['video_ext'] = format['ext']
6338 format['audio_ext'] = 'none'
6339 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
6340 # format['preference'] = -1000
6341
5424dbaf
L
6342 if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
 6343            # HEVC-over-FLV is out-of-spec per FLV's original spec
6344 # ref. https://trac.ffmpeg.org/ticket/6389
6345 # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
6346 format['preference'] = -100
6347
d0d74b71 6348 # Determine missing bitrates
6349 if format.get('tbr') is None:
6350 if format.get('vbr') is not None and format.get('abr') is not None:
6351 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
6352 else:
6353 if format.get('vcodec') != 'none' and format.get('vbr') is None:
6354 format['vbr'] = format.get('tbr') - format.get('abr', 0)
6355 if format.get('acodec') != 'none' and format.get('abr') is None:
6356 format['abr'] = format.get('tbr') - format.get('vbr', 0)
6357
6358 return tuple(self._calculate_field_preference(format, field) for field in self._order)
6359
6360
9b8ee23b 6361# Deprecated
6362has_certifi = bool(certifi)
6363has_websockets = bool(websockets)