]> jfr.im git - yt-dlp.git/blame - yt_dlp/utils.py
Add `ac4` to known codecs
[yt-dlp.git] / yt_dlp / utils.py
CommitLineData
6929b41a 1import asyncio
15dfb392 2import atexit
1e399778 3import base64
5bc880b9 4import binascii
912b38b4 5import calendar
676eb3f2 6import codecs
c380cc28 7import collections
ab029d7e 8import collections.abc
62e609ab 9import contextlib
c496ca96 10import datetime
0c265486 11import email.header
f8271158 12import email.utils
f45c185f 13import errno
d77c3dfd 14import gzip
49fa4d9a
N
15import hashlib
16import hmac
ac668111 17import html.entities
18import html.parser
54007a45 19import http.client
20import http.cookiejar
019a94f7 21import importlib.util
b1f94422 22import inspect
03f9daab 23import io
79a2e94e 24import itertools
f4bfd65f 25import json
d77c3dfd 26import locale
02dbf93f 27import math
f8271158 28import mimetypes
347de493 29import operator
d77c3dfd 30import os
c496ca96 31import platform
773f291d 32import random
d77c3dfd 33import re
f8271158 34import shlex
c496ca96 35import socket
79a2e94e 36import ssl
ac668111 37import struct
1c088fa8 38import subprocess
d77c3dfd 39import sys
181c8655 40import tempfile
c380cc28 41import time
01951dda 42import traceback
64fa820c 43import types
989a01c2 44import unicodedata
14f25df2 45import urllib.error
f8271158 46import urllib.parse
ac668111 47import urllib.request
bcf89ce6 48import xml.etree.ElementTree
d77c3dfd 49import zlib
d77c3dfd 50
6929b41a 51from .compat import functools # isort: split
8c25f81b 52from .compat import (
36e6f62c 53 compat_etree_fromstring,
51098426 54 compat_expanduser,
f8271158 55 compat_HTMLParseError,
efa97bdc 56 compat_os_name,
702ccf2d 57 compat_shlex_quote,
8c25f81b 58)
ac668111 59from .dependencies import brotli, certifi, websockets, xattr
f8271158 60from .socks import ProxyType, sockssocket
71aff188 61
4644ac55 62
51fb4995
YCH
63def register_socks_protocols():
64 # "Register" SOCKS protocols
d5ae6bb5
YCH
65 # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
66 # URLs with protocols not in urlparse.uses_netloc are not handled correctly
51fb4995 67 for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
14f25df2 68 if scheme not in urllib.parse.uses_netloc:
69 urllib.parse.uses_netloc.append(scheme)
51fb4995
YCH
70
71
468e2e92
FV
72# This is not clearly defined otherwise
73compiled_regex_type = type(re.compile(''))
74
f7a147e3
S
75
76def random_user_agent():
77 _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
78 _CHROME_VERSIONS = (
19b4c74d 79 '90.0.4430.212',
80 '90.0.4430.24',
81 '90.0.4430.70',
82 '90.0.4430.72',
83 '90.0.4430.85',
84 '90.0.4430.93',
85 '91.0.4472.101',
86 '91.0.4472.106',
87 '91.0.4472.114',
88 '91.0.4472.124',
89 '91.0.4472.164',
90 '91.0.4472.19',
91 '91.0.4472.77',
92 '92.0.4515.107',
93 '92.0.4515.115',
94 '92.0.4515.131',
95 '92.0.4515.159',
96 '92.0.4515.43',
97 '93.0.4556.0',
98 '93.0.4577.15',
99 '93.0.4577.63',
100 '93.0.4577.82',
101 '94.0.4606.41',
102 '94.0.4606.54',
103 '94.0.4606.61',
104 '94.0.4606.71',
105 '94.0.4606.81',
106 '94.0.4606.85',
107 '95.0.4638.17',
108 '95.0.4638.50',
109 '95.0.4638.54',
110 '95.0.4638.69',
111 '95.0.4638.74',
112 '96.0.4664.18',
113 '96.0.4664.45',
114 '96.0.4664.55',
115 '96.0.4664.93',
116 '97.0.4692.20',
f7a147e3
S
117 )
118 return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
119
120
4390d5ec 121SUPPORTED_ENCODINGS = [
122 'gzip', 'deflate'
123]
9b8ee23b 124if brotli:
4390d5ec 125 SUPPORTED_ENCODINGS.append('br')
126
3e669f36 127std_headers = {
f7a147e3 128 'User-Agent': random_user_agent(),
59ae15a5 129 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
59ae15a5 130 'Accept-Language': 'en-us,en;q=0.5',
b1156c1e 131 'Sec-Fetch-Mode': 'navigate',
3e669f36 132}
f427df17 133
5f6a1245 134
fb37eb25
S
135USER_AGENTS = {
136 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
137}
138
139
bf42a990 140NO_DEFAULT = object()
7b2c3f47 141IDENTITY = lambda x: x
bf42a990 142
7105440c
YCH
143ENGLISH_MONTH_NAMES = [
144 'January', 'February', 'March', 'April', 'May', 'June',
145 'July', 'August', 'September', 'October', 'November', 'December']
146
f6717dec
S
147MONTH_NAMES = {
148 'en': ENGLISH_MONTH_NAMES,
149 'fr': [
3e4185c3
S
150 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
151 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
78545664 152 # these follow the genitive grammatical case (dopełniacz)
153 # some websites might be using nominative, which will require another month list
154 # https://en.wikibooks.org/wiki/Polish/Noun_cases
155 'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
156 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
f6717dec 157}
a942d6cb 158
8f53dc44 159# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
160TIMEZONE_NAMES = {
161 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
162 'AST': -4, 'ADT': -3, # Atlantic (used in Canada)
163 'EST': -5, 'EDT': -4, # Eastern
164 'CST': -6, 'CDT': -5, # Central
165 'MST': -7, 'MDT': -6, # Mountain
166 'PST': -8, 'PDT': -7 # Pacific
167}
168
c587cbb7 169# needed for sanitizing filenames in restricted mode
c8827027 170ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
fd35d8cd
JW
171 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
172 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 173
46f59e89
S
174DATE_FORMATS = (
175 '%d %B %Y',
176 '%d %b %Y',
177 '%B %d %Y',
cb655f34
S
178 '%B %dst %Y',
179 '%B %dnd %Y',
9d30c213 180 '%B %drd %Y',
cb655f34 181 '%B %dth %Y',
46f59e89 182 '%b %d %Y',
cb655f34
S
183 '%b %dst %Y',
184 '%b %dnd %Y',
9d30c213 185 '%b %drd %Y',
cb655f34 186 '%b %dth %Y',
46f59e89
S
187 '%b %dst %Y %I:%M',
188 '%b %dnd %Y %I:%M',
9d30c213 189 '%b %drd %Y %I:%M',
46f59e89
S
190 '%b %dth %Y %I:%M',
191 '%Y %m %d',
192 '%Y-%m-%d',
bccdbd22 193 '%Y.%m.%d.',
46f59e89 194 '%Y/%m/%d',
81c13222 195 '%Y/%m/%d %H:%M',
46f59e89 196 '%Y/%m/%d %H:%M:%S',
1931a55e
THD
197 '%Y%m%d%H%M',
198 '%Y%m%d%H%M%S',
4f3fa23e 199 '%Y%m%d',
0c1c6f4b 200 '%Y-%m-%d %H:%M',
46f59e89
S
201 '%Y-%m-%d %H:%M:%S',
202 '%Y-%m-%d %H:%M:%S.%f',
5014558a 203 '%Y-%m-%d %H:%M:%S:%f',
46f59e89
S
204 '%d.%m.%Y %H:%M',
205 '%d.%m.%Y %H.%M',
206 '%Y-%m-%dT%H:%M:%SZ',
207 '%Y-%m-%dT%H:%M:%S.%fZ',
208 '%Y-%m-%dT%H:%M:%S.%f0Z',
209 '%Y-%m-%dT%H:%M:%S',
210 '%Y-%m-%dT%H:%M:%S.%f',
211 '%Y-%m-%dT%H:%M',
c6eed6b8
S
212 '%b %d %Y at %H:%M',
213 '%b %d %Y at %H:%M:%S',
b555ae9b
S
214 '%B %d %Y at %H:%M',
215 '%B %d %Y at %H:%M:%S',
a63d9bd0 216 '%H:%M %d-%b-%Y',
46f59e89
S
217)
218
219DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
220DATE_FORMATS_DAY_FIRST.extend([
221 '%d-%m-%Y',
222 '%d.%m.%Y',
223 '%d.%m.%y',
224 '%d/%m/%Y',
225 '%d/%m/%y',
226 '%d/%m/%Y %H:%M:%S',
47304e07 227 '%d-%m-%Y %H:%M',
46f59e89
S
228])
229
230DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
231DATE_FORMATS_MONTH_FIRST.extend([
232 '%m-%d-%Y',
233 '%m.%d.%Y',
234 '%m/%d/%Y',
235 '%m/%d/%y',
236 '%m/%d/%Y %H:%M:%S',
237])
238
06b3fe29 239PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
0f60ba6e 240JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
06b3fe29 241
1d485a1a 242NUMBER_RE = r'\d+(?:\.\d+)?'
243
7105440c 244
0b9c08b4 245@functools.cache
d77c3dfd 246def preferredencoding():
59ae15a5 247 """Get preferred encoding.
d77c3dfd 248
59ae15a5
PH
249 Returns the best encoding scheme for the system, based on
250 locale.getpreferredencoding() and some further tweaks.
251 """
252 try:
253 pref = locale.getpreferredencoding()
28e614de 254 'TEST'.encode(pref)
70a1165b 255 except Exception:
59ae15a5 256 pref = 'UTF-8'
bae611f2 257
59ae15a5 258 return pref
d77c3dfd 259
f4bfd65f 260
181c8655 261def write_json_file(obj, fn):
1394646a 262 """ Encode obj as JSON and write it to fn, atomically if possible """
181c8655 263
cfb0511d 264 tf = tempfile.NamedTemporaryFile(
265 prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
266 suffix='.tmp', delete=False, mode='w', encoding='utf-8')
181c8655
PH
267
268 try:
269 with tf:
45d86abe 270 json.dump(obj, tf, ensure_ascii=False)
1394646a
IK
271 if sys.platform == 'win32':
272 # Need to remove existing file on Windows, else os.rename raises
273 # WindowsError or FileExistsError.
19a03940 274 with contextlib.suppress(OSError):
1394646a 275 os.unlink(fn)
19a03940 276 with contextlib.suppress(OSError):
9cd5f54e
R
277 mask = os.umask(0)
278 os.umask(mask)
279 os.chmod(tf.name, 0o666 & ~mask)
181c8655 280 os.rename(tf.name, fn)
70a1165b 281 except Exception:
19a03940 282 with contextlib.suppress(OSError):
181c8655 283 os.remove(tf.name)
181c8655
PH
284 raise
285
286
cfb0511d 287def find_xpath_attr(node, xpath, key, val=None):
288 """ Find the xpath xpath[@key=val] """
289 assert re.match(r'^[a-zA-Z_-]+$', key)
86e5f3ed 290 expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
cfb0511d 291 return node.find(expr)
59ae56fa 292
d7e66d39
JMF
293# On python2.6 the xml.etree.ElementTree.Element methods don't support
294# the namespace parameter
5f6a1245
JW
295
296
d7e66d39
JMF
297def xpath_with_ns(path, ns_map):
298 components = [c.split(':') for c in path.split('/')]
299 replaced = []
300 for c in components:
301 if len(c) == 1:
302 replaced.append(c[0])
303 else:
304 ns, tag = c
305 replaced.append('{%s}%s' % (ns_map[ns], tag))
306 return '/'.join(replaced)
307
d77c3dfd 308
a41fb80c 309def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
578c0745 310 def _find_xpath(xpath):
f9934b96 311 return node.find(xpath)
578c0745 312
14f25df2 313 if isinstance(xpath, str):
578c0745
S
314 n = _find_xpath(xpath)
315 else:
316 for xp in xpath:
317 n = _find_xpath(xp)
318 if n is not None:
319 break
d74bebd5 320
8e636da4 321 if n is None:
bf42a990
S
322 if default is not NO_DEFAULT:
323 return default
324 elif fatal:
bf0ff932
PH
325 name = xpath if name is None else name
326 raise ExtractorError('Could not find XML element %s' % name)
327 else:
328 return None
a41fb80c
S
329 return n
330
331
332def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
8e636da4
S
333 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
334 if n is None or n == default:
335 return n
336 if n.text is None:
337 if default is not NO_DEFAULT:
338 return default
339 elif fatal:
340 name = xpath if name is None else name
341 raise ExtractorError('Could not find XML element\'s text %s' % name)
342 else:
343 return None
344 return n.text
a41fb80c
S
345
346
347def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
348 n = find_xpath_attr(node, xpath, key)
349 if n is None:
350 if default is not NO_DEFAULT:
351 return default
352 elif fatal:
86e5f3ed 353 name = f'{xpath}[@{key}]' if name is None else name
a41fb80c
S
354 raise ExtractorError('Could not find XML attribute %s' % name)
355 else:
356 return None
357 return n.attrib[key]
bf0ff932
PH
358
359
c487cf00 360def get_element_by_id(id, html, **kwargs):
43e8fafd 361 """Return the content of the tag with the specified ID in the passed HTML document"""
c487cf00 362 return get_element_by_attribute('id', id, html, **kwargs)
43e8fafd 363
12ea2f30 364
c487cf00 365def get_element_html_by_id(id, html, **kwargs):
6f32a0b5 366 """Return the html of the tag with the specified ID in the passed HTML document"""
c487cf00 367 return get_element_html_by_attribute('id', id, html, **kwargs)
6f32a0b5
ZM
368
369
84c237fb 370def get_element_by_class(class_name, html):
2af12ad9
TC
371 """Return the content of the first tag with the specified class in the passed HTML document"""
372 retval = get_elements_by_class(class_name, html)
373 return retval[0] if retval else None
374
375
6f32a0b5
ZM
376def get_element_html_by_class(class_name, html):
377 """Return the html of the first tag with the specified class in the passed HTML document"""
378 retval = get_elements_html_by_class(class_name, html)
379 return retval[0] if retval else None
380
381
c487cf00 382def get_element_by_attribute(attribute, value, html, **kwargs):
383 retval = get_elements_by_attribute(attribute, value, html, **kwargs)
2af12ad9
TC
384 return retval[0] if retval else None
385
386
c487cf00 387def get_element_html_by_attribute(attribute, value, html, **kargs):
388 retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
6f32a0b5
ZM
389 return retval[0] if retval else None
390
391
c487cf00 392def get_elements_by_class(class_name, html, **kargs):
2af12ad9
TC
393 """Return the content of all tags with the specified class in the passed HTML document as a list"""
394 return get_elements_by_attribute(
64fa820c 395 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
84c237fb
YCH
396 html, escape_value=False)
397
398
6f32a0b5
ZM
399def get_elements_html_by_class(class_name, html):
400 """Return the html of all tags with the specified class in the passed HTML document as a list"""
401 return get_elements_html_by_attribute(
64fa820c 402 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
6f32a0b5
ZM
403 html, escape_value=False)
404
405
406def get_elements_by_attribute(*args, **kwargs):
43e8fafd 407 """Return the content of the tag with the specified attribute in the passed HTML document"""
6f32a0b5
ZM
408 return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
409
410
411def get_elements_html_by_attribute(*args, **kwargs):
412 """Return the html of the tag with the specified attribute in the passed HTML document"""
413 return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
414
415
4c9a1a3b 416def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
6f32a0b5
ZM
417 """
418 Return the text (content) and the html (whole) of the tag with the specified
419 attribute in the passed HTML document
420 """
c61473c1
M
421 if not value:
422 return
9e6dd238 423
86e5f3ed 424 quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
0254f162 425
84c237fb
YCH
426 value = re.escape(value) if escape_value else value
427
86e5f3ed 428 partial_element_re = rf'''(?x)
4c9a1a3b 429 <(?P<tag>{tag})
0254f162 430 (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
86e5f3ed 431 \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
432 '''
38285056 433
0254f162
ZM
434 for m in re.finditer(partial_element_re, html):
435 content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
a921f407 436
0254f162
ZM
437 yield (
438 unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
439 whole
440 )
a921f407 441
c5229f39 442
ac668111 443class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
6f32a0b5
ZM
444 """
445 HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
446 closing tag for the first opening tag it has encountered, and can be used
447 as a context manager
448 """
449
450 class HTMLBreakOnClosingTagException(Exception):
451 pass
452
453 def __init__(self):
454 self.tagstack = collections.deque()
ac668111 455 html.parser.HTMLParser.__init__(self)
6f32a0b5
ZM
456
457 def __enter__(self):
458 return self
459
460 def __exit__(self, *_):
461 self.close()
462
463 def close(self):
464 # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
465 # so data remains buffered; we no longer have any interest in it, thus
466 # override this method to discard it
467 pass
468
469 def handle_starttag(self, tag, _):
470 self.tagstack.append(tag)
471
472 def handle_endtag(self, tag):
473 if not self.tagstack:
474 raise compat_HTMLParseError('no tags in the stack')
475 while self.tagstack:
476 inner_tag = self.tagstack.pop()
477 if inner_tag == tag:
478 break
479 else:
480 raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
481 if not self.tagstack:
482 raise self.HTMLBreakOnClosingTagException()
483
484
46d09f87 485# XXX: This should be far less strict
6f32a0b5
ZM
486def get_element_text_and_html_by_tag(tag, html):
487 """
488 For the first element with the specified tag in the passed HTML document
489 return its' content (text) and the whole element (html)
490 """
491 def find_or_raise(haystack, needle, exc):
492 try:
493 return haystack.index(needle)
494 except ValueError:
495 raise exc
496 closing_tag = f'</{tag}>'
497 whole_start = find_or_raise(
498 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
499 content_start = find_or_raise(
500 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
501 content_start += whole_start + 1
502 with HTMLBreakOnClosingTagParser() as parser:
503 parser.feed(html[whole_start:content_start])
504 if not parser.tagstack or parser.tagstack[0] != tag:
505 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
506 offset = content_start
507 while offset < len(html):
508 next_closing_tag_start = find_or_raise(
509 html[offset:], closing_tag,
510 compat_HTMLParseError(f'closing {tag} tag not found'))
511 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
512 try:
513 parser.feed(html[offset:offset + next_closing_tag_end])
514 offset += next_closing_tag_end
515 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
516 return html[content_start:offset + next_closing_tag_start], \
517 html[whole_start:offset + next_closing_tag_end]
518 raise compat_HTMLParseError('unexpected end of html')
519
520
ac668111 521class HTMLAttributeParser(html.parser.HTMLParser):
8bb56eee 522 """Trivial HTML parser to gather the attributes for a single element"""
b6e0c7d2 523
8bb56eee 524 def __init__(self):
c5229f39 525 self.attrs = {}
ac668111 526 html.parser.HTMLParser.__init__(self)
8bb56eee
BF
527
528 def handle_starttag(self, tag, attrs):
529 self.attrs = dict(attrs)
7053aa3a 530 raise compat_HTMLParseError('done')
8bb56eee 531
c5229f39 532
ac668111 533class HTMLListAttrsParser(html.parser.HTMLParser):
73673ccf
FF
534 """HTML parser to gather the attributes for the elements of a list"""
535
536 def __init__(self):
ac668111 537 html.parser.HTMLParser.__init__(self)
73673ccf
FF
538 self.items = []
539 self._level = 0
540
541 def handle_starttag(self, tag, attrs):
542 if tag == 'li' and self._level == 0:
543 self.items.append(dict(attrs))
544 self._level += 1
545
546 def handle_endtag(self, tag):
547 self._level -= 1
548
549
8bb56eee
BF
550def extract_attributes(html_element):
551 """Given a string for an HTML element such as
552 <el
553 a="foo" B="bar" c="&98;az" d=boz
554 empty= noval entity="&amp;"
555 sq='"' dq="'"
556 >
557 Decode and return a dictionary of attributes.
558 {
559 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
560 'empty': '', 'noval': None, 'entity': '&',
561 'sq': '"', 'dq': '\''
562 }.
8bb56eee
BF
563 """
564 parser = HTMLAttributeParser()
19a03940 565 with contextlib.suppress(compat_HTMLParseError):
b4a3d461
S
566 parser.feed(html_element)
567 parser.close()
8bb56eee 568 return parser.attrs
9e6dd238 569
c5229f39 570
73673ccf
FF
571def parse_list(webpage):
572 """Given a string for an series of HTML <li> elements,
573 return a dictionary of their attributes"""
574 parser = HTMLListAttrsParser()
575 parser.feed(webpage)
576 parser.close()
577 return parser.items
578
579
9e6dd238 580def clean_html(html):
59ae15a5 581 """Clean an HTML snippet into a readable string"""
dd622d7c
PH
582
583 if html is None: # Convenience for sanitizing descriptions etc.
584 return html
585
49185227 586 html = re.sub(r'\s+', ' ', html)
587 html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
588 html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
59ae15a5
PH
589 # Strip html tags
590 html = re.sub('<.*?>', '', html)
591 # Replace html entities
592 html = unescapeHTML(html)
7decf895 593 return html.strip()
9e6dd238
FV
594
595
b7c47b74 596class LenientJSONDecoder(json.JSONDecoder):
597 def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
598 self.transform_source, self.ignore_extra = transform_source, ignore_extra
599 super().__init__(*args, **kwargs)
600
601 def decode(self, s):
602 if self.transform_source:
603 s = self.transform_source(s)
2fa669f7 604 try:
605 if self.ignore_extra:
606 return self.raw_decode(s.lstrip())[0]
607 return super().decode(s)
608 except json.JSONDecodeError as e:
609 if e.pos is not None:
610 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
611 raise
b7c47b74 612
613
d77c3dfd 614def sanitize_open(filename, open_mode):
59ae15a5
PH
615 """Try to open the given filename, and slightly tweak it if this fails.
616
617 Attempts to open the given filename. If this fails, it tries to change
618 the filename slightly, step by step, until it's either able to open it
619 or it fails and raises a final exception, like the standard open()
620 function.
621
622 It returns the tuple (stream, definitive_file_name).
623 """
0edb3e33 624 if filename == '-':
625 if sys.platform == 'win32':
626 import msvcrt
be5c1ae8 627
62b58c09 628 # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
daef7911 629 with contextlib.suppress(io.UnsupportedOperation):
630 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
0edb3e33 631 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
59ae15a5 632
0edb3e33 633 for attempt in range(2):
634 try:
635 try:
89737671 636 if sys.platform == 'win32':
b506289f 637 # FIXME: An exclusive lock also locks the file from being read.
638 # Since windows locks are mandatory, don't lock the file on windows (for now).
639 # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
89737671 640 raise LockingUnsupportedError()
0edb3e33 641 stream = locked_file(filename, open_mode, block=False).__enter__()
8a82af35 642 except OSError:
0edb3e33 643 stream = open(filename, open_mode)
8a82af35 644 return stream, filename
86e5f3ed 645 except OSError as err:
0edb3e33 646 if attempt or err.errno in (errno.EACCES,):
647 raise
648 old_filename, filename = filename, sanitize_path(filename)
649 if old_filename == filename:
650 raise
d77c3dfd
FV
651
652
653def timeconvert(timestr):
59ae15a5
PH
654 """Convert RFC 2822 defined time string into system timestamp"""
655 timestamp = None
656 timetuple = email.utils.parsedate_tz(timestr)
657 if timetuple is not None:
658 timestamp = email.utils.mktime_tz(timetuple)
659 return timestamp
1c469a94 660
5f6a1245 661
5c3895ff 662def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
59ae15a5 663 """Sanitizes a string so it could be used as part of a filename.
5c3895ff 664 @param restricted Use a stricter subset of allowed characters
665 @param is_id Whether this is an ID that should be kept unchanged if possible.
666 If unset, yt-dlp's new sanitization rules are in effect
59ae15a5 667 """
5c3895ff 668 if s == '':
669 return ''
670
59ae15a5 671 def replace_insane(char):
c587cbb7
AT
672 if restricted and char in ACCENT_CHARS:
673 return ACCENT_CHARS[char]
91dd88b9 674 elif not restricted and char == '\n':
5c3895ff 675 return '\0 '
989a01c2 676 elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
677 # Replace with their full-width unicode counterparts
678 return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
91dd88b9 679 elif char == '?' or ord(char) < 32 or ord(char) == 127:
59ae15a5
PH
680 return ''
681 elif char == '"':
682 return '' if restricted else '\''
683 elif char == ':':
5c3895ff 684 return '\0_\0-' if restricted else '\0 \0-'
59ae15a5 685 elif char in '\\/|*<>':
5c3895ff 686 return '\0_'
687 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
688 return '\0_'
59ae15a5
PH
689 return char
690
db4678e4 691 # Replace look-alike Unicode glyphs
692 if restricted and (is_id is NO_DEFAULT or not is_id):
989a01c2 693 s = unicodedata.normalize('NFKC', s)
5c3895ff 694 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
28e614de 695 result = ''.join(map(replace_insane, s))
5c3895ff 696 if is_id is NO_DEFAULT:
ae61d108 697 result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result) # Remove repeated substitute chars
698 STRIP_RE = r'(?:\0.|[ _-])*'
5c3895ff 699 result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
700 result = result.replace('\0', '') or '_'
701
796173d0
PH
702 if not is_id:
703 while '__' in result:
704 result = result.replace('__', '_')
705 result = result.strip('_')
706 # Common case of "Foreign band name - English song title"
707 if restricted and result.startswith('-_'):
708 result = result[2:]
5a42414b
PH
709 if result.startswith('-'):
710 result = '_' + result[len('-'):]
a7440261 711 result = result.lstrip('.')
796173d0
PH
712 if not result:
713 result = '_'
59ae15a5 714 return result
d77c3dfd 715
5f6a1245 716
c2934512 717def sanitize_path(s, force=False):
a2aaf4db 718 """Sanitizes and normalizes path on Windows"""
c2934512 719 if sys.platform == 'win32':
c4218ac3 720 force = False
c2934512 721 drive_or_unc, _ = os.path.splitdrive(s)
c2934512 722 elif force:
723 drive_or_unc = ''
724 else:
a2aaf4db 725 return s
c2934512 726
be531ef1
S
727 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
728 if drive_or_unc:
a2aaf4db
S
729 norm_path.pop(0)
730 sanitized_path = [
ec85ded8 731 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
a2aaf4db 732 for path_part in norm_path]
be531ef1
S
733 if drive_or_unc:
734 sanitized_path.insert(0, drive_or_unc + os.path.sep)
4abea8ca 735 elif force and s and s[0] == os.path.sep:
c4218ac3 736 sanitized_path.insert(0, os.path.sep)
a2aaf4db
S
737 return os.path.join(*sanitized_path)
738
739
8f97a15d 740def sanitize_url(url, *, scheme='http'):
befa4708
S
741 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
742 # the number of unwanted failures due to missing protocol
21633673 743 if url is None:
744 return
745 elif url.startswith('//'):
8f97a15d 746 return f'{scheme}:{url}'
befa4708
S
747 # Fix some common typos seen so far
748 COMMON_TYPOS = (
067aa17e 749 # https://github.com/ytdl-org/youtube-dl/issues/15649
befa4708
S
750 (r'^httpss://', r'https://'),
751 # https://bx1.be/lives/direct-tv/
752 (r'^rmtp([es]?)://', r'rtmp\1://'),
753 )
754 for mistake, fixup in COMMON_TYPOS:
755 if re.match(mistake, url):
756 return re.sub(mistake, fixup, url)
bc6b9bcd 757 return url
17bcc626
S
758
759
5435dcf9 760def extract_basic_auth(url):
14f25df2 761 parts = urllib.parse.urlsplit(url)
5435dcf9
HH
762 if parts.username is None:
763 return url, None
14f25df2 764 url = urllib.parse.urlunsplit(parts._replace(netloc=(
5435dcf9
HH
765 parts.hostname if parts.port is None
766 else '%s:%d' % (parts.hostname, parts.port))))
767 auth_payload = base64.b64encode(
0f06bcd7 768 ('%s:%s' % (parts.username, parts.password or '')).encode())
769 return url, f'Basic {auth_payload.decode()}'
5435dcf9
HH
770
771
67dda517 772def sanitized_Request(url, *args, **kwargs):
bc6b9bcd 773 url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
5435dcf9
HH
774 if auth_header is not None:
775 headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
776 headers['Authorization'] = auth_header
ac668111 777 return urllib.request.Request(url, *args, **kwargs)
67dda517
S
778
779
51098426 780def expand_path(s):
2fa669f7 781 """Expand shell variables and ~"""
51098426
S
782 return os.path.expandvars(compat_expanduser(s))
783
784
7e9a6125 785def orderedSet(iterable, *, lazy=False):
786 """Remove all duplicates from the input iterable"""
787 def _iter():
788 seen = [] # Do not use set since the items can be unhashable
789 for x in iterable:
790 if x not in seen:
791 seen.append(x)
792 yield x
793
794 return _iter() if lazy else list(_iter())
d77c3dfd 795
912b38b4 796
55b2f099 797def _htmlentity_transform(entity_with_semicolon):
4e408e47 798 """Transforms an HTML entity to a character."""
55b2f099
YCH
799 entity = entity_with_semicolon[:-1]
800
4e408e47 801 # Known non-numeric HTML entity
ac668111 802 if entity in html.entities.name2codepoint:
803 return chr(html.entities.name2codepoint[entity])
4e408e47 804
62b58c09
L
805 # TODO: HTML5 allows entities without a semicolon.
806 # E.g. '&Eacuteric' should be decoded as 'Éric'.
ac668111 807 if entity_with_semicolon in html.entities.html5:
808 return html.entities.html5[entity_with_semicolon]
55b2f099 809
91757b0f 810 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
4e408e47
PH
811 if mobj is not None:
812 numstr = mobj.group(1)
28e614de 813 if numstr.startswith('x'):
4e408e47 814 base = 16
28e614de 815 numstr = '0%s' % numstr
4e408e47
PH
816 else:
817 base = 10
067aa17e 818 # See https://github.com/ytdl-org/youtube-dl/issues/7518
19a03940 819 with contextlib.suppress(ValueError):
ac668111 820 return chr(int(numstr, base))
4e408e47
PH
821
822 # Unknown entity in name, return its literal representation
7a3f0c00 823 return '&%s;' % entity
4e408e47
PH
824
825
d77c3dfd 826def unescapeHTML(s):
912b38b4
PH
827 if s is None:
828 return None
19a03940 829 assert isinstance(s, str)
d77c3dfd 830
4e408e47 831 return re.sub(
95f3f7c2 832 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 833
8bf48f23 834
cdb19aa4 835def escapeHTML(text):
836 return (
837 text
838 .replace('&', '&amp;')
839 .replace('<', '&lt;')
840 .replace('>', '&gt;')
841 .replace('"', '&quot;')
842 .replace("'", '&#39;')
843 )
844
845
f5b1bca9 846def process_communicate_or_kill(p, *args, **kwargs):
da4db748 847 deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
848 f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
8a82af35 849 return Popen.communicate_or_kill(p, *args, **kwargs)
f5b1bca9 850
851
d3c93ec2 852class Popen(subprocess.Popen):
853 if sys.platform == 'win32':
854 _startupinfo = subprocess.STARTUPINFO()
855 _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
856 else:
857 _startupinfo = None
858
82ea226c
L
859 @staticmethod
860 def _fix_pyinstaller_ld_path(env):
861 """Restore LD_LIBRARY_PATH when using PyInstaller
862 Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
863 https://github.com/yt-dlp/yt-dlp/issues/4573
864 """
865 if not hasattr(sys, '_MEIPASS'):
866 return
867
868 def _fix(key):
869 orig = env.get(f'{key}_ORIG')
870 if orig is None:
871 env.pop(key, None)
872 else:
873 env[key] = orig
874
875 _fix('LD_LIBRARY_PATH') # Linux
876 _fix('DYLD_LIBRARY_PATH') # macOS
877
878 def __init__(self, *args, env=None, text=False, **kwargs):
879 if env is None:
880 env = os.environ.copy()
881 self._fix_pyinstaller_ld_path(env)
882
f0c9fb96 883 if text is True:
884 kwargs['universal_newlines'] = True # For 3.6 compatibility
885 kwargs.setdefault('encoding', 'utf-8')
886 kwargs.setdefault('errors', 'replace')
82ea226c 887 super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
d3c93ec2 888
889 def communicate_or_kill(self, *args, **kwargs):
8a82af35 890 try:
891 return self.communicate(*args, **kwargs)
892 except BaseException: # Including KeyboardInterrupt
f0c9fb96 893 self.kill(timeout=None)
8a82af35 894 raise
d3c93ec2 895
f0c9fb96 896 def kill(self, *, timeout=0):
897 super().kill()
898 if timeout != 0:
899 self.wait(timeout=timeout)
900
901 @classmethod
992dc6b4 902 def run(cls, *args, timeout=None, **kwargs):
f0c9fb96 903 with cls(*args, **kwargs) as proc:
914491b8 904 default = '' if proc.text_mode else b''
992dc6b4 905 stdout, stderr = proc.communicate_or_kill(timeout=timeout)
914491b8 906 return stdout or default, stderr or default, proc.returncode
f0c9fb96 907
d3c93ec2 908
aa49acd1
S
909def get_subprocess_encoding():
910 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
911 # For subprocess calls, encode with locale encoding
912 # Refer to http://stackoverflow.com/a/9951851/35070
913 encoding = preferredencoding()
914 else:
915 encoding = sys.getfilesystemencoding()
916 if encoding is None:
917 encoding = 'utf-8'
918 return encoding
919
920
8bf48f23 921def encodeFilename(s, for_subprocess=False):
19a03940 922 assert isinstance(s, str)
cfb0511d 923 return s
aa49acd1
S
924
925
926def decodeFilename(b, for_subprocess=False):
cfb0511d 927 return b
8bf48f23 928
f07b74fc
PH
929
930def encodeArgument(s):
cfb0511d 931 # Legacy code that uses byte strings
932 # Uncomment the following line after fixing all post processors
14f25df2 933 # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
cfb0511d 934 return s if isinstance(s, str) else s.decode('ascii')
f07b74fc
PH
935
936
aa49acd1 937def decodeArgument(b):
cfb0511d 938 return b
aa49acd1
S
939
940
8271226a
PH
941def decodeOption(optval):
942 if optval is None:
943 return optval
944 if isinstance(optval, bytes):
945 optval = optval.decode(preferredencoding())
946
14f25df2 947 assert isinstance(optval, str)
8271226a 948 return optval
1c256f70 949
5f6a1245 950
aa7785f8 951_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
952
953
954def timetuple_from_msec(msec):
955 secs, msec = divmod(msec, 1000)
956 mins, secs = divmod(secs, 60)
957 hrs, mins = divmod(mins, 60)
958 return _timetuple(hrs, mins, secs, msec)
959
960
cdb19aa4 961def formatSeconds(secs, delim=':', msec=False):
aa7785f8 962 time = timetuple_from_msec(secs * 1000)
963 if time.hours:
964 ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
965 elif time.minutes:
966 ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
4539dd30 967 else:
aa7785f8 968 ret = '%d' % time.seconds
969 return '%s.%03d' % (ret, time.milliseconds) if msec else ret
4539dd30 970
a0ddb8a2 971
77562778 972def _ssl_load_windows_store_certs(ssl_context, storename):
973 # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
974 try:
975 certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
976 if encoding == 'x509_asn' and (
977 trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
978 except PermissionError:
979 return
980 for cert in certs:
19a03940 981 with contextlib.suppress(ssl.SSLError):
77562778 982 ssl_context.load_verify_locations(cadata=cert)
a2366922 983
77562778 984
985def make_HTTPS_handler(params, **kwargs):
986 opts_check_certificate = not params.get('nocheckcertificate')
987 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
988 context.check_hostname = opts_check_certificate
f81c62a6 989 if params.get('legacyserverconnect'):
990 context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT
4f28b537 991 # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
992 context.set_ciphers('DEFAULT')
ac8e69dd
M
993 elif (
994 sys.version_info < (3, 10)
995 and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
996 and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
997 ):
5b9f253f
M
998 # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
999 # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
1000 # in some situations [2][3].
1001 # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
1002 # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
ac8e69dd 1003 # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
5b9f253f
M
1004 # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
1005 # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
1006 # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
1007 # 4. https://peps.python.org/pep-0644/
ac8e69dd
M
1008 # 5. https://peps.python.org/pep-0644/#libressl-support
1009 # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
5b9f253f
M
1010 context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
1011 context.minimum_version = ssl.TLSVersion.TLSv1_2
8a82af35 1012
77562778 1013 context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
1014 if opts_check_certificate:
d5820461 1015 if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
1016 context.load_verify_locations(cafile=certifi.where())
168bbc4f 1017 else:
1018 try:
1019 context.load_default_certs()
1020 # Work around the issue in load_default_certs when there are bad certificates. See:
1021 # https://github.com/yt-dlp/yt-dlp/issues/1060,
1022 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
1023 except ssl.SSLError:
1024 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
1025 if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
1026 for storename in ('CA', 'ROOT'):
1027 _ssl_load_windows_store_certs(context, storename)
1028 context.set_default_verify_paths()
8a82af35 1029
bb58c9ed 1030 client_certfile = params.get('client_certificate')
1031 if client_certfile:
1032 try:
1033 context.load_cert_chain(
1034 client_certfile, keyfile=params.get('client_certificate_key'),
1035 password=params.get('client_certificate_password'))
1036 except ssl.SSLError:
1037 raise YoutubeDLError('Unable to load client certificate')
2c6dcb65 1038
1039 # Some servers may reject requests if ALPN extension is not sent. See:
1040 # https://github.com/python/cpython/issues/85140
1041 # https://github.com/yt-dlp/yt-dlp/issues/3878
1042 with contextlib.suppress(NotImplementedError):
1043 context.set_alpn_protocols(['http/1.1'])
1044
77562778 1045 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 1046
732ea2f0 1047
5873d4cc 1048def bug_reports_message(before=';'):
57e0f077 1049 from .update import REPOSITORY
1050
1051 msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
1052 'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')
5873d4cc
F
1053
1054 before = before.rstrip()
1055 if not before or before.endswith(('.', '!', '?')):
1056 msg = msg[0].title() + msg[1:]
1057
1058 return (before + ' ' if before else '') + msg
08f2a92c
JMF
1059
1060
bf5b9d85
PM
1061class YoutubeDLError(Exception):
1062 """Base exception for YoutubeDL errors."""
aa9369a2 1063 msg = None
1064
1065 def __init__(self, msg=None):
1066 if msg is not None:
1067 self.msg = msg
1068 elif self.msg is None:
1069 self.msg = type(self).__name__
1070 super().__init__(self.msg)
bf5b9d85
PM
1071
1072
ac668111 1073network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
3158150c 1074if hasattr(ssl, 'CertificateError'):
1075 network_exceptions.append(ssl.CertificateError)
1076network_exceptions = tuple(network_exceptions)
1077
1078
bf5b9d85 1079class ExtractorError(YoutubeDLError):
1c256f70 1080 """Error during info extraction."""
5f6a1245 1081
1151c407 1082 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
9a82b238 1083 """ tb, if given, is the original traceback (so that it can be printed out).
7a5c1cfe 1084 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
9a82b238 1085 """
3158150c 1086 if sys.exc_info()[0] in network_exceptions:
9a82b238 1087 expected = True
d5979c5d 1088
7265a219 1089 self.orig_msg = str(msg)
1c256f70 1090 self.traceback = tb
1151c407 1091 self.expected = expected
2eabb802 1092 self.cause = cause
d11271dd 1093 self.video_id = video_id
1151c407 1094 self.ie = ie
1095 self.exc_info = sys.exc_info() # preserve original exception
5df14442 1096 if isinstance(self.exc_info[1], ExtractorError):
1097 self.exc_info = self.exc_info[1].exc_info
9bcfe33b 1098 super().__init__(self.__msg)
1151c407 1099
9bcfe33b 1100 @property
1101 def __msg(self):
1102 return ''.join((
1103 format_field(self.ie, None, '[%s] '),
1104 format_field(self.video_id, None, '%s: '),
1105 self.orig_msg,
1106 format_field(self.cause, None, ' (caused by %r)'),
1107 '' if self.expected else bug_reports_message()))
1c256f70 1108
01951dda 1109 def format_traceback(self):
497d2fab 1110 return join_nonempty(
1111 self.traceback and ''.join(traceback.format_tb(self.traceback)),
e491d06d 1112 self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
497d2fab 1113 delim='\n') or None
01951dda 1114
9bcfe33b 1115 def __setattr__(self, name, value):
1116 super().__setattr__(name, value)
1117 if getattr(self, 'msg', None) and name not in ('msg', 'args'):
1118 self.msg = self.__msg or type(self).__name__
1119 self.args = (self.msg, ) # Cannot be property
1120
1c256f70 1121
416c7fcb
PH
1122class UnsupportedError(ExtractorError):
1123 def __init__(self, url):
86e5f3ed 1124 super().__init__(
416c7fcb
PH
1125 'Unsupported URL: %s' % url, expected=True)
1126 self.url = url
1127
1128
55b3e45b
JMF
1129class RegexNotFoundError(ExtractorError):
1130 """Error when a regex didn't match"""
1131 pass
1132
1133
773f291d
S
1134class GeoRestrictedError(ExtractorError):
1135 """Geographic restriction Error exception.
1136
1137 This exception may be thrown when a video is not available from your
1138 geographic location due to geographic restrictions imposed by a website.
1139 """
b6e0c7d2 1140
0db3bae8 1141 def __init__(self, msg, countries=None, **kwargs):
1142 kwargs['expected'] = True
86e5f3ed 1143 super().__init__(msg, **kwargs)
773f291d
S
1144 self.countries = countries
1145
1146
693f0600 1147class UserNotLive(ExtractorError):
1148 """Error when a channel/user is not live"""
1149
1150 def __init__(self, msg=None, **kwargs):
1151 kwargs['expected'] = True
1152 super().__init__(msg or 'The channel is not currently live', **kwargs)
1153
1154
bf5b9d85 1155class DownloadError(YoutubeDLError):
59ae15a5 1156 """Download Error exception.
d77c3dfd 1157
59ae15a5
PH
1158 This exception may be thrown by FileDownloader objects if they are not
1159 configured to continue on errors. They will contain the appropriate
1160 error message.
1161 """
5f6a1245 1162
8cc83b8d
FV
1163 def __init__(self, msg, exc_info=None):
1164 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
86e5f3ed 1165 super().__init__(msg)
8cc83b8d 1166 self.exc_info = exc_info
d77c3dfd
FV
1167
1168
498f5606 1169class EntryNotInPlaylist(YoutubeDLError):
1170 """Entry not in playlist exception.
1171
1172 This exception will be thrown by YoutubeDL when a requested entry
1173 is not found in the playlist info_dict
1174 """
aa9369a2 1175 msg = 'Entry not found in info'
498f5606 1176
1177
bf5b9d85 1178class SameFileError(YoutubeDLError):
59ae15a5 1179 """Same File exception.
d77c3dfd 1180
59ae15a5
PH
1181 This exception will be thrown by FileDownloader objects if they detect
1182 multiple files would have to be downloaded to the same file on disk.
1183 """
aa9369a2 1184 msg = 'Fixed output name but more than one file to download'
1185
1186 def __init__(self, filename=None):
1187 if filename is not None:
1188 self.msg += f': {filename}'
1189 super().__init__(self.msg)
d77c3dfd
FV
1190
1191
bf5b9d85 1192class PostProcessingError(YoutubeDLError):
59ae15a5 1193 """Post Processing exception.
d77c3dfd 1194
59ae15a5
PH
1195 This exception may be raised by PostProcessor's .run() method to
1196 indicate an error in the postprocessing task.
1197 """
5f6a1245 1198
5f6a1245 1199
48f79687 1200class DownloadCancelled(YoutubeDLError):
1201 """ Exception raised when the download queue should be interrupted """
1202 msg = 'The download was cancelled'
8b0d7497 1203
8b0d7497 1204
48f79687 1205class ExistingVideoReached(DownloadCancelled):
1206 """ --break-on-existing triggered """
1207 msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1208
48f79687 1209
1210class RejectedVideoReached(DownloadCancelled):
1211 """ --break-on-reject triggered """
1212 msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
51d9739f 1213
1214
48f79687 1215class MaxDownloadsReached(DownloadCancelled):
59ae15a5 1216 """ --max-downloads limit has been reached. """
48f79687 1217 msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1218
1219
f2ebc5c7 1220class ReExtractInfo(YoutubeDLError):
1221 """ Video info needs to be re-extracted. """
1222
1223 def __init__(self, msg, expected=False):
1224 super().__init__(msg)
1225 self.expected = expected
1226
1227
1228class ThrottledDownload(ReExtractInfo):
48f79687 1229 """ Download speed below --throttled-rate. """
aa9369a2 1230 msg = 'The download speed is below throttle limit'
d77c3dfd 1231
43b22906 1232 def __init__(self):
1233 super().__init__(self.msg, expected=False)
f2ebc5c7 1234
d77c3dfd 1235
bf5b9d85 1236class UnavailableVideoError(YoutubeDLError):
59ae15a5 1237 """Unavailable Format exception.
d77c3dfd 1238
59ae15a5
PH
1239 This exception will be thrown when a video is requested
1240 in a format that is not available for that video.
1241 """
aa9369a2 1242 msg = 'Unable to download video'
1243
1244 def __init__(self, err=None):
1245 if err is not None:
1246 self.msg += f': {err}'
1247 super().__init__(self.msg)
d77c3dfd
FV
1248
1249
bf5b9d85 1250class ContentTooShortError(YoutubeDLError):
59ae15a5 1251 """Content Too Short exception.
d77c3dfd 1252
59ae15a5
PH
1253 This exception may be raised by FileDownloader objects when a file they
1254 download is too small for what the server announced first, indicating
1255 the connection was probably interrupted.
1256 """
d77c3dfd 1257
59ae15a5 1258 def __init__(self, downloaded, expected):
86e5f3ed 1259 super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
2c7ed247 1260 # Both in bytes
59ae15a5
PH
1261 self.downloaded = downloaded
1262 self.expected = expected
d77c3dfd 1263
5f6a1245 1264
bf5b9d85 1265class XAttrMetadataError(YoutubeDLError):
efa97bdc 1266 def __init__(self, code=None, msg='Unknown error'):
86e5f3ed 1267 super().__init__(msg)
efa97bdc 1268 self.code = code
bd264412 1269 self.msg = msg
efa97bdc
YCH
1270
1271 # Parsing code and msg
3089bc74 1272 if (self.code in (errno.ENOSPC, errno.EDQUOT)
a0566bbf 1273 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
efa97bdc
YCH
1274 self.reason = 'NO_SPACE'
1275 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1276 self.reason = 'VALUE_TOO_LONG'
1277 else:
1278 self.reason = 'NOT_SUPPORTED'
1279
1280
bf5b9d85 1281class XAttrUnavailableError(YoutubeDLError):
efa97bdc
YCH
1282 pass
1283
1284
c5a59d93 1285def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
f9934b96 1286 hc = http_class(*args, **kwargs)
be4a824d 1287 source_address = ydl_handler._params.get('source_address')
8959018a 1288
be4a824d 1289 if source_address is not None:
8959018a
AU
1290 # This is to workaround _create_connection() from socket where it will try all
1291 # address data from getaddrinfo() including IPv6. This filters the result from
1292 # getaddrinfo() based on the source_address value.
1293 # This is based on the cpython socket.create_connection() function.
1294 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1295 def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1296 host, port = address
1297 err = None
1298 addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
9e21e6d9
S
1299 af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1300 ip_addrs = [addr for addr in addrs if addr[0] == af]
1301 if addrs and not ip_addrs:
1302 ip_version = 'v4' if af == socket.AF_INET else 'v6'
86e5f3ed 1303 raise OSError(
9e21e6d9
S
1304 "No remote IP%s addresses available for connect, can't use '%s' as source address"
1305 % (ip_version, source_address[0]))
8959018a
AU
1306 for res in ip_addrs:
1307 af, socktype, proto, canonname, sa = res
1308 sock = None
1309 try:
1310 sock = socket.socket(af, socktype, proto)
1311 if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1312 sock.settimeout(timeout)
1313 sock.bind(source_address)
1314 sock.connect(sa)
1315 err = None # Explicitly break reference cycle
1316 return sock
86e5f3ed 1317 except OSError as _:
8959018a
AU
1318 err = _
1319 if sock is not None:
1320 sock.close()
1321 if err is not None:
1322 raise err
1323 else:
86e5f3ed 1324 raise OSError('getaddrinfo returns an empty list')
9e21e6d9
S
1325 if hasattr(hc, '_create_connection'):
1326 hc._create_connection = _create_connection
cfb0511d 1327 hc.source_address = (source_address, 0)
be4a824d
PH
1328
1329 return hc
1330
1331
87f0e62d 1332def handle_youtubedl_headers(headers):
992fc9d6
YCH
1333 filtered_headers = headers
1334
1335 if 'Youtubedl-no-compression' in filtered_headers:
86e5f3ed 1336 filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
87f0e62d 1337 del filtered_headers['Youtubedl-no-compression']
87f0e62d 1338
992fc9d6 1339 return filtered_headers
87f0e62d
YCH
1340
1341
ac668111 1342class YoutubeDLHandler(urllib.request.HTTPHandler):
59ae15a5
PH
1343 """Handler for HTTP requests and responses.
1344
1345 This class, when installed with an OpenerDirector, automatically adds
1346 the standard headers to every HTTP request and handles gzipped and
1347 deflated responses from web servers. If compression is to be avoided in
1348 a particular request, the original request in the program code only has
0424ec30 1349 to include the HTTP header "Youtubedl-no-compression", which will be
59ae15a5
PH
1350 removed before making the real request.
1351
1352 Part of this code was copied from:
1353
1354 http://techknack.net/python-urllib2-handlers/
1355
1356 Andrew Rowls, the author of that code, agreed to release it to the
1357 public domain.
1358 """
1359
be4a824d 1360 def __init__(self, params, *args, **kwargs):
ac668111 1361 urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
be4a824d
PH
1362 self._params = params
1363
1364 def http_open(self, req):
ac668111 1365 conn_class = http.client.HTTPConnection
71aff188
YCH
1366
1367 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1368 if socks_proxy:
1369 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1370 del req.headers['Ytdl-socks-proxy']
1371
be4a824d 1372 return self.do_open(functools.partial(
71aff188 1373 _create_http_connection, self, conn_class, False),
be4a824d
PH
1374 req)
1375
59ae15a5
PH
1376 @staticmethod
1377 def deflate(data):
fc2119f2 1378 if not data:
1379 return data
59ae15a5
PH
1380 try:
1381 return zlib.decompress(data, -zlib.MAX_WBITS)
1382 except zlib.error:
1383 return zlib.decompress(data)
1384
4390d5ec 1385 @staticmethod
1386 def brotli(data):
1387 if not data:
1388 return data
9b8ee23b 1389 return brotli.decompress(data)
4390d5ec 1390
acebc9cd 1391 def http_request(self, req):
51f267d9
S
1392 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1393 # always respected by websites, some tend to give out URLs with non percent-encoded
1394 # non-ASCII characters (see telemb.py, ard.py [#3412])
1395 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1396 # To work around aforementioned issue we will replace request's original URL with
1397 # percent-encoded one
1398 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1399 # the code of this workaround has been moved here from YoutubeDL.urlopen()
1400 url = req.get_full_url()
1401 url_escaped = escape_url(url)
1402
1403 # Substitute URL if any change after escaping
1404 if url != url_escaped:
15d260eb 1405 req = update_Request(req, url=url_escaped)
51f267d9 1406
8b7539d2 1407 for h, v in self._params.get('http_headers', std_headers).items():
3d5f7a39
JK
1408 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1409 # The dict keys are capitalized because of this bug by urllib
1410 if h.capitalize() not in req.headers:
33ac271b 1411 req.add_header(h, v)
87f0e62d 1412
af14914b 1413 if 'Accept-encoding' not in req.headers:
1414 req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
1415
87f0e62d 1416 req.headers = handle_youtubedl_headers(req.headers)
989b4b2b 1417
379a4f16 1418 return super().do_request_(req)
59ae15a5 1419
acebc9cd 1420 def http_response(self, req, resp):
59ae15a5
PH
1421 old_resp = resp
1422 # gzip
1423 if resp.headers.get('Content-encoding', '') == 'gzip':
aa3e9507
PH
1424 content = resp.read()
1425 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1426 try:
1427 uncompressed = io.BytesIO(gz.read())
86e5f3ed 1428 except OSError as original_ioerror:
aa3e9507
PH
1429 # There may be junk add the end of the file
1430 # See http://stackoverflow.com/q/4928560/35070 for details
1431 for i in range(1, 1024):
1432 try:
1433 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1434 uncompressed = io.BytesIO(gz.read())
86e5f3ed 1435 except OSError:
aa3e9507
PH
1436 continue
1437 break
1438 else:
1439 raise original_ioerror
ac668111 1440 resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 1441 resp.msg = old_resp.msg
c047270c 1442 del resp.headers['Content-encoding']
59ae15a5
PH
1443 # deflate
1444 if resp.headers.get('Content-encoding', '') == 'deflate':
1445 gz = io.BytesIO(self.deflate(resp.read()))
ac668111 1446 resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 1447 resp.msg = old_resp.msg
c047270c 1448 del resp.headers['Content-encoding']
4390d5ec 1449 # brotli
1450 if resp.headers.get('Content-encoding', '') == 'br':
ac668111 1451 resp = urllib.request.addinfourl(
4390d5ec 1452 io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
1453 resp.msg = old_resp.msg
1454 del resp.headers['Content-encoding']
ad729172 1455 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
067aa17e 1456 # https://github.com/ytdl-org/youtube-dl/issues/6457).
5a4d9ddb
S
1457 if 300 <= resp.code < 400:
1458 location = resp.headers.get('Location')
1459 if location:
1460 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
0f06bcd7 1461 location = location.encode('iso-8859-1').decode()
5a4d9ddb
S
1462 location_escaped = escape_url(location)
1463 if location != location_escaped:
1464 del resp.headers['Location']
1465 resp.headers['Location'] = location_escaped
59ae15a5 1466 return resp
0f8d03f8 1467
acebc9cd
PH
1468 https_request = http_request
1469 https_response = http_response
bf50b038 1470
5de90176 1471
71aff188
YCH
1472def make_socks_conn_class(base_class, socks_proxy):
1473 assert issubclass(base_class, (
ac668111 1474 http.client.HTTPConnection, http.client.HTTPSConnection))
71aff188 1475
14f25df2 1476 url_components = urllib.parse.urlparse(socks_proxy)
71aff188
YCH
1477 if url_components.scheme.lower() == 'socks5':
1478 socks_type = ProxyType.SOCKS5
1479 elif url_components.scheme.lower() in ('socks', 'socks4'):
1480 socks_type = ProxyType.SOCKS4
51fb4995
YCH
1481 elif url_components.scheme.lower() == 'socks4a':
1482 socks_type = ProxyType.SOCKS4A
71aff188 1483
cdd94c2e
YCH
1484 def unquote_if_non_empty(s):
1485 if not s:
1486 return s
ac668111 1487 return urllib.parse.unquote_plus(s)
cdd94c2e 1488
71aff188
YCH
1489 proxy_args = (
1490 socks_type,
1491 url_components.hostname, url_components.port or 1080,
1492 True, # Remote DNS
cdd94c2e
YCH
1493 unquote_if_non_empty(url_components.username),
1494 unquote_if_non_empty(url_components.password),
71aff188
YCH
1495 )
1496
1497 class SocksConnection(base_class):
1498 def connect(self):
1499 self.sock = sockssocket()
1500 self.sock.setproxy(*proxy_args)
19a03940 1501 if isinstance(self.timeout, (int, float)):
71aff188
YCH
1502 self.sock.settimeout(self.timeout)
1503 self.sock.connect((self.host, self.port))
1504
ac668111 1505 if isinstance(self, http.client.HTTPSConnection):
71aff188
YCH
1506 if hasattr(self, '_context'): # Python > 2.6
1507 self.sock = self._context.wrap_socket(
1508 self.sock, server_hostname=self.host)
1509 else:
1510 self.sock = ssl.wrap_socket(self.sock)
1511
1512 return SocksConnection
1513
1514
ac668111 1515class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
be4a824d 1516 def __init__(self, params, https_conn_class=None, *args, **kwargs):
ac668111 1517 urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
1518 self._https_conn_class = https_conn_class or http.client.HTTPSConnection
be4a824d
PH
1519 self._params = params
1520
1521 def https_open(self, req):
4f264c02 1522 kwargs = {}
71aff188
YCH
1523 conn_class = self._https_conn_class
1524
4f264c02
JMF
1525 if hasattr(self, '_context'): # python > 2.6
1526 kwargs['context'] = self._context
1527 if hasattr(self, '_check_hostname'): # python 3.x
1528 kwargs['check_hostname'] = self._check_hostname
71aff188
YCH
1529
1530 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1531 if socks_proxy:
1532 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1533 del req.headers['Ytdl-socks-proxy']
1534
4f28b537 1535 try:
1536 return self.do_open(
1537 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1538 except urllib.error.URLError as e:
1539 if (isinstance(e.reason, ssl.SSLError)
1540 and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1541 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1542 raise
be4a824d
PH
1543
1544
941e881e 1545def is_path_like(f):
1546 return isinstance(f, (str, bytes, os.PathLike))
1547
1548
ac668111 1549class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
f1a8511f
S
1550 """
1551 See [1] for cookie file format.
1552
1553 1. https://curl.haxx.se/docs/http-cookies.html
1554 """
e7e62441 1555 _HTTPONLY_PREFIX = '#HttpOnly_'
c380cc28
S
1556 _ENTRY_LEN = 7
1557 _HEADER = '''# Netscape HTTP Cookie File
7a5c1cfe 1558# This file is generated by yt-dlp. Do not edit.
c380cc28
S
1559
1560'''
1561 _CookieFileEntry = collections.namedtuple(
1562 'CookieFileEntry',
1563 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
e7e62441 1564
d76fa1f3 1565 def __init__(self, filename=None, *args, **kwargs):
1566 super().__init__(None, *args, **kwargs)
941e881e 1567 if is_path_like(filename):
d76fa1f3 1568 filename = os.fspath(filename)
1569 self.filename = filename
1570
24146491 1571 @staticmethod
1572 def _true_or_false(cndn):
1573 return 'TRUE' if cndn else 'FALSE'
1574
d76fa1f3 1575 @contextlib.contextmanager
1576 def open(self, file, *, write=False):
941e881e 1577 if is_path_like(file):
d76fa1f3 1578 with open(file, 'w' if write else 'r', encoding='utf-8') as f:
1579 yield f
1580 else:
1581 if write:
1582 file.truncate(0)
1583 yield file
1584
24146491 1585 def _really_save(self, f, ignore_discard=False, ignore_expires=False):
1586 now = time.time()
1587 for cookie in self:
1588 if (not ignore_discard and cookie.discard
1589 or not ignore_expires and cookie.is_expired(now)):
1590 continue
1591 name, value = cookie.name, cookie.value
1592 if value is None:
1593 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1594 # with no name, whereas http.cookiejar regards it as a
1595 # cookie with no value.
1596 name, value = '', name
1597 f.write('%s\n' % '\t'.join((
1598 cookie.domain,
1599 self._true_or_false(cookie.domain.startswith('.')),
1600 cookie.path,
1601 self._true_or_false(cookie.secure),
1602 str_or_none(cookie.expires, default=''),
1603 name, value
1604 )))
1605
1606 def save(self, filename=None, *args, **kwargs):
c380cc28
S
1607 """
1608 Save cookies to a file.
24146491 1609 Code is taken from CPython 3.6
1610 https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
c380cc28 1611
c380cc28
S
1612 if filename is None:
1613 if self.filename is not None:
1614 filename = self.filename
1615 else:
ac668111 1616 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
c380cc28 1617
24146491 1618 # Store session cookies with `expires` set to 0 instead of an empty string
1bab3437
S
1619 for cookie in self:
1620 if cookie.expires is None:
1621 cookie.expires = 0
c380cc28 1622
d76fa1f3 1623 with self.open(filename, write=True) as f:
c380cc28 1624 f.write(self._HEADER)
24146491 1625 self._really_save(f, *args, **kwargs)
1bab3437
S
1626
1627 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
e7e62441 1628 """Load cookies from a file."""
1629 if filename is None:
1630 if self.filename is not None:
1631 filename = self.filename
1632 else:
ac668111 1633 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
e7e62441 1634
c380cc28
S
1635 def prepare_line(line):
1636 if line.startswith(self._HTTPONLY_PREFIX):
1637 line = line[len(self._HTTPONLY_PREFIX):]
1638 # comments and empty lines are fine
1639 if line.startswith('#') or not line.strip():
1640 return line
1641 cookie_list = line.split('\t')
1642 if len(cookie_list) != self._ENTRY_LEN:
ac668111 1643 raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
c380cc28
S
1644 cookie = self._CookieFileEntry(*cookie_list)
1645 if cookie.expires_at and not cookie.expires_at.isdigit():
ac668111 1646 raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
c380cc28
S
1647 return line
1648
e7e62441 1649 cf = io.StringIO()
d76fa1f3 1650 with self.open(filename) as f:
e7e62441 1651 for line in f:
c380cc28
S
1652 try:
1653 cf.write(prepare_line(line))
ac668111 1654 except http.cookiejar.LoadError as e:
94aa0644 1655 if f'{line.strip()} '[0] in '[{"':
ac668111 1656 raise http.cookiejar.LoadError(
94aa0644 1657 'Cookies file must be Netscape formatted, not JSON. See '
17ffed18 1658 'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
19a03940 1659 write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
c380cc28 1660 continue
e7e62441 1661 cf.seek(0)
1662 self._really_load(cf, filename, ignore_discard, ignore_expires)
1bab3437
S
1663 # Session cookies are denoted by either `expires` field set to
1664 # an empty string or 0. MozillaCookieJar only recognizes the former
1665 # (see [1]). So we need to force the latter to be recognized as session
1666 # cookies on our own.
1667 # Session cookies may be important for cookies-based authentication,
1668 # e.g. usually, when user does not check 'Remember me' check box while
1669 # logging in on a site, some important cookies are stored as session
1670 # cookies so that not recognizing them will result in failed login.
1671 # 1. https://bugs.python.org/issue17164
1672 for cookie in self:
1673 # Treat `expires=0` cookies as session cookies
1674 if cookie.expires == 0:
1675 cookie.expires = None
1676 cookie.discard = True
1677
1678
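# Illustrative sketch (the file name is a made-up example; not part of the upstream file):
# round-tripping a Netscape-format cookies file with the jar defined above.
#
#   jar = YoutubeDLCookieJar('cookies.txt')
#   jar.load(ignore_discard=True, ignore_expires=True)   # also keeps session/expired cookies
#   ...  # use the jar with an opener / cookie processor
#   jar.save(ignore_discard=True, ignore_expires=True)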
ac668111 1679class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
a6420bf5 1680 def __init__(self, cookiejar=None):
ac668111 1681 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
a6420bf5
S
1682
1683 def http_response(self, request, response):
ac668111 1684 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
a6420bf5 1685
ac668111 1686 https_request = urllib.request.HTTPCookieProcessor.http_request
a6420bf5
S
1687 https_response = http_response
1688
1689
ac668111 1690class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
201c1459 1691 """YoutubeDL redirect handler
1692
1693 The code is based on HTTPRedirectHandler implementation from CPython [1].
1694
1695 This redirect handler solves two issues:
1696 - ensures redirect URL is always unicode under python 2
1697 - introduces support for experimental HTTP response status code
1698 308 Permanent Redirect [2] used by some sites [3]
1699
1700 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1701 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1702 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1703 """
1704
ac668111 1705 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
201c1459 1706
1707 def redirect_request(self, req, fp, code, msg, headers, newurl):
1708 """Return a Request or None in response to a redirect.
1709
1710 This is called by the http_error_30x methods when a
1711 redirection response is received. If a redirection should
1712 take place, return a new Request to allow http_error_30x to
1713 perform the redirect. Otherwise, raise HTTPError if no-one
1714 else should try to handle this url. Return None if you can't
1715 but another Handler might.
1716 """
1717 m = req.get_method()
1718 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1719 or code in (301, 302, 303) and m == "POST")):
14f25df2 1720 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
201c1459 1721 # Strictly (according to RFC 2616), 301 or 302 in response to
1722 # a POST MUST NOT cause a redirection without confirmation
1723 # from the user (of urllib.request, in this case). In practice,
1724 # essentially all clients do redirect in this case, so we do
1725 # the same.
1726
201c1459 1727 # Be conciliant with URIs containing a space. This is mainly
1728 # redundant with the more complete encoding done in http_error_302(),
1729 # but it is kept for compatibility with other callers.
1730 newurl = newurl.replace(' ', '%20')
1731
1732 CONTENT_HEADERS = ("content-length", "content-type")
1733 # Strip Content-Length and Content-Type from the headers forwarded with the redirected request
86e5f3ed 1734 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
afac4caa 1735
1736 # A 303 must either use GET or HEAD for subsequent request
1737 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1738 if code == 303 and m != 'HEAD':
1739 m = 'GET'
1740 # 301 and 302 redirects are commonly turned into a GET from a POST
1741 # for subsequent requests by browsers, so we'll do the same.
1742 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1743 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1744 if code in (301, 302) and m == 'POST':
1745 m = 'GET'
1746
ac668111 1747 return urllib.request.Request(
201c1459 1748 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
afac4caa 1749 unverifiable=True, method=m)
fca6dba8
S
1750
1751
46f59e89
S
1752def extract_timezone(date_str):
1753 m = re.search(
f137e4c2 1754 r'''(?x)
1755 ^.{8,}? # >=8 char non-TZ prefix, if present
1756 (?P<tz>Z| # just the UTC Z, or
1757 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1758 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1759 [ ]? # optional space
1760 (?P<sign>\+|-) # +/-
1761 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1762 $)
1763 ''', date_str)
46f59e89 1764 if not m:
8f53dc44 1765 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1766 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1767 if timezone is not None:
1768 date_str = date_str[:-len(m.group('tz'))]
1769 timezone = datetime.timedelta(hours=timezone or 0)
46f59e89
S
1770 else:
1771 date_str = date_str[:-len(m.group('tz'))]
1772 if not m.group('sign'):
1773 timezone = datetime.timedelta()
1774 else:
1775 sign = 1 if m.group('sign') == '+' else -1
1776 timezone = datetime.timedelta(
1777 hours=sign * int(m.group('hours')),
1778 minutes=sign * int(m.group('minutes')))
1779 return timezone, date_str
1780
1781
08b38d54 1782def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1783 """ Return a UNIX timestamp from the given date """
1784
1785 if date_str is None:
1786 return None
1787
52c3a6e4
S
1788 date_str = re.sub(r'\.[0-9]+', '', date_str)
1789
08b38d54 1790 if timezone is None:
46f59e89
S
1791 timezone, date_str = extract_timezone(date_str)
1792
19a03940 1793 with contextlib.suppress(ValueError):
86e5f3ed 1794 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
52c3a6e4
S
1795 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1796 return calendar.timegm(dt.timetuple())
912b38b4
PH
1797
1798
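# Illustrative doctest-style example (not part of the upstream file); the expected value
# follows from the timezone handling above (23:04:26 at UTC+1 is 22:04:26 UTC):
#
#   >>> parse_iso8601('2014-03-23T23:04:26+0100')
#   1395612266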
46f59e89
S
1799def date_formats(day_first=True):
1800 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1801
1802
42bdd9d0 1803def unified_strdate(date_str, day_first=True):
bf50b038 1804 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1805
1806 if date_str is None:
1807 return None
bf50b038 1808 upload_date = None
5f6a1245 1809 # Replace commas
026fcc04 1810 date_str = date_str.replace(',', ' ')
42bdd9d0 1811 # Remove AM/PM + timezone
9bb8e0a3 1812 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1813 _, date_str = extract_timezone(date_str)
42bdd9d0 1814
46f59e89 1815 for expression in date_formats(day_first):
19a03940 1816 with contextlib.suppress(ValueError):
bf50b038 1817 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
42393ce2
PH
1818 if upload_date is None:
1819 timetuple = email.utils.parsedate_tz(date_str)
1820 if timetuple:
19a03940 1821 with contextlib.suppress(ValueError):
c6b9cf05 1822 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
6a750402 1823 if upload_date is not None:
14f25df2 1824 return str(upload_date)
bf50b038 1825
5f6a1245 1826
46f59e89
S
1827def unified_timestamp(date_str, day_first=True):
1828 if date_str is None:
1829 return None
1830
8f53dc44 1831 date_str = re.sub(r'\s+', ' ', re.sub(
1832 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
46f59e89 1833
7dc2a74e 1834 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1835 timezone, date_str = extract_timezone(date_str)
1836
1837 # Remove AM/PM + timezone
1838 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1839
deef3195
S
1840 # Remove unrecognized timezones from ISO 8601-like timestamps
1841 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1842 if m:
1843 date_str = date_str[:-len(m.group('tz'))]
1844
f226880c
PH
1845 # Python only supports microseconds, so remove nanoseconds
1846 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1847 if m:
1848 date_str = m.group(1)
1849
46f59e89 1850 for expression in date_formats(day_first):
19a03940 1851 with contextlib.suppress(ValueError):
7dc2a74e 1852 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89 1853 return calendar.timegm(dt.timetuple())
8f53dc44 1854
46f59e89
S
1855 timetuple = email.utils.parsedate_tz(date_str)
1856 if timetuple:
8f53dc44 1857 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
46f59e89
S
1858
1859
28e614de 1860def determine_ext(url, default_ext='unknown_video'):
85750f89 1861 if url is None or '.' not in url:
f4776371 1862 return default_ext
9cb9a5df 1863 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1864 if re.match(r'^[A-Za-z0-9]+$', guess):
1865 return guess
a7aaa398
S
1866 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1867 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1868 return guess.rstrip('/')
73e79f2a 1869 else:
cbdbb766 1870 return default_ext
73e79f2a 1871
5f6a1245 1872
824fa511
S
1873def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1874 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
d4051a8e 1875
5f6a1245 1876
9e62f283 1877def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
3d38b2d6 1878 R"""
1879 Return a datetime object from a string.
1880 Supported format:
1881 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1882
1883 @param format strftime format of DATE
1884 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1885 auto: round to the unit provided in date_str (if applicable).
9e62f283 1886 """
1887 auto_precision = False
1888 if precision == 'auto':
1889 auto_precision = True
1890 precision = 'microsecond'
396a76f7 1891 today = datetime_round(datetime.datetime.utcnow(), precision)
f8795e10 1892 if date_str in ('now', 'today'):
37254abc 1893 return today
f8795e10
PH
1894 if date_str == 'yesterday':
1895 return today - datetime.timedelta(days=1)
9e62f283 1896 match = re.match(
3d38b2d6 1897 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
9e62f283 1898 date_str)
37254abc 1899 if match is not None:
9e62f283 1900 start_time = datetime_from_str(match.group('start'), precision, format)
1901 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
37254abc 1902 unit = match.group('unit')
9e62f283 1903 if unit == 'month' or unit == 'year':
1904 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
37254abc 1905 unit = 'day'
9e62f283 1906 else:
1907 if unit == 'week':
1908 unit = 'day'
1909 time *= 7
1910 delta = datetime.timedelta(**{unit + 's': time})
1911 new_date = start_time + delta
1912 if auto_precision:
1913 return datetime_round(new_date, unit)
1914 return new_date
1915
1916 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1917
1918
d49f8db3 1919def date_from_str(date_str, format='%Y%m%d', strict=False):
3d38b2d6 1920 R"""
1921 Return a date object from a string using datetime_from_str
9e62f283 1922
3d38b2d6 1923 @param strict Restrict allowed patterns to "YYYYMMDD" and
1924 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
9e62f283 1925 """
3d38b2d6 1926 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1927 raise ValueError(f'Invalid date format "{date_str}"')
9e62f283 1928 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1929
1930
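# Illustrative examples of the grammar documented above (not part of the upstream file);
# the first result depends on the current UTC date:
#
#   >>> last_week = date_from_str('now-1week')   # the date one week ago (UTC)
#   >>> date_from_str('20221231', strict=True)
#   datetime.date(2022, 12, 31)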
1931def datetime_add_months(dt, months):
1932 """Increment/Decrement a datetime object by months."""
1933 month = dt.month + months - 1
1934 year = dt.year + month // 12
1935 month = month % 12 + 1
1936 day = min(dt.day, calendar.monthrange(year, month)[1])
1937 return dt.replace(year, month, day)
1938
1939
1940def datetime_round(dt, precision='day'):
1941 """
1942 Round a datetime object's time to a specific precision
1943 """
1944 if precision == 'microsecond':
1945 return dt
1946
1947 unit_seconds = {
1948 'day': 86400,
1949 'hour': 3600,
1950 'minute': 60,
1951 'second': 1,
1952 }
1953 roundto = lambda x, n: ((x + n / 2) // n) * n
1954 timestamp = calendar.timegm(dt.timetuple())
1955 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
5f6a1245
JW
1956
1957
e63fc1be 1958def hyphenate_date(date_str):
1959 """
1960 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1961 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1962 if match is not None:
1963 return '-'.join(match.groups())
1964 else:
1965 return date_str
1966
5f6a1245 1967
86e5f3ed 1968class DateRange:
bd558525 1969 """Represents a time interval between two dates"""
5f6a1245 1970
bd558525
JMF
1971 def __init__(self, start=None, end=None):
1972 """start and end must be strings in the format accepted by date"""
1973 if start is not None:
d49f8db3 1974 self.start = date_from_str(start, strict=True)
bd558525
JMF
1975 else:
1976 self.start = datetime.datetime.min.date()
1977 if end is not None:
d49f8db3 1978 self.end = date_from_str(end, strict=True)
bd558525
JMF
1979 else:
1980 self.end = datetime.datetime.max.date()
37254abc 1981 if self.start > self.end:
bd558525 1982 raise ValueError('Date range: "%s", the start date must be before the end date' % self)
5f6a1245 1983
bd558525
JMF
1984 @classmethod
1985 def day(cls, day):
1986 """Returns a range that only contains the given day"""
5f6a1245
JW
1987 return cls(day, day)
1988
bd558525
JMF
1989 def __contains__(self, date):
1990 """Check if the date is in the range"""
37254abc
JMF
1991 if not isinstance(date, datetime.date):
1992 date = date_from_str(date)
1993 return self.start <= date <= self.end
5f6a1245 1994
bd558525 1995 def __str__(self):
86e5f3ed 1996 return f'{self.start.isoformat()} - {self.end.isoformat()}'
c496ca96 1997
f2df4071 1998 def __eq__(self, other):
1999 return (isinstance(other, DateRange)
2000 and self.start == other.start and self.end == other.end)
2001
c496ca96
PH
2002
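# Illustrative doctest-style examples (not part of the upstream file):
#
#   >>> '20221215' in DateRange('20221201', '20221231')
#   True
#   >>> '20230101' in DateRange(end='20221231')
#   False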
2003def platform_name():
14f25df2 2004 """ Returns the platform name as a str """
da4db748 2005 deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead')
b1f94422 2006 return platform.platform()
c496ca96 2007
b1f94422 2008
2009@functools.cache
2010def system_identifier():
2011 python_implementation = platform.python_implementation()
2012 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
2013 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
dab284f8 2014 libc_ver = []
2015 with contextlib.suppress(OSError): # We may not have access to the executable
2016 libc_ver = platform.libc_ver()
b1f94422 2017
17fc3dc4 2018 return 'Python %s (%s %s %s) - %s (%s%s)' % (
b1f94422 2019 platform.python_version(),
2020 python_implementation,
17fc3dc4 2021 platform.machine(),
b1f94422 2022 platform.architecture()[0],
2023 platform.platform(),
5b9f253f
M
2024 ssl.OPENSSL_VERSION,
2025 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
b1f94422 2026 )
c257baff
PH
2027
2028
0b9c08b4 2029@functools.cache
49fa4d9a 2030def get_windows_version():
8a82af35 2031 ''' Get the Windows version. Returns () if not running on Windows '''
49fa4d9a
N
2032 if compat_os_name == 'nt':
2033 return version_tuple(platform.win32_ver()[1])
2034 else:
8a82af35 2035 return ()
49fa4d9a
N
2036
2037
734f90bb 2038def write_string(s, out=None, encoding=None):
19a03940 2039 assert isinstance(s, str)
2040 out = out or sys.stderr
7459e3a2 2041
fe1daad3 2042 if compat_os_name == 'nt' and supports_terminal_sequences(out):
3fe75fdc 2043 s = re.sub(r'([\r\n]+)', r' \1', s)
59f943cd 2044
8a82af35 2045 enc, buffer = None, out
cfb0511d 2046 if 'b' in getattr(out, 'mode', ''):
c487cf00 2047 enc = encoding or preferredencoding()
104aa738 2048 elif hasattr(out, 'buffer'):
8a82af35 2049 buffer = out.buffer
104aa738 2050 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
c487cf00 2051
8a82af35 2052 buffer.write(s.encode(enc, 'ignore') if enc else s)
7459e3a2
PH
2053 out.flush()
2054
2055
da4db748 2056def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
2057 from . import _IN_CLI
2058 if _IN_CLI:
2059 if msg in deprecation_warning._cache:
2060 return
2061 deprecation_warning._cache.add(msg)
2062 if printer:
2063 return printer(f'{msg}{bug_reports_message()}', **kwargs)
2064 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
2065 else:
2066 import warnings
2067 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
2068
2069
2070deprecation_warning._cache = set()
2071
2072
48ea9cea
PH
2073def bytes_to_intlist(bs):
2074 if not bs:
2075 return []
2076 if isinstance(bs[0], int): # Python 3
2077 return list(bs)
2078 else:
2079 return [ord(c) for c in bs]
2080
c257baff 2081
cba892fa 2082def intlist_to_bytes(xs):
2083 if not xs:
2084 return b''
ac668111 2085 return struct.pack('%dB' % len(xs), *xs)
c38b1e77
PH
2086
2087
8a82af35 2088class LockingUnsupportedError(OSError):
1890fc63 2089 msg = 'File locking is not supported'
0edb3e33 2090
2091 def __init__(self):
2092 super().__init__(self.msg)
2093
2094
c1c9a79c
PH
2095# Cross-platform file locking
2096if sys.platform == 'win32':
fe0918bb 2097 import ctypes
c1c9a79c
PH
2098 import ctypes.wintypes
2099 import msvcrt
2100
2101 class OVERLAPPED(ctypes.Structure):
2102 _fields_ = [
2103 ('Internal', ctypes.wintypes.LPVOID),
2104 ('InternalHigh', ctypes.wintypes.LPVOID),
2105 ('Offset', ctypes.wintypes.DWORD),
2106 ('OffsetHigh', ctypes.wintypes.DWORD),
2107 ('hEvent', ctypes.wintypes.HANDLE),
2108 ]
2109
2110 kernel32 = ctypes.windll.kernel32
2111 LockFileEx = kernel32.LockFileEx
2112 LockFileEx.argtypes = [
2113 ctypes.wintypes.HANDLE, # hFile
2114 ctypes.wintypes.DWORD, # dwFlags
2115 ctypes.wintypes.DWORD, # dwReserved
2116 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2117 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2118 ctypes.POINTER(OVERLAPPED) # Overlapped
2119 ]
2120 LockFileEx.restype = ctypes.wintypes.BOOL
2121 UnlockFileEx = kernel32.UnlockFileEx
2122 UnlockFileEx.argtypes = [
2123 ctypes.wintypes.HANDLE, # hFile
2124 ctypes.wintypes.DWORD, # dwReserved
2125 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2126 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2127 ctypes.POINTER(OVERLAPPED) # Overlapped
2128 ]
2129 UnlockFileEx.restype = ctypes.wintypes.BOOL
2130 whole_low = 0xffffffff
2131 whole_high = 0x7fffffff
2132
747c0bd1 2133 def _lock_file(f, exclusive, block):
c1c9a79c
PH
2134 overlapped = OVERLAPPED()
2135 overlapped.Offset = 0
2136 overlapped.OffsetHigh = 0
2137 overlapped.hEvent = 0
2138 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
747c0bd1 2139
2140 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2141 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2142 0, whole_low, whole_high, f._lock_file_overlapped_p):
2cb19820 2143 # NB: The no-argument form of "ctypes.FormatError" does not work on PyPy
2144 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
c1c9a79c
PH
2145
2146 def _unlock_file(f):
2147 assert f._lock_file_overlapped_p
2148 handle = msvcrt.get_osfhandle(f.fileno())
747c0bd1 2149 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
c1c9a79c
PH
2150 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2151
2152else:
399a76e6
YCH
2153 try:
2154 import fcntl
c1c9a79c 2155
a3125791 2156 def _lock_file(f, exclusive, block):
b63837bc 2157 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2158 if not block:
2159 flags |= fcntl.LOCK_NB
acea8d7c 2160 try:
b63837bc 2161 fcntl.flock(f, flags)
acea8d7c
JK
2162 except BlockingIOError:
2163 raise
2164 except OSError: # AOSP does not have flock()
b63837bc 2165 fcntl.lockf(f, flags)
c1c9a79c 2166
399a76e6 2167 def _unlock_file(f):
acea8d7c
JK
2168 try:
2169 fcntl.flock(f, fcntl.LOCK_UN)
2170 except OSError:
2171 fcntl.lockf(f, fcntl.LOCK_UN)
a3125791 2172
399a76e6 2173 except ImportError:
399a76e6 2174
a3125791 2175 def _lock_file(f, exclusive, block):
0edb3e33 2176 raise LockingUnsupportedError()
399a76e6
YCH
2177
2178 def _unlock_file(f):
0edb3e33 2179 raise LockingUnsupportedError()
c1c9a79c
PH
2180
2181
86e5f3ed 2182class locked_file:
0edb3e33 2183 locked = False
747c0bd1 2184
a3125791 2185 def __init__(self, filename, mode, block=True, encoding=None):
fcfa8853
JK
2186 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2187 raise NotImplementedError(mode)
2188 self.mode, self.block = mode, block
2189
2190 writable = any(f in mode for f in 'wax+')
2191 readable = any(f in mode for f in 'r+')
2192 flags = functools.reduce(operator.ior, (
2193 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2194 getattr(os, 'O_BINARY', 0), # Windows only
2195 getattr(os, 'O_NOINHERIT', 0), # Windows only
2196 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2197 os.O_APPEND if 'a' in mode else 0,
2198 os.O_EXCL if 'x' in mode else 0,
2199 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2200 ))
2201
98804d03 2202 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
c1c9a79c
PH
2203
2204 def __enter__(self):
a3125791 2205 exclusive = 'r' not in self.mode
c1c9a79c 2206 try:
a3125791 2207 _lock_file(self.f, exclusive, self.block)
0edb3e33 2208 self.locked = True
86e5f3ed 2209 except OSError:
c1c9a79c
PH
2210 self.f.close()
2211 raise
fcfa8853 2212 if 'w' in self.mode:
131e14dc
JK
2213 try:
2214 self.f.truncate()
2215 except OSError as e:
1890fc63 2216 if e.errno not in (
2217 errno.ESPIPE, # Illegal seek - expected for FIFO
2218 errno.EINVAL, # Invalid argument - expected for /dev/null
2219 ):
2220 raise
c1c9a79c
PH
2221 return self
2222
0edb3e33 2223 def unlock(self):
2224 if not self.locked:
2225 return
c1c9a79c 2226 try:
0edb3e33 2227 _unlock_file(self.f)
c1c9a79c 2228 finally:
0edb3e33 2229 self.locked = False
c1c9a79c 2230
0edb3e33 2231 def __exit__(self, *_):
2232 try:
2233 self.unlock()
2234 finally:
2235 self.f.close()
4eb7f1d1 2236
0edb3e33 2237 open = __enter__
2238 close = __exit__
a3125791 2239
0edb3e33 2240 def __getattr__(self, attr):
2241 return getattr(self.f, attr)
a3125791 2242
0edb3e33 2243 def __iter__(self):
2244 return iter(self.f)
a3125791 2245
4eb7f1d1 2246
0b9c08b4 2247@functools.cache
4644ac55
S
2248def get_filesystem_encoding():
2249 encoding = sys.getfilesystemencoding()
2250 return encoding if encoding is not None else 'utf-8'
2251
2252
4eb7f1d1 2253def shell_quote(args):
a6a173c2 2254 quoted_args = []
4644ac55 2255 encoding = get_filesystem_encoding()
a6a173c2
JMF
2256 for a in args:
2257 if isinstance(a, bytes):
2258 # We may get a filename encoded with 'encodeFilename'
2259 a = a.decode(encoding)
aefce8e6 2260 quoted_args.append(compat_shlex_quote(a))
28e614de 2261 return ' '.join(quoted_args)
9d4660ca
PH
2262
2263
2264def smuggle_url(url, data):
2265 """ Pass additional data in a URL for internal use. """
2266
81953d1a
RA
2267 url, idata = unsmuggle_url(url, {})
2268 data.update(idata)
14f25df2 2269 sdata = urllib.parse.urlencode(
28e614de
PH
2270 {'__youtubedl_smuggle': json.dumps(data)})
2271 return url + '#' + sdata
9d4660ca
PH
2272
2273
79f82953 2274def unsmuggle_url(smug_url, default=None):
83e865a3 2275 if '#__youtubedl_smuggle' not in smug_url:
79f82953 2276 return smug_url, default
28e614de 2277 url, _, sdata = smug_url.rpartition('#')
14f25df2 2278 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
2279 data = json.loads(jsond)
2280 return url, data
02dbf93f
PH
2281
2282
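# Illustrative round trip (the URL and payload are made-up examples; not part of the
# upstream file):
#
#   >>> smuggled = smuggle_url('https://example.com/video', {'referer': 'https://example.org'})
#   >>> unsmuggle_url(smuggled)
#   ('https://example.com/video', {'referer': 'https://example.org'})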
e0fd9573 2283def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2284 """ Formats numbers with decimal suffixes like K, M, etc. """
2285 num, factor = float_or_none(num), float(factor)
4c3f8c3f 2286 if num is None or num < 0:
e0fd9573 2287 return None
eeb2a770 2288 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2289 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2290 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
abbeeebc 2291 if factor == 1024:
2292 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 2293 converted = num / (factor ** exponent)
abbeeebc 2294 return fmt % (converted, suffix)
e0fd9573 2295
2296
02dbf93f 2297def format_bytes(bytes):
f02d24d8 2298 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
f53c966a 2299
1c088fa8 2300
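# Illustrative doctest-style examples (not part of the upstream file): factor=1000 uses the
# plain SI suffixes, while factor=1024 (as in format_bytes) switches to the binary Ki/Mi/... forms:
#
#   >>> format_decimal_suffix(1234000, '%.1f%s')
#   '1.2M'
#   >>> format_bytes(3 * 1024 ** 2)
#   '3.00MiB'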
64c464a1 2301def lookup_unit_table(unit_table, s, strict=False):
2302 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
fb47597b 2303 units_re = '|'.join(re.escape(u) for u in unit_table)
64c464a1 2304 m = (re.fullmatch if strict else re.match)(
2305 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
fb47597b
S
2306 if not m:
2307 return None
64c464a1 2308
2309 num = float(m.group('num').replace(',', '.'))
fb47597b 2310 mult = unit_table[m.group('unit')]
64c464a1 2311 return round(num * mult)
2312
2313
2314def parse_bytes(s):
2315 """Parse a string indicating a byte quantity into an integer"""
2316 return lookup_unit_table(
2317 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
2318 s.upper(), strict=True)
fb47597b
S
2319
2320
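# Illustrative doctest-style examples (not part of the upstream file): parse_bytes()
# upper-cases its input and treats every suffix as a power of 1024:
#
#   >>> parse_bytes('500k')
#   512000
#   >>> parse_bytes('2M')
#   2097152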
be64b5b0
PH
2321def parse_filesize(s):
2322 if s is None:
2323 return None
2324
dfb1b146 2325 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
2326 # but we support those too
2327 _UNIT_TABLE = {
2328 'B': 1,
2329 'b': 1,
70852b47 2330 'bytes': 1,
be64b5b0
PH
2331 'KiB': 1024,
2332 'KB': 1000,
2333 'kB': 1024,
2334 'Kb': 1000,
13585d76 2335 'kb': 1000,
70852b47
YCH
2336 'kilobytes': 1000,
2337 'kibibytes': 1024,
be64b5b0
PH
2338 'MiB': 1024 ** 2,
2339 'MB': 1000 ** 2,
2340 'mB': 1024 ** 2,
2341 'Mb': 1000 ** 2,
13585d76 2342 'mb': 1000 ** 2,
70852b47
YCH
2343 'megabytes': 1000 ** 2,
2344 'mebibytes': 1024 ** 2,
be64b5b0
PH
2345 'GiB': 1024 ** 3,
2346 'GB': 1000 ** 3,
2347 'gB': 1024 ** 3,
2348 'Gb': 1000 ** 3,
13585d76 2349 'gb': 1000 ** 3,
70852b47
YCH
2350 'gigabytes': 1000 ** 3,
2351 'gibibytes': 1024 ** 3,
be64b5b0
PH
2352 'TiB': 1024 ** 4,
2353 'TB': 1000 ** 4,
2354 'tB': 1024 ** 4,
2355 'Tb': 1000 ** 4,
13585d76 2356 'tb': 1000 ** 4,
70852b47
YCH
2357 'terabytes': 1000 ** 4,
2358 'tebibytes': 1024 ** 4,
be64b5b0
PH
2359 'PiB': 1024 ** 5,
2360 'PB': 1000 ** 5,
2361 'pB': 1024 ** 5,
2362 'Pb': 1000 ** 5,
13585d76 2363 'pb': 1000 ** 5,
70852b47
YCH
2364 'petabytes': 1000 ** 5,
2365 'pebibytes': 1024 ** 5,
be64b5b0
PH
2366 'EiB': 1024 ** 6,
2367 'EB': 1000 ** 6,
2368 'eB': 1024 ** 6,
2369 'Eb': 1000 ** 6,
13585d76 2370 'eb': 1000 ** 6,
70852b47
YCH
2371 'exabytes': 1000 ** 6,
2372 'exbibytes': 1024 ** 6,
be64b5b0
PH
2373 'ZiB': 1024 ** 7,
2374 'ZB': 1000 ** 7,
2375 'zB': 1024 ** 7,
2376 'Zb': 1000 ** 7,
13585d76 2377 'zb': 1000 ** 7,
70852b47
YCH
2378 'zettabytes': 1000 ** 7,
2379 'zebibytes': 1024 ** 7,
be64b5b0
PH
2380 'YiB': 1024 ** 8,
2381 'YB': 1000 ** 8,
2382 'yB': 1024 ** 8,
2383 'Yb': 1000 ** 8,
13585d76 2384 'yb': 1000 ** 8,
70852b47
YCH
2385 'yottabytes': 1000 ** 8,
2386 'yobibytes': 1024 ** 8,
be64b5b0
PH
2387 }
2388
fb47597b
S
2389 return lookup_unit_table(_UNIT_TABLE, s)
2390
2391
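# Illustrative doctest-style examples (not part of the upstream file): binary ('MiB') and
# decimal ('Tb') suffixes resolve through the table above:
#
#   >>> parse_filesize('5 MiB')
#   5242880
#   >>> parse_filesize('1.2Tb')
#   1200000000000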
2392def parse_count(s):
2393 if s is None:
be64b5b0
PH
2394 return None
2395
352d5da8 2396 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
2397
2398 if re.match(r'^[\d,.]+$', s):
2399 return str_to_int(s)
2400
2401 _UNIT_TABLE = {
2402 'k': 1000,
2403 'K': 1000,
2404 'm': 1000 ** 2,
2405 'M': 1000 ** 2,
2406 'kk': 1000 ** 2,
2407 'KK': 1000 ** 2,
352d5da8 2408 'b': 1000 ** 3,
2409 'B': 1000 ** 3,
fb47597b 2410 }
be64b5b0 2411
352d5da8 2412 ret = lookup_unit_table(_UNIT_TABLE, s)
2413 if ret is not None:
2414 return ret
2415
2416 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2417 if mobj:
2418 return str_to_int(mobj.group(1))
be64b5b0 2419
2f7ae819 2420
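# Illustrative doctest-style examples (not part of the upstream file):
#
#   >>> parse_count('1.23M')
#   1230000
#   >>> parse_count('1,234 views')
#   1234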
5d45484c 2421def parse_resolution(s, *, lenient=False):
b871d7e9
S
2422 if s is None:
2423 return {}
2424
5d45484c
LNO
2425 if lenient:
2426 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2427 else:
2428 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
2429 if mobj:
2430 return {
2431 'width': int(mobj.group('w')),
2432 'height': int(mobj.group('h')),
2433 }
2434
17ec8bcf 2435 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
2436 if mobj:
2437 return {'height': int(mobj.group(1))}
2438
2439 mobj = re.search(r'\b([48])[kK]\b', s)
2440 if mobj:
2441 return {'height': int(mobj.group(1)) * 540}
2442
2443 return {}
2444
2445
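# Illustrative doctest-style examples (not part of the upstream file), matching the three
# patterns above (WxH, <height>p and 4k/8k):
#
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('720p')
#   {'height': 720}
#   >>> parse_resolution('4k')
#   {'height': 2160}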
0dc41787 2446def parse_bitrate(s):
14f25df2 2447 if not isinstance(s, str):
0dc41787
S
2448 return
2449 mobj = re.search(r'\b(\d+)\s*kbps', s)
2450 if mobj:
2451 return int(mobj.group(1))
2452
2453
a942d6cb 2454def month_by_name(name, lang='en'):
caefb1de
PH
2455 """ Return the number of a month given its (locale-independent) English name """
2456
f6717dec 2457 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 2458
caefb1de 2459 try:
f6717dec 2460 return month_names.index(name) + 1
7105440c
YCH
2461 except ValueError:
2462 return None
2463
2464
2465def month_by_abbreviation(abbrev):
2466 """ Return the number of a month given its (locale-independent) English
2467 abbreviation """
2468
2469 try:
2470 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
2471 except ValueError:
2472 return None
18258362
JMF
2473
2474
5aafe895 2475def fix_xml_ampersands(xml_str):
18258362 2476 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
2477 return re.sub(
2478 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 2479 '&amp;',
5aafe895 2480 xml_str)
e3946f98
PH
2481
2482
2483def setproctitle(title):
14f25df2 2484 assert isinstance(title, str)
c1c05c67 2485
fe0918bb 2486 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2487 try:
2488 import ctypes
2489 except ImportError:
c1c05c67
YCH
2490 return
2491
e3946f98 2492 try:
611c1dd9 2493 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
2494 except OSError:
2495 return
2f49bcd6
RC
2496 except TypeError:
2497 # LoadLibrary in Windows Python 2.7.13 only expects
2498 # a bytestring, but since unicode_literals turns
2499 # every string into a unicode string, it fails.
2500 return
0f06bcd7 2501 title_bytes = title.encode()
6eefe533
PH
2502 buf = ctypes.create_string_buffer(len(title_bytes))
2503 buf.value = title_bytes
e3946f98 2504 try:
6eefe533 2505 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
2506 except AttributeError:
2507 return # Strange libc, just skip this
d7dda168
PH
2508
2509
2510def remove_start(s, start):
46bc9b7d 2511 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
2512
2513
2b9faf55 2514def remove_end(s, end):
46bc9b7d 2515 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
2516
2517
31b2051e
S
2518def remove_quotes(s):
2519 if s is None or len(s) < 2:
2520 return s
2521 for quote in ('"', "'", ):
2522 if s[0] == quote and s[-1] == quote:
2523 return s[1:-1]
2524 return s
2525
2526
b6e0c7d2 2527def get_domain(url):
ebf99aaf 2528 """
2529 This implementation is inconsistent, but is kept for compatibility.
2530 Use this only for "webpage_url_domain"
2531 """
2532 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
b6e0c7d2
U
2533
2534
29eb5174 2535def url_basename(url):
14f25df2 2536 path = urllib.parse.urlparse(url).path
28e614de 2537 return path.strip('/').split('/')[-1]
aa94a6d3
PH
2538
2539
02dc0a36 2540def base_url(url):
7657ec7e 2541 return re.match(r'https?://[^?#]+/', url).group()
02dc0a36
S
2542
2543
e34c3361 2544def urljoin(base, path):
4b5de77b 2545 if isinstance(path, bytes):
0f06bcd7 2546 path = path.decode()
14f25df2 2547 if not isinstance(path, str) or not path:
e34c3361 2548 return None
fad4ceb5 2549 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 2550 return path
4b5de77b 2551 if isinstance(base, bytes):
0f06bcd7 2552 base = base.decode()
14f25df2 2553 if not isinstance(base, str) or not re.match(
4b5de77b 2554 r'^(?:https?:)?//', base):
e34c3361 2555 return None
14f25df2 2556 return urllib.parse.urljoin(base, path)
e34c3361
S
2557
2558
ac668111 2559class HEADRequest(urllib.request.Request):
aa94a6d3 2560 def get_method(self):
611c1dd9 2561 return 'HEAD'
7217e148
PH
2562
2563
ac668111 2564class PUTRequest(urllib.request.Request):
95cf60e8
S
2565 def get_method(self):
2566 return 'PUT'
2567
2568
9732d77e 2569def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
9e907ebd 2570 if get_attr and v is not None:
2571 v = getattr(v, get_attr, None)
1812afb7
S
2572 try:
2573 return int(v) * invscale // scale
31c49255 2574 except (ValueError, TypeError, OverflowError):
af98f8ff 2575 return default
9732d77e 2576
9572013d 2577
40a90862 2578def str_or_none(v, default=None):
14f25df2 2579 return default if v is None else str(v)
40a90862 2580
9732d77e
PH
2581
2582def str_to_int(int_str):
48d4681e 2583 """ A more relaxed version of int_or_none """
f9934b96 2584 if isinstance(int_str, int):
348c6bf1 2585 return int_str
14f25df2 2586 elif isinstance(int_str, str):
42db58ec
S
2587 int_str = re.sub(r'[,\.\+]', '', int_str)
2588 return int_or_none(int_str)
608d11f5
PH
2589
2590
9732d77e 2591def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
2592 if v is None:
2593 return default
2594 try:
2595 return float(v) * invscale / scale
5e1271c5 2596 except (ValueError, TypeError):
caf80631 2597 return default
43f775e4
PH
2598
2599
c7e327c4
S
2600def bool_or_none(v, default=None):
2601 return v if isinstance(v, bool) else default
2602
2603
53cd37ba 2604def strip_or_none(v, default=None):
14f25df2 2605 return v.strip() if isinstance(v, str) else default
b72b4431
S
2606
2607
af03000a 2608def url_or_none(url):
14f25df2 2609 if not url or not isinstance(url, str):
af03000a
S
2610 return None
2611 url = url.strip()
29f7c58a 2612 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2613
2614
3e9b66d7 2615def request_to_url(req):
ac668111 2616 if isinstance(req, urllib.request.Request):
3e9b66d7
LNO
2617 return req.get_full_url()
2618 else:
2619 return req
2620
2621
e29663c6 2622def strftime_or_none(timestamp, date_format, default=None):
2623 datetime_object = None
2624 try:
f9934b96 2625 if isinstance(timestamp, (int, float)): # unix timestamp
d509c1f5 2626 # Using naive datetime here can break timestamp() in Windows
2627 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2628 datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
14f25df2 2629 elif isinstance(timestamp, str): # assume YYYYMMDD
e29663c6 2630 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
9665f15a 2631 date_format = re.sub( # Support %s on windows
2632 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
e29663c6 2633 return datetime_object.strftime(date_format)
2634 except (ValueError, TypeError, AttributeError):
2635 return default
2636
2637
608d11f5 2638def parse_duration(s):
f9934b96 2639 if not isinstance(s, str):
608d11f5 2640 return None
ca7b3246 2641 s = s.strip()
38d79fd1 2642 if not s:
2643 return None
ca7b3246 2644
acaff495 2645 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2646 m = re.match(r'''(?x)
2647 (?P<before_secs>
2648 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2649 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2650 (?P<ms>[.:][0-9]+)?Z?$
2651 ''', s)
acaff495 2652 if m:
8bd1c00b 2653 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2654 else:
2655 m = re.match(
056653bb
S
2656 r'''(?ix)(?:P?
2657 (?:
1c1b2f96 2658 [0-9]+\s*y(?:ears?)?,?\s*
056653bb
S
2659 )?
2660 (?:
1c1b2f96 2661 [0-9]+\s*m(?:onths?)?,?\s*
056653bb
S
2662 )?
2663 (?:
1c1b2f96 2664 [0-9]+\s*w(?:eeks?)?,?\s*
056653bb 2665 )?
8f4b58d7 2666 (?:
1c1b2f96 2667 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
8f4b58d7 2668 )?
056653bb 2669 T)?
acaff495 2670 (?:
1c1b2f96 2671 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
acaff495 2672 )?
2673 (?:
1c1b2f96 2674 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
acaff495 2675 )?
2676 (?:
2677 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2678 )?Z?$''', s)
acaff495 2679 if m:
2680 days, hours, mins, secs, ms = m.groups()
2681 else:
15846398 2682 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2683 if m:
2684 hours, mins = m.groups()
2685 else:
2686 return None
2687
acaff495 2688 if ms:
19a03940 2689 ms = ms.replace(':', '.')
2690 return sum(float(part or 0) * mult for part, mult in (
2691 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2692
2693
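# Illustrative doctest-style examples (not part of the upstream file); the result is
# always a float number of seconds:
#
#   >>> parse_duration('1:30:45')
#   5445.0
#   >>> parse_duration('3 min')
#   180.0
#   >>> parse_duration('PT1H30M')
#   5400.0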
e65e4c88 2694def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2695 name, real_ext = os.path.splitext(filename)
e65e4c88 2696 return (
86e5f3ed 2697 f'{name}.{ext}{real_ext}'
e65e4c88 2698 if not expected_real_ext or real_ext[1:] == expected_real_ext
86e5f3ed 2699 else f'{filename}.{ext}')
d70ad093
PH
2700
2701
b3ed15b7
S
2702def replace_extension(filename, ext, expected_real_ext=None):
2703 name, real_ext = os.path.splitext(filename)
86e5f3ed 2704 return '{}.{}'.format(
b3ed15b7
S
2705 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2706 ext)
2707
2708
d70ad093
PH
2709def check_executable(exe, args=[]):
2710 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2711 args can be a list of arguments that produce short output (like -version) """
2712 try:
f0c9fb96 2713 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
d70ad093
PH
2714 except OSError:
2715 return False
2716 return exe
b7ab0590
PH
2717
2718
7aaf4cd2 2719def _get_exe_version_output(exe, args):
95807118 2720 try:
b64d04c1 2721 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2722 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2723 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
f0c9fb96 2724 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2725 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
95807118
PH
2726 except OSError:
2727 return False
f0c9fb96 2728 return stdout
cae97f65
PH
2729
2730
2731def detect_exe_version(output, version_re=None, unrecognized='present'):
14f25df2 2732 assert isinstance(output, str)
cae97f65
PH
2733 if version_re is None:
2734 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2735 m = re.search(version_re, output)
95807118
PH
2736 if m:
2737 return m.group(1)
2738 else:
2739 return unrecognized
2740
2741
9af98e17 2742def get_exe_version(exe, args=['--version'],
2743 version_re=None, unrecognized='present'):
2744 """ Returns the version of the specified executable,
2745 or False if the executable is not present """
2746 out = _get_exe_version_output(exe, args)
2747 return detect_exe_version(out, version_re, unrecognized) if out else False
2748
2749
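# Illustrative doctest-style examples of the default version regex (not part of the
# upstream file); the version strings are made-up samples:
#
#   >>> detect_exe_version('ffmpeg version 4.4.1 Copyright (c) 2000-2021')
#   '4.4.1'
#   >>> detect_exe_version('unrecognizable output')
#   'present'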
7e88d7d7 2750def frange(start=0, stop=None, step=1):
2751 """Float range"""
2752 if stop is None:
2753 start, stop = 0, start
2754 sign = [-1, 1][step > 0] if step else 0
2755 while sign * start < sign * stop:
2756 yield start
2757 start += step
2758
2759
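# Illustrative doctest-style examples (not part of the upstream file):
#
#   >>> list(frange(3))
#   [0, 1, 2]
#   >>> list(frange(0, 1, 0.25))
#   [0, 0.25, 0.5, 0.75]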
cb89cfc1 2760class LazyList(collections.abc.Sequence):
0f06bcd7 2761 """Lazy immutable list from an iterable
2762 Note that slices of a LazyList are lists and not LazyList"""
483336e7 2763
8e5fecc8 2764 class IndexError(IndexError):
2765 pass
2766
282f5709 2767 def __init__(self, iterable, *, reverse=False, _cache=None):
0f06bcd7 2768 self._iterable = iter(iterable)
2769 self._cache = [] if _cache is None else _cache
2770 self._reversed = reverse
483336e7 2771
2772 def __iter__(self):
0f06bcd7 2773 if self._reversed:
28419ca2 2774 # We need to consume the entire iterable to iterate in reverse
981052c9 2775 yield from self.exhaust()
28419ca2 2776 return
0f06bcd7 2777 yield from self._cache
2778 for item in self._iterable:
2779 self._cache.append(item)
483336e7 2780 yield item
2781
0f06bcd7 2782 def _exhaust(self):
2783 self._cache.extend(self._iterable)
2784 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2785 return self._cache
28419ca2 2786
981052c9 2787 def exhaust(self):
0f06bcd7 2788 """Evaluate the entire iterable"""
2789 return self._exhaust()[::-1 if self._reversed else 1]
981052c9 2790
28419ca2 2791 @staticmethod
0f06bcd7 2792 def _reverse_index(x):
f2df4071 2793 return None if x is None else ~x
483336e7 2794
2795 def __getitem__(self, idx):
2796 if isinstance(idx, slice):
0f06bcd7 2797 if self._reversed:
2798 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
e0f2b4b4 2799 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2800 elif isinstance(idx, int):
0f06bcd7 2801 if self._reversed:
2802 idx = self._reverse_index(idx)
e0f2b4b4 2803 start, stop, step = idx, idx, 0
483336e7 2804 else:
2805 raise TypeError('indices must be integers or slices')
e0f2b4b4 2806 if ((start or 0) < 0 or (stop or 0) < 0
2807 or (start is None and step < 0)
2808 or (stop is None and step > 0)):
483336e7 2809 # We need to consume the entire iterable to be able to slice from the end
2810 # Obviously, never use this with infinite iterables
0f06bcd7 2811 self._exhaust()
8e5fecc8 2812 try:
0f06bcd7 2813 return self._cache[idx]
8e5fecc8 2814 except IndexError as e:
2815 raise self.IndexError(e) from e
0f06bcd7 2816 n = max(start or 0, stop or 0) - len(self._cache) + 1
28419ca2 2817 if n > 0:
0f06bcd7 2818 self._cache.extend(itertools.islice(self._iterable, n))
8e5fecc8 2819 try:
0f06bcd7 2820 return self._cache[idx]
8e5fecc8 2821 except IndexError as e:
2822 raise self.IndexError(e) from e
483336e7 2823
2824 def __bool__(self):
2825 try:
0f06bcd7 2826 self[-1] if self._reversed else self[0]
8e5fecc8 2827 except self.IndexError:
483336e7 2828 return False
2829 return True
2830
2831 def __len__(self):
0f06bcd7 2832 self._exhaust()
2833 return len(self._cache)
483336e7 2834
282f5709 2835 def __reversed__(self):
0f06bcd7 2836 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
282f5709 2837
2838 def __copy__(self):
0f06bcd7 2839 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
282f5709 2840
28419ca2 2841 def __repr__(self):
2842 # repr and str should mimic a list. So we exhaust the iterable
2843 return repr(self.exhaust())
2844
2845 def __str__(self):
2846 return repr(self.exhaust())
2847
483336e7 2848
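# Illustrative doctest-style example (not part of the upstream file): items are pulled from
# the underlying iterable only as far as indexing requires, and slices come back as plain lists.
#
#   >>> import itertools
#   >>> lazy = LazyList(itertools.count())
#   >>> lazy[:5]
#   [0, 1, 2, 3, 4]
#   >>> lazy[10]
#   10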
7be9ccff 2849class PagedList:
c07a39ae 2850
2851 class IndexError(IndexError):
2852 pass
2853
dd26ced1
PH
2854 def __len__(self):
2855 # This is only useful for tests
2856 return len(self.getslice())
2857
7be9ccff 2858 def __init__(self, pagefunc, pagesize, use_cache=True):
2859 self._pagefunc = pagefunc
2860 self._pagesize = pagesize
f1d13090 2861 self._pagecount = float('inf')
7be9ccff 2862 self._use_cache = use_cache
2863 self._cache = {}
2864
2865 def getpage(self, pagenum):
d8cf8d97 2866 page_results = self._cache.get(pagenum)
2867 if page_results is None:
f1d13090 2868 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
7be9ccff 2869 if self._use_cache:
2870 self._cache[pagenum] = page_results
2871 return page_results
2872
2873 def getslice(self, start=0, end=None):
2874 return list(self._getslice(start, end))
2875
2876 def _getslice(self, start, end):
55575225 2877 raise NotImplementedError('This method must be implemented by subclasses')
2878
2879 def __getitem__(self, idx):
f1d13090 2880 assert self._use_cache, 'Indexing PagedList requires cache'
55575225 2881 if not isinstance(idx, int) or idx < 0:
2882 raise TypeError('indices must be non-negative integers')
2883 entries = self.getslice(idx, idx + 1)
d8cf8d97 2884 if not entries:
c07a39ae 2885 raise self.IndexError()
d8cf8d97 2886 return entries[0]
55575225 2887
9c44d242
PH
2888
2889class OnDemandPagedList(PagedList):
a44ca5a4 2890 """Download pages until a page with fewer than the maximum number of results"""
86e5f3ed 2891
7be9ccff 2892 def _getslice(self, start, end):
b7ab0590
PH
2893 for pagenum in itertools.count(start // self._pagesize):
2894 firstid = pagenum * self._pagesize
2895 nextfirstid = pagenum * self._pagesize + self._pagesize
2896 if start >= nextfirstid:
2897 continue
2898
b7ab0590
PH
2899 startv = (
2900 start % self._pagesize
2901 if firstid <= start < nextfirstid
2902 else 0)
b7ab0590
PH
2903 endv = (
2904 ((end - 1) % self._pagesize) + 1
2905 if (end is not None and firstid <= end <= nextfirstid)
2906 else None)
2907
f1d13090 2908 try:
2909 page_results = self.getpage(pagenum)
2910 except Exception:
2911 self._pagecount = pagenum - 1
2912 raise
b7ab0590
PH
2913 if startv != 0 or endv is not None:
2914 page_results = page_results[startv:endv]
7be9ccff 2915 yield from page_results
b7ab0590
PH
2916
2917 # A little optimization - if the current page is not "full", i.e. does
2918 # not contain page_size videos then we can assume that this page
2919 # is the last one - there are no more ids on further pages -
2920 # i.e. no need to query again.
2921 if len(page_results) + startv < self._pagesize:
2922 break
2923
2924 # If we got the whole page, but the next page is not interesting,
2925 # break out early as well
2926 if end == nextfirstid:
2927 break
81c2f20b
PH
2928
2929
9c44d242 2930class InAdvancePagedList(PagedList):
a44ca5a4 2931 """PagedList with total number of pages known in advance"""
86e5f3ed 2932
9c44d242 2933 def __init__(self, pagefunc, pagecount, pagesize):
7be9ccff 2934 PagedList.__init__(self, pagefunc, pagesize, True)
f1d13090 2935 self._pagecount = pagecount
9c44d242 2936
7be9ccff 2937 def _getslice(self, start, end):
9c44d242 2938 start_page = start // self._pagesize
d37707bd 2939 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
9c44d242
PH
2940 skip_elems = start - start_page * self._pagesize
2941 only_more = None if end is None else end - start
2942 for pagenum in range(start_page, end_page):
7be9ccff 2943 page_results = self.getpage(pagenum)
9c44d242 2944 if skip_elems:
7be9ccff 2945 page_results = page_results[skip_elems:]
9c44d242
PH
2946 skip_elems = None
2947 if only_more is not None:
7be9ccff 2948 if len(page_results) < only_more:
2949 only_more -= len(page_results)
9c44d242 2950 else:
7be9ccff 2951 yield from page_results[:only_more]
9c44d242 2952 break
7be9ccff 2953 yield from page_results
9c44d242
PH
2954
2955
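# Illustrative doctest-style example (not part of the upstream file): the page count is
# known up front, and only the pages overlapping the requested slice are fetched.
#
#   >>> pages = InAdvancePagedList(lambda n: range(n * 3, n * 3 + 3), 4, 3)
#   >>> pages.getslice(2, 8)
#   [2, 3, 4, 5, 6, 7]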
7e88d7d7 2956class PlaylistEntries:
2957 MissingEntry = object()
2958 is_exhausted = False
2959
2960 def __init__(self, ydl, info_dict):
7e9a6125 2961 self.ydl = ydl
2962
2963 # _entries must be assigned now since infodict can change during iteration
2964 entries = info_dict.get('entries')
2965 if entries is None:
2966 raise EntryNotInPlaylist('There are no entries')
2967 elif isinstance(entries, list):
2968 self.is_exhausted = True
2969
2970 requested_entries = info_dict.get('requested_entries')
bc5c2f8a 2971 self.is_incomplete = requested_entries is not None
7e9a6125 2972 if self.is_incomplete:
2973 assert self.is_exhausted
bc5c2f8a 2974 self._entries = [self.MissingEntry] * max(requested_entries or [0])
7e9a6125 2975 for i, entry in zip(requested_entries, entries):
2976 self._entries[i - 1] = entry
2977 elif isinstance(entries, (list, PagedList, LazyList)):
2978 self._entries = entries
2979 else:
2980 self._entries = LazyList(entries)
7e88d7d7 2981
2982 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2983 (?P<start>[+-]?\d+)?
2984 (?P<range>[:-]
2985 (?P<end>[+-]?\d+|inf(?:inite)?)?
2986 (?::(?P<step>[+-]?\d+))?
2987 )?''')
2988
2989 @classmethod
2990 def parse_playlist_items(cls, string):
2991 for segment in string.split(','):
2992 if not segment:
2993 raise ValueError('There are two or more consecutive commas')
2994 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2995 if not mobj:
2996 raise ValueError(f'{segment!r} is not a valid specification')
2997 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2998 if int_or_none(step) == 0:
2999 raise ValueError(f'Step in {segment!r} cannot be zero')
3000 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
3001
3002 def get_requested_items(self):
3003 playlist_items = self.ydl.params.get('playlist_items')
3004 playlist_start = self.ydl.params.get('playliststart', 1)
3005 playlist_end = self.ydl.params.get('playlistend')
3006 # For backwards compatibility, interpret -1 as whole list
3007 if playlist_end in (-1, None):
3008 playlist_end = ''
3009 if not playlist_items:
3010 playlist_items = f'{playlist_start}:{playlist_end}'
3011 elif playlist_start != 1 or playlist_end:
3012 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
3013
3014 for index in self.parse_playlist_items(playlist_items):
3015 for i, entry in self[index]:
3016 yield i, entry
1ac4fd80 3017 if not entry:
3018 continue
7e88d7d7 3019 try:
3020 # TODO: Add auto-generated fields
3021 self.ydl._match_entry(entry, incomplete=True, silent=True)
3022 except (ExistingVideoReached, RejectedVideoReached):
3023 return
3024
7e9a6125 3025 def get_full_count(self):
3026 if self.is_exhausted and not self.is_incomplete:
7e88d7d7 3027 return len(self)
3028 elif isinstance(self._entries, InAdvancePagedList):
3029 if self._entries._pagesize == 1:
3030 return self._entries._pagecount
3031
7e88d7d7 3032 @functools.cached_property
3033 def _getter(self):
3034 if isinstance(self._entries, list):
3035 def get_entry(i):
3036 try:
3037 entry = self._entries[i]
3038 except IndexError:
3039 entry = self.MissingEntry
3040 if not self.is_incomplete:
3041 raise self.IndexError()
3042 if entry is self.MissingEntry:
bc5c2f8a 3043 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
7e88d7d7 3044 return entry
3045 else:
3046 def get_entry(i):
3047 try:
3048 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
3049 except (LazyList.IndexError, PagedList.IndexError):
3050 raise self.IndexError()
3051 return get_entry
3052
3053 def __getitem__(self, idx):
3054 if isinstance(idx, int):
3055 idx = slice(idx, idx)
3056
3057 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
3058 step = 1 if idx.step is None else idx.step
3059 if idx.start is None:
3060 start = 0 if step > 0 else len(self) - 1
3061 else:
3062 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
3063
3064 # NB: Do not call len(self) when idx == [:]
3065 if idx.stop is None:
3066 stop = 0 if step < 0 else float('inf')
3067 else:
3068 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
3069 stop += [-1, 1][step > 0]
3070
3071 for i in frange(start, stop, step):
3072 if i < 0:
3073 continue
3074 try:
7e9a6125 3075 entry = self._getter(i)
3076 except self.IndexError:
3077 self.is_exhausted = True
3078 if step > 0:
7e88d7d7 3079 break
7e9a6125 3080 continue
7e88d7d7 3081 yield i + 1, entry
3082
3083 def __len__(self):
3084 return len(tuple(self[:]))
3085
3086 class IndexError(IndexError):
3087 pass
3088
3089
81c2f20b 3090def uppercase_escape(s):
676eb3f2 3091 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 3092 return re.sub(
a612753d 3093 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
3094 lambda m: unicode_escape(m.group(0))[0],
3095 s)
0fe2ff78
YCH
3096
3097
3098def lowercase_escape(s):
3099 unicode_escape = codecs.getdecoder('unicode_escape')
3100 return re.sub(
3101 r'\\u[0-9a-fA-F]{4}',
3102 lambda m: unicode_escape(m.group(0))[0],
3103 s)
b53466e1 3104
d05cfe06
S
3105
3106def escape_rfc3986(s):
3107 """Escape non-ASCII characters as suggested by RFC 3986"""
f9934b96 3108 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
3109
3110
3111def escape_url(url):
3112 """Escape URL as suggested by RFC 3986"""
14f25df2 3113 url_parsed = urllib.parse.urlparse(url)
d05cfe06 3114 return url_parsed._replace(
efbed08d 3115 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
3116 path=escape_rfc3986(url_parsed.path),
3117 params=escape_rfc3986(url_parsed.params),
3118 query=escape_rfc3986(url_parsed.query),
3119 fragment=escape_rfc3986(url_parsed.fragment)
3120 ).geturl()
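# Illustrative examples (a sketch of the escaping behaviour):
#   >>> escape_rfc3986('foo bar')
#   'foo%20bar'
#   >>> escape_url('http://example.com/a b?q=1 2')
#   'http://example.com/a%20b?q=1%202'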
3121
62e609ab 3122
96b9e9cf 3123def parse_qs(url, **kwargs):
3124 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
4dfbf869 3125
3126
62e609ab
PH
3127def read_batch_urls(batch_fd):
3128 def fixup(url):
14f25df2 3129 if not isinstance(url, str):
62e609ab 3130 url = url.decode('utf-8', 'replace')
8c04f0be 3131 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3132 for bom in BOM_UTF8:
3133 if url.startswith(bom):
3134 url = url[len(bom):]
3135 url = url.lstrip()
3136 if not url or url.startswith(('#', ';', ']')):
62e609ab 3137 return False
8c04f0be 3138 # "#" cannot be stripped out since it is part of the URI
962ffcf8 3139 # However, it can be safely stripped out if it follows a whitespace character
8c04f0be 3140 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
3141
3142 with contextlib.closing(batch_fd) as fd:
3143 return [url for url in map(fixup, fd) if url]
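# Illustrative example (comment lines and BOMs are dropped; uses the module-level io import):
#   >>> read_batch_urls(io.StringIO('# a comment\nhttps://example.com/video\n'))
#   ['https://example.com/video']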
b74fa8cd
JMF
3144
3145
3146def urlencode_postdata(*args, **kargs):
14f25df2 3147 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
3148
3149
38f9ef31 3150def update_url_query(url, query):
cacd9966
YCH
3151 if not query:
3152 return url
14f25df2 3153 parsed_url = urllib.parse.urlparse(url)
3154 qs = urllib.parse.parse_qs(parsed_url.query)
38f9ef31 3155 qs.update(query)
14f25df2 3156 return urllib.parse.urlunparse(parsed_url._replace(
3157 query=urllib.parse.urlencode(qs, True)))
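# Illustrative example (existing query parameters are kept, new ones are appended):
#   >>> update_url_query('http://example.com/path?a=1', {'b': 'c d'})
#   'http://example.com/path?a=1&b=c+d'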
16392824 3158
8e60dc75 3159
c043c246 3160def update_Request(req, url=None, data=None, headers=None, query=None):
ed0291d1 3161 req_headers = req.headers.copy()
c043c246 3162 req_headers.update(headers or {})
ed0291d1
S
3163 req_data = data or req.data
3164 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
3165 req_get_method = req.get_method()
3166 if req_get_method == 'HEAD':
3167 req_type = HEADRequest
3168 elif req_get_method == 'PUT':
3169 req_type = PUTRequest
3170 else:
ac668111 3171 req_type = urllib.request.Request
ed0291d1
S
3172 new_req = req_type(
3173 req_url, data=req_data, headers=req_headers,
3174 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3175 if hasattr(req, 'timeout'):
3176 new_req.timeout = req.timeout
3177 return new_req
3178
3179
10c87c15 3180def _multipart_encode_impl(data, boundary):
0c265486
YCH
3181 content_type = 'multipart/form-data; boundary=%s' % boundary
3182
3183 out = b''
3184 for k, v in data.items():
3185 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 3186 if isinstance(k, str):
0f06bcd7 3187 k = k.encode()
14f25df2 3188 if isinstance(v, str):
0f06bcd7 3189 v = v.encode()
0c265486
YCH
3190 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3191 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 3192 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
3193 if boundary.encode('ascii') in content:
3194 raise ValueError('Boundary overlaps with data')
3195 out += content
3196
3197 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3198
3199 return out, content_type
3200
3201
3202def multipart_encode(data, boundary=None):
3203 '''
3204 Encode a dict to RFC 7578-compliant form-data
3205
3206 data:
3207 A dict where keys and values can be either Unicode or bytes-like
3208 objects.
3209 boundary:
3210 If specified, it must be a Unicode object and is used as the boundary. Otherwise
3211 a random boundary is generated.
3212
3213 Reference: https://tools.ietf.org/html/rfc7578
3214 '''
3215 has_specified_boundary = boundary is not None
3216
3217 while True:
3218 if boundary is None:
3219 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3220
3221 try:
10c87c15 3222 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
3223 break
3224 except ValueError:
3225 if has_specified_boundary:
3226 raise
3227 boundary = None
3228
3229 return out, content_type
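# Illustrative example (a sketch using a fixed boundary for reproducibility):
#   >>> multipart_encode({'field': 'value'}, boundary='AAA')
#   (b'--AAA\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--AAA--\r\n', 'multipart/form-data; boundary=AAA')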
3230
3231
304ad45a 3232def variadic(x, allowed_types=(str, bytes, dict)):
3233 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
3234
3235
86296ad2 3236def dict_get(d, key_or_keys, default=None, skip_false_values=True):
a44ca5a4 3237 for val in map(d.get, variadic(key_or_keys)):
3238 if val is not None and (val or not skip_false_values):
3239 return val
3240 return default
cbecc9b9
S
3241
3242
c4f60dd7 3243def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3244 for f in funcs:
a32a9a7e 3245 try:
c4f60dd7 3246 val = f(*args, **kwargs)
ab029d7e 3247 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
a32a9a7e
S
3248 pass
3249 else:
c4f60dd7 3250 if expected_type is None or isinstance(val, expected_type):
3251 return val
3252
3253
3254def try_get(src, getter, expected_type=None):
3255 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
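# Illustrative examples of these small helpers (a sketch):
#   >>> variadic('spam')
#   ('spam',)
#   >>> dict_get({'a': None, 'b': '', 'c': 42}, ('a', 'b', 'c'))
#   42
#   >>> try_get({'a': {'b': 3}}, lambda x: x['a']['b'], int)
#   3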
329ca3be
S
3256
3257
90137ca4 3258def filter_dict(dct, cndn=lambda _, v: v is not None):
3259 return {k: v for k, v in dct.items() if cndn(k, v)}
3260
3261
6cc62232
S
3262def merge_dicts(*dicts):
3263 merged = {}
3264 for a_dict in dicts:
3265 for k, v in a_dict.items():
90137ca4 3266 if (v is not None and k not in merged
3267 or isinstance(v, str) and merged[k] == ''):
6cc62232
S
3268 merged[k] = v
3269 return merged
3270
3271
8e60dc75 3272def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
14f25df2 3273 return string if isinstance(string, str) else str(string, encoding, errors)
8e60dc75 3274
16392824 3275
a1a530b0
PH
3276US_RATINGS = {
3277 'G': 0,
3278 'PG': 10,
3279 'PG-13': 13,
3280 'R': 16,
3281 'NC': 18,
3282}
fac55558
PH
3283
3284
a8795327 3285TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
3286 'TV-Y': 0,
3287 'TV-Y7': 7,
3288 'TV-G': 0,
3289 'TV-PG': 0,
3290 'TV-14': 14,
3291 'TV-MA': 17,
a8795327
S
3292}
3293
3294
146c80e2 3295def parse_age_limit(s):
19a03940 3296 # isinstance(False, int) is True. So type() must be used instead
c487cf00 3297 if type(s) is int: # noqa: E721
a8795327 3298 return s if 0 <= s <= 21 else None
19a03940 3299 elif not isinstance(s, str):
d838b1bd 3300 return None
146c80e2 3301 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
3302 if m:
3303 return int(m.group('age'))
5c5fae6d 3304 s = s.upper()
a8795327
S
3305 if s in US_RATINGS:
3306 return US_RATINGS[s]
5a16c9d9 3307 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 3308 if m:
5a16c9d9 3309 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 3310 return None
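# Illustrative examples (a sketch of the accepted rating formats):
#   >>> parse_age_limit('PG-13')
#   13
#   >>> parse_age_limit('TV-MA')
#   17
#   >>> parse_age_limit('18+')
#   18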
146c80e2
S
3311
3312
fac55558 3313def strip_jsonp(code):
609a61e3 3314 return re.sub(
5552c9eb 3315 r'''(?sx)^
e9c671d5 3316 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
3317 (?:\s*&&\s*(?P=func_name))?
3318 \s*\(\s*(?P<callback_data>.*)\);?
3319 \s*?(?://[^\n]*)*$''',
3320 r'\g<callback_data>', code)
478c2c61
PH
3321
3322
8f53dc44 3323def js_to_json(code, vars={}, *, strict=False):
5c610515 3324 # vars is a dict of var, val pairs to substitute
a71b812f
SS
3325 STRING_QUOTES = '\'"'
3326 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
c843e685 3327 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
86e5f3ed 3328 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
4195096e 3329 INTEGER_TABLE = (
86e5f3ed 3330 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3331 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
4195096e
S
3332 )
3333
a71b812f
SS
3334 def process_escape(match):
3335 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3336 escape = match.group(1) or match.group(2)
3337
3338 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3339 else R'\u00' if escape == 'x'
3340 else '' if escape == '\n'
3341 else escape)
3342
e05f6939 3343 def fix_kv(m):
e7b6d122
PH
3344 v = m.group(0)
3345 if v in ('true', 'false', 'null'):
3346 return v
421ddcb8
C
3347 elif v in ('undefined', 'void 0'):
3348 return 'null'
8bdd16b4 3349 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
a71b812f
SS
3350 return ''
3351
3352 if v[0] in STRING_QUOTES:
3353 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v[1:-1])
3354 return f'"{escaped}"'
3355
3356 for regex, base in INTEGER_TABLE:
3357 im = re.match(regex, v)
3358 if im:
3359 i = int(im.group(1), base)
3360 return f'"{i}":' if v.endswith(':') else str(i)
3361
3362 if v in vars:
3363 return json.dumps(vars[v])
89ac4a19 3364
a71b812f
SS
3365 if not strict:
3366 return f'"{v}"'
5c610515 3367
a71b812f 3368 raise ValueError(f'Unknown value: {v}')
e05f6939 3369
8072ef2b 3370 def create_map(mobj):
3371 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3372
8072ef2b 3373 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
8f53dc44 3374 if not strict:
3375 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
f55523cf 3376 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
febff4c1 3377
a71b812f
SS
3378 return re.sub(rf'''(?sx)
3379 {STRING_RE}|
3380 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
421ddcb8 3381 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
a71b812f
SS
3382 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3383 [0-9]+(?={SKIP_RE}:)|
8bdd16b4 3384 !+
a71b812f 3385 ''', fix_kv, code)
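# Illustrative examples (a sketch; unquoted keys, hex literals and trailing commas
# are normalised, and `vars` substitutes bare identifiers):
#   >>> js_to_json("{key: 'value', num: 0x10,}")
#   '{"key": "value", "num": 16}'
#   >>> js_to_json('{a: x}', vars={'x': 5})
#   '{"a": 5}'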
e05f6939
PH
3386
3387
478c2c61
PH
3388def qualities(quality_ids):
3389 """ Get a numeric quality value out of a list of possible values """
3390 def q(qid):
3391 try:
3392 return quality_ids.index(qid)
3393 except ValueError:
3394 return -1
3395 return q
3396
acd69589 3397
8aa0e7cd 3398POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
1e43a6f7 3399
3400
de6000d9 3401DEFAULT_OUTTMPL = {
3402 'default': '%(title)s [%(id)s].%(ext)s',
72755351 3403 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 3404}
3405OUTTMPL_TYPES = {
72755351 3406 'chapter': None,
de6000d9 3407 'subtitle': None,
3408 'thumbnail': None,
3409 'description': 'description',
3410 'annotation': 'annotations.xml',
3411 'infojson': 'info.json',
08438d2c 3412 'link': None,
3b603dbd 3413 'pl_video': None,
5112f26a 3414 'pl_thumbnail': None,
de6000d9 3415 'pl_description': 'description',
3416 'pl_infojson': 'info.json',
3417}
0a871f68 3418
143db31d 3419# As of [1], the format syntax is:
3420# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3421# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 3422STR_FORMAT_RE_TMPL = r'''(?x)
3423 (?<!%)(?P<prefix>(?:%%)*)
143db31d 3424 %
524e2e4f 3425 (?P<has_key>\((?P<key>{0})\))?
752cda38 3426 (?P<format>
524e2e4f 3427 (?P<conversion>[#0\-+ ]+)?
3428 (?P<min_width>\d+)?
3429 (?P<precision>\.\d+)?
3430 (?P<len_mod>[hlL])? # unused in python
901130bb 3431 {1} # conversion type
752cda38 3432 )
143db31d 3433'''
3434
7d1eb38a 3435
901130bb 3436STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3437
7d1eb38a 3438
a020a0dc
PH
3439def limit_length(s, length):
3440 """ Add ellipses to overly long strings """
3441 if s is None:
3442 return None
3443 ELLIPSES = '...'
3444 if len(s) > length:
3445 return s[:length - len(ELLIPSES)] + ELLIPSES
3446 return s
48844745
PH
3447
3448
3449def version_tuple(v):
5f9b8394 3450 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
3451
3452
3453def is_outdated_version(version, limit, assume_new=True):
3454 if not version:
3455 return not assume_new
3456 try:
3457 return version_tuple(version) < version_tuple(limit)
3458 except ValueError:
3459 return not assume_new
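# Illustrative examples (a sketch):
#   >>> version_tuple('2022.10.04')
#   (2022, 10, 4)
#   >>> is_outdated_version('2022.01.01', '2022.10.04')
#   True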
732ea2f0
PH
3460
3461
3462def ytdl_is_updateable():
7a5c1cfe 3463 """ Returns if yt-dlp can be updated with -U """
735d865e 3464
5d535b4a 3465 from .update import is_non_updateable
732ea2f0 3466
5d535b4a 3467 return not is_non_updateable()
7d4111ed
PH
3468
3469
3470def args_to_str(args):
3471 # Get a short string representation for a subprocess command
702ccf2d 3472 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
3473
3474
9b9c5355 3475def error_to_compat_str(err):
cfb0511d 3476 return str(err)
fdae2358
S
3477
3478
a44ca5a4 3479def error_to_str(err):
3480 return f'{type(err).__name__}: {err}'
3481
3482
c460bdd5 3483def mimetype2ext(mt):
eb9ee194
S
3484 if mt is None:
3485 return None
3486
9359f3d4
F
3487 mt, _, params = mt.partition(';')
3488 mt = mt.strip()
3489
3490 FULL_MAP = {
765ac263 3491 'audio/mp4': 'm4a',
6c33d24b
YCH
3492 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 here as
3493 # it's the most popular one
3494 'audio/mpeg': 'mp3',
ba39289d 3495 'audio/x-wav': 'wav',
9359f3d4
F
3496 'audio/wav': 'wav',
3497 'audio/wave': 'wav',
3498 }
3499
3500 ext = FULL_MAP.get(mt)
765ac263
JMF
3501 if ext is not None:
3502 return ext
3503
9359f3d4 3504 SUBTYPE_MAP = {
f6861ec9 3505 '3gpp': '3gp',
cafcf657 3506 'smptett+xml': 'tt',
cafcf657 3507 'ttaf+xml': 'dfxp',
a0d8d704 3508 'ttml+xml': 'ttml',
f6861ec9 3509 'x-flv': 'flv',
a0d8d704 3510 'x-mp4-fragmented': 'mp4',
d4f05d47 3511 'x-ms-sami': 'sami',
a0d8d704 3512 'x-ms-wmv': 'wmv',
b4173f15
RA
3513 'mpegurl': 'm3u8',
3514 'x-mpegurl': 'm3u8',
3515 'vnd.apple.mpegurl': 'm3u8',
3516 'dash+xml': 'mpd',
b4173f15 3517 'f4m+xml': 'f4m',
f164b971 3518 'hds+xml': 'f4m',
e910fe2f 3519 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 3520 'quicktime': 'mov',
98ce1a3f 3521 'mp2t': 'ts',
39e7107d 3522 'x-wav': 'wav',
9359f3d4
F
3523 'filmstrip+json': 'fs',
3524 'svg+xml': 'svg',
3525 }
3526
3527 _, _, subtype = mt.rpartition('/')
3528 ext = SUBTYPE_MAP.get(subtype.lower())
3529 if ext is not None:
3530 return ext
3531
3532 SUFFIX_MAP = {
3533 'json': 'json',
3534 'xml': 'xml',
3535 'zip': 'zip',
3536 'gzip': 'gz',
3537 }
3538
3539 _, _, suffix = subtype.partition('+')
3540 ext = SUFFIX_MAP.get(suffix)
3541 if ext is not None:
3542 return ext
3543
3544 return subtype.replace('+', '.')
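# Illustrative examples (a sketch; parameters after ';' are ignored):
#   >>> mimetype2ext('audio/mp4')
#   'm4a'
#   >>> mimetype2ext('application/dash+xml')
#   'mpd'
#   >>> mimetype2ext('text/vtt; charset=UTF-8')
#   'vtt'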
c460bdd5
PH
3545
3546
2814f12b
THD
3547def ext2mimetype(ext_or_url):
3548 if not ext_or_url:
3549 return None
3550 if '.' not in ext_or_url:
3551 ext_or_url = f'file.{ext_or_url}'
3552 return mimetypes.guess_type(ext_or_url)[0]
3553
3554
4f3c5e06 3555def parse_codecs(codecs_str):
3556 # http://tools.ietf.org/html/rfc6381
3557 if not codecs_str:
3558 return {}
a0566bbf 3559 split_codecs = list(filter(None, map(
dbf5416a 3560 str.strip, codecs_str.strip().strip(',').split(','))))
3fe75fdc 3561 vcodec, acodec, scodec, hdr = None, None, None, None
a0566bbf 3562 for full_codec in split_codecs:
d816f61f 3563 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3564 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3565 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3566 if vcodec:
3567 continue
3568 vcodec = full_codec
3569 if parts[0] in ('dvh1', 'dvhe'):
3570 hdr = 'DV'
3571 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3572 hdr = 'HDR10'
3573 elif parts[:2] == ['vp9', '2']:
3574 hdr = 'HDR10'
71082216 3575 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
d816f61f 3576 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3577 acodec = acodec or full_codec
3578 elif parts[0] in ('stpp', 'wvtt'):
3579 scodec = scodec or full_codec
4f3c5e06 3580 else:
19a03940 3581 write_string(f'WARNING: Unknown codec {full_codec}\n')
3fe75fdc 3582 if vcodec or acodec or scodec:
4f3c5e06 3583 return {
3584 'vcodec': vcodec or 'none',
3585 'acodec': acodec or 'none',
176f1866 3586 'dynamic_range': hdr,
3fe75fdc 3587 **({'scodec': scodec} if scodec is not None else {}),
4f3c5e06 3588 }
b69fd25c 3589 elif len(split_codecs) == 2:
3590 return {
3591 'vcodec': split_codecs[0],
3592 'acodec': split_codecs[1],
3593 }
4f3c5e06 3594 return {}
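# Illustrative example (a sketch of the returned mapping):
#   >>> parse_codecs('avc1.64001f, mp4a.40.2')
#   {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}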
3595
3596
fc61aff4
LL
3597def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3598 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3599
3600 allow_mkv = not preferences or 'mkv' in preferences
3601
3602 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3603 return 'mkv' # TODO: any other format allows this?
3604
3605 # TODO: Not all codecs supported by parse_codecs are handled here
3606 COMPATIBLE_CODECS = {
3607 'mp4': {
71082216 3608 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
81b6102d 3609 'h264', 'aacl', 'ec-3', # Set in ISM
fc61aff4
LL
3610 },
3611 'webm': {
3612 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3613 'vp9x', 'vp8x', # in the webm spec
3614 },
3615 }
3616
8f84770a 3617 sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
3618 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
fc61aff4
LL
3619
3620 for ext in preferences or COMPATIBLE_CODECS.keys():
3621 codec_set = COMPATIBLE_CODECS.get(ext, set())
3622 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3623 return ext
3624
3625 COMPATIBLE_EXTS = (
3626 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3627 {'webm'},
3628 )
3629 for ext in preferences or vexts:
3630 current_exts = {ext, *vexts, *aexts}
3631 if ext == 'mkv' or current_exts == {ext} or any(
3632 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3633 return ext
3634 return 'mkv' if allow_mkv else preferences[-1]
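# Illustrative example (a sketch; AVC video plus AAC audio is compatible with mp4):
#   >>> get_compatible_ext(vcodecs=['avc1.64001f'], acodecs=['mp4a.40.2'], vexts=['mp4'], aexts=['m4a'])
#   'mp4'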
3635
3636
2ccd1b10 3637def urlhandle_detect_ext(url_handle):
79298173 3638 getheader = url_handle.headers.get
2ccd1b10 3639
b55ee18f
PH
3640 cd = getheader('Content-Disposition')
3641 if cd:
3642 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3643 if m:
3644 e = determine_ext(m.group('filename'), default_ext=None)
3645 if e:
3646 return e
3647
c460bdd5 3648 return mimetype2ext(getheader('Content-Type'))
05900629
PH
3649
3650
1e399778
YCH
3651def encode_data_uri(data, mime_type):
3652 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3653
3654
05900629 3655def age_restricted(content_limit, age_limit):
6ec6cb4e 3656 """ Returns True iff the content should be blocked """
05900629
PH
3657
3658 if age_limit is None: # No limit set
3659 return False
3660 if content_limit is None:
3661 return False # Content available for everyone
3662 return age_limit < content_limit
61ca9a80
PH
3663
3664
88f60feb 3665# List of known byte-order-marks (BOM)
a904a7f8
L
3666BOMS = [
3667 (b'\xef\xbb\xbf', 'utf-8'),
3668 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3669 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3670 (b'\xff\xfe', 'utf-16-le'),
3671 (b'\xfe\xff', 'utf-16-be'),
3672]
a904a7f8
L
3673
3674
61ca9a80
PH
3675def is_html(first_bytes):
3676 """ Detect whether a file contains HTML by examining its first bytes. """
3677
80e8493e 3678 encoding = 'utf-8'
61ca9a80 3679 for bom, enc in BOMS:
80e8493e 3680 while first_bytes.startswith(bom):
3681 encoding, first_bytes = enc, first_bytes[len(bom):]
61ca9a80 3682
80e8493e 3683 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
a055469f
PH
3684
3685
3686def determine_protocol(info_dict):
3687 protocol = info_dict.get('protocol')
3688 if protocol is not None:
3689 return protocol
3690
7de837a5 3691 url = sanitize_url(info_dict['url'])
a055469f
PH
3692 if url.startswith('rtmp'):
3693 return 'rtmp'
3694 elif url.startswith('mms'):
3695 return 'mms'
3696 elif url.startswith('rtsp'):
3697 return 'rtsp'
3698
3699 ext = determine_ext(url)
3700 if ext == 'm3u8':
deae7c17 3701 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
a055469f
PH
3702 elif ext == 'f4m':
3703 return 'f4m'
3704
14f25df2 3705 return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3706
3707
c5e3f849 3708def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3709 """ Render a list of rows, each as a list of values.
3710 Text after a \t will be right aligned """
ec11a9f4 3711 def width(string):
c5e3f849 3712 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3713
3714 def get_max_lens(table):
ec11a9f4 3715 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3716
3717 def filter_using_list(row, filterArray):
d16df59d 3718 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
76d321f6 3719
d16df59d 3720 max_lens = get_max_lens(data) if hide_empty else []
3721 header_row = filter_using_list(header_row, max_lens)
3722 data = [filter_using_list(row, max_lens) for row in data]
76d321f6 3723
cfb56d1a 3724 table = [header_row] + data
76d321f6 3725 max_lens = get_max_lens(table)
c5e3f849 3726 extra_gap += 1
76d321f6 3727 if delim:
c5e3f849 3728 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
1ed7953a 3729 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
ec11a9f4 3730 for row in table:
3731 for pos, text in enumerate(map(str, row)):
c5e3f849 3732 if '\t' in text:
3733 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3734 else:
3735 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3736 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3737 return ret
347de493
PH
3738
3739
8f18aca8 3740def _match_one(filter_part, dct, incomplete):
77b87f05 3741 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3742 STRING_OPERATORS = {
3743 '*=': operator.contains,
3744 '^=': lambda attr, value: attr.startswith(value),
3745 '$=': lambda attr, value: attr.endswith(value),
3746 '~=': lambda attr, value: re.search(value, attr),
3747 }
347de493 3748 COMPARISON_OPERATORS = {
a047eeb6 3749 **STRING_OPERATORS,
3750 '<=': operator.le, # "<=" must be defined above "<"
347de493 3751 '<': operator.lt,
347de493 3752 '>=': operator.ge,
a047eeb6 3753 '>': operator.gt,
347de493 3754 '=': operator.eq,
347de493 3755 }
a047eeb6 3756
6db9c4d5 3757 if isinstance(incomplete, bool):
3758 is_incomplete = lambda _: incomplete
3759 else:
3760 is_incomplete = lambda k: k in incomplete
3761
64fa820c 3762 operator_rex = re.compile(r'''(?x)
347de493 3763 (?P<key>[a-z_]+)
77b87f05 3764 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3765 (?:
a047eeb6 3766 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3767 (?P<strval>.+?)
347de493 3768 )
347de493 3769 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3770 m = operator_rex.fullmatch(filter_part.strip())
347de493 3771 if m:
18f96d12 3772 m = m.groupdict()
3773 unnegated_op = COMPARISON_OPERATORS[m['op']]
3774 if m['negation']:
77b87f05
MT
3775 op = lambda attr, value: not unnegated_op(attr, value)
3776 else:
3777 op = unnegated_op
18f96d12 3778 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3779 if m['quote']:
3780 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3781 actual_value = dct.get(m['key'])
3782 numeric_comparison = None
f9934b96 3783 if isinstance(actual_value, (int, float)):
e5a088dc
S
3784 # If the original field is a string and the matching comparison value is
3785 # a number, we should respect the origin of the original field
3786 # and process comparison value as a string (see
18f96d12 3787 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3788 try:
18f96d12 3789 numeric_comparison = int(comparison_value)
347de493 3790 except ValueError:
18f96d12 3791 numeric_comparison = parse_filesize(comparison_value)
3792 if numeric_comparison is None:
3793 numeric_comparison = parse_filesize(f'{comparison_value}B')
3794 if numeric_comparison is None:
3795 numeric_comparison = parse_duration(comparison_value)
3796 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3797 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3798 if actual_value is None:
6db9c4d5 3799 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3800 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3801
3802 UNARY_OPERATORS = {
1cc47c66
S
3803 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3804 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3805 }
64fa820c 3806 operator_rex = re.compile(r'''(?x)
347de493 3807 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3808 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3809 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3810 if m:
3811 op = UNARY_OPERATORS[m.group('op')]
3812 actual_value = dct.get(m.group('key'))
6db9c4d5 3813 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3814 return True
347de493
PH
3815 return op(actual_value)
3816
3817 raise ValueError('Invalid filter part %r' % filter_part)
3818
3819
8f18aca8 3820def match_str(filter_str, dct, incomplete=False):
6db9c4d5 3821 """ Filter a dictionary with a simple string syntax.
3822 @returns Whether the filter passes
3823 @param incomplete Set of keys that are expected to be missing from dct.
3824 Can be True/False to indicate all/none of the keys may be missing.
3825 All conditions on incomplete keys pass if the key is missing
8f18aca8 3826 """
347de493 3827 return all(
8f18aca8 3828 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3829 for filter_part in re.split(r'(?<!\\)&', filter_str))
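# Illustrative examples of the filter syntax (a sketch):
#   >>> match_str('like_count > 100 & duration < 600', {'like_count': 250, 'duration': 300})
#   True
#   >>> match_str('!is_live', {'is_live': False})
#   True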
347de493
PH
3830
3831
b1a7cd05 3832def match_filter_func(filters):
3833 if not filters:
d1b5f70b 3834 return None
492272fe 3835 filters = set(variadic(filters))
d1b5f70b 3836
492272fe 3837 interactive = '-' in filters
3838 if interactive:
3839 filters.remove('-')
3840
3841 def _match_func(info_dict, incomplete=False):
3842 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3843 return NO_DEFAULT if interactive and not incomplete else None
347de493 3844 else:
3bec830a 3845 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
b1a7cd05 3846 filter_str = ') | ('.join(map(str.strip, filters))
3847 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
347de493 3848 return _match_func
91410c9b
PH
3849
3850
f2df4071 3851class download_range_func:
3852 def __init__(self, chapters, ranges):
3853 self.chapters, self.ranges = chapters, ranges
3854
3855 def __call__(self, info_dict, ydl):
0500ee3d 3856 if not self.ranges and not self.chapters:
3857 yield {}
3858
5ec1b6b7 3859 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
56ba69e4 3860 else 'Cannot match chapters since chapter information is unavailable')
f2df4071 3861 for regex in self.chapters or []:
5ec1b6b7 3862 for i, chapter in enumerate(info_dict.get('chapters') or []):
3863 if re.search(regex, chapter['title']):
3864 warning = None
3865 yield {**chapter, 'index': i}
f2df4071 3866 if self.chapters and warning:
5ec1b6b7 3867 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3868
f2df4071 3869 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
5ec1b6b7 3870
f2df4071 3871 def __eq__(self, other):
3872 return (isinstance(other, download_range_func)
3873 and self.chapters == other.chapters and self.ranges == other.ranges)
5ec1b6b7 3874
71df9b7f 3875 def __repr__(self):
3876 return f'{type(self).__name__}({self.chapters}, {self.ranges})'
3877
5ec1b6b7 3878
bf6427d2
YCH
3879def parse_dfxp_time_expr(time_expr):
3880 if not time_expr:
d631d5f9 3881 return
bf6427d2 3882
1d485a1a 3883 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
bf6427d2
YCH
3884 if mobj:
3885 return float(mobj.group('time_offset'))
3886
db2fe38b 3887 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3888 if mobj:
db2fe38b 3889 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
3890
3891
c1c924ab 3892def srt_subtitles_timecode(seconds):
aa7785f8 3893 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3894
3895
3896def ass_subtitles_timecode(seconds):
3897 time = timetuple_from_msec(seconds * 1000)
3898 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
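# Illustrative examples (a sketch; relies on timetuple_from_msec defined earlier in this module):
#   >>> srt_subtitles_timecode(3661.5)
#   '01:01:01,500'
#   >>> ass_subtitles_timecode(3661.5)
#   '1:01:01.50'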
bf6427d2
YCH
3899
3900
3901def dfxp2srt(dfxp_data):
3869028f
YCH
3902 '''
3903 @param dfxp_data A bytes-like object containing DFXP data
3904 @returns A unicode object containing converted SRT data
3905 '''
5b995f71 3906 LEGACY_NAMESPACES = (
3869028f
YCH
3907 (b'http://www.w3.org/ns/ttml', [
3908 b'http://www.w3.org/2004/11/ttaf1',
3909 b'http://www.w3.org/2006/04/ttaf1',
3910 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3911 ]),
3869028f
YCH
3912 (b'http://www.w3.org/ns/ttml#styling', [
3913 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
3914 ]),
3915 )
3916
3917 SUPPORTED_STYLING = [
3918 'color',
3919 'fontFamily',
3920 'fontSize',
3921 'fontStyle',
3922 'fontWeight',
3923 'textDecoration'
3924 ]
3925
4e335771 3926 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 3927 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 3928 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 3929 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 3930 })
bf6427d2 3931
5b995f71
RA
3932 styles = {}
3933 default_style = {}
3934
86e5f3ed 3935 class TTMLPElementParser:
5b995f71
RA
3936 _out = ''
3937 _unclosed_elements = []
3938 _applied_styles = []
bf6427d2 3939
2b14cb56 3940 def start(self, tag, attrib):
5b995f71
RA
3941 if tag in (_x('ttml:br'), 'br'):
3942 self._out += '\n'
3943 else:
3944 unclosed_elements = []
3945 style = {}
3946 element_style_id = attrib.get('style')
3947 if default_style:
3948 style.update(default_style)
3949 if element_style_id:
3950 style.update(styles.get(element_style_id, {}))
3951 for prop in SUPPORTED_STYLING:
3952 prop_val = attrib.get(_x('tts:' + prop))
3953 if prop_val:
3954 style[prop] = prop_val
3955 if style:
3956 font = ''
3957 for k, v in sorted(style.items()):
3958 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3959 continue
3960 if k == 'color':
3961 font += ' color="%s"' % v
3962 elif k == 'fontSize':
3963 font += ' size="%s"' % v
3964 elif k == 'fontFamily':
3965 font += ' face="%s"' % v
3966 elif k == 'fontWeight' and v == 'bold':
3967 self._out += '<b>'
3968 unclosed_elements.append('b')
3969 elif k == 'fontStyle' and v == 'italic':
3970 self._out += '<i>'
3971 unclosed_elements.append('i')
3972 elif k == 'textDecoration' and v == 'underline':
3973 self._out += '<u>'
3974 unclosed_elements.append('u')
3975 if font:
3976 self._out += '<font' + font + '>'
3977 unclosed_elements.append('font')
3978 applied_style = {}
3979 if self._applied_styles:
3980 applied_style.update(self._applied_styles[-1])
3981 applied_style.update(style)
3982 self._applied_styles.append(applied_style)
3983 self._unclosed_elements.append(unclosed_elements)
bf6427d2 3984
2b14cb56 3985 def end(self, tag):
5b995f71
RA
3986 if tag not in (_x('ttml:br'), 'br'):
3987 unclosed_elements = self._unclosed_elements.pop()
3988 for element in reversed(unclosed_elements):
3989 self._out += '</%s>' % element
3990 if unclosed_elements and self._applied_styles:
3991 self._applied_styles.pop()
bf6427d2 3992
2b14cb56 3993 def data(self, data):
5b995f71 3994 self._out += data
2b14cb56 3995
3996 def close(self):
5b995f71 3997 return self._out.strip()
2b14cb56 3998
3999 def parse_node(node):
4000 target = TTMLPElementParser()
4001 parser = xml.etree.ElementTree.XMLParser(target=target)
4002 parser.feed(xml.etree.ElementTree.tostring(node))
4003 return parser.close()
bf6427d2 4004
5b995f71
RA
4005 for k, v in LEGACY_NAMESPACES:
4006 for ns in v:
4007 dfxp_data = dfxp_data.replace(ns, k)
4008
3869028f 4009 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 4010 out = []
5b995f71 4011 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
4012
4013 if not paras:
4014 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 4015
5b995f71
RA
4016 repeat = False
4017 while True:
4018 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
4019 style_id = style.get('id') or style.get(_x('xml:id'))
4020 if not style_id:
4021 continue
5b995f71
RA
4022 parent_style_id = style.get('style')
4023 if parent_style_id:
4024 if parent_style_id not in styles:
4025 repeat = True
4026 continue
4027 styles[style_id] = styles[parent_style_id].copy()
4028 for prop in SUPPORTED_STYLING:
4029 prop_val = style.get(_x('tts:' + prop))
4030 if prop_val:
4031 styles.setdefault(style_id, {})[prop] = prop_val
4032 if repeat:
4033 repeat = False
4034 else:
4035 break
4036
4037 for p in ('body', 'div'):
4038 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4039 if ele is None:
4040 continue
4041 style = styles.get(ele.get('style'))
4042 if not style:
4043 continue
4044 default_style.update(style)
4045
bf6427d2 4046 for para, index in zip(paras, itertools.count(1)):
d631d5f9 4047 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 4048 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
4049 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4050 if begin_time is None:
4051 continue
7dff0363 4052 if not end_time:
d631d5f9
YCH
4053 if not dur:
4054 continue
4055 end_time = begin_time + dur
bf6427d2
YCH
4056 out.append('%d\n%s --> %s\n%s\n\n' % (
4057 index,
c1c924ab
YCH
4058 srt_subtitles_timecode(begin_time),
4059 srt_subtitles_timecode(end_time),
bf6427d2
YCH
4060 parse_node(para)))
4061
4062 return ''.join(out)
4063
4064
c487cf00 4065def cli_option(params, command_option, param, separator=None):
66e289ba 4066 param = params.get(param)
c487cf00 4067 return ([] if param is None
4068 else [command_option, str(param)] if separator is None
4069 else [f'{command_option}{separator}{param}'])
66e289ba
S
4070
4071
4072def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4073 param = params.get(param)
c487cf00 4074 assert param in (True, False, None)
4075 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
66e289ba
S
4076
4077
4078def cli_valueless_option(params, command_option, param, expected_value=True):
c487cf00 4079 return [command_option] if params.get(param) == expected_value else []
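# Illustrative examples (a sketch; the option names and params keys below are made up):
#   >>> cli_option({'proxy': 'http://127.0.0.1:3128'}, '--proxy', 'proxy')
#   ['--proxy', 'http://127.0.0.1:3128']
#   >>> cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate', separator='=')
#   ['--no-check-certificate=true']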
66e289ba
S
4080
4081
e92caff5 4082def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 4083 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 4084 if use_compat:
5b1ecbb3 4085 return argdict
4086 else:
4087 argdict = None
eab9b2bc 4088 if argdict is None:
5b1ecbb3 4089 return default
eab9b2bc 4090 assert isinstance(argdict, dict)
4091
e92caff5 4092 assert isinstance(keys, (list, tuple))
4093 for key_list in keys:
e92caff5 4094 arg_list = list(filter(
4095 lambda x: x is not None,
6606817a 4096 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 4097 if arg_list:
4098 return [arg for args in arg_list for arg in args]
4099 return default
66e289ba 4100
6251555f 4101
330690a2 4102def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4103 main_key, exe = main_key.lower(), exe.lower()
4104 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4105 keys = [f'{root_key}{k}' for k in (keys or [''])]
4106 if root_key in keys:
4107 if main_key != exe:
4108 keys.append((main_key, exe))
4109 keys.append('default')
4110 else:
4111 use_compat = False
4112 return cli_configuration_args(argdict, keys, default, use_compat)
4113
66e289ba 4114
86e5f3ed 4115class ISO639Utils:
39672624
YCH
4116 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4117 _lang_map = {
4118 'aa': 'aar',
4119 'ab': 'abk',
4120 'ae': 'ave',
4121 'af': 'afr',
4122 'ak': 'aka',
4123 'am': 'amh',
4124 'an': 'arg',
4125 'ar': 'ara',
4126 'as': 'asm',
4127 'av': 'ava',
4128 'ay': 'aym',
4129 'az': 'aze',
4130 'ba': 'bak',
4131 'be': 'bel',
4132 'bg': 'bul',
4133 'bh': 'bih',
4134 'bi': 'bis',
4135 'bm': 'bam',
4136 'bn': 'ben',
4137 'bo': 'bod',
4138 'br': 'bre',
4139 'bs': 'bos',
4140 'ca': 'cat',
4141 'ce': 'che',
4142 'ch': 'cha',
4143 'co': 'cos',
4144 'cr': 'cre',
4145 'cs': 'ces',
4146 'cu': 'chu',
4147 'cv': 'chv',
4148 'cy': 'cym',
4149 'da': 'dan',
4150 'de': 'deu',
4151 'dv': 'div',
4152 'dz': 'dzo',
4153 'ee': 'ewe',
4154 'el': 'ell',
4155 'en': 'eng',
4156 'eo': 'epo',
4157 'es': 'spa',
4158 'et': 'est',
4159 'eu': 'eus',
4160 'fa': 'fas',
4161 'ff': 'ful',
4162 'fi': 'fin',
4163 'fj': 'fij',
4164 'fo': 'fao',
4165 'fr': 'fra',
4166 'fy': 'fry',
4167 'ga': 'gle',
4168 'gd': 'gla',
4169 'gl': 'glg',
4170 'gn': 'grn',
4171 'gu': 'guj',
4172 'gv': 'glv',
4173 'ha': 'hau',
4174 'he': 'heb',
b7acc835 4175 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
4176 'hi': 'hin',
4177 'ho': 'hmo',
4178 'hr': 'hrv',
4179 'ht': 'hat',
4180 'hu': 'hun',
4181 'hy': 'hye',
4182 'hz': 'her',
4183 'ia': 'ina',
4184 'id': 'ind',
b7acc835 4185 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
4186 'ie': 'ile',
4187 'ig': 'ibo',
4188 'ii': 'iii',
4189 'ik': 'ipk',
4190 'io': 'ido',
4191 'is': 'isl',
4192 'it': 'ita',
4193 'iu': 'iku',
4194 'ja': 'jpn',
4195 'jv': 'jav',
4196 'ka': 'kat',
4197 'kg': 'kon',
4198 'ki': 'kik',
4199 'kj': 'kua',
4200 'kk': 'kaz',
4201 'kl': 'kal',
4202 'km': 'khm',
4203 'kn': 'kan',
4204 'ko': 'kor',
4205 'kr': 'kau',
4206 'ks': 'kas',
4207 'ku': 'kur',
4208 'kv': 'kom',
4209 'kw': 'cor',
4210 'ky': 'kir',
4211 'la': 'lat',
4212 'lb': 'ltz',
4213 'lg': 'lug',
4214 'li': 'lim',
4215 'ln': 'lin',
4216 'lo': 'lao',
4217 'lt': 'lit',
4218 'lu': 'lub',
4219 'lv': 'lav',
4220 'mg': 'mlg',
4221 'mh': 'mah',
4222 'mi': 'mri',
4223 'mk': 'mkd',
4224 'ml': 'mal',
4225 'mn': 'mon',
4226 'mr': 'mar',
4227 'ms': 'msa',
4228 'mt': 'mlt',
4229 'my': 'mya',
4230 'na': 'nau',
4231 'nb': 'nob',
4232 'nd': 'nde',
4233 'ne': 'nep',
4234 'ng': 'ndo',
4235 'nl': 'nld',
4236 'nn': 'nno',
4237 'no': 'nor',
4238 'nr': 'nbl',
4239 'nv': 'nav',
4240 'ny': 'nya',
4241 'oc': 'oci',
4242 'oj': 'oji',
4243 'om': 'orm',
4244 'or': 'ori',
4245 'os': 'oss',
4246 'pa': 'pan',
4247 'pi': 'pli',
4248 'pl': 'pol',
4249 'ps': 'pus',
4250 'pt': 'por',
4251 'qu': 'que',
4252 'rm': 'roh',
4253 'rn': 'run',
4254 'ro': 'ron',
4255 'ru': 'rus',
4256 'rw': 'kin',
4257 'sa': 'san',
4258 'sc': 'srd',
4259 'sd': 'snd',
4260 'se': 'sme',
4261 'sg': 'sag',
4262 'si': 'sin',
4263 'sk': 'slk',
4264 'sl': 'slv',
4265 'sm': 'smo',
4266 'sn': 'sna',
4267 'so': 'som',
4268 'sq': 'sqi',
4269 'sr': 'srp',
4270 'ss': 'ssw',
4271 'st': 'sot',
4272 'su': 'sun',
4273 'sv': 'swe',
4274 'sw': 'swa',
4275 'ta': 'tam',
4276 'te': 'tel',
4277 'tg': 'tgk',
4278 'th': 'tha',
4279 'ti': 'tir',
4280 'tk': 'tuk',
4281 'tl': 'tgl',
4282 'tn': 'tsn',
4283 'to': 'ton',
4284 'tr': 'tur',
4285 'ts': 'tso',
4286 'tt': 'tat',
4287 'tw': 'twi',
4288 'ty': 'tah',
4289 'ug': 'uig',
4290 'uk': 'ukr',
4291 'ur': 'urd',
4292 'uz': 'uzb',
4293 've': 'ven',
4294 'vi': 'vie',
4295 'vo': 'vol',
4296 'wa': 'wln',
4297 'wo': 'wol',
4298 'xh': 'xho',
4299 'yi': 'yid',
e9a50fba 4300 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
4301 'yo': 'yor',
4302 'za': 'zha',
4303 'zh': 'zho',
4304 'zu': 'zul',
4305 }
4306
4307 @classmethod
4308 def short2long(cls, code):
4309 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4310 return cls._lang_map.get(code[:2])
4311
4312 @classmethod
4313 def long2short(cls, code):
4314 """Convert language code from ISO 639-2/T to ISO 639-1"""
4315 for short_name, long_name in cls._lang_map.items():
4316 if long_name == code:
4317 return short_name
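    # Illustrative examples (a sketch):
    #   >>> ISO639Utils.short2long('fr')
    #   'fra'
    #   >>> ISO639Utils.long2short('deu')
    #   'de'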
4318
4319
86e5f3ed 4320class ISO3166Utils:
4eb10f66
YCH
4321 # From http://data.okfn.org/data/core/country-list
4322 _country_map = {
4323 'AF': 'Afghanistan',
4324 'AX': 'Åland Islands',
4325 'AL': 'Albania',
4326 'DZ': 'Algeria',
4327 'AS': 'American Samoa',
4328 'AD': 'Andorra',
4329 'AO': 'Angola',
4330 'AI': 'Anguilla',
4331 'AQ': 'Antarctica',
4332 'AG': 'Antigua and Barbuda',
4333 'AR': 'Argentina',
4334 'AM': 'Armenia',
4335 'AW': 'Aruba',
4336 'AU': 'Australia',
4337 'AT': 'Austria',
4338 'AZ': 'Azerbaijan',
4339 'BS': 'Bahamas',
4340 'BH': 'Bahrain',
4341 'BD': 'Bangladesh',
4342 'BB': 'Barbados',
4343 'BY': 'Belarus',
4344 'BE': 'Belgium',
4345 'BZ': 'Belize',
4346 'BJ': 'Benin',
4347 'BM': 'Bermuda',
4348 'BT': 'Bhutan',
4349 'BO': 'Bolivia, Plurinational State of',
4350 'BQ': 'Bonaire, Sint Eustatius and Saba',
4351 'BA': 'Bosnia and Herzegovina',
4352 'BW': 'Botswana',
4353 'BV': 'Bouvet Island',
4354 'BR': 'Brazil',
4355 'IO': 'British Indian Ocean Territory',
4356 'BN': 'Brunei Darussalam',
4357 'BG': 'Bulgaria',
4358 'BF': 'Burkina Faso',
4359 'BI': 'Burundi',
4360 'KH': 'Cambodia',
4361 'CM': 'Cameroon',
4362 'CA': 'Canada',
4363 'CV': 'Cape Verde',
4364 'KY': 'Cayman Islands',
4365 'CF': 'Central African Republic',
4366 'TD': 'Chad',
4367 'CL': 'Chile',
4368 'CN': 'China',
4369 'CX': 'Christmas Island',
4370 'CC': 'Cocos (Keeling) Islands',
4371 'CO': 'Colombia',
4372 'KM': 'Comoros',
4373 'CG': 'Congo',
4374 'CD': 'Congo, the Democratic Republic of the',
4375 'CK': 'Cook Islands',
4376 'CR': 'Costa Rica',
4377 'CI': 'Côte d\'Ivoire',
4378 'HR': 'Croatia',
4379 'CU': 'Cuba',
4380 'CW': 'Curaçao',
4381 'CY': 'Cyprus',
4382 'CZ': 'Czech Republic',
4383 'DK': 'Denmark',
4384 'DJ': 'Djibouti',
4385 'DM': 'Dominica',
4386 'DO': 'Dominican Republic',
4387 'EC': 'Ecuador',
4388 'EG': 'Egypt',
4389 'SV': 'El Salvador',
4390 'GQ': 'Equatorial Guinea',
4391 'ER': 'Eritrea',
4392 'EE': 'Estonia',
4393 'ET': 'Ethiopia',
4394 'FK': 'Falkland Islands (Malvinas)',
4395 'FO': 'Faroe Islands',
4396 'FJ': 'Fiji',
4397 'FI': 'Finland',
4398 'FR': 'France',
4399 'GF': 'French Guiana',
4400 'PF': 'French Polynesia',
4401 'TF': 'French Southern Territories',
4402 'GA': 'Gabon',
4403 'GM': 'Gambia',
4404 'GE': 'Georgia',
4405 'DE': 'Germany',
4406 'GH': 'Ghana',
4407 'GI': 'Gibraltar',
4408 'GR': 'Greece',
4409 'GL': 'Greenland',
4410 'GD': 'Grenada',
4411 'GP': 'Guadeloupe',
4412 'GU': 'Guam',
4413 'GT': 'Guatemala',
4414 'GG': 'Guernsey',
4415 'GN': 'Guinea',
4416 'GW': 'Guinea-Bissau',
4417 'GY': 'Guyana',
4418 'HT': 'Haiti',
4419 'HM': 'Heard Island and McDonald Islands',
4420 'VA': 'Holy See (Vatican City State)',
4421 'HN': 'Honduras',
4422 'HK': 'Hong Kong',
4423 'HU': 'Hungary',
4424 'IS': 'Iceland',
4425 'IN': 'India',
4426 'ID': 'Indonesia',
4427 'IR': 'Iran, Islamic Republic of',
4428 'IQ': 'Iraq',
4429 'IE': 'Ireland',
4430 'IM': 'Isle of Man',
4431 'IL': 'Israel',
4432 'IT': 'Italy',
4433 'JM': 'Jamaica',
4434 'JP': 'Japan',
4435 'JE': 'Jersey',
4436 'JO': 'Jordan',
4437 'KZ': 'Kazakhstan',
4438 'KE': 'Kenya',
4439 'KI': 'Kiribati',
4440 'KP': 'Korea, Democratic People\'s Republic of',
4441 'KR': 'Korea, Republic of',
4442 'KW': 'Kuwait',
4443 'KG': 'Kyrgyzstan',
4444 'LA': 'Lao People\'s Democratic Republic',
4445 'LV': 'Latvia',
4446 'LB': 'Lebanon',
4447 'LS': 'Lesotho',
4448 'LR': 'Liberia',
4449 'LY': 'Libya',
4450 'LI': 'Liechtenstein',
4451 'LT': 'Lithuania',
4452 'LU': 'Luxembourg',
4453 'MO': 'Macao',
4454 'MK': 'Macedonia, the Former Yugoslav Republic of',
4455 'MG': 'Madagascar',
4456 'MW': 'Malawi',
4457 'MY': 'Malaysia',
4458 'MV': 'Maldives',
4459 'ML': 'Mali',
4460 'MT': 'Malta',
4461 'MH': 'Marshall Islands',
4462 'MQ': 'Martinique',
4463 'MR': 'Mauritania',
4464 'MU': 'Mauritius',
4465 'YT': 'Mayotte',
4466 'MX': 'Mexico',
4467 'FM': 'Micronesia, Federated States of',
4468 'MD': 'Moldova, Republic of',
4469 'MC': 'Monaco',
4470 'MN': 'Mongolia',
4471 'ME': 'Montenegro',
4472 'MS': 'Montserrat',
4473 'MA': 'Morocco',
4474 'MZ': 'Mozambique',
4475 'MM': 'Myanmar',
4476 'NA': 'Namibia',
4477 'NR': 'Nauru',
4478 'NP': 'Nepal',
4479 'NL': 'Netherlands',
4480 'NC': 'New Caledonia',
4481 'NZ': 'New Zealand',
4482 'NI': 'Nicaragua',
4483 'NE': 'Niger',
4484 'NG': 'Nigeria',
4485 'NU': 'Niue',
4486 'NF': 'Norfolk Island',
4487 'MP': 'Northern Mariana Islands',
4488 'NO': 'Norway',
4489 'OM': 'Oman',
4490 'PK': 'Pakistan',
4491 'PW': 'Palau',
4492 'PS': 'Palestine, State of',
4493 'PA': 'Panama',
4494 'PG': 'Papua New Guinea',
4495 'PY': 'Paraguay',
4496 'PE': 'Peru',
4497 'PH': 'Philippines',
4498 'PN': 'Pitcairn',
4499 'PL': 'Poland',
4500 'PT': 'Portugal',
4501 'PR': 'Puerto Rico',
4502 'QA': 'Qatar',
4503 'RE': 'Réunion',
4504 'RO': 'Romania',
4505 'RU': 'Russian Federation',
4506 'RW': 'Rwanda',
4507 'BL': 'Saint Barthélemy',
4508 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4509 'KN': 'Saint Kitts and Nevis',
4510 'LC': 'Saint Lucia',
4511 'MF': 'Saint Martin (French part)',
4512 'PM': 'Saint Pierre and Miquelon',
4513 'VC': 'Saint Vincent and the Grenadines',
4514 'WS': 'Samoa',
4515 'SM': 'San Marino',
4516 'ST': 'Sao Tome and Principe',
4517 'SA': 'Saudi Arabia',
4518 'SN': 'Senegal',
4519 'RS': 'Serbia',
4520 'SC': 'Seychelles',
4521 'SL': 'Sierra Leone',
4522 'SG': 'Singapore',
4523 'SX': 'Sint Maarten (Dutch part)',
4524 'SK': 'Slovakia',
4525 'SI': 'Slovenia',
4526 'SB': 'Solomon Islands',
4527 'SO': 'Somalia',
4528 'ZA': 'South Africa',
4529 'GS': 'South Georgia and the South Sandwich Islands',
4530 'SS': 'South Sudan',
4531 'ES': 'Spain',
4532 'LK': 'Sri Lanka',
4533 'SD': 'Sudan',
4534 'SR': 'Suriname',
4535 'SJ': 'Svalbard and Jan Mayen',
4536 'SZ': 'Swaziland',
4537 'SE': 'Sweden',
4538 'CH': 'Switzerland',
4539 'SY': 'Syrian Arab Republic',
4540 'TW': 'Taiwan, Province of China',
4541 'TJ': 'Tajikistan',
4542 'TZ': 'Tanzania, United Republic of',
4543 'TH': 'Thailand',
4544 'TL': 'Timor-Leste',
4545 'TG': 'Togo',
4546 'TK': 'Tokelau',
4547 'TO': 'Tonga',
4548 'TT': 'Trinidad and Tobago',
4549 'TN': 'Tunisia',
4550 'TR': 'Turkey',
4551 'TM': 'Turkmenistan',
4552 'TC': 'Turks and Caicos Islands',
4553 'TV': 'Tuvalu',
4554 'UG': 'Uganda',
4555 'UA': 'Ukraine',
4556 'AE': 'United Arab Emirates',
4557 'GB': 'United Kingdom',
4558 'US': 'United States',
4559 'UM': 'United States Minor Outlying Islands',
4560 'UY': 'Uruguay',
4561 'UZ': 'Uzbekistan',
4562 'VU': 'Vanuatu',
4563 'VE': 'Venezuela, Bolivarian Republic of',
4564 'VN': 'Viet Nam',
4565 'VG': 'Virgin Islands, British',
4566 'VI': 'Virgin Islands, U.S.',
4567 'WF': 'Wallis and Futuna',
4568 'EH': 'Western Sahara',
4569 'YE': 'Yemen',
4570 'ZM': 'Zambia',
4571 'ZW': 'Zimbabwe',
2f97cc61 4572 # Not ISO 3166 codes, but used for IP blocks
4573 'AP': 'Asia/Pacific Region',
4574 'EU': 'Europe',
4eb10f66
YCH
4575 }
4576
4577 @classmethod
4578 def short2full(cls, code):
4579 """Convert an ISO 3166-2 country code to the corresponding full name"""
4580 return cls._country_map.get(code.upper())
4581
4582
86e5f3ed 4583class GeoUtils:
773f291d
S
4584 # Major IPv4 address blocks per country
4585 _country_ip_map = {
53896ca5 4586 'AD': '46.172.224.0/19',
773f291d
S
4587 'AE': '94.200.0.0/13',
4588 'AF': '149.54.0.0/17',
4589 'AG': '209.59.64.0/18',
4590 'AI': '204.14.248.0/21',
4591 'AL': '46.99.0.0/16',
4592 'AM': '46.70.0.0/15',
4593 'AO': '105.168.0.0/13',
53896ca5
S
4594 'AP': '182.50.184.0/21',
4595 'AQ': '23.154.160.0/24',
773f291d
S
4596 'AR': '181.0.0.0/12',
4597 'AS': '202.70.112.0/20',
53896ca5 4598 'AT': '77.116.0.0/14',
773f291d
S
4599 'AU': '1.128.0.0/11',
4600 'AW': '181.41.0.0/18',
53896ca5
S
4601 'AX': '185.217.4.0/22',
4602 'AZ': '5.197.0.0/16',
773f291d
S
4603 'BA': '31.176.128.0/17',
4604 'BB': '65.48.128.0/17',
4605 'BD': '114.130.0.0/16',
4606 'BE': '57.0.0.0/8',
53896ca5 4607 'BF': '102.178.0.0/15',
773f291d
S
4608 'BG': '95.42.0.0/15',
4609 'BH': '37.131.0.0/17',
4610 'BI': '154.117.192.0/18',
4611 'BJ': '137.255.0.0/16',
53896ca5 4612 'BL': '185.212.72.0/23',
773f291d
S
4613 'BM': '196.12.64.0/18',
4614 'BN': '156.31.0.0/16',
4615 'BO': '161.56.0.0/16',
4616 'BQ': '161.0.80.0/20',
53896ca5 4617 'BR': '191.128.0.0/12',
773f291d
S
4618 'BS': '24.51.64.0/18',
4619 'BT': '119.2.96.0/19',
4620 'BW': '168.167.0.0/16',
4621 'BY': '178.120.0.0/13',
4622 'BZ': '179.42.192.0/18',
4623 'CA': '99.224.0.0/11',
4624 'CD': '41.243.0.0/16',
53896ca5
S
4625 'CF': '197.242.176.0/21',
4626 'CG': '160.113.0.0/16',
773f291d 4627 'CH': '85.0.0.0/13',
53896ca5 4628 'CI': '102.136.0.0/14',
773f291d
S
4629 'CK': '202.65.32.0/19',
4630 'CL': '152.172.0.0/14',
53896ca5 4631 'CM': '102.244.0.0/14',
773f291d
S
4632 'CN': '36.128.0.0/10',
4633 'CO': '181.240.0.0/12',
4634 'CR': '201.192.0.0/12',
4635 'CU': '152.206.0.0/15',
4636 'CV': '165.90.96.0/19',
4637 'CW': '190.88.128.0/17',
53896ca5 4638 'CY': '31.153.0.0/16',
773f291d
S
4639 'CZ': '88.100.0.0/14',
4640 'DE': '53.0.0.0/8',
4641 'DJ': '197.241.0.0/17',
4642 'DK': '87.48.0.0/12',
4643 'DM': '192.243.48.0/20',
4644 'DO': '152.166.0.0/15',
4645 'DZ': '41.96.0.0/12',
4646 'EC': '186.68.0.0/15',
4647 'EE': '90.190.0.0/15',
4648 'EG': '156.160.0.0/11',
4649 'ER': '196.200.96.0/20',
4650 'ES': '88.0.0.0/11',
4651 'ET': '196.188.0.0/14',
4652 'EU': '2.16.0.0/13',
4653 'FI': '91.152.0.0/13',
4654 'FJ': '144.120.0.0/16',
53896ca5 4655 'FK': '80.73.208.0/21',
773f291d
S
4656 'FM': '119.252.112.0/20',
4657 'FO': '88.85.32.0/19',
4658 'FR': '90.0.0.0/9',
4659 'GA': '41.158.0.0/15',
4660 'GB': '25.0.0.0/8',
4661 'GD': '74.122.88.0/21',
4662 'GE': '31.146.0.0/16',
4663 'GF': '161.22.64.0/18',
4664 'GG': '62.68.160.0/19',
53896ca5
S
4665 'GH': '154.160.0.0/12',
4666 'GI': '95.164.0.0/16',
773f291d
S
4667 'GL': '88.83.0.0/19',
4668 'GM': '160.182.0.0/15',
4669 'GN': '197.149.192.0/18',
4670 'GP': '104.250.0.0/19',
4671 'GQ': '105.235.224.0/20',
4672 'GR': '94.64.0.0/13',
4673 'GT': '168.234.0.0/16',
4674 'GU': '168.123.0.0/16',
4675 'GW': '197.214.80.0/20',
4676 'GY': '181.41.64.0/18',
4677 'HK': '113.252.0.0/14',
4678 'HN': '181.210.0.0/16',
4679 'HR': '93.136.0.0/13',
4680 'HT': '148.102.128.0/17',
4681 'HU': '84.0.0.0/14',
4682 'ID': '39.192.0.0/10',
4683 'IE': '87.32.0.0/12',
4684 'IL': '79.176.0.0/13',
4685 'IM': '5.62.80.0/20',
4686 'IN': '117.192.0.0/10',
4687 'IO': '203.83.48.0/21',
4688 'IQ': '37.236.0.0/14',
4689 'IR': '2.176.0.0/12',
4690 'IS': '82.221.0.0/16',
4691 'IT': '79.0.0.0/10',
4692 'JE': '87.244.64.0/18',
4693 'JM': '72.27.0.0/17',
4694 'JO': '176.29.0.0/16',
53896ca5 4695 'JP': '133.0.0.0/8',
773f291d
S
4696 'KE': '105.48.0.0/12',
4697 'KG': '158.181.128.0/17',
4698 'KH': '36.37.128.0/17',
4699 'KI': '103.25.140.0/22',
4700 'KM': '197.255.224.0/20',
53896ca5 4701 'KN': '198.167.192.0/19',
773f291d
S
4702 'KP': '175.45.176.0/22',
4703 'KR': '175.192.0.0/10',
4704 'KW': '37.36.0.0/14',
4705 'KY': '64.96.0.0/15',
4706 'KZ': '2.72.0.0/13',
4707 'LA': '115.84.64.0/18',
4708 'LB': '178.135.0.0/16',
53896ca5 4709 'LC': '24.92.144.0/20',
773f291d
S
4710 'LI': '82.117.0.0/19',
4711 'LK': '112.134.0.0/15',
53896ca5 4712 'LR': '102.183.0.0/16',
773f291d
S
4713 'LS': '129.232.0.0/17',
4714 'LT': '78.56.0.0/13',
4715 'LU': '188.42.0.0/16',
4716 'LV': '46.109.0.0/16',
4717 'LY': '41.252.0.0/14',
4718 'MA': '105.128.0.0/11',
4719 'MC': '88.209.64.0/18',
4720 'MD': '37.246.0.0/16',
4721 'ME': '178.175.0.0/17',
4722 'MF': '74.112.232.0/21',
4723 'MG': '154.126.0.0/17',
4724 'MH': '117.103.88.0/21',
4725 'MK': '77.28.0.0/15',
4726 'ML': '154.118.128.0/18',
4727 'MM': '37.111.0.0/17',
4728 'MN': '49.0.128.0/17',
4729 'MO': '60.246.0.0/16',
4730 'MP': '202.88.64.0/20',
4731 'MQ': '109.203.224.0/19',
4732 'MR': '41.188.64.0/18',
4733 'MS': '208.90.112.0/22',
4734 'MT': '46.11.0.0/16',
4735 'MU': '105.16.0.0/12',
4736 'MV': '27.114.128.0/18',
53896ca5 4737 'MW': '102.70.0.0/15',
773f291d
S
4738 'MX': '187.192.0.0/11',
4739 'MY': '175.136.0.0/13',
4740 'MZ': '197.218.0.0/15',
4741 'NA': '41.182.0.0/16',
4742 'NC': '101.101.0.0/18',
4743 'NE': '197.214.0.0/18',
4744 'NF': '203.17.240.0/22',
4745 'NG': '105.112.0.0/12',
4746 'NI': '186.76.0.0/15',
4747 'NL': '145.96.0.0/11',
4748 'NO': '84.208.0.0/13',
4749 'NP': '36.252.0.0/15',
4750 'NR': '203.98.224.0/19',
4751 'NU': '49.156.48.0/22',
4752 'NZ': '49.224.0.0/14',
4753 'OM': '5.36.0.0/15',
4754 'PA': '186.72.0.0/15',
4755 'PE': '186.160.0.0/14',
4756 'PF': '123.50.64.0/18',
4757 'PG': '124.240.192.0/19',
4758 'PH': '49.144.0.0/13',
4759 'PK': '39.32.0.0/11',
4760 'PL': '83.0.0.0/11',
4761 'PM': '70.36.0.0/20',
4762 'PR': '66.50.0.0/16',
4763 'PS': '188.161.0.0/16',
4764 'PT': '85.240.0.0/13',
4765 'PW': '202.124.224.0/20',
4766 'PY': '181.120.0.0/14',
4767 'QA': '37.210.0.0/15',
53896ca5 4768 'RE': '102.35.0.0/16',
773f291d 4769 'RO': '79.112.0.0/13',
53896ca5 4770 'RS': '93.86.0.0/15',
773f291d 4771 'RU': '5.136.0.0/13',
53896ca5 4772 'RW': '41.186.0.0/16',
773f291d
S
4773 'SA': '188.48.0.0/13',
4774 'SB': '202.1.160.0/19',
4775 'SC': '154.192.0.0/11',
53896ca5 4776 'SD': '102.120.0.0/13',
773f291d 4777 'SE': '78.64.0.0/12',
53896ca5 4778 'SG': '8.128.0.0/10',
773f291d
S
4779 'SI': '188.196.0.0/14',
4780 'SK': '78.98.0.0/15',
53896ca5 4781 'SL': '102.143.0.0/17',
773f291d
S
4782 'SM': '89.186.32.0/19',
4783 'SN': '41.82.0.0/15',
53896ca5 4784 'SO': '154.115.192.0/18',
773f291d
S
4785 'SR': '186.179.128.0/17',
4786 'SS': '105.235.208.0/21',
4787 'ST': '197.159.160.0/19',
4788 'SV': '168.243.0.0/16',
4789 'SX': '190.102.0.0/20',
4790 'SY': '5.0.0.0/16',
4791 'SZ': '41.84.224.0/19',
4792 'TC': '65.255.48.0/20',
4793 'TD': '154.68.128.0/19',
4794 'TG': '196.168.0.0/14',
4795 'TH': '171.96.0.0/13',
4796 'TJ': '85.9.128.0/18',
4797 'TK': '27.96.24.0/21',
4798 'TL': '180.189.160.0/20',
4799 'TM': '95.85.96.0/19',
4800 'TN': '197.0.0.0/11',
4801 'TO': '175.176.144.0/21',
4802 'TR': '78.160.0.0/11',
4803 'TT': '186.44.0.0/15',
4804 'TV': '202.2.96.0/19',
4805 'TW': '120.96.0.0/11',
4806 'TZ': '156.156.0.0/14',
53896ca5
S
4807 'UA': '37.52.0.0/14',
4808 'UG': '102.80.0.0/13',
4809 'US': '6.0.0.0/8',
773f291d 4810 'UY': '167.56.0.0/13',
53896ca5 4811 'UZ': '84.54.64.0/18',
773f291d 4812 'VA': '212.77.0.0/19',
53896ca5 4813 'VC': '207.191.240.0/21',
773f291d 4814 'VE': '186.88.0.0/13',
53896ca5 4815 'VG': '66.81.192.0/20',
773f291d
S
4816 'VI': '146.226.0.0/16',
4817 'VN': '14.160.0.0/11',
4818 'VU': '202.80.32.0/20',
4819 'WF': '117.20.32.0/21',
4820 'WS': '202.4.32.0/19',
4821 'YE': '134.35.0.0/16',
4822 'YT': '41.242.116.0/22',
4823 'ZA': '41.0.0.0/11',
53896ca5
S
4824 'ZM': '102.144.0.0/13',
4825 'ZW': '102.177.192.0/18',
773f291d
S
4826 }
4827
4828 @classmethod
5f95927a
S
4829 def random_ipv4(cls, code_or_block):
4830 if len(code_or_block) == 2:
4831 block = cls._country_ip_map.get(code_or_block.upper())
4832 if not block:
4833 return None
4834 else:
4835 block = code_or_block
773f291d 4836 addr, preflen = block.split('/')
ac668111 4837 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4838 addr_max = addr_min | (0xffffffff >> int(preflen))
14f25df2 4839 return str(socket.inet_ntoa(
ac668111 4840 struct.pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4841
4842
ac668111 4843class PerRequestProxyHandler(urllib.request.ProxyHandler):
2461f79d
PH
4844 def __init__(self, proxies=None):
4845 # Set default handlers
4846 for type in ('http', 'https'):
4847 setattr(self, '%s_open' % type,
4848 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4849 meth(r, proxy, type))
ac668111 4850 urllib.request.ProxyHandler.__init__(self, proxies)
2461f79d 4851
91410c9b 4852 def proxy_open(self, req, proxy, type):
2461f79d 4853 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
4854 if req_proxy is not None:
4855 proxy = req_proxy
2461f79d
PH
4856 del req.headers['Ytdl-request-proxy']
4857
4858 if proxy == '__noproxy__':
4859 return None # No Proxy
14f25df2 4860 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188 4861 req.add_header('Ytdl-socks-proxy', proxy)
7a5c1cfe 4862 # yt-dlp's http/https handlers do wrapping the socket with socks
71aff188 4863 return None
ac668111 4864 return urllib.request.ProxyHandler.proxy_open(
91410c9b 4865 self, req, proxy, type)
5bc880b9
YCH
4866
4867
0a5445dd
YCH
4868# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4869# released into Public Domain
4870# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4871
4872def long_to_bytes(n, blocksize=0):
4873 """long_to_bytes(n:long, blocksize:int) : string
4874 Convert a long integer to a byte string.
4875
4876 If optional blocksize is given and greater than zero, pad the front of the
4877 byte string with binary zeros so that the length is a multiple of
4878 blocksize.
4879 """
4880 # after much testing, this algorithm was deemed to be the fastest
4881 s = b''
4882 n = int(n)
4883 while n > 0:
ac668111 4884 s = struct.pack('>I', n & 0xffffffff) + s
0a5445dd
YCH
4885 n = n >> 32
4886 # strip off leading zeros
4887 for i in range(len(s)):
4888 if s[i] != b'\000'[0]:
4889 break
4890 else:
4891 # only happens when n == 0
4892 s = b'\000'
4893 i = 0
4894 s = s[i:]
4895 # add back some pad bytes. this could be done more efficiently w.r.t. the
4896 # de-padding being done above, but sigh...
4897 if blocksize > 0 and len(s) % blocksize:
4898 s = (blocksize - len(s) % blocksize) * b'\000' + s
4899 return s
4900
4901
4902def bytes_to_long(s):
4903 """bytes_to_long(string) : long
4904 Convert a byte string to a long integer.
4905
4906 This is (essentially) the inverse of long_to_bytes().
4907 """
4908 acc = 0
4909 length = len(s)
4910 if length % 4:
4911 extra = (4 - length % 4)
4912 s = b'\000' * extra + s
4913 length = length + extra
4914 for i in range(0, length, 4):
ac668111 4915 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
0a5445dd
YCH
4916 return acc
4917
4918
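# Illustrative round-trip of the two helpers above (not part of the original source):
#   >>> long_to_bytes(65537, blocksize=4)
#   b'\x00\x01\x00\x01'
#   >>> bytes_to_long(b'\x00\x01\x00\x01')
#   65537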
5bc880b9
YCH
4919def ohdave_rsa_encrypt(data, exponent, modulus):
4920 '''
4921 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4922
4923 Input:
4924 data: data to encrypt, bytes-like object
4925 exponent, modulus: parameter e and N of RSA algorithm, both integer
4926 Output: hex string of encrypted data
4927
4928 Limitation: supports one block encryption only
4929 '''
4930
4931 payload = int(binascii.hexlify(data[::-1]), 16)
4932 encrypted = pow(payload, exponent, modulus)
4933 return '%x' % encrypted
81bdc8fd
YCH
4934
4935
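# Toy example with deliberately tiny, unrealistic parameters (not part of the
# original source; real callers pass the site's published exponent and modulus):
#   >>> ohdave_rsa_encrypt(b'\x02', 3, 101)  # pow(2, 3, 101) == 8
#   '8'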
f48409c7
YCH
4936def pkcs1pad(data, length):
4937 """
4938 Padding input data with PKCS#1 scheme
4939
4940 @param {int[]} data input data
4941 @param {int} length target length
4942 @returns {int[]} padded data
4943 """
4944 if len(data) > length - 11:
4945 raise ValueError('Input data too long for PKCS#1 padding')
4946
4947 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4948 return [0, 2] + pseudo_random + [0] + data
4949
4950
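# Illustrative shape of the padded output (not part of the original source); the
# middle bytes are pseudo-random, so only the structure is fixed:
#   pkcs1pad([1, 2, 3], 16)  ->  [0, 2, <10 pseudo-random ints>, 0, 1, 2, 3]  (16 items)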
7b2c3f47 4951def _base_n_table(n, table):
4952 if not table and not n:
4953 raise ValueError('Either table or n must be specified')
612f2be5 4954 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4955
44f14eb4 4956 if n and n != len(table):
612f2be5 4957 raise ValueError(f'base {n} exceeds table length {len(table)}')
4958 return table
59f898b7 4959
5eb6bdce 4960
7b2c3f47 4961def encode_base_n(num, n=None, table=None):
4962 """Convert given int to a base-n string"""
612f2be5 4963 table = _base_n_table(n, table)
7b2c3f47 4964 if not num:
5eb6bdce
YCH
4965 return table[0]
4966
7b2c3f47 4967 result, base = '', len(table)
81bdc8fd 4968 while num:
7b2c3f47 4969 result = table[num % base] + result
612f2be5 4970 num = num // base
7b2c3f47 4971 return result
4972
4973
4974def decode_base_n(string, n=None, table=None):
4975 """Convert given base-n string to int"""
4976 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4977 result, base = 0, len(table)
4978 for char in string:
4979 result = result * base + table[char]
4980 return result
4981
4982
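# Illustrative usage of encode_base_n/decode_base_n (not part of the original source):
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> decode_base_n('ff', 16)
#   255
#   >>> encode_base_n(255, table='01')  # a custom table implies the base
#   '11111111'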
4983def decode_base(value, digits):
da4db748 4984 deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed '
4985 f'in a future version. Use {__name__}.decode_base_n instead')
7b2c3f47 4986 return decode_base_n(value, table=digits)
f52354a8
YCH
4987
4988
4989def decode_packed_codes(code):
06b3fe29 4990 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 4991 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
4992 base = int(base)
4993 count = int(count)
4994 symbols = symbols.split('|')
4995 symbol_table = {}
4996
4997 while count:
4998 count -= 1
5eb6bdce 4999 base_n_count = encode_base_n(count, base)
f52354a8
YCH
5000 symbol_table[base_n_count] = symbols[count] or base_n_count
5001
5002 return re.sub(
5003 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 5004 obfuscated_code)
e154c651 5005
5006
1ced2221
S
5007def caesar(s, alphabet, shift):
5008 if shift == 0:
5009 return s
5010 l = len(alphabet)
5011 return ''.join(
5012 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
5013 for c in s)
5014
5015
5016def rot47(s):
5017 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
5018
5019
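# Illustrative usage (not part of the original source):
#   >>> caesar('abc', 'abcdefghijklmnopqrstuvwxyz', 1)
#   'bcd'
#   >>> rot47('hello')  # rot47 is its own inverse
#   '96==@'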
e154c651 5020def parse_m3u8_attributes(attrib):
5021 info = {}
5022 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
5023 if val.startswith('"'):
5024 val = val[1:-1]
5025 info[key] = val
5026 return info
1143535d
YCH
5027
5028
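# Illustrative usage (not part of the original source; the attribute string is a
# typical EXT-X-STREAM-INF payload):
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401f"')
#   {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401f'}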
5029def urshift(val, n):
5030 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
5031
5032
5033# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 5034# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
5035def decode_png(png_data):
5036 # Reference: https://www.w3.org/TR/PNG/
5037 header = png_data[8:]
5038
5039 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
86e5f3ed 5040 raise OSError('Not a valid PNG file.')
d3f8e038
YCH
5041
5042 int_map = {1: '>B', 2: '>H', 4: '>I'}
ac668111 5043 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
d3f8e038
YCH
5044
5045 chunks = []
5046
5047 while header:
5048 length = unpack_integer(header[:4])
5049 header = header[4:]
5050
5051 chunk_type = header[:4]
5052 header = header[4:]
5053
5054 chunk_data = header[:length]
5055 header = header[length:]
5056
5057 header = header[4:] # Skip CRC
5058
5059 chunks.append({
5060 'type': chunk_type,
5061 'length': length,
5062 'data': chunk_data
5063 })
5064
5065 ihdr = chunks[0]['data']
5066
5067 width = unpack_integer(ihdr[:4])
5068 height = unpack_integer(ihdr[4:8])
5069
5070 idat = b''
5071
5072 for chunk in chunks:
5073 if chunk['type'] == b'IDAT':
5074 idat += chunk['data']
5075
5076 if not idat:
86e5f3ed 5077 raise OSError('Unable to read PNG data.')
d3f8e038
YCH
5078
5079 decompressed_data = bytearray(zlib.decompress(idat))
5080
5081 stride = width * 3
5082 pixels = []
5083
5084 def _get_pixel(idx):
5085 x = idx % stride
5086 y = idx // stride
5087 return pixels[y][x]
5088
5089 for y in range(height):
5090 basePos = y * (1 + stride)
5091 filter_type = decompressed_data[basePos]
5092
5093 current_row = []
5094
5095 pixels.append(current_row)
5096
5097 for x in range(stride):
5098 color = decompressed_data[1 + basePos + x]
5099 basex = y * stride + x
5100 left = 0
5101 up = 0
5102
5103 if x > 2:
5104 left = _get_pixel(basex - 3)
5105 if y > 0:
5106 up = _get_pixel(basex - stride)
5107
5108 if filter_type == 1: # Sub
5109 color = (color + left) & 0xff
5110 elif filter_type == 2: # Up
5111 color = (color + up) & 0xff
5112 elif filter_type == 3: # Average
5113 color = (color + ((left + up) >> 1)) & 0xff
5114 elif filter_type == 4: # Paeth
5115 a = left
5116 b = up
5117 c = 0
5118
5119 if x > 2 and y > 0:
5120 c = _get_pixel(basex - stride - 3)
5121
5122 p = a + b - c
5123
5124 pa = abs(p - a)
5125 pb = abs(p - b)
5126 pc = abs(p - c)
5127
5128 if pa <= pb and pa <= pc:
5129 color = (color + a) & 0xff
5130 elif pb <= pc:
5131 color = (color + b) & 0xff
5132 else:
5133 color = (color + c) & 0xff
5134
5135 current_row.append(color)
5136
5137 return width, height, pixels
efa97bdc
YCH
5138
5139
5140def write_xattr(path, key, value):
6f7563be 5141 # Windows: Write xattrs to NTFS Alternate Data Streams:
5142 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5143 if compat_os_name == 'nt':
5144 assert ':' not in key
5145 assert os.path.exists(path)
efa97bdc
YCH
5146
5147 try:
6f7563be 5148 with open(f'{path}:{key}', 'wb') as f:
5149 f.write(value)
86e5f3ed 5150 except OSError as e:
efa97bdc 5151 raise XAttrMetadataError(e.errno, e.strerror)
6f7563be 5152 return
efa97bdc 5153
6f7563be 5154 # UNIX Method 1. Use xattrs/pyxattrs modules
efa97bdc 5155
6f7563be 5156 setxattr = None
5157 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5158 # Unicode arguments are not supported in pyxattr until version 0.5.0
5159 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5160 if version_tuple(xattr.__version__) >= (0, 5, 0):
5161 setxattr = xattr.set
5162 elif xattr:
5163 setxattr = xattr.setxattr
efa97bdc 5164
6f7563be 5165 if setxattr:
5166 try:
5167 setxattr(path, key, value)
5168 except OSError as e:
5169 raise XAttrMetadataError(e.errno, e.strerror)
5170 return
efa97bdc 5171
6f7563be 5172 # UNIX Method 2. Use setfattr/xattr executables
5173 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5174 else 'xattr' if check_executable('xattr', ['-h']) else None)
5175 if not exe:
5176 raise XAttrUnavailableError(
5177 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5178 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
efa97bdc 5179
0f06bcd7 5180 value = value.decode()
6f7563be 5181 try:
f0c9fb96 5182 _, stderr, returncode = Popen.run(
6f7563be 5183 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
e121e3ce 5184 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
6f7563be 5185 except OSError as e:
5186 raise XAttrMetadataError(e.errno, e.strerror)
f0c9fb96 5187 if returncode:
5188 raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
5189
5190
5191def random_birthday(year_field, month_field, day_field):
aa374bc7
AS
5192 start_date = datetime.date(1950, 1, 1)
5193 end_date = datetime.date(1995, 12, 31)
5194 offset = random.randint(0, (end_date - start_date).days)
5195 random_date = start_date + datetime.timedelta(offset)
0c265486 5196 return {
aa374bc7
AS
5197 year_field: str(random_date.year),
5198 month_field: str(random_date.month),
5199 day_field: str(random_date.day),
0c265486 5200 }
732044af 5201
c76eb41b 5202
732044af 5203# Templates for internet shortcut files, which are plain text files.
e5a998f3 5204DOT_URL_LINK_TEMPLATE = '''\
732044af 5205[InternetShortcut]
5206URL=%(url)s
e5a998f3 5207'''
732044af 5208
e5a998f3 5209DOT_WEBLOC_LINK_TEMPLATE = '''\
732044af 5210<?xml version="1.0" encoding="UTF-8"?>
5211<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5212<plist version="1.0">
5213<dict>
5214\t<key>URL</key>
5215\t<string>%(url)s</string>
5216</dict>
5217</plist>
e5a998f3 5218'''
732044af 5219
e5a998f3 5220DOT_DESKTOP_LINK_TEMPLATE = '''\
732044af 5221[Desktop Entry]
5222Encoding=UTF-8
5223Name=%(filename)s
5224Type=Link
5225URL=%(url)s
5226Icon=text-html
e5a998f3 5227'''
732044af 5228
08438d2c 5229LINK_TEMPLATES = {
5230 'url': DOT_URL_LINK_TEMPLATE,
5231 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5232 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5233}
5234
732044af 5235
5236def iri_to_uri(iri):
5237 """
5238 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5239
5240 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5241 """
5242
14f25df2 5243 iri_parts = urllib.parse.urlparse(iri)
732044af 5244
5245 if '[' in iri_parts.netloc:
5246 raise ValueError('IPv6 URIs are not, yet, supported.')
5247 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5248
5249 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5250
5251 net_location = ''
5252 if iri_parts.username:
f9934b96 5253 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
732044af 5254 if iri_parts.password is not None:
f9934b96 5255 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
732044af 5256 net_location += '@'
5257
0f06bcd7 5258 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
732044af 5259 # The 'idna' encoding produces ASCII text.
5260 if iri_parts.port is not None and iri_parts.port != 80:
5261 net_location += ':' + str(iri_parts.port)
5262
f9934b96 5263 return urllib.parse.urlunparse(
732044af 5264 (iri_parts.scheme,
5265 net_location,
5266
f9934b96 5267 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5268
5269 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
f9934b96 5270 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5271
5272 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
f9934b96 5273 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
732044af 5274
f9934b96 5275 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
732044af 5276
5277 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5278
5279
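# Illustrative expected result (not part of the original source; the URL is made up):
#   >>> iri_to_uri('http://example.com/déjà vu')
#   'http://example.com/d%C3%A9j%C3%A0+vu'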
5280def to_high_limit_path(path):
5281 if sys.platform in ['win32', 'cygwin']:
5282 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
e5a998f3 5283 return '\\\\?\\' + os.path.abspath(path)
732044af 5284
5285 return path
76d321f6 5286
c76eb41b 5287
7b2c3f47 5288def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
e0ddbd02 5289 val = traverse_obj(obj, *variadic(field))
7b2c3f47 5290 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
e0ddbd02 5291 return default
7b2c3f47 5292 return template % func(val)
00dd0cd5 5293
5294
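# Illustrative usage (not part of the original source):
#   >>> format_field({'width': 1280}, 'width', '%dpx')
#   '1280px'
#   >>> format_field({'width': None}, 'width', '%dpx', default='unknown')
#   'unknown'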
5295def clean_podcast_url(url):
5296 return re.sub(r'''(?x)
5297 (?:
5298 (?:
5299 chtbl\.com/track|
5300 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5301 play\.podtrac\.com
5302 )/[^/]+|
5303 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5304 flex\.acast\.com|
5305 pd(?:
5306 cn\.co| # https://podcorn.com/analytics-prefix/
5307 st\.fm # https://podsights.com/docs/
5308 )/e
5309 )/''', '', url)
ffcb8191
THD
5310
5311
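# Illustrative usage (not part of the original source; the tracking prefix and
# target host are made up):
#   >>> clean_podcast_url('https://chtbl.com/track/ABC123/traffic.example.com/episode.mp3')
#   'https://traffic.example.com/episode.mp3'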
5312_HEX_TABLE = '0123456789abcdef'
5313
5314
5315def random_uuidv4():
5316 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 5317
5318
5319def make_dir(path, to_screen=None):
5320 try:
5321 dn = os.path.dirname(path)
5322 if dn and not os.path.exists(dn):
5323 os.makedirs(dn)
5324 return True
86e5f3ed 5325 except OSError as err:
0202b52a 5326        if callable(to_screen):
5327 to_screen('unable to create directory ' + error_to_compat_str(err))
5328 return False
f74980cb 5329
5330
5331def get_executable_path():
b5899f4f 5332 from .update import _get_variant_and_executable_path
c487cf00 5333
b5899f4f 5334 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
f74980cb 5335
5336
2f567473 5337def load_plugins(name, suffix, namespace):
3ae5e797 5338 classes = {}
19a03940 5339 with contextlib.suppress(FileNotFoundError):
019a94f7
ÁS
5340 plugins_spec = importlib.util.spec_from_file_location(
5341 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5342 plugins = importlib.util.module_from_spec(plugins_spec)
5343 sys.modules[plugins_spec.name] = plugins
5344 plugins_spec.loader.exec_module(plugins)
f74980cb 5345 for name in dir(plugins):
2f567473 5346 if name in namespace:
5347 continue
5348 if not name.endswith(suffix):
f74980cb 5349 continue
5350 klass = getattr(plugins, name)
3ae5e797 5351 classes[name] = namespace[name] = klass
f74980cb 5352 return classes
06167fbb 5353
5354
325ebc17 5355def traverse_obj(
f99bbfc9 5356 obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
325ebc17 5357 casesense=True, is_user_input=False, traverse_string=False):
ab029d7e
SS
5358 """
5359 Safely traverse nested `dict`s and `Sequence`s
5360
5361 >>> obj = [{}, {"key": "value"}]
5362 >>> traverse_obj(obj, (1, "key"))
5363 "value"
5364
5365 Each of the provided `paths` is tested and the first producing a valid result will be returned.
f99bbfc9 5366 The next path will also be tested if the path branched but no results could be found.
7b0127e1 5367 Supported values for traversal are `Mapping`, `Sequence` and `re.Match`.
ab029d7e
SS
5368 A value of None is treated as the absence of a value.
5369
5370 The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.
5371
5372 The keys in the path can be one of:
5373 - `None`: Return the current object.
7b0127e1 5374        - `str`/`int`:      Return `obj[key]`. For `re.Match`, return `obj.group(key)`.
ab029d7e
SS
5375 - `slice`: Branch out and return all values in `obj[key]`.
5376 - `Ellipsis`: Branch out and return a list of all values.
5377 - `tuple`/`list`: Branch out and return a list of all matching values.
5378 Read as: `[traverse_obj(obj, branch) for branch in branches]`.
5379 - `function`: Branch out and return values filtered by the function.
5380 Read as: `[value for key, value in obj if function(key, value)]`.
5381 For `Sequence`s, `key` is the index of the value.
5382 - `dict` Transform the current object and return a matching dict.
5383 Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.
5384
7b0127e1 5385 `tuple`, `list`, and `dict` all support nested paths and branches.
ab029d7e
SS
5386
5387        @param paths        Paths by which to traverse.
5388 @param default Value to return if the paths do not match.
5389 @param expected_type If a `type`, only accept final values of this type.
5390 If any other callable, try to call the function on each result.
5391 @param get_all If `False`, return the first matching result, otherwise all matching ones.
5392 @param casesense If `False`, consider string dictionary keys as case insensitive.
5393
5394 The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API
5395
5396 @param is_user_input Whether the keys are generated from user input.
5397 If `True` strings get converted to `int`/`slice` if needed.
5398 @param traverse_string Whether to traverse into objects as strings.
5399 If `True`, any non-compatible object will first be
5400 converted into a string and then traversed into.
5401
5402
5403 @returns The result of the object traversal.
5404 If successful, `get_all=True`, and the path branches at least once,
5405 then a list of results is returned instead.
f99bbfc9 5406 A list is always returned if the last path branches and no `default` is given.
ab029d7e
SS
5407 """
5408 is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes))
5409 casefold = lambda k: k.casefold() if isinstance(k, str) else k
325ebc17 5410
352d63fd 5411 if isinstance(expected_type, type):
5412 type_test = lambda val: val if isinstance(val, expected_type) else None
352d63fd 5413 else:
ab029d7e
SS
5414 type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,))
5415
5416 def apply_key(key, obj):
5417 if obj is None:
5418 return
5419
5420 elif key is None:
5421 yield obj
5422
5423 elif isinstance(key, (list, tuple)):
5424 for branch in key:
5425 _, result = apply_path(obj, branch)
5426 yield from result
5427
5428 elif key is ...:
5429 if isinstance(obj, collections.abc.Mapping):
5430 yield from obj.values()
5431 elif is_sequence(obj):
5432 yield from obj
7b0127e1
SS
5433 elif isinstance(obj, re.Match):
5434 yield from obj.groups()
ab029d7e
SS
5435 elif traverse_string:
5436 yield from str(obj)
5437
5438 elif callable(key):
5439 if is_sequence(obj):
5440 iter_obj = enumerate(obj)
5441 elif isinstance(obj, collections.abc.Mapping):
5442 iter_obj = obj.items()
7b0127e1
SS
5443 elif isinstance(obj, re.Match):
5444 iter_obj = enumerate((obj.group(), *obj.groups()))
ab029d7e
SS
5445 elif traverse_string:
5446 iter_obj = enumerate(str(obj))
352d63fd 5447 else:
ab029d7e
SS
5448 return
5449 yield from (v for k, v in iter_obj if try_call(key, args=(k, v)))
5450
5451 elif isinstance(key, dict):
5452 iter_obj = ((k, _traverse_obj(obj, v)) for k, v in key.items())
5453 yield {k: v if v is not None else default for k, v in iter_obj
f99bbfc9 5454 if v is not None or default is not NO_DEFAULT}
ab029d7e 5455
7b0127e1 5456 elif isinstance(obj, collections.abc.Mapping):
ab029d7e
SS
5457 yield (obj.get(key) if casesense or (key in obj)
5458 else next((v for k, v in obj.items() if casefold(k) == key), None))
5459
7b0127e1
SS
5460 elif isinstance(obj, re.Match):
5461 if isinstance(key, int) or casesense:
5462 with contextlib.suppress(IndexError):
5463 yield obj.group(key)
5464 return
5465
5466 if not isinstance(key, str):
5467 return
5468
5469 yield next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)
5470
ab029d7e
SS
5471 else:
5472 if is_user_input:
5473 key = (int_or_none(key) if ':' not in key
5474 else slice(*map(int_or_none, key.split(':'))))
5475
5476 if not isinstance(key, (int, slice)):
5477 return
5478
5479 if not is_sequence(obj):
5480 if not traverse_string:
5481 return
5482 obj = str(obj)
5483
5484 with contextlib.suppress(IndexError):
5485 yield obj[key]
5486
5487 def apply_path(start_obj, path):
5488 objs = (start_obj,)
5489 has_branched = False
5490
5491 for key in variadic(path):
5492 if is_user_input and key == ':':
5493 key = ...
5494
5495 if not casesense and isinstance(key, str):
5496 key = key.casefold()
5497
5498 if key is ... or isinstance(key, (list, tuple)) or callable(key):
5499 has_branched = True
5500
5501 key_func = functools.partial(apply_key, key)
5502 objs = itertools.chain.from_iterable(map(key_func, objs))
5503
5504 return has_branched, objs
5505
f99bbfc9 5506 def _traverse_obj(obj, path, use_list=True):
ab029d7e
SS
5507 has_branched, results = apply_path(obj, path)
5508 results = LazyList(x for x in map(type_test, results) if x is not None)
ab029d7e 5509
f99bbfc9
SS
5510 if get_all and has_branched:
5511 return results.exhaust() if results or use_list else None
5512
5513 return results[0] if results else None
5514
5515 for index, path in enumerate(paths, 1):
5516 use_list = default is NO_DEFAULT and index == len(paths)
5517 result = _traverse_obj(obj, path, use_list)
ab029d7e
SS
5518 if result is not None:
5519 return result
5520
f99bbfc9 5521 return None if default is NO_DEFAULT else default
324ad820 5522
5523
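# Further illustrative examples of traverse_obj (not part of the original source):
#   >>> info = {'formats': [{'url': 'https://a.example', 'height': 720}, {'height': 1080}]}
#   >>> traverse_obj(info, ('formats', ..., 'url'))  # branch over all formats, drop missing values
#   ['https://a.example']
#   >>> traverse_obj(info, ('formats', 0, {'src': 'url', 'h': 'height'}))  # dict transform
#   {'src': 'https://a.example', 'h': 720}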
5524def traverse_dict(dictn, keys, casesense=True):
da4db748 5525 deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed '
5526 f'in a future version. Use "{__name__}.traverse_obj" instead')
ee8dd27a 5527 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5528
5529
ff91cf74 5530def get_first(obj, keys, **kwargs):
5531 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5532
5533
3e9b66d7
LNO
5534def time_seconds(**kwargs):
5535 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5536 return t.timestamp()
5537
5538
49fa4d9a
N
5539# create a JSON Web Signature (jws) with HS256 algorithm
5540# the resulting format is in JWS Compact Serialization
5541# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5542# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5543def jwt_encode_hs256(payload_data, key, headers={}):
5544 header_data = {
5545 'alg': 'HS256',
5546 'typ': 'JWT',
5547 }
5548 if headers:
5549 header_data.update(headers)
0f06bcd7 5550 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5551 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5552 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
49fa4d9a
N
5553 signature_b64 = base64.b64encode(h.digest())
5554 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5555 return token
819e0531 5556
5557
16b0d7e6 5558# can be extended in the future to verify the signature, parse the header and return the algorithm used if it's not HS256
5559def jwt_decode_hs256(jwt):
5560 header_b64, payload_b64, signature_b64 = jwt.split('.')
2c98d998 5561    # add trailing ='s that may have been stripped; superfluous ='s are ignored
5562 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
16b0d7e6 5563 return payload_data
5564
5565
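# Illustrative round-trip (not part of the original source; the key is a placeholder,
# and decoding relies on the lenient padding handling noted in jwt_decode_hs256):
#   >>> token = jwt_encode_hs256({'uid': 1}, 'not-a-real-secret')
#   >>> jwt_decode_hs256(token.decode())
#   {'uid': 1}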
53973b4d 5566WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5567
5568
7a32c70d 5569@functools.cache
819e0531 5570def supports_terminal_sequences(stream):
5571 if compat_os_name == 'nt':
8a82af35 5572 if not WINDOWS_VT_MODE:
819e0531 5573 return False
5574 elif not os.getenv('TERM'):
5575 return False
5576 try:
5577 return stream.isatty()
5578 except BaseException:
5579 return False
5580
5581
c53a18f0 5582def windows_enable_vt_mode():
5583 """Ref: https://bugs.python.org/issue30075 """
8a82af35 5584 if get_windows_version() < (10, 0, 10586):
53973b4d 5585 return
53973b4d 5586
c53a18f0 5587 import ctypes
5588 import ctypes.wintypes
5589 import msvcrt
5590
5591 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
5592
5593 dll = ctypes.WinDLL('kernel32', use_last_error=False)
5594 handle = os.open('CONOUT$', os.O_RDWR)
5595
5596 try:
5597 h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
5598 dw_original_mode = ctypes.wintypes.DWORD()
5599 success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
5600 if not success:
5601 raise Exception('GetConsoleMode failed')
5602
5603 success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
5604 dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
5605 if not success:
5606 raise Exception('SetConsoleMode failed')
5607 except Exception as e:
5608 write_string(f'WARNING: Cannot enable VT mode - {e}')
5609 else:
5610 global WINDOWS_VT_MODE
5611 WINDOWS_VT_MODE = True
5612 supports_terminal_sequences.cache_clear()
5613 finally:
5614 os.close(handle)
53973b4d 5615
5616
ec11a9f4 5617_terminal_sequences_re = re.compile('\033\\[[^m]+m')
5618
5619
5620def remove_terminal_sequences(string):
5621 return _terminal_sequences_re.sub('', string)
5622
5623
5624def number_of_digits(number):
5625 return len('%d' % number)
34921b43 5626
5627
5628def join_nonempty(*values, delim='-', from_dict=None):
5629 if from_dict is not None:
7b2c3f47 5630 values = (traverse_obj(from_dict, variadic(v)) for v in values)
34921b43 5631 return delim.join(map(str, filter(None, values)))
06e57990 5632
5633
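# Illustrative usage (not part of the original source):
#   >>> join_nonempty('mp4', None, 1080, delim='-')
#   'mp4-1080'
#   >>> join_nonempty('height', 'width', from_dict={'height': 720}, delim='x')
#   '720'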
27231526
ZM
5634def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5635 """
5636 Find the largest format dimensions in terms of video width and, for each thumbnail:
5637 * Modify the URL: Match the width with the provided regex and replace with the former width
5638 * Update dimensions
5639
5640 This function is useful with video services that scale the provided thumbnails on demand
5641 """
5642 _keys = ('width', 'height')
5643 max_dimensions = max(
86e5f3ed 5644 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
27231526
ZM
5645 default=(0, 0))
5646 if not max_dimensions[0]:
5647 return thumbnails
5648 return [
5649 merge_dicts(
5650 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5651 dict(zip(_keys, max_dimensions)), thumbnail)
5652 for thumbnail in thumbnails
5653 ]
5654
5655
93c8410d
LNO
5656def parse_http_range(range):
5657 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5658 if not range:
5659 return None, None, None
5660 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5661 if not crg:
5662 return None, None, None
5663 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5664
5665
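# Illustrative usage (not part of the original source):
#   >>> parse_http_range('bytes 0-499/1234')
#   (0, 499, 1234)
#   >>> parse_http_range('bytes=500-')
#   (500, None, None)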
6b9e832d 5666def read_stdin(what):
5667 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5668 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5669 return sys.stdin
5670
5671
a904a7f8
L
5672def determine_file_encoding(data):
5673 """
88f60feb 5674 Detect the text encoding used
a904a7f8
L
5675 @returns (encoding, bytes to skip)
5676 """
5677
88f60feb 5678 # BOM marks are given priority over declarations
a904a7f8 5679 for bom, enc in BOMS:
a904a7f8
L
5680 if data.startswith(bom):
5681 return enc, len(bom)
5682
88f60feb 5683 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5684 # We ignore the endianness to get a good enough match
a904a7f8 5685 data = data.replace(b'\0', b'')
88f60feb 5686 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5687 return mobj.group(1).decode() if mobj else None, 0
a904a7f8
L
5688
5689
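# Illustrative usage (not part of the original source; assumes the data carries
# no BOM, so only the coding declaration is considered):
#   >>> determine_file_encoding(b'# coding: utf-8\n--flac')
#   ('utf-8', 0)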
06e57990 5690class Config:
5691 own_args = None
9e491463 5692 parsed_args = None
06e57990 5693 filename = None
5694 __initialized = False
5695
5696 def __init__(self, parser, label=None):
9e491463 5697 self.parser, self.label = parser, label
06e57990 5698 self._loaded_paths, self.configs = set(), []
5699
5700 def init(self, args=None, filename=None):
5701 assert not self.__initialized
284a60c5 5702 self.own_args, self.filename = args, filename
5703 return self.load_configs()
5704
5705 def load_configs(self):
65662dff 5706 directory = ''
284a60c5 5707 if self.filename:
5708 location = os.path.realpath(self.filename)
65662dff 5709 directory = os.path.dirname(location)
06e57990 5710 if location in self._loaded_paths:
5711 return False
5712 self._loaded_paths.add(location)
5713
284a60c5 5714 self.__initialized = True
5715 opts, _ = self.parser.parse_known_args(self.own_args)
5716 self.parsed_args = self.own_args
9e491463 5717 for location in opts.config_locations or []:
6b9e832d 5718 if location == '-':
1060f82f 5719 if location in self._loaded_paths:
5720 continue
5721 self._loaded_paths.add(location)
6b9e832d 5722 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5723 continue
65662dff 5724 location = os.path.join(directory, expand_path(location))
06e57990 5725 if os.path.isdir(location):
5726 location = os.path.join(location, 'yt-dlp.conf')
5727 if not os.path.exists(location):
9e491463 5728 self.parser.error(f'config location {location} does not exist')
06e57990 5729 self.append_config(self.read_file(location), location)
5730 return True
5731
5732 def __str__(self):
5733 label = join_nonempty(
5734 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5735 delim=' ')
5736 return join_nonempty(
5737 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5738 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5739 delim='\n')
5740
7a32c70d 5741 @staticmethod
06e57990 5742 def read_file(filename, default=[]):
5743 try:
a904a7f8 5744 optionf = open(filename, 'rb')
86e5f3ed 5745 except OSError:
06e57990 5746 return default # silently skip if file is not present
a904a7f8
L
5747 try:
5748 enc, skip = determine_file_encoding(optionf.read(512))
5749 optionf.seek(skip, io.SEEK_SET)
5750 except OSError:
5751 enc = None # silently skip read errors
06e57990 5752 try:
5753 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
a904a7f8 5754 contents = optionf.read().decode(enc or preferredencoding())
f9934b96 5755 res = shlex.split(contents, comments=True)
44a6fcff 5756 except Exception as err:
5757 raise ValueError(f'Unable to parse "{filename}": {err}')
06e57990 5758 finally:
5759 optionf.close()
5760 return res
5761
7a32c70d 5762 @staticmethod
06e57990 5763 def hide_login_info(opts):
86e5f3ed 5764 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
06e57990 5765 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5766
5767 def _scrub_eq(o):
5768 m = eqre.match(o)
5769 if m:
5770 return m.group('key') + '=PRIVATE'
5771 else:
5772 return o
5773
5774 opts = list(map(_scrub_eq, opts))
5775 for idx, opt in enumerate(opts):
5776 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5777 opts[idx + 1] = 'PRIVATE'
5778 return opts
5779
5780 def append_config(self, *args, label=None):
9e491463 5781 config = type(self)(self.parser, label)
06e57990 5782 config._loaded_paths = self._loaded_paths
5783 if config.init(*args):
5784 self.configs.append(config)
5785
7a32c70d 5786 @property
06e57990 5787 def all_args(self):
5788 for config in reversed(self.configs):
5789 yield from config.all_args
9e491463 5790 yield from self.parsed_args or []
5791
5792 def parse_known_args(self, **kwargs):
5793 return self.parser.parse_known_args(self.all_args, **kwargs)
06e57990 5794
5795 def parse_args(self):
9e491463 5796 return self.parser.parse_args(self.all_args)
da42679b
LNO
5797
5798
d5d1df8a 5799class WebSocketsWrapper:
da42679b 5800 """Wraps websockets module to use in non-async scopes"""
abfecb7b 5801 pool = None
da42679b 5802
3cea3edd 5803 def __init__(self, url, headers=None, connect=True):
059bc4db 5804 self.loop = asyncio.new_event_loop()
9cd08050 5805 # XXX: "loop" is deprecated
5806 self.conn = websockets.connect(
5807 url, extra_headers=headers, ping_interval=None,
5808 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
3cea3edd
LNO
5809 if connect:
5810 self.__enter__()
15dfb392 5811 atexit.register(self.__exit__, None, None, None)
da42679b
LNO
5812
5813 def __enter__(self):
3cea3edd 5814 if not self.pool:
9cd08050 5815 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
da42679b
LNO
5816 return self
5817
5818 def send(self, *args):
5819 self.run_with_loop(self.pool.send(*args), self.loop)
5820
5821 def recv(self, *args):
5822 return self.run_with_loop(self.pool.recv(*args), self.loop)
5823
5824 def __exit__(self, type, value, traceback):
5825 try:
5826 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5827 finally:
5828 self.loop.close()
15dfb392 5829 self._cancel_all_tasks(self.loop)
da42679b
LNO
5830
5831 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5832    # for contributors: if any new library that uses asyncio needs to be run in non-async code, move these functions out of this class
7a32c70d 5833 @staticmethod
da42679b 5834 def run_with_loop(main, loop):
059bc4db 5835 if not asyncio.iscoroutine(main):
da42679b
LNO
5836 raise ValueError(f'a coroutine was expected, got {main!r}')
5837
5838 try:
5839 return loop.run_until_complete(main)
5840 finally:
5841 loop.run_until_complete(loop.shutdown_asyncgens())
5842 if hasattr(loop, 'shutdown_default_executor'):
5843 loop.run_until_complete(loop.shutdown_default_executor())
5844
7a32c70d 5845 @staticmethod
da42679b 5846 def _cancel_all_tasks(loop):
059bc4db 5847 to_cancel = asyncio.all_tasks(loop)
da42679b
LNO
5848
5849 if not to_cancel:
5850 return
5851
5852 for task in to_cancel:
5853 task.cancel()
5854
9cd08050 5855 # XXX: "loop" is removed in python 3.10+
da42679b 5856 loop.run_until_complete(
059bc4db 5857 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
da42679b
LNO
5858
5859 for task in to_cancel:
5860 if task.cancelled():
5861 continue
5862 if task.exception() is not None:
5863 loop.call_exception_handler({
5864 'message': 'unhandled exception during asyncio.run() shutdown',
5865 'exception': task.exception(),
5866 'task': task,
5867 })
5868
5869
8b7539d2 5870def merge_headers(*dicts):
08d30158 5871    """Merge dicts of HTTP headers case-insensitively, giving priority to the later ones"""
76aa9913 5872 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
28787f16 5873
5874
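# Illustrative usage (not part of the original source):
#   >>> merge_headers({'user-agent': 'a', 'Accept': '*/*'}, {'User-Agent': 'b'})
#   {'User-Agent': 'b', 'Accept': '*/*'}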
b1f94422 5875def cached_method(f):
5876 """Cache a method"""
5877 signature = inspect.signature(f)
5878
7a32c70d 5879 @functools.wraps(f)
b1f94422 5880 def wrapper(self, *args, **kwargs):
5881 bound_args = signature.bind(self, *args, **kwargs)
5882 bound_args.apply_defaults()
d5d1df8a 5883 key = tuple(bound_args.arguments.values())[1:]
b1f94422 5884
6368e2e6 5885 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
b1f94422 5886 if key not in cache:
5887 cache[key] = f(self, *args, **kwargs)
5888 return cache[key]
5889 return wrapper
5890
5891
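# Minimal usage sketch (not part of the original source; the class and method
# names are hypothetical):
#   class Fetcher:
#       @cached_method
#       def fetch(self, url):
#           ...  # expensive work; runs once per distinct `url` per instance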
28787f16 5892class classproperty:
83cc7b8a 5893 """property access for class methods with optional caching"""
5894 def __new__(cls, func=None, *args, **kwargs):
5895 if not func:
5896 return functools.partial(cls, *args, **kwargs)
5897 return super().__new__(cls)
c487cf00 5898
83cc7b8a 5899 def __init__(self, func, *, cache=False):
c487cf00 5900 functools.update_wrapper(self, func)
5901 self.func = func
83cc7b8a 5902 self._cache = {} if cache else None
28787f16 5903
5904 def __get__(self, _, cls):
83cc7b8a 5905 if self._cache is None:
5906 return self.func(cls)
5907 elif cls not in self._cache:
5908 self._cache[cls] = self.func(cls)
5909 return self._cache[cls]
19a03940 5910
5911
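# Minimal usage sketch (not part of the original source; the names are hypothetical):
#   class Foo:
#       @classproperty(cache=True)
#       def expensive(cls):
#           return compute_once(cls)  # evaluated once per class, then cached
#   Foo.expensive  # accessed on the class itself, no instance needed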
64fa820c 5912class Namespace(types.SimpleNamespace):
591bb9d3 5913 """Immutable namespace"""
591bb9d3 5914
7896214c 5915 def __iter__(self):
64fa820c 5916 return iter(self.__dict__.values())
7896214c 5917
7a32c70d 5918 @property
64fa820c 5919 def items_(self):
5920 return self.__dict__.items()
9b8ee23b 5921
5922
8dc59305 5923MEDIA_EXTENSIONS = Namespace(
5924 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5925 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5926 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5927 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
5928 thumbnails=('jpg', 'png', 'webp'),
5929 storyboards=('mhtml', ),
5930 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5931 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5932)
5933MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5934MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5935
5936KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5937
5938
be5c1ae8 5939class RetryManager:
5940 """Usage:
5941 for retry in RetryManager(...):
5942 try:
5943 ...
5944 except SomeException as err:
5945 retry.error = err
5946 continue
5947 """
5948 attempt, _error = 0, None
5949
5950 def __init__(self, _retries, _error_callback, **kwargs):
5951 self.retries = _retries or 0
5952 self.error_callback = functools.partial(_error_callback, **kwargs)
5953
5954 def _should_retry(self):
5955 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5956
7a32c70d 5957 @property
be5c1ae8 5958 def error(self):
5959 if self._error is NO_DEFAULT:
5960 return None
5961 return self._error
5962
7a32c70d 5963 @error.setter
be5c1ae8 5964 def error(self, value):
5965 self._error = value
5966
5967 def __iter__(self):
5968 while self._should_retry():
5969 self.error = NO_DEFAULT
5970 self.attempt += 1
5971 yield self
5972 if self.error:
5973 self.error_callback(self.error, self.attempt, self.retries)
5974
7a32c70d 5975 @staticmethod
be5c1ae8 5976 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5977 """Utility function for reporting retries"""
5978 if count > retries:
5979 if error:
5980 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5981 raise e
5982
5983 if not count:
5984 return warn(e)
5985 elif isinstance(e, ExtractorError):
3ce29336 5986 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
be5c1ae8 5987 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5988
5989 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5990 if delay:
5991 info(f'Sleeping {delay:.2f} seconds ...')
5992 time.sleep(delay)
5993
5994
0647d925 5995def make_archive_id(ie, video_id):
5996 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5997 return f'{ie_key.lower()} {video_id}'
5998
5999
a1c5bd82 6000def truncate_string(s, left, right=0):
6001 assert left > 3 and right >= 0
6002 if s is None or len(s) <= left + right:
6003 return s
71df9b7f 6004 return f'{s[:left-3]}...{s[-right:] if right else ""}'
a1c5bd82 6005
6006
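# Illustrative usage (not part of the original source):
#   >>> truncate_string('abcdefghij', 6, 2)
#   'abc...ij'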
5314b521 6007def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
6008 assert 'all' in alias_dict, '"all" alias is required'
6009 requested = list(start or [])
6010 for val in options:
6011 discard = val.startswith('-')
6012 if discard:
6013 val = val[1:]
6014
6015 if val in alias_dict:
6016 val = alias_dict[val] if not discard else [
6017 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
6018 # NB: Do not allow regex in aliases for performance
6019 requested = orderedSet_from_options(val, alias_dict, start=requested)
6020 continue
6021
6022 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
6023 else [val] if val in alias_dict['all'] else None)
6024 if current is None:
6025 raise ValueError(val)
6026
6027 if discard:
6028 for item in current:
6029 while item in requested:
6030 requested.remove(item)
6031 else:
6032 requested.extend(current)
6033
6034 return orderedSet(requested)
6035
6036
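# Illustrative usage (not part of the original source; the alias table is made up):
#   >>> aliases = {'all': ['thumbnail', 'description', 'subs'], 'meta': ['thumbnail', 'description']}
#   >>> orderedSet_from_options(['meta', '-description'], aliases)
#   ['thumbnail']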
d0d74b71 6037class FormatSorter:
6038 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
6039
6040 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
6041 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
6042 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
6043 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
6044 'height', 'width', 'proto', 'vext', 'abr', 'aext',
6045 'fps', 'fs_approx', 'source', 'id')
6046
6047 settings = {
6048 'vcodec': {'type': 'ordered', 'regex': True,
6049 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
6050 'acodec': {'type': 'ordered', 'regex': True,
71082216 6051 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
d0d74b71 6052 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
6053 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
6054 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
6055 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
6056 'vext': {'type': 'ordered', 'field': 'video_ext',
29ca4082 6057 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
6058 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
d0d74b71 6059 'aext': {'type': 'ordered', 'field': 'audio_ext',
6060 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
6061 'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')},
6062 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
6063 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
6064 'field': ('vcodec', 'acodec'),
6065 'function': lambda it: int(any(v != 'none' for v in it))},
6066 'ie_pref': {'priority': True, 'type': 'extractor'},
6067 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
6068 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
6069 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
6070 'quality': {'convert': 'float', 'default': -1},
6071 'filesize': {'convert': 'bytes'},
6072 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
6073 'id': {'convert': 'string', 'field': 'format_id'},
6074 'height': {'convert': 'float_none'},
6075 'width': {'convert': 'float_none'},
6076 'fps': {'convert': 'float_none'},
6077 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
6078 'tbr': {'convert': 'float_none'},
6079 'vbr': {'convert': 'float_none'},
6080 'abr': {'convert': 'float_none'},
6081 'asr': {'convert': 'float_none'},
6082 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
6083
6084 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
6085 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
6086 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
6087 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
6088 'res': {'type': 'multiple', 'field': ('height', 'width'),
6089 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
6090
6091 # Actual field names
6092 'format_id': {'type': 'alias', 'field': 'id'},
6093 'preference': {'type': 'alias', 'field': 'ie_pref'},
6094 'language_preference': {'type': 'alias', 'field': 'lang'},
6095 'source_preference': {'type': 'alias', 'field': 'source'},
6096 'protocol': {'type': 'alias', 'field': 'proto'},
6097 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
6098 'audio_channels': {'type': 'alias', 'field': 'channels'},
6099
6100 # Deprecated
6101 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
6102 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
6103 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
6104 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
6105 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
6106 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
6107 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
6108 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
6109 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
6110 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
6111 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
6112 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
6113 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
6114 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
6115 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6116 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6117 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6118 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6119 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6120 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6121 }
6122
6123 def __init__(self, ydl, field_preference):
6124 self.ydl = ydl
6125 self._order = []
6126 self.evaluate_params(self.ydl.params, field_preference)
6127 if ydl.params.get('verbose'):
6128 self.print_verbose_info(self.ydl.write_debug)
6129
6130 def _get_field_setting(self, field, key):
6131 if field not in self.settings:
6132 if key in ('forced', 'priority'):
6133 return False
6134 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
6135 'deprecated and may be removed in a future version')
6136 self.settings[field] = {}
6137 propObj = self.settings[field]
6138 if key not in propObj:
6139 type = propObj.get('type')
6140 if key == 'field':
6141 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
6142 elif key == 'convert':
6143 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
6144 else:
6145 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
6146 propObj[key] = default
6147 return propObj[key]
6148
6149 def _resolve_field_value(self, field, value, convertNone=False):
6150 if value is None:
6151 if not convertNone:
6152 return None
6153 else:
6154 value = value.lower()
6155 conversion = self._get_field_setting(field, 'convert')
6156 if conversion == 'ignore':
6157 return None
6158 if conversion == 'string':
6159 return value
6160 elif conversion == 'float_none':
6161 return float_or_none(value)
6162 elif conversion == 'bytes':
6163 return parse_bytes(value)
6164 elif conversion == 'order':
6165 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
6166 use_regex = self._get_field_setting(field, 'regex')
6167 list_length = len(order_list)
6168 empty_pos = order_list.index('') if '' in order_list else list_length + 1
6169 if use_regex and value is not None:
6170 for i, regex in enumerate(order_list):
6171 if regex and re.match(regex, value):
6172 return list_length - i
6173 return list_length - empty_pos # not in list
6174 else: # not regex or value = None
6175 return list_length - (order_list.index(value) if value in order_list else empty_pos)
6176 else:
6177 if value.isnumeric():
6178 return float(value)
6179 else:
6180 self.settings[field]['convert'] = 'string'
6181 return value
6182
6183 def evaluate_params(self, params, sort_extractor):
6184 self._use_free_order = params.get('prefer_free_formats', False)
6185 self._sort_user = params.get('format_sort', [])
6186 self._sort_extractor = sort_extractor
6187
6188 def add_item(field, reverse, closest, limit_text):
6189 field = field.lower()
6190 if field in self._order:
6191 return
6192 self._order.append(field)
6193 limit = self._resolve_field_value(field, limit_text)
6194 data = {
6195 'reverse': reverse,
6196 'closest': False if limit is None else closest,
6197 'limit_text': limit_text,
6198 'limit': limit}
6199 if field in self.settings:
6200 self.settings[field].update(data)
6201 else:
6202 self.settings[field] = data
6203
6204 sort_list = (
6205 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
6206 + (tuple() if params.get('format_sort_force', False)
6207 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
6208 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
6209
6210 for item in sort_list:
6211 match = re.match(self.regex, item)
6212 if match is None:
6213 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
6214 field = match.group('field')
6215 if field is None:
6216 continue
6217 if self._get_field_setting(field, 'type') == 'alias':
6218 alias, field = field, self._get_field_setting(field, 'field')
6219 if self._get_field_setting(alias, 'deprecated'):
6220 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
6221 f'be removed in a future version. Please use {field} instead')
6222 reverse = match.group('reverse') is not None
6223 closest = match.group('separator') == '~'
6224 limit_text = match.group('limit')
6225
6226 has_limit = limit_text is not None
6227 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
6228 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
6229
6230 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
6231 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
6232 limit_count = len(limits)
6233 for (i, f) in enumerate(fields):
6234 add_item(f, reverse, closest,
6235 limits[i] if i < limit_count
6236 else limits[0] if has_limit and not has_multiple_limits
6237 else None)
6238
6239 def print_verbose_info(self, write_debug):
6240 if self._sort_user:
6241 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
6242 if self._sort_extractor:
6243 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
6244 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
6245 '+' if self._get_field_setting(field, 'reverse') else '', field,
6246 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
6247 self._get_field_setting(field, 'limit_text'),
6248 self._get_field_setting(field, 'limit'))
6249 if self._get_field_setting(field, 'limit_text') is not None else '')
6250 for field in self._order if self._get_field_setting(field, 'visible')]))
6251
6252 def _calculate_field_preference_from_value(self, format, field, type, value):
6253 reverse = self._get_field_setting(field, 'reverse')
6254 closest = self._get_field_setting(field, 'closest')
6255 limit = self._get_field_setting(field, 'limit')
6256
6257 if type == 'extractor':
6258 maximum = self._get_field_setting(field, 'max')
6259 if value is None or (maximum is not None and value >= maximum):
6260 value = -1
6261 elif type == 'boolean':
6262 in_list = self._get_field_setting(field, 'in_list')
6263 not_in_list = self._get_field_setting(field, 'not_in_list')
6264 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
6265 elif type == 'ordered':
6266 value = self._resolve_field_value(field, value, True)
6267
6268 # try to convert to number
6269 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
6270 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
6271 if is_num:
6272 value = val_num
6273
6274 return ((-10, 0) if value is None
6275 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
6276 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
6277 else (0, value, 0) if not reverse and (limit is None or value <= limit)
6278 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
6279 else (-1, value, 0))
6280
6281 def _calculate_field_preference(self, format, field):
6282 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
6283 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
6284 if type == 'multiple':
6285 type = 'field' # Only 'field' is allowed in multiple for now
6286 actual_fields = self._get_field_setting(field, 'field')
6287
6288 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
6289 else:
6290 value = get_value(field)
6291 return self._calculate_field_preference_from_value(format, field, type, value)
6292
6293 def calculate_preference(self, format):
6294 # Determine missing protocol
6295 if not format.get('protocol'):
6296 format['protocol'] = determine_protocol(format)
6297
6298 # Determine missing ext
6299 if not format.get('ext') and 'url' in format:
6300 format['ext'] = determine_ext(format['url'])
6301 if format.get('vcodec') == 'none':
6302 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
6303 format['video_ext'] = 'none'
6304 else:
6305 format['video_ext'] = format['ext']
6306 format['audio_ext'] = 'none'
6307 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
6308 # format['preference'] = -1000
6309
6310 # Determine missing bitrates
6311 if format.get('tbr') is None:
6312 if format.get('vbr') is not None and format.get('abr') is not None:
6313 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
6314 else:
6315 if format.get('vcodec') != 'none' and format.get('vbr') is None:
6316 format['vbr'] = format.get('tbr') - format.get('abr', 0)
6317 if format.get('acodec') != 'none' and format.get('abr') is None:
6318 format['abr'] = format.get('tbr') - format.get('vbr', 0)
6319
6320 return tuple(self._calculate_field_preference(format, field) for field in self._order)
6321
6322
9b8ee23b 6323# Deprecated
6324has_certifi = bool(certifi)
6325has_websockets = bool(websockets)