]> jfr.im git - yt-dlp.git/blame - yt_dlp/utils.py
[utils] Make `ExtractorError` mutable
[yt-dlp.git] / yt_dlp / utils.py
CommitLineData
6929b41a 1import asyncio
15dfb392 2import atexit
1e399778 3import base64
5bc880b9 4import binascii
912b38b4 5import calendar
676eb3f2 6import codecs
c380cc28 7import collections
ab029d7e 8import collections.abc
62e609ab 9import contextlib
c496ca96 10import datetime
0c265486 11import email.header
f8271158 12import email.utils
f45c185f 13import errno
d77c3dfd 14import gzip
49fa4d9a
N
15import hashlib
16import hmac
ac668111 17import html.entities
18import html.parser
54007a45 19import http.client
20import http.cookiejar
019a94f7 21import importlib.util
b1f94422 22import inspect
03f9daab 23import io
79a2e94e 24import itertools
f4bfd65f 25import json
d77c3dfd 26import locale
02dbf93f 27import math
f8271158 28import mimetypes
347de493 29import operator
d77c3dfd 30import os
c496ca96 31import platform
773f291d 32import random
d77c3dfd 33import re
f8271158 34import shlex
c496ca96 35import socket
79a2e94e 36import ssl
ac668111 37import struct
1c088fa8 38import subprocess
d77c3dfd 39import sys
181c8655 40import tempfile
c380cc28 41import time
01951dda 42import traceback
64fa820c 43import types
989a01c2 44import unicodedata
14f25df2 45import urllib.error
f8271158 46import urllib.parse
ac668111 47import urllib.request
bcf89ce6 48import xml.etree.ElementTree
d77c3dfd 49import zlib
d77c3dfd 50
6929b41a 51from .compat import functools # isort: split
8c25f81b 52from .compat import (
36e6f62c 53 compat_etree_fromstring,
51098426 54 compat_expanduser,
f8271158 55 compat_HTMLParseError,
efa97bdc 56 compat_os_name,
702ccf2d 57 compat_shlex_quote,
8c25f81b 58)
ac668111 59from .dependencies import brotli, certifi, websockets, xattr
f8271158 60from .socks import ProxyType, sockssocket
71aff188 61
4644ac55 62
51fb4995
YCH
63def register_socks_protocols():
64 # "Register" SOCKS protocols
d5ae6bb5
YCH
65 # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
66 # URLs with protocols not in urlparse.uses_netloc are not handled correctly
51fb4995 67 for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
14f25df2 68 if scheme not in urllib.parse.uses_netloc:
69 urllib.parse.uses_netloc.append(scheme)
51fb4995
YCH
70
71
468e2e92
FV
72# This is not clearly defined otherwise
73compiled_regex_type = type(re.compile(''))
74
f7a147e3
S
75
76def random_user_agent():
77 _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
78 _CHROME_VERSIONS = (
19b4c74d 79 '90.0.4430.212',
80 '90.0.4430.24',
81 '90.0.4430.70',
82 '90.0.4430.72',
83 '90.0.4430.85',
84 '90.0.4430.93',
85 '91.0.4472.101',
86 '91.0.4472.106',
87 '91.0.4472.114',
88 '91.0.4472.124',
89 '91.0.4472.164',
90 '91.0.4472.19',
91 '91.0.4472.77',
92 '92.0.4515.107',
93 '92.0.4515.115',
94 '92.0.4515.131',
95 '92.0.4515.159',
96 '92.0.4515.43',
97 '93.0.4556.0',
98 '93.0.4577.15',
99 '93.0.4577.63',
100 '93.0.4577.82',
101 '94.0.4606.41',
102 '94.0.4606.54',
103 '94.0.4606.61',
104 '94.0.4606.71',
105 '94.0.4606.81',
106 '94.0.4606.85',
107 '95.0.4638.17',
108 '95.0.4638.50',
109 '95.0.4638.54',
110 '95.0.4638.69',
111 '95.0.4638.74',
112 '96.0.4664.18',
113 '96.0.4664.45',
114 '96.0.4664.55',
115 '96.0.4664.93',
116 '97.0.4692.20',
f7a147e3
S
117 )
118 return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
119
120
4390d5ec 121SUPPORTED_ENCODINGS = [
122 'gzip', 'deflate'
123]
9b8ee23b 124if brotli:
4390d5ec 125 SUPPORTED_ENCODINGS.append('br')
126
3e669f36 127std_headers = {
f7a147e3 128 'User-Agent': random_user_agent(),
59ae15a5 129 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
59ae15a5 130 'Accept-Language': 'en-us,en;q=0.5',
b1156c1e 131 'Sec-Fetch-Mode': 'navigate',
3e669f36 132}
f427df17 133
5f6a1245 134
fb37eb25
S
135USER_AGENTS = {
136 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
137}
138
139
bf42a990 140NO_DEFAULT = object()
7b2c3f47 141IDENTITY = lambda x: x
bf42a990 142
7105440c
YCH
143ENGLISH_MONTH_NAMES = [
144 'January', 'February', 'March', 'April', 'May', 'June',
145 'July', 'August', 'September', 'October', 'November', 'December']
146
f6717dec
S
147MONTH_NAMES = {
148 'en': ENGLISH_MONTH_NAMES,
149 'fr': [
3e4185c3
S
150 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
151 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
78545664 152 # these follow the genitive grammatical case (dopełniacz)
153 # some websites might be using nominative, which will require another month list
154 # https://en.wikibooks.org/wiki/Polish/Noun_cases
155 'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
156 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
f6717dec 157}
a942d6cb 158
8f53dc44 159# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
160TIMEZONE_NAMES = {
161 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
162 'AST': -4, 'ADT': -3, # Atlantic (used in Canada)
163 'EST': -5, 'EDT': -4, # Eastern
164 'CST': -6, 'CDT': -5, # Central
165 'MST': -7, 'MDT': -6, # Mountain
166 'PST': -8, 'PDT': -7 # Pacific
167}
168
c587cbb7 169# needed for sanitizing filenames in restricted mode
c8827027 170ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
fd35d8cd
JW
171 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
172 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 173
46f59e89
S
174DATE_FORMATS = (
175 '%d %B %Y',
176 '%d %b %Y',
177 '%B %d %Y',
cb655f34
S
178 '%B %dst %Y',
179 '%B %dnd %Y',
9d30c213 180 '%B %drd %Y',
cb655f34 181 '%B %dth %Y',
46f59e89 182 '%b %d %Y',
cb655f34
S
183 '%b %dst %Y',
184 '%b %dnd %Y',
9d30c213 185 '%b %drd %Y',
cb655f34 186 '%b %dth %Y',
46f59e89
S
187 '%b %dst %Y %I:%M',
188 '%b %dnd %Y %I:%M',
9d30c213 189 '%b %drd %Y %I:%M',
46f59e89
S
190 '%b %dth %Y %I:%M',
191 '%Y %m %d',
192 '%Y-%m-%d',
bccdbd22 193 '%Y.%m.%d.',
46f59e89 194 '%Y/%m/%d',
81c13222 195 '%Y/%m/%d %H:%M',
46f59e89 196 '%Y/%m/%d %H:%M:%S',
1931a55e
THD
197 '%Y%m%d%H%M',
198 '%Y%m%d%H%M%S',
4f3fa23e 199 '%Y%m%d',
0c1c6f4b 200 '%Y-%m-%d %H:%M',
46f59e89
S
201 '%Y-%m-%d %H:%M:%S',
202 '%Y-%m-%d %H:%M:%S.%f',
5014558a 203 '%Y-%m-%d %H:%M:%S:%f',
46f59e89
S
204 '%d.%m.%Y %H:%M',
205 '%d.%m.%Y %H.%M',
206 '%Y-%m-%dT%H:%M:%SZ',
207 '%Y-%m-%dT%H:%M:%S.%fZ',
208 '%Y-%m-%dT%H:%M:%S.%f0Z',
209 '%Y-%m-%dT%H:%M:%S',
210 '%Y-%m-%dT%H:%M:%S.%f',
211 '%Y-%m-%dT%H:%M',
c6eed6b8
S
212 '%b %d %Y at %H:%M',
213 '%b %d %Y at %H:%M:%S',
b555ae9b
S
214 '%B %d %Y at %H:%M',
215 '%B %d %Y at %H:%M:%S',
a63d9bd0 216 '%H:%M %d-%b-%Y',
46f59e89
S
217)
218
219DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
220DATE_FORMATS_DAY_FIRST.extend([
221 '%d-%m-%Y',
222 '%d.%m.%Y',
223 '%d.%m.%y',
224 '%d/%m/%Y',
225 '%d/%m/%y',
226 '%d/%m/%Y %H:%M:%S',
47304e07 227 '%d-%m-%Y %H:%M',
46f59e89
S
228])
229
230DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
231DATE_FORMATS_MONTH_FIRST.extend([
232 '%m-%d-%Y',
233 '%m.%d.%Y',
234 '%m/%d/%Y',
235 '%m/%d/%y',
236 '%m/%d/%Y %H:%M:%S',
237])
238
06b3fe29 239PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
0f60ba6e 240JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
06b3fe29 241
1d485a1a 242NUMBER_RE = r'\d+(?:\.\d+)?'
243
7105440c 244
0b9c08b4 245@functools.cache
d77c3dfd 246def preferredencoding():
59ae15a5 247 """Get preferred encoding.
d77c3dfd 248
59ae15a5
PH
249 Returns the best encoding scheme for the system, based on
250 locale.getpreferredencoding() and some further tweaks.
251 """
252 try:
253 pref = locale.getpreferredencoding()
28e614de 254 'TEST'.encode(pref)
70a1165b 255 except Exception:
59ae15a5 256 pref = 'UTF-8'
bae611f2 257
59ae15a5 258 return pref
d77c3dfd 259
f4bfd65f 260
181c8655 261def write_json_file(obj, fn):
1394646a 262 """ Encode obj as JSON and write it to fn, atomically if possible """
181c8655 263
cfb0511d 264 tf = tempfile.NamedTemporaryFile(
265 prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
266 suffix='.tmp', delete=False, mode='w', encoding='utf-8')
181c8655
PH
267
268 try:
269 with tf:
45d86abe 270 json.dump(obj, tf, ensure_ascii=False)
1394646a
IK
271 if sys.platform == 'win32':
272 # Need to remove existing file on Windows, else os.rename raises
273 # WindowsError or FileExistsError.
19a03940 274 with contextlib.suppress(OSError):
1394646a 275 os.unlink(fn)
19a03940 276 with contextlib.suppress(OSError):
9cd5f54e
R
277 mask = os.umask(0)
278 os.umask(mask)
279 os.chmod(tf.name, 0o666 & ~mask)
181c8655 280 os.rename(tf.name, fn)
70a1165b 281 except Exception:
19a03940 282 with contextlib.suppress(OSError):
181c8655 283 os.remove(tf.name)
181c8655
PH
284 raise
285
286
cfb0511d 287def find_xpath_attr(node, xpath, key, val=None):
288 """ Find the xpath xpath[@key=val] """
289 assert re.match(r'^[a-zA-Z_-]+$', key)
86e5f3ed 290 expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
cfb0511d 291 return node.find(expr)
59ae56fa 292
d7e66d39
JMF
293# On python2.6 the xml.etree.ElementTree.Element methods don't support
294# the namespace parameter
5f6a1245
JW
295
296
d7e66d39
JMF
297def xpath_with_ns(path, ns_map):
298 components = [c.split(':') for c in path.split('/')]
299 replaced = []
300 for c in components:
301 if len(c) == 1:
302 replaced.append(c[0])
303 else:
304 ns, tag = c
305 replaced.append('{%s}%s' % (ns_map[ns], tag))
306 return '/'.join(replaced)
307
d77c3dfd 308
a41fb80c 309def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
578c0745 310 def _find_xpath(xpath):
f9934b96 311 return node.find(xpath)
578c0745 312
14f25df2 313 if isinstance(xpath, str):
578c0745
S
314 n = _find_xpath(xpath)
315 else:
316 for xp in xpath:
317 n = _find_xpath(xp)
318 if n is not None:
319 break
d74bebd5 320
8e636da4 321 if n is None:
bf42a990
S
322 if default is not NO_DEFAULT:
323 return default
324 elif fatal:
bf0ff932
PH
325 name = xpath if name is None else name
326 raise ExtractorError('Could not find XML element %s' % name)
327 else:
328 return None
a41fb80c
S
329 return n
330
331
332def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
8e636da4
S
333 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
334 if n is None or n == default:
335 return n
336 if n.text is None:
337 if default is not NO_DEFAULT:
338 return default
339 elif fatal:
340 name = xpath if name is None else name
341 raise ExtractorError('Could not find XML element\'s text %s' % name)
342 else:
343 return None
344 return n.text
a41fb80c
S
345
346
347def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
348 n = find_xpath_attr(node, xpath, key)
349 if n is None:
350 if default is not NO_DEFAULT:
351 return default
352 elif fatal:
86e5f3ed 353 name = f'{xpath}[@{key}]' if name is None else name
a41fb80c
S
354 raise ExtractorError('Could not find XML attribute %s' % name)
355 else:
356 return None
357 return n.attrib[key]
bf0ff932
PH
358
359
c487cf00 360def get_element_by_id(id, html, **kwargs):
43e8fafd 361 """Return the content of the tag with the specified ID in the passed HTML document"""
c487cf00 362 return get_element_by_attribute('id', id, html, **kwargs)
43e8fafd 363
12ea2f30 364
c487cf00 365def get_element_html_by_id(id, html, **kwargs):
6f32a0b5 366 """Return the html of the tag with the specified ID in the passed HTML document"""
c487cf00 367 return get_element_html_by_attribute('id', id, html, **kwargs)
6f32a0b5
ZM
368
369
84c237fb 370def get_element_by_class(class_name, html):
2af12ad9
TC
371 """Return the content of the first tag with the specified class in the passed HTML document"""
372 retval = get_elements_by_class(class_name, html)
373 return retval[0] if retval else None
374
375
6f32a0b5
ZM
376def get_element_html_by_class(class_name, html):
377 """Return the html of the first tag with the specified class in the passed HTML document"""
378 retval = get_elements_html_by_class(class_name, html)
379 return retval[0] if retval else None
380
381
c487cf00 382def get_element_by_attribute(attribute, value, html, **kwargs):
383 retval = get_elements_by_attribute(attribute, value, html, **kwargs)
2af12ad9
TC
384 return retval[0] if retval else None
385
386
c487cf00 387def get_element_html_by_attribute(attribute, value, html, **kargs):
388 retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
6f32a0b5
ZM
389 return retval[0] if retval else None
390
391
c487cf00 392def get_elements_by_class(class_name, html, **kargs):
2af12ad9
TC
393 """Return the content of all tags with the specified class in the passed HTML document as a list"""
394 return get_elements_by_attribute(
64fa820c 395 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
84c237fb
YCH
396 html, escape_value=False)
397
398
6f32a0b5
ZM
399def get_elements_html_by_class(class_name, html):
400 """Return the html of all tags with the specified class in the passed HTML document as a list"""
401 return get_elements_html_by_attribute(
64fa820c 402 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
6f32a0b5
ZM
403 html, escape_value=False)
404
405
406def get_elements_by_attribute(*args, **kwargs):
43e8fafd 407 """Return the content of the tag with the specified attribute in the passed HTML document"""
6f32a0b5
ZM
408 return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
409
410
411def get_elements_html_by_attribute(*args, **kwargs):
412 """Return the html of the tag with the specified attribute in the passed HTML document"""
413 return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
414
415
4c9a1a3b 416def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
6f32a0b5
ZM
417 """
418 Return the text (content) and the html (whole) of the tag with the specified
419 attribute in the passed HTML document
420 """
c61473c1
M
421 if not value:
422 return
9e6dd238 423
86e5f3ed 424 quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
0254f162 425
84c237fb
YCH
426 value = re.escape(value) if escape_value else value
427
86e5f3ed 428 partial_element_re = rf'''(?x)
4c9a1a3b 429 <(?P<tag>{tag})
0254f162 430 (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
86e5f3ed 431 \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
432 '''
38285056 433
0254f162
ZM
434 for m in re.finditer(partial_element_re, html):
435 content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
a921f407 436
0254f162
ZM
437 yield (
438 unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
439 whole
440 )
a921f407 441
c5229f39 442
ac668111 443class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
6f32a0b5
ZM
444 """
445 HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
446 closing tag for the first opening tag it has encountered, and can be used
447 as a context manager
448 """
449
450 class HTMLBreakOnClosingTagException(Exception):
451 pass
452
453 def __init__(self):
454 self.tagstack = collections.deque()
ac668111 455 html.parser.HTMLParser.__init__(self)
6f32a0b5
ZM
456
457 def __enter__(self):
458 return self
459
460 def __exit__(self, *_):
461 self.close()
462
463 def close(self):
464 # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
465 # so data remains buffered; we no longer have any interest in it, thus
466 # override this method to discard it
467 pass
468
469 def handle_starttag(self, tag, _):
470 self.tagstack.append(tag)
471
472 def handle_endtag(self, tag):
473 if not self.tagstack:
474 raise compat_HTMLParseError('no tags in the stack')
475 while self.tagstack:
476 inner_tag = self.tagstack.pop()
477 if inner_tag == tag:
478 break
479 else:
480 raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
481 if not self.tagstack:
482 raise self.HTMLBreakOnClosingTagException()
483
484
46d09f87 485# XXX: This should be far less strict
6f32a0b5
ZM
486def get_element_text_and_html_by_tag(tag, html):
487 """
488 For the first element with the specified tag in the passed HTML document
489 return its' content (text) and the whole element (html)
490 """
491 def find_or_raise(haystack, needle, exc):
492 try:
493 return haystack.index(needle)
494 except ValueError:
495 raise exc
496 closing_tag = f'</{tag}>'
497 whole_start = find_or_raise(
498 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
499 content_start = find_or_raise(
500 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
501 content_start += whole_start + 1
502 with HTMLBreakOnClosingTagParser() as parser:
503 parser.feed(html[whole_start:content_start])
504 if not parser.tagstack or parser.tagstack[0] != tag:
505 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
506 offset = content_start
507 while offset < len(html):
508 next_closing_tag_start = find_or_raise(
509 html[offset:], closing_tag,
510 compat_HTMLParseError(f'closing {tag} tag not found'))
511 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
512 try:
513 parser.feed(html[offset:offset + next_closing_tag_end])
514 offset += next_closing_tag_end
515 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
516 return html[content_start:offset + next_closing_tag_start], \
517 html[whole_start:offset + next_closing_tag_end]
518 raise compat_HTMLParseError('unexpected end of html')
519
520
ac668111 521class HTMLAttributeParser(html.parser.HTMLParser):
8bb56eee 522 """Trivial HTML parser to gather the attributes for a single element"""
b6e0c7d2 523
8bb56eee 524 def __init__(self):
c5229f39 525 self.attrs = {}
ac668111 526 html.parser.HTMLParser.__init__(self)
8bb56eee
BF
527
528 def handle_starttag(self, tag, attrs):
529 self.attrs = dict(attrs)
7053aa3a 530 raise compat_HTMLParseError('done')
8bb56eee 531
c5229f39 532
ac668111 533class HTMLListAttrsParser(html.parser.HTMLParser):
73673ccf
FF
534 """HTML parser to gather the attributes for the elements of a list"""
535
536 def __init__(self):
ac668111 537 html.parser.HTMLParser.__init__(self)
73673ccf
FF
538 self.items = []
539 self._level = 0
540
541 def handle_starttag(self, tag, attrs):
542 if tag == 'li' and self._level == 0:
543 self.items.append(dict(attrs))
544 self._level += 1
545
546 def handle_endtag(self, tag):
547 self._level -= 1
548
549
8bb56eee
BF
550def extract_attributes(html_element):
551 """Given a string for an HTML element such as
552 <el
553 a="foo" B="bar" c="&98;az" d=boz
554 empty= noval entity="&amp;"
555 sq='"' dq="'"
556 >
557 Decode and return a dictionary of attributes.
558 {
559 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
560 'empty': '', 'noval': None, 'entity': '&',
561 'sq': '"', 'dq': '\''
562 }.
8bb56eee
BF
563 """
564 parser = HTMLAttributeParser()
19a03940 565 with contextlib.suppress(compat_HTMLParseError):
b4a3d461
S
566 parser.feed(html_element)
567 parser.close()
8bb56eee 568 return parser.attrs
9e6dd238 569
c5229f39 570
73673ccf
FF
571def parse_list(webpage):
572 """Given a string for an series of HTML <li> elements,
573 return a dictionary of their attributes"""
574 parser = HTMLListAttrsParser()
575 parser.feed(webpage)
576 parser.close()
577 return parser.items
578
579
9e6dd238 580def clean_html(html):
59ae15a5 581 """Clean an HTML snippet into a readable string"""
dd622d7c
PH
582
583 if html is None: # Convenience for sanitizing descriptions etc.
584 return html
585
49185227 586 html = re.sub(r'\s+', ' ', html)
587 html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
588 html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
59ae15a5
PH
589 # Strip html tags
590 html = re.sub('<.*?>', '', html)
591 # Replace html entities
592 html = unescapeHTML(html)
7decf895 593 return html.strip()
9e6dd238
FV
594
595
b7c47b74 596class LenientJSONDecoder(json.JSONDecoder):
597 def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
598 self.transform_source, self.ignore_extra = transform_source, ignore_extra
599 super().__init__(*args, **kwargs)
600
601 def decode(self, s):
602 if self.transform_source:
603 s = self.transform_source(s)
2fa669f7 604 try:
605 if self.ignore_extra:
606 return self.raw_decode(s.lstrip())[0]
607 return super().decode(s)
608 except json.JSONDecodeError as e:
609 if e.pos is not None:
610 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
611 raise
b7c47b74 612
613
d77c3dfd 614def sanitize_open(filename, open_mode):
59ae15a5
PH
615 """Try to open the given filename, and slightly tweak it if this fails.
616
617 Attempts to open the given filename. If this fails, it tries to change
618 the filename slightly, step by step, until it's either able to open it
619 or it fails and raises a final exception, like the standard open()
620 function.
621
622 It returns the tuple (stream, definitive_file_name).
623 """
0edb3e33 624 if filename == '-':
625 if sys.platform == 'win32':
626 import msvcrt
be5c1ae8 627
62b58c09 628 # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
daef7911 629 with contextlib.suppress(io.UnsupportedOperation):
630 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
0edb3e33 631 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
59ae15a5 632
0edb3e33 633 for attempt in range(2):
634 try:
635 try:
89737671 636 if sys.platform == 'win32':
b506289f 637 # FIXME: An exclusive lock also locks the file from being read.
638 # Since windows locks are mandatory, don't lock the file on windows (for now).
639 # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
89737671 640 raise LockingUnsupportedError()
0edb3e33 641 stream = locked_file(filename, open_mode, block=False).__enter__()
8a82af35 642 except OSError:
0edb3e33 643 stream = open(filename, open_mode)
8a82af35 644 return stream, filename
86e5f3ed 645 except OSError as err:
0edb3e33 646 if attempt or err.errno in (errno.EACCES,):
647 raise
648 old_filename, filename = filename, sanitize_path(filename)
649 if old_filename == filename:
650 raise
d77c3dfd
FV
651
652
653def timeconvert(timestr):
59ae15a5
PH
654 """Convert RFC 2822 defined time string into system timestamp"""
655 timestamp = None
656 timetuple = email.utils.parsedate_tz(timestr)
657 if timetuple is not None:
658 timestamp = email.utils.mktime_tz(timetuple)
659 return timestamp
1c469a94 660
5f6a1245 661
5c3895ff 662def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
59ae15a5 663 """Sanitizes a string so it could be used as part of a filename.
5c3895ff 664 @param restricted Use a stricter subset of allowed characters
665 @param is_id Whether this is an ID that should be kept unchanged if possible.
666 If unset, yt-dlp's new sanitization rules are in effect
59ae15a5 667 """
5c3895ff 668 if s == '':
669 return ''
670
59ae15a5 671 def replace_insane(char):
c587cbb7
AT
672 if restricted and char in ACCENT_CHARS:
673 return ACCENT_CHARS[char]
91dd88b9 674 elif not restricted and char == '\n':
5c3895ff 675 return '\0 '
989a01c2 676 elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
677 # Replace with their full-width unicode counterparts
678 return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
91dd88b9 679 elif char == '?' or ord(char) < 32 or ord(char) == 127:
59ae15a5
PH
680 return ''
681 elif char == '"':
682 return '' if restricted else '\''
683 elif char == ':':
5c3895ff 684 return '\0_\0-' if restricted else '\0 \0-'
59ae15a5 685 elif char in '\\/|*<>':
5c3895ff 686 return '\0_'
687 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
688 return '\0_'
59ae15a5
PH
689 return char
690
db4678e4 691 # Replace look-alike Unicode glyphs
692 if restricted and (is_id is NO_DEFAULT or not is_id):
989a01c2 693 s = unicodedata.normalize('NFKC', s)
5c3895ff 694 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
28e614de 695 result = ''.join(map(replace_insane, s))
5c3895ff 696 if is_id is NO_DEFAULT:
ae61d108 697 result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result) # Remove repeated substitute chars
698 STRIP_RE = r'(?:\0.|[ _-])*'
5c3895ff 699 result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
700 result = result.replace('\0', '') or '_'
701
796173d0
PH
702 if not is_id:
703 while '__' in result:
704 result = result.replace('__', '_')
705 result = result.strip('_')
706 # Common case of "Foreign band name - English song title"
707 if restricted and result.startswith('-_'):
708 result = result[2:]
5a42414b
PH
709 if result.startswith('-'):
710 result = '_' + result[len('-'):]
a7440261 711 result = result.lstrip('.')
796173d0
PH
712 if not result:
713 result = '_'
59ae15a5 714 return result
d77c3dfd 715
5f6a1245 716
c2934512 717def sanitize_path(s, force=False):
a2aaf4db 718 """Sanitizes and normalizes path on Windows"""
c2934512 719 if sys.platform == 'win32':
c4218ac3 720 force = False
c2934512 721 drive_or_unc, _ = os.path.splitdrive(s)
c2934512 722 elif force:
723 drive_or_unc = ''
724 else:
a2aaf4db 725 return s
c2934512 726
be531ef1
S
727 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
728 if drive_or_unc:
a2aaf4db
S
729 norm_path.pop(0)
730 sanitized_path = [
ec85ded8 731 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
a2aaf4db 732 for path_part in norm_path]
be531ef1
S
733 if drive_or_unc:
734 sanitized_path.insert(0, drive_or_unc + os.path.sep)
4abea8ca 735 elif force and s and s[0] == os.path.sep:
c4218ac3 736 sanitized_path.insert(0, os.path.sep)
a2aaf4db
S
737 return os.path.join(*sanitized_path)
738
739
8f97a15d 740def sanitize_url(url, *, scheme='http'):
befa4708
S
741 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
742 # the number of unwanted failures due to missing protocol
21633673 743 if url is None:
744 return
745 elif url.startswith('//'):
8f97a15d 746 return f'{scheme}:{url}'
befa4708
S
747 # Fix some common typos seen so far
748 COMMON_TYPOS = (
067aa17e 749 # https://github.com/ytdl-org/youtube-dl/issues/15649
befa4708
S
750 (r'^httpss://', r'https://'),
751 # https://bx1.be/lives/direct-tv/
752 (r'^rmtp([es]?)://', r'rtmp\1://'),
753 )
754 for mistake, fixup in COMMON_TYPOS:
755 if re.match(mistake, url):
756 return re.sub(mistake, fixup, url)
bc6b9bcd 757 return url
17bcc626
S
758
759
5435dcf9 760def extract_basic_auth(url):
14f25df2 761 parts = urllib.parse.urlsplit(url)
5435dcf9
HH
762 if parts.username is None:
763 return url, None
14f25df2 764 url = urllib.parse.urlunsplit(parts._replace(netloc=(
5435dcf9
HH
765 parts.hostname if parts.port is None
766 else '%s:%d' % (parts.hostname, parts.port))))
767 auth_payload = base64.b64encode(
0f06bcd7 768 ('%s:%s' % (parts.username, parts.password or '')).encode())
769 return url, f'Basic {auth_payload.decode()}'
5435dcf9
HH
770
771
67dda517 772def sanitized_Request(url, *args, **kwargs):
bc6b9bcd 773 url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
5435dcf9
HH
774 if auth_header is not None:
775 headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
776 headers['Authorization'] = auth_header
ac668111 777 return urllib.request.Request(url, *args, **kwargs)
67dda517
S
778
779
51098426 780def expand_path(s):
2fa669f7 781 """Expand shell variables and ~"""
51098426
S
782 return os.path.expandvars(compat_expanduser(s))
783
784
7e9a6125 785def orderedSet(iterable, *, lazy=False):
786 """Remove all duplicates from the input iterable"""
787 def _iter():
788 seen = [] # Do not use set since the items can be unhashable
789 for x in iterable:
790 if x not in seen:
791 seen.append(x)
792 yield x
793
794 return _iter() if lazy else list(_iter())
d77c3dfd 795
912b38b4 796
55b2f099 797def _htmlentity_transform(entity_with_semicolon):
4e408e47 798 """Transforms an HTML entity to a character."""
55b2f099
YCH
799 entity = entity_with_semicolon[:-1]
800
4e408e47 801 # Known non-numeric HTML entity
ac668111 802 if entity in html.entities.name2codepoint:
803 return chr(html.entities.name2codepoint[entity])
4e408e47 804
62b58c09
L
805 # TODO: HTML5 allows entities without a semicolon.
806 # E.g. '&Eacuteric' should be decoded as 'Éric'.
ac668111 807 if entity_with_semicolon in html.entities.html5:
808 return html.entities.html5[entity_with_semicolon]
55b2f099 809
91757b0f 810 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
4e408e47
PH
811 if mobj is not None:
812 numstr = mobj.group(1)
28e614de 813 if numstr.startswith('x'):
4e408e47 814 base = 16
28e614de 815 numstr = '0%s' % numstr
4e408e47
PH
816 else:
817 base = 10
067aa17e 818 # See https://github.com/ytdl-org/youtube-dl/issues/7518
19a03940 819 with contextlib.suppress(ValueError):
ac668111 820 return chr(int(numstr, base))
4e408e47
PH
821
822 # Unknown entity in name, return its literal representation
7a3f0c00 823 return '&%s;' % entity
4e408e47
PH
824
825
d77c3dfd 826def unescapeHTML(s):
912b38b4
PH
827 if s is None:
828 return None
19a03940 829 assert isinstance(s, str)
d77c3dfd 830
4e408e47 831 return re.sub(
95f3f7c2 832 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 833
8bf48f23 834
cdb19aa4 835def escapeHTML(text):
836 return (
837 text
838 .replace('&', '&amp;')
839 .replace('<', '&lt;')
840 .replace('>', '&gt;')
841 .replace('"', '&quot;')
842 .replace("'", '&#39;')
843 )
844
845
f5b1bca9 846def process_communicate_or_kill(p, *args, **kwargs):
da4db748 847 deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
848 f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
8a82af35 849 return Popen.communicate_or_kill(p, *args, **kwargs)
f5b1bca9 850
851
d3c93ec2 852class Popen(subprocess.Popen):
853 if sys.platform == 'win32':
854 _startupinfo = subprocess.STARTUPINFO()
855 _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
856 else:
857 _startupinfo = None
858
82ea226c
L
859 @staticmethod
860 def _fix_pyinstaller_ld_path(env):
861 """Restore LD_LIBRARY_PATH when using PyInstaller
862 Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
863 https://github.com/yt-dlp/yt-dlp/issues/4573
864 """
865 if not hasattr(sys, '_MEIPASS'):
866 return
867
868 def _fix(key):
869 orig = env.get(f'{key}_ORIG')
870 if orig is None:
871 env.pop(key, None)
872 else:
873 env[key] = orig
874
875 _fix('LD_LIBRARY_PATH') # Linux
876 _fix('DYLD_LIBRARY_PATH') # macOS
877
878 def __init__(self, *args, env=None, text=False, **kwargs):
879 if env is None:
880 env = os.environ.copy()
881 self._fix_pyinstaller_ld_path(env)
882
f0c9fb96 883 if text is True:
884 kwargs['universal_newlines'] = True # For 3.6 compatibility
885 kwargs.setdefault('encoding', 'utf-8')
886 kwargs.setdefault('errors', 'replace')
82ea226c 887 super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
d3c93ec2 888
889 def communicate_or_kill(self, *args, **kwargs):
8a82af35 890 try:
891 return self.communicate(*args, **kwargs)
892 except BaseException: # Including KeyboardInterrupt
f0c9fb96 893 self.kill(timeout=None)
8a82af35 894 raise
d3c93ec2 895
f0c9fb96 896 def kill(self, *, timeout=0):
897 super().kill()
898 if timeout != 0:
899 self.wait(timeout=timeout)
900
901 @classmethod
992dc6b4 902 def run(cls, *args, timeout=None, **kwargs):
f0c9fb96 903 with cls(*args, **kwargs) as proc:
914491b8 904 default = '' if proc.text_mode else b''
992dc6b4 905 stdout, stderr = proc.communicate_or_kill(timeout=timeout)
914491b8 906 return stdout or default, stderr or default, proc.returncode
f0c9fb96 907
d3c93ec2 908
aa49acd1
S
909def get_subprocess_encoding():
910 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
911 # For subprocess calls, encode with locale encoding
912 # Refer to http://stackoverflow.com/a/9951851/35070
913 encoding = preferredencoding()
914 else:
915 encoding = sys.getfilesystemencoding()
916 if encoding is None:
917 encoding = 'utf-8'
918 return encoding
919
920
8bf48f23 921def encodeFilename(s, for_subprocess=False):
19a03940 922 assert isinstance(s, str)
cfb0511d 923 return s
aa49acd1
S
924
925
926def decodeFilename(b, for_subprocess=False):
cfb0511d 927 return b
8bf48f23 928
f07b74fc
PH
929
930def encodeArgument(s):
cfb0511d 931 # Legacy code that uses byte strings
932 # Uncomment the following line after fixing all post processors
14f25df2 933 # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
cfb0511d 934 return s if isinstance(s, str) else s.decode('ascii')
f07b74fc
PH
935
936
aa49acd1 937def decodeArgument(b):
cfb0511d 938 return b
aa49acd1
S
939
940
8271226a
PH
941def decodeOption(optval):
942 if optval is None:
943 return optval
944 if isinstance(optval, bytes):
945 optval = optval.decode(preferredencoding())
946
14f25df2 947 assert isinstance(optval, str)
8271226a 948 return optval
1c256f70 949
5f6a1245 950
aa7785f8 951_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
952
953
954def timetuple_from_msec(msec):
955 secs, msec = divmod(msec, 1000)
956 mins, secs = divmod(secs, 60)
957 hrs, mins = divmod(mins, 60)
958 return _timetuple(hrs, mins, secs, msec)
959
960
cdb19aa4 961def formatSeconds(secs, delim=':', msec=False):
aa7785f8 962 time = timetuple_from_msec(secs * 1000)
963 if time.hours:
964 ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
965 elif time.minutes:
966 ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
4539dd30 967 else:
aa7785f8 968 ret = '%d' % time.seconds
969 return '%s.%03d' % (ret, time.milliseconds) if msec else ret
4539dd30 970
a0ddb8a2 971
77562778 972def _ssl_load_windows_store_certs(ssl_context, storename):
973 # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
974 try:
975 certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
976 if encoding == 'x509_asn' and (
977 trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
978 except PermissionError:
979 return
980 for cert in certs:
19a03940 981 with contextlib.suppress(ssl.SSLError):
77562778 982 ssl_context.load_verify_locations(cadata=cert)
a2366922 983
77562778 984
985def make_HTTPS_handler(params, **kwargs):
986 opts_check_certificate = not params.get('nocheckcertificate')
987 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
988 context.check_hostname = opts_check_certificate
f81c62a6 989 if params.get('legacyserverconnect'):
990 context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT
4f28b537 991 # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
992 context.set_ciphers('DEFAULT')
ac8e69dd
M
993 elif (
994 sys.version_info < (3, 10)
995 and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
996 and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
997 ):
5b9f253f
M
998 # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
999 # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
1000 # in some situations [2][3].
1001 # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
1002 # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
ac8e69dd 1003 # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
5b9f253f
M
1004 # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
1005 # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
1006 # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
1007 # 4. https://peps.python.org/pep-0644/
ac8e69dd
M
1008 # 5. https://peps.python.org/pep-0644/#libressl-support
1009 # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
5b9f253f
M
1010 context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
1011 context.minimum_version = ssl.TLSVersion.TLSv1_2
8a82af35 1012
77562778 1013 context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
1014 if opts_check_certificate:
d5820461 1015 if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
1016 context.load_verify_locations(cafile=certifi.where())
168bbc4f 1017 else:
1018 try:
1019 context.load_default_certs()
1020 # Work around the issue in load_default_certs when there are bad certificates. See:
1021 # https://github.com/yt-dlp/yt-dlp/issues/1060,
1022 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
1023 except ssl.SSLError:
1024 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
1025 if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
1026 for storename in ('CA', 'ROOT'):
1027 _ssl_load_windows_store_certs(context, storename)
1028 context.set_default_verify_paths()
8a82af35 1029
bb58c9ed 1030 client_certfile = params.get('client_certificate')
1031 if client_certfile:
1032 try:
1033 context.load_cert_chain(
1034 client_certfile, keyfile=params.get('client_certificate_key'),
1035 password=params.get('client_certificate_password'))
1036 except ssl.SSLError:
1037 raise YoutubeDLError('Unable to load client certificate')
2c6dcb65 1038
1039 # Some servers may reject requests if ALPN extension is not sent. See:
1040 # https://github.com/python/cpython/issues/85140
1041 # https://github.com/yt-dlp/yt-dlp/issues/3878
1042 with contextlib.suppress(NotImplementedError):
1043 context.set_alpn_protocols(['http/1.1'])
1044
77562778 1045 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 1046
732ea2f0 1047
5873d4cc 1048def bug_reports_message(before=';'):
57e0f077 1049 from .update import REPOSITORY
1050
1051 msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
1052 'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')
5873d4cc
F
1053
1054 before = before.rstrip()
1055 if not before or before.endswith(('.', '!', '?')):
1056 msg = msg[0].title() + msg[1:]
1057
1058 return (before + ' ' if before else '') + msg
08f2a92c
JMF
1059
1060
bf5b9d85
PM
1061class YoutubeDLError(Exception):
1062 """Base exception for YoutubeDL errors."""
aa9369a2 1063 msg = None
1064
1065 def __init__(self, msg=None):
1066 if msg is not None:
1067 self.msg = msg
1068 elif self.msg is None:
1069 self.msg = type(self).__name__
1070 super().__init__(self.msg)
bf5b9d85
PM
1071
1072
ac668111 1073network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
3158150c 1074if hasattr(ssl, 'CertificateError'):
1075 network_exceptions.append(ssl.CertificateError)
1076network_exceptions = tuple(network_exceptions)
1077
1078
bf5b9d85 1079class ExtractorError(YoutubeDLError):
1c256f70 1080 """Error during info extraction."""
5f6a1245 1081
1151c407 1082 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
9a82b238 1083 """ tb, if given, is the original traceback (so that it can be printed out).
7a5c1cfe 1084 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
9a82b238 1085 """
3158150c 1086 if sys.exc_info()[0] in network_exceptions:
9a82b238 1087 expected = True
d5979c5d 1088
7265a219 1089 self.orig_msg = str(msg)
1c256f70 1090 self.traceback = tb
1151c407 1091 self.expected = expected
2eabb802 1092 self.cause = cause
d11271dd 1093 self.video_id = video_id
1151c407 1094 self.ie = ie
1095 self.exc_info = sys.exc_info() # preserve original exception
5df14442 1096 if isinstance(self.exc_info[1], ExtractorError):
1097 self.exc_info = self.exc_info[1].exc_info
9bcfe33b 1098 super().__init__(self.__msg)
1151c407 1099
9bcfe33b 1100 @property
1101 def __msg(self):
1102 return ''.join((
1103 format_field(self.ie, None, '[%s] '),
1104 format_field(self.video_id, None, '%s: '),
1105 self.orig_msg,
1106 format_field(self.cause, None, ' (caused by %r)'),
1107 '' if self.expected else bug_reports_message()))
1c256f70 1108
01951dda 1109 def format_traceback(self):
497d2fab 1110 return join_nonempty(
1111 self.traceback and ''.join(traceback.format_tb(self.traceback)),
e491d06d 1112 self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
497d2fab 1113 delim='\n') or None
01951dda 1114
9bcfe33b 1115 def __setattr__(self, name, value):
1116 super().__setattr__(name, value)
1117 if getattr(self, 'msg', None) and name not in ('msg', 'args'):
1118 self.msg = self.__msg or type(self).__name__
1119 self.args = (self.msg, ) # Cannot be property
1120
1c256f70 1121
416c7fcb
PH
1122class UnsupportedError(ExtractorError):
1123 def __init__(self, url):
86e5f3ed 1124 super().__init__(
416c7fcb
PH
1125 'Unsupported URL: %s' % url, expected=True)
1126 self.url = url
1127
1128
55b3e45b
JMF
1129class RegexNotFoundError(ExtractorError):
1130 """Error when a regex didn't match"""
1131 pass
1132
1133
773f291d
S
1134class GeoRestrictedError(ExtractorError):
1135 """Geographic restriction Error exception.
1136
1137 This exception may be thrown when a video is not available from your
1138 geographic location due to geographic restrictions imposed by a website.
1139 """
b6e0c7d2 1140
0db3bae8 1141 def __init__(self, msg, countries=None, **kwargs):
1142 kwargs['expected'] = True
86e5f3ed 1143 super().__init__(msg, **kwargs)
773f291d
S
1144 self.countries = countries
1145
1146
693f0600 1147class UserNotLive(ExtractorError):
1148 """Error when a channel/user is not live"""
1149
1150 def __init__(self, msg=None, **kwargs):
1151 kwargs['expected'] = True
1152 super().__init__(msg or 'The channel is not currently live', **kwargs)
1153
1154
bf5b9d85 1155class DownloadError(YoutubeDLError):
59ae15a5 1156 """Download Error exception.
d77c3dfd 1157
59ae15a5
PH
1158 This exception may be thrown by FileDownloader objects if they are not
1159 configured to continue on errors. They will contain the appropriate
1160 error message.
1161 """
5f6a1245 1162
8cc83b8d
FV
1163 def __init__(self, msg, exc_info=None):
1164 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
86e5f3ed 1165 super().__init__(msg)
8cc83b8d 1166 self.exc_info = exc_info
d77c3dfd
FV
1167
1168
498f5606 1169class EntryNotInPlaylist(YoutubeDLError):
1170 """Entry not in playlist exception.
1171
1172 This exception will be thrown by YoutubeDL when a requested entry
1173 is not found in the playlist info_dict
1174 """
aa9369a2 1175 msg = 'Entry not found in info'
498f5606 1176
1177
bf5b9d85 1178class SameFileError(YoutubeDLError):
59ae15a5 1179 """Same File exception.
d77c3dfd 1180
59ae15a5
PH
1181 This exception will be thrown by FileDownloader objects if they detect
1182 multiple files would have to be downloaded to the same file on disk.
1183 """
aa9369a2 1184 msg = 'Fixed output name but more than one file to download'
1185
1186 def __init__(self, filename=None):
1187 if filename is not None:
1188 self.msg += f': {filename}'
1189 super().__init__(self.msg)
d77c3dfd
FV
1190
1191
bf5b9d85 1192class PostProcessingError(YoutubeDLError):
59ae15a5 1193 """Post Processing exception.
d77c3dfd 1194
59ae15a5
PH
1195 This exception may be raised by PostProcessor's .run() method to
1196 indicate an error in the postprocessing task.
1197 """
5f6a1245 1198
5f6a1245 1199
48f79687 1200class DownloadCancelled(YoutubeDLError):
1201 """ Exception raised when the download queue should be interrupted """
1202 msg = 'The download was cancelled'
8b0d7497 1203
8b0d7497 1204
48f79687 1205class ExistingVideoReached(DownloadCancelled):
1206 """ --break-on-existing triggered """
1207 msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1208
48f79687 1209
1210class RejectedVideoReached(DownloadCancelled):
1211 """ --break-on-reject triggered """
1212 msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
51d9739f 1213
1214
48f79687 1215class MaxDownloadsReached(DownloadCancelled):
59ae15a5 1216 """ --max-downloads limit has been reached. """
48f79687 1217 msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1218
1219
f2ebc5c7 1220class ReExtractInfo(YoutubeDLError):
1221 """ Video info needs to be re-extracted. """
1222
1223 def __init__(self, msg, expected=False):
1224 super().__init__(msg)
1225 self.expected = expected
1226
1227
1228class ThrottledDownload(ReExtractInfo):
48f79687 1229 """ Download speed below --throttled-rate. """
aa9369a2 1230 msg = 'The download speed is below throttle limit'
d77c3dfd 1231
43b22906 1232 def __init__(self):
1233 super().__init__(self.msg, expected=False)
f2ebc5c7 1234
d77c3dfd 1235
bf5b9d85 1236class UnavailableVideoError(YoutubeDLError):
59ae15a5 1237 """Unavailable Format exception.
d77c3dfd 1238
59ae15a5
PH
1239 This exception will be thrown when a video is requested
1240 in a format that is not available for that video.
1241 """
aa9369a2 1242 msg = 'Unable to download video'
1243
1244 def __init__(self, err=None):
1245 if err is not None:
1246 self.msg += f': {err}'
1247 super().__init__(self.msg)
d77c3dfd
FV
1248
1249
bf5b9d85 1250class ContentTooShortError(YoutubeDLError):
59ae15a5 1251 """Content Too Short exception.
d77c3dfd 1252
59ae15a5
PH
1253 This exception may be raised by FileDownloader objects when a file they
1254 download is too small for what the server announced first, indicating
1255 the connection was probably interrupted.
1256 """
d77c3dfd 1257
59ae15a5 1258 def __init__(self, downloaded, expected):
86e5f3ed 1259 super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
2c7ed247 1260 # Both in bytes
59ae15a5
PH
1261 self.downloaded = downloaded
1262 self.expected = expected
d77c3dfd 1263
5f6a1245 1264
bf5b9d85 1265class XAttrMetadataError(YoutubeDLError):
efa97bdc 1266 def __init__(self, code=None, msg='Unknown error'):
86e5f3ed 1267 super().__init__(msg)
efa97bdc 1268 self.code = code
bd264412 1269 self.msg = msg
efa97bdc
YCH
1270
1271 # Parsing code and msg
3089bc74 1272 if (self.code in (errno.ENOSPC, errno.EDQUOT)
a0566bbf 1273 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
efa97bdc
YCH
1274 self.reason = 'NO_SPACE'
1275 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1276 self.reason = 'VALUE_TOO_LONG'
1277 else:
1278 self.reason = 'NOT_SUPPORTED'
1279
1280
bf5b9d85 1281class XAttrUnavailableError(YoutubeDLError):
efa97bdc
YCH
1282 pass
1283
1284
c5a59d93 1285def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
f9934b96 1286 hc = http_class(*args, **kwargs)
be4a824d 1287 source_address = ydl_handler._params.get('source_address')
8959018a 1288
be4a824d 1289 if source_address is not None:
8959018a
AU
1290 # This is to workaround _create_connection() from socket where it will try all
1291 # address data from getaddrinfo() including IPv6. This filters the result from
1292 # getaddrinfo() based on the source_address value.
1293 # This is based on the cpython socket.create_connection() function.
1294 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1295 def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1296 host, port = address
1297 err = None
1298 addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
9e21e6d9
S
1299 af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1300 ip_addrs = [addr for addr in addrs if addr[0] == af]
1301 if addrs and not ip_addrs:
1302 ip_version = 'v4' if af == socket.AF_INET else 'v6'
86e5f3ed 1303 raise OSError(
9e21e6d9
S
1304 "No remote IP%s addresses available for connect, can't use '%s' as source address"
1305 % (ip_version, source_address[0]))
8959018a
AU
1306 for res in ip_addrs:
1307 af, socktype, proto, canonname, sa = res
1308 sock = None
1309 try:
1310 sock = socket.socket(af, socktype, proto)
1311 if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1312 sock.settimeout(timeout)
1313 sock.bind(source_address)
1314 sock.connect(sa)
1315 err = None # Explicitly break reference cycle
1316 return sock
86e5f3ed 1317 except OSError as _:
8959018a
AU
1318 err = _
1319 if sock is not None:
1320 sock.close()
1321 if err is not None:
1322 raise err
1323 else:
86e5f3ed 1324 raise OSError('getaddrinfo returns an empty list')
9e21e6d9
S
1325 if hasattr(hc, '_create_connection'):
1326 hc._create_connection = _create_connection
cfb0511d 1327 hc.source_address = (source_address, 0)
be4a824d
PH
1328
1329 return hc
1330
1331
87f0e62d 1332def handle_youtubedl_headers(headers):
992fc9d6
YCH
1333 filtered_headers = headers
1334
1335 if 'Youtubedl-no-compression' in filtered_headers:
86e5f3ed 1336 filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
87f0e62d 1337 del filtered_headers['Youtubedl-no-compression']
87f0e62d 1338
992fc9d6 1339 return filtered_headers
87f0e62d
YCH
1340
1341
ac668111 1342class YoutubeDLHandler(urllib.request.HTTPHandler):
59ae15a5
PH
1343 """Handler for HTTP requests and responses.
1344
1345 This class, when installed with an OpenerDirector, automatically adds
1346 the standard headers to every HTTP request and handles gzipped and
1347 deflated responses from web servers. If compression is to be avoided in
1348 a particular request, the original request in the program code only has
0424ec30 1349 to include the HTTP header "Youtubedl-no-compression", which will be
59ae15a5
PH
1350 removed before making the real request.
1351
1352 Part of this code was copied from:
1353
1354 http://techknack.net/python-urllib2-handlers/
1355
1356 Andrew Rowls, the author of that code, agreed to release it to the
1357 public domain.
1358 """
1359
be4a824d 1360 def __init__(self, params, *args, **kwargs):
ac668111 1361 urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
be4a824d
PH
1362 self._params = params
1363
1364 def http_open(self, req):
ac668111 1365 conn_class = http.client.HTTPConnection
71aff188
YCH
1366
1367 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1368 if socks_proxy:
1369 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1370 del req.headers['Ytdl-socks-proxy']
1371
be4a824d 1372 return self.do_open(functools.partial(
71aff188 1373 _create_http_connection, self, conn_class, False),
be4a824d
PH
1374 req)
1375
59ae15a5
PH
1376 @staticmethod
1377 def deflate(data):
fc2119f2 1378 if not data:
1379 return data
59ae15a5
PH
1380 try:
1381 return zlib.decompress(data, -zlib.MAX_WBITS)
1382 except zlib.error:
1383 return zlib.decompress(data)
1384
4390d5ec 1385 @staticmethod
1386 def brotli(data):
1387 if not data:
1388 return data
9b8ee23b 1389 return brotli.decompress(data)
4390d5ec 1390
acebc9cd 1391 def http_request(self, req):
51f267d9
S
1392 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1393 # always respected by websites, some tend to give out URLs with non percent-encoded
1394 # non-ASCII characters (see telemb.py, ard.py [#3412])
1395 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1396 # To work around aforementioned issue we will replace request's original URL with
1397 # percent-encoded one
1398 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1399 # the code of this workaround has been moved here from YoutubeDL.urlopen()
1400 url = req.get_full_url()
1401 url_escaped = escape_url(url)
1402
1403 # Substitute URL if any change after escaping
1404 if url != url_escaped:
15d260eb 1405 req = update_Request(req, url=url_escaped)
51f267d9 1406
8b7539d2 1407 for h, v in self._params.get('http_headers', std_headers).items():
3d5f7a39
JK
1408 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1409 # The dict keys are capitalized because of this bug by urllib
1410 if h.capitalize() not in req.headers:
33ac271b 1411 req.add_header(h, v)
87f0e62d 1412
af14914b 1413 if 'Accept-encoding' not in req.headers:
1414 req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
1415
87f0e62d 1416 req.headers = handle_youtubedl_headers(req.headers)
989b4b2b 1417
379a4f16 1418 return super().do_request_(req)
59ae15a5 1419
acebc9cd 1420 def http_response(self, req, resp):
59ae15a5
PH
1421 old_resp = resp
1422 # gzip
1423 if resp.headers.get('Content-encoding', '') == 'gzip':
aa3e9507
PH
1424 content = resp.read()
1425 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1426 try:
1427 uncompressed = io.BytesIO(gz.read())
86e5f3ed 1428 except OSError as original_ioerror:
aa3e9507
PH
1429 # There may be junk add the end of the file
1430 # See http://stackoverflow.com/q/4928560/35070 for details
1431 for i in range(1, 1024):
1432 try:
1433 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1434 uncompressed = io.BytesIO(gz.read())
86e5f3ed 1435 except OSError:
aa3e9507
PH
1436 continue
1437 break
1438 else:
1439 raise original_ioerror
ac668111 1440 resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 1441 resp.msg = old_resp.msg
c047270c 1442 del resp.headers['Content-encoding']
59ae15a5
PH
1443 # deflate
1444 if resp.headers.get('Content-encoding', '') == 'deflate':
1445 gz = io.BytesIO(self.deflate(resp.read()))
ac668111 1446 resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 1447 resp.msg = old_resp.msg
c047270c 1448 del resp.headers['Content-encoding']
4390d5ec 1449 # brotli
1450 if resp.headers.get('Content-encoding', '') == 'br':
ac668111 1451 resp = urllib.request.addinfourl(
4390d5ec 1452 io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
1453 resp.msg = old_resp.msg
1454 del resp.headers['Content-encoding']
ad729172 1455 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
067aa17e 1456 # https://github.com/ytdl-org/youtube-dl/issues/6457).
5a4d9ddb
S
1457 if 300 <= resp.code < 400:
1458 location = resp.headers.get('Location')
1459 if location:
1460 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
0f06bcd7 1461 location = location.encode('iso-8859-1').decode()
5a4d9ddb
S
1462 location_escaped = escape_url(location)
1463 if location != location_escaped:
1464 del resp.headers['Location']
1465 resp.headers['Location'] = location_escaped
59ae15a5 1466 return resp
0f8d03f8 1467
acebc9cd
PH
1468 https_request = http_request
1469 https_response = http_response
bf50b038 1470
5de90176 1471
71aff188
YCH
1472def make_socks_conn_class(base_class, socks_proxy):
1473 assert issubclass(base_class, (
ac668111 1474 http.client.HTTPConnection, http.client.HTTPSConnection))
71aff188 1475
14f25df2 1476 url_components = urllib.parse.urlparse(socks_proxy)
71aff188
YCH
1477 if url_components.scheme.lower() == 'socks5':
1478 socks_type = ProxyType.SOCKS5
1479 elif url_components.scheme.lower() in ('socks', 'socks4'):
1480 socks_type = ProxyType.SOCKS4
51fb4995
YCH
1481 elif url_components.scheme.lower() == 'socks4a':
1482 socks_type = ProxyType.SOCKS4A
71aff188 1483
cdd94c2e
YCH
1484 def unquote_if_non_empty(s):
1485 if not s:
1486 return s
ac668111 1487 return urllib.parse.unquote_plus(s)
cdd94c2e 1488
71aff188
YCH
1489 proxy_args = (
1490 socks_type,
1491 url_components.hostname, url_components.port or 1080,
1492 True, # Remote DNS
cdd94c2e
YCH
1493 unquote_if_non_empty(url_components.username),
1494 unquote_if_non_empty(url_components.password),
71aff188
YCH
1495 )
1496
1497 class SocksConnection(base_class):
1498 def connect(self):
1499 self.sock = sockssocket()
1500 self.sock.setproxy(*proxy_args)
19a03940 1501 if isinstance(self.timeout, (int, float)):
71aff188
YCH
1502 self.sock.settimeout(self.timeout)
1503 self.sock.connect((self.host, self.port))
1504
ac668111 1505 if isinstance(self, http.client.HTTPSConnection):
71aff188
YCH
1506 if hasattr(self, '_context'): # Python > 2.6
1507 self.sock = self._context.wrap_socket(
1508 self.sock, server_hostname=self.host)
1509 else:
1510 self.sock = ssl.wrap_socket(self.sock)
1511
1512 return SocksConnection
1513
1514
ac668111 1515class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
be4a824d 1516 def __init__(self, params, https_conn_class=None, *args, **kwargs):
ac668111 1517 urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
1518 self._https_conn_class = https_conn_class or http.client.HTTPSConnection
be4a824d
PH
1519 self._params = params
1520
1521 def https_open(self, req):
4f264c02 1522 kwargs = {}
71aff188
YCH
1523 conn_class = self._https_conn_class
1524
4f264c02
JMF
1525 if hasattr(self, '_context'): # python > 2.6
1526 kwargs['context'] = self._context
1527 if hasattr(self, '_check_hostname'): # python 3.x
1528 kwargs['check_hostname'] = self._check_hostname
71aff188
YCH
1529
1530 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1531 if socks_proxy:
1532 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1533 del req.headers['Ytdl-socks-proxy']
1534
4f28b537 1535 try:
1536 return self.do_open(
1537 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1538 except urllib.error.URLError as e:
1539 if (isinstance(e.reason, ssl.SSLError)
1540 and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1541 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1542 raise
be4a824d
PH
1543
1544
941e881e 1545def is_path_like(f):
1546 return isinstance(f, (str, bytes, os.PathLike))
1547
1548
ac668111 1549class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
f1a8511f
S
1550 """
1551 See [1] for cookie file format.
1552
1553 1. https://curl.haxx.se/docs/http-cookies.html
1554 """
e7e62441 1555 _HTTPONLY_PREFIX = '#HttpOnly_'
c380cc28
S
1556 _ENTRY_LEN = 7
1557 _HEADER = '''# Netscape HTTP Cookie File
7a5c1cfe 1558# This file is generated by yt-dlp. Do not edit.
c380cc28
S
1559
1560'''
1561 _CookieFileEntry = collections.namedtuple(
1562 'CookieFileEntry',
1563 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
e7e62441 1564
d76fa1f3 1565 def __init__(self, filename=None, *args, **kwargs):
1566 super().__init__(None, *args, **kwargs)
941e881e 1567 if is_path_like(filename):
d76fa1f3 1568 filename = os.fspath(filename)
1569 self.filename = filename
1570
24146491 1571 @staticmethod
1572 def _true_or_false(cndn):
1573 return 'TRUE' if cndn else 'FALSE'
1574
d76fa1f3 1575 @contextlib.contextmanager
1576 def open(self, file, *, write=False):
941e881e 1577 if is_path_like(file):
d76fa1f3 1578 with open(file, 'w' if write else 'r', encoding='utf-8') as f:
1579 yield f
1580 else:
1581 if write:
1582 file.truncate(0)
1583 yield file
1584
24146491 1585 def _really_save(self, f, ignore_discard=False, ignore_expires=False):
1586 now = time.time()
1587 for cookie in self:
1588 if (not ignore_discard and cookie.discard
1589 or not ignore_expires and cookie.is_expired(now)):
1590 continue
1591 name, value = cookie.name, cookie.value
1592 if value is None:
1593 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1594 # with no name, whereas http.cookiejar regards it as a
1595 # cookie with no value.
1596 name, value = '', name
1597 f.write('%s\n' % '\t'.join((
1598 cookie.domain,
1599 self._true_or_false(cookie.domain.startswith('.')),
1600 cookie.path,
1601 self._true_or_false(cookie.secure),
1602 str_or_none(cookie.expires, default=''),
1603 name, value
1604 )))
1605
1606 def save(self, filename=None, *args, **kwargs):
c380cc28
S
1607 """
1608 Save cookies to a file.
24146491 1609 Code is taken from CPython 3.6
1610 https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
c380cc28 1611
c380cc28
S
1612 if filename is None:
1613 if self.filename is not None:
1614 filename = self.filename
1615 else:
ac668111 1616 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
c380cc28 1617
24146491 1618 # Store session cookies with `expires` set to 0 instead of an empty string
1bab3437
S
1619 for cookie in self:
1620 if cookie.expires is None:
1621 cookie.expires = 0
c380cc28 1622
d76fa1f3 1623 with self.open(filename, write=True) as f:
c380cc28 1624 f.write(self._HEADER)
24146491 1625 self._really_save(f, *args, **kwargs)
1bab3437
S
1626
1627 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
e7e62441 1628 """Load cookies from a file."""
1629 if filename is None:
1630 if self.filename is not None:
1631 filename = self.filename
1632 else:
ac668111 1633 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
e7e62441 1634
c380cc28
S
1635 def prepare_line(line):
1636 if line.startswith(self._HTTPONLY_PREFIX):
1637 line = line[len(self._HTTPONLY_PREFIX):]
1638 # comments and empty lines are fine
1639 if line.startswith('#') or not line.strip():
1640 return line
1641 cookie_list = line.split('\t')
1642 if len(cookie_list) != self._ENTRY_LEN:
ac668111 1643 raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
c380cc28
S
1644 cookie = self._CookieFileEntry(*cookie_list)
1645 if cookie.expires_at and not cookie.expires_at.isdigit():
ac668111 1646 raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
c380cc28
S
1647 return line
1648
e7e62441 1649 cf = io.StringIO()
d76fa1f3 1650 with self.open(filename) as f:
e7e62441 1651 for line in f:
c380cc28
S
1652 try:
1653 cf.write(prepare_line(line))
ac668111 1654 except http.cookiejar.LoadError as e:
94aa0644 1655 if f'{line.strip()} '[0] in '[{"':
ac668111 1656 raise http.cookiejar.LoadError(
94aa0644 1657 'Cookies file must be Netscape formatted, not JSON. See '
17ffed18 1658 'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
19a03940 1659 write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
c380cc28 1660 continue
e7e62441 1661 cf.seek(0)
1662 self._really_load(cf, filename, ignore_discard, ignore_expires)
1bab3437
S
1663 # Session cookies are denoted by either the `expires` field set to
1664 # an empty string or to 0. MozillaCookieJar only recognizes the former
1665 # (see [1]), so we need to force the latter to be recognized as session
1666 # cookies on our own.
1667 # Session cookies may be important for cookies-based authentication,
1668 # e.g. usually, when the user does not check the 'Remember me' check box
1669 # while logging in on a site, some important cookies are stored as session
1670 # cookies, and failing to recognize them will result in a failed login.
1671 # 1. https://bugs.python.org/issue17164
1672 for cookie in self:
1673 # Treat `expires=0` cookies as session cookies
1674 if cookie.expires == 0:
1675 cookie.expires = None
1676 cookie.discard = True
1677
1678
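A usage sketch (illustrative only; 'cookies.txt' is a made-up file name): the jar reads and writes Netscape-format cookie files, preserving session cookies via the expires=0 convention described above.

    from yt_dlp.utils import YoutubeDLCookieJar

    jar = YoutubeDLCookieJar('cookies.txt')
    jar.save()                                           # writes the header (and any cookies) to cookies.txt
    jar.load(ignore_discard=True, ignore_expires=True)   # JSON cookie exports are rejected with http.cookiejar.LoadError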
ac668111 1679class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
a6420bf5 1680 def __init__(self, cookiejar=None):
ac668111 1681 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
a6420bf5
S
1682
1683 def http_response(self, request, response):
ac668111 1684 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
a6420bf5 1685
ac668111 1686 https_request = urllib.request.HTTPCookieProcessor.http_request
a6420bf5
S
1687 https_response = http_response
1688
1689
ac668111 1690class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
201c1459 1691 """YoutubeDL redirect handler
1692
1693 The code is based on HTTPRedirectHandler implementation from CPython [1].
1694
1695 This redirect handler solves two issues:
1696 - ensures redirect URL is always unicode under python 2
1697 - introduces support for the HTTP response status code
1698 308 Permanent Redirect [2] used by some sites [3]
1699
1700 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1701 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1702 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1703 """
1704
ac668111 1705 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
201c1459 1706
1707 def redirect_request(self, req, fp, code, msg, headers, newurl):
1708 """Return a Request or None in response to a redirect.
1709
1710 This is called by the http_error_30x methods when a
1711 redirection response is received. If a redirection should
1712 take place, return a new Request to allow http_error_30x to
1713 perform the redirect. Otherwise, raise HTTPError if no-one
1714 else should try to handle this url. Return None if you can't
1715 but another Handler might.
1716 """
1717 m = req.get_method()
1718 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1719 or code in (301, 302, 303) and m == "POST")):
14f25df2 1720 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
201c1459 1721 # Strictly (according to RFC 2616), 301 or 302 in response to
1722 # a POST MUST NOT cause a redirection without confirmation
1723 # from the user (of urllib.request, in this case). In practice,
1724 # essentially all clients do redirect in this case, so we do
1725 # the same.
1726
201c1459 1727 # Be conciliant with URIs containing a space. This is mainly
1728 # redundant with the more complete encoding done in http_error_302(),
1729 # but it is kept for compatibility with other callers.
1730 newurl = newurl.replace(' ', '%20')
1731
1732 CONTENT_HEADERS = ("content-length", "content-type")
1733 # Remove the content headers (Content-Length, Content-Type); the redirected request does not resend the original body
86e5f3ed 1734 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
afac4caa 1735
1736 # A 303 must either use GET or HEAD for subsequent request
1737 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1738 if code == 303 and m != 'HEAD':
1739 m = 'GET'
1740 # 301 and 302 redirects are commonly turned into a GET from a POST
1741 # for subsequent requests by browsers, so we'll do the same.
1742 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1743 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1744 if code in (301, 302) and m == 'POST':
1745 m = 'GET'
1746
ac668111 1747 return urllib.request.Request(
201c1459 1748 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
afac4caa 1749 unverifiable=True, method=m)
fca6dba8
S
1750
1751
46f59e89
S
1752def extract_timezone(date_str):
1753 m = re.search(
f137e4c2 1754 r'''(?x)
1755 ^.{8,}? # >=8 char non-TZ prefix, if present
1756 (?P<tz>Z| # just the UTC Z, or
1757 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1758 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1759 [ ]? # optional space
1760 (?P<sign>\+|-) # +/-
1761 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1762 $)
1763 ''', date_str)
46f59e89 1764 if not m:
8f53dc44 1765 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1766 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1767 if timezone is not None:
1768 date_str = date_str[:-len(m.group('tz'))]
1769 timezone = datetime.timedelta(hours=timezone or 0)
46f59e89
S
1770 else:
1771 date_str = date_str[:-len(m.group('tz'))]
1772 if not m.group('sign'):
1773 timezone = datetime.timedelta()
1774 else:
1775 sign = 1 if m.group('sign') == '+' else -1
1776 timezone = datetime.timedelta(
1777 hours=sign * int(m.group('hours')),
1778 minutes=sign * int(m.group('minutes')))
1779 return timezone, date_str
1780
1781
08b38d54 1782def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1783 """ Return a UNIX timestamp from the given date """
1784
1785 if date_str is None:
1786 return None
1787
52c3a6e4
S
1788 date_str = re.sub(r'\.[0-9]+', '', date_str)
1789
08b38d54 1790 if timezone is None:
46f59e89
S
1791 timezone, date_str = extract_timezone(date_str)
1792
19a03940 1793 with contextlib.suppress(ValueError):
86e5f3ed 1794 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
52c3a6e4
S
1795 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1796 return calendar.timegm(dt.timetuple())
912b38b4
PH
1797
1798
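A usage sketch (illustrative only; the timestamps are made-up examples, expected values follow from the code above):

    from yt_dlp.utils import parse_iso8601

    parse_iso8601('2014-03-23T23:04:26+0100')                  # 1395612266
    parse_iso8601('2014-03-23 23:04:26+0100', delimiter=' ')   # 1395612266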
46f59e89
S
1799def date_formats(day_first=True):
1800 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1801
1802
42bdd9d0 1803def unified_strdate(date_str, day_first=True):
bf50b038 1804 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1805
1806 if date_str is None:
1807 return None
bf50b038 1808 upload_date = None
5f6a1245 1809 # Replace commas
026fcc04 1810 date_str = date_str.replace(',', ' ')
42bdd9d0 1811 # Remove AM/PM + timezone
9bb8e0a3 1812 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1813 _, date_str = extract_timezone(date_str)
42bdd9d0 1814
46f59e89 1815 for expression in date_formats(day_first):
19a03940 1816 with contextlib.suppress(ValueError):
bf50b038 1817 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
42393ce2
PH
1818 if upload_date is None:
1819 timetuple = email.utils.parsedate_tz(date_str)
1820 if timetuple:
19a03940 1821 with contextlib.suppress(ValueError):
c6b9cf05 1822 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
6a750402 1823 if upload_date is not None:
14f25df2 1824 return str(upload_date)
bf50b038 1825
5f6a1245 1826
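A usage sketch (illustrative only; expected values assume the DATE_FORMATS tables defined earlier in this module):

    from yt_dlp.utils import unified_strdate

    unified_strdate('December 21, 2010')   # '20101221'
    unified_strdate('8/7/2009')            # '20090708' (day_first=True by default)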
46f59e89
S
1827def unified_timestamp(date_str, day_first=True):
1828 if date_str is None:
1829 return None
1830
8f53dc44 1831 date_str = re.sub(r'\s+', ' ', re.sub(
1832 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
46f59e89 1833
7dc2a74e 1834 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1835 timezone, date_str = extract_timezone(date_str)
1836
1837 # Remove AM/PM + timezone
1838 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1839
deef3195
S
1840 # Remove unrecognized timezones from ISO 8601 alike timestamps
1841 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1842 if m:
1843 date_str = date_str[:-len(m.group('tz'))]
1844
f226880c
PH
1845 # Python only supports microseconds, so remove nanoseconds
1846 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1847 if m:
1848 date_str = m.group(1)
1849
46f59e89 1850 for expression in date_formats(day_first):
19a03940 1851 with contextlib.suppress(ValueError):
7dc2a74e 1852 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89 1853 return calendar.timegm(dt.timetuple())
8f53dc44 1854
46f59e89
S
1855 timetuple = email.utils.parsedate_tz(date_str)
1856 if timetuple:
8f53dc44 1857 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
46f59e89
S
1858
1859
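A usage sketch (illustrative only; the expected value assumes the '%B %d %Y at %H:%M' entry in the DATE_FORMATS table defined earlier in this module):

    from yt_dlp.utils import unified_timestamp

    unified_timestamp('December 15, 2017 at 7:49 am')   # 1513324140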
28e614de 1860def determine_ext(url, default_ext='unknown_video'):
85750f89 1861 if url is None or '.' not in url:
f4776371 1862 return default_ext
9cb9a5df 1863 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1864 if re.match(r'^[A-Za-z0-9]+$', guess):
1865 return guess
a7aaa398
S
1866 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1867 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1868 return guess.rstrip('/')
73e79f2a 1869 else:
cbdbb766 1870 return default_ext
73e79f2a 1871
5f6a1245 1872
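A usage sketch (illustrative only; the URLs are made-up, and the second result assumes 'mp4' is in KNOWN_EXTENSIONS):

    from yt_dlp.utils import determine_ext

    determine_ext('http://example.com/video.mp4?download=1')    # 'mp4'
    determine_ext('http://example.com/foo/bar.mp4/?download')   # 'mp4'
    determine_ext('http://example.com/page')                    # 'unknown_video'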
824fa511
S
1873def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1874 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
d4051a8e 1875
5f6a1245 1876
9e62f283 1877def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
3d38b2d6 1878 R"""
1879 Return a datetime object from a string.
1880 Supported format:
1881 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1882
1883 @param format strftime format of DATE
1884 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1885 auto: round to the unit provided in date_str (if applicable).
9e62f283 1886 """
1887 auto_precision = False
1888 if precision == 'auto':
1889 auto_precision = True
1890 precision = 'microsecond'
396a76f7 1891 today = datetime_round(datetime.datetime.utcnow(), precision)
f8795e10 1892 if date_str in ('now', 'today'):
37254abc 1893 return today
f8795e10
PH
1894 if date_str == 'yesterday':
1895 return today - datetime.timedelta(days=1)
9e62f283 1896 match = re.match(
3d38b2d6 1897 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
9e62f283 1898 date_str)
37254abc 1899 if match is not None:
9e62f283 1900 start_time = datetime_from_str(match.group('start'), precision, format)
1901 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
37254abc 1902 unit = match.group('unit')
9e62f283 1903 if unit == 'month' or unit == 'year':
1904 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
37254abc 1905 unit = 'day'
9e62f283 1906 else:
1907 if unit == 'week':
1908 unit = 'day'
1909 time *= 7
1910 delta = datetime.timedelta(**{unit + 's': time})
1911 new_date = start_time + delta
1912 if auto_precision:
1913 return datetime_round(new_date, unit)
1914 return new_date
1915
1916 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1917
1918
d49f8db3 1919def date_from_str(date_str, format='%Y%m%d', strict=False):
3d38b2d6 1920 R"""
1921 Return a date object from a string using datetime_from_str
9e62f283 1922
3d38b2d6 1923 @param strict Restrict allowed patterns to "YYYYMMDD" and
1924 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
9e62f283 1925 """
3d38b2d6 1926 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1927 raise ValueError(f'Invalid date format "{date_str}"')
9e62f283 1928 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1929
1930
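A usage sketch (illustrative only) of the relative-date syntax accepted by datetime_from_str and date_from_str:

    from yt_dlp.utils import date_from_str, datetime_from_str

    date_from_str('20140321')        # datetime.date(2014, 3, 21)
    date_from_str('now-1week')       # the date 7 days ago
    datetime_from_str('now-3hours')  # 3 hours ago, rounded to the hour (precision='auto')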
1931def datetime_add_months(dt, months):
1932 """Increment/Decrement a datetime object by months."""
1933 month = dt.month + months - 1
1934 year = dt.year + month // 12
1935 month = month % 12 + 1
1936 day = min(dt.day, calendar.monthrange(year, month)[1])
1937 return dt.replace(year, month, day)
1938
1939
1940def datetime_round(dt, precision='day'):
1941 """
1942 Round a datetime object's time to a specific precision
1943 """
1944 if precision == 'microsecond':
1945 return dt
1946
1947 unit_seconds = {
1948 'day': 86400,
1949 'hour': 3600,
1950 'minute': 60,
1951 'second': 1,
1952 }
1953 roundto = lambda x, n: ((x + n / 2) // n) * n
1954 timestamp = calendar.timegm(dt.timetuple())
1955 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
5f6a1245
JW
1956
1957
e63fc1be 1958def hyphenate_date(date_str):
1959 """
1960 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1961 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1962 if match is not None:
1963 return '-'.join(match.groups())
1964 else:
1965 return date_str
1966
5f6a1245 1967
86e5f3ed 1968class DateRange:
bd558525 1969 """Represents a time interval between two dates"""
5f6a1245 1970
bd558525
JMF
1971 def __init__(self, start=None, end=None):
1972 """start and end must be strings in the format accepted by date"""
1973 if start is not None:
d49f8db3 1974 self.start = date_from_str(start, strict=True)
bd558525
JMF
1975 else:
1976 self.start = datetime.datetime.min.date()
1977 if end is not None:
d49f8db3 1978 self.end = date_from_str(end, strict=True)
bd558525
JMF
1979 else:
1980 self.end = datetime.datetime.max.date()
37254abc 1981 if self.start > self.end:
bd558525 1982 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
5f6a1245 1983
bd558525
JMF
1984 @classmethod
1985 def day(cls, day):
1986 """Returns a range that only contains the given day"""
5f6a1245
JW
1987 return cls(day, day)
1988
bd558525
JMF
1989 def __contains__(self, date):
1990 """Check if the date is in the range"""
37254abc
JMF
1991 if not isinstance(date, datetime.date):
1992 date = date_from_str(date)
1993 return self.start <= date <= self.end
5f6a1245 1994
bd558525 1995 def __str__(self):
86e5f3ed 1996 return f'{self.start.isoformat()} - {self.end.isoformat()}'
c496ca96 1997
f2df4071 1998 def __eq__(self, other):
1999 return (isinstance(other, DateRange)
2000 and self.start == other.start and self.end == other.end)
2001
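A usage sketch (illustrative only; the dates are made-up examples):

    from yt_dlp.utils import DateRange

    '20140115' in DateRange('20140101', '20140131')   # True
    '20140215' in DateRange('20140101', '20140131')   # False
    DateRange.day('20200229')                          # a range containing only that day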
c496ca96
PH
2002
2003def platform_name():
14f25df2 2004 """ Returns the platform name as a str """
da4db748 2005 deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead')
b1f94422 2006 return platform.platform()
c496ca96 2007
b1f94422 2008
2009@functools.cache
2010def system_identifier():
2011 python_implementation = platform.python_implementation()
2012 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
2013 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
dab284f8 2014 libc_ver = []
2015 with contextlib.suppress(OSError): # We may not have access to the executable
2016 libc_ver = platform.libc_ver()
b1f94422 2017
17fc3dc4 2018 return 'Python %s (%s %s %s) - %s (%s%s)' % (
b1f94422 2019 platform.python_version(),
2020 python_implementation,
17fc3dc4 2021 platform.machine(),
b1f94422 2022 platform.architecture()[0],
2023 platform.platform(),
5b9f253f
M
2024 ssl.OPENSSL_VERSION,
2025 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
b1f94422 2026 )
c257baff
PH
2027
2028
0b9c08b4 2029@functools.cache
49fa4d9a 2030def get_windows_version():
8a82af35 2031 ''' Get Windows version. Returns () if it's not running on Windows '''
49fa4d9a
N
2032 if compat_os_name == 'nt':
2033 return version_tuple(platform.win32_ver()[1])
2034 else:
8a82af35 2035 return ()
49fa4d9a
N
2036
2037
734f90bb 2038def write_string(s, out=None, encoding=None):
19a03940 2039 assert isinstance(s, str)
2040 out = out or sys.stderr
7459e3a2 2041
fe1daad3 2042 if compat_os_name == 'nt' and supports_terminal_sequences(out):
3fe75fdc 2043 s = re.sub(r'([\r\n]+)', r' \1', s)
59f943cd 2044
8a82af35 2045 enc, buffer = None, out
cfb0511d 2046 if 'b' in getattr(out, 'mode', ''):
c487cf00 2047 enc = encoding or preferredencoding()
104aa738 2048 elif hasattr(out, 'buffer'):
8a82af35 2049 buffer = out.buffer
104aa738 2050 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
c487cf00 2051
8a82af35 2052 buffer.write(s.encode(enc, 'ignore') if enc else s)
7459e3a2
PH
2053 out.flush()
2054
2055
da4db748 2056def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
2057 from . import _IN_CLI
2058 if _IN_CLI:
2059 if msg in deprecation_warning._cache:
2060 return
2061 deprecation_warning._cache.add(msg)
2062 if printer:
2063 return printer(f'{msg}{bug_reports_message()}', **kwargs)
2064 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
2065 else:
2066 import warnings
2067 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
2068
2069
2070deprecation_warning._cache = set()
2071
2072
48ea9cea
PH
2073def bytes_to_intlist(bs):
2074 if not bs:
2075 return []
2076 if isinstance(bs[0], int): # Python 3
2077 return list(bs)
2078 else:
2079 return [ord(c) for c in bs]
2080
c257baff 2081
cba892fa 2082def intlist_to_bytes(xs):
2083 if not xs:
2084 return b''
ac668111 2085 return struct.pack('%dB' % len(xs), *xs)
c38b1e77
PH
2086
2087
8a82af35 2088class LockingUnsupportedError(OSError):
1890fc63 2089 msg = 'File locking is not supported'
0edb3e33 2090
2091 def __init__(self):
2092 super().__init__(self.msg)
2093
2094
c1c9a79c
PH
2095# Cross-platform file locking
2096if sys.platform == 'win32':
fe0918bb 2097 import ctypes
c1c9a79c
PH
2098 import ctypes.wintypes
2099 import msvcrt
2100
2101 class OVERLAPPED(ctypes.Structure):
2102 _fields_ = [
2103 ('Internal', ctypes.wintypes.LPVOID),
2104 ('InternalHigh', ctypes.wintypes.LPVOID),
2105 ('Offset', ctypes.wintypes.DWORD),
2106 ('OffsetHigh', ctypes.wintypes.DWORD),
2107 ('hEvent', ctypes.wintypes.HANDLE),
2108 ]
2109
2110 kernel32 = ctypes.windll.kernel32
2111 LockFileEx = kernel32.LockFileEx
2112 LockFileEx.argtypes = [
2113 ctypes.wintypes.HANDLE, # hFile
2114 ctypes.wintypes.DWORD, # dwFlags
2115 ctypes.wintypes.DWORD, # dwReserved
2116 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2117 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2118 ctypes.POINTER(OVERLAPPED) # Overlapped
2119 ]
2120 LockFileEx.restype = ctypes.wintypes.BOOL
2121 UnlockFileEx = kernel32.UnlockFileEx
2122 UnlockFileEx.argtypes = [
2123 ctypes.wintypes.HANDLE, # hFile
2124 ctypes.wintypes.DWORD, # dwReserved
2125 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2126 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2127 ctypes.POINTER(OVERLAPPED) # Overlapped
2128 ]
2129 UnlockFileEx.restype = ctypes.wintypes.BOOL
2130 whole_low = 0xffffffff
2131 whole_high = 0x7fffffff
2132
747c0bd1 2133 def _lock_file(f, exclusive, block):
c1c9a79c
PH
2134 overlapped = OVERLAPPED()
2135 overlapped.Offset = 0
2136 overlapped.OffsetHigh = 0
2137 overlapped.hEvent = 0
2138 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
747c0bd1 2139
2140 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2141 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2142 0, whole_low, whole_high, f._lock_file_overlapped_p):
2cb19820 2143 # NB: The no-argument form of "ctypes.FormatError" does not work on PyPy
2144 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
c1c9a79c
PH
2145
2146 def _unlock_file(f):
2147 assert f._lock_file_overlapped_p
2148 handle = msvcrt.get_osfhandle(f.fileno())
747c0bd1 2149 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
c1c9a79c
PH
2150 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2151
2152else:
399a76e6
YCH
2153 try:
2154 import fcntl
c1c9a79c 2155
a3125791 2156 def _lock_file(f, exclusive, block):
b63837bc 2157 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2158 if not block:
2159 flags |= fcntl.LOCK_NB
acea8d7c 2160 try:
b63837bc 2161 fcntl.flock(f, flags)
acea8d7c
JK
2162 except BlockingIOError:
2163 raise
2164 except OSError: # AOSP does not have flock()
b63837bc 2165 fcntl.lockf(f, flags)
c1c9a79c 2166
399a76e6 2167 def _unlock_file(f):
acea8d7c
JK
2168 try:
2169 fcntl.flock(f, fcntl.LOCK_UN)
2170 except OSError:
2171 fcntl.lockf(f, fcntl.LOCK_UN)
a3125791 2172
399a76e6 2173 except ImportError:
399a76e6 2174
a3125791 2175 def _lock_file(f, exclusive, block):
0edb3e33 2176 raise LockingUnsupportedError()
399a76e6
YCH
2177
2178 def _unlock_file(f):
0edb3e33 2179 raise LockingUnsupportedError()
c1c9a79c
PH
2180
2181
86e5f3ed 2182class locked_file:
0edb3e33 2183 locked = False
747c0bd1 2184
a3125791 2185 def __init__(self, filename, mode, block=True, encoding=None):
fcfa8853
JK
2186 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2187 raise NotImplementedError(mode)
2188 self.mode, self.block = mode, block
2189
2190 writable = any(f in mode for f in 'wax+')
2191 readable = any(f in mode for f in 'r+')
2192 flags = functools.reduce(operator.ior, (
2193 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2194 getattr(os, 'O_BINARY', 0), # Windows only
2195 getattr(os, 'O_NOINHERIT', 0), # Windows only
2196 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2197 os.O_APPEND if 'a' in mode else 0,
2198 os.O_EXCL if 'x' in mode else 0,
2199 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2200 ))
2201
98804d03 2202 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
c1c9a79c
PH
2203
2204 def __enter__(self):
a3125791 2205 exclusive = 'r' not in self.mode
c1c9a79c 2206 try:
a3125791 2207 _lock_file(self.f, exclusive, self.block)
0edb3e33 2208 self.locked = True
86e5f3ed 2209 except OSError:
c1c9a79c
PH
2210 self.f.close()
2211 raise
fcfa8853 2212 if 'w' in self.mode:
131e14dc
JK
2213 try:
2214 self.f.truncate()
2215 except OSError as e:
1890fc63 2216 if e.errno not in (
2217 errno.ESPIPE, # Illegal seek - expected for FIFO
2218 errno.EINVAL, # Invalid argument - expected for /dev/null
2219 ):
2220 raise
c1c9a79c
PH
2221 return self
2222
0edb3e33 2223 def unlock(self):
2224 if not self.locked:
2225 return
c1c9a79c 2226 try:
0edb3e33 2227 _unlock_file(self.f)
c1c9a79c 2228 finally:
0edb3e33 2229 self.locked = False
c1c9a79c 2230
0edb3e33 2231 def __exit__(self, *_):
2232 try:
2233 self.unlock()
2234 finally:
2235 self.f.close()
4eb7f1d1 2236
0edb3e33 2237 open = __enter__
2238 close = __exit__
a3125791 2239
0edb3e33 2240 def __getattr__(self, attr):
2241 return getattr(self.f, attr)
a3125791 2242
0edb3e33 2243 def __iter__(self):
2244 return iter(self.f)
a3125791 2245
4eb7f1d1 2246
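A usage sketch (illustrative only; 'archive.txt' is a made-up file name): locked_file is a context manager that takes an exclusive lock for write modes and a shared lock for read modes.

    from yt_dlp.utils import locked_file

    with locked_file('archive.txt', 'a', block=True, encoding='utf-8') as f:
        f.write('youtube abc123\n')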
0b9c08b4 2247@functools.cache
4644ac55
S
2248def get_filesystem_encoding():
2249 encoding = sys.getfilesystemencoding()
2250 return encoding if encoding is not None else 'utf-8'
2251
2252
4eb7f1d1 2253def shell_quote(args):
a6a173c2 2254 quoted_args = []
4644ac55 2255 encoding = get_filesystem_encoding()
a6a173c2
JMF
2256 for a in args:
2257 if isinstance(a, bytes):
2258 # We may get a filename encoded with 'encodeFilename'
2259 a = a.decode(encoding)
aefce8e6 2260 quoted_args.append(compat_shlex_quote(a))
28e614de 2261 return ' '.join(quoted_args)
9d4660ca
PH
2262
2263
2264def smuggle_url(url, data):
2265 """ Pass additional data in a URL for internal use. """
2266
81953d1a
RA
2267 url, idata = unsmuggle_url(url, {})
2268 data.update(idata)
14f25df2 2269 sdata = urllib.parse.urlencode(
28e614de
PH
2270 {'__youtubedl_smuggle': json.dumps(data)})
2271 return url + '#' + sdata
9d4660ca
PH
2272
2273
79f82953 2274def unsmuggle_url(smug_url, default=None):
83e865a3 2275 if '#__youtubedl_smuggle' not in smug_url:
79f82953 2276 return smug_url, default
28e614de 2277 url, _, sdata = smug_url.rpartition('#')
14f25df2 2278 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
2279 data = json.loads(jsond)
2280 return url, data
02dbf93f
PH
2281
2282
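A usage sketch (illustrative only; the URL and payload are made-up examples) showing the round trip:

    from yt_dlp.utils import smuggle_url, unsmuggle_url

    url = smuggle_url('https://example.com/video', {'referer': 'https://example.com/'})
    # 'https://example.com/video#__youtubedl_smuggle=...'
    unsmuggle_url(url)   # ('https://example.com/video', {'referer': 'https://example.com/'})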
e0fd9573 2283def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2284 """ Formats numbers with decimal sufixes like K, M, etc """
2285 num, factor = float_or_none(num), float(factor)
4c3f8c3f 2286 if num is None or num < 0:
e0fd9573 2287 return None
eeb2a770 2288 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2289 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2290 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
abbeeebc 2291 if factor == 1024:
2292 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 2293 converted = num / (factor ** exponent)
abbeeebc 2294 return fmt % (converted, suffix)
e0fd9573 2295
2296
02dbf93f 2297def format_bytes(bytes):
f02d24d8 2298 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
f53c966a 2299
1c088fa8 2300
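A usage sketch (illustrative only; expected values follow from the code above):

    from yt_dlp.utils import format_bytes, format_decimal_suffix

    format_bytes(1024)                         # '1.00KiB'
    format_bytes(1536)                         # '1.50KiB'
    format_decimal_suffix(1300000, '%.1f%s')   # '1.3M'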
64c464a1 2301def lookup_unit_table(unit_table, s, strict=False):
2302 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
fb47597b 2303 units_re = '|'.join(re.escape(u) for u in unit_table)
64c464a1 2304 m = (re.fullmatch if strict else re.match)(
2305 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
fb47597b
S
2306 if not m:
2307 return None
64c464a1 2308
2309 num = float(m.group('num').replace(',', '.'))
fb47597b 2310 mult = unit_table[m.group('unit')]
64c464a1 2311 return round(num * mult)
2312
2313
2314def parse_bytes(s):
2315 """Parse a string indicating a byte quantity into an integer"""
2316 return lookup_unit_table(
2317 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
2318 s.upper(), strict=True)
fb47597b
S
2319
2320
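A usage sketch (illustrative only): parse_bytes accepts a number with an optional bare K/M/G/... suffix and interprets the suffix as a power of 1024.

    from yt_dlp.utils import parse_bytes

    parse_bytes('500K')   # 512000
    parse_bytes('1.5M')   # 1572864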
be64b5b0
PH
2321def parse_filesize(s):
2322 if s is None:
2323 return None
2324
dfb1b146 2325 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
2326 # but we support those too
2327 _UNIT_TABLE = {
2328 'B': 1,
2329 'b': 1,
70852b47 2330 'bytes': 1,
be64b5b0
PH
2331 'KiB': 1024,
2332 'KB': 1000,
2333 'kB': 1024,
2334 'Kb': 1000,
13585d76 2335 'kb': 1000,
70852b47
YCH
2336 'kilobytes': 1000,
2337 'kibibytes': 1024,
be64b5b0
PH
2338 'MiB': 1024 ** 2,
2339 'MB': 1000 ** 2,
2340 'mB': 1024 ** 2,
2341 'Mb': 1000 ** 2,
13585d76 2342 'mb': 1000 ** 2,
70852b47
YCH
2343 'megabytes': 1000 ** 2,
2344 'mebibytes': 1024 ** 2,
be64b5b0
PH
2345 'GiB': 1024 ** 3,
2346 'GB': 1000 ** 3,
2347 'gB': 1024 ** 3,
2348 'Gb': 1000 ** 3,
13585d76 2349 'gb': 1000 ** 3,
70852b47
YCH
2350 'gigabytes': 1000 ** 3,
2351 'gibibytes': 1024 ** 3,
be64b5b0
PH
2352 'TiB': 1024 ** 4,
2353 'TB': 1000 ** 4,
2354 'tB': 1024 ** 4,
2355 'Tb': 1000 ** 4,
13585d76 2356 'tb': 1000 ** 4,
70852b47
YCH
2357 'terabytes': 1000 ** 4,
2358 'tebibytes': 1024 ** 4,
be64b5b0
PH
2359 'PiB': 1024 ** 5,
2360 'PB': 1000 ** 5,
2361 'pB': 1024 ** 5,
2362 'Pb': 1000 ** 5,
13585d76 2363 'pb': 1000 ** 5,
70852b47
YCH
2364 'petabytes': 1000 ** 5,
2365 'pebibytes': 1024 ** 5,
be64b5b0
PH
2366 'EiB': 1024 ** 6,
2367 'EB': 1000 ** 6,
2368 'eB': 1024 ** 6,
2369 'Eb': 1000 ** 6,
13585d76 2370 'eb': 1000 ** 6,
70852b47
YCH
2371 'exabytes': 1000 ** 6,
2372 'exbibytes': 1024 ** 6,
be64b5b0
PH
2373 'ZiB': 1024 ** 7,
2374 'ZB': 1000 ** 7,
2375 'zB': 1024 ** 7,
2376 'Zb': 1000 ** 7,
13585d76 2377 'zb': 1000 ** 7,
70852b47
YCH
2378 'zettabytes': 1000 ** 7,
2379 'zebibytes': 1024 ** 7,
be64b5b0
PH
2380 'YiB': 1024 ** 8,
2381 'YB': 1000 ** 8,
2382 'yB': 1024 ** 8,
2383 'Yb': 1000 ** 8,
13585d76 2384 'yb': 1000 ** 8,
70852b47
YCH
2385 'yottabytes': 1000 ** 8,
2386 'yobibytes': 1024 ** 8,
be64b5b0
PH
2387 }
2388
fb47597b
S
2389 return lookup_unit_table(_UNIT_TABLE, s)
2390
2391
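A usage sketch (illustrative only; expected values follow from the unit table above):

    from yt_dlp.utils import parse_filesize

    parse_filesize('2 MiB')    # 2097152
    parse_filesize('1,5 GB')   # 1500000000 (a decimal comma is accepted)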
2392def parse_count(s):
2393 if s is None:
be64b5b0
PH
2394 return None
2395
352d5da8 2396 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
2397
2398 if re.match(r'^[\d,.]+$', s):
2399 return str_to_int(s)
2400
2401 _UNIT_TABLE = {
2402 'k': 1000,
2403 'K': 1000,
2404 'm': 1000 ** 2,
2405 'M': 1000 ** 2,
2406 'kk': 1000 ** 2,
2407 'KK': 1000 ** 2,
352d5da8 2408 'b': 1000 ** 3,
2409 'B': 1000 ** 3,
fb47597b 2410 }
be64b5b0 2411
352d5da8 2412 ret = lookup_unit_table(_UNIT_TABLE, s)
2413 if ret is not None:
2414 return ret
2415
2416 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2417 if mobj:
2418 return str_to_int(mobj.group(1))
be64b5b0 2419
2f7ae819 2420
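A usage sketch (illustrative only; the strings are made-up examples):

    from yt_dlp.utils import parse_count

    parse_count('1.8M likes')   # 1800000
    parse_count('100 views')    # 100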
5d45484c 2421def parse_resolution(s, *, lenient=False):
b871d7e9
S
2422 if s is None:
2423 return {}
2424
5d45484c
LNO
2425 if lenient:
2426 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2427 else:
2428 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
2429 if mobj:
2430 return {
2431 'width': int(mobj.group('w')),
2432 'height': int(mobj.group('h')),
2433 }
2434
17ec8bcf 2435 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
2436 if mobj:
2437 return {'height': int(mobj.group(1))}
2438
2439 mobj = re.search(r'\b([48])[kK]\b', s)
2440 if mobj:
2441 return {'height': int(mobj.group(1)) * 540}
2442
2443 return {}
2444
2445
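A usage sketch (illustrative only):

    from yt_dlp.utils import parse_resolution

    parse_resolution('1920x1080')   # {'width': 1920, 'height': 1080}
    parse_resolution('720p')        # {'height': 720}
    parse_resolution('4k')          # {'height': 2160}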
0dc41787 2446def parse_bitrate(s):
14f25df2 2447 if not isinstance(s, str):
0dc41787
S
2448 return
2449 mobj = re.search(r'\b(\d+)\s*kbps', s)
2450 if mobj:
2451 return int(mobj.group(1))
2452
2453
a942d6cb 2454def month_by_name(name, lang='en'):
caefb1de
PH
2455 """ Return the number of a month by (locale-independently) English name """
2456
f6717dec 2457 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 2458
caefb1de 2459 try:
f6717dec 2460 return month_names.index(name) + 1
7105440c
YCH
2461 except ValueError:
2462 return None
2463
2464
2465def month_by_abbreviation(abbrev):
2466 """ Return the number of a month by (locale-independently) English
2467 abbreviations """
2468
2469 try:
2470 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
2471 except ValueError:
2472 return None
18258362
JMF
2473
2474
5aafe895 2475def fix_xml_ampersands(xml_str):
18258362 2476 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
2477 return re.sub(
2478 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 2479 '&amp;',
5aafe895 2480 xml_str)
e3946f98
PH
2481
2482
2483def setproctitle(title):
14f25df2 2484 assert isinstance(title, str)
c1c05c67 2485
fe0918bb 2486 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2487 try:
2488 import ctypes
2489 except ImportError:
c1c05c67
YCH
2490 return
2491
e3946f98 2492 try:
611c1dd9 2493 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
2494 except OSError:
2495 return
2f49bcd6
RC
2496 except TypeError:
2497 # LoadLibrary in Windows Python 2.7.13 only expects
2498 # a bytestring, but since unicode_literals turns
2499 # every string into a unicode string, it fails.
2500 return
0f06bcd7 2501 title_bytes = title.encode()
6eefe533
PH
2502 buf = ctypes.create_string_buffer(len(title_bytes))
2503 buf.value = title_bytes
e3946f98 2504 try:
6eefe533 2505 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
2506 except AttributeError:
2507 return # Strange libc, just skip this
d7dda168
PH
2508
2509
2510def remove_start(s, start):
46bc9b7d 2511 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
2512
2513
2b9faf55 2514def remove_end(s, end):
46bc9b7d 2515 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
2516
2517
31b2051e
S
2518def remove_quotes(s):
2519 if s is None or len(s) < 2:
2520 return s
2521 for quote in ('"', "'", ):
2522 if s[0] == quote and s[-1] == quote:
2523 return s[1:-1]
2524 return s
2525
2526
b6e0c7d2 2527def get_domain(url):
ebf99aaf 2528 """
2529 This implementation is inconsistent, but is kept for compatibility.
2530 Use this only for "webpage_url_domain"
2531 """
2532 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
b6e0c7d2
U
2533
2534
29eb5174 2535def url_basename(url):
14f25df2 2536 path = urllib.parse.urlparse(url).path
28e614de 2537 return path.strip('/').split('/')[-1]
aa94a6d3
PH
2538
2539
02dc0a36 2540def base_url(url):
7657ec7e 2541 return re.match(r'https?://[^?#]+/', url).group()
02dc0a36
S
2542
2543
e34c3361 2544def urljoin(base, path):
4b5de77b 2545 if isinstance(path, bytes):
0f06bcd7 2546 path = path.decode()
14f25df2 2547 if not isinstance(path, str) or not path:
e34c3361 2548 return None
fad4ceb5 2549 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 2550 return path
4b5de77b 2551 if isinstance(base, bytes):
0f06bcd7 2552 base = base.decode()
14f25df2 2553 if not isinstance(base, str) or not re.match(
4b5de77b 2554 r'^(?:https?:)?//', base):
e34c3361 2555 return None
14f25df2 2556 return urllib.parse.urljoin(base, path)
e34c3361
S
2557
2558
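A usage sketch (illustrative only; the URLs are made-up examples):

    from yt_dlp.utils import url_basename, base_url, urljoin

    url_basename('https://example.com/a/b/video.mp4?x=1')   # 'video.mp4'
    base_url('https://example.com/a/b/video.mp4?x=1')        # 'https://example.com/a/b/'
    urljoin('https://example.com/a/', 'b/c.m3u8')            # 'https://example.com/a/b/c.m3u8'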
ac668111 2559class HEADRequest(urllib.request.Request):
aa94a6d3 2560 def get_method(self):
611c1dd9 2561 return 'HEAD'
7217e148
PH
2562
2563
ac668111 2564class PUTRequest(urllib.request.Request):
95cf60e8
S
2565 def get_method(self):
2566 return 'PUT'
2567
2568
9732d77e 2569def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
9e907ebd 2570 if get_attr and v is not None:
2571 v = getattr(v, get_attr, None)
1812afb7
S
2572 try:
2573 return int(v) * invscale // scale
31c49255 2574 except (ValueError, TypeError, OverflowError):
af98f8ff 2575 return default
9732d77e 2576
9572013d 2577
40a90862 2578def str_or_none(v, default=None):
14f25df2 2579 return default if v is None else str(v)
40a90862 2580
9732d77e
PH
2581
2582def str_to_int(int_str):
48d4681e 2583 """ A more relaxed version of int_or_none """
f9934b96 2584 if isinstance(int_str, int):
348c6bf1 2585 return int_str
14f25df2 2586 elif isinstance(int_str, str):
42db58ec
S
2587 int_str = re.sub(r'[,\.\+]', '', int_str)
2588 return int_or_none(int_str)
608d11f5
PH
2589
2590
9732d77e 2591def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
2592 if v is None:
2593 return default
2594 try:
2595 return float(v) * invscale / scale
5e1271c5 2596 except (ValueError, TypeError):
caf80631 2597 return default
43f775e4
PH
2598
2599
c7e327c4
S
2600def bool_or_none(v, default=None):
2601 return v if isinstance(v, bool) else default
2602
2603
53cd37ba 2604def strip_or_none(v, default=None):
14f25df2 2605 return v.strip() if isinstance(v, str) else default
b72b4431
S
2606
2607
af03000a 2608def url_or_none(url):
14f25df2 2609 if not url or not isinstance(url, str):
af03000a
S
2610 return None
2611 url = url.strip()
29f7c58a 2612 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2613
2614
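A usage sketch (illustrative only) of the lenient coercion helpers above:

    from yt_dlp.utils import int_or_none, str_to_int, url_or_none

    int_or_none('42')                        # 42
    int_or_none(None, default=0)             # 0
    str_to_int('1,234,567')                  # 1234567
    url_or_none('//cdn.example.com/v.mp4')   # '//cdn.example.com/v.mp4'
    url_or_none('example.com/v.mp4')         # None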
3e9b66d7 2615def request_to_url(req):
ac668111 2616 if isinstance(req, urllib.request.Request):
3e9b66d7
LNO
2617 return req.get_full_url()
2618 else:
2619 return req
2620
2621
e29663c6 2622def strftime_or_none(timestamp, date_format, default=None):
2623 datetime_object = None
2624 try:
f9934b96 2625 if isinstance(timestamp, (int, float)): # unix timestamp
d509c1f5 2626 # Using naive datetime here can break timestamp() in Windows
2627 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2628 datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
14f25df2 2629 elif isinstance(timestamp, str): # assume YYYYMMDD
e29663c6 2630 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
9665f15a 2631 date_format = re.sub( # Support %s on windows
2632 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
e29663c6 2633 return datetime_object.strftime(date_format)
2634 except (ValueError, TypeError, AttributeError):
2635 return default
2636
2637
608d11f5 2638def parse_duration(s):
f9934b96 2639 if not isinstance(s, str):
608d11f5 2640 return None
ca7b3246 2641 s = s.strip()
38d79fd1 2642 if not s:
2643 return None
ca7b3246 2644
acaff495 2645 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2646 m = re.match(r'''(?x)
2647 (?P<before_secs>
2648 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2649 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2650 (?P<ms>[.:][0-9]+)?Z?$
2651 ''', s)
acaff495 2652 if m:
8bd1c00b 2653 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2654 else:
2655 m = re.match(
056653bb
S
2656 r'''(?ix)(?:P?
2657 (?:
1c1b2f96 2658 [0-9]+\s*y(?:ears?)?,?\s*
056653bb
S
2659 )?
2660 (?:
1c1b2f96 2661 [0-9]+\s*m(?:onths?)?,?\s*
056653bb
S
2662 )?
2663 (?:
1c1b2f96 2664 [0-9]+\s*w(?:eeks?)?,?\s*
056653bb 2665 )?
8f4b58d7 2666 (?:
1c1b2f96 2667 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
8f4b58d7 2668 )?
056653bb 2669 T)?
acaff495 2670 (?:
1c1b2f96 2671 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
acaff495 2672 )?
2673 (?:
1c1b2f96 2674 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
acaff495 2675 )?
2676 (?:
2677 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2678 )?Z?$''', s)
acaff495 2679 if m:
2680 days, hours, mins, secs, ms = m.groups()
2681 else:
15846398 2682 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2683 if m:
2684 hours, mins = m.groups()
2685 else:
2686 return None
2687
acaff495 2688 if ms:
19a03940 2689 ms = ms.replace(':', '.')
2690 return sum(float(part or 0) * mult for part, mult in (
2691 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2692
2693
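A usage sketch (illustrative only; expected values follow from the regexes above):

    from yt_dlp.utils import parse_duration

    parse_duration('9:12:43')   # 33163.0
    parse_duration('3 min')     # 180.0
    parse_duration('PT1H30M')   # 5400.0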
e65e4c88 2694def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2695 name, real_ext = os.path.splitext(filename)
e65e4c88 2696 return (
86e5f3ed 2697 f'{name}.{ext}{real_ext}'
e65e4c88 2698 if not expected_real_ext or real_ext[1:] == expected_real_ext
86e5f3ed 2699 else f'{filename}.{ext}')
d70ad093
PH
2700
2701
b3ed15b7
S
2702def replace_extension(filename, ext, expected_real_ext=None):
2703 name, real_ext = os.path.splitext(filename)
86e5f3ed 2704 return '{}.{}'.format(
b3ed15b7
S
2705 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2706 ext)
2707
2708
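A usage sketch (illustrative only; the file names are made-up examples):

    from yt_dlp.utils import prepend_extension, replace_extension

    prepend_extension('video.mp4', 'temp')                             # 'video.temp.mp4'
    replace_extension('video.mp4', 'webm')                             # 'video.webm'
    prepend_extension('video.mp4', 'f137', expected_real_ext='webm')   # 'video.mp4.f137'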
d70ad093
PH
2709def check_executable(exe, args=[]):
2710 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2711 args can be a list of arguments for a short output (like -version) """
2712 try:
f0c9fb96 2713 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
d70ad093
PH
2714 except OSError:
2715 return False
2716 return exe
b7ab0590
PH
2717
2718
7aaf4cd2 2719def _get_exe_version_output(exe, args):
95807118 2720 try:
b64d04c1 2721 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2722 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2723 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
f0c9fb96 2724 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2725 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
95807118
PH
2726 except OSError:
2727 return False
f0c9fb96 2728 return stdout
cae97f65
PH
2729
2730
2731def detect_exe_version(output, version_re=None, unrecognized='present'):
14f25df2 2732 assert isinstance(output, str)
cae97f65
PH
2733 if version_re is None:
2734 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2735 m = re.search(version_re, output)
95807118
PH
2736 if m:
2737 return m.group(1)
2738 else:
2739 return unrecognized
2740
2741
9af98e17 2742def get_exe_version(exe, args=['--version'],
2743 version_re=None, unrecognized='present'):
2744 """ Returns the version of the specified executable,
2745 or False if the executable is not present """
2746 out = _get_exe_version_output(exe, args)
2747 return detect_exe_version(out, version_re, unrecognized) if out else False
2748
2749
7e88d7d7 2750def frange(start=0, stop=None, step=1):
2751 """Float range"""
2752 if stop is None:
2753 start, stop = 0, start
2754 sign = [-1, 1][step > 0] if step else 0
2755 while sign * start < sign * stop:
2756 yield start
2757 start += step
2758
2759
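A usage sketch (illustrative only):

    from yt_dlp.utils import frange

    list(frange(5))            # [0, 1, 2, 3, 4]
    list(frange(0, 1, 0.25))   # [0, 0.25, 0.5, 0.75]
    list(frange(3, 0, -1))     # [3, 2, 1]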
cb89cfc1 2760class LazyList(collections.abc.Sequence):
0f06bcd7 2761 """Lazy immutable list from an iterable
2762 Note that slices of a LazyList are lists and not LazyList"""
483336e7 2763
8e5fecc8 2764 class IndexError(IndexError):
2765 pass
2766
282f5709 2767 def __init__(self, iterable, *, reverse=False, _cache=None):
0f06bcd7 2768 self._iterable = iter(iterable)
2769 self._cache = [] if _cache is None else _cache
2770 self._reversed = reverse
483336e7 2771
2772 def __iter__(self):
0f06bcd7 2773 if self._reversed:
28419ca2 2774 # We need to consume the entire iterable to iterate in reverse
981052c9 2775 yield from self.exhaust()
28419ca2 2776 return
0f06bcd7 2777 yield from self._cache
2778 for item in self._iterable:
2779 self._cache.append(item)
483336e7 2780 yield item
2781
0f06bcd7 2782 def _exhaust(self):
2783 self._cache.extend(self._iterable)
2784 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2785 return self._cache
28419ca2 2786
981052c9 2787 def exhaust(self):
0f06bcd7 2788 """Evaluate the entire iterable"""
2789 return self._exhaust()[::-1 if self._reversed else 1]
981052c9 2790
28419ca2 2791 @staticmethod
0f06bcd7 2792 def _reverse_index(x):
f2df4071 2793 return None if x is None else ~x
483336e7 2794
2795 def __getitem__(self, idx):
2796 if isinstance(idx, slice):
0f06bcd7 2797 if self._reversed:
2798 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
e0f2b4b4 2799 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2800 elif isinstance(idx, int):
0f06bcd7 2801 if self._reversed:
2802 idx = self._reverse_index(idx)
e0f2b4b4 2803 start, stop, step = idx, idx, 0
483336e7 2804 else:
2805 raise TypeError('indices must be integers or slices')
e0f2b4b4 2806 if ((start or 0) < 0 or (stop or 0) < 0
2807 or (start is None and step < 0)
2808 or (stop is None and step > 0)):
483336e7 2809 # We need to consume the entire iterable to be able to slice from the end
2810 # Obviously, never use this with infinite iterables
0f06bcd7 2811 self._exhaust()
8e5fecc8 2812 try:
0f06bcd7 2813 return self._cache[idx]
8e5fecc8 2814 except IndexError as e:
2815 raise self.IndexError(e) from e
0f06bcd7 2816 n = max(start or 0, stop or 0) - len(self._cache) + 1
28419ca2 2817 if n > 0:
0f06bcd7 2818 self._cache.extend(itertools.islice(self._iterable, n))
8e5fecc8 2819 try:
0f06bcd7 2820 return self._cache[idx]
8e5fecc8 2821 except IndexError as e:
2822 raise self.IndexError(e) from e
483336e7 2823
2824 def __bool__(self):
2825 try:
0f06bcd7 2826 self[-1] if self._reversed else self[0]
8e5fecc8 2827 except self.IndexError:
483336e7 2828 return False
2829 return True
2830
2831 def __len__(self):
0f06bcd7 2832 self._exhaust()
2833 return len(self._cache)
483336e7 2834
282f5709 2835 def __reversed__(self):
0f06bcd7 2836 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
282f5709 2837
2838 def __copy__(self):
0f06bcd7 2839 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
282f5709 2840
28419ca2 2841 def __repr__(self):
2842 # repr and str should mimic a list. So we exhaust the iterable
2843 return repr(self.exhaust())
2844
2845 def __str__(self):
2846 return repr(self.exhaust())
2847
483336e7 2848
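A usage sketch (illustrative only): LazyList pulls items from the underlying iterator only as far as the requested index or slice requires.

    import itertools
    from yt_dlp.utils import LazyList

    ll = LazyList(itertools.count())
    ll[:5]    # [0, 1, 2, 3, 4] - only the first few items are consumed
    ll[10]    # 10 - the cache is extended on demand
    list(LazyList(range(5), reverse=True))   # [4, 3, 2, 1, 0]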
7be9ccff 2849class PagedList:
c07a39ae 2850
2851 class IndexError(IndexError):
2852 pass
2853
dd26ced1
PH
2854 def __len__(self):
2855 # This is only useful for tests
2856 return len(self.getslice())
2857
7be9ccff 2858 def __init__(self, pagefunc, pagesize, use_cache=True):
2859 self._pagefunc = pagefunc
2860 self._pagesize = pagesize
f1d13090 2861 self._pagecount = float('inf')
7be9ccff 2862 self._use_cache = use_cache
2863 self._cache = {}
2864
2865 def getpage(self, pagenum):
d8cf8d97 2866 page_results = self._cache.get(pagenum)
2867 if page_results is None:
f1d13090 2868 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
7be9ccff 2869 if self._use_cache:
2870 self._cache[pagenum] = page_results
2871 return page_results
2872
2873 def getslice(self, start=0, end=None):
2874 return list(self._getslice(start, end))
2875
2876 def _getslice(self, start, end):
55575225 2877 raise NotImplementedError('This method must be implemented by subclasses')
2878
2879 def __getitem__(self, idx):
f1d13090 2880 assert self._use_cache, 'Indexing PagedList requires cache'
55575225 2881 if not isinstance(idx, int) or idx < 0:
2882 raise TypeError('indices must be non-negative integers')
2883 entries = self.getslice(idx, idx + 1)
d8cf8d97 2884 if not entries:
c07a39ae 2885 raise self.IndexError()
d8cf8d97 2886 return entries[0]
55575225 2887
9c44d242
PH
2888
2889class OnDemandPagedList(PagedList):
a44ca5a4 2890 """Download pages until a page with less than maximum results"""
86e5f3ed 2891
7be9ccff 2892 def _getslice(self, start, end):
b7ab0590
PH
2893 for pagenum in itertools.count(start // self._pagesize):
2894 firstid = pagenum * self._pagesize
2895 nextfirstid = pagenum * self._pagesize + self._pagesize
2896 if start >= nextfirstid:
2897 continue
2898
b7ab0590
PH
2899 startv = (
2900 start % self._pagesize
2901 if firstid <= start < nextfirstid
2902 else 0)
b7ab0590
PH
2903 endv = (
2904 ((end - 1) % self._pagesize) + 1
2905 if (end is not None and firstid <= end <= nextfirstid)
2906 else None)
2907
f1d13090 2908 try:
2909 page_results = self.getpage(pagenum)
2910 except Exception:
2911 self._pagecount = pagenum - 1
2912 raise
b7ab0590
PH
2913 if startv != 0 or endv is not None:
2914 page_results = page_results[startv:endv]
7be9ccff 2915 yield from page_results
b7ab0590
PH
2916
2917 # A little optimization - if the current page is not "full", i.e. does
2918 # not contain page_size videos, then we can assume that this page
2919 # is the last one - there are no more ids on further pages -
2920 # i.e. no need to query again.
2921 if len(page_results) + startv < self._pagesize:
2922 break
2923
2924 # If we got the whole page, but the next page is not interesting,
2925 # break out early as well
2926 if end == nextfirstid:
2927 break
81c2f20b
PH
2928
2929
9c44d242 2930class InAdvancePagedList(PagedList):
a44ca5a4 2931 """PagedList with total number of pages known in advance"""
86e5f3ed 2932
9c44d242 2933 def __init__(self, pagefunc, pagecount, pagesize):
7be9ccff 2934 PagedList.__init__(self, pagefunc, pagesize, True)
f1d13090 2935 self._pagecount = pagecount
9c44d242 2936
7be9ccff 2937 def _getslice(self, start, end):
9c44d242 2938 start_page = start // self._pagesize
d37707bd 2939 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
9c44d242
PH
2940 skip_elems = start - start_page * self._pagesize
2941 only_more = None if end is None else end - start
2942 for pagenum in range(start_page, end_page):
7be9ccff 2943 page_results = self.getpage(pagenum)
9c44d242 2944 if skip_elems:
7be9ccff 2945 page_results = page_results[skip_elems:]
9c44d242
PH
2946 skip_elems = None
2947 if only_more is not None:
7be9ccff 2948 if len(page_results) < only_more:
2949 only_more -= len(page_results)
9c44d242 2950 else:
7be9ccff 2951 yield from page_results[:only_more]
9c44d242 2952 break
7be9ccff 2953 yield from page_results
9c44d242
PH
2954
2955
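A usage sketch (illustrative only; the page function below is a made-up example yielding 3 items per page):

    from yt_dlp.utils import OnDemandPagedList

    pl = OnDemandPagedList(lambda pagenum: range(pagenum * 3, pagenum * 3 + 3), 3)
    pl[4]               # 4
    pl.getslice(2, 5)   # [2, 3, 4]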
7e88d7d7 2956class PlaylistEntries:
2957 MissingEntry = object()
2958 is_exhausted = False
2959
2960 def __init__(self, ydl, info_dict):
7e9a6125 2961 self.ydl = ydl
2962
2963 # _entries must be assigned now since infodict can change during iteration
2964 entries = info_dict.get('entries')
2965 if entries is None:
2966 raise EntryNotInPlaylist('There are no entries')
2967 elif isinstance(entries, list):
2968 self.is_exhausted = True
2969
2970 requested_entries = info_dict.get('requested_entries')
bc5c2f8a 2971 self.is_incomplete = requested_entries is not None
7e9a6125 2972 if self.is_incomplete:
2973 assert self.is_exhausted
bc5c2f8a 2974 self._entries = [self.MissingEntry] * max(requested_entries or [0])
7e9a6125 2975 for i, entry in zip(requested_entries, entries):
2976 self._entries[i - 1] = entry
2977 elif isinstance(entries, (list, PagedList, LazyList)):
2978 self._entries = entries
2979 else:
2980 self._entries = LazyList(entries)
7e88d7d7 2981
2982 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2983 (?P<start>[+-]?\d+)?
2984 (?P<range>[:-]
2985 (?P<end>[+-]?\d+|inf(?:inite)?)?
2986 (?::(?P<step>[+-]?\d+))?
2987 )?''')
2988
2989 @classmethod
2990 def parse_playlist_items(cls, string):
2991 for segment in string.split(','):
2992 if not segment:
2993 raise ValueError('There are two or more consecutive commas')
2994 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2995 if not mobj:
2996 raise ValueError(f'{segment!r} is not a valid specification')
2997 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2998 if int_or_none(step) == 0:
2999 raise ValueError(f'Step in {segment!r} cannot be zero')
3000 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
3001
3002 def get_requested_items(self):
3003 playlist_items = self.ydl.params.get('playlist_items')
3004 playlist_start = self.ydl.params.get('playliststart', 1)
3005 playlist_end = self.ydl.params.get('playlistend')
3006 # For backwards compatibility, interpret -1 as whole list
3007 if playlist_end in (-1, None):
3008 playlist_end = ''
3009 if not playlist_items:
3010 playlist_items = f'{playlist_start}:{playlist_end}'
3011 elif playlist_start != 1 or playlist_end:
3012 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
3013
3014 for index in self.parse_playlist_items(playlist_items):
3015 for i, entry in self[index]:
3016 yield i, entry
1ac4fd80 3017 if not entry:
3018 continue
7e88d7d7 3019 try:
3020 # TODO: Add auto-generated fields
3021 self.ydl._match_entry(entry, incomplete=True, silent=True)
3022 except (ExistingVideoReached, RejectedVideoReached):
3023 return
3024
7e9a6125 3025 def get_full_count(self):
3026 if self.is_exhausted and not self.is_incomplete:
7e88d7d7 3027 return len(self)
3028 elif isinstance(self._entries, InAdvancePagedList):
3029 if self._entries._pagesize == 1:
3030 return self._entries._pagecount
3031
7e88d7d7 3032 @functools.cached_property
3033 def _getter(self):
3034 if isinstance(self._entries, list):
3035 def get_entry(i):
3036 try:
3037 entry = self._entries[i]
3038 except IndexError:
3039 entry = self.MissingEntry
3040 if not self.is_incomplete:
3041 raise self.IndexError()
3042 if entry is self.MissingEntry:
bc5c2f8a 3043 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
7e88d7d7 3044 return entry
3045 else:
3046 def get_entry(i):
3047 try:
3048 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
3049 except (LazyList.IndexError, PagedList.IndexError):
3050 raise self.IndexError()
3051 return get_entry
3052
3053 def __getitem__(self, idx):
3054 if isinstance(idx, int):
3055 idx = slice(idx, idx)
3056
3057 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
3058 step = 1 if idx.step is None else idx.step
3059 if idx.start is None:
3060 start = 0 if step > 0 else len(self) - 1
3061 else:
3062 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
3063
3064 # NB: Do not call len(self) when idx == [:]
3065 if idx.stop is None:
3066 stop = 0 if step < 0 else float('inf')
3067 else:
3068 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
3069 stop += [-1, 1][step > 0]
3070
3071 for i in frange(start, stop, step):
3072 if i < 0:
3073 continue
3074 try:
7e9a6125 3075 entry = self._getter(i)
3076 except self.IndexError:
3077 self.is_exhausted = True
3078 if step > 0:
7e88d7d7 3079 break
7e9a6125 3080 continue
7e88d7d7 3081 yield i + 1, entry
3082
3083 def __len__(self):
3084 return len(tuple(self[:]))
3085
3086 class IndexError(IndexError):
3087 pass
3088
3089
81c2f20b 3090def uppercase_escape(s):
676eb3f2 3091 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 3092 return re.sub(
a612753d 3093 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
3094 lambda m: unicode_escape(m.group(0))[0],
3095 s)
0fe2ff78
YCH
3096
3097
3098def lowercase_escape(s):
3099 unicode_escape = codecs.getdecoder('unicode_escape')
3100 return re.sub(
3101 r'\\u[0-9a-fA-F]{4}',
3102 lambda m: unicode_escape(m.group(0))[0],
3103 s)
b53466e1 3104
d05cfe06
S
3105
3106def escape_rfc3986(s):
3107 """Escape non-ASCII characters as suggested by RFC 3986"""
f9934b96 3108 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
3109
3110
3111def escape_url(url):
3112 """Escape URL as suggested by RFC 3986"""
14f25df2 3113 url_parsed = urllib.parse.urlparse(url)
d05cfe06 3114 return url_parsed._replace(
efbed08d 3115 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
3116 path=escape_rfc3986(url_parsed.path),
3117 params=escape_rfc3986(url_parsed.params),
3118 query=escape_rfc3986(url_parsed.query),
3119 fragment=escape_rfc3986(url_parsed.fragment)
3120 ).geturl()
3121
62e609ab 3122
96b9e9cf 3123def parse_qs(url, **kwargs):
3124 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
4dfbf869 3125
3126
62e609ab
PH
3127def read_batch_urls(batch_fd):
3128 def fixup(url):
14f25df2 3129 if not isinstance(url, str):
62e609ab 3130 url = url.decode('utf-8', 'replace')
8c04f0be 3131 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3132 for bom in BOM_UTF8:
3133 if url.startswith(bom):
3134 url = url[len(bom):]
3135 url = url.lstrip()
3136 if not url or url.startswith(('#', ';', ']')):
62e609ab 3137 return False
8c04f0be 3138 # "#" cannot be stripped out since it is part of the URI
962ffcf8 3139 # However, it can be safely stripped out when it follows whitespace
8c04f0be 3140 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
3141
3142 with contextlib.closing(batch_fd) as fd:
3143 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
3144
3145
3146def urlencode_postdata(*args, **kargs):
14f25df2 3147 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
3148
3149
38f9ef31 3150def update_url_query(url, query):
cacd9966
YCH
3151 if not query:
3152 return url
14f25df2 3153 parsed_url = urllib.parse.urlparse(url)
3154 qs = urllib.parse.parse_qs(parsed_url.query)
38f9ef31 3155 qs.update(query)
14f25df2 3156 return urllib.parse.urlunparse(parsed_url._replace(
3157 query=urllib.parse.urlencode(qs, True)))
16392824 3158
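# Illustrative example (not part of the original source): existing query
# parameters are preserved and the new ones are appended, e.g.
#   update_url_query('http://example.com/path?a=1', {'b': '2'})
#   -> 'http://example.com/path?a=1&b=2'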
8e60dc75 3159
c043c246 3160def update_Request(req, url=None, data=None, headers=None, query=None):
ed0291d1 3161 req_headers = req.headers.copy()
c043c246 3162 req_headers.update(headers or {})
ed0291d1
S
3163 req_data = data or req.data
3164 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
3165 req_get_method = req.get_method()
3166 if req_get_method == 'HEAD':
3167 req_type = HEADRequest
3168 elif req_get_method == 'PUT':
3169 req_type = PUTRequest
3170 else:
ac668111 3171 req_type = urllib.request.Request
ed0291d1
S
3172 new_req = req_type(
3173 req_url, data=req_data, headers=req_headers,
3174 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3175 if hasattr(req, 'timeout'):
3176 new_req.timeout = req.timeout
3177 return new_req
3178
3179
10c87c15 3180def _multipart_encode_impl(data, boundary):
0c265486
YCH
3181 content_type = 'multipart/form-data; boundary=%s' % boundary
3182
3183 out = b''
3184 for k, v in data.items():
3185 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 3186 if isinstance(k, str):
0f06bcd7 3187 k = k.encode()
14f25df2 3188 if isinstance(v, str):
0f06bcd7 3189 v = v.encode()
0c265486
YCH
3190 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3191 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 3192 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
3193 if boundary.encode('ascii') in content:
3194 raise ValueError('Boundary overlaps with data')
3195 out += content
3196
3197 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3198
3199 return out, content_type
3200
3201
3202def multipart_encode(data, boundary=None):
3203 '''
3204 Encode a dict to RFC 7578-compliant form-data
3205
3206 data:
3207 A dict where keys and values can be either Unicode or bytes-like
3208 objects.
3209 boundary:
3210 If specified, the given Unicode object is used as the boundary. Otherwise
3211 a random boundary is generated.
3212
3213 Reference: https://tools.ietf.org/html/rfc7578
3214 '''
3215 has_specified_boundary = boundary is not None
3216
3217 while True:
3218 if boundary is None:
3219 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3220
3221 try:
10c87c15 3222 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
3223 break
3224 except ValueError:
3225 if has_specified_boundary:
3226 raise
3227 boundary = None
3228
3229 return out, content_type
3230
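# Illustrative example (not part of the original source): encoding a simple
# form; the returned content_type carries the generated boundary.
#   body, content_type = multipart_encode({'username': 'foo', 'file': b'data'})
#   # content_type == 'multipart/form-data; boundary=---------------<random digits>'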
3231
304ad45a 3232def variadic(x, allowed_types=(str, bytes, dict)):
3233 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
3234
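# Illustrative example (not part of the original source): variadic() wraps
# scalars (and the "atomic" types str/bytes/dict) into a tuple while passing
# other iterables through unchanged, e.g.
#   variadic('spam') -> ('spam',)   but   variadic(['spam']) -> ['spam']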
3235
86296ad2 3236def dict_get(d, key_or_keys, default=None, skip_false_values=True):
a44ca5a4 3237 for val in map(d.get, variadic(key_or_keys)):
3238 if val is not None and (val or not skip_false_values):
3239 return val
3240 return default
cbecc9b9
S
3241
3242
c4f60dd7 3243def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3244 for f in funcs:
a32a9a7e 3245 try:
c4f60dd7 3246 val = f(*args, **kwargs)
ab029d7e 3247 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
a32a9a7e
S
3248 pass
3249 else:
c4f60dd7 3250 if expected_type is None or isinstance(val, expected_type):
3251 return val
3252
3253
3254def try_get(src, getter, expected_type=None):
3255 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
329ca3be
S
3256
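# Illustrative example (not part of the original source): try_get() swallows
# the usual lookup errors and optionally type-checks the result, e.g.
#   try_get({'a': [1]}, lambda x: x['a'][0], int) -> 1
#   try_get({}, lambda x: x['a'][0], int) -> None  (the KeyError is caught)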
3257
90137ca4 3258def filter_dict(dct, cndn=lambda _, v: v is not None):
3259 return {k: v for k, v in dct.items() if cndn(k, v)}
3260
3261
6cc62232
S
3262def merge_dicts(*dicts):
3263 merged = {}
3264 for a_dict in dicts:
3265 for k, v in a_dict.items():
90137ca4 3266 if (v is not None and k not in merged
3267 or isinstance(v, str) and merged[k] == ''):
6cc62232
S
3268 merged[k] = v
3269 return merged
3270
3271
8e60dc75 3272def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
14f25df2 3273 return string if isinstance(string, str) else str(string, encoding, errors)
8e60dc75 3274
16392824 3275
a1a530b0
PH
3276US_RATINGS = {
3277 'G': 0,
3278 'PG': 10,
3279 'PG-13': 13,
3280 'R': 16,
3281 'NC': 18,
3282}
fac55558
PH
3283
3284
a8795327 3285TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
3286 'TV-Y': 0,
3287 'TV-Y7': 7,
3288 'TV-G': 0,
3289 'TV-PG': 0,
3290 'TV-14': 14,
3291 'TV-MA': 17,
a8795327
S
3292}
3293
3294
146c80e2 3295def parse_age_limit(s):
19a03940 3296 # isinstance(False, int) is True. So type() must be used instead
c487cf00 3297 if type(s) is int: # noqa: E721
a8795327 3298 return s if 0 <= s <= 21 else None
19a03940 3299 elif not isinstance(s, str):
d838b1bd 3300 return None
146c80e2 3301 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
3302 if m:
3303 return int(m.group('age'))
5c5fae6d 3304 s = s.upper()
a8795327
S
3305 if s in US_RATINGS:
3306 return US_RATINGS[s]
5a16c9d9 3307 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 3308 if m:
5a16c9d9 3309 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 3310 return None
146c80e2
S
3311
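# Illustrative examples (not part of the original source):
#   parse_age_limit('PG-13') -> 13  (US rating)
#   parse_age_limit('TV-MA') -> 17  (TV parental guideline)
#   parse_age_limit('18+') -> 18    (plain "NN+" string)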
3312
fac55558 3313def strip_jsonp(code):
609a61e3 3314 return re.sub(
5552c9eb 3315 r'''(?sx)^
e9c671d5 3316 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
3317 (?:\s*&&\s*(?P=func_name))?
3318 \s*\(\s*(?P<callback_data>.*)\);?
3319 \s*?(?://[^\n]*)*$''',
3320 r'\g<callback_data>', code)
478c2c61
PH
3321
3322
8f53dc44 3323def js_to_json(code, vars={}, *, strict=False):
5c610515 3324 # vars is a dict of var, val pairs to substitute
a71b812f
SS
3325 STRING_QUOTES = '\'"'
3326 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
c843e685 3327 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
86e5f3ed 3328 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
4195096e 3329 INTEGER_TABLE = (
86e5f3ed 3330 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3331 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
4195096e
S
3332 )
3333
a71b812f
SS
3334 def process_escape(match):
3335 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3336 escape = match.group(1) or match.group(2)
3337
3338 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3339 else R'\u00' if escape == 'x'
3340 else '' if escape == '\n'
3341 else escape)
3342
e05f6939 3343 def fix_kv(m):
e7b6d122
PH
3344 v = m.group(0)
3345 if v in ('true', 'false', 'null'):
3346 return v
421ddcb8
C
3347 elif v in ('undefined', 'void 0'):
3348 return 'null'
8bdd16b4 3349 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
a71b812f
SS
3350 return ''
3351
3352 if v[0] in STRING_QUOTES:
3353 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v[1:-1])
3354 return f'"{escaped}"'
3355
3356 for regex, base in INTEGER_TABLE:
3357 im = re.match(regex, v)
3358 if im:
3359 i = int(im.group(1), base)
3360 return f'"{i}":' if v.endswith(':') else str(i)
3361
3362 if v in vars:
3363 return json.dumps(vars[v])
89ac4a19 3364
a71b812f
SS
3365 if not strict:
3366 return f'"{v}"'
5c610515 3367
a71b812f 3368 raise ValueError(f'Unknown value: {v}')
e05f6939 3369
8072ef2b 3370 def create_map(mobj):
3371 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3372
8072ef2b 3373 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
8f53dc44 3374 if not strict:
3375 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
f55523cf 3376 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
febff4c1 3377
a71b812f
SS
3378 return re.sub(rf'''(?sx)
3379 {STRING_RE}|
3380 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
421ddcb8 3381 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
a71b812f
SS
3382 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3383 [0-9]+(?={SKIP_RE}:)|
8bdd16b4 3384 !+
a71b812f 3385 ''', fix_kv, code)
e05f6939
PH
3386
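# Illustrative example (not part of the original source): js_to_json()
# converts JavaScript object literals into valid JSON, e.g.
#   js_to_json("{abc: 'def', num: 0x1f}") -> '{"abc": "def", "num": 31}'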
3387
478c2c61
PH
3388def qualities(quality_ids):
3389 """ Get a numeric quality value out of a list of possible values """
3390 def q(qid):
3391 try:
3392 return quality_ids.index(qid)
3393 except ValueError:
3394 return -1
3395 return q
3396
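# Illustrative example (not part of the original source): the returned
# callable maps a quality id to its index in the preference list, e.g.
#   q = qualities(['144p', '360p', '720p'])
#   q('360p') -> 1;  q('1080p') -> -1  (unknown values sort lowest)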
acd69589 3397
8aa0e7cd 3398POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
1e43a6f7 3399
3400
de6000d9 3401DEFAULT_OUTTMPL = {
3402 'default': '%(title)s [%(id)s].%(ext)s',
72755351 3403 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 3404}
3405OUTTMPL_TYPES = {
72755351 3406 'chapter': None,
de6000d9 3407 'subtitle': None,
3408 'thumbnail': None,
3409 'description': 'description',
3410 'annotation': 'annotations.xml',
3411 'infojson': 'info.json',
08438d2c 3412 'link': None,
3b603dbd 3413 'pl_video': None,
5112f26a 3414 'pl_thumbnail': None,
de6000d9 3415 'pl_description': 'description',
3416 'pl_infojson': 'info.json',
3417}
0a871f68 3418
143db31d 3419# As of [1], the format syntax is:
3420# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3421# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 3422STR_FORMAT_RE_TMPL = r'''(?x)
3423 (?<!%)(?P<prefix>(?:%%)*)
143db31d 3424 %
524e2e4f 3425 (?P<has_key>\((?P<key>{0})\))?
752cda38 3426 (?P<format>
524e2e4f 3427 (?P<conversion>[#0\-+ ]+)?
3428 (?P<min_width>\d+)?
3429 (?P<precision>\.\d+)?
3430 (?P<len_mod>[hlL])? # unused in python
901130bb 3431 {1} # conversion type
752cda38 3432 )
143db31d 3433'''
3434
7d1eb38a 3435
901130bb 3436STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3437
7d1eb38a 3438
a020a0dc
PH
3439def limit_length(s, length):
3440 """ Add ellipses to overly long strings """
3441 if s is None:
3442 return None
3443 ELLIPSES = '...'
3444 if len(s) > length:
3445 return s[:length - len(ELLIPSES)] + ELLIPSES
3446 return s
48844745
PH
3447
3448
3449def version_tuple(v):
5f9b8394 3450 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
3451
3452
3453def is_outdated_version(version, limit, assume_new=True):
3454 if not version:
3455 return not assume_new
3456 try:
3457 return version_tuple(version) < version_tuple(limit)
3458 except ValueError:
3459 return not assume_new
732ea2f0
PH
3460
3461
3462def ytdl_is_updateable():
7a5c1cfe 3463 """ Returns if yt-dlp can be updated with -U """
735d865e 3464
5d535b4a 3465 from .update import is_non_updateable
732ea2f0 3466
5d535b4a 3467 return not is_non_updateable()
7d4111ed
PH
3468
3469
3470def args_to_str(args):
3471 # Get a short string representation for a subprocess command
702ccf2d 3472 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
3473
3474
9b9c5355 3475def error_to_compat_str(err):
cfb0511d 3476 return str(err)
fdae2358
S
3477
3478
a44ca5a4 3479def error_to_str(err):
3480 return f'{type(err).__name__}: {err}'
3481
3482
c460bdd5 3483def mimetype2ext(mt):
eb9ee194
S
3484 if mt is None:
3485 return None
3486
9359f3d4
F
3487 mt, _, params = mt.partition(';')
3488 mt = mt.strip()
3489
3490 FULL_MAP = {
765ac263 3491 'audio/mp4': 'm4a',
6c33d24b
YCH
3492 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3493 # it's the most popular one
3494 'audio/mpeg': 'mp3',
ba39289d 3495 'audio/x-wav': 'wav',
9359f3d4
F
3496 'audio/wav': 'wav',
3497 'audio/wave': 'wav',
3498 }
3499
3500 ext = FULL_MAP.get(mt)
765ac263
JMF
3501 if ext is not None:
3502 return ext
3503
9359f3d4 3504 SUBTYPE_MAP = {
f6861ec9 3505 '3gpp': '3gp',
cafcf657 3506 'smptett+xml': 'tt',
cafcf657 3507 'ttaf+xml': 'dfxp',
a0d8d704 3508 'ttml+xml': 'ttml',
f6861ec9 3509 'x-flv': 'flv',
a0d8d704 3510 'x-mp4-fragmented': 'mp4',
d4f05d47 3511 'x-ms-sami': 'sami',
a0d8d704 3512 'x-ms-wmv': 'wmv',
b4173f15
RA
3513 'mpegurl': 'm3u8',
3514 'x-mpegurl': 'm3u8',
3515 'vnd.apple.mpegurl': 'm3u8',
3516 'dash+xml': 'mpd',
b4173f15 3517 'f4m+xml': 'f4m',
f164b971 3518 'hds+xml': 'f4m',
e910fe2f 3519 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 3520 'quicktime': 'mov',
98ce1a3f 3521 'mp2t': 'ts',
39e7107d 3522 'x-wav': 'wav',
9359f3d4
F
3523 'filmstrip+json': 'fs',
3524 'svg+xml': 'svg',
3525 }
3526
3527 _, _, subtype = mt.rpartition('/')
3528 ext = SUBTYPE_MAP.get(subtype.lower())
3529 if ext is not None:
3530 return ext
3531
3532 SUFFIX_MAP = {
3533 'json': 'json',
3534 'xml': 'xml',
3535 'zip': 'zip',
3536 'gzip': 'gz',
3537 }
3538
3539 _, _, suffix = subtype.partition('+')
3540 ext = SUFFIX_MAP.get(suffix)
3541 if ext is not None:
3542 return ext
3543
3544 return subtype.replace('+', '.')
c460bdd5
PH
3545
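# Illustrative examples (not part of the original source):
#   mimetype2ext('audio/mp4') -> 'm4a'              (full-type map)
#   mimetype2ext('application/dash+xml') -> 'mpd'   (subtype map)
#   mimetype2ext('text/vtt') -> 'vtt'               (fallback: the subtype itself)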
3546
2814f12b
THD
3547def ext2mimetype(ext_or_url):
3548 if not ext_or_url:
3549 return None
3550 if '.' not in ext_or_url:
3551 ext_or_url = f'file.{ext_or_url}'
3552 return mimetypes.guess_type(ext_or_url)[0]
3553
3554
4f3c5e06 3555def parse_codecs(codecs_str):
3556 # http://tools.ietf.org/html/rfc6381
3557 if not codecs_str:
3558 return {}
a0566bbf 3559 split_codecs = list(filter(None, map(
dbf5416a 3560 str.strip, codecs_str.strip().strip(',').split(','))))
3fe75fdc 3561 vcodec, acodec, scodec, hdr = None, None, None, None
a0566bbf 3562 for full_codec in split_codecs:
d816f61f 3563 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3564 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3565 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3566 if vcodec:
3567 continue
3568 vcodec = full_codec
3569 if parts[0] in ('dvh1', 'dvhe'):
3570 hdr = 'DV'
3571 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3572 hdr = 'HDR10'
3573 elif parts[:2] == ['vp9', '2']:
3574 hdr = 'HDR10'
3575 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
3576 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3577 acodec = acodec or full_codec
3578 elif parts[0] in ('stpp', 'wvtt'):
3579 scodec = scodec or full_codec
4f3c5e06 3580 else:
19a03940 3581 write_string(f'WARNING: Unknown codec {full_codec}\n')
3fe75fdc 3582 if vcodec or acodec or scodec:
4f3c5e06 3583 return {
3584 'vcodec': vcodec or 'none',
3585 'acodec': acodec or 'none',
176f1866 3586 'dynamic_range': hdr,
3fe75fdc 3587 **({'scodec': scodec} if scodec is not None else {}),
4f3c5e06 3588 }
b69fd25c 3589 elif len(split_codecs) == 2:
3590 return {
3591 'vcodec': split_codecs[0],
3592 'acodec': split_codecs[1],
3593 }
4f3c5e06 3594 return {}
3595
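# Illustrative example (not part of the original source):
#   parse_codecs('avc1.64001e, mp4a.40.2')
#   -> {'vcodec': 'avc1.64001e', 'acodec': 'mp4a.40.2', 'dynamic_range': None}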
3596
fc61aff4
LL
3597def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3598 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3599
3600 allow_mkv = not preferences or 'mkv' in preferences
3601
3602 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3603 return 'mkv' # TODO: any other format allows this?
3604
3605 # TODO: Not all codecs supported by parse_codecs are handled here
3606 COMPATIBLE_CODECS = {
3607 'mp4': {
3608 'av1', 'hevc', 'avc1', 'mp4a', # fourcc (m3u8, mpd)
81b6102d 3609 'h264', 'aacl', 'ec-3', # Set in ISM
fc61aff4
LL
3610 },
3611 'webm': {
3612 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3613 'vp9x', 'vp8x', # in the webm spec
3614 },
3615 }
3616
8f84770a 3617 sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
3618 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
fc61aff4
LL
3619
3620 for ext in preferences or COMPATIBLE_CODECS.keys():
3621 codec_set = COMPATIBLE_CODECS.get(ext, set())
3622 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3623 return ext
3624
3625 COMPATIBLE_EXTS = (
3626 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3627 {'webm'},
3628 )
3629 for ext in preferences or vexts:
3630 current_exts = {ext, *vexts, *aexts}
3631 if ext == 'mkv' or current_exts == {ext} or any(
3632 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3633 return ext
3634 return 'mkv' if allow_mkv else preferences[-1]
3635
3636
2ccd1b10 3637def urlhandle_detect_ext(url_handle):
79298173 3638 getheader = url_handle.headers.get
2ccd1b10 3639
b55ee18f
PH
3640 cd = getheader('Content-Disposition')
3641 if cd:
3642 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3643 if m:
3644 e = determine_ext(m.group('filename'), default_ext=None)
3645 if e:
3646 return e
3647
c460bdd5 3648 return mimetype2ext(getheader('Content-Type'))
05900629
PH
3649
3650
1e399778
YCH
3651def encode_data_uri(data, mime_type):
3652 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3653
3654
05900629 3655def age_restricted(content_limit, age_limit):
6ec6cb4e 3656 """ Returns True iff the content should be blocked """
05900629
PH
3657
3658 if age_limit is None: # No limit set
3659 return False
3660 if content_limit is None:
3661 return False # Content available for everyone
3662 return age_limit < content_limit
61ca9a80
PH
3663
3664
88f60feb 3665# List of known byte-order-marks (BOM)
a904a7f8
L
3666BOMS = [
3667 (b'\xef\xbb\xbf', 'utf-8'),
3668 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3669 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3670 (b'\xff\xfe', 'utf-16-le'),
3671 (b'\xfe\xff', 'utf-16-be'),
3672]
a904a7f8
L
3673
3674
61ca9a80
PH
3675def is_html(first_bytes):
3676 """ Detect whether a file contains HTML by examining its first bytes. """
3677
80e8493e 3678 encoding = 'utf-8'
61ca9a80 3679 for bom, enc in BOMS:
80e8493e 3680 while first_bytes.startswith(bom):
3681 encoding, first_bytes = enc, first_bytes[len(bom):]
61ca9a80 3682
80e8493e 3683 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
a055469f
PH
3684
3685
3686def determine_protocol(info_dict):
3687 protocol = info_dict.get('protocol')
3688 if protocol is not None:
3689 return protocol
3690
7de837a5 3691 url = sanitize_url(info_dict['url'])
a055469f
PH
3692 if url.startswith('rtmp'):
3693 return 'rtmp'
3694 elif url.startswith('mms'):
3695 return 'mms'
3696 elif url.startswith('rtsp'):
3697 return 'rtsp'
3698
3699 ext = determine_ext(url)
3700 if ext == 'm3u8':
deae7c17 3701 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
a055469f
PH
3702 elif ext == 'f4m':
3703 return 'f4m'
3704
14f25df2 3705 return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3706
3707
c5e3f849 3708def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3709 """ Render a list of rows, each as a list of values.
3710 Text after a \t will be right aligned """
ec11a9f4 3711 def width(string):
c5e3f849 3712 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3713
3714 def get_max_lens(table):
ec11a9f4 3715 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3716
3717 def filter_using_list(row, filterArray):
d16df59d 3718 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
76d321f6 3719
d16df59d 3720 max_lens = get_max_lens(data) if hide_empty else []
3721 header_row = filter_using_list(header_row, max_lens)
3722 data = [filter_using_list(row, max_lens) for row in data]
76d321f6 3723
cfb56d1a 3724 table = [header_row] + data
76d321f6 3725 max_lens = get_max_lens(table)
c5e3f849 3726 extra_gap += 1
76d321f6 3727 if delim:
c5e3f849 3728 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
1ed7953a 3729 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
ec11a9f4 3730 for row in table:
3731 for pos, text in enumerate(map(str, row)):
c5e3f849 3732 if '\t' in text:
3733 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3734 else:
3735 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3736 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3737 return ret
347de493
PH
3738
3739
8f18aca8 3740def _match_one(filter_part, dct, incomplete):
77b87f05 3741 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3742 STRING_OPERATORS = {
3743 '*=': operator.contains,
3744 '^=': lambda attr, value: attr.startswith(value),
3745 '$=': lambda attr, value: attr.endswith(value),
3746 '~=': lambda attr, value: re.search(value, attr),
3747 }
347de493 3748 COMPARISON_OPERATORS = {
a047eeb6 3749 **STRING_OPERATORS,
3750 '<=': operator.le, # "<=" must be defined above "<"
347de493 3751 '<': operator.lt,
347de493 3752 '>=': operator.ge,
a047eeb6 3753 '>': operator.gt,
347de493 3754 '=': operator.eq,
347de493 3755 }
a047eeb6 3756
6db9c4d5 3757 if isinstance(incomplete, bool):
3758 is_incomplete = lambda _: incomplete
3759 else:
3760 is_incomplete = lambda k: k in incomplete
3761
64fa820c 3762 operator_rex = re.compile(r'''(?x)
347de493 3763 (?P<key>[a-z_]+)
77b87f05 3764 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3765 (?:
a047eeb6 3766 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3767 (?P<strval>.+?)
347de493 3768 )
347de493 3769 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3770 m = operator_rex.fullmatch(filter_part.strip())
347de493 3771 if m:
18f96d12 3772 m = m.groupdict()
3773 unnegated_op = COMPARISON_OPERATORS[m['op']]
3774 if m['negation']:
77b87f05
MT
3775 op = lambda attr, value: not unnegated_op(attr, value)
3776 else:
3777 op = unnegated_op
18f96d12 3778 comparison_value = m['quotedstrval'] or m['strval']
3779 if m['quote']:
3780 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3781 actual_value = dct.get(m['key'])
3782 numeric_comparison = None
f9934b96 3783 if isinstance(actual_value, (int, float)):
e5a088dc
S
3784 # If the original field is a string and the matching comparison value is
3785 # a number, we should respect the origin of the original field
3786 # and process the comparison value as a string (see
18f96d12 3787 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3788 try:
18f96d12 3789 numeric_comparison = int(comparison_value)
347de493 3790 except ValueError:
18f96d12 3791 numeric_comparison = parse_filesize(comparison_value)
3792 if numeric_comparison is None:
3793 numeric_comparison = parse_filesize(f'{comparison_value}B')
3794 if numeric_comparison is None:
3795 numeric_comparison = parse_duration(comparison_value)
3796 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3797 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3798 if actual_value is None:
6db9c4d5 3799 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3800 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3801
3802 UNARY_OPERATORS = {
1cc47c66
S
3803 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3804 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3805 }
64fa820c 3806 operator_rex = re.compile(r'''(?x)
347de493 3807 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3808 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3809 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3810 if m:
3811 op = UNARY_OPERATORS[m.group('op')]
3812 actual_value = dct.get(m.group('key'))
6db9c4d5 3813 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3814 return True
347de493
PH
3815 return op(actual_value)
3816
3817 raise ValueError('Invalid filter part %r' % filter_part)
3818
3819
8f18aca8 3820def match_str(filter_str, dct, incomplete=False):
6db9c4d5 3821 """ Filter a dictionary with a simple string syntax.
3822 @returns Whether the filter passes
3823 @param incomplete Set of keys that are expected to be missing from dct.
3824 Can be True/False to indicate all/none of the keys may be missing.
3825 All conditions on incomplete keys pass if the key is missing
8f18aca8 3826 """
347de493 3827 return all(
8f18aca8 3828 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3829 for filter_part in re.split(r'(?<!\\)&', filter_str))
347de493
PH
3830
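# Illustrative example (not part of the original source): conditions are
# separated by '&'; a '?' suffix makes a condition pass when the field is missing.
#   match_str('like_count > 100 & description~=cats',
#             {'like_count': 190, 'description': 'I like cats'}) -> True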
3831
b1a7cd05 3832def match_filter_func(filters):
3833 if not filters:
d1b5f70b 3834 return None
492272fe 3835 filters = set(variadic(filters))
d1b5f70b 3836
492272fe 3837 interactive = '-' in filters
3838 if interactive:
3839 filters.remove('-')
3840
3841 def _match_func(info_dict, incomplete=False):
3842 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3843 return NO_DEFAULT if interactive and not incomplete else None
347de493 3844 else:
3bec830a 3845 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
b1a7cd05 3846 filter_str = ') | ('.join(map(str.strip, filters))
3847 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
347de493 3848 return _match_func
91410c9b
PH
3849
3850
f2df4071 3851class download_range_func:
3852 def __init__(self, chapters, ranges):
3853 self.chapters, self.ranges = chapters, ranges
3854
3855 def __call__(self, info_dict, ydl):
0500ee3d 3856 if not self.ranges and not self.chapters:
3857 yield {}
3858
5ec1b6b7 3859 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
56ba69e4 3860 else 'Cannot match chapters since chapter information is unavailable')
f2df4071 3861 for regex in self.chapters or []:
5ec1b6b7 3862 for i, chapter in enumerate(info_dict.get('chapters') or []):
3863 if re.search(regex, chapter['title']):
3864 warning = None
3865 yield {**chapter, 'index': i}
f2df4071 3866 if self.chapters and warning:
5ec1b6b7 3867 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3868
f2df4071 3869 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
5ec1b6b7 3870
f2df4071 3871 def __eq__(self, other):
3872 return (isinstance(other, download_range_func)
3873 and self.chapters == other.chapters and self.ranges == other.ranges)
5ec1b6b7 3874
3875
bf6427d2
YCH
3876def parse_dfxp_time_expr(time_expr):
3877 if not time_expr:
d631d5f9 3878 return
bf6427d2 3879
1d485a1a 3880 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
bf6427d2
YCH
3881 if mobj:
3882 return float(mobj.group('time_offset'))
3883
db2fe38b 3884 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3885 if mobj:
db2fe38b 3886 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
3887
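# Illustrative examples (not part of the original source):
#   parse_dfxp_time_expr('5.2s') -> 5.2
#   parse_dfxp_time_expr('00:01:30.5') -> 90.5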
3888
c1c924ab 3889def srt_subtitles_timecode(seconds):
aa7785f8 3890 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3891
3892
3893def ass_subtitles_timecode(seconds):
3894 time = timetuple_from_msec(seconds * 1000)
3895 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
bf6427d2
YCH
3896
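# Illustrative examples (not part of the original source):
#   srt_subtitles_timecode(3661.5) -> '01:01:01,500'
#   ass_subtitles_timecode(3661.5) -> '1:01:01.50'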
3897
3898def dfxp2srt(dfxp_data):
3869028f
YCH
3899 '''
3900 @param dfxp_data A bytes-like object containing DFXP data
3901 @returns A unicode object containing converted SRT data
3902 '''
5b995f71 3903 LEGACY_NAMESPACES = (
3869028f
YCH
3904 (b'http://www.w3.org/ns/ttml', [
3905 b'http://www.w3.org/2004/11/ttaf1',
3906 b'http://www.w3.org/2006/04/ttaf1',
3907 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3908 ]),
3869028f
YCH
3909 (b'http://www.w3.org/ns/ttml#styling', [
3910 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
3911 ]),
3912 )
3913
3914 SUPPORTED_STYLING = [
3915 'color',
3916 'fontFamily',
3917 'fontSize',
3918 'fontStyle',
3919 'fontWeight',
3920 'textDecoration'
3921 ]
3922
4e335771 3923 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 3924 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 3925 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 3926 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 3927 })
bf6427d2 3928
5b995f71
RA
3929 styles = {}
3930 default_style = {}
3931
86e5f3ed 3932 class TTMLPElementParser:
5b995f71
RA
3933 _out = ''
3934 _unclosed_elements = []
3935 _applied_styles = []
bf6427d2 3936
2b14cb56 3937 def start(self, tag, attrib):
5b995f71
RA
3938 if tag in (_x('ttml:br'), 'br'):
3939 self._out += '\n'
3940 else:
3941 unclosed_elements = []
3942 style = {}
3943 element_style_id = attrib.get('style')
3944 if default_style:
3945 style.update(default_style)
3946 if element_style_id:
3947 style.update(styles.get(element_style_id, {}))
3948 for prop in SUPPORTED_STYLING:
3949 prop_val = attrib.get(_x('tts:' + prop))
3950 if prop_val:
3951 style[prop] = prop_val
3952 if style:
3953 font = ''
3954 for k, v in sorted(style.items()):
3955 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3956 continue
3957 if k == 'color':
3958 font += ' color="%s"' % v
3959 elif k == 'fontSize':
3960 font += ' size="%s"' % v
3961 elif k == 'fontFamily':
3962 font += ' face="%s"' % v
3963 elif k == 'fontWeight' and v == 'bold':
3964 self._out += '<b>'
3965 unclosed_elements.append('b')
3966 elif k == 'fontStyle' and v == 'italic':
3967 self._out += '<i>'
3968 unclosed_elements.append('i')
3969 elif k == 'textDecoration' and v == 'underline':
3970 self._out += '<u>'
3971 unclosed_elements.append('u')
3972 if font:
3973 self._out += '<font' + font + '>'
3974 unclosed_elements.append('font')
3975 applied_style = {}
3976 if self._applied_styles:
3977 applied_style.update(self._applied_styles[-1])
3978 applied_style.update(style)
3979 self._applied_styles.append(applied_style)
3980 self._unclosed_elements.append(unclosed_elements)
bf6427d2 3981
2b14cb56 3982 def end(self, tag):
5b995f71
RA
3983 if tag not in (_x('ttml:br'), 'br'):
3984 unclosed_elements = self._unclosed_elements.pop()
3985 for element in reversed(unclosed_elements):
3986 self._out += '</%s>' % element
3987 if unclosed_elements and self._applied_styles:
3988 self._applied_styles.pop()
bf6427d2 3989
2b14cb56 3990 def data(self, data):
5b995f71 3991 self._out += data
2b14cb56 3992
3993 def close(self):
5b995f71 3994 return self._out.strip()
2b14cb56 3995
3996 def parse_node(node):
3997 target = TTMLPElementParser()
3998 parser = xml.etree.ElementTree.XMLParser(target=target)
3999 parser.feed(xml.etree.ElementTree.tostring(node))
4000 return parser.close()
bf6427d2 4001
5b995f71
RA
4002 for k, v in LEGACY_NAMESPACES:
4003 for ns in v:
4004 dfxp_data = dfxp_data.replace(ns, k)
4005
3869028f 4006 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 4007 out = []
5b995f71 4008 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
4009
4010 if not paras:
4011 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 4012
5b995f71
RA
4013 repeat = False
4014 while True:
4015 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
4016 style_id = style.get('id') or style.get(_x('xml:id'))
4017 if not style_id:
4018 continue
5b995f71
RA
4019 parent_style_id = style.get('style')
4020 if parent_style_id:
4021 if parent_style_id not in styles:
4022 repeat = True
4023 continue
4024 styles[style_id] = styles[parent_style_id].copy()
4025 for prop in SUPPORTED_STYLING:
4026 prop_val = style.get(_x('tts:' + prop))
4027 if prop_val:
4028 styles.setdefault(style_id, {})[prop] = prop_val
4029 if repeat:
4030 repeat = False
4031 else:
4032 break
4033
4034 for p in ('body', 'div'):
4035 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4036 if ele is None:
4037 continue
4038 style = styles.get(ele.get('style'))
4039 if not style:
4040 continue
4041 default_style.update(style)
4042
bf6427d2 4043 for para, index in zip(paras, itertools.count(1)):
d631d5f9 4044 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 4045 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
4046 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4047 if begin_time is None:
4048 continue
7dff0363 4049 if not end_time:
d631d5f9
YCH
4050 if not dur:
4051 continue
4052 end_time = begin_time + dur
bf6427d2
YCH
4053 out.append('%d\n%s --> %s\n%s\n\n' % (
4054 index,
c1c924ab
YCH
4055 srt_subtitles_timecode(begin_time),
4056 srt_subtitles_timecode(end_time),
bf6427d2
YCH
4057 parse_node(para)))
4058
4059 return ''.join(out)
4060
4061
c487cf00 4062def cli_option(params, command_option, param, separator=None):
66e289ba 4063 param = params.get(param)
c487cf00 4064 return ([] if param is None
4065 else [command_option, str(param)] if separator is None
4066 else [f'{command_option}{separator}{param}'])
66e289ba
S
4067
4068
4069def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4070 param = params.get(param)
c487cf00 4071 assert param in (True, False, None)
4072 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
66e289ba
S
4073
4074
4075def cli_valueless_option(params, command_option, param, expected_value=True):
c487cf00 4076 return [command_option] if params.get(param) == expected_value else []
66e289ba
S
4077
4078
e92caff5 4079def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 4080 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 4081 if use_compat:
5b1ecbb3 4082 return argdict
4083 else:
4084 argdict = None
eab9b2bc 4085 if argdict is None:
5b1ecbb3 4086 return default
eab9b2bc 4087 assert isinstance(argdict, dict)
4088
e92caff5 4089 assert isinstance(keys, (list, tuple))
4090 for key_list in keys:
e92caff5 4091 arg_list = list(filter(
4092 lambda x: x is not None,
6606817a 4093 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 4094 if arg_list:
4095 return [arg for args in arg_list for arg in args]
4096 return default
66e289ba 4097
6251555f 4098
330690a2 4099def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4100 main_key, exe = main_key.lower(), exe.lower()
4101 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4102 keys = [f'{root_key}{k}' for k in (keys or [''])]
4103 if root_key in keys:
4104 if main_key != exe:
4105 keys.append((main_key, exe))
4106 keys.append('default')
4107 else:
4108 use_compat = False
4109 return cli_configuration_args(argdict, keys, default, use_compat)
4110
66e289ba 4111
86e5f3ed 4112class ISO639Utils:
39672624
YCH
4113 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4114 _lang_map = {
4115 'aa': 'aar',
4116 'ab': 'abk',
4117 'ae': 'ave',
4118 'af': 'afr',
4119 'ak': 'aka',
4120 'am': 'amh',
4121 'an': 'arg',
4122 'ar': 'ara',
4123 'as': 'asm',
4124 'av': 'ava',
4125 'ay': 'aym',
4126 'az': 'aze',
4127 'ba': 'bak',
4128 'be': 'bel',
4129 'bg': 'bul',
4130 'bh': 'bih',
4131 'bi': 'bis',
4132 'bm': 'bam',
4133 'bn': 'ben',
4134 'bo': 'bod',
4135 'br': 'bre',
4136 'bs': 'bos',
4137 'ca': 'cat',
4138 'ce': 'che',
4139 'ch': 'cha',
4140 'co': 'cos',
4141 'cr': 'cre',
4142 'cs': 'ces',
4143 'cu': 'chu',
4144 'cv': 'chv',
4145 'cy': 'cym',
4146 'da': 'dan',
4147 'de': 'deu',
4148 'dv': 'div',
4149 'dz': 'dzo',
4150 'ee': 'ewe',
4151 'el': 'ell',
4152 'en': 'eng',
4153 'eo': 'epo',
4154 'es': 'spa',
4155 'et': 'est',
4156 'eu': 'eus',
4157 'fa': 'fas',
4158 'ff': 'ful',
4159 'fi': 'fin',
4160 'fj': 'fij',
4161 'fo': 'fao',
4162 'fr': 'fra',
4163 'fy': 'fry',
4164 'ga': 'gle',
4165 'gd': 'gla',
4166 'gl': 'glg',
4167 'gn': 'grn',
4168 'gu': 'guj',
4169 'gv': 'glv',
4170 'ha': 'hau',
4171 'he': 'heb',
b7acc835 4172 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
4173 'hi': 'hin',
4174 'ho': 'hmo',
4175 'hr': 'hrv',
4176 'ht': 'hat',
4177 'hu': 'hun',
4178 'hy': 'hye',
4179 'hz': 'her',
4180 'ia': 'ina',
4181 'id': 'ind',
b7acc835 4182 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
4183 'ie': 'ile',
4184 'ig': 'ibo',
4185 'ii': 'iii',
4186 'ik': 'ipk',
4187 'io': 'ido',
4188 'is': 'isl',
4189 'it': 'ita',
4190 'iu': 'iku',
4191 'ja': 'jpn',
4192 'jv': 'jav',
4193 'ka': 'kat',
4194 'kg': 'kon',
4195 'ki': 'kik',
4196 'kj': 'kua',
4197 'kk': 'kaz',
4198 'kl': 'kal',
4199 'km': 'khm',
4200 'kn': 'kan',
4201 'ko': 'kor',
4202 'kr': 'kau',
4203 'ks': 'kas',
4204 'ku': 'kur',
4205 'kv': 'kom',
4206 'kw': 'cor',
4207 'ky': 'kir',
4208 'la': 'lat',
4209 'lb': 'ltz',
4210 'lg': 'lug',
4211 'li': 'lim',
4212 'ln': 'lin',
4213 'lo': 'lao',
4214 'lt': 'lit',
4215 'lu': 'lub',
4216 'lv': 'lav',
4217 'mg': 'mlg',
4218 'mh': 'mah',
4219 'mi': 'mri',
4220 'mk': 'mkd',
4221 'ml': 'mal',
4222 'mn': 'mon',
4223 'mr': 'mar',
4224 'ms': 'msa',
4225 'mt': 'mlt',
4226 'my': 'mya',
4227 'na': 'nau',
4228 'nb': 'nob',
4229 'nd': 'nde',
4230 'ne': 'nep',
4231 'ng': 'ndo',
4232 'nl': 'nld',
4233 'nn': 'nno',
4234 'no': 'nor',
4235 'nr': 'nbl',
4236 'nv': 'nav',
4237 'ny': 'nya',
4238 'oc': 'oci',
4239 'oj': 'oji',
4240 'om': 'orm',
4241 'or': 'ori',
4242 'os': 'oss',
4243 'pa': 'pan',
4244 'pi': 'pli',
4245 'pl': 'pol',
4246 'ps': 'pus',
4247 'pt': 'por',
4248 'qu': 'que',
4249 'rm': 'roh',
4250 'rn': 'run',
4251 'ro': 'ron',
4252 'ru': 'rus',
4253 'rw': 'kin',
4254 'sa': 'san',
4255 'sc': 'srd',
4256 'sd': 'snd',
4257 'se': 'sme',
4258 'sg': 'sag',
4259 'si': 'sin',
4260 'sk': 'slk',
4261 'sl': 'slv',
4262 'sm': 'smo',
4263 'sn': 'sna',
4264 'so': 'som',
4265 'sq': 'sqi',
4266 'sr': 'srp',
4267 'ss': 'ssw',
4268 'st': 'sot',
4269 'su': 'sun',
4270 'sv': 'swe',
4271 'sw': 'swa',
4272 'ta': 'tam',
4273 'te': 'tel',
4274 'tg': 'tgk',
4275 'th': 'tha',
4276 'ti': 'tir',
4277 'tk': 'tuk',
4278 'tl': 'tgl',
4279 'tn': 'tsn',
4280 'to': 'ton',
4281 'tr': 'tur',
4282 'ts': 'tso',
4283 'tt': 'tat',
4284 'tw': 'twi',
4285 'ty': 'tah',
4286 'ug': 'uig',
4287 'uk': 'ukr',
4288 'ur': 'urd',
4289 'uz': 'uzb',
4290 've': 'ven',
4291 'vi': 'vie',
4292 'vo': 'vol',
4293 'wa': 'wln',
4294 'wo': 'wol',
4295 'xh': 'xho',
4296 'yi': 'yid',
e9a50fba 4297 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
4298 'yo': 'yor',
4299 'za': 'zha',
4300 'zh': 'zho',
4301 'zu': 'zul',
4302 }
4303
4304 @classmethod
4305 def short2long(cls, code):
4306 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4307 return cls._lang_map.get(code[:2])
4308
4309 @classmethod
4310 def long2short(cls, code):
4311 """Convert language code from ISO 639-2/T to ISO 639-1"""
4312 for short_name, long_name in cls._lang_map.items():
4313 if long_name == code:
4314 return short_name
4315
4316
86e5f3ed 4317class ISO3166Utils:
4eb10f66
YCH
4318 # From http://data.okfn.org/data/core/country-list
4319 _country_map = {
4320 'AF': 'Afghanistan',
4321 'AX': 'Åland Islands',
4322 'AL': 'Albania',
4323 'DZ': 'Algeria',
4324 'AS': 'American Samoa',
4325 'AD': 'Andorra',
4326 'AO': 'Angola',
4327 'AI': 'Anguilla',
4328 'AQ': 'Antarctica',
4329 'AG': 'Antigua and Barbuda',
4330 'AR': 'Argentina',
4331 'AM': 'Armenia',
4332 'AW': 'Aruba',
4333 'AU': 'Australia',
4334 'AT': 'Austria',
4335 'AZ': 'Azerbaijan',
4336 'BS': 'Bahamas',
4337 'BH': 'Bahrain',
4338 'BD': 'Bangladesh',
4339 'BB': 'Barbados',
4340 'BY': 'Belarus',
4341 'BE': 'Belgium',
4342 'BZ': 'Belize',
4343 'BJ': 'Benin',
4344 'BM': 'Bermuda',
4345 'BT': 'Bhutan',
4346 'BO': 'Bolivia, Plurinational State of',
4347 'BQ': 'Bonaire, Sint Eustatius and Saba',
4348 'BA': 'Bosnia and Herzegovina',
4349 'BW': 'Botswana',
4350 'BV': 'Bouvet Island',
4351 'BR': 'Brazil',
4352 'IO': 'British Indian Ocean Territory',
4353 'BN': 'Brunei Darussalam',
4354 'BG': 'Bulgaria',
4355 'BF': 'Burkina Faso',
4356 'BI': 'Burundi',
4357 'KH': 'Cambodia',
4358 'CM': 'Cameroon',
4359 'CA': 'Canada',
4360 'CV': 'Cape Verde',
4361 'KY': 'Cayman Islands',
4362 'CF': 'Central African Republic',
4363 'TD': 'Chad',
4364 'CL': 'Chile',
4365 'CN': 'China',
4366 'CX': 'Christmas Island',
4367 'CC': 'Cocos (Keeling) Islands',
4368 'CO': 'Colombia',
4369 'KM': 'Comoros',
4370 'CG': 'Congo',
4371 'CD': 'Congo, the Democratic Republic of the',
4372 'CK': 'Cook Islands',
4373 'CR': 'Costa Rica',
4374 'CI': 'Côte d\'Ivoire',
4375 'HR': 'Croatia',
4376 'CU': 'Cuba',
4377 'CW': 'Curaçao',
4378 'CY': 'Cyprus',
4379 'CZ': 'Czech Republic',
4380 'DK': 'Denmark',
4381 'DJ': 'Djibouti',
4382 'DM': 'Dominica',
4383 'DO': 'Dominican Republic',
4384 'EC': 'Ecuador',
4385 'EG': 'Egypt',
4386 'SV': 'El Salvador',
4387 'GQ': 'Equatorial Guinea',
4388 'ER': 'Eritrea',
4389 'EE': 'Estonia',
4390 'ET': 'Ethiopia',
4391 'FK': 'Falkland Islands (Malvinas)',
4392 'FO': 'Faroe Islands',
4393 'FJ': 'Fiji',
4394 'FI': 'Finland',
4395 'FR': 'France',
4396 'GF': 'French Guiana',
4397 'PF': 'French Polynesia',
4398 'TF': 'French Southern Territories',
4399 'GA': 'Gabon',
4400 'GM': 'Gambia',
4401 'GE': 'Georgia',
4402 'DE': 'Germany',
4403 'GH': 'Ghana',
4404 'GI': 'Gibraltar',
4405 'GR': 'Greece',
4406 'GL': 'Greenland',
4407 'GD': 'Grenada',
4408 'GP': 'Guadeloupe',
4409 'GU': 'Guam',
4410 'GT': 'Guatemala',
4411 'GG': 'Guernsey',
4412 'GN': 'Guinea',
4413 'GW': 'Guinea-Bissau',
4414 'GY': 'Guyana',
4415 'HT': 'Haiti',
4416 'HM': 'Heard Island and McDonald Islands',
4417 'VA': 'Holy See (Vatican City State)',
4418 'HN': 'Honduras',
4419 'HK': 'Hong Kong',
4420 'HU': 'Hungary',
4421 'IS': 'Iceland',
4422 'IN': 'India',
4423 'ID': 'Indonesia',
4424 'IR': 'Iran, Islamic Republic of',
4425 'IQ': 'Iraq',
4426 'IE': 'Ireland',
4427 'IM': 'Isle of Man',
4428 'IL': 'Israel',
4429 'IT': 'Italy',
4430 'JM': 'Jamaica',
4431 'JP': 'Japan',
4432 'JE': 'Jersey',
4433 'JO': 'Jordan',
4434 'KZ': 'Kazakhstan',
4435 'KE': 'Kenya',
4436 'KI': 'Kiribati',
4437 'KP': 'Korea, Democratic People\'s Republic of',
4438 'KR': 'Korea, Republic of',
4439 'KW': 'Kuwait',
4440 'KG': 'Kyrgyzstan',
4441 'LA': 'Lao People\'s Democratic Republic',
4442 'LV': 'Latvia',
4443 'LB': 'Lebanon',
4444 'LS': 'Lesotho',
4445 'LR': 'Liberia',
4446 'LY': 'Libya',
4447 'LI': 'Liechtenstein',
4448 'LT': 'Lithuania',
4449 'LU': 'Luxembourg',
4450 'MO': 'Macao',
4451 'MK': 'Macedonia, the Former Yugoslav Republic of',
4452 'MG': 'Madagascar',
4453 'MW': 'Malawi',
4454 'MY': 'Malaysia',
4455 'MV': 'Maldives',
4456 'ML': 'Mali',
4457 'MT': 'Malta',
4458 'MH': 'Marshall Islands',
4459 'MQ': 'Martinique',
4460 'MR': 'Mauritania',
4461 'MU': 'Mauritius',
4462 'YT': 'Mayotte',
4463 'MX': 'Mexico',
4464 'FM': 'Micronesia, Federated States of',
4465 'MD': 'Moldova, Republic of',
4466 'MC': 'Monaco',
4467 'MN': 'Mongolia',
4468 'ME': 'Montenegro',
4469 'MS': 'Montserrat',
4470 'MA': 'Morocco',
4471 'MZ': 'Mozambique',
4472 'MM': 'Myanmar',
4473 'NA': 'Namibia',
4474 'NR': 'Nauru',
4475 'NP': 'Nepal',
4476 'NL': 'Netherlands',
4477 'NC': 'New Caledonia',
4478 'NZ': 'New Zealand',
4479 'NI': 'Nicaragua',
4480 'NE': 'Niger',
4481 'NG': 'Nigeria',
4482 'NU': 'Niue',
4483 'NF': 'Norfolk Island',
4484 'MP': 'Northern Mariana Islands',
4485 'NO': 'Norway',
4486 'OM': 'Oman',
4487 'PK': 'Pakistan',
4488 'PW': 'Palau',
4489 'PS': 'Palestine, State of',
4490 'PA': 'Panama',
4491 'PG': 'Papua New Guinea',
4492 'PY': 'Paraguay',
4493 'PE': 'Peru',
4494 'PH': 'Philippines',
4495 'PN': 'Pitcairn',
4496 'PL': 'Poland',
4497 'PT': 'Portugal',
4498 'PR': 'Puerto Rico',
4499 'QA': 'Qatar',
4500 'RE': 'Réunion',
4501 'RO': 'Romania',
4502 'RU': 'Russian Federation',
4503 'RW': 'Rwanda',
4504 'BL': 'Saint Barthélemy',
4505 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4506 'KN': 'Saint Kitts and Nevis',
4507 'LC': 'Saint Lucia',
4508 'MF': 'Saint Martin (French part)',
4509 'PM': 'Saint Pierre and Miquelon',
4510 'VC': 'Saint Vincent and the Grenadines',
4511 'WS': 'Samoa',
4512 'SM': 'San Marino',
4513 'ST': 'Sao Tome and Principe',
4514 'SA': 'Saudi Arabia',
4515 'SN': 'Senegal',
4516 'RS': 'Serbia',
4517 'SC': 'Seychelles',
4518 'SL': 'Sierra Leone',
4519 'SG': 'Singapore',
4520 'SX': 'Sint Maarten (Dutch part)',
4521 'SK': 'Slovakia',
4522 'SI': 'Slovenia',
4523 'SB': 'Solomon Islands',
4524 'SO': 'Somalia',
4525 'ZA': 'South Africa',
4526 'GS': 'South Georgia and the South Sandwich Islands',
4527 'SS': 'South Sudan',
4528 'ES': 'Spain',
4529 'LK': 'Sri Lanka',
4530 'SD': 'Sudan',
4531 'SR': 'Suriname',
4532 'SJ': 'Svalbard and Jan Mayen',
4533 'SZ': 'Swaziland',
4534 'SE': 'Sweden',
4535 'CH': 'Switzerland',
4536 'SY': 'Syrian Arab Republic',
4537 'TW': 'Taiwan, Province of China',
4538 'TJ': 'Tajikistan',
4539 'TZ': 'Tanzania, United Republic of',
4540 'TH': 'Thailand',
4541 'TL': 'Timor-Leste',
4542 'TG': 'Togo',
4543 'TK': 'Tokelau',
4544 'TO': 'Tonga',
4545 'TT': 'Trinidad and Tobago',
4546 'TN': 'Tunisia',
4547 'TR': 'Turkey',
4548 'TM': 'Turkmenistan',
4549 'TC': 'Turks and Caicos Islands',
4550 'TV': 'Tuvalu',
4551 'UG': 'Uganda',
4552 'UA': 'Ukraine',
4553 'AE': 'United Arab Emirates',
4554 'GB': 'United Kingdom',
4555 'US': 'United States',
4556 'UM': 'United States Minor Outlying Islands',
4557 'UY': 'Uruguay',
4558 'UZ': 'Uzbekistan',
4559 'VU': 'Vanuatu',
4560 'VE': 'Venezuela, Bolivarian Republic of',
4561 'VN': 'Viet Nam',
4562 'VG': 'Virgin Islands, British',
4563 'VI': 'Virgin Islands, U.S.',
4564 'WF': 'Wallis and Futuna',
4565 'EH': 'Western Sahara',
4566 'YE': 'Yemen',
4567 'ZM': 'Zambia',
4568 'ZW': 'Zimbabwe',
2f97cc61 4569 # Not ISO 3166 codes, but used for IP blocks
4570 'AP': 'Asia/Pacific Region',
4571 'EU': 'Europe',
4eb10f66
YCH
4572 }
4573
4574 @classmethod
4575 def short2full(cls, code):
4576 """Convert an ISO 3166-2 country code to the corresponding full name"""
4577 return cls._country_map.get(code.upper())
4578
4579
86e5f3ed 4580class GeoUtils:
773f291d
S
4581 # Major IPv4 address blocks per country
4582 _country_ip_map = {
53896ca5 4583 'AD': '46.172.224.0/19',
773f291d
S
4584 'AE': '94.200.0.0/13',
4585 'AF': '149.54.0.0/17',
4586 'AG': '209.59.64.0/18',
4587 'AI': '204.14.248.0/21',
4588 'AL': '46.99.0.0/16',
4589 'AM': '46.70.0.0/15',
4590 'AO': '105.168.0.0/13',
53896ca5
S
4591 'AP': '182.50.184.0/21',
4592 'AQ': '23.154.160.0/24',
773f291d
S
4593 'AR': '181.0.0.0/12',
4594 'AS': '202.70.112.0/20',
53896ca5 4595 'AT': '77.116.0.0/14',
773f291d
S
4596 'AU': '1.128.0.0/11',
4597 'AW': '181.41.0.0/18',
53896ca5
S
4598 'AX': '185.217.4.0/22',
4599 'AZ': '5.197.0.0/16',
773f291d
S
4600 'BA': '31.176.128.0/17',
4601 'BB': '65.48.128.0/17',
4602 'BD': '114.130.0.0/16',
4603 'BE': '57.0.0.0/8',
53896ca5 4604 'BF': '102.178.0.0/15',
773f291d
S
4605 'BG': '95.42.0.0/15',
4606 'BH': '37.131.0.0/17',
4607 'BI': '154.117.192.0/18',
4608 'BJ': '137.255.0.0/16',
53896ca5 4609 'BL': '185.212.72.0/23',
773f291d
S
4610 'BM': '196.12.64.0/18',
4611 'BN': '156.31.0.0/16',
4612 'BO': '161.56.0.0/16',
4613 'BQ': '161.0.80.0/20',
53896ca5 4614 'BR': '191.128.0.0/12',
773f291d
S
4615 'BS': '24.51.64.0/18',
4616 'BT': '119.2.96.0/19',
4617 'BW': '168.167.0.0/16',
4618 'BY': '178.120.0.0/13',
4619 'BZ': '179.42.192.0/18',
4620 'CA': '99.224.0.0/11',
4621 'CD': '41.243.0.0/16',
53896ca5
S
4622 'CF': '197.242.176.0/21',
4623 'CG': '160.113.0.0/16',
773f291d 4624 'CH': '85.0.0.0/13',
53896ca5 4625 'CI': '102.136.0.0/14',
773f291d
S
4626 'CK': '202.65.32.0/19',
4627 'CL': '152.172.0.0/14',
53896ca5 4628 'CM': '102.244.0.0/14',
773f291d
S
4629 'CN': '36.128.0.0/10',
4630 'CO': '181.240.0.0/12',
4631 'CR': '201.192.0.0/12',
4632 'CU': '152.206.0.0/15',
4633 'CV': '165.90.96.0/19',
4634 'CW': '190.88.128.0/17',
53896ca5 4635 'CY': '31.153.0.0/16',
773f291d
S
4636 'CZ': '88.100.0.0/14',
4637 'DE': '53.0.0.0/8',
4638 'DJ': '197.241.0.0/17',
4639 'DK': '87.48.0.0/12',
4640 'DM': '192.243.48.0/20',
4641 'DO': '152.166.0.0/15',
4642 'DZ': '41.96.0.0/12',
4643 'EC': '186.68.0.0/15',
4644 'EE': '90.190.0.0/15',
4645 'EG': '156.160.0.0/11',
4646 'ER': '196.200.96.0/20',
4647 'ES': '88.0.0.0/11',
4648 'ET': '196.188.0.0/14',
4649 'EU': '2.16.0.0/13',
4650 'FI': '91.152.0.0/13',
4651 'FJ': '144.120.0.0/16',
53896ca5 4652 'FK': '80.73.208.0/21',
773f291d
S
4653 'FM': '119.252.112.0/20',
4654 'FO': '88.85.32.0/19',
4655 'FR': '90.0.0.0/9',
4656 'GA': '41.158.0.0/15',
4657 'GB': '25.0.0.0/8',
4658 'GD': '74.122.88.0/21',
4659 'GE': '31.146.0.0/16',
4660 'GF': '161.22.64.0/18',
4661 'GG': '62.68.160.0/19',
53896ca5
S
4662 'GH': '154.160.0.0/12',
4663 'GI': '95.164.0.0/16',
773f291d
S
4664 'GL': '88.83.0.0/19',
4665 'GM': '160.182.0.0/15',
4666 'GN': '197.149.192.0/18',
4667 'GP': '104.250.0.0/19',
4668 'GQ': '105.235.224.0/20',
4669 'GR': '94.64.0.0/13',
4670 'GT': '168.234.0.0/16',
4671 'GU': '168.123.0.0/16',
4672 'GW': '197.214.80.0/20',
4673 'GY': '181.41.64.0/18',
4674 'HK': '113.252.0.0/14',
4675 'HN': '181.210.0.0/16',
4676 'HR': '93.136.0.0/13',
4677 'HT': '148.102.128.0/17',
4678 'HU': '84.0.0.0/14',
4679 'ID': '39.192.0.0/10',
4680 'IE': '87.32.0.0/12',
4681 'IL': '79.176.0.0/13',
4682 'IM': '5.62.80.0/20',
4683 'IN': '117.192.0.0/10',
4684 'IO': '203.83.48.0/21',
4685 'IQ': '37.236.0.0/14',
4686 'IR': '2.176.0.0/12',
4687 'IS': '82.221.0.0/16',
4688 'IT': '79.0.0.0/10',
4689 'JE': '87.244.64.0/18',
4690 'JM': '72.27.0.0/17',
4691 'JO': '176.29.0.0/16',
53896ca5 4692 'JP': '133.0.0.0/8',
773f291d
S
4693 'KE': '105.48.0.0/12',
4694 'KG': '158.181.128.0/17',
4695 'KH': '36.37.128.0/17',
4696 'KI': '103.25.140.0/22',
4697 'KM': '197.255.224.0/20',
53896ca5 4698 'KN': '198.167.192.0/19',
773f291d
S
4699 'KP': '175.45.176.0/22',
4700 'KR': '175.192.0.0/10',
4701 'KW': '37.36.0.0/14',
4702 'KY': '64.96.0.0/15',
4703 'KZ': '2.72.0.0/13',
4704 'LA': '115.84.64.0/18',
4705 'LB': '178.135.0.0/16',
53896ca5 4706 'LC': '24.92.144.0/20',
773f291d
S
4707 'LI': '82.117.0.0/19',
4708 'LK': '112.134.0.0/15',
53896ca5 4709 'LR': '102.183.0.0/16',
773f291d
S
4710 'LS': '129.232.0.0/17',
4711 'LT': '78.56.0.0/13',
4712 'LU': '188.42.0.0/16',
4713 'LV': '46.109.0.0/16',
4714 'LY': '41.252.0.0/14',
4715 'MA': '105.128.0.0/11',
4716 'MC': '88.209.64.0/18',
4717 'MD': '37.246.0.0/16',
4718 'ME': '178.175.0.0/17',
4719 'MF': '74.112.232.0/21',
4720 'MG': '154.126.0.0/17',
4721 'MH': '117.103.88.0/21',
4722 'MK': '77.28.0.0/15',
4723 'ML': '154.118.128.0/18',
4724 'MM': '37.111.0.0/17',
4725 'MN': '49.0.128.0/17',
4726 'MO': '60.246.0.0/16',
4727 'MP': '202.88.64.0/20',
4728 'MQ': '109.203.224.0/19',
4729 'MR': '41.188.64.0/18',
4730 'MS': '208.90.112.0/22',
4731 'MT': '46.11.0.0/16',
4732 'MU': '105.16.0.0/12',
4733 'MV': '27.114.128.0/18',
53896ca5 4734 'MW': '102.70.0.0/15',
773f291d
S
4735 'MX': '187.192.0.0/11',
4736 'MY': '175.136.0.0/13',
4737 'MZ': '197.218.0.0/15',
4738 'NA': '41.182.0.0/16',
4739 'NC': '101.101.0.0/18',
4740 'NE': '197.214.0.0/18',
4741 'NF': '203.17.240.0/22',
4742 'NG': '105.112.0.0/12',
4743 'NI': '186.76.0.0/15',
4744 'NL': '145.96.0.0/11',
4745 'NO': '84.208.0.0/13',
4746 'NP': '36.252.0.0/15',
4747 'NR': '203.98.224.0/19',
4748 'NU': '49.156.48.0/22',
4749 'NZ': '49.224.0.0/14',
4750 'OM': '5.36.0.0/15',
4751 'PA': '186.72.0.0/15',
4752 'PE': '186.160.0.0/14',
4753 'PF': '123.50.64.0/18',
4754 'PG': '124.240.192.0/19',
4755 'PH': '49.144.0.0/13',
4756 'PK': '39.32.0.0/11',
4757 'PL': '83.0.0.0/11',
4758 'PM': '70.36.0.0/20',
4759 'PR': '66.50.0.0/16',
4760 'PS': '188.161.0.0/16',
4761 'PT': '85.240.0.0/13',
4762 'PW': '202.124.224.0/20',
4763 'PY': '181.120.0.0/14',
4764 'QA': '37.210.0.0/15',
53896ca5 4765 'RE': '102.35.0.0/16',
773f291d 4766 'RO': '79.112.0.0/13',
53896ca5 4767 'RS': '93.86.0.0/15',
773f291d 4768 'RU': '5.136.0.0/13',
53896ca5 4769 'RW': '41.186.0.0/16',
773f291d
S
4770 'SA': '188.48.0.0/13',
4771 'SB': '202.1.160.0/19',
4772 'SC': '154.192.0.0/11',
53896ca5 4773 'SD': '102.120.0.0/13',
773f291d 4774 'SE': '78.64.0.0/12',
53896ca5 4775 'SG': '8.128.0.0/10',
773f291d
S
4776 'SI': '188.196.0.0/14',
4777 'SK': '78.98.0.0/15',
53896ca5 4778 'SL': '102.143.0.0/17',
773f291d
S
4779 'SM': '89.186.32.0/19',
4780 'SN': '41.82.0.0/15',
53896ca5 4781 'SO': '154.115.192.0/18',
773f291d
S
4782 'SR': '186.179.128.0/17',
4783 'SS': '105.235.208.0/21',
4784 'ST': '197.159.160.0/19',
4785 'SV': '168.243.0.0/16',
4786 'SX': '190.102.0.0/20',
4787 'SY': '5.0.0.0/16',
4788 'SZ': '41.84.224.0/19',
4789 'TC': '65.255.48.0/20',
4790 'TD': '154.68.128.0/19',
4791 'TG': '196.168.0.0/14',
4792 'TH': '171.96.0.0/13',
4793 'TJ': '85.9.128.0/18',
4794 'TK': '27.96.24.0/21',
4795 'TL': '180.189.160.0/20',
4796 'TM': '95.85.96.0/19',
4797 'TN': '197.0.0.0/11',
4798 'TO': '175.176.144.0/21',
4799 'TR': '78.160.0.0/11',
4800 'TT': '186.44.0.0/15',
4801 'TV': '202.2.96.0/19',
4802 'TW': '120.96.0.0/11',
4803 'TZ': '156.156.0.0/14',
53896ca5
S
4804 'UA': '37.52.0.0/14',
4805 'UG': '102.80.0.0/13',
4806 'US': '6.0.0.0/8',
773f291d 4807 'UY': '167.56.0.0/13',
53896ca5 4808 'UZ': '84.54.64.0/18',
773f291d 4809 'VA': '212.77.0.0/19',
53896ca5 4810 'VC': '207.191.240.0/21',
773f291d 4811 'VE': '186.88.0.0/13',
53896ca5 4812 'VG': '66.81.192.0/20',
773f291d
S
4813 'VI': '146.226.0.0/16',
4814 'VN': '14.160.0.0/11',
4815 'VU': '202.80.32.0/20',
4816 'WF': '117.20.32.0/21',
4817 'WS': '202.4.32.0/19',
4818 'YE': '134.35.0.0/16',
4819 'YT': '41.242.116.0/22',
4820 'ZA': '41.0.0.0/11',
53896ca5
S
4821 'ZM': '102.144.0.0/13',
4822 'ZW': '102.177.192.0/18',
773f291d
S
4823 }
4824
4825 @classmethod
5f95927a
S
4826 def random_ipv4(cls, code_or_block):
4827 if len(code_or_block) == 2:
4828 block = cls._country_ip_map.get(code_or_block.upper())
4829 if not block:
4830 return None
4831 else:
4832 block = code_or_block
773f291d 4833 addr, preflen = block.split('/')
ac668111 4834 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4835 addr_max = addr_min | (0xffffffff >> int(preflen))
14f25df2 4836 return str(socket.inet_ntoa(
ac668111 4837 struct.pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4838
4839
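# Illustrative example (not part of the original source): random_ipv4()
# accepts either a two-letter country code or a CIDR block, e.g.
# GeoUtils.random_ipv4('DE') returns some address inside 53.0.0.0/8.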
ac668111 4840class PerRequestProxyHandler(urllib.request.ProxyHandler):
2461f79d
PH
4841 def __init__(self, proxies=None):
4842 # Set default handlers
4843 for type in ('http', 'https'):
4844 setattr(self, '%s_open' % type,
4845 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4846 meth(r, proxy, type))
ac668111 4847 urllib.request.ProxyHandler.__init__(self, proxies)
2461f79d 4848
91410c9b 4849 def proxy_open(self, req, proxy, type):
2461f79d 4850 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
4851 if req_proxy is not None:
4852 proxy = req_proxy
2461f79d
PH
4853 del req.headers['Ytdl-request-proxy']
4854
4855 if proxy == '__noproxy__':
4856 return None # No Proxy
14f25df2 4857 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188 4858 req.add_header('Ytdl-socks-proxy', proxy)
7a5c1cfe 4859 # yt-dlp's http/https handlers take care of wrapping the socket with SOCKS
71aff188 4860 return None
ac668111 4861 return urllib.request.ProxyHandler.proxy_open(
91410c9b 4862 self, req, proxy, type)
5bc880b9
YCH
4863
4864
0a5445dd
YCH
4865# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4866# released into Public Domain
4867# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4868
4869def long_to_bytes(n, blocksize=0):
4870 """long_to_bytes(n:long, blocksize:int) : string
4871 Convert a long integer to a byte string.
4872
4873 If optional blocksize is given and greater than zero, pad the front of the
4874 byte string with binary zeros so that the length is a multiple of
4875 blocksize.
4876 """
4877 # after much testing, this algorithm was deemed to be the fastest
4878 s = b''
4879 n = int(n)
4880 while n > 0:
ac668111 4881 s = struct.pack('>I', n & 0xffffffff) + s
0a5445dd
YCH
4882 n = n >> 32
4883 # strip off leading zeros
4884 for i in range(len(s)):
4885 if s[i] != b'\000'[0]:
4886 break
4887 else:
4888 # only happens when n == 0
4889 s = b'\000'
4890 i = 0
4891 s = s[i:]
4892 # add back some pad bytes. this could be done more efficiently w.r.t. the
4893 # de-padding being done above, but sigh...
4894 if blocksize > 0 and len(s) % blocksize:
4895 s = (blocksize - len(s) % blocksize) * b'\000' + s
4896 return s
4897
4898
4899def bytes_to_long(s):
4900 """bytes_to_long(string) : long
4901 Convert a byte string to a long integer.
4902
4903 This is (essentially) the inverse of long_to_bytes().
4904 """
4905 acc = 0
4906 length = len(s)
4907 if length % 4:
4908 extra = (4 - length % 4)
4909 s = b'\000' * extra + s
4910 length = length + extra
4911 for i in range(0, length, 4):
ac668111 4912 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
0a5445dd
YCH
4913 return acc
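# Round-trip sketch for the two helpers above (values chosen arbitrarily):
#   long_to_bytes(65537)           -> b'\x01\x00\x01'
#   long_to_bytes(65537, 8)        -> b'\x00\x00\x00\x00\x00\x01\x00\x01'  (front-padded to blocksize)
#   bytes_to_long(b'\x01\x00\x01') -> 65537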
4914
4915
5bc880b9
YCH
4916def ohdave_rsa_encrypt(data, exponent, modulus):
4917 '''
4918 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4919
4920 Input:
4921 data: data to encrypt, bytes-like object
4922 exponent, modulus: parameter e and N of RSA algorithm, both integer
4923 Output: hex string of encrypted data
4924
4925 Limitation: supports one block encryption only
4926 '''
4927
4928 payload = int(binascii.hexlify(data[::-1]), 16)
4929 encrypted = pow(payload, exponent, modulus)
4930 return '%x' % encrypted
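# Toy sketch (parameters far too small for real use, shown only to illustrate the data flow):
#   ohdave_rsa_encrypt(b'\x02', exponent=3, modulus=55) -> '8'   # pow(2, 3, 55) == 8, rendered as hex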
81bdc8fd
YCH
4931
4932
f48409c7
YCH
4933def pkcs1pad(data, length):
4934 """
4935 Padding input data with PKCS#1 scheme
4936
4937 @param {int[]} data input data
4938 @param {int} length target length
4939 @returns {int[]} padded data
4940 """
4941 if len(data) > length - 11:
4942 raise ValueError('Input data too long for PKCS#1 padding')
4943
4944 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # PKCS#1 filler bytes must be non-zero
4945 return [0, 2] + pseudo_random + [0] + data
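# Shape sketch; the filler bytes are random and differ between calls:
#   pkcs1pad([1, 2, 3], 16)      -> [0, 2, <10 random filler ints>, 0, 1, 2, 3]   (16 items total)
#   pkcs1pad(list(range(6)), 16) -> ValueError (6 > 16 - 11)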
4946
4947
7b2c3f47 4948def _base_n_table(n, table):
4949 if not table and not n:
4950 raise ValueError('Either table or n must be specified')
612f2be5 4951 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4952
44f14eb4 4953 if n and n != len(table):
612f2be5 4954 raise ValueError(f'base {n} exceeds table length {len(table)}')
4955 return table
59f898b7 4956
5eb6bdce 4957
7b2c3f47 4958def encode_base_n(num, n=None, table=None):
4959 """Convert given int to a base-n string"""
612f2be5 4960 table = _base_n_table(n, table)
7b2c3f47 4961 if not num:
5eb6bdce
YCH
4962 return table[0]
4963
7b2c3f47 4964 result, base = '', len(table)
81bdc8fd 4965 while num:
7b2c3f47 4966 result = table[num % base] + result
612f2be5 4967 num = num // base
7b2c3f47 4968 return result
4969
4970
4971def decode_base_n(string, n=None, table=None):
4972 """Convert given base-n string to int"""
4973 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4974 result, base = 0, len(table)
4975 for char in string:
4976 result = result * base + table[char]
4977 return result
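# Round-trip sketch with the default table (digits, then lower- and upper-case letters):
#   encode_base_n(255, 16)  -> 'ff'
#   decode_base_n('ff', 16) -> 255
#   encode_base_n(0, 2)     -> '0'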
4978
4979
4980def decode_base(value, digits):
da4db748 4981 deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed '
4982 f'in a future version. Use {__name__}.decode_base_n instead')
7b2c3f47 4983 return decode_base_n(value, table=digits)
f52354a8
YCH
4984
4985
4986def decode_packed_codes(code):
06b3fe29 4987 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 4988 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
4989 base = int(base)
4990 count = int(count)
4991 symbols = symbols.split('|')
4992 symbol_table = {}
4993
4994 while count:
4995 count -= 1
5eb6bdce 4996 base_n_count = encode_base_n(count, base)
f52354a8
YCH
4997 symbol_table[base_n_count] = symbols[count] or base_n_count
4998
4999 return re.sub(
5000 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 5001 obfuscated_code)
e154c651 5002
5003
1ced2221
S
5004def caesar(s, alphabet, shift):
5005 if shift == 0:
5006 return s
5007 l = len(alphabet)
5008 return ''.join(
5009 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
5010 for c in s)
5011
5012
5013def rot47(s):
5014 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
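# Sketch: caesar() shifts only characters present in the given alphabet; rot47() is its own
# inverse over the printable-ASCII alphabet above:
#   caesar('abc', 'abcdefghij', 1) -> 'bcd'
#   rot47('abc') -> '234'    and    rot47('234') -> 'abc'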
5015
5016
e154c651 5017def parse_m3u8_attributes(attrib):
5018 info = {}
5019 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
5020 if val.startswith('"'):
5021 val = val[1:-1]
5022 info[key] = val
5023 return info
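# Sketch on a typical EXT-X-STREAM-INF attribute list (quoted values keep embedded commas):
#   parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2",RESOLUTION=640x360')
#   -> {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2', 'RESOLUTION': '640x360'}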
1143535d
YCH
5024
5025
5026def urshift(val, n):
5027 return val >> n if val >= 0 else (val + 0x100000000) >> n
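# Sketch: roughly emulates JavaScript's unsigned 32-bit right shift (>>>) for negative ints:
#   urshift(-1, 28) -> 15   # (0xffffffff >> 28)
#   urshift(16, 2)  -> 4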
d3f8e038
YCH
5028
5029
5030# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 5031# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
5032def decode_png(png_data):
5033 # Reference: https://www.w3.org/TR/PNG/
5034 header = png_data[8:]
5035
5036 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
86e5f3ed 5037 raise OSError('Not a valid PNG file.')
d3f8e038
YCH
5038
5039 int_map = {1: '>B', 2: '>H', 4: '>I'}
ac668111 5040 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
d3f8e038
YCH
5041
5042 chunks = []
5043
5044 while header:
5045 length = unpack_integer(header[:4])
5046 header = header[4:]
5047
5048 chunk_type = header[:4]
5049 header = header[4:]
5050
5051 chunk_data = header[:length]
5052 header = header[length:]
5053
5054 header = header[4:] # Skip CRC
5055
5056 chunks.append({
5057 'type': chunk_type,
5058 'length': length,
5059 'data': chunk_data
5060 })
5061
5062 ihdr = chunks[0]['data']
5063
5064 width = unpack_integer(ihdr[:4])
5065 height = unpack_integer(ihdr[4:8])
5066
5067 idat = b''
5068
5069 for chunk in chunks:
5070 if chunk['type'] == b'IDAT':
5071 idat += chunk['data']
5072
5073 if not idat:
86e5f3ed 5074 raise OSError('Unable to read PNG data.')
d3f8e038
YCH
5075
5076 decompressed_data = bytearray(zlib.decompress(idat))
5077
5078 stride = width * 3
5079 pixels = []
5080
5081 def _get_pixel(idx):
5082 x = idx % stride
5083 y = idx // stride
5084 return pixels[y][x]
5085
5086 for y in range(height):
5087 basePos = y * (1 + stride)
5088 filter_type = decompressed_data[basePos]
5089
5090 current_row = []
5091
5092 pixels.append(current_row)
5093
5094 for x in range(stride):
5095 color = decompressed_data[1 + basePos + x]
5096 basex = y * stride + x
5097 left = 0
5098 up = 0
5099
5100 if x > 2:
5101 left = _get_pixel(basex - 3)
5102 if y > 0:
5103 up = _get_pixel(basex - stride)
5104
5105 if filter_type == 1: # Sub
5106 color = (color + left) & 0xff
5107 elif filter_type == 2: # Up
5108 color = (color + up) & 0xff
5109 elif filter_type == 3: # Average
5110 color = (color + ((left + up) >> 1)) & 0xff
5111 elif filter_type == 4: # Paeth
5112 a = left
5113 b = up
5114 c = 0
5115
5116 if x > 2 and y > 0:
5117 c = _get_pixel(basex - stride - 3)
5118
5119 p = a + b - c
5120
5121 pa = abs(p - a)
5122 pb = abs(p - b)
5123 pc = abs(p - c)
5124
5125 if pa <= pb and pa <= pc:
5126 color = (color + a) & 0xff
5127 elif pb <= pc:
5128 color = (color + b) & 0xff
5129 else:
5130 color = (color + c) & 0xff
5131
5132 current_row.append(color)
5133
5134 return width, height, pixels
efa97bdc
YCH
5135
5136
5137def write_xattr(path, key, value):
6f7563be 5138 # Windows: Write xattrs to NTFS Alternate Data Streams:
5139 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5140 if compat_os_name == 'nt':
5141 assert ':' not in key
5142 assert os.path.exists(path)
efa97bdc
YCH
5143
5144 try:
6f7563be 5145 with open(f'{path}:{key}', 'wb') as f:
5146 f.write(value)
86e5f3ed 5147 except OSError as e:
efa97bdc 5148 raise XAttrMetadataError(e.errno, e.strerror)
6f7563be 5149 return
efa97bdc 5150
6f7563be 5151 # UNIX Method 1. Use xattrs/pyxattrs modules
efa97bdc 5152
6f7563be 5153 setxattr = None
5154 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5155 # Unicode arguments are not supported in pyxattr until version 0.5.0
5156 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5157 if version_tuple(xattr.__version__) >= (0, 5, 0):
5158 setxattr = xattr.set
5159 elif xattr:
5160 setxattr = xattr.setxattr
efa97bdc 5161
6f7563be 5162 if setxattr:
5163 try:
5164 setxattr(path, key, value)
5165 except OSError as e:
5166 raise XAttrMetadataError(e.errno, e.strerror)
5167 return
efa97bdc 5168
6f7563be 5169 # UNIX Method 2. Use setfattr/xattr executables
5170 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5171 else 'xattr' if check_executable('xattr', ['-h']) else None)
5172 if not exe:
5173 raise XAttrUnavailableError(
5174 'Couldn\'t find a tool to set the xattrs. Install either the Python "xattr" or "pyxattr" module, or the '
5175 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
efa97bdc 5176
0f06bcd7 5177 value = value.decode()
6f7563be 5178 try:
f0c9fb96 5179 _, stderr, returncode = Popen.run(
6f7563be 5180 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
e121e3ce 5181 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
6f7563be 5182 except OSError as e:
5183 raise XAttrMetadataError(e.errno, e.strerror)
f0c9fb96 5184 if returncode:
5185 raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
5186
5187
5188def random_birthday(year_field, month_field, day_field):
aa374bc7
AS
5189 start_date = datetime.date(1950, 1, 1)
5190 end_date = datetime.date(1995, 12, 31)
5191 offset = random.randint(0, (end_date - start_date).days)
5192 random_date = start_date + datetime.timedelta(offset)
0c265486 5193 return {
aa374bc7
AS
5194 year_field: str(random_date.year),
5195 month_field: str(random_date.month),
5196 day_field: str(random_date.day),
0c265486 5197 }
732044af 5198
c76eb41b 5199
732044af 5200# Templates for internet shortcut files, which are plain text files.
e5a998f3 5201DOT_URL_LINK_TEMPLATE = '''\
732044af 5202[InternetShortcut]
5203URL=%(url)s
e5a998f3 5204'''
732044af 5205
e5a998f3 5206DOT_WEBLOC_LINK_TEMPLATE = '''\
732044af 5207<?xml version="1.0" encoding="UTF-8"?>
5208<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5209<plist version="1.0">
5210<dict>
5211\t<key>URL</key>
5212\t<string>%(url)s</string>
5213</dict>
5214</plist>
e5a998f3 5215'''
732044af 5216
e5a998f3 5217DOT_DESKTOP_LINK_TEMPLATE = '''\
732044af 5218[Desktop Entry]
5219Encoding=UTF-8
5220Name=%(filename)s
5221Type=Link
5222URL=%(url)s
5223Icon=text-html
e5a998f3 5224'''
732044af 5225
08438d2c 5226LINK_TEMPLATES = {
5227 'url': DOT_URL_LINK_TEMPLATE,
5228 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5229 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5230}
5231
732044af 5232
5233def iri_to_uri(iri):
5234 """
5235 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5236
5237 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5238 """
5239
14f25df2 5240 iri_parts = urllib.parse.urlparse(iri)
732044af 5241
5242 if '[' in iri_parts.netloc:
5243 raise ValueError('IPv6 URIs are not yet supported.')
5244 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5245
5246 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5247
5248 net_location = ''
5249 if iri_parts.username:
f9934b96 5250 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
732044af 5251 if iri_parts.password is not None:
f9934b96 5252 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
732044af 5253 net_location += '@'
5254
0f06bcd7 5255 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
732044af 5256 # The 'idna' encoding produces ASCII text.
5257 if iri_parts.port is not None and iri_parts.port != 80:
5258 net_location += ':' + str(iri_parts.port)
5259
f9934b96 5260 return urllib.parse.urlunparse(
732044af 5261 (iri_parts.scheme,
5262 net_location,
5263
f9934b96 5264 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5265
5266 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
f9934b96 5267 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5268
5269 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
f9934b96 5270 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
732044af 5271
f9934b96 5272 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
732044af 5273
5274 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
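# Sketch with a made-up IRI (hostname kept ASCII here, so only the path needs escaping):
#   iri_to_uri('https://ex.ample/söme/päth') -> 'https://ex.ample/s%C3%B6me/p%C3%A4th'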
5275
5276
5277def to_high_limit_path(path):
5278 if sys.platform in ['win32', 'cygwin']:
5279 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
e5a998f3 5280 return '\\\\?\\' + os.path.abspath(path)
732044af 5281
5282 return path
76d321f6 5283
c76eb41b 5284
7b2c3f47 5285def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
e0ddbd02 5286 val = traverse_obj(obj, *variadic(field))
7b2c3f47 5287 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
e0ddbd02 5288 return default
7b2c3f47 5289 return template % func(val)
00dd0cd5 5290
5291
5292def clean_podcast_url(url):
5293 return re.sub(r'''(?x)
5294 (?:
5295 (?:
5296 chtbl\.com/track|
5297 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5298 play\.podtrac\.com
5299 )/[^/]+|
5300 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5301 flex\.acast\.com|
5302 pd(?:
5303 cn\.co| # https://podcorn.com/analytics-prefix/
5304 st\.fm # https://podsights.com/docs/
5305 )/e
5306 )/''', '', url)
ffcb8191
THD
5307
5308
5309_HEX_TABLE = '0123456789abcdef'
5310
5311
5312def random_uuidv4():
5313 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 5314
5315
5316def make_dir(path, to_screen=None):
5317 try:
5318 dn = os.path.dirname(path)
5319 if dn and not os.path.exists(dn):
5320 os.makedirs(dn)
5321 return True
86e5f3ed 5322 except OSError as err:
0202b52a 5323 if callable(to_screen):
5324 to_screen('unable to create directory ' + error_to_compat_str(err))
5325 return False
f74980cb 5326
5327
5328def get_executable_path():
b5899f4f 5329 from .update import _get_variant_and_executable_path
c487cf00 5330
b5899f4f 5331 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
f74980cb 5332
5333
2f567473 5334def load_plugins(name, suffix, namespace):
3ae5e797 5335 classes = {}
19a03940 5336 with contextlib.suppress(FileNotFoundError):
019a94f7
ÁS
5337 plugins_spec = importlib.util.spec_from_file_location(
5338 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5339 plugins = importlib.util.module_from_spec(plugins_spec)
5340 sys.modules[plugins_spec.name] = plugins
5341 plugins_spec.loader.exec_module(plugins)
f74980cb 5342 for name in dir(plugins):
2f567473 5343 if name in namespace:
5344 continue
5345 if not name.endswith(suffix):
f74980cb 5346 continue
5347 klass = getattr(plugins, name)
3ae5e797 5348 classes[name] = namespace[name] = klass
f74980cb 5349 return classes
06167fbb 5350
5351
325ebc17 5352def traverse_obj(
f99bbfc9 5353 obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
325ebc17 5354 casesense=True, is_user_input=False, traverse_string=False):
ab029d7e
SS
5355 """
5356 Safely traverse nested `dict`s and `Sequence`s
5357
5358 >>> obj = [{}, {"key": "value"}]
5359 >>> traverse_obj(obj, (1, "key"))
5360 "value"
5361
5362 Each of the provided `paths` is tested and the first producing a valid result will be returned.
f99bbfc9 5363 The next path will also be tested if the path branched but no results could be found.
7b0127e1 5364 Supported values for traversal are `Mapping`, `Sequence` and `re.Match`.
ab029d7e
SS
5365 A value of None is treated as the absence of a value.
5366
5367 The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.
5368
5369 The keys in the path can be one of:
5370 - `None`: Return the current object.
7b0127e1 5371 - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`.
ab029d7e
SS
5372 - `slice`: Branch out and return all values in `obj[key]`.
5373 - `Ellipsis`: Branch out and return a list of all values.
5374 - `tuple`/`list`: Branch out and return a list of all matching values.
5375 Read as: `[traverse_obj(obj, branch) for branch in branches]`.
5376 - `function`: Branch out and return values filtered by the function.
5377 Read as: `[value for key, value in obj if function(key, value)]`.
5378 For `Sequence`s, `key` is the index of the value.
5379 - `dict`: Transform the current object and return a matching dict.
5380 Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.
5381
7b0127e1 5382 `tuple`, `list`, and `dict` all support nested paths and branches.
ab029d7e
SS
5383
5384 @params paths Paths which to traverse by.
5385 @param default Value to return if the paths do not match.
5386 @param expected_type If a `type`, only accept final values of this type.
5387 If any other callable, try to call the function on each result.
5388 @param get_all If `False`, return the first matching result, otherwise all matching ones.
5389 @param casesense If `False`, consider string dictionary keys as case insensitive.
5390
5391 The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API
5392
5393 @param is_user_input Whether the keys are generated from user input.
5394 If `True` strings get converted to `int`/`slice` if needed.
5395 @param traverse_string Whether to traverse into objects as strings.
5396 If `True`, any non-compatible object will first be
5397 converted into a string and then traversed into.
5398
5399
5400 @returns The result of the object traversal.
5401 If successful, `get_all=True`, and the path branches at least once,
5402 then a list of results is returned instead.
f99bbfc9 5403 A list is always returned if the last path branches and no `default` is given.
ab029d7e
SS
5404 """
5405 is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes))
5406 casefold = lambda k: k.casefold() if isinstance(k, str) else k
325ebc17 5407
352d63fd 5408 if isinstance(expected_type, type):
5409 type_test = lambda val: val if isinstance(val, expected_type) else None
352d63fd 5410 else:
ab029d7e
SS
5411 type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,))
5412
5413 def apply_key(key, obj):
5414 if obj is None:
5415 return
5416
5417 elif key is None:
5418 yield obj
5419
5420 elif isinstance(key, (list, tuple)):
5421 for branch in key:
5422 _, result = apply_path(obj, branch)
5423 yield from result
5424
5425 elif key is ...:
5426 if isinstance(obj, collections.abc.Mapping):
5427 yield from obj.values()
5428 elif is_sequence(obj):
5429 yield from obj
7b0127e1
SS
5430 elif isinstance(obj, re.Match):
5431 yield from obj.groups()
ab029d7e
SS
5432 elif traverse_string:
5433 yield from str(obj)
5434
5435 elif callable(key):
5436 if is_sequence(obj):
5437 iter_obj = enumerate(obj)
5438 elif isinstance(obj, collections.abc.Mapping):
5439 iter_obj = obj.items()
7b0127e1
SS
5440 elif isinstance(obj, re.Match):
5441 iter_obj = enumerate((obj.group(), *obj.groups()))
ab029d7e
SS
5442 elif traverse_string:
5443 iter_obj = enumerate(str(obj))
352d63fd 5444 else:
ab029d7e
SS
5445 return
5446 yield from (v for k, v in iter_obj if try_call(key, args=(k, v)))
5447
5448 elif isinstance(key, dict):
5449 iter_obj = ((k, _traverse_obj(obj, v)) for k, v in key.items())
5450 yield {k: v if v is not None else default for k, v in iter_obj
f99bbfc9 5451 if v is not None or default is not NO_DEFAULT}
ab029d7e 5452
7b0127e1 5453 elif isinstance(obj, collections.abc.Mapping):
ab029d7e
SS
5454 yield (obj.get(key) if casesense or (key in obj)
5455 else next((v for k, v in obj.items() if casefold(k) == key), None))
5456
7b0127e1
SS
5457 elif isinstance(obj, re.Match):
5458 if isinstance(key, int) or casesense:
5459 with contextlib.suppress(IndexError):
5460 yield obj.group(key)
5461 return
5462
5463 if not isinstance(key, str):
5464 return
5465
5466 yield next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)
5467
ab029d7e
SS
5468 else:
5469 if is_user_input:
5470 key = (int_or_none(key) if ':' not in key
5471 else slice(*map(int_or_none, key.split(':'))))
5472
5473 if not isinstance(key, (int, slice)):
5474 return
5475
5476 if not is_sequence(obj):
5477 if not traverse_string:
5478 return
5479 obj = str(obj)
5480
5481 with contextlib.suppress(IndexError):
5482 yield obj[key]
5483
5484 def apply_path(start_obj, path):
5485 objs = (start_obj,)
5486 has_branched = False
5487
5488 for key in variadic(path):
5489 if is_user_input and key == ':':
5490 key = ...
5491
5492 if not casesense and isinstance(key, str):
5493 key = key.casefold()
5494
5495 if key is ... or isinstance(key, (list, tuple)) or callable(key):
5496 has_branched = True
5497
5498 key_func = functools.partial(apply_key, key)
5499 objs = itertools.chain.from_iterable(map(key_func, objs))
5500
5501 return has_branched, objs
5502
f99bbfc9 5503 def _traverse_obj(obj, path, use_list=True):
ab029d7e
SS
5504 has_branched, results = apply_path(obj, path)
5505 results = LazyList(x for x in map(type_test, results) if x is not None)
ab029d7e 5506
f99bbfc9
SS
5507 if get_all and has_branched:
5508 return results.exhaust() if results or use_list else None
5509
5510 return results[0] if results else None
5511
5512 for index, path in enumerate(paths, 1):
5513 use_list = default is NO_DEFAULT and index == len(paths)
5514 result = _traverse_obj(obj, path, use_list)
ab029d7e
SS
5515 if result is not None:
5516 return result
5517
f99bbfc9 5518 return None if default is NO_DEFAULT else default
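# Branching sketch (see the docstring above for the full key grammar; the data is made up):
#   traverse_obj({'a': [{'x': 1}, {'x': 2}]}, ('a', ..., 'x'))  -> [1, 2]
#   traverse_obj({'id': 'x1', 'meta': {'title': 't'}},
#                {'video_id': 'id', 'title': ('meta', 'title')}) -> {'video_id': 'x1', 'title': 't'}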
324ad820 5519
5520
5521def traverse_dict(dictn, keys, casesense=True):
da4db748 5522 deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed '
5523 f'in a future version. Use "{__name__}.traverse_obj" instead')
ee8dd27a 5524 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5525
5526
ff91cf74 5527def get_first(obj, keys, **kwargs):
5528 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5529
5530
3e9b66d7
LNO
5531def time_seconds(**kwargs):
5532 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5533 return t.timestamp()
5534
5535
49fa4d9a
N
5536# create a JSON Web Signature (jws) with HS256 algorithm
5537# the resulting format is in JWS Compact Serialization
5538# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5539# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5540def jwt_encode_hs256(payload_data, key, headers={}):
5541 header_data = {
5542 'alg': 'HS256',
5543 'typ': 'JWT',
5544 }
5545 if headers:
5546 header_data.update(headers)
0f06bcd7 5547 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5548 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5549 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
49fa4d9a
N
5550 signature_b64 = base64.b64encode(h.digest())
5551 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5552 return token
819e0531 5553
5554
16b0d7e6 5555 # can be extended in the future to verify the signature, parse the header, and return the algorithm used if it's not HS256
5556def jwt_decode_hs256(jwt):
5557 header_b64, payload_b64, signature_b64 = jwt.split('.')
2c98d998 5558 # add trailing ='s that may have been stripped, superfluous ='s are ignored
5559 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
16b0d7e6 5560 return payload_data
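# Round-trip sketch (toy secret; note that jwt_decode_hs256 does not verify the signature):
#   token = jwt_encode_hs256({'uid': 1}, 'secret')   # bytes of the form b'<header>.<payload>.<sig>'
#   jwt_decode_hs256(token.decode()) -> {'uid': 1}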
5561
5562
53973b4d 5563WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5564
5565
7a32c70d 5566@functools.cache
819e0531 5567def supports_terminal_sequences(stream):
5568 if compat_os_name == 'nt':
8a82af35 5569 if not WINDOWS_VT_MODE:
819e0531 5570 return False
5571 elif not os.getenv('TERM'):
5572 return False
5573 try:
5574 return stream.isatty()
5575 except BaseException:
5576 return False
5577
5578
53973b4d 5579def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
8a82af35 5580 if get_windows_version() < (10, 0, 10586):
53973b4d 5581 return
5582 global WINDOWS_VT_MODE
53973b4d 5583 try:
f0c9fb96 5584 Popen.run('', shell=True)
53973b4d 5585 except Exception:
5586 return
5587
5588 WINDOWS_VT_MODE = True
5589 supports_terminal_sequences.cache_clear()
5590
5591
ec11a9f4 5592_terminal_sequences_re = re.compile('\033\\[[^m]+m')
5593
5594
5595def remove_terminal_sequences(string):
5596 return _terminal_sequences_re.sub('', string)
5597
5598
5599def number_of_digits(number):
5600 return len('%d' % number)
34921b43 5601
5602
5603def join_nonempty(*values, delim='-', from_dict=None):
5604 if from_dict is not None:
7b2c3f47 5605 values = (traverse_obj(from_dict, variadic(v)) for v in values)
34921b43 5606 return delim.join(map(str, filter(None, values)))
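# Sketch: falsy values are dropped before joining:
#   join_nonempty('mp4', None, 1080, '', delim='-') -> 'mp4-1080'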
06e57990 5607
5608
27231526
ZM
5609def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5610 """
5611 Find the largest format dimensions in terms of video width and, for each thumbnail:
5612 * Modify the URL: Match the width with the provided regex and replace with the former width
5613 * Update dimensions
5614
5615 This function is useful with video services that scale the provided thumbnails on demand
5616 """
5617 _keys = ('width', 'height')
5618 max_dimensions = max(
86e5f3ed 5619 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
27231526
ZM
5620 default=(0, 0))
5621 if not max_dimensions[0]:
5622 return thumbnails
5623 return [
5624 merge_dicts(
5625 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5626 dict(zip(_keys, max_dimensions)), thumbnail)
5627 for thumbnail in thumbnails
5628 ]
5629
5630
93c8410d
LNO
5631def parse_http_range(range):
5632 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5633 if not range:
5634 return None, None, None
5635 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5636 if not crg:
5637 return None, None, None
5638 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5639
5640
6b9e832d 5641def read_stdin(what):
5642 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5643 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5644 return sys.stdin
5645
5646
a904a7f8
L
5647def determine_file_encoding(data):
5648 """
88f60feb 5649 Detect the text encoding used
a904a7f8
L
5650 @returns (encoding, bytes to skip)
5651 """
5652
88f60feb 5653 # BOM marks are given priority over declarations
a904a7f8 5654 for bom, enc in BOMS:
a904a7f8
L
5655 if data.startswith(bom):
5656 return enc, len(bom)
5657
88f60feb 5658 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5659 # We ignore the endianness to get a good enough match
a904a7f8 5660 data = data.replace(b'\0', b'')
88f60feb 5661 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5662 return mobj.group(1).decode() if mobj else None, 0
a904a7f8
L
5663
5664
06e57990 5665class Config:
5666 own_args = None
9e491463 5667 parsed_args = None
06e57990 5668 filename = None
5669 __initialized = False
5670
5671 def __init__(self, parser, label=None):
9e491463 5672 self.parser, self.label = parser, label
06e57990 5673 self._loaded_paths, self.configs = set(), []
5674
5675 def init(self, args=None, filename=None):
5676 assert not self.__initialized
284a60c5 5677 self.own_args, self.filename = args, filename
5678 return self.load_configs()
5679
5680 def load_configs(self):
65662dff 5681 directory = ''
284a60c5 5682 if self.filename:
5683 location = os.path.realpath(self.filename)
65662dff 5684 directory = os.path.dirname(location)
06e57990 5685 if location in self._loaded_paths:
5686 return False
5687 self._loaded_paths.add(location)
5688
284a60c5 5689 self.__initialized = True
5690 opts, _ = self.parser.parse_known_args(self.own_args)
5691 self.parsed_args = self.own_args
9e491463 5692 for location in opts.config_locations or []:
6b9e832d 5693 if location == '-':
1060f82f 5694 if location in self._loaded_paths:
5695 continue
5696 self._loaded_paths.add(location)
6b9e832d 5697 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5698 continue
65662dff 5699 location = os.path.join(directory, expand_path(location))
06e57990 5700 if os.path.isdir(location):
5701 location = os.path.join(location, 'yt-dlp.conf')
5702 if not os.path.exists(location):
9e491463 5703 self.parser.error(f'config location {location} does not exist')
06e57990 5704 self.append_config(self.read_file(location), location)
5705 return True
5706
5707 def __str__(self):
5708 label = join_nonempty(
5709 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5710 delim=' ')
5711 return join_nonempty(
5712 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5713 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5714 delim='\n')
5715
7a32c70d 5716 @staticmethod
06e57990 5717 def read_file(filename, default=[]):
5718 try:
a904a7f8 5719 optionf = open(filename, 'rb')
86e5f3ed 5720 except OSError:
06e57990 5721 return default # silently skip if file is not present
a904a7f8
L
5722 try:
5723 enc, skip = determine_file_encoding(optionf.read(512))
5724 optionf.seek(skip, io.SEEK_SET)
5725 except OSError:
5726 enc = None # silently skip read errors
06e57990 5727 try:
5728 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
a904a7f8 5729 contents = optionf.read().decode(enc or preferredencoding())
f9934b96 5730 res = shlex.split(contents, comments=True)
44a6fcff 5731 except Exception as err:
5732 raise ValueError(f'Unable to parse "{filename}": {err}')
06e57990 5733 finally:
5734 optionf.close()
5735 return res
5736
7a32c70d 5737 @staticmethod
06e57990 5738 def hide_login_info(opts):
86e5f3ed 5739 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
06e57990 5740 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5741
5742 def _scrub_eq(o):
5743 m = eqre.match(o)
5744 if m:
5745 return m.group('key') + '=PRIVATE'
5746 else:
5747 return o
5748
5749 opts = list(map(_scrub_eq, opts))
5750 for idx, opt in enumerate(opts):
5751 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5752 opts[idx + 1] = 'PRIVATE'
5753 return opts
5754
5755 def append_config(self, *args, label=None):
9e491463 5756 config = type(self)(self.parser, label)
06e57990 5757 config._loaded_paths = self._loaded_paths
5758 if config.init(*args):
5759 self.configs.append(config)
5760
7a32c70d 5761 @property
06e57990 5762 def all_args(self):
5763 for config in reversed(self.configs):
5764 yield from config.all_args
9e491463 5765 yield from self.parsed_args or []
5766
5767 def parse_known_args(self, **kwargs):
5768 return self.parser.parse_known_args(self.all_args, **kwargs)
06e57990 5769
5770 def parse_args(self):
9e491463 5771 return self.parser.parse_args(self.all_args)
da42679b
LNO
5772
5773
d5d1df8a 5774class WebSocketsWrapper:
da42679b 5775 """Wraps the websockets module for use in non-async scopes"""
abfecb7b 5776 pool = None
da42679b 5777
3cea3edd 5778 def __init__(self, url, headers=None, connect=True):
059bc4db 5779 self.loop = asyncio.new_event_loop()
9cd08050 5780 # XXX: "loop" is deprecated
5781 self.conn = websockets.connect(
5782 url, extra_headers=headers, ping_interval=None,
5783 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
3cea3edd
LNO
5784 if connect:
5785 self.__enter__()
15dfb392 5786 atexit.register(self.__exit__, None, None, None)
da42679b
LNO
5787
5788 def __enter__(self):
3cea3edd 5789 if not self.pool:
9cd08050 5790 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
da42679b
LNO
5791 return self
5792
5793 def send(self, *args):
5794 self.run_with_loop(self.pool.send(*args), self.loop)
5795
5796 def recv(self, *args):
5797 return self.run_with_loop(self.pool.recv(*args), self.loop)
5798
5799 def __exit__(self, type, value, traceback):
5800 try:
5801 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5802 finally:
5803 self.loop.close()
15dfb392 5804 self._cancel_all_tasks(self.loop)
da42679b
LNO
5805
5806 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5807 # for contributors: if any new library that uses asyncio needs to be run in non-async code, move these functions out of this class
7a32c70d 5808 @staticmethod
da42679b 5809 def run_with_loop(main, loop):
059bc4db 5810 if not asyncio.iscoroutine(main):
da42679b
LNO
5811 raise ValueError(f'a coroutine was expected, got {main!r}')
5812
5813 try:
5814 return loop.run_until_complete(main)
5815 finally:
5816 loop.run_until_complete(loop.shutdown_asyncgens())
5817 if hasattr(loop, 'shutdown_default_executor'):
5818 loop.run_until_complete(loop.shutdown_default_executor())
5819
7a32c70d 5820 @staticmethod
da42679b 5821 def _cancel_all_tasks(loop):
059bc4db 5822 to_cancel = asyncio.all_tasks(loop)
da42679b
LNO
5823
5824 if not to_cancel:
5825 return
5826
5827 for task in to_cancel:
5828 task.cancel()
5829
9cd08050 5830 # XXX: "loop" is removed in python 3.10+
da42679b 5831 loop.run_until_complete(
059bc4db 5832 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
da42679b
LNO
5833
5834 for task in to_cancel:
5835 if task.cancelled():
5836 continue
5837 if task.exception() is not None:
5838 loop.call_exception_handler({
5839 'message': 'unhandled exception during asyncio.run() shutdown',
5840 'exception': task.exception(),
5841 'task': task,
5842 })
5843
5844
8b7539d2 5845def merge_headers(*dicts):
08d30158 5846 """Merge dicts of HTTP headers case-insensitively, giving precedence to later ones"""
76aa9913 5847 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
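# Sketch: keys are title-cased and later dicts win on conflicts:
#   merge_headers({'user-agent': 'A', 'Accept': '*/*'}, {'USER-AGENT': 'B'})
#   -> {'User-Agent': 'B', 'Accept': '*/*'}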
28787f16 5848
5849
b1f94422 5850def cached_method(f):
5851 """Cache a method"""
5852 signature = inspect.signature(f)
5853
7a32c70d 5854 @functools.wraps(f)
b1f94422 5855 def wrapper(self, *args, **kwargs):
5856 bound_args = signature.bind(self, *args, **kwargs)
5857 bound_args.apply_defaults()
d5d1df8a 5858 key = tuple(bound_args.arguments.values())[1:]
b1f94422 5859
6368e2e6 5860 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
b1f94422 5861 if key not in cache:
5862 cache[key] = f(self, *args, **kwargs)
5863 return cache[key]
5864 return wrapper
5865
5866
28787f16 5867class classproperty:
83cc7b8a 5868 """property access for class methods with optional caching"""
5869 def __new__(cls, func=None, *args, **kwargs):
5870 if not func:
5871 return functools.partial(cls, *args, **kwargs)
5872 return super().__new__(cls)
c487cf00 5873
83cc7b8a 5874 def __init__(self, func, *, cache=False):
c487cf00 5875 functools.update_wrapper(self, func)
5876 self.func = func
83cc7b8a 5877 self._cache = {} if cache else None
28787f16 5878
5879 def __get__(self, _, cls):
83cc7b8a 5880 if self._cache is None:
5881 return self.func(cls)
5882 elif cls not in self._cache:
5883 self._cache[cls] = self.func(cls)
5884 return self._cache[cls]
19a03940 5885
5886
64fa820c 5887class Namespace(types.SimpleNamespace):
591bb9d3 5888 """Immutable namespace"""
591bb9d3 5889
7896214c 5890 def __iter__(self):
64fa820c 5891 return iter(self.__dict__.values())
7896214c 5892
7a32c70d 5893 @property
64fa820c 5894 def items_(self):
5895 return self.__dict__.items()
9b8ee23b 5896
5897
8dc59305 5898MEDIA_EXTENSIONS = Namespace(
5899 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5900 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5901 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5902 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
5903 thumbnails=('jpg', 'png', 'webp'),
5904 storyboards=('mhtml', ),
5905 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5906 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5907)
5908MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5909MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5910
5911KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5912
5913
be5c1ae8 5914class RetryManager:
5915 """Usage:
5916 for retry in RetryManager(...):
5917 try:
5918 ...
5919 except SomeException as err:
5920 retry.error = err
5921 continue
5922 """
5923 attempt, _error = 0, None
5924
5925 def __init__(self, _retries, _error_callback, **kwargs):
5926 self.retries = _retries or 0
5927 self.error_callback = functools.partial(_error_callback, **kwargs)
5928
5929 def _should_retry(self):
5930 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5931
7a32c70d 5932 @property
be5c1ae8 5933 def error(self):
5934 if self._error is NO_DEFAULT:
5935 return None
5936 return self._error
5937
7a32c70d 5938 @error.setter
be5c1ae8 5939 def error(self, value):
5940 self._error = value
5941
5942 def __iter__(self):
5943 while self._should_retry():
5944 self.error = NO_DEFAULT
5945 self.attempt += 1
5946 yield self
5947 if self.error:
5948 self.error_callback(self.error, self.attempt, self.retries)
5949
7a32c70d 5950 @staticmethod
be5c1ae8 5951 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5952 """Utility function for reporting retries"""
5953 if count > retries:
5954 if error:
5955 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5956 raise e
5957
5958 if not count:
5959 return warn(e)
5960 elif isinstance(e, ExtractorError):
3ce29336 5961 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
be5c1ae8 5962 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5963
5964 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5965 if delay:
5966 info(f'Sleeping {delay:.2f} seconds ...')
5967 time.sleep(delay)
5968
5969
0647d925 5970def make_archive_id(ie, video_id):
5971 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5972 return f'{ie_key.lower()} {video_id}'
5973
5974
a1c5bd82 5975def truncate_string(s, left, right=0):
5976 assert left > 3 and right >= 0
5977 if s is None or len(s) <= left + right:
5978 return s
5979 return f'{s[:left-3]}...{s[len(s) - right:]}'
5980
5981
5314b521 5982def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
5983 assert 'all' in alias_dict, '"all" alias is required'
5984 requested = list(start or [])
5985 for val in options:
5986 discard = val.startswith('-')
5987 if discard:
5988 val = val[1:]
5989
5990 if val in alias_dict:
5991 val = alias_dict[val] if not discard else [
5992 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
5993 # NB: Do not allow regex in aliases for performance
5994 requested = orderedSet_from_options(val, alias_dict, start=requested)
5995 continue
5996
5997 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
5998 else [val] if val in alias_dict['all'] else None)
5999 if current is None:
6000 raise ValueError(val)
6001
6002 if discard:
6003 for item in current:
6004 while item in requested:
6005 requested.remove(item)
6006 else:
6007 requested.extend(current)
6008
6009 return orderedSet(requested)
6010
6011
d0d74b71 6012class FormatSorter:
6013 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
6014
6015 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
6016 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
6017 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
6018 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
6019 'height', 'width', 'proto', 'vext', 'abr', 'aext',
6020 'fps', 'fs_approx', 'source', 'id')
6021
6022 settings = {
6023 'vcodec': {'type': 'ordered', 'regex': True,
6024 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
6025 'acodec': {'type': 'ordered', 'regex': True,
6026 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
6027 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
6028 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
6029 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
6030 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
6031 'vext': {'type': 'ordered', 'field': 'video_ext',
29ca4082 6032 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
6033 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
d0d74b71 6034 'aext': {'type': 'ordered', 'field': 'audio_ext',
6035 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
6036 'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')},
6037 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
6038 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
6039 'field': ('vcodec', 'acodec'),
6040 'function': lambda it: int(any(v != 'none' for v in it))},
6041 'ie_pref': {'priority': True, 'type': 'extractor'},
6042 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
6043 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
6044 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
6045 'quality': {'convert': 'float', 'default': -1},
6046 'filesize': {'convert': 'bytes'},
6047 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
6048 'id': {'convert': 'string', 'field': 'format_id'},
6049 'height': {'convert': 'float_none'},
6050 'width': {'convert': 'float_none'},
6051 'fps': {'convert': 'float_none'},
6052 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
6053 'tbr': {'convert': 'float_none'},
6054 'vbr': {'convert': 'float_none'},
6055 'abr': {'convert': 'float_none'},
6056 'asr': {'convert': 'float_none'},
6057 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
6058
6059 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
6060 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
6061 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
6062 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
6063 'res': {'type': 'multiple', 'field': ('height', 'width'),
6064 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
6065
6066 # Actual field names
6067 'format_id': {'type': 'alias', 'field': 'id'},
6068 'preference': {'type': 'alias', 'field': 'ie_pref'},
6069 'language_preference': {'type': 'alias', 'field': 'lang'},
6070 'source_preference': {'type': 'alias', 'field': 'source'},
6071 'protocol': {'type': 'alias', 'field': 'proto'},
6072 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
6073 'audio_channels': {'type': 'alias', 'field': 'channels'},
6074
6075 # Deprecated
6076 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
6077 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
6078 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
6079 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
6080 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
6081 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
6082 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
6083 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
6084 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
6085 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
6086 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
6087 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
6088 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
6089 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
6090 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6091 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6092 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6093 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6094 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6095 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6096 }
6097
6098 def __init__(self, ydl, field_preference):
6099 self.ydl = ydl
6100 self._order = []
6101 self.evaluate_params(self.ydl.params, field_preference)
6102 if ydl.params.get('verbose'):
6103 self.print_verbose_info(self.ydl.write_debug)
6104
6105 def _get_field_setting(self, field, key):
6106 if field not in self.settings:
6107 if key in ('forced', 'priority'):
6108 return False
6109 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
6110 'deprecated and may be removed in a future version')
6111 self.settings[field] = {}
6112 propObj = self.settings[field]
6113 if key not in propObj:
6114 type = propObj.get('type')
6115 if key == 'field':
6116 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
6117 elif key == 'convert':
6118 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
6119 else:
6120 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
6121 propObj[key] = default
6122 return propObj[key]
6123
6124 def _resolve_field_value(self, field, value, convertNone=False):
6125 if value is None:
6126 if not convertNone:
6127 return None
6128 else:
6129 value = value.lower()
6130 conversion = self._get_field_setting(field, 'convert')
6131 if conversion == 'ignore':
6132 return None
6133 if conversion == 'string':
6134 return value
6135 elif conversion == 'float_none':
6136 return float_or_none(value)
6137 elif conversion == 'bytes':
6138 return parse_bytes(value)
6139 elif conversion == 'order':
6140 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
6141 use_regex = self._get_field_setting(field, 'regex')
6142 list_length = len(order_list)
6143 empty_pos = order_list.index('') if '' in order_list else list_length + 1
6144 if use_regex and value is not None:
6145 for i, regex in enumerate(order_list):
6146 if regex and re.match(regex, value):
6147 return list_length - i
6148 return list_length - empty_pos # not in list
6149 else: # not regex or value = None
6150 return list_length - (order_list.index(value) if value in order_list else empty_pos)
6151 else:
6152 if value.isnumeric():
6153 return float(value)
6154 else:
6155 self.settings[field]['convert'] = 'string'
6156 return value
6157
6158 def evaluate_params(self, params, sort_extractor):
6159 self._use_free_order = params.get('prefer_free_formats', False)
6160 self._sort_user = params.get('format_sort', [])
6161 self._sort_extractor = sort_extractor
6162
6163 def add_item(field, reverse, closest, limit_text):
6164 field = field.lower()
6165 if field in self._order:
6166 return
6167 self._order.append(field)
6168 limit = self._resolve_field_value(field, limit_text)
6169 data = {
6170 'reverse': reverse,
6171 'closest': False if limit is None else closest,
6172 'limit_text': limit_text,
6173 'limit': limit}
6174 if field in self.settings:
6175 self.settings[field].update(data)
6176 else:
6177 self.settings[field] = data
6178
6179 sort_list = (
6180 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
6181 + (tuple() if params.get('format_sort_force', False)
6182 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
6183 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
6184
6185 for item in sort_list:
6186 match = re.match(self.regex, item)
6187 if match is None:
6188 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
6189 field = match.group('field')
6190 if field is None:
6191 continue
6192 if self._get_field_setting(field, 'type') == 'alias':
6193 alias, field = field, self._get_field_setting(field, 'field')
6194 if self._get_field_setting(alias, 'deprecated'):
6195 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
6196 f'be removed in a future version. Please use {field} instead')
6197 reverse = match.group('reverse') is not None
6198 closest = match.group('separator') == '~'
6199 limit_text = match.group('limit')
6200
6201 has_limit = limit_text is not None
6202 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
6203 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
6204
6205 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
6206 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
6207 limit_count = len(limits)
6208 for (i, f) in enumerate(fields):
6209 add_item(f, reverse, closest,
6210 limits[i] if i < limit_count
6211 else limits[0] if has_limit and not has_multiple_limits
6212 else None)
6213
6214 def print_verbose_info(self, write_debug):
6215 if self._sort_user:
6216 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
6217 if self._sort_extractor:
6218 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
6219 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
6220 '+' if self._get_field_setting(field, 'reverse') else '', field,
6221 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
6222 self._get_field_setting(field, 'limit_text'),
6223 self._get_field_setting(field, 'limit'))
6224 if self._get_field_setting(field, 'limit_text') is not None else '')
6225 for field in self._order if self._get_field_setting(field, 'visible')]))
6226
6227 def _calculate_field_preference_from_value(self, format, field, type, value):
6228 reverse = self._get_field_setting(field, 'reverse')
6229 closest = self._get_field_setting(field, 'closest')
6230 limit = self._get_field_setting(field, 'limit')
6231
6232 if type == 'extractor':
6233 maximum = self._get_field_setting(field, 'max')
6234 if value is None or (maximum is not None and value >= maximum):
6235 value = -1
6236 elif type == 'boolean':
6237 in_list = self._get_field_setting(field, 'in_list')
6238 not_in_list = self._get_field_setting(field, 'not_in_list')
6239 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
6240 elif type == 'ordered':
6241 value = self._resolve_field_value(field, value, True)
6242
6243 # try to convert to number
6244 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
6245 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
6246 if is_num:
6247 value = val_num
6248
6249 return ((-10, 0) if value is None
6250 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
6251 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
6252 else (0, value, 0) if not reverse and (limit is None or value <= limit)
6253 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
6254 else (-1, value, 0))
6255
6256 def _calculate_field_preference(self, format, field):
6257 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
6258 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
6259 if type == 'multiple':
6260 type = 'field' # Only 'field' is allowed in multiple for now
6261 actual_fields = self._get_field_setting(field, 'field')
6262
6263 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
6264 else:
6265 value = get_value(field)
6266 return self._calculate_field_preference_from_value(format, field, type, value)
6267
6268 def calculate_preference(self, format):
6269 # Determine missing protocol
6270 if not format.get('protocol'):
6271 format['protocol'] = determine_protocol(format)
6272
6273 # Determine missing ext
6274 if not format.get('ext') and 'url' in format:
6275 format['ext'] = determine_ext(format['url'])
6276 if format.get('vcodec') == 'none':
6277 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
6278 format['video_ext'] = 'none'
6279 else:
6280 format['video_ext'] = format['ext']
6281 format['audio_ext'] = 'none'
6282 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
6283 # format['preference'] = -1000
6284
6285 # Determine missing bitrates
6286 if format.get('tbr') is None:
6287 if format.get('vbr') is not None and format.get('abr') is not None:
6288 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
6289 else:
6290 if format.get('vcodec') != 'none' and format.get('vbr') is None:
6291 format['vbr'] = format.get('tbr') - format.get('abr', 0)
6292 if format.get('acodec') != 'none' and format.get('abr') is None:
6293 format['abr'] = format.get('tbr') - format.get('vbr', 0)
6294
6295 return tuple(self._calculate_field_preference(format, field) for field in self._order)
6296
6297
9b8ee23b 6298# Deprecated
6299has_certifi = bool(certifi)
6300has_websockets = bool(websockets)