import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import collections.abc
import contextlib
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import unicodedata
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
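
# Illustrative usage (not part of the original module): write_json_file() is a
# drop-in alternative to json.dump() that writes through a temporary file and
# renames it into place, so readers never observe a half-written file. The
# filename below is hypothetical:
#
#   write_json_file({'id': 'abc', 'title': 'Example'}, 'example.info.json')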


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)


# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
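
# Illustrative usage (not part of the original module), on a small hypothetical
# document; xpath_text() and xpath_attr() return `default` (or raise
# ExtractorError when fatal=True) instead of crashing on a miss:
#
#   doc = compat_etree_fromstring('<root><title lang="en">Hi</title></root>')
#   xpath_text(doc, './title')                  == 'Hi'
#   xpath_attr(doc, './title', 'lang')          == 'en'
#   xpath_text(doc, './missing', default=None)  is None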


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    if not value:
        return

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
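
# Illustrative usage (not part of the original module); the HTML snippets are
# hypothetical:
#
#   get_element_by_class('title', '<h1 class="title main">Hello</h1>')       == 'Hello'
#   get_element_html_by_class('title', '<h1 class="title main">Hello</h1>')  == '<h1 class="title main">Hello</h1>'
#   get_element_by_attribute('data-id', '42', '<p data-id="42">hi</p>')      == 'hi'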


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')


class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        raise compat_HTMLParseError('done')


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items
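
# Illustrative usage (not part of the original module); attribute names are
# lower-cased by html.parser and entities in attribute values are decoded:
#
#   extract_attributes('<a HREF="foo" data-count="3">')  == {'href': 'foo', 'data-count': '3'}
#   extract_attributes('<el a=x entity="&amp;">')        == {'a': 'x', 'entity': '&'}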


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
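
# Illustrative usage (not part of the original module): tags are stripped,
# <br> and paragraph boundaries become newlines, and entities are decoded:
#
#   clean_html('<p>foo<br/>bar &amp; baz</p>')  == 'foo\nbar & baz'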


class LenientJSONDecoder(json.JSONDecoder):
    # TODO: Write tests
    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    s = self._close_object(e)
                    if s is not None:
                        continue
                raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
        assert False, 'Too many attempts to decode JSON'
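
# Illustrative usage (not part of the original module); with ignore_extra=True
# trailing garbage after the first JSON value is ignored, and close_objects
# allows a bounded number of attempts at closing a truncated object/array:
#
#   LenientJSONDecoder(ignore_extra=True).decode('{"a": 1} trailing text')  == {'a': 1}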


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
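
# Illustrative usage (not part of the original module); exact substitutions
# depend on `restricted`/`is_id`, e.g. path separators are always replaced:
#
#   sanitize_filename('foo/bar', restricted=True)  == 'foo_bar'
#   sanitize_filename('foo/bar')                   == 'foo\u29f8bar'   # U+29F8 look-alike solidus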


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
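
# Illustrative usage (not part of the original module):
#
#   sanitize_url('//example.com/watch')      == 'http://example.com/watch'
#   sanitize_url('httpss://example.com/')    == 'https://example.com/'
#   sanitize_url('rmtp://example.com/live')  == 'rtmp://example.com/live'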


def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)
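
# Illustrative usage (not part of the original module): credentials embedded in
# the URL are moved into a Basic auth header ('user:pass' base64-encodes to
# 'dXNlcjpwYXNz'):
#
#   extract_basic_auth('http://user:pass@example.com/x')
#       == ('http://example.com/x', 'Basic dXNlcjpwYXNz')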


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())
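
# Illustrative usage (not part of the original module): the first occurrence of
# each item is kept, order is preserved, and items need not be hashable:
#
#   orderedSet([1, 2, 1, 3, 2])          == [1, 2, 3]
#   list(orderedSet('abca', lazy=True))  == ['a', 'b', 'c']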


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
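
# Illustrative usage (not part of the original module): named, decimal and
# hexadecimal entities are handled; unknown entities are left as-is:
#
#   unescapeHTML('a &amp; b')  == 'a & b'
#   unescapeHTML('&#x27;')     == "'"
#   unescapeHTML('&periods;')  == '&periods;'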


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
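
# Illustrative usage (not part of the original module): Popen.run() wraps
# subprocess execution and returns (stdout, stderr, returncode), killing the
# child on interruption. The command below is hypothetical:
#
#   stdout, stderr, returncode = Popen.run(
#       ['ffprobe', '-version'], text=True,
#       stdout=subprocess.PIPE, stderr=subprocess.PIPE)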


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
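
# Illustrative usage (not part of the original module):
#
#   timetuple_from_msec(90500)     == Time(hours=0, minutes=1, seconds=30, milliseconds=500)
#   formatSeconds(3661)            == '1:01:01'
#   formatSeconds(5.5, msec=True)  == '5.500'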


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
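
# Illustrative usage (not part of the original module): the video ID and the
# extractor name are folded into the message, and expected=True suppresses the
# bug-report hint (values below are hypothetical):
#
#   raise ExtractorError('This video is private', video_id='abc123', expected=True)
#   # -> "abc123: This video is private"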


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
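
# Illustrative usage (not part of the original module): the internal
# 'Youtubedl-no-compression' marker is consumed, and Accept-Encoding is dropped
# along with it:
#
#   handle_youtubedl_headers({'Youtubedl-no-compression': '1', 'Accept-Encoding': 'gzip'})  == {}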


class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
ac668111 1534class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
be4a824d 1535 def __init__(self, params, https_conn_class=None, *args, **kwargs):
ac668111 1536 urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
1537 self._https_conn_class = https_conn_class or http.client.HTTPSConnection
be4a824d
PH
1538 self._params = params
1539
1540 def https_open(self, req):
4f264c02 1541 kwargs = {}
71aff188
YCH
1542 conn_class = self._https_conn_class
1543
4f264c02
JMF
1544 if hasattr(self, '_context'): # python > 2.6
1545 kwargs['context'] = self._context
1546 if hasattr(self, '_check_hostname'): # python 3.x
1547 kwargs['check_hostname'] = self._check_hostname
71aff188
YCH
1548
1549 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1550 if socks_proxy:
1551 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1552 del req.headers['Ytdl-socks-proxy']
1553
4f28b537 1554 try:
1555 return self.do_open(
1556 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1557 except urllib.error.URLError as e:
1558 if (isinstance(e.reason, ssl.SSLError)
1559 and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1560 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1561 raise
be4a824d
PH
1562
1563
941e881e 1564def is_path_like(f):
1565 return isinstance(f, (str, bytes, os.PathLike))
1566
1567
ac668111 1568class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
f1a8511f
S
1569 """
1570 See [1] for cookie file format.
1571
1572 1. https://curl.haxx.se/docs/http-cookies.html
1573 """
e7e62441 1574 _HTTPONLY_PREFIX = '#HttpOnly_'
c380cc28
S
1575 _ENTRY_LEN = 7
1576 _HEADER = '''# Netscape HTTP Cookie File
7a5c1cfe 1577# This file is generated by yt-dlp. Do not edit.
c380cc28
S
1578
1579'''
1580 _CookieFileEntry = collections.namedtuple(
1581 'CookieFileEntry',
1582 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
e7e62441 1583
d76fa1f3 1584 def __init__(self, filename=None, *args, **kwargs):
1585 super().__init__(None, *args, **kwargs)
941e881e 1586 if is_path_like(filename):
d76fa1f3 1587 filename = os.fspath(filename)
1588 self.filename = filename
1589
24146491 1590 @staticmethod
1591 def _true_or_false(cndn):
1592 return 'TRUE' if cndn else 'FALSE'
1593
d76fa1f3 1594 @contextlib.contextmanager
1595 def open(self, file, *, write=False):
941e881e 1596 if is_path_like(file):
d76fa1f3 1597 with open(file, 'w' if write else 'r', encoding='utf-8') as f:
1598 yield f
1599 else:
1600 if write:
1601 file.truncate(0)
1602 yield file
1603
24146491 1604 def _really_save(self, f, ignore_discard=False, ignore_expires=False):
1605 now = time.time()
1606 for cookie in self:
1607 if (not ignore_discard and cookie.discard
1608 or not ignore_expires and cookie.is_expired(now)):
1609 continue
1610 name, value = cookie.name, cookie.value
1611 if value is None:
1612 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1613 # with no name, whereas http.cookiejar regards it as a
1614 # cookie with no value.
1615 name, value = '', name
1616 f.write('%s\n' % '\t'.join((
1617 cookie.domain,
1618 self._true_or_false(cookie.domain.startswith('.')),
1619 cookie.path,
1620 self._true_or_false(cookie.secure),
1621 str_or_none(cookie.expires, default=''),
1622 name, value
1623 )))
1624
1625 def save(self, filename=None, *args, **kwargs):
c380cc28
S
1626 """
1627 Save cookies to a file.
24146491 1628 Code is taken from CPython 3.6
1629 https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
c380cc28 1630
c380cc28
S
1631 if filename is None:
1632 if self.filename is not None:
1633 filename = self.filename
1634 else:
ac668111 1635 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
c380cc28 1636
24146491 1637 # Store session cookies with `expires` set to 0 instead of an empty string
1bab3437
S
1638 for cookie in self:
1639 if cookie.expires is None:
1640 cookie.expires = 0
c380cc28 1641
d76fa1f3 1642 with self.open(filename, write=True) as f:
c380cc28 1643 f.write(self._HEADER)
24146491 1644 self._really_save(f, *args, **kwargs)
1bab3437
S
1645
1646 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
e7e62441 1647 """Load cookies from a file."""
1648 if filename is None:
1649 if self.filename is not None:
1650 filename = self.filename
1651 else:
ac668111 1652 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
e7e62441 1653
c380cc28
S
1654 def prepare_line(line):
1655 if line.startswith(self._HTTPONLY_PREFIX):
1656 line = line[len(self._HTTPONLY_PREFIX):]
1657 # comments and empty lines are fine
1658 if line.startswith('#') or not line.strip():
1659 return line
1660 cookie_list = line.split('\t')
1661 if len(cookie_list) != self._ENTRY_LEN:
ac668111 1662 raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
c380cc28
S
1663 cookie = self._CookieFileEntry(*cookie_list)
1664 if cookie.expires_at and not cookie.expires_at.isdigit():
ac668111 1665 raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
c380cc28
S
1666 return line
1667
e7e62441 1668 cf = io.StringIO()
d76fa1f3 1669 with self.open(filename) as f:
e7e62441 1670 for line in f:
c380cc28
S
1671 try:
1672 cf.write(prepare_line(line))
ac668111 1673 except http.cookiejar.LoadError as e:
94aa0644 1674 if f'{line.strip()} '[0] in '[{"':
ac668111 1675 raise http.cookiejar.LoadError(
94aa0644 1676 'Cookies file must be Netscape formatted, not JSON. See '
17ffed18 1677 'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
19a03940 1678 write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
c380cc28 1679 continue
e7e62441 1680 cf.seek(0)
1681 self._really_load(cf, filename, ignore_discard, ignore_expires)
1bab3437
S
1682 # Session cookies are denoted by either `expires` field set to
1683 # an empty string or 0. MozillaCookieJar only recognizes the former
1684 # (see [1]). So we need to force the latter to be recognized as session
1685 # cookies on our own.
1686 # Session cookies may be important for cookies-based authentication,
1687 # e.g. usually, when user does not check 'Remember me' check box while
1688 # logging in on a site, some important cookies are stored as session
1689 # cookies so that not recognizing them will result in failed login.
1690 # 1. https://bugs.python.org/issue17164
1691 for cookie in self:
1692 # Treat `expires=0` cookies as session cookies
1693 if cookie.expires == 0:
1694 cookie.expires = None
1695 cookie.discard = True
1696
1697
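# Illustrative usage (a sketch only; 'cookies.txt' is a made-up path assumed to
# already exist in the Netscape format described in the docstring above):
#
#   jar = YoutubeDLCookieJar('cookies.txt')
#   jar.load(ignore_discard=True, ignore_expires=True)   # session cookies are kept, per the note above
#   jar.save(ignore_discard=True, ignore_expires=True)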
ac668111 1698class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
a6420bf5 1699 def __init__(self, cookiejar=None):
ac668111 1700 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
a6420bf5
S
1701
1702 def http_response(self, request, response):
ac668111 1703 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
a6420bf5 1704
ac668111 1705 https_request = urllib.request.HTTPCookieProcessor.http_request
a6420bf5
S
1706 https_response = http_response
1707
1708
ac668111 1709class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
201c1459 1710 """YoutubeDL redirect handler
1711
1712 The code is based on HTTPRedirectHandler implementation from CPython [1].
1713
1714 This redirect handler solves two issues:
1715 - ensures redirect URL is always unicode under python 2
1716 - introduces support for experimental HTTP response status code
1717 308 Permanent Redirect [2] used by some sites [3]
1718
1719 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1720 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1721 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1722 """
1723
ac668111 1724 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
201c1459 1725
1726 def redirect_request(self, req, fp, code, msg, headers, newurl):
1727 """Return a Request or None in response to a redirect.
1728
1729 This is called by the http_error_30x methods when a
1730 redirection response is received. If a redirection should
1731 take place, return a new Request to allow http_error_30x to
1732 perform the redirect. Otherwise, raise HTTPError if no-one
1733 else should try to handle this url. Return None if you can't
1734 but another Handler might.
1735 """
1736 m = req.get_method()
1737 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1738 or code in (301, 302, 303) and m == "POST")):
14f25df2 1739 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
201c1459 1740 # Strictly (according to RFC 2616), 301 or 302 in response to
1741 # a POST MUST NOT cause a redirection without confirmation
1742 # from the user (of urllib.request, in this case). In practice,
1743 # essentially all clients do redirect in this case, so we do
1744 # the same.
1745
201c1459 1746 # Be conciliant with URIs containing a space. This is mainly
1747 # redundant with the more complete encoding done in http_error_302(),
1748 # but it is kept for compatibility with other callers.
1749 newurl = newurl.replace(' ', '%20')
1750
1751 CONTENT_HEADERS = ("content-length", "content-type")
1752 # Keep only the headers that are safe to carry over to the redirected request
86e5f3ed 1753 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
afac4caa 1754
1755 # A 303 must either use GET or HEAD for subsequent request
1756 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1757 if code == 303 and m != 'HEAD':
1758 m = 'GET'
1759 # 301 and 302 redirects are commonly turned into a GET from a POST
1760 # for subsequent requests by browsers, so we'll do the same.
1761 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1762 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1763 if code in (301, 302) and m == 'POST':
1764 m = 'GET'
1765
ac668111 1766 return urllib.request.Request(
201c1459 1767 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
afac4caa 1768 unverifiable=True, method=m)
fca6dba8
S
1769
1770
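# Illustrative sketch of how handlers like the above are used: the YoutubeDL core
# composes them into a urllib opener. The snippet below is only a minimal example
# built on the standard urllib.request API, not yt-dlp's actual wiring.
#
#   opener = urllib.request.build_opener(
#       YoutubeDLCookieProcessor(YoutubeDLCookieJar()), YoutubeDLRedirectHandler())
#   opener.open('https://example.com')   # 301/302/303/307/308 handled as documented above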
46f59e89
S
1771def extract_timezone(date_str):
1772 m = re.search(
f137e4c2 1773 r'''(?x)
1774 ^.{8,}? # >=8 char non-TZ prefix, if present
1775 (?P<tz>Z| # just the UTC Z, or
1776 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1777 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1778 [ ]? # optional space
1779 (?P<sign>\+|-) # +/-
1780 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1781 $)
1782 ''', date_str)
46f59e89 1783 if not m:
8f53dc44 1784 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1785 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1786 if timezone is not None:
1787 date_str = date_str[:-len(m.group('tz'))]
1788 timezone = datetime.timedelta(hours=timezone or 0)
46f59e89
S
1789 else:
1790 date_str = date_str[:-len(m.group('tz'))]
1791 if not m.group('sign'):
1792 timezone = datetime.timedelta()
1793 else:
1794 sign = 1 if m.group('sign') == '+' else -1
1795 timezone = datetime.timedelta(
1796 hours=sign * int(m.group('hours')),
1797 minutes=sign * int(m.group('minutes')))
1798 return timezone, date_str
1799
1800
08b38d54 1801def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1802 """ Return a UNIX timestamp from the given date """
1803
1804 if date_str is None:
1805 return None
1806
52c3a6e4
S
1807 date_str = re.sub(r'\.[0-9]+', '', date_str)
1808
08b38d54 1809 if timezone is None:
46f59e89
S
1810 timezone, date_str = extract_timezone(date_str)
1811
19a03940 1812 with contextlib.suppress(ValueError):
86e5f3ed 1813 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
52c3a6e4
S
1814 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1815 return calendar.timegm(dt.timetuple())
912b38b4
PH
1816
1817
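# Usage sketch (illustration only; values follow from the code above):
#
#   parse_iso8601('2014-03-23T23:04:26+0100')   # -> 1395612266
#   parse_iso8601('2014-03-23T22:04:26Z')       # -> 1395612266 (same instant)
#   parse_iso8601(None)                         # -> None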
46f59e89
S
1818def date_formats(day_first=True):
1819 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1820
1821
42bdd9d0 1822def unified_strdate(date_str, day_first=True):
bf50b038 1823 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1824
1825 if date_str is None:
1826 return None
bf50b038 1827 upload_date = None
5f6a1245 1828 # Replace commas
026fcc04 1829 date_str = date_str.replace(',', ' ')
42bdd9d0 1830 # Remove AM/PM + timezone
9bb8e0a3 1831 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1832 _, date_str = extract_timezone(date_str)
42bdd9d0 1833
46f59e89 1834 for expression in date_formats(day_first):
19a03940 1835 with contextlib.suppress(ValueError):
bf50b038 1836 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
42393ce2
PH
1837 if upload_date is None:
1838 timetuple = email.utils.parsedate_tz(date_str)
1839 if timetuple:
19a03940 1840 with contextlib.suppress(ValueError):
c6b9cf05 1841 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
6a750402 1842 if upload_date is not None:
14f25df2 1843 return str(upload_date)
bf50b038 1844
5f6a1245 1845
46f59e89
S
1846def unified_timestamp(date_str, day_first=True):
1847 if date_str is None:
1848 return None
1849
8f53dc44 1850 date_str = re.sub(r'\s+', ' ', re.sub(
1851 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
46f59e89 1852
7dc2a74e 1853 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1854 timezone, date_str = extract_timezone(date_str)
1855
1856 # Remove AM/PM + timezone
1857 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1858
deef3195
S
1859 # Remove unrecognized timezones from ISO 8601 alike timestamps
1860 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1861 if m:
1862 date_str = date_str[:-len(m.group('tz'))]
1863
f226880c
PH
1864 # Python only supports microseconds, so remove nanoseconds
1865 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1866 if m:
1867 date_str = m.group(1)
1868
46f59e89 1869 for expression in date_formats(day_first):
19a03940 1870 with contextlib.suppress(ValueError):
7dc2a74e 1871 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89 1872 return calendar.timegm(dt.timetuple())
8f53dc44 1873
46f59e89
S
1874 timetuple = email.utils.parsedate_tz(date_str)
1875 if timetuple:
8f53dc44 1876 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
46f59e89
S
1877
1878
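# Usage sketch (illustration only):
#
#   unified_strdate('December 21, 2010')            # -> '20101221'
#   unified_strdate('8/7/2009')                     # day-first by default -> '20090708'
#   unified_timestamp('2014-03-23T23:04:26+0100')   # -> 1395612266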
28e614de 1879def determine_ext(url, default_ext='unknown_video'):
85750f89 1880 if url is None or '.' not in url:
f4776371 1881 return default_ext
9cb9a5df 1882 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1883 if re.match(r'^[A-Za-z0-9]+$', guess):
1884 return guess
a7aaa398
S
1885 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1886 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1887 return guess.rstrip('/')
73e79f2a 1888 else:
cbdbb766 1889 return default_ext
73e79f2a 1890
5f6a1245 1891
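# Usage sketch (illustration only):
#
#   determine_ext('http://example.com/video.mp4')              # -> 'mp4'
#   determine_ext('http://example.com/foo/bar.mp4/?download')  # -> 'mp4'
#   determine_ext('http://example.com/download')               # -> 'unknown_video' (the default)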
824fa511
S
1892def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1893 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
d4051a8e 1894
5f6a1245 1895
9e62f283 1896def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
3d38b2d6 1897 R"""
1898 Return a datetime object from a string.
1899 Supported format:
1900 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1901
1902 @param format strftime format of DATE
1903 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1904 auto: round to the unit provided in date_str (if applicable).
9e62f283 1905 """
1906 auto_precision = False
1907 if precision == 'auto':
1908 auto_precision = True
1909 precision = 'microsecond'
396a76f7 1910 today = datetime_round(datetime.datetime.utcnow(), precision)
f8795e10 1911 if date_str in ('now', 'today'):
37254abc 1912 return today
f8795e10
PH
1913 if date_str == 'yesterday':
1914 return today - datetime.timedelta(days=1)
9e62f283 1915 match = re.match(
3d38b2d6 1916 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
9e62f283 1917 date_str)
37254abc 1918 if match is not None:
9e62f283 1919 start_time = datetime_from_str(match.group('start'), precision, format)
1920 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
37254abc 1921 unit = match.group('unit')
9e62f283 1922 if unit == 'month' or unit == 'year':
1923 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
37254abc 1924 unit = 'day'
9e62f283 1925 else:
1926 if unit == 'week':
1927 unit = 'day'
1928 time *= 7
1929 delta = datetime.timedelta(**{unit + 's': time})
1930 new_date = start_time + delta
1931 if auto_precision:
1932 return datetime_round(new_date, unit)
1933 return new_date
1934
1935 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1936
1937
d49f8db3 1938def date_from_str(date_str, format='%Y%m%d', strict=False):
3d38b2d6 1939 R"""
1940 Return a date object from a string using datetime_from_str
9e62f283 1941
3d38b2d6 1942 @param strict Restrict allowed patterns to "YYYYMMDD" and
1943 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
9e62f283 1944 """
3d38b2d6 1945 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1946 raise ValueError(f'Invalid date format "{date_str}"')
9e62f283 1947 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1948
1949
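# Usage sketch (illustration only; relative forms are resolved against UTC "now"):
#
#   date_from_str('now')            # today's date
#   date_from_str('today-1week')    # the same weekday last week
#   date_from_str('20200229')       # -> datetime.date(2020, 2, 29)
#   datetime_from_str('now-3hours')   # default precision='auto' rounds to the nearest hour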
1950def datetime_add_months(dt, months):
1951 """Increment/Decrement a datetime object by months."""
1952 month = dt.month + months - 1
1953 year = dt.year + month // 12
1954 month = month % 12 + 1
1955 day = min(dt.day, calendar.monthrange(year, month)[1])
1956 return dt.replace(year, month, day)
1957
1958
1959def datetime_round(dt, precision='day'):
1960 """
1961 Round a datetime object's time to a specific precision
1962 """
1963 if precision == 'microsecond':
1964 return dt
1965
1966 unit_seconds = {
1967 'day': 86400,
1968 'hour': 3600,
1969 'minute': 60,
1970 'second': 1,
1971 }
1972 roundto = lambda x, n: ((x + n / 2) // n) * n
1973 timestamp = calendar.timegm(dt.timetuple())
1974 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
5f6a1245
JW
1975
1976
e63fc1be 1977def hyphenate_date(date_str):
1978 """
1979 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1980 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1981 if match is not None:
1982 return '-'.join(match.groups())
1983 else:
1984 return date_str
1985
5f6a1245 1986
86e5f3ed 1987class DateRange:
bd558525 1988 """Represents a time interval between two dates"""
5f6a1245 1989
bd558525
JMF
1990 def __init__(self, start=None, end=None):
1991 """start and end must be strings in the format accepted by date"""
1992 if start is not None:
d49f8db3 1993 self.start = date_from_str(start, strict=True)
bd558525
JMF
1994 else:
1995 self.start = datetime.datetime.min.date()
1996 if end is not None:
d49f8db3 1997 self.end = date_from_str(end, strict=True)
bd558525
JMF
1998 else:
1999 self.end = datetime.datetime.max.date()
37254abc 2000 if self.start > self.end:
bd558525 2001 raise ValueError('Date range: "%s", the start date must be before the end date' % self)
5f6a1245 2002
bd558525
JMF
2003 @classmethod
2004 def day(cls, day):
2005 """Returns a range that only contains the given day"""
5f6a1245
JW
2006 return cls(day, day)
2007
bd558525
JMF
2008 def __contains__(self, date):
2009 """Check if the date is in the range"""
37254abc
JMF
2010 if not isinstance(date, datetime.date):
2011 date = date_from_str(date)
2012 return self.start <= date <= self.end
5f6a1245 2013
bd558525 2014 def __str__(self):
86e5f3ed 2015 return f'{self.start.isoformat()} - {self.end.isoformat()}'
c496ca96 2016
f2df4071 2017 def __eq__(self, other):
2018 return (isinstance(other, DateRange)
2019 and self.start == other.start and self.end == other.end)
2020
c496ca96
PH
2021
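# Usage sketch (illustration only; DateRange backs yt-dlp's date filtering options):
#
#   dr = DateRange('20220101', '20221231')
#   '20220615' in dr            # -> True (strings are parsed with date_from_str)
#   DateRange.day('20220101')   # a range containing only that day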
2022def platform_name():
14f25df2 2023 """ Returns the platform name as a str """
da4db748 2024 deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead')
b1f94422 2025 return platform.platform()
c496ca96 2026
b1f94422 2027
2028@functools.cache
2029def system_identifier():
2030 python_implementation = platform.python_implementation()
2031 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
2032 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
dab284f8 2033 libc_ver = []
2034 with contextlib.suppress(OSError): # We may not have access to the executable
2035 libc_ver = platform.libc_ver()
b1f94422 2036
17fc3dc4 2037 return 'Python %s (%s %s %s) - %s (%s%s)' % (
b1f94422 2038 platform.python_version(),
2039 python_implementation,
17fc3dc4 2040 platform.machine(),
b1f94422 2041 platform.architecture()[0],
2042 platform.platform(),
5b9f253f
M
2043 ssl.OPENSSL_VERSION,
2044 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
b1f94422 2045 )
c257baff
PH
2046
2047
0b9c08b4 2048@functools.cache
49fa4d9a 2049def get_windows_version():
8a82af35 2050 ''' Get the Windows version. Returns () if it's not running on Windows '''
49fa4d9a
N
2051 if compat_os_name == 'nt':
2052 return version_tuple(platform.win32_ver()[1])
2053 else:
8a82af35 2054 return ()
49fa4d9a
N
2055
2056
734f90bb 2057def write_string(s, out=None, encoding=None):
19a03940 2058 assert isinstance(s, str)
2059 out = out or sys.stderr
7459e3a2 2060
fe1daad3 2061 if compat_os_name == 'nt' and supports_terminal_sequences(out):
3fe75fdc 2062 s = re.sub(r'([\r\n]+)', r' \1', s)
59f943cd 2063
8a82af35 2064 enc, buffer = None, out
cfb0511d 2065 if 'b' in getattr(out, 'mode', ''):
c487cf00 2066 enc = encoding or preferredencoding()
104aa738 2067 elif hasattr(out, 'buffer'):
8a82af35 2068 buffer = out.buffer
104aa738 2069 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
c487cf00 2070
8a82af35 2071 buffer.write(s.encode(enc, 'ignore') if enc else s)
7459e3a2
PH
2072 out.flush()
2073
2074
da4db748 2075def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
2076 from . import _IN_CLI
2077 if _IN_CLI:
2078 if msg in deprecation_warning._cache:
2079 return
2080 deprecation_warning._cache.add(msg)
2081 if printer:
2082 return printer(f'{msg}{bug_reports_message()}', **kwargs)
2083 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
2084 else:
2085 import warnings
2086 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
2087
2088
2089deprecation_warning._cache = set()
2090
2091
48ea9cea
PH
2092def bytes_to_intlist(bs):
2093 if not bs:
2094 return []
2095 if isinstance(bs[0], int): # Python 3
2096 return list(bs)
2097 else:
2098 return [ord(c) for c in bs]
2099
c257baff 2100
cba892fa 2101def intlist_to_bytes(xs):
2102 if not xs:
2103 return b''
ac668111 2104 return struct.pack('%dB' % len(xs), *xs)
c38b1e77
PH
2105
2106
8a82af35 2107class LockingUnsupportedError(OSError):
1890fc63 2108 msg = 'File locking is not supported'
0edb3e33 2109
2110 def __init__(self):
2111 super().__init__(self.msg)
2112
2113
c1c9a79c
PH
2114# Cross-platform file locking
2115if sys.platform == 'win32':
fe0918bb 2116 import ctypes
c1c9a79c
PH
2117 import ctypes.wintypes
2118 import msvcrt
2119
2120 class OVERLAPPED(ctypes.Structure):
2121 _fields_ = [
2122 ('Internal', ctypes.wintypes.LPVOID),
2123 ('InternalHigh', ctypes.wintypes.LPVOID),
2124 ('Offset', ctypes.wintypes.DWORD),
2125 ('OffsetHigh', ctypes.wintypes.DWORD),
2126 ('hEvent', ctypes.wintypes.HANDLE),
2127 ]
2128
37e325b9 2129 kernel32 = ctypes.WinDLL('kernel32')
c1c9a79c
PH
2130 LockFileEx = kernel32.LockFileEx
2131 LockFileEx.argtypes = [
2132 ctypes.wintypes.HANDLE, # hFile
2133 ctypes.wintypes.DWORD, # dwFlags
2134 ctypes.wintypes.DWORD, # dwReserved
2135 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2136 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2137 ctypes.POINTER(OVERLAPPED) # Overlapped
2138 ]
2139 LockFileEx.restype = ctypes.wintypes.BOOL
2140 UnlockFileEx = kernel32.UnlockFileEx
2141 UnlockFileEx.argtypes = [
2142 ctypes.wintypes.HANDLE, # hFile
2143 ctypes.wintypes.DWORD, # dwReserved
2144 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2145 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2146 ctypes.POINTER(OVERLAPPED) # Overlapped
2147 ]
2148 UnlockFileEx.restype = ctypes.wintypes.BOOL
2149 whole_low = 0xffffffff
2150 whole_high = 0x7fffffff
2151
747c0bd1 2152 def _lock_file(f, exclusive, block):
c1c9a79c
PH
2153 overlapped = OVERLAPPED()
2154 overlapped.Offset = 0
2155 overlapped.OffsetHigh = 0
2156 overlapped.hEvent = 0
2157 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
747c0bd1 2158
2159 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2160 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2161 0, whole_low, whole_high, f._lock_file_overlapped_p):
2cb19820 2162 # NB: The no-argument form of "ctypes.FormatError" does not work on PyPy
2163 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
c1c9a79c
PH
2164
2165 def _unlock_file(f):
2166 assert f._lock_file_overlapped_p
2167 handle = msvcrt.get_osfhandle(f.fileno())
747c0bd1 2168 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
c1c9a79c
PH
2169 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2170
2171else:
399a76e6
YCH
2172 try:
2173 import fcntl
c1c9a79c 2174
a3125791 2175 def _lock_file(f, exclusive, block):
b63837bc 2176 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2177 if not block:
2178 flags |= fcntl.LOCK_NB
acea8d7c 2179 try:
b63837bc 2180 fcntl.flock(f, flags)
acea8d7c
JK
2181 except BlockingIOError:
2182 raise
2183 except OSError: # AOSP does not have flock()
b63837bc 2184 fcntl.lockf(f, flags)
c1c9a79c 2185
399a76e6 2186 def _unlock_file(f):
acea8d7c
JK
2187 try:
2188 fcntl.flock(f, fcntl.LOCK_UN)
2189 except OSError:
2190 fcntl.lockf(f, fcntl.LOCK_UN)
a3125791 2191
399a76e6 2192 except ImportError:
399a76e6 2193
a3125791 2194 def _lock_file(f, exclusive, block):
0edb3e33 2195 raise LockingUnsupportedError()
399a76e6
YCH
2196
2197 def _unlock_file(f):
0edb3e33 2198 raise LockingUnsupportedError()
c1c9a79c
PH
2199
2200
86e5f3ed 2201class locked_file:
0edb3e33 2202 locked = False
747c0bd1 2203
a3125791 2204 def __init__(self, filename, mode, block=True, encoding=None):
fcfa8853
JK
2205 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2206 raise NotImplementedError(mode)
2207 self.mode, self.block = mode, block
2208
2209 writable = any(f in mode for f in 'wax+')
2210 readable = any(f in mode for f in 'r+')
2211 flags = functools.reduce(operator.ior, (
2212 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2213 getattr(os, 'O_BINARY', 0), # Windows only
2214 getattr(os, 'O_NOINHERIT', 0), # Windows only
2215 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2216 os.O_APPEND if 'a' in mode else 0,
2217 os.O_EXCL if 'x' in mode else 0,
2218 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2219 ))
2220
98804d03 2221 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
c1c9a79c
PH
2222
2223 def __enter__(self):
a3125791 2224 exclusive = 'r' not in self.mode
c1c9a79c 2225 try:
a3125791 2226 _lock_file(self.f, exclusive, self.block)
0edb3e33 2227 self.locked = True
86e5f3ed 2228 except OSError:
c1c9a79c
PH
2229 self.f.close()
2230 raise
fcfa8853 2231 if 'w' in self.mode:
131e14dc
JK
2232 try:
2233 self.f.truncate()
2234 except OSError as e:
1890fc63 2235 if e.errno not in (
2236 errno.ESPIPE, # Illegal seek - expected for FIFO
2237 errno.EINVAL, # Invalid argument - expected for /dev/null
2238 ):
2239 raise
c1c9a79c
PH
2240 return self
2241
0edb3e33 2242 def unlock(self):
2243 if not self.locked:
2244 return
c1c9a79c 2245 try:
0edb3e33 2246 _unlock_file(self.f)
c1c9a79c 2247 finally:
0edb3e33 2248 self.locked = False
c1c9a79c 2249
0edb3e33 2250 def __exit__(self, *_):
2251 try:
2252 self.unlock()
2253 finally:
2254 self.f.close()
4eb7f1d1 2255
0edb3e33 2256 open = __enter__
2257 close = __exit__
a3125791 2258
0edb3e33 2259 def __getattr__(self, attr):
2260 return getattr(self.f, attr)
a3125791 2261
0edb3e33 2262 def __iter__(self):
2263 return iter(self.f)
a3125791 2264
4eb7f1d1 2265
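# Usage sketch (illustration only; 'some.lock' is a made-up filename):
#
#   with locked_file('some.lock', 'a', block=True) as f:
#       f.write('line\n')        # written while holding an exclusive lock
#
# Read-only modes ('r'/'rb') take a shared lock; all other modes take an exclusive one.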
0b9c08b4 2266@functools.cache
4644ac55
S
2267def get_filesystem_encoding():
2268 encoding = sys.getfilesystemencoding()
2269 return encoding if encoding is not None else 'utf-8'
2270
2271
4eb7f1d1 2272def shell_quote(args):
a6a173c2 2273 quoted_args = []
4644ac55 2274 encoding = get_filesystem_encoding()
a6a173c2
JMF
2275 for a in args:
2276 if isinstance(a, bytes):
2277 # We may get a filename encoded with 'encodeFilename'
2278 a = a.decode(encoding)
aefce8e6 2279 quoted_args.append(compat_shlex_quote(a))
28e614de 2280 return ' '.join(quoted_args)
9d4660ca
PH
2281
2282
2283def smuggle_url(url, data):
2284 """ Pass additional data in a URL for internal use. """
2285
81953d1a
RA
2286 url, idata = unsmuggle_url(url, {})
2287 data.update(idata)
14f25df2 2288 sdata = urllib.parse.urlencode(
28e614de
PH
2289 {'__youtubedl_smuggle': json.dumps(data)})
2290 return url + '#' + sdata
9d4660ca
PH
2291
2292
79f82953 2293def unsmuggle_url(smug_url, default=None):
83e865a3 2294 if '#__youtubedl_smuggle' not in smug_url:
79f82953 2295 return smug_url, default
28e614de 2296 url, _, sdata = smug_url.rpartition('#')
14f25df2 2297 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
2298 data = json.loads(jsond)
2299 return url, data
02dbf93f
PH
2300
2301
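# Usage sketch (illustration only):
#
#   url = smuggle_url('https://example.com/watch?v=xyz', {'referer': 'https://example.com'})
#   url, data = unsmuggle_url(url)   # -> original URL, {'referer': 'https://example.com'}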
e0fd9573 2302def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2303 """ Formats numbers with decimal sufixes like K, M, etc """
2304 num, factor = float_or_none(num), float(factor)
4c3f8c3f 2305 if num is None or num < 0:
e0fd9573 2306 return None
eeb2a770 2307 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2308 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2309 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
abbeeebc 2310 if factor == 1024:
2311 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 2312 converted = num / (factor ** exponent)
abbeeebc 2313 return fmt % (converted, suffix)
e0fd9573 2314
2315
02dbf93f 2316def format_bytes(bytes):
f02d24d8 2317 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
f53c966a 2318
1c088fa8 2319
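# Usage sketch (illustration only):
#
#   format_decimal_suffix(123456, '%.1f%s')   # -> '123.5k'
#   format_bytes(1024)                        # -> '1.00KiB'
#   format_bytes(None)                        # -> 'N/A'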
64c464a1 2320def lookup_unit_table(unit_table, s, strict=False):
2321 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
fb47597b 2322 units_re = '|'.join(re.escape(u) for u in unit_table)
64c464a1 2323 m = (re.fullmatch if strict else re.match)(
2324 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
fb47597b
S
2325 if not m:
2326 return None
64c464a1 2327
2328 num = float(m.group('num').replace(',', '.'))
fb47597b 2329 mult = unit_table[m.group('unit')]
64c464a1 2330 return round(num * mult)
2331
2332
2333def parse_bytes(s):
2334 """Parse a string indicating a byte quantity into an integer"""
2335 return lookup_unit_table(
2336 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
2337 s.upper(), strict=True)
fb47597b
S
2338
2339
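# Usage sketch (illustration only; parse_bytes handles the shorthand accepted by
# size/rate options such as --limit-rate, with suffixes treated as powers of 1024):
#
#   parse_bytes('500K')   # -> 512000
#   parse_bytes('1M')     # -> 1048576
#   parse_bytes('junk')   # -> None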
be64b5b0
PH
2340def parse_filesize(s):
2341 if s is None:
2342 return None
2343
dfb1b146 2344 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
2345 # but we support those too
2346 _UNIT_TABLE = {
2347 'B': 1,
2348 'b': 1,
70852b47 2349 'bytes': 1,
be64b5b0
PH
2350 'KiB': 1024,
2351 'KB': 1000,
2352 'kB': 1024,
2353 'Kb': 1000,
13585d76 2354 'kb': 1000,
70852b47
YCH
2355 'kilobytes': 1000,
2356 'kibibytes': 1024,
be64b5b0
PH
2357 'MiB': 1024 ** 2,
2358 'MB': 1000 ** 2,
2359 'mB': 1024 ** 2,
2360 'Mb': 1000 ** 2,
13585d76 2361 'mb': 1000 ** 2,
70852b47
YCH
2362 'megabytes': 1000 ** 2,
2363 'mebibytes': 1024 ** 2,
be64b5b0
PH
2364 'GiB': 1024 ** 3,
2365 'GB': 1000 ** 3,
2366 'gB': 1024 ** 3,
2367 'Gb': 1000 ** 3,
13585d76 2368 'gb': 1000 ** 3,
70852b47
YCH
2369 'gigabytes': 1000 ** 3,
2370 'gibibytes': 1024 ** 3,
be64b5b0
PH
2371 'TiB': 1024 ** 4,
2372 'TB': 1000 ** 4,
2373 'tB': 1024 ** 4,
2374 'Tb': 1000 ** 4,
13585d76 2375 'tb': 1000 ** 4,
70852b47
YCH
2376 'terabytes': 1000 ** 4,
2377 'tebibytes': 1024 ** 4,
be64b5b0
PH
2378 'PiB': 1024 ** 5,
2379 'PB': 1000 ** 5,
2380 'pB': 1024 ** 5,
2381 'Pb': 1000 ** 5,
13585d76 2382 'pb': 1000 ** 5,
70852b47
YCH
2383 'petabytes': 1000 ** 5,
2384 'pebibytes': 1024 ** 5,
be64b5b0
PH
2385 'EiB': 1024 ** 6,
2386 'EB': 1000 ** 6,
2387 'eB': 1024 ** 6,
2388 'Eb': 1000 ** 6,
13585d76 2389 'eb': 1000 ** 6,
70852b47
YCH
2390 'exabytes': 1000 ** 6,
2391 'exbibytes': 1024 ** 6,
be64b5b0
PH
2392 'ZiB': 1024 ** 7,
2393 'ZB': 1000 ** 7,
2394 'zB': 1024 ** 7,
2395 'Zb': 1000 ** 7,
13585d76 2396 'zb': 1000 ** 7,
70852b47
YCH
2397 'zettabytes': 1000 ** 7,
2398 'zebibytes': 1024 ** 7,
be64b5b0
PH
2399 'YiB': 1024 ** 8,
2400 'YB': 1000 ** 8,
2401 'yB': 1024 ** 8,
2402 'Yb': 1000 ** 8,
13585d76 2403 'yb': 1000 ** 8,
70852b47
YCH
2404 'yottabytes': 1000 ** 8,
2405 'yobibytes': 1024 ** 8,
be64b5b0
PH
2406 }
2407
fb47597b
S
2408 return lookup_unit_table(_UNIT_TABLE, s)
2409
2410
2411def parse_count(s):
2412 if s is None:
be64b5b0
PH
2413 return None
2414
352d5da8 2415 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
2416
2417 if re.match(r'^[\d,.]+$', s):
2418 return str_to_int(s)
2419
2420 _UNIT_TABLE = {
2421 'k': 1000,
2422 'K': 1000,
2423 'm': 1000 ** 2,
2424 'M': 1000 ** 2,
2425 'kk': 1000 ** 2,
2426 'KK': 1000 ** 2,
352d5da8 2427 'b': 1000 ** 3,
2428 'B': 1000 ** 3,
fb47597b 2429 }
be64b5b0 2430
352d5da8 2431 ret = lookup_unit_table(_UNIT_TABLE, s)
2432 if ret is not None:
2433 return ret
2434
2435 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2436 if mobj:
2437 return str_to_int(mobj.group(1))
be64b5b0 2438
2f7ae819 2439
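# Usage sketch (illustration only):
#
#   parse_filesize('5 GiB')    # -> 5368709120
#   parse_filesize('1.2Tb')    # -> 1200000000000
#   parse_count('1.23M')       # -> 1230000
#   parse_count('123 views')   # -> 123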
5d45484c 2440def parse_resolution(s, *, lenient=False):
b871d7e9
S
2441 if s is None:
2442 return {}
2443
5d45484c
LNO
2444 if lenient:
2445 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2446 else:
2447 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
2448 if mobj:
2449 return {
2450 'width': int(mobj.group('w')),
2451 'height': int(mobj.group('h')),
2452 }
2453
17ec8bcf 2454 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
2455 if mobj:
2456 return {'height': int(mobj.group(1))}
2457
2458 mobj = re.search(r'\b([48])[kK]\b', s)
2459 if mobj:
2460 return {'height': int(mobj.group(1)) * 540}
2461
2462 return {}
2463
2464
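# Usage sketch (illustration only):
#
#   parse_resolution('1920x1080')   # -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')        # -> {'height': 720}
#   parse_resolution('4k')          # -> {'height': 2160}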
0dc41787 2465def parse_bitrate(s):
14f25df2 2466 if not isinstance(s, str):
0dc41787
S
2467 return
2468 mobj = re.search(r'\b(\d+)\s*kbps', s)
2469 if mobj:
2470 return int(mobj.group(1))
2471
2472
a942d6cb 2473def month_by_name(name, lang='en'):
caefb1de
PH
2474 """ Return the number of a month by (locale-independently) English name """
2475
f6717dec 2476 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 2477
caefb1de 2478 try:
f6717dec 2479 return month_names.index(name) + 1
7105440c
YCH
2480 except ValueError:
2481 return None
2482
2483
2484def month_by_abbreviation(abbrev):
2485 """ Return the number of a month by (locale-independently) English
2486 abbreviations """
2487
2488 try:
2489 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
2490 except ValueError:
2491 return None
18258362
JMF
2492
2493
5aafe895 2494def fix_xml_ampersands(xml_str):
18258362 2495 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
2496 return re.sub(
2497 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 2498 '&amp;',
5aafe895 2499 xml_str)
e3946f98
PH
2500
2501
2502def setproctitle(title):
14f25df2 2503 assert isinstance(title, str)
c1c05c67 2504
fe0918bb 2505 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2506 try:
2507 import ctypes
2508 except ImportError:
c1c05c67
YCH
2509 return
2510
e3946f98 2511 try:
611c1dd9 2512 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
2513 except OSError:
2514 return
2f49bcd6
RC
2515 except TypeError:
2516 # LoadLibrary in Windows Python 2.7.13 only expects
2517 # a bytestring, but since unicode_literals turns
2518 # every string into a unicode string, it fails.
2519 return
0f06bcd7 2520 title_bytes = title.encode()
6eefe533
PH
2521 buf = ctypes.create_string_buffer(len(title_bytes))
2522 buf.value = title_bytes
e3946f98 2523 try:
6eefe533 2524 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
2525 except AttributeError:
2526 return # Strange libc, just skip this
d7dda168
PH
2527
2528
2529def remove_start(s, start):
46bc9b7d 2530 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
2531
2532
2b9faf55 2533def remove_end(s, end):
46bc9b7d 2534 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
2535
2536
31b2051e
S
2537def remove_quotes(s):
2538 if s is None or len(s) < 2:
2539 return s
2540 for quote in ('"', "'", ):
2541 if s[0] == quote and s[-1] == quote:
2542 return s[1:-1]
2543 return s
2544
2545
b6e0c7d2 2546def get_domain(url):
ebf99aaf 2547 """
2548 This implementation is inconsistent, but is kept for compatibility.
2549 Use this only for "webpage_url_domain"
2550 """
2551 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
b6e0c7d2
U
2552
2553
29eb5174 2554def url_basename(url):
14f25df2 2555 path = urllib.parse.urlparse(url).path
28e614de 2556 return path.strip('/').split('/')[-1]
aa94a6d3
PH
2557
2558
02dc0a36 2559def base_url(url):
7657ec7e 2560 return re.match(r'https?://[^?#]+/', url).group()
02dc0a36
S
2561
2562
e34c3361 2563def urljoin(base, path):
4b5de77b 2564 if isinstance(path, bytes):
0f06bcd7 2565 path = path.decode()
14f25df2 2566 if not isinstance(path, str) or not path:
e34c3361 2567 return None
fad4ceb5 2568 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 2569 return path
4b5de77b 2570 if isinstance(base, bytes):
0f06bcd7 2571 base = base.decode()
14f25df2 2572 if not isinstance(base, str) or not re.match(
4b5de77b 2573 r'^(?:https?:)?//', base):
e34c3361 2574 return None
14f25df2 2575 return urllib.parse.urljoin(base, path)
e34c3361
S
2576
2577
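# Usage sketch (illustration only):
#
#   url_basename('https://example.com/a/b/file.mp4?x=1')   # -> 'file.mp4'
#   base_url('https://example.com/a/b/file.mp4')            # -> 'https://example.com/a/b/'
#   urljoin('https://example.com/a/', 'b/c.m3u8')           # -> 'https://example.com/a/b/c.m3u8'
#   urljoin('https://example.com/a/', None)                 # -> None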
ac668111 2578class HEADRequest(urllib.request.Request):
aa94a6d3 2579 def get_method(self):
611c1dd9 2580 return 'HEAD'
7217e148
PH
2581
2582
ac668111 2583class PUTRequest(urllib.request.Request):
95cf60e8
S
2584 def get_method(self):
2585 return 'PUT'
2586
2587
9732d77e 2588def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
9e907ebd 2589 if get_attr and v is not None:
2590 v = getattr(v, get_attr, None)
1812afb7
S
2591 try:
2592 return int(v) * invscale // scale
31c49255 2593 except (ValueError, TypeError, OverflowError):
af98f8ff 2594 return default
9732d77e 2595
9572013d 2596
40a90862 2597def str_or_none(v, default=None):
14f25df2 2598 return default if v is None else str(v)
40a90862 2599
9732d77e
PH
2600
2601def str_to_int(int_str):
48d4681e 2602 """ A more relaxed version of int_or_none """
f9934b96 2603 if isinstance(int_str, int):
348c6bf1 2604 return int_str
14f25df2 2605 elif isinstance(int_str, str):
42db58ec
S
2606 int_str = re.sub(r'[,\.\+]', '', int_str)
2607 return int_or_none(int_str)
608d11f5
PH
2608
2609
9732d77e 2610def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
2611 if v is None:
2612 return default
2613 try:
2614 return float(v) * invscale / scale
5e1271c5 2615 except (ValueError, TypeError):
caf80631 2616 return default
43f775e4
PH
2617
2618
c7e327c4
S
2619def bool_or_none(v, default=None):
2620 return v if isinstance(v, bool) else default
2621
2622
53cd37ba 2623def strip_or_none(v, default=None):
14f25df2 2624 return v.strip() if isinstance(v, str) else default
b72b4431
S
2625
2626
af03000a 2627def url_or_none(url):
14f25df2 2628 if not url or not isinstance(url, str):
af03000a
S
2629 return None
2630 url = url.strip()
29f7c58a 2631 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2632
2633
3e9b66d7 2634def request_to_url(req):
ac668111 2635 if isinstance(req, urllib.request.Request):
3e9b66d7
LNO
2636 return req.get_full_url()
2637 else:
2638 return req
2639
2640
e29663c6 2641def strftime_or_none(timestamp, date_format, default=None):
2642 datetime_object = None
2643 try:
f9934b96 2644 if isinstance(timestamp, (int, float)): # unix timestamp
d509c1f5 2645 # Using naive datetime here can break timestamp() in Windows
2646 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2647 datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
14f25df2 2648 elif isinstance(timestamp, str): # assume YYYYMMDD
e29663c6 2649 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
9665f15a 2650 date_format = re.sub( # Support %s on windows
2651 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
e29663c6 2652 return datetime_object.strftime(date_format)
2653 except (ValueError, TypeError, AttributeError):
2654 return default
2655
2656
608d11f5 2657def parse_duration(s):
f9934b96 2658 if not isinstance(s, str):
608d11f5 2659 return None
ca7b3246 2660 s = s.strip()
38d79fd1 2661 if not s:
2662 return None
ca7b3246 2663
acaff495 2664 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2665 m = re.match(r'''(?x)
2666 (?P<before_secs>
2667 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2668 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2669 (?P<ms>[.:][0-9]+)?Z?$
2670 ''', s)
acaff495 2671 if m:
8bd1c00b 2672 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2673 else:
2674 m = re.match(
056653bb
S
2675 r'''(?ix)(?:P?
2676 (?:
1c1b2f96 2677 [0-9]+\s*y(?:ears?)?,?\s*
056653bb
S
2678 )?
2679 (?:
1c1b2f96 2680 [0-9]+\s*m(?:onths?)?,?\s*
056653bb
S
2681 )?
2682 (?:
1c1b2f96 2683 [0-9]+\s*w(?:eeks?)?,?\s*
056653bb 2684 )?
8f4b58d7 2685 (?:
1c1b2f96 2686 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
8f4b58d7 2687 )?
056653bb 2688 T)?
acaff495 2689 (?:
1c1b2f96 2690 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
acaff495 2691 )?
2692 (?:
1c1b2f96 2693 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
acaff495 2694 )?
2695 (?:
2696 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2697 )?Z?$''', s)
acaff495 2698 if m:
2699 days, hours, mins, secs, ms = m.groups()
2700 else:
15846398 2701 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2702 if m:
2703 hours, mins = m.groups()
2704 else:
2705 return None
2706
acaff495 2707 if ms:
19a03940 2708 ms = ms.replace(':', '.')
2709 return sum(float(part or 0) * mult for part, mult in (
2710 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2711
2712
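# Usage sketch (illustration only; results are floats in seconds):
#
#   parse_duration('1:23:45')    # -> 5025.0
#   parse_duration('9:05')       # -> 545.0
#   parse_duration('PT1H2M3S')   # -> 3723.0
#   parse_duration('3 min')      # -> 180.0
#   parse_duration('bogus')      # -> None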
e65e4c88 2713def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2714 name, real_ext = os.path.splitext(filename)
e65e4c88 2715 return (
86e5f3ed 2716 f'{name}.{ext}{real_ext}'
e65e4c88 2717 if not expected_real_ext or real_ext[1:] == expected_real_ext
86e5f3ed 2718 else f'{filename}.{ext}')
d70ad093
PH
2719
2720
b3ed15b7
S
2721def replace_extension(filename, ext, expected_real_ext=None):
2722 name, real_ext = os.path.splitext(filename)
86e5f3ed 2723 return '{}.{}'.format(
b3ed15b7
S
2724 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2725 ext)
2726
2727
d70ad093
PH
2728def check_executable(exe, args=[]):
2729 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2730 args can be a list of arguments for a short output (like -version) """
2731 try:
f0c9fb96 2732 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
d70ad093
PH
2733 except OSError:
2734 return False
2735 return exe
b7ab0590
PH
2736
2737
7aaf4cd2 2738def _get_exe_version_output(exe, args):
95807118 2739 try:
b64d04c1 2740 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2741 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2742 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
1cdda329 2743 stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
2744 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2745 if ret:
2746 return None
95807118
PH
2747 except OSError:
2748 return False
f0c9fb96 2749 return stdout
cae97f65
PH
2750
2751
2752def detect_exe_version(output, version_re=None, unrecognized='present'):
14f25df2 2753 assert isinstance(output, str)
cae97f65
PH
2754 if version_re is None:
2755 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2756 m = re.search(version_re, output)
95807118
PH
2757 if m:
2758 return m.group(1)
2759 else:
2760 return unrecognized
2761
2762
9af98e17 2763def get_exe_version(exe, args=['--version'],
1cdda329 2764 version_re=None, unrecognized=('present', 'broken')):
9af98e17 2765 """ Returns the version of the specified executable,
2766 or False if the executable is not present """
1cdda329 2767 unrecognized = variadic(unrecognized)
2768 assert len(unrecognized) in (1, 2)
9af98e17 2769 out = _get_exe_version_output(exe, args)
1cdda329 2770 if out is None:
2771 return unrecognized[-1]
2772 return out and detect_exe_version(out, version_re, unrecognized[0])
9af98e17 2773
2774
7e88d7d7 2775def frange(start=0, stop=None, step=1):
2776 """Float range"""
2777 if stop is None:
2778 start, stop = 0, start
2779 sign = [-1, 1][step > 0] if step else 0
2780 while sign * start < sign * stop:
2781 yield start
2782 start += step
2783
2784
cb89cfc1 2785class LazyList(collections.abc.Sequence):
0f06bcd7 2786 """Lazy immutable list from an iterable
2787 Note that slices of a LazyList are lists and not LazyList"""
483336e7 2788
8e5fecc8 2789 class IndexError(IndexError):
2790 pass
2791
282f5709 2792 def __init__(self, iterable, *, reverse=False, _cache=None):
0f06bcd7 2793 self._iterable = iter(iterable)
2794 self._cache = [] if _cache is None else _cache
2795 self._reversed = reverse
483336e7 2796
2797 def __iter__(self):
0f06bcd7 2798 if self._reversed:
28419ca2 2799 # We need to consume the entire iterable to iterate in reverse
981052c9 2800 yield from self.exhaust()
28419ca2 2801 return
0f06bcd7 2802 yield from self._cache
2803 for item in self._iterable:
2804 self._cache.append(item)
483336e7 2805 yield item
2806
0f06bcd7 2807 def _exhaust(self):
2808 self._cache.extend(self._iterable)
2809 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2810 return self._cache
28419ca2 2811
981052c9 2812 def exhaust(self):
0f06bcd7 2813 """Evaluate the entire iterable"""
2814 return self._exhaust()[::-1 if self._reversed else 1]
981052c9 2815
28419ca2 2816 @staticmethod
0f06bcd7 2817 def _reverse_index(x):
f2df4071 2818 return None if x is None else ~x
483336e7 2819
2820 def __getitem__(self, idx):
2821 if isinstance(idx, slice):
0f06bcd7 2822 if self._reversed:
2823 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
e0f2b4b4 2824 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2825 elif isinstance(idx, int):
0f06bcd7 2826 if self._reversed:
2827 idx = self._reverse_index(idx)
e0f2b4b4 2828 start, stop, step = idx, idx, 0
483336e7 2829 else:
2830 raise TypeError('indices must be integers or slices')
e0f2b4b4 2831 if ((start or 0) < 0 or (stop or 0) < 0
2832 or (start is None and step < 0)
2833 or (stop is None and step > 0)):
483336e7 2834 # We need to consume the entire iterable to be able to slice from the end
2835 # Obviously, never use this with infinite iterables
0f06bcd7 2836 self._exhaust()
8e5fecc8 2837 try:
0f06bcd7 2838 return self._cache[idx]
8e5fecc8 2839 except IndexError as e:
2840 raise self.IndexError(e) from e
0f06bcd7 2841 n = max(start or 0, stop or 0) - len(self._cache) + 1
28419ca2 2842 if n > 0:
0f06bcd7 2843 self._cache.extend(itertools.islice(self._iterable, n))
8e5fecc8 2844 try:
0f06bcd7 2845 return self._cache[idx]
8e5fecc8 2846 except IndexError as e:
2847 raise self.IndexError(e) from e
483336e7 2848
2849 def __bool__(self):
2850 try:
0f06bcd7 2851 self[-1] if self._reversed else self[0]
8e5fecc8 2852 except self.IndexError:
483336e7 2853 return False
2854 return True
2855
2856 def __len__(self):
0f06bcd7 2857 self._exhaust()
2858 return len(self._cache)
483336e7 2859
282f5709 2860 def __reversed__(self):
0f06bcd7 2861 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
282f5709 2862
2863 def __copy__(self):
0f06bcd7 2864 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
282f5709 2865
28419ca2 2866 def __repr__(self):
2867 # repr and str should mimic a list. So we exhaust the iterable
2868 return repr(self.exhaust())
2869
2870 def __str__(self):
2871 return repr(self.exhaust())
2872
483336e7 2873
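# Usage sketch (illustration only):
#
#   lazy = LazyList(itertools.count())   # items are pulled and cached only as accessed
#   lazy[:5]                             # -> [0, 1, 2, 3, 4] (a plain list, per the note above)
#   lazy[2]                              # -> 2, served from the cache
#   bool(LazyList(iter([])))             # -> False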
7be9ccff 2874class PagedList:
c07a39ae 2875
2876 class IndexError(IndexError):
2877 pass
2878
dd26ced1
PH
2879 def __len__(self):
2880 # This is only useful for tests
2881 return len(self.getslice())
2882
7be9ccff 2883 def __init__(self, pagefunc, pagesize, use_cache=True):
2884 self._pagefunc = pagefunc
2885 self._pagesize = pagesize
f1d13090 2886 self._pagecount = float('inf')
7be9ccff 2887 self._use_cache = use_cache
2888 self._cache = {}
2889
2890 def getpage(self, pagenum):
d8cf8d97 2891 page_results = self._cache.get(pagenum)
2892 if page_results is None:
f1d13090 2893 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
7be9ccff 2894 if self._use_cache:
2895 self._cache[pagenum] = page_results
2896 return page_results
2897
2898 def getslice(self, start=0, end=None):
2899 return list(self._getslice(start, end))
2900
2901 def _getslice(self, start, end):
55575225 2902 raise NotImplementedError('This method must be implemented by subclasses')
2903
2904 def __getitem__(self, idx):
f1d13090 2905 assert self._use_cache, 'Indexing PagedList requires cache'
55575225 2906 if not isinstance(idx, int) or idx < 0:
2907 raise TypeError('indices must be non-negative integers')
2908 entries = self.getslice(idx, idx + 1)
d8cf8d97 2909 if not entries:
c07a39ae 2910 raise self.IndexError()
d8cf8d97 2911 return entries[0]
55575225 2912
9c44d242
PH
2913
2914class OnDemandPagedList(PagedList):
a44ca5a4 2915 """Download pages until a page with fewer than the maximum number of results"""
86e5f3ed 2916
7be9ccff 2917 def _getslice(self, start, end):
b7ab0590
PH
2918 for pagenum in itertools.count(start // self._pagesize):
2919 firstid = pagenum * self._pagesize
2920 nextfirstid = pagenum * self._pagesize + self._pagesize
2921 if start >= nextfirstid:
2922 continue
2923
b7ab0590
PH
2924 startv = (
2925 start % self._pagesize
2926 if firstid <= start < nextfirstid
2927 else 0)
b7ab0590
PH
2928 endv = (
2929 ((end - 1) % self._pagesize) + 1
2930 if (end is not None and firstid <= end <= nextfirstid)
2931 else None)
2932
f1d13090 2933 try:
2934 page_results = self.getpage(pagenum)
2935 except Exception:
2936 self._pagecount = pagenum - 1
2937 raise
b7ab0590
PH
2938 if startv != 0 or endv is not None:
2939 page_results = page_results[startv:endv]
7be9ccff 2940 yield from page_results
b7ab0590
PH
2941
2942 # A little optimization - if the current page is not "full", i.e. does
2943 # not contain page_size videos, then we can assume that this page
2944 # is the last one - there are no more ids on further pages -
2945 # i.e. no need to query again.
2946 if len(page_results) + startv < self._pagesize:
2947 break
2948
2949 # If we got the whole page, but the next page is not interesting,
2950 # break out early as well
2951 if end == nextfirstid:
2952 break
81c2f20b
PH
2953
2954
9c44d242 2955class InAdvancePagedList(PagedList):
a44ca5a4 2956 """PagedList with total number of pages known in advance"""
86e5f3ed 2957
9c44d242 2958 def __init__(self, pagefunc, pagecount, pagesize):
7be9ccff 2959 PagedList.__init__(self, pagefunc, pagesize, True)
f1d13090 2960 self._pagecount = pagecount
9c44d242 2961
7be9ccff 2962 def _getslice(self, start, end):
9c44d242 2963 start_page = start // self._pagesize
d37707bd 2964 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
9c44d242
PH
2965 skip_elems = start - start_page * self._pagesize
2966 only_more = None if end is None else end - start
2967 for pagenum in range(start_page, end_page):
7be9ccff 2968 page_results = self.getpage(pagenum)
9c44d242 2969 if skip_elems:
7be9ccff 2970 page_results = page_results[skip_elems:]
9c44d242
PH
2971 skip_elems = None
2972 if only_more is not None:
7be9ccff 2973 if len(page_results) < only_more:
2974 only_more -= len(page_results)
9c44d242 2975 else:
7be9ccff 2976 yield from page_results[:only_more]
9c44d242 2977 break
7be9ccff 2978 yield from page_results
9c44d242
PH
2979
2980
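# Usage sketch (illustration only; fetch_page is a made-up page function that
# pretends each "page" holds 3 of 10 items):
#
#   def fetch_page(pagenum):
#       first = pagenum * 3
#       return iter(range(first, min(first + 3, 10)))
#
#   pl = OnDemandPagedList(fetch_page, 3)
#   pl.getslice(0, 5)   # -> [0, 1, 2, 3, 4]; pages are fetched lazily and cached
#   pl[7]               # -> 7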
7e88d7d7 2981class PlaylistEntries:
2982 MissingEntry = object()
2983 is_exhausted = False
2984
2985 def __init__(self, ydl, info_dict):
7e9a6125 2986 self.ydl = ydl
2987
2988 # _entries must be assigned now since infodict can change during iteration
2989 entries = info_dict.get('entries')
2990 if entries is None:
2991 raise EntryNotInPlaylist('There are no entries')
2992 elif isinstance(entries, list):
2993 self.is_exhausted = True
2994
2995 requested_entries = info_dict.get('requested_entries')
bc5c2f8a 2996 self.is_incomplete = requested_entries is not None
7e9a6125 2997 if self.is_incomplete:
2998 assert self.is_exhausted
bc5c2f8a 2999 self._entries = [self.MissingEntry] * max(requested_entries or [0])
7e9a6125 3000 for i, entry in zip(requested_entries, entries):
3001 self._entries[i - 1] = entry
3002 elif isinstance(entries, (list, PagedList, LazyList)):
3003 self._entries = entries
3004 else:
3005 self._entries = LazyList(entries)
7e88d7d7 3006
3007 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
3008 (?P<start>[+-]?\d+)?
3009 (?P<range>[:-]
3010 (?P<end>[+-]?\d+|inf(?:inite)?)?
3011 (?::(?P<step>[+-]?\d+))?
3012 )?''')
3013
3014 @classmethod
3015 def parse_playlist_items(cls, string):
3016 for segment in string.split(','):
3017 if not segment:
3018 raise ValueError('There are two or more consecutive commas')
3019 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
3020 if not mobj:
3021 raise ValueError(f'{segment!r} is not a valid specification')
3022 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
3023 if int_or_none(step) == 0:
3024 raise ValueError(f'Step in {segment!r} cannot be zero')
3025 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
3026
3027 def get_requested_items(self):
3028 playlist_items = self.ydl.params.get('playlist_items')
3029 playlist_start = self.ydl.params.get('playliststart', 1)
3030 playlist_end = self.ydl.params.get('playlistend')
3031 # For backwards compatibility, interpret -1 as whole list
3032 if playlist_end in (-1, None):
3033 playlist_end = ''
3034 if not playlist_items:
3035 playlist_items = f'{playlist_start}:{playlist_end}'
3036 elif playlist_start != 1 or playlist_end:
3037 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
3038
3039 for index in self.parse_playlist_items(playlist_items):
3040 for i, entry in self[index]:
3041 yield i, entry
1ac4fd80 3042 if not entry:
3043 continue
7e88d7d7 3044 try:
d21056f4 3045 # The item may have just been added to archive. Don't break due to it
3046 if not self.ydl.params.get('lazy_playlist'):
3047 # TODO: Add auto-generated fields
3048 self.ydl._match_entry(entry, incomplete=True, silent=True)
7e88d7d7 3049 except (ExistingVideoReached, RejectedVideoReached):
3050 return
3051
7e9a6125 3052 def get_full_count(self):
3053 if self.is_exhausted and not self.is_incomplete:
7e88d7d7 3054 return len(self)
3055 elif isinstance(self._entries, InAdvancePagedList):
3056 if self._entries._pagesize == 1:
3057 return self._entries._pagecount
3058
7e88d7d7 3059 @functools.cached_property
3060 def _getter(self):
3061 if isinstance(self._entries, list):
3062 def get_entry(i):
3063 try:
3064 entry = self._entries[i]
3065 except IndexError:
3066 entry = self.MissingEntry
3067 if not self.is_incomplete:
3068 raise self.IndexError()
3069 if entry is self.MissingEntry:
bc5c2f8a 3070 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
7e88d7d7 3071 return entry
3072 else:
3073 def get_entry(i):
3074 try:
3075 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
3076 except (LazyList.IndexError, PagedList.IndexError):
3077 raise self.IndexError()
3078 return get_entry
3079
3080 def __getitem__(self, idx):
3081 if isinstance(idx, int):
3082 idx = slice(idx, idx)
3083
3084 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
3085 step = 1 if idx.step is None else idx.step
3086 if idx.start is None:
3087 start = 0 if step > 0 else len(self) - 1
3088 else:
3089 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
3090
3091 # NB: Do not call len(self) when idx == [:]
3092 if idx.stop is None:
3093 stop = 0 if step < 0 else float('inf')
3094 else:
3095 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
3096 stop += [-1, 1][step > 0]
3097
3098 for i in frange(start, stop, step):
3099 if i < 0:
3100 continue
3101 try:
7e9a6125 3102 entry = self._getter(i)
3103 except self.IndexError:
3104 self.is_exhausted = True
3105 if step > 0:
7e88d7d7 3106 break
7e9a6125 3107 continue
7e88d7d7 3108 yield i + 1, entry
3109
3110 def __len__(self):
3111 return len(tuple(self[:]))
3112
3113 class IndexError(IndexError):
3114 pass
3115
3116
81c2f20b 3117def uppercase_escape(s):
676eb3f2 3118 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 3119 return re.sub(
a612753d 3120 r'\\U[0-9a-fA-F]{8}',
676eb3f2 3121 lambda m: unicode_escape(m.group(0))[0],
3122 s)
0fe2ff78
YCH
3123
3124
3125def lowercase_escape(s):
3126 unicode_escape = codecs.getdecoder('unicode_escape')
3127 return re.sub(
3128 r'\\u[0-9a-fA-F]{4}',
3129 lambda m: unicode_escape(m.group(0))[0],
3130 s)
b53466e1 3131
d05cfe06
S
3132
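# --- Editor's illustrative sketch (not part of upstream yt_dlp/utils.py) ---
# Hedged example of the two escape helpers above; the expected values follow from
# codecs.getdecoder('unicode_escape') and are not taken from upstream tests:
#
#   >>> uppercase_escape(r'\U0001F600')
#   '😀'
#   >>> lowercase_escape(r'Fran\u00e7ais')
#   'Français'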
3133def escape_rfc3986(s):
3134 """Escape non-ASCII characters as suggested by RFC 3986"""
f9934b96 3135 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
3136
3137
3138def escape_url(url):
3139 """Escape URL as suggested by RFC 3986"""
14f25df2 3140 url_parsed = urllib.parse.urlparse(url)
d05cfe06 3141 return url_parsed._replace(
efbed08d 3142 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06 3143 path=escape_rfc3986(url_parsed.path),
3144 params=escape_rfc3986(url_parsed.params),
3145 query=escape_rfc3986(url_parsed.query),
3146 fragment=escape_rfc3986(url_parsed.fragment)
3147 ).geturl()
3148
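# --- Editor's illustrative sketch (not part of upstream yt_dlp/utils.py) ---
# Hedged example: escape_url() percent-encodes non-ASCII characters in each URL
# component while keeping RFC 3986 reserved characters intact. Assuming an
# all-ASCII host (so the 'idna' step is a no-op), one would expect roughly:
#
#   >>> escape_url('https://example.com/päth?q=ü#fräg')
#   'https://example.com/p%C3%A4th?q=%C3%BC#fr%C3%A4g'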
62e609ab 3149
96b9e9cf 3150def parse_qs(url, **kwargs):
3151 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
4dfbf869 3152
3153
62e609ab
PH
3154def read_batch_urls(batch_fd):
3155 def fixup(url):
14f25df2 3156 if not isinstance(url, str):
62e609ab 3157 url = url.decode('utf-8', 'replace')
8c04f0be 3158 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3159 for bom in BOM_UTF8:
3160 if url.startswith(bom):
3161 url = url[len(bom):]
3162 url = url.lstrip()
3163 if not url or url.startswith(('#', ';', ']')):
62e609ab 3164 return False
8c04f0be 3165 # "#" cannot be stripped out since it is part of the URI
962ffcf8 3166 # However, it can be safely stripped out if it follows whitespace
8c04f0be 3167 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
3168
3169 with contextlib.closing(batch_fd) as fd:
3170 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
3171
3172
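# --- Editor's illustrative sketch (not part of upstream yt_dlp/utils.py) ---
# Hedged example of read_batch_urls(): BOMs, blank lines and comment lines are
# dropped, and a trailing " #comment" is stripped. io.StringIO stands in for a batch file:
#
#   >>> import io
#   >>> read_batch_urls(io.StringIO('\ufeffhttps://example.com/a\n# a comment\n\nhttps://example.com/b #note\n'))
#   ['https://example.com/a', 'https://example.com/b']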
3173def urlencode_postdata(*args, **kargs):
14f25df2 3174 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
3175
3176
45b2ee6f 3177def update_url(url, *, query_update=None, **kwargs):
3178 """Replace URL components specified by kwargs
3179 @param url str or parse url tuple
3180 @param query_update update query
3181 @returns str
3182 """
3183 if isinstance(url, str):
3184 if not kwargs and not query_update:
3185 return url
3186 else:
3187 url = urllib.parse.urlparse(url)
3188 if query_update:
3189 assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
3190 kwargs['query'] = urllib.parse.urlencode({
3191 **urllib.parse.parse_qs(url.query),
3192 **query_update
3193 }, True)
3194 return urllib.parse.urlunparse(url._replace(**kwargs))
3195
3196
38f9ef31 3197def update_url_query(url, query):
45b2ee6f 3198 return update_url(url, query_update=query)
16392824 3199
8e60dc75 3200
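# --- Editor's illustrative sketch (not part of upstream yt_dlp/utils.py) ---
# Hedged example: update_url_query() merges new parameters into an existing query
# string via update_url(query_update=...); the expected output assumes the original
# parameters keep their insertion order:
#
#   >>> update_url_query('https://example.com/path?a=1', {'b': '2'})
#   'https://example.com/path?a=1&b=2'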
c043c246 3201def update_Request(req, url=None, data=None, headers=None, query=None):
ed0291d1 3202 req_headers = req.headers.copy()
c043c246 3203 req_headers.update(headers or {})
ed0291d1
S
3204 req_data = data or req.data
3205 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
3206 req_get_method = req.get_method()
3207 if req_get_method == 'HEAD':
3208 req_type = HEADRequest
3209 elif req_get_method == 'PUT':
3210 req_type = PUTRequest
3211 else:
ac668111 3212 req_type = urllib.request.Request
ed0291d1
S
3213 new_req = req_type(
3214 req_url, data=req_data, headers=req_headers,
3215 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3216 if hasattr(req, 'timeout'):
3217 new_req.timeout = req.timeout
3218 return new_req
3219
3220
10c87c15 3221def _multipart_encode_impl(data, boundary):
0c265486 3222 content_type = 'multipart/form-data; boundary=%s' % boundary
3223
3224 out = b''
3225 for k, v in data.items():
3226 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 3227 if isinstance(k, str):
0f06bcd7 3228 k = k.encode()
14f25df2 3229 if isinstance(v, str):
0f06bcd7 3230 v = v.encode()
0c265486
YCH
3231 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3232 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 3233 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
3234 if boundary.encode('ascii') in content:
3235 raise ValueError('Boundary overlaps with data')
3236 out += content
3237
3238 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3239
3240 return out, content_type
3241
3242
3243def multipart_encode(data, boundary=None):
3244 '''
3245 Encode a dict to RFC 7578-compliant form-data
3246
3247 data:
3248 A dict where keys and values can be either Unicode or bytes-like
3249 objects.
3250 boundary:
3251 If specified as a Unicode object, it is used as the boundary. Otherwise
3252 a random boundary is generated.
3253
3254 Reference: https://tools.ietf.org/html/rfc7578
3255 '''
3256 has_specified_boundary = boundary is not None
3257
3258 while True:
3259 if boundary is None:
3260 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3261
3262 try:
10c87c15 3263 out, content_type = _multipart_encode_impl(data, boundary)
0c265486 3264 break
3265 except ValueError:
3266 if has_specified_boundary:
3267 raise
3268 boundary = None
3269
3270 return out, content_type
3271
3272
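# --- Editor's illustrative sketch (not part of upstream yt_dlp/utils.py) ---
# Hedged example of multipart_encode() with an explicit boundary; the body layout
# follows _multipart_encode_impl() above (RFC 7578 form-data):
#
#   >>> body, ctype = multipart_encode({'username': 'ytdlp'}, boundary='AaB03x')
#   >>> ctype
#   'multipart/form-data; boundary=AaB03x'
#   >>> body
#   b'--AaB03x\r\nContent-Disposition: form-data; name="username"\r\n\r\nytdlp\r\n--AaB03x--\r\n'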
304ad45a 3273def variadic(x, allowed_types=(str, bytes, dict)):
3274 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
3275
3276
86296ad2 3277def dict_get(d, key_or_keys, default=None, skip_false_values=True):
a44ca5a4 3278 for val in map(d.get, variadic(key_or_keys)):
3279 if val is not None and (val or not skip_false_values):
3280 return val
3281 return default
cbecc9b9
S
3282
3283
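# --- Editor's illustrative sketch (not part of upstream yt_dlp/utils.py) ---
# Hedged examples of variadic() and dict_get(): scalars are wrapped into a tuple,
# and falsy values ('' here) are skipped unless skip_false_values is disabled:
#
#   >>> variadic('spam')
#   ('spam',)
#   >>> dict_get({'a': None, 'b': ''}, ('a', 'b', 'c'), default='x')
#   'x'
#   >>> dict_get({'a': None, 'b': ''}, ('a', 'b'), skip_false_values=False)
#   ''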
c4f60dd7 3284def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3285 for f in funcs:
a32a9a7e 3286 try:
c4f60dd7 3287 val = f(*args, **kwargs)
ab029d7e 3288 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
a32a9a7e 3289 pass
3290 else:
c4f60dd7 3291 if expected_type is None or isinstance(val, expected_type):
3292 return val
3293
3294
3295def try_get(src, getter, expected_type=None):
3296 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
329ca3be
S
3297
3298
90137ca4 3299def filter_dict(dct, cndn=lambda _, v: v is not None):
3300 return {k: v for k, v in dct.items() if cndn(k, v)}
3301
3302
6cc62232
S
3303def merge_dicts(*dicts):
3304 merged = {}
3305 for a_dict in dicts:
3306 for k, v in a_dict.items():
90137ca4 3307 if (v is not None and k not in merged
3308 or isinstance(v, str) and merged[k] == ''):
6cc62232
S
3309 merged[k] = v
3310 return merged
3311
3312
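# --- Editor's illustrative sketch (not part of upstream yt_dlp/utils.py) ---
# Hedged example of merge_dicts(): earlier dicts win, except that an empty string
# may be overwritten by a later non-empty value:
#
#   >>> merge_dicts({'a': 1}, {'a': 2, 'b': ''}, {'b': 'x'})
#   {'a': 1, 'b': 'x'}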
8e60dc75 3313def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
14f25df2 3314 return string if isinstance(string, str) else str(string, encoding, errors)
8e60dc75 3315
16392824 3316
a1a530b0
PH
3317US_RATINGS = {
3318 'G': 0,
3319 'PG': 10,
3320 'PG-13': 13,
3321 'R': 16,
3322 'NC': 18,
3323}
fac55558
PH
3324
3325
a8795327 3326TV_PARENTAL_GUIDELINES = {
5a16c9d9 3327 'TV-Y': 0,
3328 'TV-Y7': 7,
3329 'TV-G': 0,
3330 'TV-PG': 0,
3331 'TV-14': 14,
3332 'TV-MA': 17,
a8795327 3333}
3334
3335
146c80e2 3336def parse_age_limit(s):
19a03940 3337 # isinstance(False, int) is True. So type() must be used instead
c487cf00 3338 if type(s) is int: # noqa: E721
a8795327 3339 return s if 0 <= s <= 21 else None
19a03940 3340 elif not isinstance(s, str):
d838b1bd 3341 return None
146c80e2 3342 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
3343 if m:
3344 return int(m.group('age'))
5c5fae6d 3345 s = s.upper()
a8795327
S
3346 if s in US_RATINGS:
3347 return US_RATINGS[s]
5a16c9d9 3348 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 3349 if m:
5a16c9d9 3350 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 3351 return None
146c80e2
S
3352
3353
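# --- Editor's illustrative sketch (not part of upstream yt_dlp/utils.py) ---
# Hedged examples of parse_age_limit(), covering the numeric, US and TV rating forms:
#
#   >>> parse_age_limit(18)
#   18
#   >>> parse_age_limit('PG-13')
#   13
#   >>> parse_age_limit('TV-MA')
#   17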
fac55558 3354def strip_jsonp(code):
609a61e3 3355 return re.sub(
5552c9eb 3356 r'''(?sx)^
e9c671d5 3357 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb 3358 (?:\s*&&\s*(?P=func_name))?
3359 \s*\(\s*(?P<callback_data>.*)\);?
3360 \s*?(?://[^\n]*)*$''',
3361 r'\g<callback_data>', code)
478c2c61
PH
3362
3363
8f53dc44 3364def js_to_json(code, vars={}, *, strict=False):
5c610515 3365 # vars is a dict of var, val pairs to substitute
a71b812f
SS
3366 STRING_QUOTES = '\'"'
3367 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
c843e685 3368 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
86e5f3ed 3369 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
4195096e 3370 INTEGER_TABLE = (
86e5f3ed 3371 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3372 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
4195096e 3373 )
3374
a71b812f
SS
3375 def process_escape(match):
3376 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3377 escape = match.group(1) or match.group(2)
3378
3379 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3380 else R'\u00' if escape == 'x'
3381 else '' if escape == '\n'
3382 else escape)
3383
e05f6939 3384 def fix_kv(m):
e7b6d122 3385 v = m.group(0)
3386 if v in ('true', 'false', 'null'):
3387 return v
421ddcb8 3388 elif v in ('undefined', 'void 0'):
3389 return 'null'
8bdd16b4 3390 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
a71b812f 3391 return ''
3392
3393 if v[0] in STRING_QUOTES:
3394 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v[1:-1])
3395 return f'"{escaped}"'
3396
3397 for regex, base in INTEGER_TABLE:
3398 im = re.match(regex, v)
3399 if im:
3400 i = int(im.group(1), base)
3401 return f'"{i}":' if v.endswith(':') else str(i)
3402
3403 if v in vars:
d5f043d1 3404 try:
3405 if not strict:
3406 json.loads(vars[v])
08e29b9f 3407 except json.JSONDecodeError:
d5f043d1 3408 return json.dumps(vars[v])
3409 else:
3410 return vars[v]
89ac4a19 3411
a71b812f
SS
3412 if not strict:
3413 return f'"{v}"'
5c610515 3414
a71b812f 3415 raise ValueError(f'Unknown value: {v}')
e05f6939 3416
8072ef2b 3417 def create_map(mobj):
3418 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3419
8072ef2b 3420 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
8f53dc44 3421 if not strict:
3422 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
f55523cf 3423 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
389896df 3424 code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
3425 code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
febff4c1 3426
a71b812f
SS
3427 return re.sub(rf'''(?sx)
3428 {STRING_RE}|
3429 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
421ddcb8 3430 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
a71b812f 3431 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3432 [0-9]+(?={SKIP_RE}:)|
8bdd16b4 3433 !+
a71b812f 3434 ''', fix_kv, code)
e05f6939
PH
3435
3436
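# --- Editor's illustrative sketch (not part of upstream yt_dlp/utils.py) ---
# Hedged example of js_to_json() in non-strict mode: unquoted keys get quoted,
# hex literals become decimal, `undefined` becomes null and the trailing comma
# is dropped, so the result can be fed to json.loads():
#
#   >>> js_to_json("{abc: true, 'def': 0x10, ghi: undefined,}")
#   '{"abc": true, "def": 16, "ghi": null}'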
478c2c61
PH
3437def qualities(quality_ids):
3438 """ Get a numeric quality value out of a list of possible values """
3439 def q(qid):
3440 try:
3441 return quality_ids.index(qid)
3442 except ValueError:
3443 return -1
3444 return q
3445
acd69589 3446
119e40ef 3447POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
1e43a6f7 3448
3449
de6000d9 3450DEFAULT_OUTTMPL = {
3451 'default': '%(title)s [%(id)s].%(ext)s',
72755351 3452 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 3453}
3454OUTTMPL_TYPES = {
72755351 3455 'chapter': None,
de6000d9 3456 'subtitle': None,
3457 'thumbnail': None,
3458 'description': 'description',
3459 'annotation': 'annotations.xml',
3460 'infojson': 'info.json',
08438d2c 3461 'link': None,
3b603dbd 3462 'pl_video': None,
5112f26a 3463 'pl_thumbnail': None,
de6000d9 3464 'pl_description': 'description',
3465 'pl_infojson': 'info.json',
3466}
0a871f68 3467
143db31d 3468# As of [1], the format syntax is:
3469# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3470# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 3471STR_FORMAT_RE_TMPL = r'''(?x)
3472 (?<!%)(?P<prefix>(?:%%)*)
143db31d 3473 %
524e2e4f 3474 (?P<has_key>\((?P<key>{0})\))?
752cda38 3475 (?P<format>
524e2e4f 3476 (?P<conversion>[#0\-+ ]+)?
3477 (?P<min_width>\d+)?
3478 (?P<precision>\.\d+)?
3479 (?P<len_mod>[hlL])? # unused in python
901130bb 3480 {1} # conversion type
752cda38 3481 )
143db31d 3482'''
3483
7d1eb38a 3484
901130bb 3485STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3486
7d1eb38a 3487
a020a0dc
PH
3488def limit_length(s, length):
3489 """ Add ellipses to overly long strings """
3490 if s is None:
3491 return None
3492 ELLIPSES = '...'
3493 if len(s) > length:
3494 return s[:length - len(ELLIPSES)] + ELLIPSES
3495 return s
48844745
PH
3496
3497
3498def version_tuple(v):
5f9b8394 3499 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
3500
3501
3502def is_outdated_version(version, limit, assume_new=True):
3503 if not version:
3504 return not assume_new
3505 try:
3506 return version_tuple(version) < version_tuple(limit)
3507 except ValueError:
3508 return not assume_new
732ea2f0
PH
3509
3510
3511def ytdl_is_updateable():
7a5c1cfe 3512 """ Returns if yt-dlp can be updated with -U """
735d865e 3513
5d535b4a 3514 from .update import is_non_updateable
732ea2f0 3515
5d535b4a 3516 return not is_non_updateable()
7d4111ed
PH
3517
3518
3519def args_to_str(args):
3520 # Get a short string representation for a subprocess command
702ccf2d 3521 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
3522
3523
9b9c5355 3524def error_to_compat_str(err):
cfb0511d 3525 return str(err)
fdae2358
S
3526
3527
a44ca5a4 3528def error_to_str(err):
3529 return f'{type(err).__name__}: {err}'
3530
3531
2647c933 3532def mimetype2ext(mt, default=NO_DEFAULT):
3533 if not isinstance(mt, str):
3534 if default is not NO_DEFAULT:
3535 return default
eb9ee194 3536 return None
3537
2647c933 3538 MAP = {
3539 # video
f6861ec9 3540 '3gpp': '3gp',
2647c933 3541 'mp2t': 'ts',
3542 'mp4': 'mp4',
3543 'mpeg': 'mpeg',
3544 'mpegurl': 'm3u8',
3545 'quicktime': 'mov',
3546 'webm': 'webm',
3547 'vp9': 'vp9',
f6861ec9 3548 'x-flv': 'flv',
2647c933 3549 'x-m4v': 'm4v',
3550 'x-matroska': 'mkv',
3551 'x-mng': 'mng',
a0d8d704 3552 'x-mp4-fragmented': 'mp4',
2647c933 3553 'x-ms-asf': 'asf',
a0d8d704 3554 'x-ms-wmv': 'wmv',
2647c933 3555 'x-msvideo': 'avi',
3556
3557 # application (streaming playlists)
b4173f15 3558 'dash+xml': 'mpd',
b4173f15 3559 'f4m+xml': 'f4m',
f164b971 3560 'hds+xml': 'f4m',
2647c933 3561 'vnd.apple.mpegurl': 'm3u8',
e910fe2f 3562 'vnd.ms-sstr+xml': 'ism',
2647c933 3563 'x-mpegurl': 'm3u8',
3564
3565 # audio
3566 'audio/mp4': 'm4a',
3567 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
3568 # Using .mp3 as it's the most popular one
3569 'audio/mpeg': 'mp3',
d80ca5de 3570 'audio/webm': 'webm',
2647c933 3571 'audio/x-matroska': 'mka',
3572 'audio/x-mpegurl': 'm3u',
3573 'midi': 'mid',
3574 'ogg': 'ogg',
3575 'wav': 'wav',
3576 'wave': 'wav',
3577 'x-aac': 'aac',
3578 'x-flac': 'flac',
3579 'x-m4a': 'm4a',
3580 'x-realaudio': 'ra',
39e7107d 3581 'x-wav': 'wav',
9359f3d4 3582
2647c933 3583 # image
3584 'avif': 'avif',
3585 'bmp': 'bmp',
3586 'gif': 'gif',
3587 'jpeg': 'jpg',
3588 'png': 'png',
3589 'svg+xml': 'svg',
3590 'tiff': 'tif',
3591 'vnd.wap.wbmp': 'wbmp',
3592 'webp': 'webp',
3593 'x-icon': 'ico',
3594 'x-jng': 'jng',
3595 'x-ms-bmp': 'bmp',
3596
3597 # caption
3598 'filmstrip+json': 'fs',
3599 'smptett+xml': 'tt',
3600 'ttaf+xml': 'dfxp',
3601 'ttml+xml': 'ttml',
3602 'x-ms-sami': 'sami',
9359f3d4 3603
2647c933 3604 # misc
3605 'gzip': 'gz',
9359f3d4 3606 'json': 'json',
3607 'xml': 'xml',
3608 'zip': 'zip',
9359f3d4 3609 }
3610
2647c933 3611 mimetype = mt.partition(';')[0].strip().lower()
3612 _, _, subtype = mimetype.rpartition('/')
9359f3d4 3613
2647c933 3614 ext = traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
3615 if ext:
3616 return ext
3617 elif default is not NO_DEFAULT:
3618 return default
9359f3d4 3619 return subtype.replace('+', '.')
c460bdd5
PH
3620
3621
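# --- Editor's illustrative sketch (not part of upstream yt_dlp/utils.py) ---
# Hedged examples of mimetype2ext(): codec parameters are ignored, and both the
# full MIME type and the bare subtype are looked up in MAP above:
#
#   >>> mimetype2ext('video/mp4; codecs="avc1.42E01E"')
#   'mp4'
#   >>> mimetype2ext('application/vnd.apple.mpegurl')
#   'm3u8'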
2814f12b
THD
3622def ext2mimetype(ext_or_url):
3623 if not ext_or_url:
3624 return None
3625 if '.' not in ext_or_url:
3626 ext_or_url = f'file.{ext_or_url}'
3627 return mimetypes.guess_type(ext_or_url)[0]
3628
3629
4f3c5e06 3630def parse_codecs(codecs_str):
3631 # http://tools.ietf.org/html/rfc6381
3632 if not codecs_str:
3633 return {}
a0566bbf 3634 split_codecs = list(filter(None, map(
dbf5416a 3635 str.strip, codecs_str.strip().strip(',').split(','))))
3fe75fdc 3636 vcodec, acodec, scodec, hdr = None, None, None, None
a0566bbf 3637 for full_codec in split_codecs:
d816f61f 3638 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3639 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3640 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3641 if vcodec:
3642 continue
3643 vcodec = full_codec
3644 if parts[0] in ('dvh1', 'dvhe'):
3645 hdr = 'DV'
3646 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3647 hdr = 'HDR10'
3648 elif parts[:2] == ['vp9', '2']:
3649 hdr = 'HDR10'
71082216 3650 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
d816f61f 3651 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3652 acodec = acodec or full_codec
3653 elif parts[0] in ('stpp', 'wvtt'):
3654 scodec = scodec or full_codec
4f3c5e06 3655 else:
19a03940 3656 write_string(f'WARNING: Unknown codec {full_codec}\n')
3fe75fdc 3657 if vcodec or acodec or scodec:
4f3c5e06 3658 return {
3659 'vcodec': vcodec or 'none',
3660 'acodec': acodec or 'none',
176f1866 3661 'dynamic_range': hdr,
3fe75fdc 3662 **({'scodec': scodec} if scodec is not None else {}),
4f3c5e06 3663 }
b69fd25c 3664 elif len(split_codecs) == 2:
3665 return {
3666 'vcodec': split_codecs[0],
3667 'acodec': split_codecs[1],
3668 }
4f3c5e06 3669 return {}
3670
3671
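# --- Editor's illustrative sketch (not part of upstream yt_dlp/utils.py) ---
# Hedged example of parse_codecs() on a typical RFC 6381 codecs string:
#
#   >>> parse_codecs('avc1.64001f, mp4a.40.2')
#   {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}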
fc61aff4
LL
3672def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3673 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3674
3675 allow_mkv = not preferences or 'mkv' in preferences
3676
3677 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3678 return 'mkv' # TODO: any other format allows this?
3679
3680 # TODO: Not all codecs supported by parse_codecs are handled here
3681 COMPATIBLE_CODECS = {
3682 'mp4': {
71082216 3683 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
81b6102d 3684 'h264', 'aacl', 'ec-3', # Set in ISM
fc61aff4 3685 },
3686 'webm': {
3687 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3688 'vp9x', 'vp8x', # in the webm spec
3689 },
3690 }
3691
a5387729 3692 sanitize_codec = functools.partial(
3693 try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
8f84770a 3694 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
fc61aff4
LL
3695
3696 for ext in preferences or COMPATIBLE_CODECS.keys():
3697 codec_set = COMPATIBLE_CODECS.get(ext, set())
3698 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3699 return ext
3700
3701 COMPATIBLE_EXTS = (
3702 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
fbb73833 3703 {'webm', 'weba'},
fc61aff4 3704 )
3705 for ext in preferences or vexts:
3706 current_exts = {ext, *vexts, *aexts}
3707 if ext == 'mkv' or current_exts == {ext} or any(
3708 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3709 return ext
3710 return 'mkv' if allow_mkv else preferences[-1]
3711
3712
2647c933 3713def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
79298173 3714 getheader = url_handle.headers.get
2ccd1b10 3715
b55ee18f
PH
3716 cd = getheader('Content-Disposition')
3717 if cd:
3718 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3719 if m:
3720 e = determine_ext(m.group('filename'), default_ext=None)
3721 if e:
3722 return e
3723
2647c933 3724 meta_ext = getheader('x-amz-meta-name')
3725 if meta_ext:
3726 e = meta_ext.rpartition('.')[2]
3727 if e:
3728 return e
3729
3730 return mimetype2ext(getheader('Content-Type'), default=default)
05900629
PH
3731
3732
1e399778
YCH
3733def encode_data_uri(data, mime_type):
3734 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3735
3736
05900629 3737def age_restricted(content_limit, age_limit):
6ec6cb4e 3738 """ Returns True iff the content should be blocked """
05900629
PH
3739
3740 if age_limit is None: # No limit set
3741 return False
3742 if content_limit is None:
3743 return False # Content available for everyone
3744 return age_limit < content_limit
61ca9a80
PH
3745
3746
88f60feb 3747# List of known byte-order-marks (BOM)
a904a7f8
L
3748BOMS = [
3749 (b'\xef\xbb\xbf', 'utf-8'),
3750 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3751 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3752 (b'\xff\xfe', 'utf-16-le'),
3753 (b'\xfe\xff', 'utf-16-be'),
3754]
a904a7f8
L
3755
3756
61ca9a80
PH
3757def is_html(first_bytes):
3758 """ Detect whether a file contains HTML by examining its first bytes. """
3759
80e8493e 3760 encoding = 'utf-8'
61ca9a80 3761 for bom, enc in BOMS:
80e8493e 3762 while first_bytes.startswith(bom):
3763 encoding, first_bytes = enc, first_bytes[len(bom):]
61ca9a80 3764
80e8493e 3765 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
a055469f
PH
3766
3767
3768def determine_protocol(info_dict):
3769 protocol = info_dict.get('protocol')
3770 if protocol is not None:
3771 return protocol
3772
7de837a5 3773 url = sanitize_url(info_dict['url'])
a055469f 3774 if url.startswith('rtmp'):
3775 return 'rtmp'
3776 elif url.startswith('mms'):
3777 return 'mms'
3778 elif url.startswith('rtsp'):
3779 return 'rtsp'
3780
3781 ext = determine_ext(url)
3782 if ext == 'm3u8':
deae7c17 3783 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
a055469f 3784 elif ext == 'f4m':
3785 return 'f4m'
3786
14f25df2 3787 return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3788
3789
c5e3f849 3790def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3791 """ Render a list of rows, each as a list of values.
3792 Text after a \t will be right aligned """
ec11a9f4 3793 def width(string):
c5e3f849 3794 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3795
3796 def get_max_lens(table):
ec11a9f4 3797 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3798
3799 def filter_using_list(row, filterArray):
d16df59d 3800 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
76d321f6 3801
d16df59d 3802 max_lens = get_max_lens(data) if hide_empty else []
3803 header_row = filter_using_list(header_row, max_lens)
3804 data = [filter_using_list(row, max_lens) for row in data]
76d321f6 3805
cfb56d1a 3806 table = [header_row] + data
76d321f6 3807 max_lens = get_max_lens(table)
c5e3f849 3808 extra_gap += 1
76d321f6 3809 if delim:
c5e3f849 3810 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
1ed7953a 3811 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
ec11a9f4 3812 for row in table:
3813 for pos, text in enumerate(map(str, row)):
c5e3f849 3814 if '\t' in text:
3815 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3816 else:
3817 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3818 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3819 return ret
347de493
PH
3820
3821
8f18aca8 3822def _match_one(filter_part, dct, incomplete):
77b87f05 3823 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3824 STRING_OPERATORS = {
3825 '*=': operator.contains,
3826 '^=': lambda attr, value: attr.startswith(value),
3827 '$=': lambda attr, value: attr.endswith(value),
3828 '~=': lambda attr, value: re.search(value, attr),
3829 }
347de493 3830 COMPARISON_OPERATORS = {
a047eeb6 3831 **STRING_OPERATORS,
3832 '<=': operator.le, # "<=" must be defined above "<"
347de493 3833 '<': operator.lt,
347de493 3834 '>=': operator.ge,
a047eeb6 3835 '>': operator.gt,
347de493 3836 '=': operator.eq,
347de493 3837 }
a047eeb6 3838
6db9c4d5 3839 if isinstance(incomplete, bool):
3840 is_incomplete = lambda _: incomplete
3841 else:
3842 is_incomplete = lambda k: k in incomplete
3843
64fa820c 3844 operator_rex = re.compile(r'''(?x)
347de493 3845 (?P<key>[a-z_]+)
77b87f05 3846 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3847 (?:
a047eeb6 3848 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3849 (?P<strval>.+?)
347de493 3850 )
347de493 3851 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3852 m = operator_rex.fullmatch(filter_part.strip())
347de493 3853 if m:
18f96d12 3854 m = m.groupdict()
3855 unnegated_op = COMPARISON_OPERATORS[m['op']]
3856 if m['negation']:
77b87f05 3857 op = lambda attr, value: not unnegated_op(attr, value)
3858 else:
3859 op = unnegated_op
18f96d12 3860 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3861 if m['quote']:
3862 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3863 actual_value = dct.get(m['key'])
3864 numeric_comparison = None
f9934b96 3865 if isinstance(actual_value, (int, float)):
e5a088dc 3866 # If the original field is a string and the matching comparison value is
3867 # a number, we should respect the origin of the original field
3868 # and process the comparison value as a string (see
18f96d12 3869 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3870 try:
18f96d12 3871 numeric_comparison = int(comparison_value)
347de493 3872 except ValueError:
18f96d12 3873 numeric_comparison = parse_filesize(comparison_value)
3874 if numeric_comparison is None:
3875 numeric_comparison = parse_filesize(f'{comparison_value}B')
3876 if numeric_comparison is None:
3877 numeric_comparison = parse_duration(comparison_value)
3878 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3879 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3880 if actual_value is None:
6db9c4d5 3881 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3882 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3883
3884 UNARY_OPERATORS = {
1cc47c66 3885 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3886 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3887 }
64fa820c 3888 operator_rex = re.compile(r'''(?x)
347de493 3889 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3890 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3891 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3892 if m:
3893 op = UNARY_OPERATORS[m.group('op')]
3894 actual_value = dct.get(m.group('key'))
6db9c4d5 3895 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3896 return True
347de493
PH
3897 return op(actual_value)
3898
3899 raise ValueError('Invalid filter part %r' % filter_part)
3900
3901
8f18aca8 3902def match_str(filter_str, dct, incomplete=False):
6db9c4d5 3903 """ Filter a dictionary with a simple string syntax.
3904 @returns Whether the filter passes
3905 @param incomplete Set of keys that are expected to be missing from dct.
3906 Can be True/False to indicate all/none of the keys may be missing.
3907 All conditions on incomplete keys pass if the key is missing
8f18aca8 3908 """
347de493 3909 return all(
8f18aca8 3910 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3911 for filter_part in re.split(r'(?<!\\)&', filter_str))
347de493
PH
3912
3913
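# --- Editor's illustrative sketch (not part of upstream yt_dlp/utils.py) ---
# Hedged examples of match_str(): '&' joins conditions, and a bare or negated key
# tests for presence/absence of the field:
#
#   >>> match_str('like_count > 100 & description', {'like_count': 190, 'description': 'foo'})
#   True
#   >>> match_str('!is_live', {'is_live': False})
#   True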
fe2ce85a 3914def match_filter_func(filters, breaking_filters=None):
3915 if not filters and not breaking_filters:
d1b5f70b 3916 return None
fe2ce85a 3917 breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
3918 filters = set(variadic(filters or []))
d1b5f70b 3919
492272fe 3920 interactive = '-' in filters
3921 if interactive:
3922 filters.remove('-')
3923
3924 def _match_func(info_dict, incomplete=False):
fe2ce85a 3925 ret = breaking_filters(info_dict, incomplete)
3926 if ret is not None:
3927 raise RejectedVideoReached(ret)
3928
492272fe 3929 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3930 return NO_DEFAULT if interactive and not incomplete else None
347de493 3931 else:
3bec830a 3932 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
b1a7cd05 3933 filter_str = ') | ('.join(map(str.strip, filters))
3934 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
347de493 3935 return _match_func
91410c9b
PH
3936
3937
f2df4071 3938class download_range_func:
3939 def __init__(self, chapters, ranges):
3940 self.chapters, self.ranges = chapters, ranges
3941
3942 def __call__(self, info_dict, ydl):
0500ee3d 3943 if not self.ranges and not self.chapters:
3944 yield {}
3945
5ec1b6b7 3946 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
56ba69e4 3947 else 'Cannot match chapters since chapter information is unavailable')
f2df4071 3948 for regex in self.chapters or []:
5ec1b6b7 3949 for i, chapter in enumerate(info_dict.get('chapters') or []):
3950 if re.search(regex, chapter['title']):
3951 warning = None
3952 yield {**chapter, 'index': i}
f2df4071 3953 if self.chapters and warning:
5ec1b6b7 3954 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3955
f2df4071 3956 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
5ec1b6b7 3957
f2df4071 3958 def __eq__(self, other):
3959 return (isinstance(other, download_range_func)
3960 and self.chapters == other.chapters and self.ranges == other.ranges)
5ec1b6b7 3961
71df9b7f 3962 def __repr__(self):
a5387729 3963 return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
71df9b7f 3964
5ec1b6b7 3965
bf6427d2
YCH
3966def parse_dfxp_time_expr(time_expr):
3967 if not time_expr:
d631d5f9 3968 return
bf6427d2 3969
1d485a1a 3970 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
bf6427d2
YCH
3971 if mobj:
3972 return float(mobj.group('time_offset'))
3973
db2fe38b 3974 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3975 if mobj:
db2fe38b 3976 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
3977
3978
c1c924ab 3979def srt_subtitles_timecode(seconds):
aa7785f8 3980 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3981
3982
3983def ass_subtitles_timecode(seconds):
3984 time = timetuple_from_msec(seconds * 1000)
3985 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
bf6427d2
YCH
3986
3987
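# --- Editor's illustrative sketch (not part of upstream yt_dlp/utils.py) ---
# Hedged examples of the timecode helpers above; the srt/ass formatting assumes
# timetuple_from_msec() (defined earlier in this module) returns
# (hours, minutes, seconds, milliseconds):
#
#   >>> parse_dfxp_time_expr('00:01:02.5')
#   62.5
#   >>> srt_subtitles_timecode(62.5)
#   '00:01:02,500'
#   >>> ass_subtitles_timecode(62.5)
#   '0:01:02.50'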
3988def dfxp2srt(dfxp_data):
3869028f 3989 '''
3990 @param dfxp_data A bytes-like object containing DFXP data
3991 @returns A unicode object containing converted SRT data
3992 '''
5b995f71 3993 LEGACY_NAMESPACES = (
3869028f
YCH
3994 (b'http://www.w3.org/ns/ttml', [
3995 b'http://www.w3.org/2004/11/ttaf1',
3996 b'http://www.w3.org/2006/04/ttaf1',
3997 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3998 ]),
3869028f
YCH
3999 (b'http://www.w3.org/ns/ttml#styling', [
4000 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
4001 ]),
4002 )
4003
4004 SUPPORTED_STYLING = [
4005 'color',
4006 'fontFamily',
4007 'fontSize',
4008 'fontStyle',
4009 'fontWeight',
4010 'textDecoration'
4011 ]
4012
4e335771 4013 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 4014 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 4015 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 4016 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 4017 })
bf6427d2 4018
5b995f71
RA
4019 styles = {}
4020 default_style = {}
4021
86e5f3ed 4022 class TTMLPElementParser:
5b995f71
RA
4023 _out = ''
4024 _unclosed_elements = []
4025 _applied_styles = []
bf6427d2 4026
2b14cb56 4027 def start(self, tag, attrib):
5b995f71 4028 if tag in (_x('ttml:br'), 'br'):
4029 self._out += '\n'
4030 else:
4031 unclosed_elements = []
4032 style = {}
4033 element_style_id = attrib.get('style')
4034 if default_style:
4035 style.update(default_style)
4036 if element_style_id:
4037 style.update(styles.get(element_style_id, {}))
4038 for prop in SUPPORTED_STYLING:
4039 prop_val = attrib.get(_x('tts:' + prop))
4040 if prop_val:
4041 style[prop] = prop_val
4042 if style:
4043 font = ''
4044 for k, v in sorted(style.items()):
4045 if self._applied_styles and self._applied_styles[-1].get(k) == v:
4046 continue
4047 if k == 'color':
4048 font += ' color="%s"' % v
4049 elif k == 'fontSize':
4050 font += ' size="%s"' % v
4051 elif k == 'fontFamily':
4052 font += ' face="%s"' % v
4053 elif k == 'fontWeight' and v == 'bold':
4054 self._out += '<b>'
4055 unclosed_elements.append('b')
4056 elif k == 'fontStyle' and v == 'italic':
4057 self._out += '<i>'
4058 unclosed_elements.append('i')
4059 elif k == 'textDecoration' and v == 'underline':
4060 self._out += '<u>'
4061 unclosed_elements.append('u')
4062 if font:
4063 self._out += '<font' + font + '>'
4064 unclosed_elements.append('font')
4065 applied_style = {}
4066 if self._applied_styles:
4067 applied_style.update(self._applied_styles[-1])
4068 applied_style.update(style)
4069 self._applied_styles.append(applied_style)
4070 self._unclosed_elements.append(unclosed_elements)
bf6427d2 4071
2b14cb56 4072 def end(self, tag):
5b995f71 4073 if tag not in (_x('ttml:br'), 'br'):
4074 unclosed_elements = self._unclosed_elements.pop()
4075 for element in reversed(unclosed_elements):
4076 self._out += '</%s>' % element
4077 if unclosed_elements and self._applied_styles:
4078 self._applied_styles.pop()
bf6427d2 4079
2b14cb56 4080 def data(self, data):
5b995f71 4081 self._out += data
2b14cb56 4082
4083 def close(self):
5b995f71 4084 return self._out.strip()
2b14cb56 4085
4086 def parse_node(node):
4087 target = TTMLPElementParser()
4088 parser = xml.etree.ElementTree.XMLParser(target=target)
4089 parser.feed(xml.etree.ElementTree.tostring(node))
4090 return parser.close()
bf6427d2 4091
5b995f71
RA
4092 for k, v in LEGACY_NAMESPACES:
4093 for ns in v:
4094 dfxp_data = dfxp_data.replace(ns, k)
4095
3869028f 4096 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 4097 out = []
5b995f71 4098 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
4099
4100 if not paras:
4101 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 4102
5b995f71
RA
4103 repeat = False
4104 while True:
4105 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
4106 style_id = style.get('id') or style.get(_x('xml:id'))
4107 if not style_id:
4108 continue
5b995f71
RA
4109 parent_style_id = style.get('style')
4110 if parent_style_id:
4111 if parent_style_id not in styles:
4112 repeat = True
4113 continue
4114 styles[style_id] = styles[parent_style_id].copy()
4115 for prop in SUPPORTED_STYLING:
4116 prop_val = style.get(_x('tts:' + prop))
4117 if prop_val:
4118 styles.setdefault(style_id, {})[prop] = prop_val
4119 if repeat:
4120 repeat = False
4121 else:
4122 break
4123
4124 for p in ('body', 'div'):
4125 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4126 if ele is None:
4127 continue
4128 style = styles.get(ele.get('style'))
4129 if not style:
4130 continue
4131 default_style.update(style)
4132
bf6427d2 4133 for para, index in zip(paras, itertools.count(1)):
d631d5f9 4134 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 4135 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
4136 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4137 if begin_time is None:
4138 continue
7dff0363 4139 if not end_time:
d631d5f9 4140 if not dur:
4141 continue
4142 end_time = begin_time + dur
bf6427d2 4143 out.append('%d\n%s --> %s\n%s\n\n' % (
4144 index,
c1c924ab 4145 srt_subtitles_timecode(begin_time),
4146 srt_subtitles_timecode(end_time),
bf6427d2 4147 parse_node(para)))
4148
4149 return ''.join(out)
4150
4151
c487cf00 4152def cli_option(params, command_option, param, separator=None):
66e289ba 4153 param = params.get(param)
c487cf00 4154 return ([] if param is None
4155 else [command_option, str(param)] if separator is None
4156 else [f'{command_option}{separator}{param}'])
66e289ba
S
4157
4158
4159def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4160 param = params.get(param)
c487cf00 4161 assert param in (True, False, None)
4162 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
66e289ba
S
4163
4164
4165def cli_valueless_option(params, command_option, param, expected_value=True):
c487cf00 4166 return [command_option] if params.get(param) == expected_value else []
66e289ba
S
4167
4168
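# --- Editor's illustrative sketch (not part of upstream yt_dlp/utils.py) ---
# Hedged examples of the cli_* helpers above, which turn yt-dlp params into
# external-downloader style argument lists:
#
#   >>> cli_option({'proxy': 'http://localhost:3128'}, '--proxy', 'proxy')
#   ['--proxy', 'http://localhost:3128']
#   >>> cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate', separator='=')
#   ['--no-check-certificate=true']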
e92caff5 4169def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 4170 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 4171 if use_compat:
5b1ecbb3 4172 return argdict
4173 else:
4174 argdict = None
eab9b2bc 4175 if argdict is None:
5b1ecbb3 4176 return default
eab9b2bc 4177 assert isinstance(argdict, dict)
4178
e92caff5 4179 assert isinstance(keys, (list, tuple))
4180 for key_list in keys:
e92caff5 4181 arg_list = list(filter(
4182 lambda x: x is not None,
6606817a 4183 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 4184 if arg_list:
4185 return [arg for args in arg_list for arg in args]
4186 return default
66e289ba 4187
6251555f 4188
330690a2 4189def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4190 main_key, exe = main_key.lower(), exe.lower()
4191 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4192 keys = [f'{root_key}{k}' for k in (keys or [''])]
4193 if root_key in keys:
4194 if main_key != exe:
4195 keys.append((main_key, exe))
4196 keys.append('default')
4197 else:
4198 use_compat = False
4199 return cli_configuration_args(argdict, keys, default, use_compat)
4200
66e289ba 4201
86e5f3ed 4202class ISO639Utils:
39672624
YCH
4203 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4204 _lang_map = {
4205 'aa': 'aar',
4206 'ab': 'abk',
4207 'ae': 'ave',
4208 'af': 'afr',
4209 'ak': 'aka',
4210 'am': 'amh',
4211 'an': 'arg',
4212 'ar': 'ara',
4213 'as': 'asm',
4214 'av': 'ava',
4215 'ay': 'aym',
4216 'az': 'aze',
4217 'ba': 'bak',
4218 'be': 'bel',
4219 'bg': 'bul',
4220 'bh': 'bih',
4221 'bi': 'bis',
4222 'bm': 'bam',
4223 'bn': 'ben',
4224 'bo': 'bod',
4225 'br': 'bre',
4226 'bs': 'bos',
4227 'ca': 'cat',
4228 'ce': 'che',
4229 'ch': 'cha',
4230 'co': 'cos',
4231 'cr': 'cre',
4232 'cs': 'ces',
4233 'cu': 'chu',
4234 'cv': 'chv',
4235 'cy': 'cym',
4236 'da': 'dan',
4237 'de': 'deu',
4238 'dv': 'div',
4239 'dz': 'dzo',
4240 'ee': 'ewe',
4241 'el': 'ell',
4242 'en': 'eng',
4243 'eo': 'epo',
4244 'es': 'spa',
4245 'et': 'est',
4246 'eu': 'eus',
4247 'fa': 'fas',
4248 'ff': 'ful',
4249 'fi': 'fin',
4250 'fj': 'fij',
4251 'fo': 'fao',
4252 'fr': 'fra',
4253 'fy': 'fry',
4254 'ga': 'gle',
4255 'gd': 'gla',
4256 'gl': 'glg',
4257 'gn': 'grn',
4258 'gu': 'guj',
4259 'gv': 'glv',
4260 'ha': 'hau',
4261 'he': 'heb',
b7acc835 4262 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
4263 'hi': 'hin',
4264 'ho': 'hmo',
4265 'hr': 'hrv',
4266 'ht': 'hat',
4267 'hu': 'hun',
4268 'hy': 'hye',
4269 'hz': 'her',
4270 'ia': 'ina',
4271 'id': 'ind',
b7acc835 4272 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
4273 'ie': 'ile',
4274 'ig': 'ibo',
4275 'ii': 'iii',
4276 'ik': 'ipk',
4277 'io': 'ido',
4278 'is': 'isl',
4279 'it': 'ita',
4280 'iu': 'iku',
4281 'ja': 'jpn',
4282 'jv': 'jav',
4283 'ka': 'kat',
4284 'kg': 'kon',
4285 'ki': 'kik',
4286 'kj': 'kua',
4287 'kk': 'kaz',
4288 'kl': 'kal',
4289 'km': 'khm',
4290 'kn': 'kan',
4291 'ko': 'kor',
4292 'kr': 'kau',
4293 'ks': 'kas',
4294 'ku': 'kur',
4295 'kv': 'kom',
4296 'kw': 'cor',
4297 'ky': 'kir',
4298 'la': 'lat',
4299 'lb': 'ltz',
4300 'lg': 'lug',
4301 'li': 'lim',
4302 'ln': 'lin',
4303 'lo': 'lao',
4304 'lt': 'lit',
4305 'lu': 'lub',
4306 'lv': 'lav',
4307 'mg': 'mlg',
4308 'mh': 'mah',
4309 'mi': 'mri',
4310 'mk': 'mkd',
4311 'ml': 'mal',
4312 'mn': 'mon',
4313 'mr': 'mar',
4314 'ms': 'msa',
4315 'mt': 'mlt',
4316 'my': 'mya',
4317 'na': 'nau',
4318 'nb': 'nob',
4319 'nd': 'nde',
4320 'ne': 'nep',
4321 'ng': 'ndo',
4322 'nl': 'nld',
4323 'nn': 'nno',
4324 'no': 'nor',
4325 'nr': 'nbl',
4326 'nv': 'nav',
4327 'ny': 'nya',
4328 'oc': 'oci',
4329 'oj': 'oji',
4330 'om': 'orm',
4331 'or': 'ori',
4332 'os': 'oss',
4333 'pa': 'pan',
4334 'pi': 'pli',
4335 'pl': 'pol',
4336 'ps': 'pus',
4337 'pt': 'por',
4338 'qu': 'que',
4339 'rm': 'roh',
4340 'rn': 'run',
4341 'ro': 'ron',
4342 'ru': 'rus',
4343 'rw': 'kin',
4344 'sa': 'san',
4345 'sc': 'srd',
4346 'sd': 'snd',
4347 'se': 'sme',
4348 'sg': 'sag',
4349 'si': 'sin',
4350 'sk': 'slk',
4351 'sl': 'slv',
4352 'sm': 'smo',
4353 'sn': 'sna',
4354 'so': 'som',
4355 'sq': 'sqi',
4356 'sr': 'srp',
4357 'ss': 'ssw',
4358 'st': 'sot',
4359 'su': 'sun',
4360 'sv': 'swe',
4361 'sw': 'swa',
4362 'ta': 'tam',
4363 'te': 'tel',
4364 'tg': 'tgk',
4365 'th': 'tha',
4366 'ti': 'tir',
4367 'tk': 'tuk',
4368 'tl': 'tgl',
4369 'tn': 'tsn',
4370 'to': 'ton',
4371 'tr': 'tur',
4372 'ts': 'tso',
4373 'tt': 'tat',
4374 'tw': 'twi',
4375 'ty': 'tah',
4376 'ug': 'uig',
4377 'uk': 'ukr',
4378 'ur': 'urd',
4379 'uz': 'uzb',
4380 've': 'ven',
4381 'vi': 'vie',
4382 'vo': 'vol',
4383 'wa': 'wln',
4384 'wo': 'wol',
4385 'xh': 'xho',
4386 'yi': 'yid',
e9a50fba 4387 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
4388 'yo': 'yor',
4389 'za': 'zha',
4390 'zh': 'zho',
4391 'zu': 'zul',
4392 }
4393
4394 @classmethod
4395 def short2long(cls, code):
4396 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4397 return cls._lang_map.get(code[:2])
4398
4399 @classmethod
4400 def long2short(cls, code):
4401 """Convert language code from ISO 639-2/T to ISO 639-1"""
4402 for short_name, long_name in cls._lang_map.items():
4403 if long_name == code:
4404 return short_name
4405
4406
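# --- Editor's illustrative sketch (not part of upstream yt_dlp/utils.py) ---
# Hedged examples of the ISO639Utils conversions defined above:
#
#   >>> ISO639Utils.short2long('en')
#   'eng'
#   >>> ISO639Utils.long2short('deu')
#   'de'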
86e5f3ed 4407class ISO3166Utils:
4eb10f66
YCH
4408 # From http://data.okfn.org/data/core/country-list
4409 _country_map = {
4410 'AF': 'Afghanistan',
4411 'AX': 'Åland Islands',
4412 'AL': 'Albania',
4413 'DZ': 'Algeria',
4414 'AS': 'American Samoa',
4415 'AD': 'Andorra',
4416 'AO': 'Angola',
4417 'AI': 'Anguilla',
4418 'AQ': 'Antarctica',
4419 'AG': 'Antigua and Barbuda',
4420 'AR': 'Argentina',
4421 'AM': 'Armenia',
4422 'AW': 'Aruba',
4423 'AU': 'Australia',
4424 'AT': 'Austria',
4425 'AZ': 'Azerbaijan',
4426 'BS': 'Bahamas',
4427 'BH': 'Bahrain',
4428 'BD': 'Bangladesh',
4429 'BB': 'Barbados',
4430 'BY': 'Belarus',
4431 'BE': 'Belgium',
4432 'BZ': 'Belize',
4433 'BJ': 'Benin',
4434 'BM': 'Bermuda',
4435 'BT': 'Bhutan',
4436 'BO': 'Bolivia, Plurinational State of',
4437 'BQ': 'Bonaire, Sint Eustatius and Saba',
4438 'BA': 'Bosnia and Herzegovina',
4439 'BW': 'Botswana',
4440 'BV': 'Bouvet Island',
4441 'BR': 'Brazil',
4442 'IO': 'British Indian Ocean Territory',
4443 'BN': 'Brunei Darussalam',
4444 'BG': 'Bulgaria',
4445 'BF': 'Burkina Faso',
4446 'BI': 'Burundi',
4447 'KH': 'Cambodia',
4448 'CM': 'Cameroon',
4449 'CA': 'Canada',
4450 'CV': 'Cape Verde',
4451 'KY': 'Cayman Islands',
4452 'CF': 'Central African Republic',
4453 'TD': 'Chad',
4454 'CL': 'Chile',
4455 'CN': 'China',
4456 'CX': 'Christmas Island',
4457 'CC': 'Cocos (Keeling) Islands',
4458 'CO': 'Colombia',
4459 'KM': 'Comoros',
4460 'CG': 'Congo',
4461 'CD': 'Congo, the Democratic Republic of the',
4462 'CK': 'Cook Islands',
4463 'CR': 'Costa Rica',
4464 'CI': 'Côte d\'Ivoire',
4465 'HR': 'Croatia',
4466 'CU': 'Cuba',
4467 'CW': 'Curaçao',
4468 'CY': 'Cyprus',
4469 'CZ': 'Czech Republic',
4470 'DK': 'Denmark',
4471 'DJ': 'Djibouti',
4472 'DM': 'Dominica',
4473 'DO': 'Dominican Republic',
4474 'EC': 'Ecuador',
4475 'EG': 'Egypt',
4476 'SV': 'El Salvador',
4477 'GQ': 'Equatorial Guinea',
4478 'ER': 'Eritrea',
4479 'EE': 'Estonia',
4480 'ET': 'Ethiopia',
4481 'FK': 'Falkland Islands (Malvinas)',
4482 'FO': 'Faroe Islands',
4483 'FJ': 'Fiji',
4484 'FI': 'Finland',
4485 'FR': 'France',
4486 'GF': 'French Guiana',
4487 'PF': 'French Polynesia',
4488 'TF': 'French Southern Territories',
4489 'GA': 'Gabon',
4490 'GM': 'Gambia',
4491 'GE': 'Georgia',
4492 'DE': 'Germany',
4493 'GH': 'Ghana',
4494 'GI': 'Gibraltar',
4495 'GR': 'Greece',
4496 'GL': 'Greenland',
4497 'GD': 'Grenada',
4498 'GP': 'Guadeloupe',
4499 'GU': 'Guam',
4500 'GT': 'Guatemala',
4501 'GG': 'Guernsey',
4502 'GN': 'Guinea',
4503 'GW': 'Guinea-Bissau',
4504 'GY': 'Guyana',
4505 'HT': 'Haiti',
4506 'HM': 'Heard Island and McDonald Islands',
4507 'VA': 'Holy See (Vatican City State)',
4508 'HN': 'Honduras',
4509 'HK': 'Hong Kong',
4510 'HU': 'Hungary',
4511 'IS': 'Iceland',
4512 'IN': 'India',
4513 'ID': 'Indonesia',
4514 'IR': 'Iran, Islamic Republic of',
4515 'IQ': 'Iraq',
4516 'IE': 'Ireland',
4517 'IM': 'Isle of Man',
4518 'IL': 'Israel',
4519 'IT': 'Italy',
4520 'JM': 'Jamaica',
4521 'JP': 'Japan',
4522 'JE': 'Jersey',
4523 'JO': 'Jordan',
4524 'KZ': 'Kazakhstan',
4525 'KE': 'Kenya',
4526 'KI': 'Kiribati',
4527 'KP': 'Korea, Democratic People\'s Republic of',
4528 'KR': 'Korea, Republic of',
4529 'KW': 'Kuwait',
4530 'KG': 'Kyrgyzstan',
4531 'LA': 'Lao People\'s Democratic Republic',
4532 'LV': 'Latvia',
4533 'LB': 'Lebanon',
4534 'LS': 'Lesotho',
4535 'LR': 'Liberia',
4536 'LY': 'Libya',
4537 'LI': 'Liechtenstein',
4538 'LT': 'Lithuania',
4539 'LU': 'Luxembourg',
4540 'MO': 'Macao',
4541 'MK': 'Macedonia, the Former Yugoslav Republic of',
4542 'MG': 'Madagascar',
4543 'MW': 'Malawi',
4544 'MY': 'Malaysia',
4545 'MV': 'Maldives',
4546 'ML': 'Mali',
4547 'MT': 'Malta',
4548 'MH': 'Marshall Islands',
4549 'MQ': 'Martinique',
4550 'MR': 'Mauritania',
4551 'MU': 'Mauritius',
4552 'YT': 'Mayotte',
4553 'MX': 'Mexico',
4554 'FM': 'Micronesia, Federated States of',
4555 'MD': 'Moldova, Republic of',
4556 'MC': 'Monaco',
4557 'MN': 'Mongolia',
4558 'ME': 'Montenegro',
4559 'MS': 'Montserrat',
4560 'MA': 'Morocco',
4561 'MZ': 'Mozambique',
4562 'MM': 'Myanmar',
4563 'NA': 'Namibia',
4564 'NR': 'Nauru',
4565 'NP': 'Nepal',
4566 'NL': 'Netherlands',
4567 'NC': 'New Caledonia',
4568 'NZ': 'New Zealand',
4569 'NI': 'Nicaragua',
4570 'NE': 'Niger',
4571 'NG': 'Nigeria',
4572 'NU': 'Niue',
4573 'NF': 'Norfolk Island',
4574 'MP': 'Northern Mariana Islands',
4575 'NO': 'Norway',
4576 'OM': 'Oman',
4577 'PK': 'Pakistan',
4578 'PW': 'Palau',
4579 'PS': 'Palestine, State of',
4580 'PA': 'Panama',
4581 'PG': 'Papua New Guinea',
4582 'PY': 'Paraguay',
4583 'PE': 'Peru',
4584 'PH': 'Philippines',
4585 'PN': 'Pitcairn',
4586 'PL': 'Poland',
4587 'PT': 'Portugal',
4588 'PR': 'Puerto Rico',
4589 'QA': 'Qatar',
4590 'RE': 'Réunion',
4591 'RO': 'Romania',
4592 'RU': 'Russian Federation',
4593 'RW': 'Rwanda',
4594 'BL': 'Saint Barthélemy',
4595 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4596 'KN': 'Saint Kitts and Nevis',
4597 'LC': 'Saint Lucia',
4598 'MF': 'Saint Martin (French part)',
4599 'PM': 'Saint Pierre and Miquelon',
4600 'VC': 'Saint Vincent and the Grenadines',
4601 'WS': 'Samoa',
4602 'SM': 'San Marino',
4603 'ST': 'Sao Tome and Principe',
4604 'SA': 'Saudi Arabia',
4605 'SN': 'Senegal',
4606 'RS': 'Serbia',
4607 'SC': 'Seychelles',
4608 'SL': 'Sierra Leone',
4609 'SG': 'Singapore',
4610 'SX': 'Sint Maarten (Dutch part)',
4611 'SK': 'Slovakia',
4612 'SI': 'Slovenia',
4613 'SB': 'Solomon Islands',
4614 'SO': 'Somalia',
4615 'ZA': 'South Africa',
4616 'GS': 'South Georgia and the South Sandwich Islands',
4617 'SS': 'South Sudan',
4618 'ES': 'Spain',
4619 'LK': 'Sri Lanka',
4620 'SD': 'Sudan',
4621 'SR': 'Suriname',
4622 'SJ': 'Svalbard and Jan Mayen',
4623 'SZ': 'Swaziland',
4624 'SE': 'Sweden',
4625 'CH': 'Switzerland',
4626 'SY': 'Syrian Arab Republic',
4627 'TW': 'Taiwan, Province of China',
4628 'TJ': 'Tajikistan',
4629 'TZ': 'Tanzania, United Republic of',
4630 'TH': 'Thailand',
4631 'TL': 'Timor-Leste',
4632 'TG': 'Togo',
4633 'TK': 'Tokelau',
4634 'TO': 'Tonga',
4635 'TT': 'Trinidad and Tobago',
4636 'TN': 'Tunisia',
4637 'TR': 'Turkey',
4638 'TM': 'Turkmenistan',
4639 'TC': 'Turks and Caicos Islands',
4640 'TV': 'Tuvalu',
4641 'UG': 'Uganda',
4642 'UA': 'Ukraine',
4643 'AE': 'United Arab Emirates',
4644 'GB': 'United Kingdom',
4645 'US': 'United States',
4646 'UM': 'United States Minor Outlying Islands',
4647 'UY': 'Uruguay',
4648 'UZ': 'Uzbekistan',
4649 'VU': 'Vanuatu',
4650 'VE': 'Venezuela, Bolivarian Republic of',
4651 'VN': 'Viet Nam',
4652 'VG': 'Virgin Islands, British',
4653 'VI': 'Virgin Islands, U.S.',
4654 'WF': 'Wallis and Futuna',
4655 'EH': 'Western Sahara',
4656 'YE': 'Yemen',
4657 'ZM': 'Zambia',
4658 'ZW': 'Zimbabwe',
2f97cc61 4659 # Not ISO 3166 codes, but used for IP blocks
4660 'AP': 'Asia/Pacific Region',
4661 'EU': 'Europe',
4eb10f66
YCH
4662 }
4663
4664 @classmethod
4665 def short2full(cls, code):
4666 """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name"""
4667 return cls._country_map.get(code.upper())
4668
4669
86e5f3ed 4670class GeoUtils:
773f291d
S
4671 # Major IPv4 address blocks per country
4672 _country_ip_map = {
53896ca5 4673 'AD': '46.172.224.0/19',
773f291d
S
4674 'AE': '94.200.0.0/13',
4675 'AF': '149.54.0.0/17',
4676 'AG': '209.59.64.0/18',
4677 'AI': '204.14.248.0/21',
4678 'AL': '46.99.0.0/16',
4679 'AM': '46.70.0.0/15',
4680 'AO': '105.168.0.0/13',
53896ca5
S
4681 'AP': '182.50.184.0/21',
4682 'AQ': '23.154.160.0/24',
773f291d
S
4683 'AR': '181.0.0.0/12',
4684 'AS': '202.70.112.0/20',
53896ca5 4685 'AT': '77.116.0.0/14',
773f291d
S
4686 'AU': '1.128.0.0/11',
4687 'AW': '181.41.0.0/18',
53896ca5
S
4688 'AX': '185.217.4.0/22',
4689 'AZ': '5.197.0.0/16',
773f291d
S
4690 'BA': '31.176.128.0/17',
4691 'BB': '65.48.128.0/17',
4692 'BD': '114.130.0.0/16',
4693 'BE': '57.0.0.0/8',
53896ca5 4694 'BF': '102.178.0.0/15',
773f291d
S
4695 'BG': '95.42.0.0/15',
4696 'BH': '37.131.0.0/17',
4697 'BI': '154.117.192.0/18',
4698 'BJ': '137.255.0.0/16',
53896ca5 4699 'BL': '185.212.72.0/23',
773f291d
S
4700 'BM': '196.12.64.0/18',
4701 'BN': '156.31.0.0/16',
4702 'BO': '161.56.0.0/16',
4703 'BQ': '161.0.80.0/20',
53896ca5 4704 'BR': '191.128.0.0/12',
773f291d
S
4705 'BS': '24.51.64.0/18',
4706 'BT': '119.2.96.0/19',
4707 'BW': '168.167.0.0/16',
4708 'BY': '178.120.0.0/13',
4709 'BZ': '179.42.192.0/18',
4710 'CA': '99.224.0.0/11',
4711 'CD': '41.243.0.0/16',
53896ca5
S
4712 'CF': '197.242.176.0/21',
4713 'CG': '160.113.0.0/16',
773f291d 4714 'CH': '85.0.0.0/13',
53896ca5 4715 'CI': '102.136.0.0/14',
773f291d
S
4716 'CK': '202.65.32.0/19',
4717 'CL': '152.172.0.0/14',
53896ca5 4718 'CM': '102.244.0.0/14',
773f291d
S
4719 'CN': '36.128.0.0/10',
4720 'CO': '181.240.0.0/12',
4721 'CR': '201.192.0.0/12',
4722 'CU': '152.206.0.0/15',
4723 'CV': '165.90.96.0/19',
4724 'CW': '190.88.128.0/17',
53896ca5 4725 'CY': '31.153.0.0/16',
773f291d
S
4726 'CZ': '88.100.0.0/14',
4727 'DE': '53.0.0.0/8',
4728 'DJ': '197.241.0.0/17',
4729 'DK': '87.48.0.0/12',
4730 'DM': '192.243.48.0/20',
4731 'DO': '152.166.0.0/15',
4732 'DZ': '41.96.0.0/12',
4733 'EC': '186.68.0.0/15',
4734 'EE': '90.190.0.0/15',
4735 'EG': '156.160.0.0/11',
4736 'ER': '196.200.96.0/20',
4737 'ES': '88.0.0.0/11',
4738 'ET': '196.188.0.0/14',
4739 'EU': '2.16.0.0/13',
4740 'FI': '91.152.0.0/13',
4741 'FJ': '144.120.0.0/16',
53896ca5 4742 'FK': '80.73.208.0/21',
773f291d
S
4743 'FM': '119.252.112.0/20',
4744 'FO': '88.85.32.0/19',
4745 'FR': '90.0.0.0/9',
4746 'GA': '41.158.0.0/15',
4747 'GB': '25.0.0.0/8',
4748 'GD': '74.122.88.0/21',
4749 'GE': '31.146.0.0/16',
4750 'GF': '161.22.64.0/18',
4751 'GG': '62.68.160.0/19',
53896ca5
S
4752 'GH': '154.160.0.0/12',
4753 'GI': '95.164.0.0/16',
773f291d
S
4754 'GL': '88.83.0.0/19',
4755 'GM': '160.182.0.0/15',
4756 'GN': '197.149.192.0/18',
4757 'GP': '104.250.0.0/19',
4758 'GQ': '105.235.224.0/20',
4759 'GR': '94.64.0.0/13',
4760 'GT': '168.234.0.0/16',
4761 'GU': '168.123.0.0/16',
4762 'GW': '197.214.80.0/20',
4763 'GY': '181.41.64.0/18',
4764 'HK': '113.252.0.0/14',
4765 'HN': '181.210.0.0/16',
4766 'HR': '93.136.0.0/13',
4767 'HT': '148.102.128.0/17',
4768 'HU': '84.0.0.0/14',
4769 'ID': '39.192.0.0/10',
4770 'IE': '87.32.0.0/12',
4771 'IL': '79.176.0.0/13',
4772 'IM': '5.62.80.0/20',
4773 'IN': '117.192.0.0/10',
4774 'IO': '203.83.48.0/21',
4775 'IQ': '37.236.0.0/14',
4776 'IR': '2.176.0.0/12',
4777 'IS': '82.221.0.0/16',
4778 'IT': '79.0.0.0/10',
4779 'JE': '87.244.64.0/18',
4780 'JM': '72.27.0.0/17',
4781 'JO': '176.29.0.0/16',
53896ca5 4782 'JP': '133.0.0.0/8',
773f291d
S
4783 'KE': '105.48.0.0/12',
4784 'KG': '158.181.128.0/17',
4785 'KH': '36.37.128.0/17',
4786 'KI': '103.25.140.0/22',
4787 'KM': '197.255.224.0/20',
53896ca5 4788 'KN': '198.167.192.0/19',
773f291d
S
4789 'KP': '175.45.176.0/22',
4790 'KR': '175.192.0.0/10',
4791 'KW': '37.36.0.0/14',
4792 'KY': '64.96.0.0/15',
4793 'KZ': '2.72.0.0/13',
4794 'LA': '115.84.64.0/18',
4795 'LB': '178.135.0.0/16',
53896ca5 4796 'LC': '24.92.144.0/20',
773f291d
S
4797 'LI': '82.117.0.0/19',
4798 'LK': '112.134.0.0/15',
53896ca5 4799 'LR': '102.183.0.0/16',
773f291d
S
4800 'LS': '129.232.0.0/17',
4801 'LT': '78.56.0.0/13',
4802 'LU': '188.42.0.0/16',
4803 'LV': '46.109.0.0/16',
4804 'LY': '41.252.0.0/14',
4805 'MA': '105.128.0.0/11',
4806 'MC': '88.209.64.0/18',
4807 'MD': '37.246.0.0/16',
4808 'ME': '178.175.0.0/17',
4809 'MF': '74.112.232.0/21',
4810 'MG': '154.126.0.0/17',
4811 'MH': '117.103.88.0/21',
4812 'MK': '77.28.0.0/15',
4813 'ML': '154.118.128.0/18',
4814 'MM': '37.111.0.0/17',
4815 'MN': '49.0.128.0/17',
4816 'MO': '60.246.0.0/16',
4817 'MP': '202.88.64.0/20',
4818 'MQ': '109.203.224.0/19',
4819 'MR': '41.188.64.0/18',
4820 'MS': '208.90.112.0/22',
4821 'MT': '46.11.0.0/16',
4822 'MU': '105.16.0.0/12',
4823 'MV': '27.114.128.0/18',
53896ca5 4824 'MW': '102.70.0.0/15',
773f291d
S
4825 'MX': '187.192.0.0/11',
4826 'MY': '175.136.0.0/13',
4827 'MZ': '197.218.0.0/15',
4828 'NA': '41.182.0.0/16',
4829 'NC': '101.101.0.0/18',
4830 'NE': '197.214.0.0/18',
4831 'NF': '203.17.240.0/22',
4832 'NG': '105.112.0.0/12',
4833 'NI': '186.76.0.0/15',
4834 'NL': '145.96.0.0/11',
4835 'NO': '84.208.0.0/13',
4836 'NP': '36.252.0.0/15',
4837 'NR': '203.98.224.0/19',
4838 'NU': '49.156.48.0/22',
4839 'NZ': '49.224.0.0/14',
4840 'OM': '5.36.0.0/15',
4841 'PA': '186.72.0.0/15',
4842 'PE': '186.160.0.0/14',
4843 'PF': '123.50.64.0/18',
4844 'PG': '124.240.192.0/19',
4845 'PH': '49.144.0.0/13',
4846 'PK': '39.32.0.0/11',
4847 'PL': '83.0.0.0/11',
4848 'PM': '70.36.0.0/20',
4849 'PR': '66.50.0.0/16',
4850 'PS': '188.161.0.0/16',
4851 'PT': '85.240.0.0/13',
4852 'PW': '202.124.224.0/20',
4853 'PY': '181.120.0.0/14',
4854 'QA': '37.210.0.0/15',
53896ca5 4855 'RE': '102.35.0.0/16',
773f291d 4856 'RO': '79.112.0.0/13',
53896ca5 4857 'RS': '93.86.0.0/15',
773f291d 4858 'RU': '5.136.0.0/13',
53896ca5 4859 'RW': '41.186.0.0/16',
773f291d
S
4860 'SA': '188.48.0.0/13',
4861 'SB': '202.1.160.0/19',
4862 'SC': '154.192.0.0/11',
53896ca5 4863 'SD': '102.120.0.0/13',
773f291d 4864 'SE': '78.64.0.0/12',
53896ca5 4865 'SG': '8.128.0.0/10',
773f291d
S
4866 'SI': '188.196.0.0/14',
4867 'SK': '78.98.0.0/15',
53896ca5 4868 'SL': '102.143.0.0/17',
773f291d
S
4869 'SM': '89.186.32.0/19',
4870 'SN': '41.82.0.0/15',
53896ca5 4871 'SO': '154.115.192.0/18',
773f291d
S
4872 'SR': '186.179.128.0/17',
4873 'SS': '105.235.208.0/21',
4874 'ST': '197.159.160.0/19',
4875 'SV': '168.243.0.0/16',
4876 'SX': '190.102.0.0/20',
4877 'SY': '5.0.0.0/16',
4878 'SZ': '41.84.224.0/19',
4879 'TC': '65.255.48.0/20',
4880 'TD': '154.68.128.0/19',
4881 'TG': '196.168.0.0/14',
4882 'TH': '171.96.0.0/13',
4883 'TJ': '85.9.128.0/18',
4884 'TK': '27.96.24.0/21',
4885 'TL': '180.189.160.0/20',
4886 'TM': '95.85.96.0/19',
4887 'TN': '197.0.0.0/11',
4888 'TO': '175.176.144.0/21',
4889 'TR': '78.160.0.0/11',
4890 'TT': '186.44.0.0/15',
4891 'TV': '202.2.96.0/19',
4892 'TW': '120.96.0.0/11',
4893 'TZ': '156.156.0.0/14',
53896ca5
S
4894 'UA': '37.52.0.0/14',
4895 'UG': '102.80.0.0/13',
4896 'US': '6.0.0.0/8',
773f291d 4897 'UY': '167.56.0.0/13',
53896ca5 4898 'UZ': '84.54.64.0/18',
773f291d 4899 'VA': '212.77.0.0/19',
53896ca5 4900 'VC': '207.191.240.0/21',
773f291d 4901 'VE': '186.88.0.0/13',
53896ca5 4902 'VG': '66.81.192.0/20',
773f291d
S
4903 'VI': '146.226.0.0/16',
4904 'VN': '14.160.0.0/11',
4905 'VU': '202.80.32.0/20',
4906 'WF': '117.20.32.0/21',
4907 'WS': '202.4.32.0/19',
4908 'YE': '134.35.0.0/16',
4909 'YT': '41.242.116.0/22',
4910 'ZA': '41.0.0.0/11',
53896ca5
S
4911 'ZM': '102.144.0.0/13',
4912 'ZW': '102.177.192.0/18',
773f291d
S
4913 }
4914
4915 @classmethod
5f95927a
S
4916 def random_ipv4(cls, code_or_block):
4917 if len(code_or_block) == 2:
4918 block = cls._country_ip_map.get(code_or_block.upper())
4919 if not block:
4920 return None
4921 else:
4922 block = code_or_block
773f291d 4923 addr, preflen = block.split('/')
ac668111 4924 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4925 addr_max = addr_min | (0xffffffff >> int(preflen))
14f25df2 4926 return str(socket.inet_ntoa(
ac668111 4927 struct.pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4928
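# Illustrative use of the classmethod above (the enclosing class is GeoUtils in
# yt-dlp; the returned address is random within the chosen block, so the values
# below are only examples). 'JP' maps to 133.0.0.0/8 in the table above:
#   >>> GeoUtils.random_ipv4('JP')           # two-letter country code
#   '133.27.184.102'
#   >>> GeoUtils.random_ipv4('10.0.0.0/8')   # or an explicit CIDR block
#   '10.62.195.7'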
4929
ac668111 4930class PerRequestProxyHandler(urllib.request.ProxyHandler):
2461f79d
PH
4931 def __init__(self, proxies=None):
4932 # Set default handlers
4933 for type in ('http', 'https'):
4934 setattr(self, '%s_open' % type,
4935 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4936 meth(r, proxy, type))
ac668111 4937 urllib.request.ProxyHandler.__init__(self, proxies)
2461f79d 4938
91410c9b 4939 def proxy_open(self, req, proxy, type):
2461f79d 4940 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
4941 if req_proxy is not None:
4942 proxy = req_proxy
2461f79d
PH
4943 del req.headers['Ytdl-request-proxy']
4944
4945 if proxy == '__noproxy__':
4946 return None # No Proxy
14f25df2 4947 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188 4948 req.add_header('Ytdl-socks-proxy', proxy)
7a5c1cfe 4949 # yt-dlp's http/https handlers do the actual wrapping of the socket with SOCKS

71aff188 4950 return None
ac668111 4951 return urllib.request.ProxyHandler.proxy_open(
91410c9b 4952 self, req, proxy, type)
5bc880b9
YCH
4953
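# Sketch of how the per-request proxy override is meant to be used (the proxy
# URL and target URL are placeholders, not real endpoints):
#   >>> opener = urllib.request.build_opener(PerRequestProxyHandler({'http': 'http://127.0.0.1:3128'}))
#   >>> req = urllib.request.Request('http://example.com/')
#   >>> req.add_header('Ytdl-request-proxy', '__noproxy__')  # bypass the default proxy for this request only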
4954
0a5445dd
YCH
4955# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4956# released into Public Domain
4957# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4958
4959def long_to_bytes(n, blocksize=0):
4960 """long_to_bytes(n:long, blocksize:int) : string
4961 Convert a long integer to a byte string.
4962
4963 If optional blocksize is given and greater than zero, pad the front of the
4964 byte string with binary zeros so that the length is a multiple of
4965 blocksize.
4966 """
4967 # after much testing, this algorithm was deemed to be the fastest
4968 s = b''
4969 n = int(n)
4970 while n > 0:
ac668111 4971 s = struct.pack('>I', n & 0xffffffff) + s
0a5445dd
YCH
4972 n = n >> 32
4973 # strip off leading zeros
4974 for i in range(len(s)):
4975 if s[i] != b'\000'[0]:
4976 break
4977 else:
4978 # only happens when n == 0
4979 s = b'\000'
4980 i = 0
4981 s = s[i:]
4982 # add back some pad bytes. this could be done more efficiently w.r.t. the
4983 # de-padding being done above, but sigh...
4984 if blocksize > 0 and len(s) % blocksize:
4985 s = (blocksize - len(s) % blocksize) * b'\000' + s
4986 return s
4987
4988
4989def bytes_to_long(s):
4990 """bytes_to_long(string) : long
4991 Convert a byte string to a long integer.
4992
4993 This is (essentially) the inverse of long_to_bytes().
4994 """
4995 acc = 0
4996 length = len(s)
4997 if length % 4:
4998 extra = (4 - length % 4)
4999 s = b'\000' * extra + s
5000 length = length + extra
5001 for i in range(0, length, 4):
ac668111 5002 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
0a5445dd
YCH
5003 return acc
5004
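# Round-trip sketch for the two PyCrypto-derived helpers above:
#   >>> long_to_bytes(65537)
#   b'\x01\x00\x01'
#   >>> bytes_to_long(b'\x01\x00\x01')
#   65537
#   >>> long_to_bytes(65537, blocksize=4)   # front-padded to a multiple of 4 bytes
#   b'\x00\x01\x00\x01'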
5005
5bc880b9
YCH
5006def ohdave_rsa_encrypt(data, exponent, modulus):
5007 '''
5008 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
5009
5010 Input:
5011 data: data to encrypt, bytes-like object
5012 exponent, modulus: parameter e and N of RSA algorithm, both integer
5013 Output: hex string of encrypted data
5014
5015 Limitation: supports one block encryption only
5016 '''
5017
5018 payload = int(binascii.hexlify(data[::-1]), 16)
5019 encrypted = pow(payload, exponent, modulus)
5020 return '%x' % encrypted
81bdc8fd
YCH
5021
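# Toy example with deliberately tiny, insecure parameters (e=7, N=187), just to
# show the data flow: the payload is reversed, hex-decoded and exponentiated:
#   >>> ohdave_rsa_encrypt(b'\x02', 7, 187)   # pow(2, 7, 187) == 128
#   '80'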
5022
f48409c7
YCH
5023def pkcs1pad(data, length):
5024 """
5025 Padding input data with PKCS#1 scheme
5026
5027 @param {int[]} data input data
5028 @param {int} length target length
5029 @returns {int[]} padded data
5030 """
5031 if len(data) > length - 11:
5032 raise ValueError('Input data too long for PKCS#1 padding')
5033
5034 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # padding octets must be nonzero per PKCS#1 v1.5
5035 return [0, 2] + pseudo_random + [0] + data
5036
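# Resulting layout sketch (the random filler octets differ on every call):
#   >>> padded = pkcs1pad([1, 2, 3], 16)
#   >>> padded[:2], padded[-4:], len(padded)
#   ([0, 2], [0, 1, 2, 3], 16)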
5037
7b2c3f47 5038def _base_n_table(n, table):
5039 if not table and not n:
5040 raise ValueError('Either table or n must be specified')
612f2be5 5041 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
5042
44f14eb4 5043 if n and n != len(table):
612f2be5 5044 raise ValueError(f'base {n} exceeds table length {len(table)}')
5045 return table
59f898b7 5046
5eb6bdce 5047
7b2c3f47 5048def encode_base_n(num, n=None, table=None):
5049 """Convert given int to a base-n string"""
612f2be5 5050 table = _base_n_table(n, table)
7b2c3f47 5051 if not num:
5eb6bdce
YCH
5052 return table[0]
5053
7b2c3f47 5054 result, base = '', len(table)
81bdc8fd 5055 while num:
7b2c3f47 5056 result = table[num % base] + result
612f2be5 5057 num = num // base
7b2c3f47 5058 return result
5059
5060
5061def decode_base_n(string, n=None, table=None):
5062 """Convert given base-n string to int"""
5063 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
5064 result, base = 0, len(table)
5065 for char in string:
5066 result = result * base + table[char]
5067 return result
5068
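# The two base-n helpers above invert each other for any shared base/table:
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> decode_base_n('ff', 16)
#   255
#   >>> encode_base_n(11, table='abc')   # custom 3-symbol alphabet
#   'bac'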
5069
5070def decode_base(value, digits):
da4db748 5071 deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed '
5072 f'in a future version. Use {__name__}.decode_base_n instead')
7b2c3f47 5073 return decode_base_n(value, table=digits)
f52354a8
YCH
5074
5075
5076def decode_packed_codes(code):
06b3fe29 5077 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 5078 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
5079 base = int(base)
5080 count = int(count)
5081 symbols = symbols.split('|')
5082 symbol_table = {}
5083
5084 while count:
5085 count -= 1
5eb6bdce 5086 base_n_count = encode_base_n(count, base)
f52354a8
YCH
5087 symbol_table[base_n_count] = symbols[count] or base_n_count
5088
5089 return re.sub(
5090 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 5091 obfuscated_code)
e154c651 5092
5093
1ced2221
S
5094def caesar(s, alphabet, shift):
5095 if shift == 0:
5096 return s
5097 l = len(alphabet)
5098 return ''.join(
5099 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
5100 for c in s)
5101
5102
5103def rot47(s):
5104 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
5105
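# rot47 is its own inverse over the printable ASCII alphabet used above:
#   >>> rot47('Hello')
#   'w6==@'
#   >>> rot47(rot47('Hello'))
#   'Hello'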
5106
e154c651 5107def parse_m3u8_attributes(attrib):
5108 info = {}
5109 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
5110 if val.startswith('"'):
5111 val = val[1:-1]
5112 info[key] = val
5113 return info
1143535d
YCH
5114
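# Example attribute string as found on an EXT-X-STREAM-INF line:
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401f"')
#   {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401f'}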
5115
5116def urshift(val, n):
5117 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
5118
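# urshift emulates JavaScript's unsigned right shift (>>>) on 32-bit values:
#   >>> urshift(-1, 28)
#   15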
5119
5120# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 5121# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
5122def decode_png(png_data):
5123 # Reference: https://www.w3.org/TR/PNG/
5124 header = png_data[8:]
5125
5126 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
86e5f3ed 5127 raise OSError('Not a valid PNG file.')
d3f8e038
YCH
5128
5129 int_map = {1: '>B', 2: '>H', 4: '>I'}
ac668111 5130 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
d3f8e038
YCH
5131
5132 chunks = []
5133
5134 while header:
5135 length = unpack_integer(header[:4])
5136 header = header[4:]
5137
5138 chunk_type = header[:4]
5139 header = header[4:]
5140
5141 chunk_data = header[:length]
5142 header = header[length:]
5143
5144 header = header[4:] # Skip CRC
5145
5146 chunks.append({
5147 'type': chunk_type,
5148 'length': length,
5149 'data': chunk_data
5150 })
5151
5152 ihdr = chunks[0]['data']
5153
5154 width = unpack_integer(ihdr[:4])
5155 height = unpack_integer(ihdr[4:8])
5156
5157 idat = b''
5158
5159 for chunk in chunks:
5160 if chunk['type'] == b'IDAT':
5161 idat += chunk['data']
5162
5163 if not idat:
86e5f3ed 5164 raise OSError('Unable to read PNG data.')
d3f8e038
YCH
5165
5166 decompressed_data = bytearray(zlib.decompress(idat))
5167
5168 stride = width * 3
5169 pixels = []
5170
5171 def _get_pixel(idx):
5172 x = idx % stride
5173 y = idx // stride
5174 return pixels[y][x]
5175
5176 for y in range(height):
5177 basePos = y * (1 + stride)
5178 filter_type = decompressed_data[basePos]
5179
5180 current_row = []
5181
5182 pixels.append(current_row)
5183
5184 for x in range(stride):
5185 color = decompressed_data[1 + basePos + x]
5186 basex = y * stride + x
5187 left = 0
5188 up = 0
5189
5190 if x > 2:
5191 left = _get_pixel(basex - 3)
5192 if y > 0:
5193 up = _get_pixel(basex - stride)
5194
5195 if filter_type == 1: # Sub
5196 color = (color + left) & 0xff
5197 elif filter_type == 2: # Up
5198 color = (color + up) & 0xff
5199 elif filter_type == 3: # Average
5200 color = (color + ((left + up) >> 1)) & 0xff
5201 elif filter_type == 4: # Paeth
5202 a = left
5203 b = up
5204 c = 0
5205
5206 if x > 2 and y > 0:
5207 c = _get_pixel(basex - stride - 3)
5208
5209 p = a + b - c
5210
5211 pa = abs(p - a)
5212 pb = abs(p - b)
5213 pc = abs(p - c)
5214
5215 if pa <= pb and pa <= pc:
5216 color = (color + a) & 0xff
5217 elif pb <= pc:
5218 color = (color + b) & 0xff
5219 else:
5220 color = (color + c) & 0xff
5221
5222 current_row.append(color)
5223
5224 return width, height, pixels
efa97bdc
YCH
5225
5226
5227def write_xattr(path, key, value):
6f7563be 5228 # Windows: Write xattrs to NTFS Alternate Data Streams:
5229 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5230 if compat_os_name == 'nt':
5231 assert ':' not in key
5232 assert os.path.exists(path)
efa97bdc
YCH
5233
5234 try:
6f7563be 5235 with open(f'{path}:{key}', 'wb') as f:
5236 f.write(value)
86e5f3ed 5237 except OSError as e:
efa97bdc 5238 raise XAttrMetadataError(e.errno, e.strerror)
6f7563be 5239 return
efa97bdc 5240
6f7563be 5241 # UNIX Method 1. Use xattrs/pyxattrs modules
efa97bdc 5242
6f7563be 5243 setxattr = None
5244 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5245 # Unicode arguments are not supported in pyxattr until version 0.5.0
5246 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5247 if version_tuple(xattr.__version__) >= (0, 5, 0):
5248 setxattr = xattr.set
5249 elif xattr:
5250 setxattr = xattr.setxattr
efa97bdc 5251
6f7563be 5252 if setxattr:
5253 try:
5254 setxattr(path, key, value)
5255 except OSError as e:
5256 raise XAttrMetadataError(e.errno, e.strerror)
5257 return
efa97bdc 5258
6f7563be 5259 # UNIX Method 2. Use setfattr/xattr executables
5260 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5261 else 'xattr' if check_executable('xattr', ['-h']) else None)
5262 if not exe:
5263 raise XAttrUnavailableError(
5264 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5265 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
efa97bdc 5266
0f06bcd7 5267 value = value.decode()
6f7563be 5268 try:
f0c9fb96 5269 _, stderr, returncode = Popen.run(
6f7563be 5270 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
e121e3ce 5271 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
6f7563be 5272 except OSError as e:
5273 raise XAttrMetadataError(e.errno, e.strerror)
f0c9fb96 5274 if returncode:
5275 raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
5276
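# Typical call (illustrative path and key; the value must be bytes):
#   write_xattr('video.mp4', 'user.xdg.referrer.url', b'https://example.com/watch?v=xyz')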
5277
5278def random_birthday(year_field, month_field, day_field):
aa374bc7
AS
5279 start_date = datetime.date(1950, 1, 1)
5280 end_date = datetime.date(1995, 12, 31)
5281 offset = random.randint(0, (end_date - start_date).days)
5282 random_date = start_date + datetime.timedelta(offset)
0c265486 5283 return {
aa374bc7
AS
5284 year_field: str(random_date.year),
5285 month_field: str(random_date.month),
5286 day_field: str(random_date.day),
0c265486 5287 }
732044af 5288
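# Example call with illustrative field names; the date itself is random within
# 1950-1995, so the output below is just one possible result:
#   >>> random_birthday('birth_year', 'birth_month', 'birth_day')
#   {'birth_year': '1984', 'birth_month': '7', 'birth_day': '21'}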
c76eb41b 5289
8c53322c
L
5290def find_available_port(interface=''):
5291 try:
5292 with socket.socket() as sock:
5293 sock.bind((interface, 0))
5294 return sock.getsockname()[1]
5295 except OSError:
5296 return None
5297
5298
732044af 5299# Templates for internet shortcut files, which are plain text files.
e5a998f3 5300DOT_URL_LINK_TEMPLATE = '''\
732044af 5301[InternetShortcut]
5302URL=%(url)s
e5a998f3 5303'''
732044af 5304
e5a998f3 5305DOT_WEBLOC_LINK_TEMPLATE = '''\
732044af 5306<?xml version="1.0" encoding="UTF-8"?>
5307<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5308<plist version="1.0">
5309<dict>
5310\t<key>URL</key>
5311\t<string>%(url)s</string>
5312</dict>
5313</plist>
e5a998f3 5314'''
732044af 5315
e5a998f3 5316DOT_DESKTOP_LINK_TEMPLATE = '''\
732044af 5317[Desktop Entry]
5318Encoding=UTF-8
5319Name=%(filename)s
5320Type=Link
5321URL=%(url)s
5322Icon=text-html
e5a998f3 5323'''
732044af 5324
08438d2c 5325LINK_TEMPLATES = {
5326 'url': DOT_URL_LINK_TEMPLATE,
5327 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5328 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5329}
5330
732044af 5331
5332def iri_to_uri(iri):
5333 """
5334 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5335
5336 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5337 """
5338
14f25df2 5339 iri_parts = urllib.parse.urlparse(iri)
732044af 5340
5341 if '[' in iri_parts.netloc:
5342 raise ValueError('IPv6 URIs are not yet supported.')
5343 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5344
5345 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5346
5347 net_location = ''
5348 if iri_parts.username:
f9934b96 5349 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
732044af 5350 if iri_parts.password is not None:
f9934b96 5351 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
732044af 5352 net_location += '@'
5353
0f06bcd7 5354 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
732044af 5355 # The 'idna' encoding produces ASCII text.
5356 if iri_parts.port is not None and iri_parts.port != 80:
5357 net_location += ':' + str(iri_parts.port)
5358
f9934b96 5359 return urllib.parse.urlunparse(
732044af 5360 (iri_parts.scheme,
5361 net_location,
5362
f9934b96 5363 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5364
5365 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
f9934b96 5366 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5367
5368 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
f9934b96 5369 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
732044af 5370
f9934b96 5371 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
732044af 5372
5373 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5374
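# Example with an illustrative URL: non-ASCII characters in the path and query
# are percent-encoded as UTF-8, the rest of the structure is left intact:
#   >>> iri_to_uri('https://example.com/dänish?q=wörd')
#   'https://example.com/d%C3%A4nish?q=w%C3%B6rd'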
5375
5376def to_high_limit_path(path):
5377 if sys.platform in ['win32', 'cygwin']:
5378 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
e5a998f3 5379 return '\\\\?\\' + os.path.abspath(path)
732044af 5380
5381 return path
76d321f6 5382
c76eb41b 5383
7b2c3f47 5384def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
e0ddbd02 5385 val = traverse_obj(obj, *variadic(field))
7b2c3f47 5386 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
e0ddbd02 5387 return default
7b2c3f47 5388 return template % func(val)
00dd0cd5 5389
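# Examples with a minimal stand-in for an info dict:
#   >>> format_field({'height': 1080}, 'height', '%sp')
#   '1080p'
#   >>> format_field({'height': None}, 'height', '%sp', default='unknown')
#   'unknown'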
5390
5391def clean_podcast_url(url):
5392 return re.sub(r'''(?x)
5393 (?:
5394 (?:
5395 chtbl\.com/track|
5396 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5397 play\.podtrac\.com
5398 )/[^/]+|
5399 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5400 flex\.acast\.com|
5401 pd(?:
5402 cn\.co| # https://podcorn.com/analytics-prefix/
5403 st\.fm # https://podsights.com/docs/
5404 )/e
5405 )/''', '', url)
ffcb8191
THD
5406
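# Example: a known tracking/redirect prefix is stripped, leaving the direct
# media URL (illustrative URL):
#   >>> clean_podcast_url('https://dts.podtrac.com/redirect.mp3/example.com/ep1.mp3')
#   'https://example.com/ep1.mp3'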
5407
5408_HEX_TABLE = '0123456789abcdef'
5409
5410
5411def random_uuidv4():
5412 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 5413
5414
5415def make_dir(path, to_screen=None):
5416 try:
5417 dn = os.path.dirname(path)
b25d6cb9
AI
5418 if dn:
5419 os.makedirs(dn, exist_ok=True)
0202b52a 5420 return True
86e5f3ed 5421 except OSError as err:
0202b52a 5422 if callable(to_screen):
5423 to_screen('unable to create directory ' + error_to_compat_str(err))
5424 return False
f74980cb 5425
5426
5427def get_executable_path():
b5899f4f 5428 from .update import _get_variant_and_executable_path
c487cf00 5429
b5899f4f 5430 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
f74980cb 5431
5432
8e40b9d1 5433def get_user_config_dirs(package_name):
8e40b9d1
M
5434 # .config (e.g. ~/.config/package_name)
5435 xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
773c272d 5436 yield os.path.join(xdg_config_home, package_name)
8e40b9d1
M
5437
5438 # appdata (%APPDATA%/package_name)
5439 appdata_dir = os.getenv('appdata')
5440 if appdata_dir:
773c272d 5441 yield os.path.join(appdata_dir, package_name)
8e40b9d1
M
5442
5443 # home (~/.package_name)
773c272d 5444 yield os.path.join(compat_expanduser('~'), f'.{package_name}')
8e40b9d1
M
5445
5446
5447def get_system_config_dirs(package_name):
8e40b9d1 5448 # /etc/package_name
773c272d 5449 yield os.path.join('/etc', package_name)
06167fbb 5450
5451
325ebc17 5452def traverse_obj(
f99bbfc9 5453 obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
325ebc17 5454 casesense=True, is_user_input=False, traverse_string=False):
ab029d7e
SS
5455 """
5456 Safely traverse nested `dict`s and `Sequence`s
5457
5458 >>> obj = [{}, {"key": "value"}]
5459 >>> traverse_obj(obj, (1, "key"))
5460 "value"
5461
5462 Each of the provided `paths` is tested and the first producing a valid result will be returned.
f99bbfc9 5463 The next path will also be tested if the path branched but no results could be found.
7b0127e1 5464 Supported values for traversal are `Mapping`, `Sequence` and `re.Match`.
6839ae1f 5465 Unhelpful values (`{}`, `None`) are treated as the absence of a value and discarded.
ab029d7e
SS
5466
5467 The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.
5468
5469 The keys in the path can be one of:
5470 - `None`: Return the current object.
776995bc
SS
5471 - `set`: Requires the only item in the set to be a type or function,
5472 like `{type}`/`{func}`. If a `type`, returns only values
5473 of this type. If a function, returns `func(obj)`.
8e174ba7 5474 - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`.
ab029d7e
SS
5475 - `slice`: Branch out and return all values in `obj[key]`.
5476 - `Ellipsis`: Branch out and return a list of all values.
5477 - `tuple`/`list`: Branch out and return a list of all matching values.
5478 Read as: `[traverse_obj(obj, branch) for branch in branches]`.
5479 - `function`: Branch out and return values filtered by the function.
5480 Read as: `[value for key, value in obj if function(key, value)]`.
5481 For `Sequence`s, `key` is the index of the value.
776995bc
SS
5482 For `re.Match`es, `key` is the group number (0 = full match)
5483 as well as additionally any group names, if given.
ab029d7e
SS
5484 - `dict` Transform the current object and return a matching dict.
5485 Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.
5486
7b0127e1 5487 `tuple`, `list`, and `dict` all support nested paths and branches.
ab029d7e
SS
5488
5489 @params paths Paths which to traverse by.
5490 @param default Value to return if the paths do not match.
b1bde57b
SS
5491 If the last key in the path is a `dict`, it will apply to each value inside
5492 the dict instead, depth first. Try to avoid this when using nested `dict` keys.
ab029d7e
SS
5493 @param expected_type If a `type`, only accept final values of this type.
5494 If any other callable, try to call the function on each result.
776995bc
SS
5495 If the last key in the path is a `dict`, it will apply to each value inside
5496 the dict instead, recursively. This does respect branching paths.
ab029d7e
SS
5497 @param get_all If `False`, return the first matching result, otherwise all matching ones.
5498 @param casesense If `False`, consider string dictionary keys as case insensitive.
5499
5500 The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API
5501
5502 @param is_user_input Whether the keys are generated from user input.
5503 If `True` strings get converted to `int`/`slice` if needed.
5504 @param traverse_string Whether to traverse into objects as strings.
5505 If `True`, any non-compatible object will first be
5506 converted into a string and then traversed into.
b1bde57b
SS
5507 The return value of that path will be a string instead,
5508 not respecting any further branching.
ab029d7e
SS
5509
5510
5511 @returns The result of the object traversal.
5512 If successful, `get_all=True`, and the path branches at least once,
5513 then a list of results is returned instead.
b1bde57b
SS
5514 If no `default` is given and the last path branches, a `list` of results
5515 is always returned. If a path ends on a `dict` that result will always be a `dict`.
ab029d7e
SS
5516 """
5517 is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes))
5518 casefold = lambda k: k.casefold() if isinstance(k, str) else k
325ebc17 5519
352d63fd 5520 if isinstance(expected_type, type):
5521 type_test = lambda val: val if isinstance(val, expected_type) else None
352d63fd 5522 else:
ab029d7e
SS
5523 type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,))
5524
b1bde57b
SS
5525 def apply_key(key, obj, is_last):
5526 branching = False
5527 result = None
5528
6839ae1f 5529 if obj is None and traverse_string:
b1bde57b 5530 pass
ab029d7e
SS
5531
5532 elif key is None:
b1bde57b 5533 result = obj
ab029d7e 5534
776995bc
SS
5535 elif isinstance(key, set):
5536 assert len(key) == 1, 'Set should only be used to wrap a single item'
5537 item = next(iter(key))
5538 if isinstance(item, type):
5539 if isinstance(obj, item):
b1bde57b 5540 result = obj
776995bc 5541 else:
b1bde57b 5542 result = try_call(item, args=(obj,))
776995bc 5543
ab029d7e 5544 elif isinstance(key, (list, tuple)):
b1bde57b
SS
5545 branching = True
5546 result = itertools.chain.from_iterable(
5547 apply_path(obj, branch, is_last)[0] for branch in key)
ab029d7e
SS
5548
5549 elif key is ...:
b1bde57b 5550 branching = True
ab029d7e 5551 if isinstance(obj, collections.abc.Mapping):
b1bde57b 5552 result = obj.values()
ab029d7e 5553 elif is_sequence(obj):
b1bde57b 5554 result = obj
7b0127e1 5555 elif isinstance(obj, re.Match):
b1bde57b 5556 result = obj.groups()
ab029d7e 5557 elif traverse_string:
b1bde57b
SS
5558 branching = False
5559 result = str(obj)
5560 else:
5561 result = ()
ab029d7e
SS
5562
5563 elif callable(key):
b1bde57b
SS
5564 branching = True
5565 if isinstance(obj, collections.abc.Mapping):
ab029d7e 5566 iter_obj = obj.items()
b1bde57b
SS
5567 elif is_sequence(obj):
5568 iter_obj = enumerate(obj)
7b0127e1 5569 elif isinstance(obj, re.Match):
776995bc
SS
5570 iter_obj = itertools.chain(
5571 enumerate((obj.group(), *obj.groups())),
5572 obj.groupdict().items())
ab029d7e 5573 elif traverse_string:
b1bde57b 5574 branching = False
ab029d7e 5575 iter_obj = enumerate(str(obj))
352d63fd 5576 else:
b1bde57b
SS
5577 iter_obj = ()
5578
5579 result = (v for k, v in iter_obj if try_call(key, args=(k, v)))
5580 if not branching: # string traversal
5581 result = ''.join(result)
ab029d7e
SS
5582
5583 elif isinstance(key, dict):
b1bde57b
SS
5584 iter_obj = ((k, _traverse_obj(obj, v, False, is_last)) for k, v in key.items())
5585 result = {
5586 k: v if v is not None else default for k, v in iter_obj
5587 if v is not None or default is not NO_DEFAULT
5588 } or None
ab029d7e 5589
7b0127e1 5590 elif isinstance(obj, collections.abc.Mapping):
b1bde57b
SS
5591 result = (obj.get(key) if casesense or (key in obj) else
5592 next((v for k, v in obj.items() if casefold(k) == key), None))
ab029d7e 5593
7b0127e1
SS
5594 elif isinstance(obj, re.Match):
5595 if isinstance(key, int) or casesense:
5596 with contextlib.suppress(IndexError):
b1bde57b 5597 result = obj.group(key)
7b0127e1 5598
b1bde57b
SS
5599 elif isinstance(key, str):
5600 result = next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)
ab029d7e 5601
b1bde57b 5602 elif isinstance(key, (int, slice)):
6839ae1f 5603 if is_sequence(obj):
b1bde57b
SS
5604 branching = isinstance(key, slice)
5605 with contextlib.suppress(IndexError):
5606 result = obj[key]
6839ae1f
SS
5607 elif traverse_string:
5608 with contextlib.suppress(IndexError):
5609 result = str(obj)[key]
ab029d7e 5610
b1bde57b 5611 return branching, result if branching else (result,)
ab029d7e 5612
776995bc
SS
5613 def lazy_last(iterable):
5614 iterator = iter(iterable)
5615 prev = next(iterator, NO_DEFAULT)
5616 if prev is NO_DEFAULT:
5617 return
5618
5619 for item in iterator:
5620 yield False, prev
5621 prev = item
5622
5623 yield True, prev
5624
b1bde57b 5625 def apply_path(start_obj, path, test_type):
ab029d7e
SS
5626 objs = (start_obj,)
5627 has_branched = False
5628
776995bc
SS
5629 key = None
5630 for last, key in lazy_last(variadic(path, (str, bytes, dict, set))):
b1bde57b
SS
5631 if is_user_input and isinstance(key, str):
5632 if key == ':':
5633 key = ...
5634 elif ':' in key:
5635 key = slice(*map(int_or_none, key.split(':')))
5636 elif int_or_none(key) is not None:
5637 key = int(key)
ab029d7e
SS
5638
5639 if not casesense and isinstance(key, str):
5640 key = key.casefold()
5641
776995bc
SS
5642 if __debug__ and callable(key):
5643 # Verify function signature
5644 inspect.signature(key).bind(None, None)
5645
b1bde57b
SS
5646 new_objs = []
5647 for obj in objs:
5648 branching, results = apply_key(key, obj, last)
5649 has_branched |= branching
5650 new_objs.append(results)
5651
5652 objs = itertools.chain.from_iterable(new_objs)
ab029d7e 5653
776995bc
SS
5654 if test_type and not isinstance(key, (dict, list, tuple)):
5655 objs = map(type_test, objs)
5656
b1bde57b 5657 return objs, has_branched, isinstance(key, dict)
ab029d7e 5658
b1bde57b
SS
5659 def _traverse_obj(obj, path, allow_empty, test_type):
5660 results, has_branched, is_dict = apply_path(obj, path, test_type)
6839ae1f 5661 results = LazyList(item for item in results if item not in (None, {}))
f99bbfc9 5662 if get_all and has_branched:
b1bde57b
SS
5663 if results:
5664 return results.exhaust()
5665 if allow_empty:
5666 return [] if default is NO_DEFAULT else default
5667 return None
f99bbfc9 5668
b1bde57b 5669 return results[0] if results else {} if allow_empty and is_dict else None
f99bbfc9
SS
5670
5671 for index, path in enumerate(paths, 1):
b1bde57b 5672 result = _traverse_obj(obj, path, index == len(paths), True)
ab029d7e
SS
5673 if result is not None:
5674 return result
5675
f99bbfc9 5676 return None if default is NO_DEFAULT else default
324ad820 5677
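# A few more traversal patterns on made-up data (see the docstring above for
# the full key syntax):
#   >>> data = {'formats': [{'url': 'u1'}, {'ext': 'mp4'}, {'url': 'u2'}]}
#   >>> traverse_obj(data, ('formats', ..., 'url'))           # `...` branches over the list
#   ['u1', 'u2']
#   >>> traverse_obj(data, ('formats', 0, {'link': 'url'}))   # a dict key builds a new dict
#   {'link': 'u1'}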
5678
5679def traverse_dict(dictn, keys, casesense=True):
da4db748 5680 deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed '
5681 f'in a future version. Use "{__name__}.traverse_obj" instead')
ee8dd27a 5682 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5683
5684
ff91cf74 5685def get_first(obj, keys, **kwargs):
5686 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5687
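# Example: first non-empty value across a list of dicts (made-up data):
#   >>> get_first([{'id': None}, {'id': 'abc'}], 'id')
#   'abc'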
5688
3e9b66d7 5689def time_seconds(**kwargs):
83c4970e
L
5690 """
5691 Returns the current time in seconds since the epoch (1970-01-01T00:00:00Z), shifted by the given timedelta kwargs (e.g. hours=9 for a UTC+09:00 based timestamp)
5692 """
5693 return time.time() + datetime.timedelta(**kwargs).total_seconds()
3e9b66d7
LNO
5694
5695
49fa4d9a
N
5696# create a JSON Web Signature (jws) with HS256 algorithm
5697# the resulting format is in JWS Compact Serialization
5698# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5699# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5700def jwt_encode_hs256(payload_data, key, headers={}):
5701 header_data = {
5702 'alg': 'HS256',
5703 'typ': 'JWT',
5704 }
5705 if headers:
5706 header_data.update(headers)
0f06bcd7 5707 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5708 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5709 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
49fa4d9a
N
5710 signature_b64 = base64.b64encode(h.digest())
5711 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5712 return token
819e0531 5713
5714
16b0d7e6 5715# can be extended in the future to verify the signature, parse the header and return the algorithm used if it's not HS256
5716def jwt_decode_hs256(jwt):
5717 header_b64, payload_b64, signature_b64 = jwt.split('.')
2c98d998 5718 # add trailing ='s that may have been stripped, superfluous ='s are ignored
5719 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
16b0d7e6 5720 return payload_data
5721
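# Round trip between the two JWT helpers above (the key is a made-up example
# secret; note that the encoder returns bytes while the decoder expects str):
#   >>> token = jwt_encode_hs256({'uid': 1}, 'secret')
#   >>> jwt_decode_hs256(token.decode())
#   {'uid': 1}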
5722
53973b4d 5723WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5724
5725
7a32c70d 5726@functools.cache
819e0531 5727def supports_terminal_sequences(stream):
5728 if compat_os_name == 'nt':
8a82af35 5729 if not WINDOWS_VT_MODE:
819e0531 5730 return False
5731 elif not os.getenv('TERM'):
5732 return False
5733 try:
5734 return stream.isatty()
5735 except BaseException:
5736 return False
5737
5738
c53a18f0 5739def windows_enable_vt_mode():
5740 """Ref: https://bugs.python.org/issue30075 """
8a82af35 5741 if get_windows_version() < (10, 0, 10586):
53973b4d 5742 return
53973b4d 5743
c53a18f0 5744 import ctypes
5745 import ctypes.wintypes
5746 import msvcrt
5747
5748 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
5749
5750 dll = ctypes.WinDLL('kernel32', use_last_error=False)
5751 handle = os.open('CONOUT$', os.O_RDWR)
c53a18f0 5752 try:
5753 h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
5754 dw_original_mode = ctypes.wintypes.DWORD()
5755 success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
5756 if not success:
5757 raise Exception('GetConsoleMode failed')
5758
5759 success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
5760 dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
5761 if not success:
5762 raise Exception('SetConsoleMode failed')
c53a18f0 5763 finally:
5764 os.close(handle)
53973b4d 5765
f0795149 5766 global WINDOWS_VT_MODE
5767 WINDOWS_VT_MODE = True
5768 supports_terminal_sequences.cache_clear()
5769
53973b4d 5770
ec11a9f4 5771_terminal_sequences_re = re.compile('\033\\[[^m]+m')
5772
5773
5774def remove_terminal_sequences(string):
5775 return _terminal_sequences_re.sub('', string)
5776
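# Example: ANSI color sequences are stripped:
#   >>> remove_terminal_sequences('\033[0;31mred\033[0m text')
#   'red text'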
5777
5778def number_of_digits(number):
5779 return len('%d' % number)
34921b43 5780
5781
5782def join_nonempty(*values, delim='-', from_dict=None):
5783 if from_dict is not None:
7b2c3f47 5784 values = (traverse_obj(from_dict, variadic(v)) for v in values)
34921b43 5785 return delim.join(map(str, filter(None, values)))
06e57990 5786
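# Example: falsy values are dropped before joining:
#   >>> join_nonempty('mp4', None, 1080, '', delim='-')
#   'mp4-1080'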
5787
27231526
ZM
5788def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5789 """
5790 Find the largest format dimensions in terms of video width and, for each thumbnail:
5791 * Modify the URL: Match the width with the provided regex and replace with the former width
5792 * Update dimensions
5793
5794 This function is useful with video services that scale the provided thumbnails on demand
5795 """
5796 _keys = ('width', 'height')
5797 max_dimensions = max(
86e5f3ed 5798 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
27231526
ZM
5799 default=(0, 0))
5800 if not max_dimensions[0]:
5801 return thumbnails
5802 return [
5803 merge_dicts(
5804 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5805 dict(zip(_keys, max_dimensions)), thumbnail)
5806 for thumbnail in thumbnails
5807 ]
5808
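# Sketch with made-up format/thumbnail dicts; the regex selects the width
# embedded in the thumbnail URL, which is then replaced by the largest format width:
#   >>> scale_thumbnails_to_max_format_width(
#   ...     [{'width': 1920, 'height': 1080}], [{'url': 'https://i.example.com/320/a.jpg'}], r'\d+')
#   [{'url': 'https://i.example.com/1920/a.jpg', 'width': 1920, 'height': 1080}]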
5809
93c8410d
LNO
5810def parse_http_range(range):
5811 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5812 if not range:
5813 return None, None, None
5814 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5815 if not crg:
5816 return None, None, None
5817 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5818
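# Example with a "Content-Range" response header:
#   >>> parse_http_range('bytes 0-499/1234')
#   (0, 499, 1234)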
5819
6b9e832d 5820def read_stdin(what):
5821 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5822 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5823 return sys.stdin
5824
5825
a904a7f8
L
5826def determine_file_encoding(data):
5827 """
88f60feb 5828 Detect the text encoding used
a904a7f8
L
5829 @returns (encoding, bytes to skip)
5830 """
5831
88f60feb 5832 # BOM marks are given priority over declarations
a904a7f8 5833 for bom, enc in BOMS:
a904a7f8
L
5834 if data.startswith(bom):
5835 return enc, len(bom)
5836
88f60feb 5837 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5838 # We ignore the endianness to get a good enough match
a904a7f8 5839 data = data.replace(b'\0', b'')
88f60feb 5840 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5841 return mobj.group(1).decode() if mobj else None, 0
a904a7f8
L
5842
5843
06e57990 5844class Config:
5845 own_args = None
9e491463 5846 parsed_args = None
06e57990 5847 filename = None
5848 __initialized = False
5849
5850 def __init__(self, parser, label=None):
9e491463 5851 self.parser, self.label = parser, label
06e57990 5852 self._loaded_paths, self.configs = set(), []
5853
5854 def init(self, args=None, filename=None):
5855 assert not self.__initialized
284a60c5 5856 self.own_args, self.filename = args, filename
5857 return self.load_configs()
5858
5859 def load_configs(self):
65662dff 5860 directory = ''
284a60c5 5861 if self.filename:
5862 location = os.path.realpath(self.filename)
65662dff 5863 directory = os.path.dirname(location)
06e57990 5864 if location in self._loaded_paths:
5865 return False
5866 self._loaded_paths.add(location)
5867
284a60c5 5868 self.__initialized = True
5869 opts, _ = self.parser.parse_known_args(self.own_args)
5870 self.parsed_args = self.own_args
9e491463 5871 for location in opts.config_locations or []:
6b9e832d 5872 if location == '-':
1060f82f 5873 if location in self._loaded_paths:
5874 continue
5875 self._loaded_paths.add(location)
6b9e832d 5876 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5877 continue
65662dff 5878 location = os.path.join(directory, expand_path(location))
06e57990 5879 if os.path.isdir(location):
5880 location = os.path.join(location, 'yt-dlp.conf')
5881 if not os.path.exists(location):
9e491463 5882 self.parser.error(f'config location {location} does not exist')
06e57990 5883 self.append_config(self.read_file(location), location)
5884 return True
5885
5886 def __str__(self):
5887 label = join_nonempty(
5888 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5889 delim=' ')
5890 return join_nonempty(
5891 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5892 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5893 delim='\n')
5894
7a32c70d 5895 @staticmethod
06e57990 5896 def read_file(filename, default=[]):
5897 try:
a904a7f8 5898 optionf = open(filename, 'rb')
86e5f3ed 5899 except OSError:
06e57990 5900 return default # silently skip if file is not present
a904a7f8
L
5901 try:
5902 enc, skip = determine_file_encoding(optionf.read(512))
5903 optionf.seek(skip, io.SEEK_SET)
5904 except OSError:
5905 enc = None # silently skip read errors
06e57990 5906 try:
5907 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
a904a7f8 5908 contents = optionf.read().decode(enc or preferredencoding())
f9934b96 5909 res = shlex.split(contents, comments=True)
44a6fcff 5910 except Exception as err:
5911 raise ValueError(f'Unable to parse "{filename}": {err}')
06e57990 5912 finally:
5913 optionf.close()
5914 return res
5915
7a32c70d 5916 @staticmethod
06e57990 5917 def hide_login_info(opts):
86e5f3ed 5918 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
06e57990 5919 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5920
5921 def _scrub_eq(o):
5922 m = eqre.match(o)
5923 if m:
5924 return m.group('key') + '=PRIVATE'
5925 else:
5926 return o
5927
5928 opts = list(map(_scrub_eq, opts))
5929 for idx, opt in enumerate(opts):
5930 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5931 opts[idx + 1] = 'PRIVATE'
5932 return opts
5933
5934 def append_config(self, *args, label=None):
9e491463 5935 config = type(self)(self.parser, label)
06e57990 5936 config._loaded_paths = self._loaded_paths
5937 if config.init(*args):
5938 self.configs.append(config)
5939
7a32c70d 5940 @property
06e57990 5941 def all_args(self):
5942 for config in reversed(self.configs):
5943 yield from config.all_args
9e491463 5944 yield from self.parsed_args or []
5945
5946 def parse_known_args(self, **kwargs):
5947 return self.parser.parse_known_args(self.all_args, **kwargs)
06e57990 5948
5949 def parse_args(self):
9e491463 5950 return self.parser.parse_args(self.all_args)
da42679b
LNO
5951
5952
d5d1df8a 5953class WebSocketsWrapper:
da42679b 5954 """Wraps websockets module to use in non-async scopes"""
abfecb7b 5955 pool = None
da42679b 5956
3cea3edd 5957 def __init__(self, url, headers=None, connect=True):
059bc4db 5958 self.loop = asyncio.new_event_loop()
9cd08050 5959 # XXX: "loop" is deprecated
5960 self.conn = websockets.connect(
5961 url, extra_headers=headers, ping_interval=None,
5962 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
3cea3edd
LNO
5963 if connect:
5964 self.__enter__()
15dfb392 5965 atexit.register(self.__exit__, None, None, None)
da42679b
LNO
5966
5967 def __enter__(self):
3cea3edd 5968 if not self.pool:
9cd08050 5969 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
da42679b
LNO
5970 return self
5971
5972 def send(self, *args):
5973 self.run_with_loop(self.pool.send(*args), self.loop)
5974
5975 def recv(self, *args):
5976 return self.run_with_loop(self.pool.recv(*args), self.loop)
5977
5978 def __exit__(self, type, value, traceback):
5979 try:
5980 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5981 finally:
5982 self.loop.close()
15dfb392 5983 self._cancel_all_tasks(self.loop)
da42679b
LNO
5984
5985 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5986 # for contributors: If any new library that uses asyncio needs to be run in a non-async context, move these functions out of this class
7a32c70d 5987 @staticmethod
da42679b 5988 def run_with_loop(main, loop):
059bc4db 5989 if not asyncio.iscoroutine(main):
da42679b
LNO
5990 raise ValueError(f'a coroutine was expected, got {main!r}')
5991
5992 try:
5993 return loop.run_until_complete(main)
5994 finally:
5995 loop.run_until_complete(loop.shutdown_asyncgens())
5996 if hasattr(loop, 'shutdown_default_executor'):
5997 loop.run_until_complete(loop.shutdown_default_executor())
5998
7a32c70d 5999 @staticmethod
da42679b 6000 def _cancel_all_tasks(loop):
059bc4db 6001 to_cancel = asyncio.all_tasks(loop)
da42679b
LNO
6002
6003 if not to_cancel:
6004 return
6005
6006 for task in to_cancel:
6007 task.cancel()
6008
9cd08050 6009 # XXX: "loop" is removed in python 3.10+
da42679b 6010 loop.run_until_complete(
059bc4db 6011 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
da42679b
LNO
6012
6013 for task in to_cancel:
6014 if task.cancelled():
6015 continue
6016 if task.exception() is not None:
6017 loop.call_exception_handler({
6018 'message': 'unhandled exception during asyncio.run() shutdown',
6019 'exception': task.exception(),
6020 'task': task,
6021 })
6022
6023
8b7539d2 6024def merge_headers(*dicts):
08d30158 6025 """Merge dicts of http headers case insensitively, prioritizing the later ones"""
76aa9913 6026 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
28787f16 6027
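# Example: later dicts win and keys are normalized to Title-Case:
#   >>> merge_headers({'user-agent': 'A', 'Accept': '*/*'}, {'User-Agent': 'B'})
#   {'User-Agent': 'B', 'Accept': '*/*'}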
6028
b1f94422 6029def cached_method(f):
6030 """Cache a method"""
6031 signature = inspect.signature(f)
6032
7a32c70d 6033 @functools.wraps(f)
b1f94422 6034 def wrapper(self, *args, **kwargs):
6035 bound_args = signature.bind(self, *args, **kwargs)
6036 bound_args.apply_defaults()
d5d1df8a 6037 key = tuple(bound_args.arguments.values())[1:]
b1f94422 6038
6368e2e6 6039 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
b1f94422 6040 if key not in cache:
6041 cache[key] = f(self, *args, **kwargs)
6042 return cache[key]
6043 return wrapper
6044
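# Minimal sketch of the decorator on a made-up class; the cache is kept per
# instance and keyed on the bound arguments after `self`:
#   >>> class Foo:
#   ...     @cached_method
#   ...     def square(self, x):
#   ...         print('computing')
#   ...         return x * x
#   ...
#   >>> f = Foo()
#   >>> f.square(3)
#   computing
#   9
#   >>> f.square(3)   # second call hits the cache, nothing is recomputed
#   9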
6045
28787f16 6046class classproperty:
83cc7b8a 6047 """property access for class methods with optional caching"""
6048 def __new__(cls, func=None, *args, **kwargs):
6049 if not func:
6050 return functools.partial(cls, *args, **kwargs)
6051 return super().__new__(cls)
c487cf00 6052
83cc7b8a 6053 def __init__(self, func, *, cache=False):
c487cf00 6054 functools.update_wrapper(self, func)
6055 self.func = func
83cc7b8a 6056 self._cache = {} if cache else None
28787f16 6057
6058 def __get__(self, _, cls):
83cc7b8a 6059 if self._cache is None:
6060 return self.func(cls)
6061 elif cls not in self._cache:
6062 self._cache[cls] = self.func(cls)
6063 return self._cache[cls]
19a03940 6064
6065
a5387729 6066class function_with_repr:
b2e0343b 6067 def __init__(self, func, repr_=None):
a5387729 6068 functools.update_wrapper(self, func)
b2e0343b 6069 self.func, self.__repr = func, repr_
a5387729 6070
6071 def __call__(self, *args, **kwargs):
6072 return self.func(*args, **kwargs)
6073
6074 def __repr__(self):
b2e0343b 6075 if self.__repr:
6076 return self.__repr
a5387729 6077 return f'{self.func.__module__}.{self.func.__qualname__}'
6078
6079
64fa820c 6080class Namespace(types.SimpleNamespace):
591bb9d3 6081 """Immutable namespace"""
591bb9d3 6082
7896214c 6083 def __iter__(self):
64fa820c 6084 return iter(self.__dict__.values())
7896214c 6085
7a32c70d 6086 @property
64fa820c 6087 def items_(self):
6088 return self.__dict__.items()
9b8ee23b 6089
6090
8dc59305 6091MEDIA_EXTENSIONS = Namespace(
6092 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
6093 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
6094 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
fbb73833 6095 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
8dc59305 6096 thumbnails=('jpg', 'png', 'webp'),
6097 storyboards=('mhtml', ),
6098 subtitles=('srt', 'vtt', 'ass', 'lrc'),
6099 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
6100)
6101MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
6102MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
6103
6104KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
6105
6106
be5c1ae8 6107class RetryManager:
6108 """Usage:
6109 for retry in RetryManager(...):
6110 try:
6111 ...
6112 except SomeException as err:
6113 retry.error = err
6114 continue
6115 """
6116 attempt, _error = 0, None
6117
6118 def __init__(self, _retries, _error_callback, **kwargs):
6119 self.retries = _retries or 0
6120 self.error_callback = functools.partial(_error_callback, **kwargs)
6121
6122 def _should_retry(self):
6123 return self._error is not NO_DEFAULT and self.attempt <= self.retries
6124
7a32c70d 6125 @property
be5c1ae8 6126 def error(self):
6127 if self._error is NO_DEFAULT:
6128 return None
6129 return self._error
6130
7a32c70d 6131 @error.setter
be5c1ae8 6132 def error(self, value):
6133 self._error = value
6134
6135 def __iter__(self):
6136 while self._should_retry():
6137 self.error = NO_DEFAULT
6138 self.attempt += 1
6139 yield self
6140 if self.error:
6141 self.error_callback(self.error, self.attempt, self.retries)
6142
7a32c70d 6143 @staticmethod
be5c1ae8 6144 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
6145 """Utility function for reporting retries"""
6146 if count > retries:
6147 if error:
6148 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
6149 raise e
6150
6151 if not count:
6152 return warn(e)
6153 elif isinstance(e, ExtractorError):
3ce29336 6154 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
be5c1ae8 6155 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
6156
6157 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
6158 if delay:
6159 info(f'Sleeping {delay:.2f} seconds ...')
6160 time.sleep(delay)
6161
6162
0647d925 6163def make_archive_id(ie, video_id):
6164 ie_key = ie if isinstance(ie, str) else ie.ie_key()
6165 return f'{ie_key.lower()} {video_id}'
6166
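# Example: archive ids are the lowercased extractor key plus the video id
# (illustrative id):
#   >>> make_archive_id('Youtube', 'dQw4w9WgXcQ')
#   'youtube dQw4w9WgXcQ'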
6167
a1c5bd82 6168def truncate_string(s, left, right=0):
6169 assert left > 3 and right >= 0
6170 if s is None or len(s) <= left + right:
6171 return s
71df9b7f 6172 return f'{s[:left-3]}...{s[-right:] if right else ""}'
a1c5bd82 6173
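# Example: the string is elided in the middle, keeping `left - 3` leading and
# `right` trailing characters:
#   >>> truncate_string('abcdefghij', 6, 2)
#   'abc...ij'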
6174
5314b521 6175def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
6176 assert 'all' in alias_dict, '"all" alias is required'
6177 requested = list(start or [])
6178 for val in options:
6179 discard = val.startswith('-')
6180 if discard:
6181 val = val[1:]
6182
6183 if val in alias_dict:
6184 val = alias_dict[val] if not discard else [
6185 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
6186 # NB: Do not allow regex in aliases for performance
6187 requested = orderedSet_from_options(val, alias_dict, start=requested)
6188 continue
6189
6190 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
6191 else [val] if val in alias_dict['all'] else None)
6192 if current is None:
6193 raise ValueError(val)
6194
6195 if discard:
6196 for item in current:
6197 while item in requested:
6198 requested.remove(item)
6199 else:
6200 requested.extend(current)
6201
6202 return orderedSet(requested)
6203
6204
d0d74b71 6205class FormatSorter:
6206 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
6207
6208 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
6209 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
6210 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
6211 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
6212 'height', 'width', 'proto', 'vext', 'abr', 'aext',
6213 'fps', 'fs_approx', 'source', 'id')
6214
6215 settings = {
6216 'vcodec': {'type': 'ordered', 'regex': True,
6217 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
6218 'acodec': {'type': 'ordered', 'regex': True,
71082216 6219 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
d0d74b71 6220 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
6221 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
6222 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
6223 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
6224 'vext': {'type': 'ordered', 'field': 'video_ext',
29ca4082 6225 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
6226 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
fbb73833 6227 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
6228 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
6229 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
d0d74b71 6230 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
6231 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
6232 'field': ('vcodec', 'acodec'),
6233 'function': lambda it: int(any(v != 'none' for v in it))},
6234 'ie_pref': {'priority': True, 'type': 'extractor'},
6235 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
6236 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
6237 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
6238 'quality': {'convert': 'float', 'default': -1},
6239 'filesize': {'convert': 'bytes'},
6240 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
6241 'id': {'convert': 'string', 'field': 'format_id'},
6242 'height': {'convert': 'float_none'},
6243 'width': {'convert': 'float_none'},
6244 'fps': {'convert': 'float_none'},
6245 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
6246 'tbr': {'convert': 'float_none'},
6247 'vbr': {'convert': 'float_none'},
6248 'abr': {'convert': 'float_none'},
6249 'asr': {'convert': 'float_none'},
6250 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
6251
6252 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
6253 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
6254 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
6255 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
6256 'res': {'type': 'multiple', 'field': ('height', 'width'),
6257 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
6258
6259 # Actual field names
6260 'format_id': {'type': 'alias', 'field': 'id'},
6261 'preference': {'type': 'alias', 'field': 'ie_pref'},
6262 'language_preference': {'type': 'alias', 'field': 'lang'},
6263 'source_preference': {'type': 'alias', 'field': 'source'},
6264 'protocol': {'type': 'alias', 'field': 'proto'},
6265 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
6266 'audio_channels': {'type': 'alias', 'field': 'channels'},
6267
6268 # Deprecated
6269 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
6270 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
6271 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
6272 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
6273 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
6274 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
6275 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
6276 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
6277 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
6278 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
6279 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
6280 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
6281 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
6282 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
6283 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6284 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6285 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6286 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6287 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6288 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6289 }
6290
6291 def __init__(self, ydl, field_preference):
6292 self.ydl = ydl
6293 self._order = []
6294 self.evaluate_params(self.ydl.params, field_preference)
6295 if ydl.params.get('verbose'):
6296 self.print_verbose_info(self.ydl.write_debug)
6297
6298 def _get_field_setting(self, field, key):
6299 if field not in self.settings:
6300 if key in ('forced', 'priority'):
6301 return False
6302 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
6303 'deprecated and may be removed in a future version')
6304 self.settings[field] = {}
6305 propObj = self.settings[field]
6306 if key not in propObj:
6307 type = propObj.get('type')
6308 if key == 'field':
6309 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
6310 elif key == 'convert':
6311 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
6312 else:
6313 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
6314 propObj[key] = default
6315 return propObj[key]
6316
6317 def _resolve_field_value(self, field, value, convertNone=False):
6318 if value is None:
6319 if not convertNone:
6320 return None
6321 else:
6322 value = value.lower()
6323 conversion = self._get_field_setting(field, 'convert')
6324 if conversion == 'ignore':
6325 return None
6326 if conversion == 'string':
6327 return value
6328 elif conversion == 'float_none':
6329 return float_or_none(value)
6330 elif conversion == 'bytes':
6331 return parse_bytes(value)
6332 elif conversion == 'order':
6333 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
6334 use_regex = self._get_field_setting(field, 'regex')
6335 list_length = len(order_list)
6336 empty_pos = order_list.index('') if '' in order_list else list_length + 1
6337 if use_regex and value is not None:
6338 for i, regex in enumerate(order_list):
6339 if regex and re.match(regex, value):
6340 return list_length - i
6341 return list_length - empty_pos # not in list
6342 else: # not regex or value = None
6343 return list_length - (order_list.index(value) if value in order_list else empty_pos)
6344 else:
6345 if value.isnumeric():
6346 return float(value)
6347 else:
6348 self.settings[field]['convert'] = 'string'
6349 return value
6350
6351 def evaluate_params(self, params, sort_extractor):
6352 self._use_free_order = params.get('prefer_free_formats', False)
6353 self._sort_user = params.get('format_sort', [])
6354 self._sort_extractor = sort_extractor
6355
6356 def add_item(field, reverse, closest, limit_text):
6357 field = field.lower()
6358 if field in self._order:
6359 return
6360 self._order.append(field)
6361 limit = self._resolve_field_value(field, limit_text)
6362 data = {
6363 'reverse': reverse,
6364 'closest': False if limit is None else closest,
6365 'limit_text': limit_text,
6366 'limit': limit}
6367 if field in self.settings:
6368 self.settings[field].update(data)
6369 else:
6370 self.settings[field] = data
6371
6372 sort_list = (
6373 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
6374 + (tuple() if params.get('format_sort_force', False)
6375 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
6376 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
6377
6378 for item in sort_list:
6379 match = re.match(self.regex, item)
6380 if match is None:
6381 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
6382 field = match.group('field')
6383 if field is None:
6384 continue
6385 if self._get_field_setting(field, 'type') == 'alias':
6386 alias, field = field, self._get_field_setting(field, 'field')
6387 if self._get_field_setting(alias, 'deprecated'):
6388 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
6389 f'be removed in a future version. Please use {field} instead')
6390 reverse = match.group('reverse') is not None
6391 closest = match.group('separator') == '~'
6392 limit_text = match.group('limit')
6393
6394 has_limit = limit_text is not None
6395 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
6396 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
6397
6398 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
6399 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
6400 limit_count = len(limits)
6401 for (i, f) in enumerate(fields):
6402 add_item(f, reverse, closest,
6403 limits[i] if i < limit_count
6404 else limits[0] if has_limit and not has_multiple_limits
6405 else None)
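# Illustrative sketch (not part of the original source; based on yt-dlp's -S
# format-sort syntax): how individual sort tokens are decomposed by the regex above:
#   'res'            -> field 'res', no limit, larger is better (default)
#   '+size'          -> reverse=True, i.e. prefer the smallest value
#   'res:1080'       -> limit '1080', values above the limit are de-prioritized
#   'filesize~100M'  -> closest=True, prefer the value nearest to the limit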
6406
6407 def print_verbose_info(self, write_debug):
6408 if self._sort_user:
6409 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
6410 if self._sort_extractor:
6411 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
6412 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
6413 '+' if self._get_field_setting(field, 'reverse') else '', field,
6414 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
6415 self._get_field_setting(field, 'limit_text'),
6416 self._get_field_setting(field, 'limit'))
6417 if self._get_field_setting(field, 'limit_text') is not None else '')
6418 for field in self._order if self._get_field_setting(field, 'visible')]))
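# Illustrative sketch (not part of the original source): each entry of the
# "Formats sorted by:" debug line is rendered as
#   [+]field[:limit_text(limit)]  or  [+]field[~limit_text(limit)]
# where '+' marks a reversed field and '~' a closest-match limit; e.g. a
# hypothetical '+size' token renders as '+size', and 'res:1080' renders as
# 'res:1080(<resolved limit>)'.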
6419
6420 def _calculate_field_preference_from_value(self, format, field, type, value):
6421 reverse = self._get_field_setting(field, 'reverse')
6422 closest = self._get_field_setting(field, 'closest')
6423 limit = self._get_field_setting(field, 'limit')
6424
6425 if type == 'extractor':
6426 maximum = self._get_field_setting(field, 'max')
6427 if value is None or (maximum is not None and value >= maximum):
6428 value = -1
6429 elif type == 'boolean':
6430 in_list = self._get_field_setting(field, 'in_list')
6431 not_in_list = self._get_field_setting(field, 'not_in_list')
6432 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
6433 elif type == 'ordered':
6434 value = self._resolve_field_value(field, value, True)
6435
6436 # try to convert to number
6437 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
6438 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
6439 if is_num:
6440 value = val_num
6441
6442 return ((-10, 0) if value is None
6443 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
6444 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
6445 else (0, value, 0) if not reverse and (limit is None or value <= limit)
6446 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
6447 else (-1, value, 0))
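# Illustrative sketch (not part of the original source): the returned tuples are
# compared lexicographically, with larger tuples sorting as better formats.
# E.g. for a numeric field with limit 720, reverse=False, closest=False:
#   value 720  -> (0, 720, 0)     # at the limit: best
#   value 480  -> (0, 480, 0)     # below the limit: higher is better
#   value 1080 -> (0, -1080, 0)   # above the limit: de-prioritized; among over-limit
#                                 # values, the one closest to 720 wins
#   value None -> (-10, 0)        # missing values always lose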
6448
6449 def _calculate_field_preference(self, format, field):
6450 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
6451 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
6452 if type == 'multiple':
6453 type = 'field' # Only 'field' is allowed in multiple for now
6454 actual_fields = self._get_field_setting(field, 'field')
6455
6456 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
6457 else:
6458 value = get_value(field)
6459 return self._calculate_field_preference_from_value(format, field, type, value)
6460
6461 def calculate_preference(self, format):
6462 # Determine missing protocol
6463 if not format.get('protocol'):
6464 format['protocol'] = determine_protocol(format)
6465
6466 # Determine missing ext
6467 if not format.get('ext') and 'url' in format:
6468 format['ext'] = determine_ext(format['url'])
6469 if format.get('vcodec') == 'none':
6470 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
6471 format['video_ext'] = 'none'
6472 else:
6473 format['video_ext'] = format['ext']
6474 format['audio_ext'] = 'none'
6475 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
6476 # format['preference'] = -1000
6477
5424dbaf 6478 if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
6479 # HEVC-over-FLV is not part of the original FLV specification
6480 # ref. https://trac.ffmpeg.org/ticket/6389
6481 # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
6482 format['preference'] = -100
6483
d0d74b71 6484 # Determine missing bitrates
6485 if format.get('tbr') is None:
6486 if format.get('vbr') is not None and format.get('abr') is not None:
6487 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
6488 else:
6489 if format.get('vcodec') != 'none' and format.get('vbr') is None:
6490 format['vbr'] = format.get('tbr') - format.get('abr', 0)
6491 if format.get('acodec') != 'none' and format.get('abr') is None:
6492 format['abr'] = format.get('tbr') - format.get('vbr', 0)
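# Worked example (not part of the original source): vbr=1000 and abr=128 with no
# tbr gives tbr=1128; conversely, tbr=1128 with only abr=128 known gives vbr=1000.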
6493
6494 return tuple(self._calculate_field_preference(format, field) for field in self._order)
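# Illustrative usage sketch (not part of the original source; the class name is
# an assumption): the tuples produced here are intended to serve as a sort key:
#   sorter = FormatSorter(ydl, field_preference)   # enclosing class, name assumed
#   formats.sort(key=sorter.calculate_preference)  # best format sorts last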
6495
6496
9b8ee23b 6497# Deprecated
6498has_certifi = bool(certifi)
6499has_websockets = bool(websockets)
8e40b9d1 6500
6501
6502def load_plugins(name, suffix, namespace):
6503 from .plugins import load_plugins
6504 ret = load_plugins(name, suffix)
6505 namespace.update(ret)
6506 return ret
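# Illustrative usage sketch (not part of the original source): this compatibility
# wrapper loads plugin classes whose names end in `suffix` and injects them into
# the given namespace, e.g. (hypothetical call site):
#   _PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals())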