]> jfr.im git - yt-dlp.git/blame - yt_dlp/utils/_utils.py
[cleanup] Fix misc bugs (#8968)
[yt-dlp.git] / yt_dlp / utils / _utils.py
CommitLineData
1e399778 1import base64
5bc880b9 2import binascii
912b38b4 3import calendar
676eb3f2 4import codecs
c380cc28 5import collections
ab029d7e 6import collections.abc
62e609ab 7import contextlib
c496ca96 8import datetime
0c265486 9import email.header
f8271158 10import email.utils
f45c185f 11import errno
49fa4d9a
N
12import hashlib
13import hmac
ac668111 14import html.entities
15import html.parser
b1f94422 16import inspect
03f9daab 17import io
79a2e94e 18import itertools
f4bfd65f 19import json
d77c3dfd 20import locale
02dbf93f 21import math
f8271158 22import mimetypes
db3ad8a6 23import netrc
347de493 24import operator
d77c3dfd 25import os
c496ca96 26import platform
773f291d 27import random
d77c3dfd 28import re
f8271158 29import shlex
c496ca96 30import socket
79a2e94e 31import ssl
ac668111 32import struct
1c088fa8 33import subprocess
d77c3dfd 34import sys
181c8655 35import tempfile
c380cc28 36import time
01951dda 37import traceback
64fa820c 38import types
989a01c2 39import unicodedata
14f25df2 40import urllib.error
f8271158 41import urllib.parse
ac668111 42import urllib.request
bcf89ce6 43import xml.etree.ElementTree
d77c3dfd 44
69bec673 45from . import traversal
46
47from ..compat import functools # isort: split
48from ..compat import (
36e6f62c 49 compat_etree_fromstring,
51098426 50 compat_expanduser,
f8271158 51 compat_HTMLParseError,
efa97bdc 52 compat_os_name,
702ccf2d 53 compat_shlex_quote,
8c25f81b 54)
ccfd70f4 55from ..dependencies import xattr
51fb4995 56
# Rebind this module's reported name to its parent package by dropping the last dotted component
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# This is not clearly defined otherwise
# (the concrete type of a compiled pattern, obtained by compiling an empty regex)
compiled_regex_type = type(re.compile(''))
61
f7a147e3 62
class NO_DEFAULT:
    """Sentinel used as a parameter default meaning "no default was supplied".

    The class object itself is compared by identity (`is NO_DEFAULT`),
    which lets callers pass None as a legitimate default value.
    """
    pass
65
66
def IDENTITY(x):
    """Return the argument unchanged"""
    return x
69
bf42a990 70
7105440c
YCH
# Full English month names, in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names keyed by language code; every list is in calendar order
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}
a942d6cb 86
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
# Maps a timezone abbreviation to its offset from UTC in whole hours
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}
96
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. Single-letter replacements
# come from the plain strings; multi-letter ones ('AE', 'OE', 'TH', 'ss', ...) are
# wrapped in lists so that itertools.chain yields them as one item each.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 101
46f59e89
S
# strptime-style date formats that do not depend on day/month ordering.
# This tuple is the common base for DATE_FORMATS_DAY_FIRST and DATE_FORMATS_MONTH_FIRST.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)
146
# DATE_FORMATS plus the numeric formats that read the first field as the day
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])
158
# DATE_FORMATS plus the numeric formats that read the first field as the month
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
167
# Captures four groups: a quoted payload, two integers and a quoted '|'-separated list
# that is followed by `.split('|')`
# NOTE(review): presumably matches the argument list of "packed" JavaScript - confirm against callers
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element; the JSON object/array is in group `json_ld`
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# An unsigned decimal number with an optional fractional part
NUMBER_RE = r'\d+(?:\.\d+)?'
172
7105440c 173
@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the encoding reported by locale.getpreferredencoding(),
    or 'UTF-8' if it cannot be determined or cannot actually encode text.
    The result is cached after the first call.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Probe the codec: an unknown/unusable encoding name raises here
        'TEST'.encode(encoding)
    except Exception:
        return 'UTF-8'
    return encoding
d77c3dfd 188
f4bfd65f 189
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The data is first written to a temporary file created next to fn
    and then renamed over fn. The temporary file is removed if anything fails.
    """
    tmp = tempfile.NamedTemporaryFile(
        mode='w', encoding='utf-8', delete=False,
        dir=os.path.dirname(fn), prefix=f'{os.path.basename(fn)}.', suffix='.tmp')
    tmp_path = tmp.name

    try:
        with tmp:
            json.dump(obj, tmp, ensure_ascii=False)

        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass

        try:
            # os.umask can only be read by setting it, so set and restore immediately
            current_umask = os.umask(0)
            os.umask(current_umask)
            os.chmod(tmp_path, 0o666 & ~current_umask)
        except OSError:
            pass

        os.rename(tmp_path, fn)
    except Exception:
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
214
215
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val]

    If val is None, the first element matching xpath that merely has
    the attribute key is returned. Returns None when nothing matches.
    """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = '[@%s]' % key
    else:
        predicate = f"[@{key}='{val}']"
    return node.find(xpath + predicate)
59ae56fa 221
d7e66d39
JMF
222# On python2.6 the xml.etree.ElementTree.Element methods don't support
223# the namespace parameter
5f6a1245
JW
224
225
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand `prefix:tag` steps of path into `{namespace}tag` using ns_map

    Steps without a prefix are kept as they are.
    """
    def _expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, local_name = pieces
        return '{%s}%s' % (ns_map[prefix], local_name)

    return '/'.join(map(_expand, path.split('/')))
236
d77c3dfd 237
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath below node

    @param xpath    A single xpath string, or an iterable of xpaths that
                    are tried in order until one of them matches
    @param name     Name used in the error message (defaults to xpath)
    @param fatal    Raise ExtractorError if nothing matches and no default is given
    @param default  Value returned if nothing matches; takes precedence over fatal
    @returns        The matching element, else default, else None
    """
    candidates = (xpath,) if isinstance(xpath, str) else xpath

    # Start from None so that an empty iterable of xpaths is treated as
    # "not found" instead of leaving the result unbound
    n = None
    for xp in candidates:
        n = node.find(xp)
        if n is not None:
            break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
259
260
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the first element matching xpath

    If the element is missing or has no text: default is returned when given,
    else ExtractorError is raised when fatal, else None is returned.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element

    text = element.text
    if text is not None:
        return text

    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
a41fb80c
S
274
275
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath that has it

    If there is no such element: default is returned when given,
    else ExtractorError is raised when fatal, else None is returned.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]

    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = f'{xpath}[@{key}]' if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
bf0ff932
PH
287
288
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Extra keyword arguments are passed through to get_element_by_attribute
    return get_element_by_attribute('id', id, html, **kwargs)
43e8fafd 292
12ea2f30 293
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    # Extra keyword arguments are passed through to get_element_html_by_attribute
    return get_element_html_by_attribute('id', id, html, **kwargs)
6f32a0b5
ZM
297
298
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document

    Returns None if no tag has the class.
    """
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
303
304
6f32a0b5
ZM
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document

    Returns None if no tag has the class.
    """
    matches = get_elements_html_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
309
310
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag with the specified attribute value, or None if there is none"""
    matches = get_elements_by_attribute(attribute, value, html, **kwargs)
    if not matches:
        return None
    return matches[0]
314
315
def get_element_html_by_attribute(attribute, value, html, **kwargs):
    """Return the html of the first tag with the specified attribute value, or None if there is none"""
    matches = get_elements_html_by_attribute(attribute, value, html, **kwargs)
    if not matches:
        return None
    return matches[0]
319
320
def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list

    Extra keyword arguments are forwarded to get_elements_by_attribute
    instead of being silently dropped. `escape_value` is always forced to False,
    since the class name is embedded in a regular expression built here.
    """
    # Match class_name as one whole word inside a (possibly multi-class) attribute value
    class_pattern = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute(
        'class', class_pattern, html, **{**kargs, 'escape_value': False})
326
327
6f32a0b5
ZM
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    # Match class_name as one whole word inside a (possibly multi-class) attribute value
    class_pattern = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_html_by_attribute('class', class_pattern, html, escape_value=False)
333
334
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document as a list

    Accepts the same arguments as get_elements_text_and_html_by_attribute.
    """
    pairs = get_elements_text_and_html_by_attribute(*args, **kwargs)
    return [text for text, _html in pairs]
338
339
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document as a list

    Accepts the same arguments as get_elements_text_and_html_by_attribute.
    """
    pairs = get_elements_text_and_html_by_attribute(*args, **kwargs)
    return [element_html for _text, element_html in pairs]
343
344
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document

    This is a generator yielding one (content, whole) tuple per matching element.
    @param tag           Regular expression the tag name must match
    @param escape_value  Whether value is a literal string (escaped before use)
                         or already a regular expression
    """
    # An empty value would match everything, so yield nothing
    if not value:
        return

    # The attribute quotes are optional ('?') unless value begins with a character that
    # needs quoting; re.match only inspects the first character of value here
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches the opening tag up to and including the wanted attribute.
    # (?-x:...) switches verbose mode off so that whitespace inside value stays significant
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        # Parse from the start of the match to locate the element's closing tag
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip one pair of matching quotes surrounding the whole content, then decode entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
a921f407 370
c5229f39 371
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        # Stack of the currently open tags, innermost last
        self.tagstack = collections.deque()
        super().__init__()

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')

        # Unwind the stack until the tag being closed has been popped;
        # unclosed inner tags are discarded along the way
        matched = False
        while self.tagstack and not matched:
            matched = self.tagstack.pop() == tag
        if not matched:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')

        # An empty stack means the outermost (first) tag has just been closed
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()
412
413
46d09f87 414# XXX: This should be far less strict
6f32a0b5
ZM
415def get_element_text_and_html_by_tag(tag, html):
416 """
417 For the first element with the specified tag in the passed HTML document
418 return its' content (text) and the whole element (html)
419 """
420 def find_or_raise(haystack, needle, exc):
421 try:
422 return haystack.index(needle)
423 except ValueError:
424 raise exc
425 closing_tag = f'</{tag}>'
426 whole_start = find_or_raise(
427 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
428 content_start = find_or_raise(
429 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
430 content_start += whole_start + 1
431 with HTMLBreakOnClosingTagParser() as parser:
432 parser.feed(html[whole_start:content_start])
433 if not parser.tagstack or parser.tagstack[0] != tag:
434 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
435 offset = content_start
436 while offset < len(html):
437 next_closing_tag_start = find_or_raise(
438 html[offset:], closing_tag,
439 compat_HTMLParseError(f'closing {tag} tag not found'))
440 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
441 try:
442 parser.feed(html[offset:offset + next_closing_tag_end])
443 offset += next_closing_tag_end
444 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
445 return html[content_start:offset + next_closing_tag_start], \
446 html[whole_start:offset + next_closing_tag_end]
447 raise compat_HTMLParseError('unexpected end of html')
448
449
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element

    After feeding, the attributes of the first start tag are available as `attrs`.
    """

    def __init__(self):
        self.attrs = {}
        super().__init__()

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        # Only the first tag is of interest, so abort parsing right away
        raise compat_HTMLParseError('done')
8bb56eee 460
c5229f39 461
class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list

    After feeding, `items` holds one attribute dict for each top-level <li> tag.
    """

    # Void elements never have an end tag. Counting them would leave the nesting
    # level permanently raised, so every following top-level <li> would be missed
    _VOID_ELEMENTS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
        'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'))

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0  # Nesting depth of the currently open (non-void) tags

    def handle_starttag(self, tag, attrs):
        if tag in self._VOID_ELEMENTS:
            return
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        # Also reached for self-closing void tags (<br/>) and stray </br>
        if tag in self._VOID_ELEMENTS:
            return
        self._level -= 1
477
478
8bb56eee
BF
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
        a="foo" B="bar" c="&#98;az" d=boz
        empty= noval entity="&amp;"
        sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    except compat_HTMLParseError:
        # Raised by the parser itself to stop after the first tag
        pass
    return parser.attrs
9e6dd238 498
c5229f39 499
73673ccf
FF
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list with one dictionary of attributes per top-level element"""
    list_parser = HTMLListAttrsParser()
    list_parser.feed(webpage)
    list_parser.close()
    return list_parser.items
507
508
def clean_html(html):
    """Clean an HTML snippet into a readable string

    None is passed through unchanged.
    """
    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Applied in order: each step relies on the output of the previous one
    substitutions = (
        (r'\s+', ' '),  # Collapse all whitespace
        (r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n'),  # Line breaks
        (r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n'),  # Paragraph boundaries
        ('<.*?>', ''),  # Strip html tags
    )
    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)

    # Replace html entities
    return unescapeHTML(html).strip()
9e6dd238
FV
523
524
class LenientJSONDecoder(json.JSONDecoder):
    """JSON decoder that tolerates some malformed input

    @param transform_source  Optional callable applied to the string before decoding
    @param ignore_extra      Ignore any data following the first JSON value
    @param close_objects     Budget for closing unterminated objects/arrays
                             of truncated input (two decode attempts each)
    Remaining arguments are passed on to json.JSONDecoder.
    """
    # TODO: Write tests

    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        # Closing a container can take two passes: one to append the comma
        # (which changes the error message), and one to replace it with the bracket
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        """Return the document truncated at the error and repaired, or None if it cannot be repaired"""
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        """Decode s, raising json.JSONDecodeError (with a snippet of the input in the message) on failure"""
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    # Keep the repaired text in its own variable, so that a failed
                    # repair does not destroy the document needed for the error below
                    repaired = self._close_object(e)
                    if repaired is not None:
                        s = repaired
                        continue
                # e.doc is the exact text that e.pos refers to. Clamp the start
                # of the snippet, since a negative index would wrap around
                snippet = e.doc[max(0, e.pos - 10):e.pos + 10]
                raise type(e)(f'{e.msg} in {snippet!r}', e.doc, e.pos)
        assert False, 'Too many attempts to decode JSON'
b7c47b74 563
564
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        # Prefer the underlying binary buffer when stdout has one
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # At most two attempts: the original filename, then the sanitized one
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking failed, so fall back to a plain open
                # NOTE(review): relies on LockingUnsupportedError being an OSError - confirm
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Give up on the second attempt or on a permission error
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            # Sanitizing changed nothing, so retrying would fail the same way
            if old_filename == filename:
                raise
d77c3dfd
FV
602
603
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None if the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 611
5f6a1245 612
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted Use a stricter subset of allowed characters
    @param is_id Whether this is an ID that should be kept unchanged if possible.
    If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Returns the replacement for a single character. Replacements are prefixed
        # with '\0' to mark them as substitutes, so that they can be collapsed/stripped below
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            # Characters of the unicode categories C* (other) and M* (mark) are dropped entirely
            return '' if unicodedata.category(char)[0] in 'CM' else '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the substitute markers; fall back to '_' if nothing is left
    result = result.replace('\0', '') or '_'

    # NOTE: NO_DEFAULT is a class and therefore truthy, so this branch only runs
    # when is_id was explicitly passed as a false value
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
d77c3dfd 666
5f6a1245 667
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows

    On other platforms the path is returned unchanged, unless force is set,
    in which case the same sanitization is applied.
    """
    # XXX: this handles drive relative paths (c:sth) incorrectly
    if sys.platform == 'win32':
        # force is irrelevant on Windows; resetting it disables the root handling below
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the first component of the remainder; the drive/UNC prefix is re-added below
        norm_path.pop(0)
    # Replace each forbidden character, and a trailing whitespace/dot, with '#';
    # the special components '.' and '..' are kept as they are
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Keep absolute paths absolute
        sanitized_path.insert(0, os.path.sep)
    # TODO: Fix behavioral differences <3.12
    # The workaround using `normpath` only superficially passes tests
    # Ref: https://github.com/python/cpython/pull/100351
    return os.path.normpath(os.path.join(*sanitized_path))
a2aaf4db
S
693
694
def sanitize_url(url, *, scheme='http'):
    """Fix up a URL that lacks a scheme or has a commonly mistyped one

    @param scheme  Scheme to prepend to protocol-relative (`//host/...`) URLs
    Returns None if url is None, else the (possibly corrected) URL.
    """
    if url is None:
        return
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return f'{scheme}:{url}'

    # Fix some common typos seen so far
    typo_fixes = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for pattern, replacement in typo_fixes:
        # The patterns are anchored at the start, so at most one replacement is made
        corrected, hits = re.subn(pattern, replacement, url)
        if hits:
            return corrected
    return url
17bcc626
S
713
714
def extract_basic_auth(url):
    """Split the credentials off a URL

    Returns a tuple (url, auth), where url no longer contains the `user:password@`
    part and auth is the matching value for an HTTP `Authorization` header
    ('Basic <base64>'). If url carries no credentials, it is returned
    unchanged together with None.
    """
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None

    host = parts.hostname or ''
    if ':' in host:
        # urlsplit strips the brackets off IPv6 literals; they must be restored,
        # or else the address cannot be told apart from the port
        host = f'[{host}]'
    netloc = host if parts.port is None else '%s:%d' % (host, parts.port)
    url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))

    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
5435dcf9
HH
725
726
def expand_path(s):
    """Expand shell variables and ~"""
    # The home directory is expanded first, environment variables second
    home_expanded = compat_expanduser(s)
    return os.path.expandvars(home_expanded)
730
731
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable

    The order of first occurrence is kept.
    @param lazy  Return a generator instead of a list
    """
    def _unique():
        emitted = []  # Do not use set since the items can be unhashable
        for item in iterable:
            if item in emitted:
                continue
            emitted.append(item)
            yield item

    deduplicated = _unique()
    return deduplicated if lazy else list(deduplicated)
d77c3dfd 742
912b38b4 743
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character.

    @param entity_with_semicolon  The entity without the leading '&', e.g. 'amp;'
    Unknown or invalid entities are returned in their literal form ('&...;').
    """
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        # chr() raises ValueError for code points beyond the unicode range, but
        # OverflowError for numbers too large for a C int; neither is a valid entity
        with contextlib.suppress(ValueError, OverflowError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    """Replace the HTML entities in s by the characters they stand for

    None is passed through unchanged.
    """
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 780
8bf48f23 781
def escapeHTML(text):
    """Escape the characters & < > " ' in text as HTML entities"""
    # A single pass over the text; equivalent to replacing '&' first and the rest
    # afterwards, since already inserted entities are never escaped again
    return text.translate(str.maketrans({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;',
    }))
791
792
db3ad8a6
ND
class netrc_from_content(netrc.netrc):
    """A netrc that is parsed from a string instead of being read from a file"""

    def __init__(self, content):
        self.hosts = {}
        self.macros = {}
        # '-' is only used as the file name in error messages
        with io.StringIO(content) as content_stream:
            self._parse('-', content_stream, False)
798
799
class Popen(subprocess.Popen):
    """subprocess.Popen with project-specific defaults

    Differences visible in this class:
    - the environment defaults to a copy of os.environ with the PyInstaller
      library-path overrides undone
    - text=True implies utf-8 decoding with errors='replace' (unless overridden)
    - on Windows a STARTUPINFO with STARTF_USESHOWWINDOW is used, and shell
      commands are run through an explicitly located command interpreter
    """
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
        Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
        https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        # sys._MEIPASS is used here as the marker for a PyInstaller bundle
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            # Restore the value saved as `<key>_ORIG`; if none was saved, remove the variable
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        # NOTE: an env mapping passed in by the caller is modified in place
        self._fix_pyinstaller_ld_path(env)

        # Remember whether the output will be str or bytes; used by run() for its empty defaults
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')

        if shell and compat_os_name == 'nt' and kwargs.get('executable') is None:
            # Build the command line for the interpreter ourselves and run it without
            # shell=True, so that the interpreter's options are under our control
            if not isinstance(args, str):
                args = ' '.join(compat_shlex_quote(a) for a in args)
            shell = False
            args = f'{self.__comspec()} /Q /S /D /V:OFF /C "{args}"'

        super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo)

    def __comspec(self):
        # Locate the command interpreter: %ComSpec%, else cmd.exe below %SystemRoot%.
        # Only an absolute path is accepted
        comspec = os.environ.get('ComSpec') or os.path.join(
            os.environ.get('SystemRoot', ''), 'System32', 'cmd.exe')
        if os.path.isabs(comspec):
            return comspec
        raise FileNotFoundError('shell not found: neither %ComSpec% nor %SystemRoot% is set')

    def communicate_or_kill(self, *args, **kwargs):
        """Like communicate(), but kill the process (and wait for it) if anything is raised"""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        """Kill the process; unless timeout is 0, also wait for it with the given timeout"""
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run a process to completion and return the tuple (stdout, stderr, returncode)

        Output that was not captured or is empty is returned as an empty str/bytes
        (depending on the text mode) rather than None.
        """
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
f0c9fb96 870
d3c93ec2 871
def encodeArgument(s):
    """Return s as str, decoding byte strings as ASCII"""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
f07b74fc
PH
877
878
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a duration in milliseconds into a (hours, minutes, seconds, milliseconds) named tuple"""
    total_seconds, milliseconds = divmod(msec, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
887
888
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as [H:][MM:]SS, without leading zero fields

    @param delim  Separator placed between the fields
    @param msec   Append the milliseconds as '.mmm'
    """
    parts = timetuple_from_msec(secs * 1000)
    if parts.hours:
        formatted = '%d%s%02d%s%02d' % (parts.hours, delim, parts.minutes, delim, parts.seconds)
    elif parts.minutes:
        formatted = '%d%s%02d' % (parts.minutes, delim, parts.seconds)
    else:
        formatted = '%d' % parts.seconds

    if not msec:
        return formatted
    return '%s.%03d' % (formatted, parts.milliseconds)
4539dd30 898
a0ddb8a2 899
def bug_reports_message(before=';'):
    """Return the standard "please report this issue" text, prefixed with `before`.

    The message starts with a capital letter when `before` is empty or ends a
    sentence ('.', '!' or '?'); trailing whitespace of `before` is ignored.
    """
    from ..update import REPOSITORY

    message = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
               'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    lead = before.rstrip()
    starts_sentence = not lead or lead.endswith(('.', '!', '?'))
    if starts_sentence:
        message = message[0].title() + message[1:]

    if not lead:
        return message
    return lead + ' ' + message
08f2a92c
JMF
911
912
bf5b9d85
PM
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Default message; subclasses may override it at class level
    msg = None

    def __init__(self, msg=None):
        """Use `msg` if given, else the class-level `msg`, else the class name."""
        if msg is None:
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
bf5b9d85
PM
923
924
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        from ..networking.exceptions import network_exceptions

        active_exc = sys.exc_info()
        # Errors raised while handling a network exception are always treated as expected
        if active_exc[0] in network_exceptions:
            expected = True

        # Everything __msg reads must be assigned before the parent constructor runs
        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        # preserve original exception; when wrapping another ExtractorError, keep the innermost one
        if isinstance(active_exc[1], ExtractorError):
            self.exc_info = active_exc[1].exc_info
        else:
            self.exc_info = active_exc
        super().__init__(self.__msg)

    @property
    def __msg(self):
        """Full message assembled from ie, video_id, orig_msg, cause and the bug-report hint."""
        parts = (
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message(),
        )
        return ''.join(parts)

    def format_traceback(self):
        """Return the formatted tracebacks of this error and its cause, or None if there are none."""
        own_tb = self.traceback and ''.join(traceback.format_tb(self.traceback))
        cause_tb = self.cause and ''.join(
            traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:])
        return join_nonempty(own_tb, cause_tb, delim='\n') or None

    def __setattr__(self, name, value):
        """Set the attribute and, once a message exists, rebuild `msg` and `args` from the new state."""
        super().__setattr__(name, value)
        if name in ('msg', 'args') or not getattr(self, 'msg', None):
            return
        self.msg = self.__msg or type(self).__name__
        self.args = (self.msg, )  # Cannot be property
967
1c256f70 968
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected error for a URL that cannot be handled; the URL is kept in `url`."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
974
975
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
979
980
773f291d
S
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        """Always marks the error as expected; `countries` is stored as given."""
        super().__init__(msg, **{**kwargs, 'expected': True})
        self.countries = countries
992
993
class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        """Always marks the error as expected; a falsy `msg` is replaced by a default text."""
        message = msg or 'The channel is not currently live'
        super().__init__(message, **{**kwargs, 'expected': True})
1000
1001
class DownloadError(YoutubeDLError):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super().__init__(msg)
d77c3dfd
FV
1014
1015
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    YoutubeDL throws this exception when a requested entry
    cannot be found in the playlist info_dict
    """
    msg = 'Entry not found in info'
498f5606 1023
1024
class SameFileError(YoutubeDLError):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """Append ": <filename>" to the default message when a filename is given."""
        message = self.msg if filename is None else f'{self.msg}: {filename}'
        super().__init__(message)
d77c3dfd
FV
1037
1038
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to
    signal an error in the postprocessing task.
    """
5f6a1245 1045
5f6a1245 1046
class DownloadCancelled(YoutubeDLError):
    """ Raised when the download queue should be interrupted """
    msg = 'The download was cancelled'
8b0d7497 1050
8b0d7497 1051
class ExistingVideoReached(DownloadCancelled):
    """ Raised when --break-on-existing is triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1055
48f79687 1056
class RejectedVideoReached(DownloadCancelled):
    """ Raised when --break-match-filter is triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
51d9739f 1060
1061
class MaxDownloadsReached(DownloadCancelled):
    """ Raised when the --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1065
1066
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        """Store `expected` on the instance alongside the message."""
        self.expected = expected
        super().__init__(msg)
1073
1074
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        """Takes no arguments: uses the class message and is never marked as expected."""
        super().__init__(self.msg, expected=False)
f2ebc5c7 1081
d77c3dfd 1082
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Thrown when a video is requested in a format
    that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """Append ": <err>" to the default message when `err` is given."""
        message = self.msg if err is None else f'{self.msg}: {err}'
        super().__init__(message)
d77c3dfd
FV
1095
1096
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file
    is smaller than what the server first announced, indicating that
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """Both `downloaded` and `expected` are sizes in bytes and are kept as attributes."""
        message = f'Downloaded {downloaded} bytes, expected {expected} bytes'
        super().__init__(message)
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 1110
5f6a1245 1111
class XAttrMetadataError(YoutubeDLError):
    """Error while writing xattr metadata; `reason` classifies the failure.

    `reason` is one of 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED',
    derived from the errno `code` and from well-known phrases in `msg`.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (code in (errno.ENOSPC, errno.EDQUOT)
                or any(phrase in msg for phrase in ('No space left', 'Disk quota exceeded'))):
            reason = 'NO_SPACE'
        elif code == errno.E2BIG or 'Argument list too long' in msg:
            reason = 'VALUE_TOO_LONG'
        else:
            reason = 'NOT_SUPPORTED'
        self.reason = reason
1126
1127
class XAttrUnavailableError(YoutubeDLError):
    """YoutubeDLError subclass that adds no behaviour of its own."""
1130
1131
def is_path_like(f):
    """Return True if `f` is a str, bytes or os.PathLike object."""
    path_types = (str, bytes, os.PathLike)
    return isinstance(f, path_types)
1134
1135
46f59e89
S
def extract_timezone(date_str):
    """Split a trailing timezone off a date string.

    Returns (offset, remainder): `offset` is a datetime.timedelta (zero when no
    timezone is recognised) and `remainder` is `date_str` without the timezone
    part. Recognised forms are a trailing "Z", a numeric "+hh[:]mm"/"-hh[:]mm"
    offset, or - after an "h:mm" time - an abbreviation found in TIMEZONE_NAMES.
    """
    match = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)

    if match:
        remainder = date_str[:-len(match.group('tz'))]
        sign_char = match.group('sign')
        if not sign_char:
            # A bare "Z" carries no offset
            return datetime.timedelta(), remainder
        factor = 1 if sign_char == '+' else -1
        offset = datetime.timedelta(
            hours=factor * int(match.group('hours')),
            minutes=factor * int(match.group('minutes')))
        return offset, remainder

    # No numeric offset: look for a named timezone following a time
    match = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    named_hours = TIMEZONE_NAMES.get(match and match.group('tz').strip())
    if named_hours is not None:
        date_str = date_str[:-len(match.group('tz'))]
    return datetime.timedelta(hours=named_hours or 0), date_str
1164
1165
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    @param delimiter  separator between the date and time parts
    @param timezone   offset (timedelta) to subtract; extracted from date_str when None
    Returns None if date_str is None or does not match the expected format.
    """
    if date_str is None:
        return None

    # Fractional seconds are dropped
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        parsed = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S')
        utc_time = parsed - timezone
        return calendar.timegm(utc_time.timetuple())
    except ValueError:
        return None
912b38b4
PH
1181
1182
46f59e89
S
def date_formats(day_first=True):
    """Return the day-first or the month-first collection of date formats."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1185
1186
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    _, cleaned = extract_timezone(cleaned)

    upload_date = None
    # Every format is tried; the last one that parses wins
    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(cleaned, expression).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Fall back to RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if upload_date is None:
        return None
    return str(upload_date)
bf50b038 1209
5f6a1245 1210
def unified_timestamp(date_str, day_first=True):
    """Parse a free-form date/time string into a UNIX timestamp.

    Returns None when date_str is not a str or cannot be parsed.
    """
    if not isinstance(date_str, str):
        return None

    # Drop commas, pipes and weekday names, then collapse whitespace
    without_noise = re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str)
    date_str = re.sub(r'\s+', ' ', without_noise)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    tz_match = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if tz_match:
        date_str = date_str[:-len(tz_match.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    nano_match = re.search(
        r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if nano_match:
        date_str = nano_match.group(1)

    pm_offset = datetime.timedelta(hours=pm_delta)
    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + pm_offset
            return calendar.timegm(dt.timetuple())
        except ValueError:
            continue

    # Fall back to RFC 2822 style dates
    timetuple = email.utils.parsedate_tz(date_str)
    if not timetuple:
        return None
    return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
46f59e89
S
1242
1243
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, ignoring the query string.

    Returns `default_ext` when url is None, contains no dot, or the text
    after the last dot neither is purely alphanumeric nor (after stripping
    trailing slashes) is listed in KNOWN_EXTENSIONS.
    """
    if url is None or '.' not in url:
        return default_ext

    without_query = url.partition('?')[0]
    guess = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = guess.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
73e79f2a 1255
5f6a1245 1256
824fa511
S
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return `filename` with its extension replaced by "<sub_lang>.<sub_format>"."""
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
d4051a8e 1259
5f6a1245 1260
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    today = datetime_round(datetime.datetime.now(datetime.timezone.utc), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is None:
        # Plain DATE without an offset
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    start_time = datetime_from_str(match.group('start'), precision, format)
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')

    if unit in ('month', 'year'):
        # Calendar-aware arithmetic; the result is rounded to whole days in auto mode
        months = amount * 12 if unit == 'year' else amount
        new_date = datetime_add_months(start_time, months)
        unit = 'day'
    else:
        if unit == 'week':
            unit = 'day'
            amount *= 7
        new_date = start_time + datetime.timedelta(**{unit + 's': amount})

    if auto_precision:
        return datetime_round(new_date, unit)
    return new_date
1301
1302
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    Raises ValueError in strict mode when date_str does not match.
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if not re.fullmatch(allowed, date_str):
            raise ValueError(f'Invalid date format "{date_str}"')
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
1313
1314
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the resulting month.
    """
    # Work with a zero-based month index so that divmod carries into the year
    year_delta, month_index = divmod(dt.month + months - 1, 12)
    year = dt.year + year_delta
    month = month_index + 1
    last_day = calendar.monthrange(year, month)[1]
    return dt.replace(year, month, min(dt.day, last_day))
1322
1323
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    'microsecond' returns `dt` unchanged; 'second', 'minute', 'hour' and 'day'
    round to the nearest multiple of that unit and return a UTC-aware datetime.
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    step = unit_seconds[precision]
    epoch_seconds = calendar.timegm(dt.timetuple())
    # Add half a step before flooring so that the value rounds to the nearest multiple
    rounded = ((epoch_seconds + step / 2) // step) * step
    return datetime.datetime.fromtimestamp(rounded, datetime.timezone.utc)
5f6a1245
JW
1340
1341
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Any other input is returned unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    year, month, day = match.groups()
    return '-'.join((year, month, day))
1350
5f6a1245 1351
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A missing bound defaults to the earliest/latest representable date.
        Raises ValueError if start is after end.
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start, strict=True)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end, strict=True)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if isinstance(date, datetime.date):
            candidate = date
        else:
            candidate = date_from_str(date)
        return self.start <= candidate <= self.end

    def __repr__(self):
        first, last = self.start.isoformat(), self.end.isoformat()
        return f'{__name__}.{type(self).__name__}({first!r}, {last!r})'

    def __eq__(self, other):
        if not isinstance(other, DateRange):
            return False
        return self.start == other.start and self.end == other.end
1385
c496ca96 1386
@functools.cache
def system_identifier():
    """Return a one-line description of the Python runtime, platform, OpenSSL and libc (cached)."""
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    try:
        libc_ver = platform.libc_ver()
    except OSError:  # We may not have access to the executable
        libc_ver = []
    libc_suffix = format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s')

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        libc_suffix,
    )
c257baff
PH
1405
1406
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
49fa4d9a
N
1414
1415
def write_string(s, out=None, encoding=None):
    """Write the str `s` to `out` (sys.stderr by default) and flush it.

    Binary-mode streams and streams exposing `.buffer` receive encoded bytes
    (unencodable characters are dropped); other streams receive `s` as is.
    """
    assert isinstance(s, str)
    stream = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not stream:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(stream):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    target, target_encoding = stream, None
    # `mode` might be `None` (Ref: https://github.com/yt-dlp/yt-dlp/issues/8816)
    stream_mode = getattr(stream, 'mode', None) or ''
    if 'b' in stream_mode:
        target_encoding = encoding or preferredencoding()
    elif hasattr(stream, 'buffer'):
        target = stream.buffer
        target_encoding = encoding or getattr(stream, 'encoding', None) or preferredencoding()

    if target_encoding:
        target.write(s.encode(target_encoding, 'ignore'))
    else:
        target.write(s)
    stream.flush()
1436
1437
3d2623a8 1438# TODO: Use global logger
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Report a deprecation.

    Outside the CLI a DeprecationWarning is issued through `warnings`.
    In the CLI each distinct message is reported once, through `printer` if
    given, otherwise via write_string; `kwargs` are forwarded to whichever is used.
    """
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return

    seen = deprecation_warning._cache
    if msg in seen:
        return
    seen.add(msg)

    full_message = f'{msg}{bug_reports_message()}'
    if printer:
        return printer(full_message, **kwargs)
    return write_string(f'ERROR: {full_message}\n', **kwargs)


# Messages already reported while running in the CLI
deprecation_warning._cache = set()
1454
1455
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a bytes-like object (or a str) into a list of integer code points."""
    if not bs:
        return []
    # Indexing bytes yields ints; indexing str yields 1-character strings
    if isinstance(bs[0], int):
        return list(bs)
    return list(map(ord, bs))
1463
c257baff 1464
def intlist_to_bytes(xs):
    """Pack a sequence of integers (each 0-255) into a bytes object."""
    if not xs:
        return b''
    pack_format = f'{len(xs)}B'
    return struct.pack(pack_format, *xs)
c38b1e77
PH
1469
1470
class LockingUnsupportedError(OSError):
    """OSError raised with a fixed message when file locking is not available."""
    msg = 'File locking is not supported'

    def __init__(self):
        message = self.msg
        super().__init__(message)
1476
1477
c1c9a79c
PH
1478# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range passed to LockFileEx/UnlockFileEx (low and high DWORD of the length)
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock `f` via LockFileEx; raises BlockingIOError if the call fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file needs the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError if the call fails."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError(ctypes.GetLastError()))

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock `f` with flock(), falling back to lockf() on other OSErrors.

            BlockingIOError is propagated to the caller without trying the fallback.
            """
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Unlock `f`, trying each unlock variant in turn until one succeeds."""
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            """No locking backend could be imported: always raises."""
            raise LockingUnsupportedError()

        def _unlock_file(f):
            """No locking backend could be imported: always raises."""
            raise LockingUnsupportedError()
c1c9a79c
PH
1564
1565
class locked_file:
    """File wrapper that holds a lock on the file between __enter__ and __exit__.

    Read modes take a shared lock, all other modes an exclusive one. For 'w'
    modes the file is truncated only after the lock has been acquired.
    Unknown attributes and iteration are delegated to the underlying file.
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        """Open `filename` without locking it yet.

        @param mode   one of 'r', 'rb', 'a', 'ab', 'w', 'wb'; anything else raises NotImplementedError
        @param block  passed on to the locking call when entering
        """
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        """Acquire the lock (and truncate for 'w' modes); the file is closed if this fails."""
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    # __exit__ is not invoked when __enter__ raises, so release the
                    # lock and the file here instead of leaking them
                    with contextlib.suppress(OSError):
                        self.__exit__()
                    raise
        return self

    def unlock(self):
        """Release the lock if it is held; `locked` is reset even when unlocking fails."""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        """Unlock and close the file; the file is closed even if unlocking fails."""
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
a3125791 1629
4eb7f1d1 1630
@functools.cache
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None (cached)."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1635
1636
def shell_quote(args):
    """Quote every argument for the shell and join them with single spaces.

    bytes arguments are first decoded with the filesystem encoding.
    """
    fs_encoding = get_filesystem_encoding()
    # We may get a filename encoded with 'encodeFilename'
    return ' '.join(
        compat_shlex_quote(arg.decode(fs_encoding) if isinstance(arg, bytes) else arg)
        for arg in args)
9d4660ca
PH
1646
1647
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    `data` is JSON-encoded into the URL fragment. If `url` already carries
    smuggled data, it is merged in, with the already-smuggled values taking
    precedence over `data` for duplicate keys. `data` itself is not modified.
    """
    url, idata = unsmuggle_url(url, {})
    # Merge into a new dict so the caller's mapping is left untouched
    merged = {**data, **idata}
    sdata = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
9d4660ca
PH
1656
1657
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    Returns (smug_url, default) when the URL carries no smuggled data.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    encoded = urllib.parse.parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
02dbf93f
PH
1665
1666
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc

    @param fmt     %-format receiving (scaled number, suffix)
    @param factor  ratio between consecutive suffixes; 1024 selects the "Ki", "Mi", ... suffixes
    Returns None if num cannot be converted to a float or is negative.
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp to the available suffixes. The lower bound matters for 0 < num < 1/factor,
        # where the logarithm is below -1 and would otherwise index the suffixes from the end
        exponent = max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
e0fd9573 1679
1680
def format_bytes(bytes):
    """Format a byte count with binary (1024-based) suffixes, or 'N/A' if it cannot be formatted."""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
f53c966a 1683
1c088fa8 1684
def lookup_unit_table(unit_table, s, strict=False):
    """Parse "<number><unit>" from `s` and return the number scaled by the unit's multiplier.

    @param unit_table  mapping of unit name -> multiplier
    @param strict      if true, the whole string must match and only "." is accepted
                       as decimal separator; otherwise a prefix match suffices and
                       "," is accepted as well
    Returns the rounded product, or None if `s` does not match.
    """
    if strict:
        num_re, matcher = NUMBER_RE, re.fullmatch
    else:
        num_re, matcher = NUMBER_RE.replace(R'\.', '[,.]'), re.match
    units_re = '|'.join(map(re.escape, unit_table))

    match = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not match:
        return None

    number = float(match.group('num').replace(',', '.'))
    multiplier = unit_table[match.group('unit')]
    return round(number * multiplier)
1696
1697
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    # '' (plain bytes) followed by K, M, G, ... as successive powers of 1024
    unit_table = {unit: 1024 ** power for power, unit in enumerate(['', *'KMGTPEZY'])}
    return lookup_unit_table(unit_table, s.upper(), strict=True)
fb47597b
S
1703
1704
be64b5b0
PH
def parse_filesize(s):
    """Parse a human-readable file size such as "1.5 MiB" into a number of bytes.

    Returns None if `s` is None or cannot be parsed.
    """
    if s is None:
        return None

    # (prefix letter, decimal unit name, binary unit name), in increasing order of magnitude
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = letter.lower()
        unit_table.update({
            f'{letter}iB': binary,
            f'{letter}B': decimal,
            f'{lower}B': binary,
            f'{letter}b': decimal,
            f'{lower}b': decimal,
            f'{decimal_name}bytes': decimal,
            f'{binary_name}bytes': binary,
        })

    return lookup_unit_table(unit_table, s)
1774
1775
def parse_count(s):
    """Parse a count such as "1,234" or "1.5M views" into an integer

    Returns None when @s is None or nothing numeric can be extracted.
    """
    if s is None:
        return None

    # Drop any leading non-numeric label that is followed by whitespace
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = (
        ('k', 1000),
        ('m', 1000 ** 2),
        ('kk', 1000 ** 2),
        ('b', 1000 ** 3),
    )
    # Each suffix is accepted in all-lower and all-upper case
    unit_table = {
        unit: mult
        for suffix, mult in multipliers
        for unit in (suffix, suffix.upper())
    }

    count = lookup_unit_table(unit_table, s)
    if count is not None:
        return count

    # Fall back to a leading number followed by whitespace or end of string
    leading_number = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(leading_number.group(1)) if leading_number else None
be64b5b0 1803
2f7ae819 1804
def parse_resolution(s, *, lenient=False):
    """Extract width/height information from a string

    Tries "<w>x<h>" first, then "<h>p"/"<h>i", then "4k"/"8k".
    With lenient=True the "<w>x<h>" form may be adjacent to other
    alphanumeric characters.
    Returns a dict with 'width' and/or 'height', or {} if nothing matched.
    """
    if s is None:
        return {}

    dimensions = (
        re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
        if lenient else
        re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s))
    if dimensions:
        return {
            'width': int(dimensions.group('w')),
            'height': int(dimensions.group('h')),
        }

    scan_lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scan_lines:
        return {'height': int(scan_lines.group(1))}

    # 4k => 2160, 8k => 4320
    k_label = re.search(r'\b([48])[kK]\b', s)
    if k_label:
        return {'height': int(k_label.group(1)) * 540}

    return {}
1828
1829
def parse_bitrate(s):
    """Return the integer in front of "kbps" in @s, or None if absent or @s is not a str"""
    if isinstance(s, str):
        found = re.search(r'\b(\d+)\s*kbps', s)
        if found:
            return int(found.group(1))
    return None
1836
1837
def month_by_name(name, lang='en'):
    """ Return the number (1-12) of a month by its full name in @lang

    Unknown languages fall back to the English names.
    Returns None if the name is not found. """

    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name not in names:
        return None
    return names.index(name) + 1
1847
1848
def month_by_abbreviation(abbrev):
    """ Return the number (1-12) of a month by its three-letter English
    abbreviation, or None if it is not recognized """

    for number, month in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month[:3] == abbrev:
            return number
    return None
18258362
JMF
1857
1858
def fix_xml_ampersands(xml_str):
    """Escape every '&' in @xml_str that does not already start an entity

    Named entities (amp, lt, gt, apos, quot) and short numeric character
    references are left untouched."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
e3946f98
PH
1865
1866
def setproctitle(title):
    """Best-effort: set the process name through libc's prctl(PR_SET_NAME)

    Silently does nothing when ctypes or 'libc.so.6' cannot be loaded,
    or when the loaded libc has no prctl.
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return

    # Initialising the buffer from the bytes makes it one item larger than the
    # title, so it is always NUL-terminated. Sizing it to len(title_bytes) and
    # then assigning .value leaves no room for the terminator, and prctl would
    # read past the end of the buffer.
    buf = ctypes.create_string_buffer(title.encode())
    try:
        # PR_SET_NAME = 15      Ref: /usr/include/linux/prctl.h
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1893
1894
def remove_start(s, start):
    """Return @s without the prefix @start; @s is returned unchanged if it is None or lacks the prefix"""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
29eb5174
PH
1897
1898
def remove_end(s, end):
    """Return @s without the suffix @end; @s is returned unchanged if it is None or lacks the suffix"""
    if s is None or not s.endswith(end):
        return s
    # NB: s[:-len(end)] would wipe the whole string for an empty @end, since s[:-0] == s[:0]
    return s[:len(s) - len(end)]
2b9faf55
PH
1901
1902
31b2051e
S
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding @s, if present"""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1910
1911
def get_domain(url):
    """
    Return the network location of @url without a leading "www.", or None if empty

    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
b6e0c7d2
U
1918
1919
def url_basename(url):
    """Return the last segment of the path of @url, ignoring surrounding slashes"""
    trimmed_path = urllib.parse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
aa94a6d3
PH
1923
1924
def base_url(url):
    """Return the http(s) URL up to and including the last '/' before any query or fragment

    Raises AttributeError if @url does not match."""
    directory = re.match(r'https?://[^?#]+/', url)
    return directory.group()
02dc0a36
S
1927
1928
def urljoin(base, path):
    """Join @path onto @base, tolerating bytes and invalid input

    Returns @path as-is when it already carries a scheme or is protocol-relative.
    Returns None when @path is empty/not a string, or when @base is not an
    http(s) or protocol-relative URL.
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not (isinstance(path, str) and path):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path

    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str):
        return None
    if re.match(r'^(?:https?:)?//', base) is None:
        return None
    return urllib.parse.urljoin(base, path)
e34c3361
S
1942
1943
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert @v to int, multiplied by @invscale and floor-divided by @scale

    If @get_attr is given, that attribute of @v is converted instead.
    Returns @default when the conversion fails.
    """
    value = v
    if get_attr and value is not None:
        value = getattr(value, get_attr, None)
    try:
        number = int(value)
    except (ValueError, TypeError, OverflowError):
        return default
    try:
        return number * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
9732d77e 1951
9572013d 1952
def str_or_none(v, default=None):
    """Return str(v), or @default when @v is None"""
    if v is None:
        return default
    return str(v)
40a90862 1955
9732d77e
PH
1956
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Ints are returned as-is; strings have ',', '.' and '+' removed
    before conversion. Any other type gives None. """
    if isinstance(int_str, int):
        return int_str
    if not isinstance(int_str, str):
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(digits)
608d11f5
PH
1964
1965
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert @v to float, multiplied by @invscale and divided by @scale

    Returns @default when @v is None or cannot be converted.
    """
    if v is not None:
        try:
            return float(v) * invscale / scale
        except (ValueError, TypeError):
            pass
    return default
43f775e4
PH
1973
1974
c7e327c4
S
def bool_or_none(v, default=None):
    """Return @v if it is a bool, otherwise @default"""
    if isinstance(v, bool):
        return v
    return default
1977
1978
def strip_or_none(v, default=None):
    """Return @v stripped of surrounding whitespace if it is a str, otherwise @default"""
    if isinstance(v, str):
        return v.strip()
    return default
b72b4431
S
1981
1982
def url_or_none(url):
    """Return the stripped @url if it is protocol-relative or uses a known scheme, else None

    Accepted schemes: http(s), the rtmp and rtsp families, mms and ftp(s).
    """
    if not (url and isinstance(url, str)):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
af03000a
S
1988
1989
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """Format a unix timestamp or a YYYYMMDD string with @date_format

    @param timestamp    int/float unix timestamp, or a str in YYYYMMDD form
    @param date_format  strftime format; "%s" is substituted manually
    @returns            The formatted string, or @default if @timestamp is of
                        another type, cannot be parsed or is out of range
    """
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
                               + datetime.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        # For any other type datetime_object stays None and the AttributeError below yields @default
        date_format = re.sub(  # Support %s on windows
            r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError):
        # OverflowError: timedelta/datetime arithmetic on huge or infinite timestamps
        return default
2007
2008
608d11f5 2009def parse_duration(s):
f9934b96 2010 if not isinstance(s, str):
608d11f5 2011 return None
ca7b3246 2012 s = s.strip()
38d79fd1 2013 if not s:
2014 return None
ca7b3246 2015
acaff495 2016 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2017 m = re.match(r'''(?x)
2018 (?P<before_secs>
2019 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2020 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2021 (?P<ms>[.:][0-9]+)?Z?$
2022 ''', s)
acaff495 2023 if m:
8bd1c00b 2024 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2025 else:
2026 m = re.match(
056653bb
S
2027 r'''(?ix)(?:P?
2028 (?:
1c1b2f96 2029 [0-9]+\s*y(?:ears?)?,?\s*
056653bb
S
2030 )?
2031 (?:
1c1b2f96 2032 [0-9]+\s*m(?:onths?)?,?\s*
056653bb
S
2033 )?
2034 (?:
1c1b2f96 2035 [0-9]+\s*w(?:eeks?)?,?\s*
056653bb 2036 )?
8f4b58d7 2037 (?:
1c1b2f96 2038 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
8f4b58d7 2039 )?
056653bb 2040 T)?
acaff495 2041 (?:
af868732 2042 (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
acaff495 2043 )?
2044 (?:
1c1b2f96 2045 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
acaff495 2046 )?
2047 (?:
2048 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2049 )?Z?$''', s)
acaff495 2050 if m:
2051 days, hours, mins, secs, ms = m.groups()
2052 else:
15846398 2053 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2054 if m:
2055 hours, mins = m.groups()
2056 else:
2057 return None
2058
acaff495 2059 if ms:
19a03940 2060 ms = ms.replace(':', '.')
2061 return sum(float(part or 0) * mult for part, mult in (
2062 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2063
2064
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert @ext in front of the existing extension of @filename

    If @expected_real_ext is given and differs from the actual extension,
    @ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{name}.{ext}{real_ext}'
d70ad093
PH
2071
2072
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of @filename with @ext

    If @expected_real_ext is given and differs from the actual extension,
    @ext is appended to the whole filename instead.
    """
    stem, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    return f'{stem}.{ext}'
2078
2079
d70ad093
PH
def check_executable(exe, args=[]):
    """ Try to run the given binary and return its name, or False if that raises OSError.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        Popen.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    else:
        return exe
b7ab0590
PH
2088
2089
def _get_exe_version_output(exe, args):
    """Run @exe with @args and return its combined stdout/stderr

    Returns False if running it raises OSError, and None if it exits with a
    non-zero return code.
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        output, _, returncode = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if returncode else output
cae97f65
PH
2102
2103
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Return the first group of @version_re found in @output, or @unrecognized

    By default, the token following the word "version" is extracted.
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2113
2114
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    @param unrecognized  One or two values: the first is returned when the
                         output has no recognizable version, the last when
                         the executable exits with an error """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        return unrecognized[-1]
    if not output:
        # Falsy output (e.g. False for a missing executable) is passed through
        return output
    return detect_exe_version(output, version_re, unrecognized[0])
9af98e17 2125
2126
def frange(start=0, stop=None, step=1):
    """Float range

    Like range(), but the bounds and the step may be floats (including inf).
    A zero step yields nothing.
    """
    if stop is None:
        start, stop = 0, start
    if not step:
        direction = 0
    elif step > 0:
        direction = 1
    else:
        direction = -1
    current = start
    while direction * current < direction * stop:
        yield current
        current += step
2135
2136
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList

    Items are pulled from the iterable only as far as needed and are kept in
    a cache, which is shared with the objects returned by reversed() and copy()"""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
        else:
            yield from self._cache
            for item in self._iterable:
                self._cache.append(item)
                yield item

    def _exhaust(self):
        """Pull everything that is left into the cache and return the cache itself"""
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        direction = -1 if self._reversed else 1
        return self._exhaust()[::direction]

    @staticmethod
    def _reverse_index(x):
        # ~x maps 0 => -1, 1 => -2, ...
        return None if x is None else ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')

        counts_from_end = (start or 0) < 0 or (stop or 0) < 0
        open_ended = (start is None and step < 0) or (stop is None and step > 0)
        if counts_from_end or open_ended:
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
        else:
            missing = max(start or 0, stop or 0) - len(self._cache) + 1
            if missing > 0:
                self._cache.extend(itertools.islice(self._iterable, missing))

        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        else:
            return True

    def __len__(self):
        return len(self._exhaust())

    def __reversed__(self):
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2224
483336e7 2225
class PagedList:
    """Base class for lists whose items are fetched one page at a time

    Subclasses must implement _getslice(start, end)"""

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __bool__(self):
        first_item = self.getslice(0, 1)
        return bool(first_item)

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if entries:
            return entries[0]
        raise self.IndexError()

    def getpage(self, pagenum):
        """Return the items of page @pagenum as a list, from the cache if possible"""
        page_results = self._cache.get(pagenum)
        if page_results is None:
            if pagenum > self._pagecount:
                page_results = []
            else:
                page_results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')
2267
9c44d242
PH
2268
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        pagesize = self._pagesize
        for pagenum in itertools.count(start // pagesize):
            firstid = pagenum * pagesize
            nextfirstid = pagenum * pagesize + pagesize
            if start >= nextfirstid:
                continue

            # Offsets of the requested range within this page
            if firstid <= start < nextfirstid:
                startv = start % pagesize
            else:
                startv = 0
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % pagesize) + 1
            else:
                endv = None

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember the last page that could be fetched before re-raising
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            page_is_short = len(page_results) + startv < pagesize

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if page_is_short or end == nextfirstid:
                break
81c2f20b
PH
2308
2309
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        if end is None:
            end_page = self._pagecount
            remaining = None
        else:
            end_page = min(self._pagecount, end // self._pagesize + 1)
            remaining = end - start
        # Number of items to drop from the front of the first page
        to_skip = start - start_page * self._pagesize

        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if to_skip:
                page_results = page_results[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page_results) >= remaining:
                    yield from page_results[:remaining]
                    break
                remaining -= len(page_results)
            yield from page_results
9c44d242
PH
2334
2335
class PlaylistEntries:
    """Indexable view over the 'entries' of a playlist info dict

    Indexing uses 1-based positions and yields (position, entry) tuples"""

    MissingEntry = object()
    is_exhausted = False

    class IndexError(IndexError):
        pass

    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        if isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Place each given entry at its requested (1-based) position
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for position, entry in zip(requested_entries, entries):
                self._entries[position - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int or a slice for each comma-separated item of @string"""
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            if has_range:
                yield slice(int_or_none(start), float_or_none(end), int_or_none(step))
            else:
                yield int(start)

    def get_requested_items(self):
        """Yield (position, entry) for the entries selected by the playlist params of ydl"""
        params = self.ydl.params
        playlist_items = params.get('playlist_items')
        playlist_start = params.get('playliststart', 1)
        playlist_end = params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for position, entry in self[index]:
                yield position, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Return the total number of entries if it is known, else None"""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        if isinstance(self._entries, InAdvancePagedList) and self._entries._pagesize == 1:
            return self._entries._pagecount
        return None

    @functools.cached_property
    def _getter(self):
        """Function mapping a 0-based index to its entry; raises self.IndexError past the end"""
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        ascending = step > 0

        if idx.start is not None:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
        elif ascending:
            start = 0
        else:
            start = len(self) - 1

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is not None:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
            # The stop position is inclusive
            stop += 1 if ascending else -1
        elif step < 0:
            stop = 0
        else:
            stop = float('inf')

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if ascending:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))
2470
2471
def uppercase_escape(s):
    r"""Decode every \UXXXXXXXX escape sequence in @s into its character"""
    decode = codecs.getdecoder('unicode_escape')

    def unescape(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', unescape, s)
0fe2ff78
YCH
2478
2479
def lowercase_escape(s):
    r"""Decode every \uXXXX escape sequence in @s into its character"""
    decode = codecs.getdecoder('unicode_escape')

    def unescape(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', unescape, s)
b53466e1 2486
d05cfe06 2487
def parse_qs(url, **kwargs):
    """Parse the query string of @url; @kwargs are passed on to urllib.parse.parse_qs"""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
4dfbf869 2490
2491
62e609ab
PH
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file object, one per line, and close it

    Leading BOMs and surrounding whitespace are stripped. Empty lines and
    lines starting with '#', ';' or ']' are skipped, and anything after a
    whitespace followed by '#' is discarded as a comment.
    @returns    list of URLs
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: Passing maxsplit positionally is deprecated since Python 3.13
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2509
2510
def urlencode_postdata(*args, **kargs):
    """Same as urllib.parse.urlencode, but returns ASCII-encoded bytes"""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
2513
2514
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
    @param url           str or parse url tuple
    @param query_update  update query
    @returns             str
    """
    if isinstance(url, str):
        if not (kwargs or query_update):
            # Nothing to change
            return url
        url = urllib.parse.urlparse(url)

    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        # Values from query_update take precedence over the existing ones
        merged_query = dict(urllib.parse.parse_qs(url.query))
        merged_query.update(query_update)
        kwargs['query'] = urllib.parse.urlencode(merged_query, True)

    return urllib.parse.urlunparse(url._replace(**kwargs))
2533
2534
def update_url_query(url, query):
    """Return @url with its query string updated by the mapping @query"""
    return update_url(url, query_update=query)
16392824 2537
8e60dc75 2538
10c87c15 2539def _multipart_encode_impl(data, boundary):
0c265486
YCH
2540 content_type = 'multipart/form-data; boundary=%s' % boundary
2541
2542 out = b''
2543 for k, v in data.items():
2544 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 2545 if isinstance(k, str):
0f06bcd7 2546 k = k.encode()
14f25df2 2547 if isinstance(v, str):
0f06bcd7 2548 v = v.encode()
0c265486
YCH
2549 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2550 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 2551 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
2552 if boundary.encode('ascii') in content:
2553 raise ValueError('Boundary overlaps with data')
2554 out += content
2555
2556 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2557
2558 return out, content_type
2559
2560
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple of the encoded body (bytes) and the Content-Type value.
    With an explicit boundary, ValueError is raised if it occurs in the data.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        return _multipart_encode_impl(data, boundary)

    # Keep drawing random boundaries until one does not clash with the data
    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            continue
2589
2590
b079c26f
SS
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Whether @x is an instance of @allowed_types but not of @blocked_types

    By default str, bytes and mappings are blocked."""
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
2595
2596
def variadic(x, allowed_types=NO_DEFAULT):
    """Return @x itself if it is iterable-like, otherwise wrap it in a 1-tuple

    @param allowed_types  Types to be wrapped even though they are iterable;
                          passed to is_iterable_like as blocked_types
    """
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
304ad45a 2602
2603
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Return the result of the first of @funcs that succeeds

    Each function is called with @args and @kwargs. Calls raising a common
    lookup/conversion error, or returning something that is not an instance
    of @expected_type, are skipped. Returns None if nothing succeeded.
    """
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
    return None
2613
2614
def try_get(src, getter, expected_type=None):
    """Apply one or more getter functions to @src through try_call and return the first usable result"""
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
329ca3be
S
2617
2618
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of @dct for which cndn(key, value) is true

    By default, items whose value is None are dropped."""
    filtered = {}
    for key, value in dct.items():
        if cndn(key, value):
            filtered[key] = value
    return filtered
2621
2622
6cc62232
S
def merge_dicts(*dicts):
    """Merge dicts, with earlier dicts taking precedence

    None values are ignored, and an empty string may be replaced by a
    string value from a later dict."""
    merged = {}
    for a_dict in dicts:
        for key, value in a_dict.items():
            if value is None:
                continue
            if key not in merged:
                merged[key] = value
            elif isinstance(value, str) and merged[key] == '':
                merged[key] = value
    return merged
2631
2632
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return @string as str, decoding it with @encoding if it is not one already"""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
8e60dc75 2635
16392824 2636
a1a530b0
PH
# Age limits used for (upper-cased) US film ratings
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# Age limits used for US "TV-..." ratings
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}


def parse_age_limit(s):
    """Convert an age rating into an integer age limit

    Accepts ints in [0, 21], strings like "18" or "18+", the keys of
    US_RATINGS, and the TV_PARENTAL_GUIDELINES ratings written with '-',
    '_' or no separator (ratings are matched case-insensitively).
    Returns None for anything else.
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None

    plain_age = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if plain_age:
        return int(plain_age.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    # Strip the "TV-" prefix off the known ratings to build the alternation
    tv_suffixes = '|'.join(name[3:] for name in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, rating)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
146c80e2
S
2672
2673
def strip_jsonp(code):
    """Unwrap a JSONP response, returning only the callback's argument

    Handles an optional "window." prefix, the "cb && cb(...)" form, an
    optional trailing semicolon and trailing // comments.
    The input is returned unchanged if it does not look like JSONP.
    """
    jsonp_re = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_re, r'\g<callback_data>', code)
478c2c61
PH
2682
2683
def js_to_json(code, vars={}, *, strict=False):
    """ Rewrite a JavaScript object/array literal as JSON text.

    @param code    JavaScript source to convert
    @param vars    Mapping of identifier -> replacement text. It is only read,
                   never modified
    @param strict  If True, raise ValueError on unknown identifiers instead of
                   quoting them, and skip the lenient rewrites of
                   "new ...(...)", "parseInt(...)" and immediately-invoked
                   function expressions
    @returns       The converted text
    """
    STRING_QUOTES = '\'"`'
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (pattern, base) pairs for integer literals JSON cannot express directly
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # group(1) is a bare double quote, group(2) the character after a backslash
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        if escape in JSON_PASSTHROUGH_ESCAPES:
            return Rf'\{escape}'
        if escape == 'x':
            # "\xNN" becomes "\u00NN"; the two hex digits follow the match
            return R'\u00'
        if escape == '\n':
            # Backslash-newline is a line continuation
            return ''
        return escape

    def template_substitute(match):
        # Evaluate the expression inside "${...}" of a template string
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v in ('undefined', 'void 0'):
            return 'null'
        # Comments, "!" runs and trailing commas are dropped
        if v.startswith(('/*', '//', '!')) or v == ',':
            return ''

        if v[0] in STRING_QUOTES:
            inner = v[1:-1]
            if v[0] == '`':
                inner = re.sub(r'(?s)\${([^}]+)}', template_substitute, inner)
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, inner)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # A trailing colon means the number was used as an object key
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                # Replacement is not valid JSON; emit it as a JSON string
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # "new Map([[k, v], ...])" becomes a JSON object
        entries = json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))
        return json.dumps(dict(entries))

    code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code)
    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
e05f6939
PH
2763
2764
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    @returns A function mapping a quality id to its index in quality_ids,
             or to -1 if the id is not listed
    """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
2773
acd69589 2774
# Names of the stages at which a postprocessor may run
POSTPROCESS_WHEN = (
    'pre_process', 'after_filter', 'video', 'before_dl',
    'post_process', 'after_move', 'after_video', 'playlist',
)


# Built-in output templates, keyed by output type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types, each mapped to a string or None
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Fill in with str.format(): {0} is the pattern for the mapping key,
# {1} the pattern for the conversion type
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# Conversion type characters, one per character of the string
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
a020a0dc 2814
7d1eb38a 2815
a020a0dc
PH
def limit_length(s, length):
    """ Add ellipses to overly long strings

    @param s       The string to shorten, or None
    @param length  Maximum length of the returned string
    @returns       None if s is None; s itself if it already fits; otherwise
                   s cut so that the result, ellipses included, is at most
                   `length` characters long
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length <= len(ELLIPSES):
        # No room for any text. Slicing s with a negative end index here
        # would return a string longer than the input
        return ELLIPSES[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
2824
2825
def version_tuple(v):
    """ Split a version string on "." and "-" into a tuple of ints.

    Raises ValueError if any component is not an integer.
    """
    return tuple(map(int, re.split(r'[-.]', v)))
48844745
PH
2828
2829
def is_outdated_version(version, limit, assume_new=True):
    """ Tell whether `version` is older than `limit`.

    If `version` is empty, or either version string cannot be parsed,
    the answer is `not assume_new`.
    """
    if version:
        try:
            return version_tuple(version) < version_tuple(limit)
        except ValueError:
            pass
    return not assume_new
732ea2f0
PH
2837
2838
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported inside the function rather than at module level
    from ..update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
7d4111ed
PH
2845
2846
def args_to_str(args):
    """ Get a short string representation for a subprocess command:
    every argument is quoted and the results are joined with spaces """
    quoted = map(compat_shlex_quote, args)
    return ' '.join(quoted)
2ccd1b10
PH
2850
2851
def error_to_str(err):
    """ Format an exception as "<ClassName>: <message>" """
    return '{}: {}'.format(type(err).__name__, err)
2854
2855
def mimetype2ext(mt, default=NO_DEFAULT):
    """ Guess a file extension from a MIME type.

    @param mt       MIME type string; parameters after ";" are ignored
    @param default  Value returned when the type is unknown or `mt` is not
                    a str. If omitted, a non-str `mt` gives None and an
                    unknown type gives its subtype with "+" replaced by "."
    """
    if not isinstance(mt, str):
        return None if default is NO_DEFAULT else default

    # Keys are a full mimetype, a subtype, or the part of a subtype after "+"
    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'video/ogg': 'ogv',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    mimetype = mt.partition(';')[0].strip().lower()
    subtype = mimetype.rpartition('/')[2]
    suffix = subtype.rsplit('+')[-1]

    # Candidate keys are passed from the most to the least specific
    ext = traversal.traverse_obj(MAP, mimetype, subtype, suffix)
    if ext:
        return ext
    if default is not NO_DEFAULT:
        return default
    return subtype.replace('+', '.')
c460bdd5
PH
2945
2946
2814f12b
THD
def ext2mimetype(ext_or_url):
    """ Guess a MIME type from a bare extension, a filename or a URL.

    @returns The type reported by mimetypes.guess_type, or None if the
             input is empty or the type cannot be guessed
    """
    if not ext_or_url:
        return None
    # A value without a dot is taken to be a bare extension
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mimetype, _ = mimetypes.guess_type(filename)
    return mimetype
2953
2954
def parse_codecs(codecs_str):
    """ Split a comma-separated codecs string into its video, audio and
    subtitle codecs. See http://tools.ietf.org/html/rfc6381

    @returns {} for empty input. Otherwise a dict with 'vcodec', 'acodec'
             ('none' if absent), 'dynamic_range' and, if a subtitle codec
             was found, 'scodec'. If nothing was recognized and exactly two
             codecs were given, they are returned as vcodec and acodec
    """
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                    'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    split_codecs = [
        codec for codec in map(str.strip, codecs_str.strip().strip(',').split(','))
        if codec]
    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Leading zeros of numeric components are dropped before comparing
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            if vcodec:
                # Only the first video codec is kept
                continue
            vcodec = full_codec
            if family in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif family == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            acodec = acodec or full_codec
        elif family in SUBTITLE_CODECS:
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if vcodec or acodec or scodec:
        result = {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
        if scodec is not None:
            result['scodec'] = scodec
        return result
    if len(split_codecs) == 2:
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
2995
2996
fc61aff4
LL
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """ Pick a container extension able to hold the given streams.

    @param vcodecs, vexts  Codec and extension of each video stream (same length)
    @param acodecs, aexts  Codec and extension of each audio stream (same length)
    @param preferences     Optional sequence of candidate extensions, tried in
                           order. "mkv" is allowed if this is empty or lists it
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    def sanitize_codec(codecs):
        # Uses only the first codec: the text before the first ".",
        # with every "0" removed, lower-cased
        return try_get(codecs, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())

    vcodec = sanitize_codec(vcodecs)
    acodec = sanitize_codec(acodecs)

    # First attempt: a container known to support both codecs
    for ext in preferences or COMPATIBLE_CODECS.keys():
        supported = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or supported.issuperset((vcodec, acodec)):
            return ext

    # Second attempt: decide from the extensions of the input streams
    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext}:
            return ext
        if any(ext_set.issuperset(current_exts) for ext_set in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
3036
3037
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """ Guess a file extension from the headers of a response.

    Tries, in order: the filename of an "attachment" Content-Disposition,
    the text after the last "." of the x-amz-meta-name header, and finally
    the Content-Type (through mimetype2ext, to which `default` is passed).
    """
    headers = url_handle.headers

    content_disposition = headers.get('Content-Disposition')
    if content_disposition:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', content_disposition)
        ext = determine_ext(m.group('filename'), default_ext=None) if m else None
        if ext:
            return ext

    meta_name = headers.get('x-amz-meta-name')
    if meta_name:
        ext = meta_name.rpartition('.')[2]
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'), default=default)
05900629
PH
3056
3057
1e399778
YCH
def encode_data_uri(data, mime_type):
    """ Build a base64 "data:" URI for the bytes `data` with the given MIME type """
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3060
3061
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    Nothing is blocked if no age limit is set (age_limit is None) or if the
    content has no limit (content_limit is None).
    """
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
3070
3071
# List of known byte-order-marks (BOM), as (marker bytes, codec name) pairs.
# The UTF-32 LE marker is listed before the UTF-16 LE one because it
# starts with the same two bytes
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
a904a7f8
L
3080
3081
61ca9a80
PH
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    @returns A match object (truthy) if the decoded text starts with "<"
             after optional whitespace, otherwise None
    """
    encoding = 'utf-8'
    for bom, enc in BOMS:
        # Strip every leading BOM; the last one stripped decides the codec
        while first_bytes.startswith(bom):
            encoding = enc
            first_bytes = first_bytes[len(bom):]

    text = first_bytes.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
a055469f
PH
3091
3092
def determine_protocol(info_dict):
    """ Work out the download protocol of a format.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived
    from the start of the URL, then from its extension, and finally from
    its scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3113
3114
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a tab character will be right aligned

    @param header_row  Row of column titles
    @param data        List of rows
    @param delim       If truthy, a string repeated to form a separator row
                       below the header
    @param extra_gap   Additional spaces between columns
    @param hide_empty  Drop the columns whose cells are all empty in `data`
    """
    def width(string):
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Columns beyond the end of filterArray are always kept
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # A maximum width of 0 is falsy, so it works as a "hide this column" flag
    column_filter = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, column_filter)
    data = [filter_using_list(row, column_filter) for row in data]

    max_lens = get_max_lens([header_row] + data)
    extra_gap += 1
    if delim:
        delim_row = [delim * (ml + extra_gap) for ml in max_lens]
        delim_row[-1] = delim_row[-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
        table = [header_row, delim_row] + data
    else:
        table = [header_row] + data

    lines = []
    for row in table:
        cells = []
        for pos, text in enumerate(map(str, row)):
            padding = max_lens[pos] - width(text)
            if '\t' in text:
                # The tab expands to the padding, right-aligning what follows
                cells.append(text.replace('\t', ' ' * padding) + ' ' * extra_gap)
            else:
                cells.append(text + ' ' * (padding + extra_gap))
        lines.append(''.join(cells).rstrip())
    return '\n'.join(lines)
347de493
PH
3145
3146
8f18aca8 3147def _match_one(filter_part, dct, incomplete):
77b87f05 3148 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3149 STRING_OPERATORS = {
3150 '*=': operator.contains,
3151 '^=': lambda attr, value: attr.startswith(value),
3152 '$=': lambda attr, value: attr.endswith(value),
3153 '~=': lambda attr, value: re.search(value, attr),
3154 }
347de493 3155 COMPARISON_OPERATORS = {
a047eeb6 3156 **STRING_OPERATORS,
3157 '<=': operator.le, # "<=" must be defined above "<"
347de493 3158 '<': operator.lt,
347de493 3159 '>=': operator.ge,
a047eeb6 3160 '>': operator.gt,
347de493 3161 '=': operator.eq,
347de493 3162 }
a047eeb6 3163
6db9c4d5 3164 if isinstance(incomplete, bool):
3165 is_incomplete = lambda _: incomplete
3166 else:
3167 is_incomplete = lambda k: k in incomplete
3168
64fa820c 3169 operator_rex = re.compile(r'''(?x)
347de493 3170 (?P<key>[a-z_]+)
77b87f05 3171 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3172 (?:
a047eeb6 3173 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3174 (?P<strval>.+?)
347de493 3175 )
347de493 3176 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3177 m = operator_rex.fullmatch(filter_part.strip())
347de493 3178 if m:
18f96d12 3179 m = m.groupdict()
3180 unnegated_op = COMPARISON_OPERATORS[m['op']]
3181 if m['negation']:
77b87f05
MT
3182 op = lambda attr, value: not unnegated_op(attr, value)
3183 else:
3184 op = unnegated_op
18f96d12 3185 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3186 if m['quote']:
3187 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3188 actual_value = dct.get(m['key'])
3189 numeric_comparison = None
f9934b96 3190 if isinstance(actual_value, (int, float)):
e5a088dc
S
3191 # If the original field is a string and matching comparisonvalue is
3192 # a number we should respect the origin of the original field
3193 # and process comparison value as a string (see
18f96d12 3194 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3195 try:
18f96d12 3196 numeric_comparison = int(comparison_value)
347de493 3197 except ValueError:
18f96d12 3198 numeric_comparison = parse_filesize(comparison_value)
3199 if numeric_comparison is None:
3200 numeric_comparison = parse_filesize(f'{comparison_value}B')
3201 if numeric_comparison is None:
3202 numeric_comparison = parse_duration(comparison_value)
3203 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3204 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3205 if actual_value is None:
6db9c4d5 3206 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3207 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3208
3209 UNARY_OPERATORS = {
1cc47c66
S
3210 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3211 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3212 }
64fa820c 3213 operator_rex = re.compile(r'''(?x)
347de493 3214 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3215 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3216 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3217 if m:
3218 op = UNARY_OPERATORS[m.group('op')]
3219 actual_value = dct.get(m.group('key'))
6db9c4d5 3220 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3221 return True
347de493
PH
3222 return op(actual_value)
3223
3224 raise ValueError('Invalid filter part %r' % filter_part)
3225
3226
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns           Whether the filter passes
    @param filter_str  Conditions joined by "&"; a literal "&" inside a
                       condition is written as "\\&"
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    # Split on every "&" that is not escaped by a backslash
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        if not _match_one(filter_part.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
347de493
PH
3237
3238
def match_filter_func(filters, breaking_filters=None):
    """ Build a match function from filter strings.

    @param filters           Filter string(s); an entry passes if any of them
                             matches. The special filter "-" turns on
                             interactive mode
    @param breaking_filters  Filter string(s) whose rejection raises
                             RejectedVideoReached
    @returns None if no filters are given at all. Otherwise a function
             (info_dict, incomplete=False) that returns None if the entry
             passes (NO_DEFAULT in interactive mode, when not incomplete),
             or a message explaining why it was skipped
    """
    if not (filters or breaking_filters):
        return None

    def _never_break(_, __):
        return None

    breaking_filters = match_filter_func(breaking_filters) or _never_break
    filters = set(variadic(filters or []))

    interactive = '-' in filters
    filters.discard('-')

    def _match_func(info_dict, incomplete=False):
        rejection = breaking_filters(info_dict, incomplete)
        if rejection is not None:
            raise RejectedVideoReached(rejection)

        if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
            return NO_DEFAULT if interactive and not incomplete else None

        video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
        filter_str = ') | ('.join(map(str.strip, filters))
        return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
91410c9b
PH
3261
3262
class download_range_func:
    """ Callable yielding the sections of a video that should be downloaded.

    @param chapters   Regexes matched against chapter titles
    @param ranges     (start, end) pairs in seconds. Negative values count
                      from the end if the duration is known
    @param from_info  Also use 'start_time'/'end_time' of the info dict
    """

    def __init__(self, chapters, ranges, from_info=False):
        self.chapters, self.ranges, self.from_info = chapters, ranges, from_info

    def __call__(self, info_dict, ydl):
        """ Yield one dict per section. A single empty dict means the whole
        video: it is yielded if there are neither ranges nor chapters and
        from_info did not apply """

        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        for regex in self.chapters or []:
            for i, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': i}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # Negative times are relative to the end, clamped at 0. They are
        # left untouched if the duration is unknown
        return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time

    def __eq__(self, other):
        # from_info changes what __call__ yields, so it is part of the identity
        return (isinstance(other, download_range_func)
                and self.chapters == other.chapters and self.ranges == other.ranges
                and self.from_info == other.from_info)

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges}, {self.from_info})'
71df9b7f 3303
5ec1b6b7 3304
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """ Convert a DFXP/TTML time expression into seconds.

    Understands a plain number with an optional "s" suffix, and clock
    times of the form H:MM:SS, H:MM:SS.fraction or H:MM:SS:fraction.
    @returns The time as a float, or None if the expression is empty or
             not recognized
    """
    if not time_expr:
        return None

    offset = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        # A ":" before the fraction is treated like a decimal point
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
    return None
bf6427d2
YCH
3316
3317
def srt_subtitles_timecode(seconds):
    """ Format a time in seconds as an SRT timecode (HH:MM:SS,mmm) """
    timetuple = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % timetuple
3320
3321
def ass_subtitles_timecode(seconds):
    """ Format a time in seconds as an ASS timecode (H:MM:SS.cc) """
    timetuple = timetuple_from_msec(seconds * 1000)
    # The last field is in hundredths of a second; %d drops the fraction
    centiseconds = timetuple.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], centiseconds)
bf6427d2
YCH
3325
3326
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT. The supported styling attributes
    are rendered as <font>, <b>, <i> and <u> tags.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError If the document contains no <p> elements
    '''
    # (current namespace, [legacy spellings]). Order matters: rewriting the
    # ttaf1 namespaces first is what produces the "ttml#style" spelling
    # that the second entry then rewrites
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> {property: value}, inherited properties included
    default_style = {}  # properties taken from the <body>/<div> styles

    class TTMLPElementParser:
        # NOTE(review): the two lists below are class attributes, so every
        # parser created during one dfxp2srt() call shares them. Entries of
        # _applied_styles are only popped for elements that opened a tag
        # (see end()). Left unchanged; confirm whether this is intended
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Precedence: default style < referenced style < inline attributes
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip what the innermost recorded style already sets
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Serialize the node again and feed it to the styling-aware parser
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance. A style may be declared before its parent,
    # so several passes can be needed. A pass that learns nothing new cannot
    # be followed by one that does, and every productive pass adds at least
    # one style id, so len(style_nodes) + 1 passes always suffice. The bound
    # stops styles with a missing, cyclic or property-less parent from
    # looping forever; such styles are simply left unresolved
    style_nodes = dfxp.findall(_x('.//ttml:style'))
    for _ in range(len(style_nodes) + 1):
        repeat = False
        for style in style_nodes:
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if not repeat:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Fall back to begin + duration; skip cues with neither
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3493
3494
def cli_option(params, command_option, param, separator=None):
    """ Build the command line arguments for an option that takes a value.

    @param params          Mapping holding the option values
    @param command_option  Name of the option on the command line
    @param param           Key of `params` to read
    @param separator       If given, option and value are joined with it
                           into a single argument
    @returns [] if the value is None, else a list of 1 or 2 arguments
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
66e289ba
S
3500
3501
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """ Build the command line arguments for a boolean option.

    The value of params[param] must be True, False or None. It is turned
    into `true_value` / `false_value` and formatted by cli_option.
    @returns [] if the value is None or missing
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    rendered_values = {True: true_value, False: false_value}
    return cli_option(rendered_values, command_option, flag, separator)
66e289ba
S
3506
3507
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Return [command_option] if params[param] equals expected_value, else [] """
    if params.get(param) == expected_value:
        return [command_option]
    return []
66e289ba
S
3510
3511
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """ Look up extra command line arguments in an argument dictionary.

    @param argdict     Dict mapping lower-case keys to argument lists. For
                       backward compatibility a plain list/tuple is also
                       accepted; it is returned as-is if use_compat is set
                       and ignored otherwise
    @param keys        List/tuple of keys, tried in order. An entry may
                       itself be a group of keys, whose argument lists are
                       then concatenated
    @param default     Returned (the same object, not a copy) if nothing matches
    @returns           The arguments of the first entry of `keys` that has any
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        found = [argdict.get(key.lower()) for key in variadic(key_list)]
        arg_list = [args for args in found if args is not None]
        if arg_list:
            return [arg for args in arg_list for arg in args]
    return default
66e289ba 3530
6251555f 3531
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """ Build the lookup keys for `main_key`/`exe` and fetch the matching
    arguments with cli_configuration_args.

    The root key is "<exe>" if main_key equals exe, else "<main_key>+<exe>";
    each entry of `keys` is appended to it as a suffix. If the bare root key
    is among the results, the (main_key, exe) group (if the two differ) and
    "default" are added as fallbacks. Otherwise use_compat is turned off.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in keys:
        use_compat = False
    else:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    return cli_configuration_args(argdict, keys, default, use_compat)
3543
66e289ba 3544
86e5f3ed 3545class ISO639Utils:
39672624
YCH
3546 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3547 _lang_map = {
3548 'aa': 'aar',
3549 'ab': 'abk',
3550 'ae': 'ave',
3551 'af': 'afr',
3552 'ak': 'aka',
3553 'am': 'amh',
3554 'an': 'arg',
3555 'ar': 'ara',
3556 'as': 'asm',
3557 'av': 'ava',
3558 'ay': 'aym',
3559 'az': 'aze',
3560 'ba': 'bak',
3561 'be': 'bel',
3562 'bg': 'bul',
3563 'bh': 'bih',
3564 'bi': 'bis',
3565 'bm': 'bam',
3566 'bn': 'ben',
3567 'bo': 'bod',
3568 'br': 'bre',
3569 'bs': 'bos',
3570 'ca': 'cat',
3571 'ce': 'che',
3572 'ch': 'cha',
3573 'co': 'cos',
3574 'cr': 'cre',
3575 'cs': 'ces',
3576 'cu': 'chu',
3577 'cv': 'chv',
3578 'cy': 'cym',
3579 'da': 'dan',
3580 'de': 'deu',
3581 'dv': 'div',
3582 'dz': 'dzo',
3583 'ee': 'ewe',
3584 'el': 'ell',
3585 'en': 'eng',
3586 'eo': 'epo',
3587 'es': 'spa',
3588 'et': 'est',
3589 'eu': 'eus',
3590 'fa': 'fas',
3591 'ff': 'ful',
3592 'fi': 'fin',
3593 'fj': 'fij',
3594 'fo': 'fao',
3595 'fr': 'fra',
3596 'fy': 'fry',
3597 'ga': 'gle',
3598 'gd': 'gla',
3599 'gl': 'glg',
3600 'gn': 'grn',
3601 'gu': 'guj',
3602 'gv': 'glv',
3603 'ha': 'hau',
3604 'he': 'heb',
b7acc835 3605 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
3606 'hi': 'hin',
3607 'ho': 'hmo',
3608 'hr': 'hrv',
3609 'ht': 'hat',
3610 'hu': 'hun',
3611 'hy': 'hye',
3612 'hz': 'her',
3613 'ia': 'ina',
3614 'id': 'ind',
b7acc835 3615 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
3616 'ie': 'ile',
3617 'ig': 'ibo',
3618 'ii': 'iii',
3619 'ik': 'ipk',
3620 'io': 'ido',
3621 'is': 'isl',
3622 'it': 'ita',
3623 'iu': 'iku',
3624 'ja': 'jpn',
3625 'jv': 'jav',
3626 'ka': 'kat',
3627 'kg': 'kon',
3628 'ki': 'kik',
3629 'kj': 'kua',
3630 'kk': 'kaz',
3631 'kl': 'kal',
3632 'km': 'khm',
3633 'kn': 'kan',
3634 'ko': 'kor',
3635 'kr': 'kau',
3636 'ks': 'kas',
3637 'ku': 'kur',
3638 'kv': 'kom',
3639 'kw': 'cor',
3640 'ky': 'kir',
3641 'la': 'lat',
3642 'lb': 'ltz',
3643 'lg': 'lug',
3644 'li': 'lim',
3645 'ln': 'lin',
3646 'lo': 'lao',
3647 'lt': 'lit',
3648 'lu': 'lub',
3649 'lv': 'lav',
3650 'mg': 'mlg',
3651 'mh': 'mah',
3652 'mi': 'mri',
3653 'mk': 'mkd',
3654 'ml': 'mal',
3655 'mn': 'mon',
3656 'mr': 'mar',
3657 'ms': 'msa',
3658 'mt': 'mlt',
3659 'my': 'mya',
3660 'na': 'nau',
3661 'nb': 'nob',
3662 'nd': 'nde',
3663 'ne': 'nep',
3664 'ng': 'ndo',
3665 'nl': 'nld',
3666 'nn': 'nno',
3667 'no': 'nor',
3668 'nr': 'nbl',
3669 'nv': 'nav',
3670 'ny': 'nya',
3671 'oc': 'oci',
3672 'oj': 'oji',
3673 'om': 'orm',
3674 'or': 'ori',
3675 'os': 'oss',
3676 'pa': 'pan',
7bcd4813 3677 'pe': 'per',
39672624
YCH
3678 'pi': 'pli',
3679 'pl': 'pol',
3680 'ps': 'pus',
3681 'pt': 'por',
3682 'qu': 'que',
3683 'rm': 'roh',
3684 'rn': 'run',
3685 'ro': 'ron',
3686 'ru': 'rus',
3687 'rw': 'kin',
3688 'sa': 'san',
3689 'sc': 'srd',
3690 'sd': 'snd',
3691 'se': 'sme',
3692 'sg': 'sag',
3693 'si': 'sin',
3694 'sk': 'slk',
3695 'sl': 'slv',
3696 'sm': 'smo',
3697 'sn': 'sna',
3698 'so': 'som',
3699 'sq': 'sqi',
3700 'sr': 'srp',
3701 'ss': 'ssw',
3702 'st': 'sot',
3703 'su': 'sun',
3704 'sv': 'swe',
3705 'sw': 'swa',
3706 'ta': 'tam',
3707 'te': 'tel',
3708 'tg': 'tgk',
3709 'th': 'tha',
3710 'ti': 'tir',
3711 'tk': 'tuk',
3712 'tl': 'tgl',
3713 'tn': 'tsn',
3714 'to': 'ton',
3715 'tr': 'tur',
3716 'ts': 'tso',
3717 'tt': 'tat',
3718 'tw': 'twi',
3719 'ty': 'tah',
3720 'ug': 'uig',
3721 'uk': 'ukr',
3722 'ur': 'urd',
3723 'uz': 'uzb',
3724 've': 'ven',
3725 'vi': 'vie',
3726 'vo': 'vol',
3727 'wa': 'wln',
3728 'wo': 'wol',
3729 'xh': 'xho',
3730 'yi': 'yid',
e9a50fba 3731 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
3732 'yo': 'yor',
3733 'za': 'zha',
3734 'zh': 'zho',
3735 'zu': 'zul',
3736 }
3737
3738 @classmethod
3739 def short2long(cls, code):
3740 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3741 return cls._lang_map.get(code[:2])
3742
3743 @classmethod
3744 def long2short(cls, code):
3745 """Convert language code from ISO 639-2/T to ISO 639-1"""
3746 for short_name, long_name in cls._lang_map.items():
3747 if long_name == code:
3748 return short_name
3749
3750
86e5f3ed 3751class ISO3166Utils:
4eb10f66
YCH
3752 # From http://data.okfn.org/data/core/country-list
3753 _country_map = {
3754 'AF': 'Afghanistan',
3755 'AX': 'Åland Islands',
3756 'AL': 'Albania',
3757 'DZ': 'Algeria',
3758 'AS': 'American Samoa',
3759 'AD': 'Andorra',
3760 'AO': 'Angola',
3761 'AI': 'Anguilla',
3762 'AQ': 'Antarctica',
3763 'AG': 'Antigua and Barbuda',
3764 'AR': 'Argentina',
3765 'AM': 'Armenia',
3766 'AW': 'Aruba',
3767 'AU': 'Australia',
3768 'AT': 'Austria',
3769 'AZ': 'Azerbaijan',
3770 'BS': 'Bahamas',
3771 'BH': 'Bahrain',
3772 'BD': 'Bangladesh',
3773 'BB': 'Barbados',
3774 'BY': 'Belarus',
3775 'BE': 'Belgium',
3776 'BZ': 'Belize',
3777 'BJ': 'Benin',
3778 'BM': 'Bermuda',
3779 'BT': 'Bhutan',
3780 'BO': 'Bolivia, Plurinational State of',
3781 'BQ': 'Bonaire, Sint Eustatius and Saba',
3782 'BA': 'Bosnia and Herzegovina',
3783 'BW': 'Botswana',
3784 'BV': 'Bouvet Island',
3785 'BR': 'Brazil',
3786 'IO': 'British Indian Ocean Territory',
3787 'BN': 'Brunei Darussalam',
3788 'BG': 'Bulgaria',
3789 'BF': 'Burkina Faso',
3790 'BI': 'Burundi',
3791 'KH': 'Cambodia',
3792 'CM': 'Cameroon',
3793 'CA': 'Canada',
3794 'CV': 'Cape Verde',
3795 'KY': 'Cayman Islands',
3796 'CF': 'Central African Republic',
3797 'TD': 'Chad',
3798 'CL': 'Chile',
3799 'CN': 'China',
3800 'CX': 'Christmas Island',
3801 'CC': 'Cocos (Keeling) Islands',
3802 'CO': 'Colombia',
3803 'KM': 'Comoros',
3804 'CG': 'Congo',
3805 'CD': 'Congo, the Democratic Republic of the',
3806 'CK': 'Cook Islands',
3807 'CR': 'Costa Rica',
3808 'CI': 'Côte d\'Ivoire',
3809 'HR': 'Croatia',
3810 'CU': 'Cuba',
3811 'CW': 'Curaçao',
3812 'CY': 'Cyprus',
3813 'CZ': 'Czech Republic',
3814 'DK': 'Denmark',
3815 'DJ': 'Djibouti',
3816 'DM': 'Dominica',
3817 'DO': 'Dominican Republic',
3818 'EC': 'Ecuador',
3819 'EG': 'Egypt',
3820 'SV': 'El Salvador',
3821 'GQ': 'Equatorial Guinea',
3822 'ER': 'Eritrea',
3823 'EE': 'Estonia',
3824 'ET': 'Ethiopia',
3825 'FK': 'Falkland Islands (Malvinas)',
3826 'FO': 'Faroe Islands',
3827 'FJ': 'Fiji',
3828 'FI': 'Finland',
3829 'FR': 'France',
3830 'GF': 'French Guiana',
3831 'PF': 'French Polynesia',
3832 'TF': 'French Southern Territories',
3833 'GA': 'Gabon',
3834 'GM': 'Gambia',
3835 'GE': 'Georgia',
3836 'DE': 'Germany',
3837 'GH': 'Ghana',
3838 'GI': 'Gibraltar',
3839 'GR': 'Greece',
3840 'GL': 'Greenland',
3841 'GD': 'Grenada',
3842 'GP': 'Guadeloupe',
3843 'GU': 'Guam',
3844 'GT': 'Guatemala',
3845 'GG': 'Guernsey',
3846 'GN': 'Guinea',
3847 'GW': 'Guinea-Bissau',
3848 'GY': 'Guyana',
3849 'HT': 'Haiti',
3850 'HM': 'Heard Island and McDonald Islands',
3851 'VA': 'Holy See (Vatican City State)',
3852 'HN': 'Honduras',
3853 'HK': 'Hong Kong',
3854 'HU': 'Hungary',
3855 'IS': 'Iceland',
3856 'IN': 'India',
3857 'ID': 'Indonesia',
3858 'IR': 'Iran, Islamic Republic of',
3859 'IQ': 'Iraq',
3860 'IE': 'Ireland',
3861 'IM': 'Isle of Man',
3862 'IL': 'Israel',
3863 'IT': 'Italy',
3864 'JM': 'Jamaica',
3865 'JP': 'Japan',
3866 'JE': 'Jersey',
3867 'JO': 'Jordan',
3868 'KZ': 'Kazakhstan',
3869 'KE': 'Kenya',
3870 'KI': 'Kiribati',
3871 'KP': 'Korea, Democratic People\'s Republic of',
3872 'KR': 'Korea, Republic of',
3873 'KW': 'Kuwait',
3874 'KG': 'Kyrgyzstan',
3875 'LA': 'Lao People\'s Democratic Republic',
3876 'LV': 'Latvia',
3877 'LB': 'Lebanon',
3878 'LS': 'Lesotho',
3879 'LR': 'Liberia',
3880 'LY': 'Libya',
3881 'LI': 'Liechtenstein',
3882 'LT': 'Lithuania',
3883 'LU': 'Luxembourg',
3884 'MO': 'Macao',
3885 'MK': 'Macedonia, the Former Yugoslav Republic of',
3886 'MG': 'Madagascar',
3887 'MW': 'Malawi',
3888 'MY': 'Malaysia',
3889 'MV': 'Maldives',
3890 'ML': 'Mali',
3891 'MT': 'Malta',
3892 'MH': 'Marshall Islands',
3893 'MQ': 'Martinique',
3894 'MR': 'Mauritania',
3895 'MU': 'Mauritius',
3896 'YT': 'Mayotte',
3897 'MX': 'Mexico',
3898 'FM': 'Micronesia, Federated States of',
3899 'MD': 'Moldova, Republic of',
3900 'MC': 'Monaco',
3901 'MN': 'Mongolia',
3902 'ME': 'Montenegro',
3903 'MS': 'Montserrat',
3904 'MA': 'Morocco',
3905 'MZ': 'Mozambique',
3906 'MM': 'Myanmar',
3907 'NA': 'Namibia',
3908 'NR': 'Nauru',
3909 'NP': 'Nepal',
3910 'NL': 'Netherlands',
3911 'NC': 'New Caledonia',
3912 'NZ': 'New Zealand',
3913 'NI': 'Nicaragua',
3914 'NE': 'Niger',
3915 'NG': 'Nigeria',
3916 'NU': 'Niue',
3917 'NF': 'Norfolk Island',
3918 'MP': 'Northern Mariana Islands',
3919 'NO': 'Norway',
3920 'OM': 'Oman',
3921 'PK': 'Pakistan',
3922 'PW': 'Palau',
3923 'PS': 'Palestine, State of',
3924 'PA': 'Panama',
3925 'PG': 'Papua New Guinea',
3926 'PY': 'Paraguay',
3927 'PE': 'Peru',
3928 'PH': 'Philippines',
3929 'PN': 'Pitcairn',
3930 'PL': 'Poland',
3931 'PT': 'Portugal',
3932 'PR': 'Puerto Rico',
3933 'QA': 'Qatar',
3934 'RE': 'Réunion',
3935 'RO': 'Romania',
3936 'RU': 'Russian Federation',
3937 'RW': 'Rwanda',
3938 'BL': 'Saint Barthélemy',
3939 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3940 'KN': 'Saint Kitts and Nevis',
3941 'LC': 'Saint Lucia',
3942 'MF': 'Saint Martin (French part)',
3943 'PM': 'Saint Pierre and Miquelon',
3944 'VC': 'Saint Vincent and the Grenadines',
3945 'WS': 'Samoa',
3946 'SM': 'San Marino',
3947 'ST': 'Sao Tome and Principe',
3948 'SA': 'Saudi Arabia',
3949 'SN': 'Senegal',
3950 'RS': 'Serbia',
3951 'SC': 'Seychelles',
3952 'SL': 'Sierra Leone',
3953 'SG': 'Singapore',
3954 'SX': 'Sint Maarten (Dutch part)',
3955 'SK': 'Slovakia',
3956 'SI': 'Slovenia',
3957 'SB': 'Solomon Islands',
3958 'SO': 'Somalia',
3959 'ZA': 'South Africa',
3960 'GS': 'South Georgia and the South Sandwich Islands',
3961 'SS': 'South Sudan',
3962 'ES': 'Spain',
3963 'LK': 'Sri Lanka',
3964 'SD': 'Sudan',
3965 'SR': 'Suriname',
3966 'SJ': 'Svalbard and Jan Mayen',
3967 'SZ': 'Swaziland',
3968 'SE': 'Sweden',
3969 'CH': 'Switzerland',
3970 'SY': 'Syrian Arab Republic',
3971 'TW': 'Taiwan, Province of China',
3972 'TJ': 'Tajikistan',
3973 'TZ': 'Tanzania, United Republic of',
3974 'TH': 'Thailand',
3975 'TL': 'Timor-Leste',
3976 'TG': 'Togo',
3977 'TK': 'Tokelau',
3978 'TO': 'Tonga',
3979 'TT': 'Trinidad and Tobago',
3980 'TN': 'Tunisia',
3981 'TR': 'Turkey',
3982 'TM': 'Turkmenistan',
3983 'TC': 'Turks and Caicos Islands',
3984 'TV': 'Tuvalu',
3985 'UG': 'Uganda',
3986 'UA': 'Ukraine',
3987 'AE': 'United Arab Emirates',
3988 'GB': 'United Kingdom',
3989 'US': 'United States',
3990 'UM': 'United States Minor Outlying Islands',
3991 'UY': 'Uruguay',
3992 'UZ': 'Uzbekistan',
3993 'VU': 'Vanuatu',
3994 'VE': 'Venezuela, Bolivarian Republic of',
3995 'VN': 'Viet Nam',
3996 'VG': 'Virgin Islands, British',
3997 'VI': 'Virgin Islands, U.S.',
3998 'WF': 'Wallis and Futuna',
3999 'EH': 'Western Sahara',
4000 'YE': 'Yemen',
4001 'ZM': 'Zambia',
4002 'ZW': 'Zimbabwe',
2f97cc61 4003 # Not ISO 3166 codes, but used for IP blocks
4004 'AP': 'Asia/Pacific Region',
4005 'EU': 'Europe',
4eb10f66
YCH
4006 }
4007
4008 @classmethod
4009 def short2full(cls, code):
4010 """Convert an ISO 3166-2 country code to the corresponding full name"""
4011 return cls._country_map.get(code.upper())
4012
4013
86e5f3ed 4014class GeoUtils:
773f291d
S
4015 # Major IPv4 address blocks per country
4016 _country_ip_map = {
53896ca5 4017 'AD': '46.172.224.0/19',
773f291d
S
4018 'AE': '94.200.0.0/13',
4019 'AF': '149.54.0.0/17',
4020 'AG': '209.59.64.0/18',
4021 'AI': '204.14.248.0/21',
4022 'AL': '46.99.0.0/16',
4023 'AM': '46.70.0.0/15',
4024 'AO': '105.168.0.0/13',
53896ca5
S
4025 'AP': '182.50.184.0/21',
4026 'AQ': '23.154.160.0/24',
773f291d
S
4027 'AR': '181.0.0.0/12',
4028 'AS': '202.70.112.0/20',
53896ca5 4029 'AT': '77.116.0.0/14',
773f291d
S
4030 'AU': '1.128.0.0/11',
4031 'AW': '181.41.0.0/18',
53896ca5
S
4032 'AX': '185.217.4.0/22',
4033 'AZ': '5.197.0.0/16',
773f291d
S
4034 'BA': '31.176.128.0/17',
4035 'BB': '65.48.128.0/17',
4036 'BD': '114.130.0.0/16',
4037 'BE': '57.0.0.0/8',
53896ca5 4038 'BF': '102.178.0.0/15',
773f291d
S
4039 'BG': '95.42.0.0/15',
4040 'BH': '37.131.0.0/17',
4041 'BI': '154.117.192.0/18',
4042 'BJ': '137.255.0.0/16',
53896ca5 4043 'BL': '185.212.72.0/23',
773f291d
S
4044 'BM': '196.12.64.0/18',
4045 'BN': '156.31.0.0/16',
4046 'BO': '161.56.0.0/16',
4047 'BQ': '161.0.80.0/20',
53896ca5 4048 'BR': '191.128.0.0/12',
773f291d
S
4049 'BS': '24.51.64.0/18',
4050 'BT': '119.2.96.0/19',
4051 'BW': '168.167.0.0/16',
4052 'BY': '178.120.0.0/13',
4053 'BZ': '179.42.192.0/18',
4054 'CA': '99.224.0.0/11',
4055 'CD': '41.243.0.0/16',
53896ca5
S
4056 'CF': '197.242.176.0/21',
4057 'CG': '160.113.0.0/16',
773f291d 4058 'CH': '85.0.0.0/13',
53896ca5 4059 'CI': '102.136.0.0/14',
773f291d
S
4060 'CK': '202.65.32.0/19',
4061 'CL': '152.172.0.0/14',
53896ca5 4062 'CM': '102.244.0.0/14',
773f291d
S
4063 'CN': '36.128.0.0/10',
4064 'CO': '181.240.0.0/12',
4065 'CR': '201.192.0.0/12',
4066 'CU': '152.206.0.0/15',
4067 'CV': '165.90.96.0/19',
4068 'CW': '190.88.128.0/17',
53896ca5 4069 'CY': '31.153.0.0/16',
773f291d
S
4070 'CZ': '88.100.0.0/14',
4071 'DE': '53.0.0.0/8',
4072 'DJ': '197.241.0.0/17',
4073 'DK': '87.48.0.0/12',
4074 'DM': '192.243.48.0/20',
4075 'DO': '152.166.0.0/15',
4076 'DZ': '41.96.0.0/12',
4077 'EC': '186.68.0.0/15',
4078 'EE': '90.190.0.0/15',
4079 'EG': '156.160.0.0/11',
4080 'ER': '196.200.96.0/20',
4081 'ES': '88.0.0.0/11',
4082 'ET': '196.188.0.0/14',
4083 'EU': '2.16.0.0/13',
4084 'FI': '91.152.0.0/13',
4085 'FJ': '144.120.0.0/16',
53896ca5 4086 'FK': '80.73.208.0/21',
773f291d
S
4087 'FM': '119.252.112.0/20',
4088 'FO': '88.85.32.0/19',
4089 'FR': '90.0.0.0/9',
4090 'GA': '41.158.0.0/15',
4091 'GB': '25.0.0.0/8',
4092 'GD': '74.122.88.0/21',
4093 'GE': '31.146.0.0/16',
4094 'GF': '161.22.64.0/18',
4095 'GG': '62.68.160.0/19',
53896ca5
S
4096 'GH': '154.160.0.0/12',
4097 'GI': '95.164.0.0/16',
773f291d
S
4098 'GL': '88.83.0.0/19',
4099 'GM': '160.182.0.0/15',
4100 'GN': '197.149.192.0/18',
4101 'GP': '104.250.0.0/19',
4102 'GQ': '105.235.224.0/20',
4103 'GR': '94.64.0.0/13',
4104 'GT': '168.234.0.0/16',
4105 'GU': '168.123.0.0/16',
4106 'GW': '197.214.80.0/20',
4107 'GY': '181.41.64.0/18',
4108 'HK': '113.252.0.0/14',
4109 'HN': '181.210.0.0/16',
4110 'HR': '93.136.0.0/13',
4111 'HT': '148.102.128.0/17',
4112 'HU': '84.0.0.0/14',
4113 'ID': '39.192.0.0/10',
4114 'IE': '87.32.0.0/12',
4115 'IL': '79.176.0.0/13',
4116 'IM': '5.62.80.0/20',
4117 'IN': '117.192.0.0/10',
4118 'IO': '203.83.48.0/21',
4119 'IQ': '37.236.0.0/14',
4120 'IR': '2.176.0.0/12',
4121 'IS': '82.221.0.0/16',
4122 'IT': '79.0.0.0/10',
4123 'JE': '87.244.64.0/18',
4124 'JM': '72.27.0.0/17',
4125 'JO': '176.29.0.0/16',
53896ca5 4126 'JP': '133.0.0.0/8',
773f291d
S
4127 'KE': '105.48.0.0/12',
4128 'KG': '158.181.128.0/17',
4129 'KH': '36.37.128.0/17',
4130 'KI': '103.25.140.0/22',
4131 'KM': '197.255.224.0/20',
53896ca5 4132 'KN': '198.167.192.0/19',
773f291d
S
4133 'KP': '175.45.176.0/22',
4134 'KR': '175.192.0.0/10',
4135 'KW': '37.36.0.0/14',
4136 'KY': '64.96.0.0/15',
4137 'KZ': '2.72.0.0/13',
4138 'LA': '115.84.64.0/18',
4139 'LB': '178.135.0.0/16',
53896ca5 4140 'LC': '24.92.144.0/20',
773f291d
S
4141 'LI': '82.117.0.0/19',
4142 'LK': '112.134.0.0/15',
53896ca5 4143 'LR': '102.183.0.0/16',
773f291d
S
4144 'LS': '129.232.0.0/17',
4145 'LT': '78.56.0.0/13',
4146 'LU': '188.42.0.0/16',
4147 'LV': '46.109.0.0/16',
4148 'LY': '41.252.0.0/14',
4149 'MA': '105.128.0.0/11',
4150 'MC': '88.209.64.0/18',
4151 'MD': '37.246.0.0/16',
4152 'ME': '178.175.0.0/17',
4153 'MF': '74.112.232.0/21',
4154 'MG': '154.126.0.0/17',
4155 'MH': '117.103.88.0/21',
4156 'MK': '77.28.0.0/15',
4157 'ML': '154.118.128.0/18',
4158 'MM': '37.111.0.0/17',
4159 'MN': '49.0.128.0/17',
4160 'MO': '60.246.0.0/16',
4161 'MP': '202.88.64.0/20',
4162 'MQ': '109.203.224.0/19',
4163 'MR': '41.188.64.0/18',
4164 'MS': '208.90.112.0/22',
4165 'MT': '46.11.0.0/16',
4166 'MU': '105.16.0.0/12',
4167 'MV': '27.114.128.0/18',
53896ca5 4168 'MW': '102.70.0.0/15',
773f291d
S
4169 'MX': '187.192.0.0/11',
4170 'MY': '175.136.0.0/13',
4171 'MZ': '197.218.0.0/15',
4172 'NA': '41.182.0.0/16',
4173 'NC': '101.101.0.0/18',
4174 'NE': '197.214.0.0/18',
4175 'NF': '203.17.240.0/22',
4176 'NG': '105.112.0.0/12',
4177 'NI': '186.76.0.0/15',
4178 'NL': '145.96.0.0/11',
4179 'NO': '84.208.0.0/13',
4180 'NP': '36.252.0.0/15',
4181 'NR': '203.98.224.0/19',
4182 'NU': '49.156.48.0/22',
4183 'NZ': '49.224.0.0/14',
4184 'OM': '5.36.0.0/15',
4185 'PA': '186.72.0.0/15',
4186 'PE': '186.160.0.0/14',
4187 'PF': '123.50.64.0/18',
4188 'PG': '124.240.192.0/19',
4189 'PH': '49.144.0.0/13',
4190 'PK': '39.32.0.0/11',
4191 'PL': '83.0.0.0/11',
4192 'PM': '70.36.0.0/20',
4193 'PR': '66.50.0.0/16',
4194 'PS': '188.161.0.0/16',
4195 'PT': '85.240.0.0/13',
4196 'PW': '202.124.224.0/20',
4197 'PY': '181.120.0.0/14',
4198 'QA': '37.210.0.0/15',
53896ca5 4199 'RE': '102.35.0.0/16',
773f291d 4200 'RO': '79.112.0.0/13',
53896ca5 4201 'RS': '93.86.0.0/15',
773f291d 4202 'RU': '5.136.0.0/13',
53896ca5 4203 'RW': '41.186.0.0/16',
773f291d
S
4204 'SA': '188.48.0.0/13',
4205 'SB': '202.1.160.0/19',
4206 'SC': '154.192.0.0/11',
53896ca5 4207 'SD': '102.120.0.0/13',
773f291d 4208 'SE': '78.64.0.0/12',
53896ca5 4209 'SG': '8.128.0.0/10',
773f291d
S
4210 'SI': '188.196.0.0/14',
4211 'SK': '78.98.0.0/15',
53896ca5 4212 'SL': '102.143.0.0/17',
773f291d
S
4213 'SM': '89.186.32.0/19',
4214 'SN': '41.82.0.0/15',
53896ca5 4215 'SO': '154.115.192.0/18',
773f291d
S
4216 'SR': '186.179.128.0/17',
4217 'SS': '105.235.208.0/21',
4218 'ST': '197.159.160.0/19',
4219 'SV': '168.243.0.0/16',
4220 'SX': '190.102.0.0/20',
4221 'SY': '5.0.0.0/16',
4222 'SZ': '41.84.224.0/19',
4223 'TC': '65.255.48.0/20',
4224 'TD': '154.68.128.0/19',
4225 'TG': '196.168.0.0/14',
4226 'TH': '171.96.0.0/13',
4227 'TJ': '85.9.128.0/18',
4228 'TK': '27.96.24.0/21',
4229 'TL': '180.189.160.0/20',
4230 'TM': '95.85.96.0/19',
4231 'TN': '197.0.0.0/11',
4232 'TO': '175.176.144.0/21',
4233 'TR': '78.160.0.0/11',
4234 'TT': '186.44.0.0/15',
4235 'TV': '202.2.96.0/19',
4236 'TW': '120.96.0.0/11',
4237 'TZ': '156.156.0.0/14',
53896ca5
S
4238 'UA': '37.52.0.0/14',
4239 'UG': '102.80.0.0/13',
4240 'US': '6.0.0.0/8',
773f291d 4241 'UY': '167.56.0.0/13',
53896ca5 4242 'UZ': '84.54.64.0/18',
773f291d 4243 'VA': '212.77.0.0/19',
53896ca5 4244 'VC': '207.191.240.0/21',
773f291d 4245 'VE': '186.88.0.0/13',
53896ca5 4246 'VG': '66.81.192.0/20',
773f291d
S
4247 'VI': '146.226.0.0/16',
4248 'VN': '14.160.0.0/11',
4249 'VU': '202.80.32.0/20',
4250 'WF': '117.20.32.0/21',
4251 'WS': '202.4.32.0/19',
4252 'YE': '134.35.0.0/16',
4253 'YT': '41.242.116.0/22',
4254 'ZA': '41.0.0.0/11',
53896ca5
S
4255 'ZM': '102.144.0.0/13',
4256 'ZW': '102.177.192.0/18',
773f291d
S
4257 }
4258
4259 @classmethod
5f95927a
S
4260 def random_ipv4(cls, code_or_block):
4261 if len(code_or_block) == 2:
4262 block = cls._country_ip_map.get(code_or_block.upper())
4263 if not block:
4264 return None
4265 else:
4266 block = code_or_block
773f291d 4267 addr, preflen = block.split('/')
ac668111 4268 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4269 addr_max = addr_min | (0xffffffff >> int(preflen))
14f25df2 4270 return str(socket.inet_ntoa(
ac668111 4271 struct.pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4272
4273
0a5445dd
YCH
4274# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4275# released into Public Domain
4276# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4277
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert an integer to its big-endian byte string.

    Leading zero bytes are removed; zero (and any non-positive value) yields a
    single zero byte. If blocksize is greater than zero, the result is
    left-padded with zero bytes to a multiple of blocksize.
    """
    n = int(n)
    words = []
    while n > 0:
        # Collect 32-bit words, least significant first
        words.append(struct.pack('>I', n & 0xffffffff))
        n >>= 32
    # Most significant word first, without leading zeros; never empty
    s = b''.join(reversed(words)).lstrip(b'\000') or b'\000'

    if blocksize > 0:
        remainder = len(s) % blocksize
        if remainder:
            s = b'\000' * (blocksize - remainder) + s
    return s
4306
4307
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Interpret a byte string as a big-endian unsigned integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # Left-pad with zero bytes so the data splits into whole 32-bit words
    padding = -len(s) % 4
    if padding:
        s = b'\000' * padding + s

    value = 0
    for offset in range(0, len(s), 4):
        (word,) = struct.unpack('>I', s[offset:offset + 4])
        value = (value << 32) + word
    return value
4323
4324
5bc880b9
YCH
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    The bytes of `data` are reversed and read as one big hexadecimal number,
    which is then raised to `exponent` modulo `modulus`.

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    reversed_hex = binascii.hexlify(data[::-1])
    message = int(reversed_hex, 16)
    return f'{pow(message, exponent, modulus):x}'
81bdc8fd
YCH
4340
4341
f48409c7
YCH
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout `[0, 2] + padding + [0] + data`, where every
    padding byte is a random non-zero value. Non-zero padding is required
    because the first zero byte after the `[0, 2]` header marks where the
    padding ends and the data starts.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data leaves room for fewer than 8 padding bytes
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # 3 bytes are taken by the [0, 2] header and the [0] separator
    padding_length = length - len(data) - 3
    pseudo_random = [random.randint(1, 255) for _ in range(padding_length)]
    return [0, 2] + pseudo_random + [0] + list(data)
4355
4356
7b2c3f47 4357def _base_n_table(n, table):
4358 if not table and not n:
4359 raise ValueError('Either table or n must be specified')
612f2be5 4360 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4361
44f14eb4 4362 if n and n != len(table):
612f2be5 4363 raise ValueError(f'base {n} exceeds table length {len(table)}')
4364 return table
59f898b7 4365
5eb6bdce 4366
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string

    @param num      non-negative integer to encode
    @param n        base; the first n symbols of the table are used
    @param table    digit symbols; a default alphabet is used when omitted
    @returns        the encoded string (the first table symbol for 0)
    @raises ValueError  if num is negative, if a non-zero num is encoded with
                        fewer than two symbols, or if n/table are rejected by
                        _base_n_table
    """
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    # Floor division never brings a negative number to 0, and dividing by a
    # base below 2 never shrinks the number: both would loop forever below
    if num < 0:
        raise ValueError('Cannot encode a negative number')
    if base < 2:
        raise ValueError('At least two symbols are needed to encode a non-zero number')

    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4378
4379
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int

    Each character of `string` is looked up in the digit table obtained from
    _base_n_table(); a character outside the table raises KeyError.
    """
    digit_values = {}
    for position, symbol in enumerate(_base_n_table(n, table)):
        digit_values[symbol] = position

    radix = len(digit_values)
    value = 0
    for symbol in string:
        value = value * radix + digit_values[symbol]
    return value
4387
4388
def decode_packed_codes(code):
    """Expand the packed script matched by PACKED_CODES_RE inside `code`.

    The match provides the obfuscated text, a base, a symbol count and a
    '|'-separated symbol list. Every word of the obfuscated text is the base-n
    encoding of an index into that list and is replaced by the symbol at that
    index (or kept as is when the symbol is empty).
    """
    match = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = match.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for index in range(count - 1, -1, -1):
        encoded_index = encode_base_n(index, base)
        symbol_table[encoded_index] = symbols[index] or encoded_index

    def substitute(word_match):
        return symbol_table[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', substitute, obfuscated_code)
e154c651 4405
4406
1ced2221
S
def caesar(s, alphabet, shift):
    """Shift every character of `s` by `shift` positions within `alphabet`.

    The shift wraps around the end of the alphabet; characters that are not
    part of the alphabet are kept unchanged. With a shift of 0, `s` itself is
    returned.
    """
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(rotate, s))
4414
4415
def rot47(s):
    """Apply a Caesar shift of 47 over the characters '!' to '~' to `s`."""
    rot47_alphabet = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, rot47_alphabet, 47)
4418
4419
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated `KEY=value` attribute list into a dict.

    Keys consist of uppercase letters, digits and '-'. Values are either
    double-quoted (the quotes are stripped) or run up to the next comma.
    """
    attribute_re = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    return {
        match.group('key'): (
            match.group('val')[1:-1] if match.group('val').startswith('"')
            else match.group('val'))
        for match in re.finditer(attribute_re, attrib)
    }
1143535d
YCH
4427
4428
def urshift(val, n):
    """Shift `val` right by `n` bits, treating a negative `val` as unsigned 32-bit."""
    if val < 0:
        # Reinterpret the negative number as its 32-bit two's complement value
        val += 0x100000000
    return val >> n
d3f8e038
YCH
4431
4432
def write_xattr(path, key, value):
    """Set the extended attribute `key` of the file at `path` to `value`.

    `value` is a bytes object: it is written in binary mode on Windows and
    decoded to text before being handed to a command line tool.

    The first applicable method is used:
      1. Windows: an NTFS Alternate Data Stream named '<path>:<key>'
      2. os.setxattr, or the imported `xattr` module
      3. the `setfattr` or `xattr` executables

    Raises XAttrMetadataError when writing fails and XAttrUnavailableError
    when no method is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        # ':' separates the file name from the stream name
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use os.setxattr/xattrs/pyxattrs modules

    setxattr = None
    if callable(getattr(os, 'setxattr', None)):
        setxattr = os.setxattr
    elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        # (with an older pyxattr, setxattr stays None and Method 2 is tried)
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the "xattr" or "pyxattr" Python modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # Command line arguments are passed as text
    value = value.decode()
    try:
        # The two tools take different flags: `xattr -w key value path`
        # versus `setfattr -n key -v value path`
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        # A non-zero exit status is reported together with the tool's stderr
        raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
4484
4485
def random_birthday(year_field, month_field, day_field):
    """Pick a random date between 1950-01-01 and 1995-12-31 (both inclusive).

    Returns a dict that maps the three given field names to the year, month
    and day of that date, each as a string without zero padding.
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, span_days))

    field_names = (year_field, month_field, day_field)
    parts = (birthday.year, birthday.month, birthday.day)
    return dict(zip(field_names, map(str, parts)))
732044af 4496
c76eb41b 4497
8c53322c
L
def find_available_port(interface=''):
    """Return a port number chosen by the OS for `interface`, or None on failure.

    A socket is bound to port 0 and closed again; only the port number it was
    assigned is returned. Any OSError along the way yields None.
    """
    with contextlib.suppress(OSError):
        with socket.socket() as sock:
            sock.bind((interface, 0))
            _, port, *_ = sock.getsockname()
            return port
    return None
4505
4506
# Templates for internet shortcut files, which are plain text files.
# Each template is a %-format string with named placeholders; the trailing
# backslash after the opening quotes keeps the content free of a leading newline.

# INI-style '[InternetShortcut]' file; placeholder: url
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# XML property list (Apple PLIST DTD) with a single URL entry; placeholder: url
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# '[Desktop Entry]' file of type Link; placeholders: filename, url
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Lookup of the templates above by a short type name
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4538
732044af 4539
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    The network location is rebuilt from its parts: user name and password are percent-escaped, the host name is IDNA-encoded, and the port is appended unless it is absent or equal to 80.

    Raises ValueError if the network location contains a `[` (IPv6 literals are not supported).
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        # An empty password is kept (as a bare ':'); only a missing one is left out
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # NOTE(review): `hostname` is None when the IRI has no network location, which would
    # raise AttributeError here - confirm that callers only pass IRIs with a host
    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    # NOTE(review): port 80 is dropped whatever the scheme is - confirm this is intended for non-HTTP schemes
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4582
4583
def to_high_limit_path(path):
    """Return `path` in a form that is not subject to the MAX_PATH limit on Windows.

    On 'win32' and 'cygwin' the absolute path is prefixed with '\\\\?\\';
    on every other platform `path` is returned unchanged.
    """
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
76d321f6 4590
c76eb41b 4591
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Look up `field` in `obj` and format the value with `template`.

    The value is fetched with traversal.traverse_obj(). `default` is returned
    instead when the value is falsy (if `ignore` is not given) or when it is
    one of the `ignore` values. Otherwise the value is passed through `func`
    and %-formatted into `template`.
    """
    val = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skip = not val
    else:
        skip = val in variadic(ignore)

    if skip:
        return default
    return template % func(val)
00dd0cd5 4597
4598
def clean_podcast_url(url):
    """Remove known podcast tracking/analytics redirect prefixes from `url`.

    Every occurrence of a listed tracking host (with its path prefix and
    trailing slash) is deleted. If that leaves two schemes in a row, e.g.
    'https://http://', only the second one is kept.
    """
    tracker_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/'''
    without_trackers = re.sub(tracker_re, '', url)
    return re.sub(r'^\w+://(\w+://)', r'\1', without_trackers)
ffcb8191
THD
4619
4620
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random version 4 UUID as a lowercase, hyphenated string.

    In the template, every `x` becomes a random hex digit. `y` holds the UUID
    variant and is therefore limited to 8, 9, a or b; the version digit `4` is
    fixed.
    """
    def random_digit(placeholder):
        if placeholder.group() == 'y':
            # The two most significant bits of this digit must be `10`
            return '89ab'[random.randint(0, 3)]
        return _HEX_TABLE[random.randint(0, 15)]

    return re.sub(r'[xy]', random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 4626
4627
def make_dir(path, to_screen=None):
    """Create the parent directory of `path`, including missing ancestors.

    @param path         path of a file whose directory should exist
    @param to_screen    optional callable that receives an error message
                        when the directory cannot be created
    @returns            True on success (or when `path` has no directory
                        part), False if creating the directory failed
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Reporting is optional: without a callable, fail silently
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
f74980cb 4638
4639
def get_executable_path():
    """Return the absolute path of the directory that contains the executable.

    The executable path is the second item returned by
    update._get_variant_and_executable_path().
    """
    from ..update import _get_variant_and_executable_path

    executable = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(executable))
f74980cb 4644
4645
def get_user_config_dirs(package_name):
    """Yield the per-user configuration directories for `package_name`.

    In order: the XDG config directory, the %APPDATA% directory (only when
    the `appdata` environment variable is set) and a dot-directory in the
    user's home.
    """
    # .config (e.g. ~/.config/package_name)
    config_home = os.getenv('XDG_CONFIG_HOME')
    if not config_home:
        config_home = compat_expanduser('~/.config')
    yield os.path.join(config_home, package_name)

    # appdata (%APPDATA%/package_name)
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # home (~/.package_name)
    home = compat_expanduser('~')
    yield os.path.join(home, f'.{package_name}')
8e40b9d1
M
4658
4659
def get_system_config_dirs(package_name):
    """Yield the candidate system-wide configuration directories for package_name"""
    # /etc/package_name
    system_dir = os.path.join('/etc', package_name)
    yield system_dir
06167fbb 4663
4664
def time_seconds(**kwargs):
    """
    Returns the current time in seconds since the epoch (1970-01-01T00:00:00Z),
    shifted by the offset described by kwargs (passed to datetime.timedelta)
    """
    now = time.time()
    offset = datetime.timedelta(**kwargs)
    return now + offset.total_seconds()
3e9b66d7
LNO
4670
4671
49fa4d9a
N
# Create a JSON Web Signature (JWS) with the HS256 algorithm.
# The result uses the JWS Compact Serialization layout (header.payload.signature)
# JWT: https://www.rfc-editor.org/rfc/rfc7519.html
# JWS: https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """
    Encode payload_data as a HS256-signed token and return it as bytes

    headers, if given, is merged over the default {"alg", "typ"} header.
    NB: Each segment is encoded with the standard (padded) base64 alphabet
    """
    def _b64_json(obj):
        return base64.b64encode(json.dumps(obj).encode())

    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    header_data.update(headers or ())

    signing_input = b'.'.join((_b64_json(header_data), _b64_json(payload_data)))
    digest = hmac.new(key.encode(), signing_input, hashlib.sha256).digest()
    return signing_input + b'.' + base64.b64encode(digest)
819e0531 4689
4690
# Can be extended in future to verify the signature, parse the header
# and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded JSON payload of a 3-part token. The signature is NOT verified"""
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # Re-add trailing ='s that may have been stripped; superfluous ='s are ignored
    padded_payload = f'{payload_b64}==='
    return json.loads(base64.urlsafe_b64decode(padded_payload))
4697
4698
# False on Windows until windows_enable_vt_mode() succeeds and sets it to True;
# None on every other OS
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
4700
4701
@functools.cache
def supports_terminal_sequences(stream):
    """
    Whether terminal escape sequences may be written to stream

    On Windows this requires WINDOWS_VT_MODE to be set; elsewhere it requires
    the TERM environment variable. In both cases the stream must be a tty.
    The result is cached per stream
    """
    on_windows = compat_os_name == 'nt'
    if on_windows and not WINDOWS_VT_MODE:
        return False
    if not on_windows and not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
4713
4714
def windows_enable_vt_mode():
    """
    Enable virtual terminal processing on the Windows console output

    Does nothing when get_windows_version() reports a version older than (10, 0, 10586).
    On success, sets the module-level WINDOWS_VT_MODE to True and clears the
    cache of supports_terminal_sequences.

    @raises Exception   If GetConsoleMode or SetConsoleMode reports failure

    Ref: https://bugs.python.org/issue30075
    """
    if get_windows_version() < (10, 0, 10586):
        return

    # Imported lazily; NOTE(review): presumably because these are only usable on Windows
    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Keep the flags that are already set and add the VT processing flag
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        # The descriptor is closed whether or not the mode change succeeded
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # Results cached while WINDOWS_VT_MODE was still False are now stale
    supports_terminal_sequences.cache_clear()
4745
53973b4d 4746
# Matches "ESC [ ... m" sequences
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return string with every match of _terminal_sequences_re removed"""
    return re.sub(_terminal_sequences_re, '', string)
4752
4753
def number_of_digits(number):
    """Return the length of number formatted with "%d" (this includes any minus sign)"""
    formatted = '%d' % number
    return len(formatted)
34921b43 4756
4757
def join_nonempty(*values, delim='-', from_dict=None):
    """
    Join the str() of all truthy values with delim

    If from_dict is given, each value is instead used as a path to look up in it
    """
    if from_dict is not None:
        values = map(lambda path: traversal.traverse_obj(from_dict, variadic(path)), values)
    return delim.join(str(item) for item in values if item)
06e57990 4762
4763
27231526
ZM
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Replace whatever url_width_re matches with that width
    * Update dimensions

    If no format has a width, the thumbnails are returned unchanged.

    This function is useful with video services that scale the provided thumbnails on demand
    """
    dimension_keys = ('width', 'height')
    # Tuples compare by width first, so this is the widest format
    largest = max(
        ((fmt.get('width') or 0, fmt.get('height') or 0) for fmt in formats),
        default=(0, 0))
    max_width = largest[0]
    if not max_width:
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        rescaled_url = re.sub(url_width_re, str(max_width), thumbnail['url'])
        scaled.append(merge_dicts(
            {'url': rescaled_url}, dict(zip(dimension_keys, largest)), thumbnail))
    return scaled
4784
4785
93c8410d
LNO
def parse_http_range(range):
    """
    Parse value of "Range" or "Content-Range" HTTP header into a (start, end, total) tuple.
    Missing parts are None; an empty or unparsable value gives (None, None, None)
    """
    mobj = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) if range else None
    if not mobj:
        return None, None, None
    start, end, total = mobj.groups()
    return int(start), int_or_none(end), int_or_none(total)
4794
4795
def read_stdin(what):
    """
    Return sys.stdin, first printing a prompt that names what is being read
    (skipped if what is empty)
    """
    if not what:
        return sys.stdin
    eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
4801
4802
a904a7f8
L
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)
    """

    # BOM marks are given priority over declarations
    bom_match = next(((enc, len(bom)) for bom, enc in BOMS if data.startswith(bom)), None)
    if bom_match is not None:
        return bom_match

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data.replace(b'\0', b''))
    if not declaration:
        return None, 0
    return declaration.group(1).decode(), 0
a904a7f8
L
4819
4820
class Config:
    """
    A set of option arguments, optionally read from a file, plus the nested
    configs pulled in through its "config_locations" option

    own_args        Arguments given to this config (before parsing)
    parsed_args     Set to own_args once they have been parsed
    filename        File the arguments were read from, if any
    configs         Child Config objects loaded via config_locations
    """
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Set the arguments/filename of this config and load it. May only be called once"""
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """
        Parse own_args and recursively load all config locations they reference

        @returns    False if this config's file was already loaded, True otherwise
        """
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # stdin can only be consumed once
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against the directory of this config's file
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """
        Read filename and split its contents into a list of arguments

        @param default  Returned as-is if the file cannot be opened.
                        NB: The default list is shared between calls; do not mutate it
        @raises ValueError  If the file cannot be decoded or parsed
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        # The context manager closes the file on every exit path,
        # including unexpected errors during encoding detection
        with optionf:
            try:
                enc, skip = determine_file_encoding(optionf.read(512))
                optionf.seek(skip, io.SEEK_SET)
            except OSError:
                enc = None  # silently skip read errors
            try:
                # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
                contents = optionf.read().decode(enc or preferredencoding())
                return shlex.split(contents, comments=True)
            except Exception as err:
                raise ValueError(f'Unable to parse "{filename}": {err}') from err

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of opts in which the values of credential options are replaced by "PRIVATE" """
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            # Handles the "--option=value" form
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # Handles the "--option value" form
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child config from args (see init) and keep it if it was actually loaded"""
        config = type(self)(self.parser, label)
        # Share the set of loaded paths so that no file is loaded twice
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """All arguments: those of the child configs (last child first), then this config's own"""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
da42679b
LNO
4928
4929
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            # Title-casing the name is what makes the merge case insensitive
            merged[name.title()] = value
    return merged
28787f16 4933
4934
def cached_method(f):
    """
    Cache the results of a method per instance

    Results are keyed by the bound argument values (with defaults applied,
    excluding self) and stored in the instance's own attribute dict
    """
    sig = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        call = sig.bind(self, *args, **kwargs)
        call.apply_defaults()
        _, *arg_values = call.arguments.values()
        key = tuple(arg_values)

        per_instance = vars(self).setdefault('_cached_method__cache', {})
        results = per_instance.setdefault(f.__name__, {})
        if key not in results:
            results[key] = f(self, *args, **kwargs)
        return results[key]
    return wrapper
4950
4951
class classproperty:
    """
    Property access for class methods with optional caching

    Usable both as "@classproperty" and as "@classproperty(cache=True)".
    With caching, the value is computed once per accessing class
    """

    def __new__(cls, func=None, *args, **kwargs):
        if func:
            return super().__new__(cls)
        # Used with arguments only: defer construction until the function is supplied
        return functools.partial(cls, *args, **kwargs)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        cache = self._cache
        if cache is None:
            return self.func(cls)
        if cls not in cache:
            cache[cls] = self.func(cls)
        return cache[cls]
19a03940 4970
4971
class function_with_repr:
    """
    Callable wrapper around func with a customizable repr()

    If no (truthy) repr_ is given, the repr is "<module>.<qualname>" of func
    """

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func, self.__repr = func, repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        custom = self.__repr
        if not custom:
            return f'{self.func.__module__}.{self.func.__qualname__}'
        return custom
4984
4985
class Namespace(types.SimpleNamespace):
    """SimpleNamespace that iterates over its values; (name, value) pairs are available as items_"""

    def __iter__(self):
        return iter(vars(self).values())

    @property
    def items_(self):
        return vars(self).items()
9b8ee23b 4995
4996
# File extensions grouped by kind of media
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# "video"/"audio" are extended to also contain the corresponding "common_*" extensions
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions (thumbnails, storyboards and subtitles are not included)
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5011
5012
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue

    Iteration stops once an attempt finishes without setting an error,
    or when the number of attempts exceeds the allowed retries.
    After every failed attempt, the error callback is called with
    (error, attempt, retries) and the keyword arguments given at construction
    """
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # NO_DEFAULT marks an attempt that has not reported any error
        if self._error is NO_DEFAULT:
            return False
        return self.attempt <= self.retries

    @property
    def error(self):
        return None if self._error is NO_DEFAULT else self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            failure = self.error
            if failure:
                self.error_callback(failure, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries"""
        if count > retries:
            # Out of retries: report through error() if given, else re-raise
            if not error:
                raise e
            if count > 1:
                return error(f'{e}. Giving up after {count - 1} retries')
            return error(str(e))

        if not count:
            return warn(e)
        if isinstance(e, ExtractorError):
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        if callable(sleep_func):
            delay = float_or_none(sleep_func(n=count - 1))
        else:
            delay = sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5067
5068
def make_archive_id(ie, video_id):
    """
    Build the "<extractor key> <video id>" archive entry, with the key lowercased

    ie may be either the key itself (str) or an object providing ie_key()
    """
    if not isinstance(ie, str):
        ie = ie.ie_key()
    return f'{ie.lower()} {video_id}'
5072
5073
def truncate_string(s, left, right=0):
    """
    Shorten s to left characters (the last 3 of which are "...") followed by its last right characters.
    s is returned unchanged if it is None or no longer than left + right
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    head = s[:left - 3]
    tail = s[-right:] if right else ''
    return f'{head}...{tail}'
a1c5bd82 5079
5080
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """
    Resolve a list of options into an ordered set of the items they select

    @param options      Item names or alias names; a leading "-" removes instead of adds
    @param alias_dict   Maps alias names to lists of options. Must contain "all",
                        which is also the list of valid item names
    @param use_regex    Match options against the "all" items as case-insensitive
                        full-match regexes (not applied to alias expansions)
    @param start        Items that are selected initially
    @raises ValueError  If an option is neither an alias nor a valid item (non-regex mode)
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for option in options:
        discard = option.startswith('-')
        name = option[1:] if discard else option

        if name in alias_dict:
            expansion = alias_dict[name]
            if discard:
                # Discarding an alias inverts the sign of each of its members
                expansion = [i[1:] if i.startswith('-') else f'-{i}' for i in expansion]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(expansion, alias_dict, start=requested)
            continue

        if use_regex:
            current = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            current = [name]
        else:
            raise ValueError(name)

        if not discard:
            requested.extend(current)
            continue
        for item in current:
            while item in requested:
                requested.remove(item)

    return orderedSet(requested)
5109
5110
# TODO: Rewrite
class FormatSorter:
    """
    Computes sort keys for format dicts from an ordered list of sort fields

    The field order is built from the forced/priority defaults, the user's
    "format_sort" param, the extractor's preference and finally "default".
    calculate_preference(format) returns one key tuple per format.

    NOTE(review): "settings" is a class attribute that the methods below mutate
    in place (defaults are filled in lazily and per-field limits are stored in it),
    so this state appears to be shared between instances - confirm before relying on it
    """

    # Syntax of a single sort item: [+]field[(:|~)limit]
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    # Per-field configuration. Keys that are missing here get their
    # defaults filled in by _get_field_setting
    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ydl, field_preference):
        """
        @param ydl                  Object providing "params", "write_debug" and "deprecated_feature"
        @param field_preference     Sort items given by the extractor
        """
        self.ydl = ydl
        self._order = []
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """
        Return the setting "key" of "field", computing and storing its default if it is missing

        Unknown fields are registered with empty settings (with a deprecation notice),
        except that "forced"/"priority" lookups on them just return False
        """
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            type = propObj.get('type')
            if key == 'field':
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            # The computed default is cached in the settings themselves
            propObj[key] = default
        return propObj[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """
        Convert a raw (limit or format) value according to the "convert" setting of field

        A value of None is returned as None unless convertNone is set
        """
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            # The result is the distance from the end of the order list, so earlier entries rank higher
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            # Position of the '' entry, which is used for values that are not in the list
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            if value.isnumeric():
                return float(value)
            else:
                # A non-numeric value makes the field be treated as a string from now on
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """
        Build the final field order (self._order) and store each field's
        reverse/closest/limit data in the settings

        @raises ExtractorError  If a sort item does not match the expected syntax
        """
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            field = field.lower()
            # The first occurrence of a field wins; later ones are ignored
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        # Forced fields first, then priority fields (unless "format_sort_force"),
        # then user and extractor preferences, then the defaults
        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
                else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            # A "combined" field expands to its member fields, each of which
            # may get its own ":"-separated limit
            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Report the user/extractor sort orders and the final order of visible fields via write_debug"""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Turn the value of one field into its sort key tuple, honoring reverse/closest/limit"""
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        """Read the value of field from format (combining several keys for "multiple" fields) and convert it to a sort key"""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """
        Return the sort key of format: a tuple with one entry per field in the sort order

        NB: Missing "protocol", "ext", "video_ext"/"audio_ext", "preference" and
        bitrate values are filled into the format dict itself
        """
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        # Each missing bitrate is derived from the other two where possible (tbr = vbr + abr)
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
1b392f90 5406
5407
# XXX: Temporary
class _YDLLogger:
    """Logger that forwards every message to the given ydl object, or drops it if there is none"""

    def __init__(self, ydl=None):
        self._ydl = ydl

    def debug(self, message):
        if not self._ydl:
            return
        self._ydl.write_debug(message)

    def info(self, message):
        if not self._ydl:
            return
        self._ydl.to_screen(message)

    def warning(self, message, *, once=False):
        if not self._ydl:
            return
        self._ydl.report_warning(message, once)

    def error(self, message, *, is_error=True):
        if not self._ydl:
            return
        self._ydl.report_error(message, is_error=is_error)

    def stdout(self, message):
        if not self._ydl:
            return
        self._ydl.to_stdout(message)

    def stderr(self, message):
        if not self._ydl:
            return
        self._ydl.to_stderr(message)
5435 self._ydl.to_stderr(message)