import base64
import binascii
import calendar
import codecs
import collections
import collections.abc
import contextlib
import datetime
import email.header
import email.utils
import errno
import hashlib
import hmac
import html.entities
import html.parser
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import netrc
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import unicodedata
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree

from . import traversal

from ..compat import functools  # isort: split
from ..compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from ..dependencies import xattr

__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


class NO_DEFAULT:
    pass


def IDENTITY(x):
    return x


ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
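
# Illustrative usage (added example, not part of the upstream source):
#   xpath_with_ns('ns:item/ns:title', {'ns': 'http://example.com/ns'})
#   -> '{http://example.com/ns}item/{http://example.com/ns}title'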


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    if not value:
        return

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')


class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        raise compat_HTMLParseError('done')


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()


class LenientJSONDecoder(json.JSONDecoder):
    # TODO: Write tests
    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    s = self._close_object(e)
                    if s is not None:
                        continue
                raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
        assert False, 'Too many attempts to decode JSON'
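
# Illustrative usage (added example, not part of the upstream source):
#   LenientJSONDecoder(ignore_extra=True).decode('{"a": 1} trailing garbage')
#   -> {'a': 1}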


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    # XXX: this handles drive relative paths (c:sth) incorrectly
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    # TODO: Fix behavioral differences <3.12
    # The workaround using `normpath` only superficially passes tests
    # Ref: https://github.com/python/cpython/pull/100351
    return os.path.normpath(os.path.join(*sanitized_path))


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url


def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
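
# Illustrative usage (added example, not part of the upstream source):
#   extract_basic_auth('http://user:pass@example.com/x')
#   -> ('http://example.com/x', 'Basic dXNlcjpwYXNz')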


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


class netrc_from_content(netrc.netrc):
    def __init__(self, content):
        self.hosts, self.macros = {}, {}
        with io.StringIO(content) as stream:
            self._parse('-', stream, False)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')

        if shell and compat_os_name == 'nt' and kwargs.get('executable') is None:
            if not isinstance(args, str):
                args = ' '.join(compat_shlex_quote(a) for a in args)
            shell = False
            args = f'{self.__comspec()} /Q /S /D /V:OFF /C "{args}"'

        super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo)

    def __comspec(self):
        comspec = os.environ.get('ComSpec') or os.path.join(
            os.environ.get('SystemRoot', ''), 'System32', 'cmd.exe')
        if os.path.isabs(comspec):
            return comspec
        raise FileNotFoundError('shell not found: neither %ComSpec% nor %SystemRoot% is set')

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
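
# Illustrative usage (added examples, not part of the upstream source):
#   formatSeconds(3905)           -> '1:05:05'
#   formatSeconds(65, msec=True)  -> '1:05.000'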


def bug_reports_message(before=';'):
    from ..update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        from ..networking.exceptions import network_exceptions
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def is_path_like(f):
    return isinstance(f, (str, bytes, os.PathLike))


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                           # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                         # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))  # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                          # optional space
                (?P<sign>\+|-)                                # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})    # hh[:]mm
            $)
        ''', date_str)
    if not m:
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            date_str = date_str[:-len(m.group('tz'))]
        timezone = datetime.timedelta(hours=timezone or 0)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
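
# Illustrative usage (added example, not part of the upstream source):
#   parse_iso8601('2023-01-02T03:04:05Z') -> 1672628645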


def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return str(upload_date)


def unified_timestamp(date_str, day_first=True):
    if not isinstance(date_str, str):
        return None

    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()


def determine_ext(url, default_ext='unknown_video'):
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
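
# Illustrative usage (added examples, not part of the upstream source):
#   determine_ext('https://example.com/video.mp4?download=1')  -> 'mp4'
#   determine_ext('https://example.com/stream')                -> 'unknown_video'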


def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)


def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.now(datetime.timezone.utc), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)


def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
        raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()


def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months."""
    month = dt.month + months - 1
    year = dt.year + month // 12
    month = month % 12 + 1
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)
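
# Illustrative usage (added example, not part of the upstream source):
#   datetime_add_months(datetime.datetime(2020, 1, 31), 1) -> datetime.datetime(2020, 2, 29, 0, 0)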


def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    roundto = lambda x, n: ((x + n / 2) // n) * n
    timestamp = roundto(calendar.timegm(dt.timetuple()), unit_seconds[precision])
    return datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str
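
# Illustrative usage (added example, not part of the upstream source):
#   hyphenate_date('20231231') -> '2023-12-31'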


class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start, strict=True)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end, strict=True)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)
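
# Illustrative usage (added example, not part of the upstream source):
#   '20230615' in DateRange('20230101', '20231231')  -> True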
1385
c496ca96 1386
b1f94422 1387@functools.cache
1388def system_identifier():
1389 python_implementation = platform.python_implementation()
1390 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1391 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
dab284f8 1392 libc_ver = []
1393 with contextlib.suppress(OSError): # We may not have access to the executable
1394 libc_ver = platform.libc_ver()
b1f94422 1395
17fc3dc4 1396 return 'Python %s (%s %s %s) - %s (%s%s)' % (
b1f94422 1397 platform.python_version(),
1398 python_implementation,
17fc3dc4 1399 platform.machine(),
b1f94422 1400 platform.architecture()[0],
1401 platform.platform(),
5b9f253f
M
1402 ssl.OPENSSL_VERSION,
1403 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
b1f94422 1404 )
c257baff
PH
1405
1406
0b9c08b4 1407@functools.cache
49fa4d9a 1408def get_windows_version():
8a82af35 1409 ''' Get Windows version. returns () if it's not running on Windows '''
49fa4d9a
N
1410 if compat_os_name == 'nt':
1411 return version_tuple(platform.win32_ver()[1])
1412 else:
8a82af35 1413 return ()
49fa4d9a
N
1414
1415
734f90bb 1416def write_string(s, out=None, encoding=None):
19a03940 1417 assert isinstance(s, str)
1418 out = out or sys.stderr
3b479100
SS
1419 # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
1420 if not out:
1421 return
7459e3a2 1422
fe1daad3 1423 if compat_os_name == 'nt' and supports_terminal_sequences(out):
3fe75fdc 1424 s = re.sub(r'([\r\n]+)', r' \1', s)
59f943cd 1425
8a82af35 1426 enc, buffer = None, out
cfb0511d 1427 if 'b' in getattr(out, 'mode', ''):
c487cf00 1428 enc = encoding or preferredencoding()
104aa738 1429 elif hasattr(out, 'buffer'):
8a82af35 1430 buffer = out.buffer
104aa738 1431 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
c487cf00 1432
8a82af35 1433 buffer.write(s.encode(enc, 'ignore') if enc else s)
7459e3a2
PH
1434 out.flush()
1435
1436
3d2623a8 1437# TODO: Use global logger
da4db748 1438def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
69bec673 1439 from .. import _IN_CLI
da4db748 1440 if _IN_CLI:
1441 if msg in deprecation_warning._cache:
1442 return
1443 deprecation_warning._cache.add(msg)
1444 if printer:
1445 return printer(f'{msg}{bug_reports_message()}', **kwargs)
1446 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
1447 else:
1448 import warnings
1449 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
1450
1451
1452deprecation_warning._cache = set()
1453
1454
48ea9cea
PH
1455def bytes_to_intlist(bs):
1456 if not bs:
1457 return []
1458 if isinstance(bs[0], int): # Python 3
1459 return list(bs)
1460 else:
1461 return [ord(c) for c in bs]
1462
c257baff 1463
cba892fa 1464def intlist_to_bytes(xs):
1465 if not xs:
1466 return b''
ac668111 1467 return struct.pack('%dB' % len(xs), *xs)
c38b1e77
PH
1468
1469
8a82af35 1470class LockingUnsupportedError(OSError):
1890fc63 1471 msg = 'File locking is not supported'
0edb3e33 1472
1473 def __init__(self):
1474 super().__init__(self.msg)
1475
1476
c1c9a79c
PH
1477# Cross-platform file locking
1478if sys.platform == 'win32':
fe0918bb 1479 import ctypes
c1c9a79c
PH
1480 import ctypes.wintypes
1481 import msvcrt
1482
1483 class OVERLAPPED(ctypes.Structure):
1484 _fields_ = [
1485 ('Internal', ctypes.wintypes.LPVOID),
1486 ('InternalHigh', ctypes.wintypes.LPVOID),
1487 ('Offset', ctypes.wintypes.DWORD),
1488 ('OffsetHigh', ctypes.wintypes.DWORD),
1489 ('hEvent', ctypes.wintypes.HANDLE),
1490 ]
1491
37e325b9 1492 kernel32 = ctypes.WinDLL('kernel32')
c1c9a79c
PH
1493 LockFileEx = kernel32.LockFileEx
1494 LockFileEx.argtypes = [
1495 ctypes.wintypes.HANDLE, # hFile
1496 ctypes.wintypes.DWORD, # dwFlags
1497 ctypes.wintypes.DWORD, # dwReserved
1498 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1499 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1500 ctypes.POINTER(OVERLAPPED) # Overlapped
1501 ]
1502 LockFileEx.restype = ctypes.wintypes.BOOL
1503 UnlockFileEx = kernel32.UnlockFileEx
1504 UnlockFileEx.argtypes = [
1505 ctypes.wintypes.HANDLE, # hFile
1506 ctypes.wintypes.DWORD, # dwReserved
1507 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1508 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1509 ctypes.POINTER(OVERLAPPED) # Overlapped
1510 ]
1511 UnlockFileEx.restype = ctypes.wintypes.BOOL
1512 whole_low = 0xffffffff
1513 whole_high = 0x7fffffff
1514
747c0bd1 1515 def _lock_file(f, exclusive, block):
c1c9a79c
PH
1516 overlapped = OVERLAPPED()
1517 overlapped.Offset = 0
1518 overlapped.OffsetHigh = 0
1519 overlapped.hEvent = 0
1520 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
747c0bd1 1521
1522 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
1523 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
1524 0, whole_low, whole_high, f._lock_file_overlapped_p):
2cb19820 1525 # NB: No argument form of "ctypes.FormatError" does not work on PyPy
1526 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
c1c9a79c
PH
1527
1528 def _unlock_file(f):
1529 assert f._lock_file_overlapped_p
1530 handle = msvcrt.get_osfhandle(f.fileno())
747c0bd1 1531 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
c1c9a79c
PH
1532 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1533
1534else:
399a76e6
YCH
1535 try:
1536 import fcntl
c1c9a79c 1537
a3125791 1538 def _lock_file(f, exclusive, block):
b63837bc 1539 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
1540 if not block:
1541 flags |= fcntl.LOCK_NB
acea8d7c 1542 try:
b63837bc 1543 fcntl.flock(f, flags)
acea8d7c
JK
1544 except BlockingIOError:
1545 raise
1546 except OSError: # AOSP does not have flock()
b63837bc 1547 fcntl.lockf(f, flags)
c1c9a79c 1548
399a76e6 1549 def _unlock_file(f):
45998b3e
E
1550 with contextlib.suppress(OSError):
1551 return fcntl.flock(f, fcntl.LOCK_UN)
1552 with contextlib.suppress(OSError):
1553 return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock()
1554 return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking
a3125791 1555
399a76e6 1556 except ImportError:
399a76e6 1557
a3125791 1558 def _lock_file(f, exclusive, block):
0edb3e33 1559 raise LockingUnsupportedError()
399a76e6
YCH
1560
1561 def _unlock_file(f):
0edb3e33 1562 raise LockingUnsupportedError()
c1c9a79c
PH
1563
1564
86e5f3ed 1565class locked_file:
0edb3e33 1566 locked = False
747c0bd1 1567
a3125791 1568 def __init__(self, filename, mode, block=True, encoding=None):
fcfa8853
JK
1569 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
1570 raise NotImplementedError(mode)
1571 self.mode, self.block = mode, block
1572
1573 writable = any(f in mode for f in 'wax+')
1574 readable = any(f in mode for f in 'r+')
1575 flags = functools.reduce(operator.ior, (
1576 getattr(os, 'O_CLOEXEC', 0), # UNIX only
1577 getattr(os, 'O_BINARY', 0), # Windows only
1578 getattr(os, 'O_NOINHERIT', 0), # Windows only
1579 os.O_CREAT if writable else 0, # O_TRUNC only after locking
1580 os.O_APPEND if 'a' in mode else 0,
1581 os.O_EXCL if 'x' in mode else 0,
1582 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
1583 ))
1584
98804d03 1585 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
c1c9a79c
PH
1586
1587 def __enter__(self):
a3125791 1588 exclusive = 'r' not in self.mode
c1c9a79c 1589 try:
a3125791 1590 _lock_file(self.f, exclusive, self.block)
0edb3e33 1591 self.locked = True
86e5f3ed 1592 except OSError:
c1c9a79c
PH
1593 self.f.close()
1594 raise
fcfa8853 1595 if 'w' in self.mode:
131e14dc
JK
1596 try:
1597 self.f.truncate()
1598 except OSError as e:
1890fc63 1599 if e.errno not in (
1600 errno.ESPIPE, # Illegal seek - expected for FIFO
1601 errno.EINVAL, # Invalid argument - expected for /dev/null
1602 ):
1603 raise
c1c9a79c
PH
1604 return self
1605
0edb3e33 1606 def unlock(self):
1607 if not self.locked:
1608 return
c1c9a79c 1609 try:
0edb3e33 1610 _unlock_file(self.f)
c1c9a79c 1611 finally:
0edb3e33 1612 self.locked = False
c1c9a79c 1613
0edb3e33 1614 def __exit__(self, *_):
1615 try:
1616 self.unlock()
1617 finally:
1618 self.f.close()
4eb7f1d1 1619
0edb3e33 1620 open = __enter__
1621 close = __exit__
a3125791 1622
0edb3e33 1623 def __getattr__(self, attr):
1624 return getattr(self.f, attr)
a3125791 1625
0edb3e33 1626 def __iter__(self):
1627 return iter(self.f)
a3125791 1628
4eb7f1d1 1629
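# Illustrative usage sketch ('download-archive.txt' and its contents are hypothetical):
#   with locked_file('download-archive.txt', 'a', block=True) as f:
#       f.write('youtube dQw4w9WgXcQ\n')
# Opening for writing ('a'/'w'/...) takes an exclusive lock, 'r'/'rb' a shared one;
# the lock is released and the file closed when the "with" block exits.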
0b9c08b4 1630@functools.cache
4644ac55
S
1631def get_filesystem_encoding():
1632 encoding = sys.getfilesystemencoding()
1633 return encoding if encoding is not None else 'utf-8'
1634
1635
4eb7f1d1 1636def shell_quote(args):
a6a173c2 1637 quoted_args = []
4644ac55 1638 encoding = get_filesystem_encoding()
a6a173c2
JMF
1639 for a in args:
1640 if isinstance(a, bytes):
1641 # We may get a filename encoded with 'encodeFilename'
1642 a = a.decode(encoding)
aefce8e6 1643 quoted_args.append(compat_shlex_quote(a))
28e614de 1644 return ' '.join(quoted_args)
9d4660ca
PH
1645
1646
1647def smuggle_url(url, data):
1648 """ Pass additional data in a URL for internal use. """
1649
81953d1a
RA
1650 url, idata = unsmuggle_url(url, {})
1651 data.update(idata)
14f25df2 1652 sdata = urllib.parse.urlencode(
28e614de
PH
1653 {'__youtubedl_smuggle': json.dumps(data)})
1654 return url + '#' + sdata
9d4660ca
PH
1655
1656
79f82953 1657def unsmuggle_url(smug_url, default=None):
83e865a3 1658 if '#__youtubedl_smuggle' not in smug_url:
79f82953 1659 return smug_url, default
28e614de 1660 url, _, sdata = smug_url.rpartition('#')
14f25df2 1661 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
1662 data = json.loads(jsond)
1663 return url, data
02dbf93f
PH
1664
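# Illustrative round trip (the URL and payload are hypothetical):
#   >>> url = smuggle_url('https://example.com/video', {'referrer': 'https://example.com/'})
#   >>> unsmuggle_url(url)
#   ('https://example.com/video', {'referrer': 'https://example.com/'})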
1665
e0fd9573 1666def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
1667 """ Formats numbers with decimal sufixes like K, M, etc """
1668 num, factor = float_or_none(num), float(factor)
4c3f8c3f 1669 if num is None or num < 0:
e0fd9573 1670 return None
eeb2a770 1671 POSSIBLE_SUFFIXES = 'kMGTPEZY'
1672 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
1673 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
abbeeebc 1674 if factor == 1024:
1675 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 1676 converted = num / (factor ** exponent)
abbeeebc 1677 return fmt % (converted, suffix)
e0fd9573 1678
1679
02dbf93f 1680def format_bytes(bytes):
f02d24d8 1681 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
f53c966a 1682
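# Illustrative values (outputs follow from the implementations above):
#   >>> format_decimal_suffix(3000, '%.1f%s')
#   '3.0k'
#   >>> format_bytes(3000000)   # factor=1024, so binary (Ki/Mi/...) suffixes
#   '2.86MiB'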
1c088fa8 1683
64c464a1 1684def lookup_unit_table(unit_table, s, strict=False):
1685 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
fb47597b 1686 units_re = '|'.join(re.escape(u) for u in unit_table)
64c464a1 1687 m = (re.fullmatch if strict else re.match)(
1688 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
fb47597b
S
1689 if not m:
1690 return None
64c464a1 1691
1692 num = float(m.group('num').replace(',', '.'))
fb47597b 1693 mult = unit_table[m.group('unit')]
64c464a1 1694 return round(num * mult)
1695
1696
1697def parse_bytes(s):
1698 """Parse a string indicating a byte quantity into an integer"""
1699 return lookup_unit_table(
1700 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
1701 s.upper(), strict=True)
fb47597b
S
1702
1703
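# Illustrative values (units are single letters, treated as powers of 1024):
#   >>> parse_bytes('500k')
#   512000
#   >>> parse_bytes('4.2M')
#   4404019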
be64b5b0
PH
1704def parse_filesize(s):
1705 if s is None:
1706 return None
1707
dfb1b146 1708 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
1709 # but we support those too
1710 _UNIT_TABLE = {
1711 'B': 1,
1712 'b': 1,
70852b47 1713 'bytes': 1,
be64b5b0
PH
1714 'KiB': 1024,
1715 'KB': 1000,
1716 'kB': 1024,
1717 'Kb': 1000,
13585d76 1718 'kb': 1000,
70852b47
YCH
1719 'kilobytes': 1000,
1720 'kibibytes': 1024,
be64b5b0
PH
1721 'MiB': 1024 ** 2,
1722 'MB': 1000 ** 2,
1723 'mB': 1024 ** 2,
1724 'Mb': 1000 ** 2,
13585d76 1725 'mb': 1000 ** 2,
70852b47
YCH
1726 'megabytes': 1000 ** 2,
1727 'mebibytes': 1024 ** 2,
be64b5b0
PH
1728 'GiB': 1024 ** 3,
1729 'GB': 1000 ** 3,
1730 'gB': 1024 ** 3,
1731 'Gb': 1000 ** 3,
13585d76 1732 'gb': 1000 ** 3,
70852b47
YCH
1733 'gigabytes': 1000 ** 3,
1734 'gibibytes': 1024 ** 3,
be64b5b0
PH
1735 'TiB': 1024 ** 4,
1736 'TB': 1000 ** 4,
1737 'tB': 1024 ** 4,
1738 'Tb': 1000 ** 4,
13585d76 1739 'tb': 1000 ** 4,
70852b47
YCH
1740 'terabytes': 1000 ** 4,
1741 'tebibytes': 1024 ** 4,
be64b5b0
PH
1742 'PiB': 1024 ** 5,
1743 'PB': 1000 ** 5,
1744 'pB': 1024 ** 5,
1745 'Pb': 1000 ** 5,
13585d76 1746 'pb': 1000 ** 5,
70852b47
YCH
1747 'petabytes': 1000 ** 5,
1748 'pebibytes': 1024 ** 5,
be64b5b0
PH
1749 'EiB': 1024 ** 6,
1750 'EB': 1000 ** 6,
1751 'eB': 1024 ** 6,
1752 'Eb': 1000 ** 6,
13585d76 1753 'eb': 1000 ** 6,
70852b47
YCH
1754 'exabytes': 1000 ** 6,
1755 'exbibytes': 1024 ** 6,
be64b5b0
PH
1756 'ZiB': 1024 ** 7,
1757 'ZB': 1000 ** 7,
1758 'zB': 1024 ** 7,
1759 'Zb': 1000 ** 7,
13585d76 1760 'zb': 1000 ** 7,
70852b47
YCH
1761 'zettabytes': 1000 ** 7,
1762 'zebibytes': 1024 ** 7,
be64b5b0
PH
1763 'YiB': 1024 ** 8,
1764 'YB': 1000 ** 8,
1765 'yB': 1024 ** 8,
1766 'Yb': 1000 ** 8,
13585d76 1767 'yb': 1000 ** 8,
70852b47
YCH
1768 'yottabytes': 1000 ** 8,
1769 'yobibytes': 1024 ** 8,
be64b5b0
PH
1770 }
1771
fb47597b
S
1772 return lookup_unit_table(_UNIT_TABLE, s)
1773
1774
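# Illustrative values ('KB' is decimal, 'KiB' binary, per the table above):
#   >>> parse_filesize('1.5 GiB')
#   1610612736
#   >>> parse_filesize('123 KB')
#   123000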
1775def parse_count(s):
1776 if s is None:
be64b5b0
PH
1777 return None
1778
352d5da8 1779 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
1780
1781 if re.match(r'^[\d,.]+$', s):
1782 return str_to_int(s)
1783
1784 _UNIT_TABLE = {
1785 'k': 1000,
1786 'K': 1000,
1787 'm': 1000 ** 2,
1788 'M': 1000 ** 2,
1789 'kk': 1000 ** 2,
1790 'KK': 1000 ** 2,
352d5da8 1791 'b': 1000 ** 3,
1792 'B': 1000 ** 3,
fb47597b 1793 }
be64b5b0 1794
352d5da8 1795 ret = lookup_unit_table(_UNIT_TABLE, s)
1796 if ret is not None:
1797 return ret
1798
1799 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
1800 if mobj:
1801 return str_to_int(mobj.group(1))
be64b5b0 1802
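# Illustrative values (the strings are made-up view/like counters):
#   >>> parse_count('1.2M views')
#   1200000
#   >>> parse_count('1,234 likes')
#   1234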
2f7ae819 1803
5d45484c 1804def parse_resolution(s, *, lenient=False):
b871d7e9
S
1805 if s is None:
1806 return {}
1807
5d45484c
LNO
1808 if lenient:
1809 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
1810 else:
1811 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
1812 if mobj:
1813 return {
1814 'width': int(mobj.group('w')),
1815 'height': int(mobj.group('h')),
1816 }
1817
17ec8bcf 1818 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
1819 if mobj:
1820 return {'height': int(mobj.group(1))}
1821
1822 mobj = re.search(r'\b([48])[kK]\b', s)
1823 if mobj:
1824 return {'height': int(mobj.group(1)) * 540}
1825
1826 return {}
1827
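# Illustrative values:
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('720p')
#   {'height': 720}
#   >>> parse_resolution('4K')
#   {'height': 2160}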
1828
0dc41787 1829def parse_bitrate(s):
14f25df2 1830 if not isinstance(s, str):
0dc41787
S
1831 return
1832 mobj = re.search(r'\b(\d+)\s*kbps', s)
1833 if mobj:
1834 return int(mobj.group(1))
1835
1836
a942d6cb 1837def month_by_name(name, lang='en'):
caefb1de
PH
1838 """ Return the number of a month by (locale-independently) English name """
1839
f6717dec 1840 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 1841
caefb1de 1842 try:
f6717dec 1843 return month_names.index(name) + 1
7105440c
YCH
1844 except ValueError:
1845 return None
1846
1847
1848def month_by_abbreviation(abbrev):
1849 """ Return the number of a month by (locale-independently) English
1850 abbreviations """
1851
1852 try:
1853 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
1854 except ValueError:
1855 return None
18258362
JMF
1856
1857
5aafe895 1858def fix_xml_ampersands(xml_str):
18258362 1859 """Replace all '&' with '&amp;' in XML"""
5aafe895
PH
1860 return re.sub(
1861 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 1862 '&amp;',
5aafe895 1863 xml_str)
e3946f98
PH
1864
1865
1866def setproctitle(title):
14f25df2 1867 assert isinstance(title, str)
c1c05c67 1868
fe0918bb 1869 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
1870 try:
1871 import ctypes
1872 except ImportError:
c1c05c67
YCH
1873 return
1874
e3946f98 1875 try:
611c1dd9 1876 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
1877 except OSError:
1878 return
2f49bcd6
RC
1879 except TypeError:
1880 # LoadLibrary in Windows Python 2.7.13 only expects
1881 # a bytestring, but since unicode_literals turns
1882 # every string into a unicode string, it fails.
1883 return
0f06bcd7 1884 title_bytes = title.encode()
6eefe533
PH
1885 buf = ctypes.create_string_buffer(len(title_bytes))
1886 buf.value = title_bytes
e3946f98 1887 try:
6eefe533 1888 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
1889 except AttributeError:
1890 return # Strange libc, just skip this
d7dda168
PH
1891
1892
1893def remove_start(s, start):
46bc9b7d 1894 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
1895
1896
2b9faf55 1897def remove_end(s, end):
46bc9b7d 1898 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
1899
1900
31b2051e
S
1901def remove_quotes(s):
1902 if s is None or len(s) < 2:
1903 return s
1904 for quote in ('"', "'", ):
1905 if s[0] == quote and s[-1] == quote:
1906 return s[1:-1]
1907 return s
1908
1909
b6e0c7d2 1910def get_domain(url):
ebf99aaf 1911 """
1912 This implementation is inconsistent, but is kept for compatibility.
1913 Use this only for "webpage_url_domain"
1914 """
1915 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
b6e0c7d2
U
1916
1917
29eb5174 1918def url_basename(url):
14f25df2 1919 path = urllib.parse.urlparse(url).path
28e614de 1920 return path.strip('/').split('/')[-1]
aa94a6d3
PH
1921
1922
02dc0a36 1923def base_url(url):
7657ec7e 1924 return re.match(r'https?://[^?#]+/', url).group()
02dc0a36
S
1925
1926
e34c3361 1927def urljoin(base, path):
4b5de77b 1928 if isinstance(path, bytes):
0f06bcd7 1929 path = path.decode()
14f25df2 1930 if not isinstance(path, str) or not path:
e34c3361 1931 return None
fad4ceb5 1932 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 1933 return path
4b5de77b 1934 if isinstance(base, bytes):
0f06bcd7 1935 base = base.decode()
14f25df2 1936 if not isinstance(base, str) or not re.match(
4b5de77b 1937 r'^(?:https?:)?//', base):
e34c3361 1938 return None
14f25df2 1939 return urllib.parse.urljoin(base, path)
e34c3361
S
1940
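# Illustrative values (URLs are hypothetical):
#   >>> urljoin('https://example.com/a/', 'b/c.mp4')
#   'https://example.com/a/b/c.mp4'
#   >>> urljoin('https://example.com/a/', '//cdn.example.com/x.m3u8')  # already protocol-relative
#   '//cdn.example.com/x.m3u8'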
1941
9732d77e 1942def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
9e907ebd 1943 if get_attr and v is not None:
1944 v = getattr(v, get_attr, None)
1812afb7
S
1945 try:
1946 return int(v) * invscale // scale
31c49255 1947 except (ValueError, TypeError, OverflowError):
af98f8ff 1948 return default
9732d77e 1949
9572013d 1950
40a90862 1951def str_or_none(v, default=None):
14f25df2 1952 return default if v is None else str(v)
40a90862 1953
9732d77e
PH
1954
1955def str_to_int(int_str):
48d4681e 1956 """ A more relaxed version of int_or_none """
f9934b96 1957 if isinstance(int_str, int):
348c6bf1 1958 return int_str
14f25df2 1959 elif isinstance(int_str, str):
42db58ec
S
1960 int_str = re.sub(r'[,\.\+]', '', int_str)
1961 return int_or_none(int_str)
608d11f5
PH
1962
1963
9732d77e 1964def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
1965 if v is None:
1966 return default
1967 try:
1968 return float(v) * invscale / scale
5e1271c5 1969 except (ValueError, TypeError):
caf80631 1970 return default
43f775e4
PH
1971
1972
c7e327c4
S
1973def bool_or_none(v, default=None):
1974 return v if isinstance(v, bool) else default
1975
1976
53cd37ba 1977def strip_or_none(v, default=None):
14f25df2 1978 return v.strip() if isinstance(v, str) else default
b72b4431
S
1979
1980
af03000a 1981def url_or_none(url):
14f25df2 1982 if not url or not isinstance(url, str):
af03000a
S
1983 return None
1984 url = url.strip()
29f7c58a 1985 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
1986
1987
ad54c913 1988def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
e29663c6 1989 datetime_object = None
1990 try:
f9934b96 1991 if isinstance(timestamp, (int, float)): # unix timestamp
d509c1f5 1992 # Using naive datetime here can break timestamp() in Windows
1993 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
a35af430 1994 # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
1995 # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
1996 datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
1997 + datetime.timedelta(seconds=timestamp))
14f25df2 1998 elif isinstance(timestamp, str): # assume YYYYMMDD
e29663c6 1999 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
9665f15a 2000 date_format = re.sub( # Support %s on windows
2001 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
e29663c6 2002 return datetime_object.strftime(date_format)
2003 except (ValueError, TypeError, AttributeError):
2004 return default
2005
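# Illustrative values (results assume the implementation above):
#   >>> strftime_or_none(0, '%Y-%m-%d')            # unix timestamp
#   '1970-01-01'
#   >>> strftime_or_none('20230102', '%Y/%m/%d')   # YYYYMMDD string
#   '2023/01/02'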
2006
608d11f5 2007def parse_duration(s):
f9934b96 2008 if not isinstance(s, str):
608d11f5 2009 return None
ca7b3246 2010 s = s.strip()
38d79fd1 2011 if not s:
2012 return None
ca7b3246 2013
acaff495 2014 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2015 m = re.match(r'''(?x)
2016 (?P<before_secs>
2017 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2018 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2019 (?P<ms>[.:][0-9]+)?Z?$
2020 ''', s)
acaff495 2021 if m:
8bd1c00b 2022 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2023 else:
2024 m = re.match(
056653bb
S
2025 r'''(?ix)(?:P?
2026 (?:
1c1b2f96 2027 [0-9]+\s*y(?:ears?)?,?\s*
056653bb
S
2028 )?
2029 (?:
1c1b2f96 2030 [0-9]+\s*m(?:onths?)?,?\s*
056653bb
S
2031 )?
2032 (?:
1c1b2f96 2033 [0-9]+\s*w(?:eeks?)?,?\s*
056653bb 2034 )?
8f4b58d7 2035 (?:
1c1b2f96 2036 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
8f4b58d7 2037 )?
056653bb 2038 T)?
acaff495 2039 (?:
af868732 2040 (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
acaff495 2041 )?
2042 (?:
1c1b2f96 2043 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
acaff495 2044 )?
2045 (?:
2046 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2047 )?Z?$''', s)
acaff495 2048 if m:
2049 days, hours, mins, secs, ms = m.groups()
2050 else:
15846398 2051 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2052 if m:
2053 hours, mins = m.groups()
2054 else:
2055 return None
2056
acaff495 2057 if ms:
19a03940 2058 ms = ms.replace(':', '.')
2059 return sum(float(part or 0) * mult for part, mult in (
2060 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2061
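# Illustrative values (all return seconds as a float):
#   >>> parse_duration('1:02:03')
#   3723.0
#   >>> parse_duration('2h 30m')
#   9000.0
#   >>> parse_duration('90s')
#   90.0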
2062
e65e4c88 2063def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2064 name, real_ext = os.path.splitext(filename)
e65e4c88 2065 return (
86e5f3ed 2066 f'{name}.{ext}{real_ext}'
e65e4c88 2067 if not expected_real_ext or real_ext[1:] == expected_real_ext
86e5f3ed 2068 else f'{filename}.{ext}')
d70ad093
PH
2069
2070
b3ed15b7
S
2071def replace_extension(filename, ext, expected_real_ext=None):
2072 name, real_ext = os.path.splitext(filename)
86e5f3ed 2073 return '{}.{}'.format(
b3ed15b7
S
2074 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2075 ext)
2076
2077
d70ad093
PH
2078def check_executable(exe, args=[]):
2079 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2080 args can be a list of arguments for a short output (like -version) """
2081 try:
f0c9fb96 2082 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
d70ad093
PH
2083 except OSError:
2084 return False
2085 return exe
b7ab0590
PH
2086
2087
7aaf4cd2 2088def _get_exe_version_output(exe, args):
95807118 2089 try:
b64d04c1 2090 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2091 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2092 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
1cdda329 2093 stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
2094 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2095 if ret:
2096 return None
95807118
PH
2097 except OSError:
2098 return False
f0c9fb96 2099 return stdout
cae97f65
PH
2100
2101
2102def detect_exe_version(output, version_re=None, unrecognized='present'):
14f25df2 2103 assert isinstance(output, str)
cae97f65
PH
2104 if version_re is None:
2105 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2106 m = re.search(version_re, output)
95807118
PH
2107 if m:
2108 return m.group(1)
2109 else:
2110 return unrecognized
2111
2112
9af98e17 2113def get_exe_version(exe, args=['--version'],
1cdda329 2114 version_re=None, unrecognized=('present', 'broken')):
9af98e17 2115 """ Returns the version of the specified executable,
2116 or False if the executable is not present """
1cdda329 2117 unrecognized = variadic(unrecognized)
2118 assert len(unrecognized) in (1, 2)
9af98e17 2119 out = _get_exe_version_output(exe, args)
1cdda329 2120 if out is None:
2121 return unrecognized[-1]
2122 return out and detect_exe_version(out, version_re, unrecognized[0])
9af98e17 2123
2124
7e88d7d7 2125def frange(start=0, stop=None, step=1):
2126 """Float range"""
2127 if stop is None:
2128 start, stop = 0, start
2129 sign = [-1, 1][step > 0] if step else 0
2130 while sign * start < sign * stop:
2131 yield start
2132 start += step
2133
2134
cb89cfc1 2135class LazyList(collections.abc.Sequence):
0f06bcd7 2136 """Lazy immutable list from an iterable
2137 Note that slices of a LazyList are lists and not LazyList"""
483336e7 2138
8e5fecc8 2139 class IndexError(IndexError):
2140 pass
2141
282f5709 2142 def __init__(self, iterable, *, reverse=False, _cache=None):
0f06bcd7 2143 self._iterable = iter(iterable)
2144 self._cache = [] if _cache is None else _cache
2145 self._reversed = reverse
483336e7 2146
2147 def __iter__(self):
0f06bcd7 2148 if self._reversed:
28419ca2 2149 # We need to consume the entire iterable to iterate in reverse
981052c9 2150 yield from self.exhaust()
28419ca2 2151 return
0f06bcd7 2152 yield from self._cache
2153 for item in self._iterable:
2154 self._cache.append(item)
483336e7 2155 yield item
2156
0f06bcd7 2157 def _exhaust(self):
2158 self._cache.extend(self._iterable)
2159 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2160 return self._cache
28419ca2 2161
981052c9 2162 def exhaust(self):
0f06bcd7 2163 """Evaluate the entire iterable"""
2164 return self._exhaust()[::-1 if self._reversed else 1]
981052c9 2165
28419ca2 2166 @staticmethod
0f06bcd7 2167 def _reverse_index(x):
f2df4071 2168 return None if x is None else ~x
483336e7 2169
2170 def __getitem__(self, idx):
2171 if isinstance(idx, slice):
0f06bcd7 2172 if self._reversed:
2173 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
e0f2b4b4 2174 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2175 elif isinstance(idx, int):
0f06bcd7 2176 if self._reversed:
2177 idx = self._reverse_index(idx)
e0f2b4b4 2178 start, stop, step = idx, idx, 0
483336e7 2179 else:
2180 raise TypeError('indices must be integers or slices')
e0f2b4b4 2181 if ((start or 0) < 0 or (stop or 0) < 0
2182 or (start is None and step < 0)
2183 or (stop is None and step > 0)):
483336e7 2184 # We need to consume the entire iterable to be able to slice from the end
2185 # Obviously, never use this with infinite iterables
0f06bcd7 2186 self._exhaust()
8e5fecc8 2187 try:
0f06bcd7 2188 return self._cache[idx]
8e5fecc8 2189 except IndexError as e:
2190 raise self.IndexError(e) from e
0f06bcd7 2191 n = max(start or 0, stop or 0) - len(self._cache) + 1
28419ca2 2192 if n > 0:
0f06bcd7 2193 self._cache.extend(itertools.islice(self._iterable, n))
8e5fecc8 2194 try:
0f06bcd7 2195 return self._cache[idx]
8e5fecc8 2196 except IndexError as e:
2197 raise self.IndexError(e) from e
483336e7 2198
2199 def __bool__(self):
2200 try:
0f06bcd7 2201 self[-1] if self._reversed else self[0]
8e5fecc8 2202 except self.IndexError:
483336e7 2203 return False
2204 return True
2205
2206 def __len__(self):
0f06bcd7 2207 self._exhaust()
2208 return len(self._cache)
483336e7 2209
282f5709 2210 def __reversed__(self):
0f06bcd7 2211 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
282f5709 2212
2213 def __copy__(self):
0f06bcd7 2214 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
282f5709 2215
28419ca2 2216 def __repr__(self):
2217 # repr and str should mimic a list. So we exhaust the iterable
2218 return repr(self.exhaust())
2219
2220 def __str__(self):
2221 return repr(self.exhaust())
2222
483336e7 2223
7be9ccff 2224class PagedList:
c07a39ae 2225
2226 class IndexError(IndexError):
2227 pass
2228
dd26ced1
PH
2229 def __len__(self):
2230 # This is only useful for tests
2231 return len(self.getslice())
2232
7be9ccff 2233 def __init__(self, pagefunc, pagesize, use_cache=True):
2234 self._pagefunc = pagefunc
2235 self._pagesize = pagesize
f1d13090 2236 self._pagecount = float('inf')
7be9ccff 2237 self._use_cache = use_cache
2238 self._cache = {}
2239
2240 def getpage(self, pagenum):
d8cf8d97 2241 page_results = self._cache.get(pagenum)
2242 if page_results is None:
f1d13090 2243 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
7be9ccff 2244 if self._use_cache:
2245 self._cache[pagenum] = page_results
2246 return page_results
2247
2248 def getslice(self, start=0, end=None):
2249 return list(self._getslice(start, end))
2250
2251 def _getslice(self, start, end):
55575225 2252 raise NotImplementedError('This method must be implemented by subclasses')
2253
2254 def __getitem__(self, idx):
f1d13090 2255 assert self._use_cache, 'Indexing PagedList requires cache'
55575225 2256 if not isinstance(idx, int) or idx < 0:
2257 raise TypeError('indices must be non-negative integers')
2258 entries = self.getslice(idx, idx + 1)
d8cf8d97 2259 if not entries:
c07a39ae 2260 raise self.IndexError()
d8cf8d97 2261 return entries[0]
55575225 2262
9c44d242
PH
2263
2264class OnDemandPagedList(PagedList):
a44ca5a4 2265 """Download pages until a page with fewer than the maximum number of results"""
86e5f3ed 2266
7be9ccff 2267 def _getslice(self, start, end):
b7ab0590
PH
2268 for pagenum in itertools.count(start // self._pagesize):
2269 firstid = pagenum * self._pagesize
2270 nextfirstid = pagenum * self._pagesize + self._pagesize
2271 if start >= nextfirstid:
2272 continue
2273
b7ab0590
PH
2274 startv = (
2275 start % self._pagesize
2276 if firstid <= start < nextfirstid
2277 else 0)
b7ab0590
PH
2278 endv = (
2279 ((end - 1) % self._pagesize) + 1
2280 if (end is not None and firstid <= end <= nextfirstid)
2281 else None)
2282
f1d13090 2283 try:
2284 page_results = self.getpage(pagenum)
2285 except Exception:
2286 self._pagecount = pagenum - 1
2287 raise
b7ab0590
PH
2288 if startv != 0 or endv is not None:
2289 page_results = page_results[startv:endv]
7be9ccff 2290 yield from page_results
b7ab0590
PH
2291
2292 # A little optimization - if the current page is not "full", i.e. does
2293 # not contain page_size videos, then we can assume that this page
2294 # is the last one - there are no more ids on further pages -
2295 # i.e. no need to query again.
2296 if len(page_results) + startv < self._pagesize:
2297 break
2298
2299 # If we got the whole page, but the next page is not interesting,
2300 # break out early as well
2301 if end == nextfirstid:
2302 break
81c2f20b
PH
2303
2304
9c44d242 2305class InAdvancePagedList(PagedList):
a44ca5a4 2306 """PagedList with total number of pages known in advance"""
86e5f3ed 2307
9c44d242 2308 def __init__(self, pagefunc, pagecount, pagesize):
7be9ccff 2309 PagedList.__init__(self, pagefunc, pagesize, True)
f1d13090 2310 self._pagecount = pagecount
9c44d242 2311
7be9ccff 2312 def _getslice(self, start, end):
9c44d242 2313 start_page = start // self._pagesize
d37707bd 2314 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
9c44d242
PH
2315 skip_elems = start - start_page * self._pagesize
2316 only_more = None if end is None else end - start
2317 for pagenum in range(start_page, end_page):
7be9ccff 2318 page_results = self.getpage(pagenum)
9c44d242 2319 if skip_elems:
7be9ccff 2320 page_results = page_results[skip_elems:]
9c44d242
PH
2321 skip_elems = None
2322 if only_more is not None:
7be9ccff 2323 if len(page_results) < only_more:
2324 only_more -= len(page_results)
9c44d242 2325 else:
7be9ccff 2326 yield from page_results[:only_more]
9c44d242 2327 break
7be9ccff 2328 yield from page_results
9c44d242
PH
2329
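# Illustrative usage sketch (fetch_page, the page size of 3 and the total of 8 items are made up):
#   def fetch_page(pagenum):
#       return range(pagenum * 3, min((pagenum + 1) * 3, 8))
#   pages = OnDemandPagedList(fetch_page, 3)
#   pages.getslice(2, 6)   # -> [2, 3, 4, 5], fetching only pages 0 and 1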
2330
7e88d7d7 2331class PlaylistEntries:
2332 MissingEntry = object()
2333 is_exhausted = False
2334
2335 def __init__(self, ydl, info_dict):
7e9a6125 2336 self.ydl = ydl
2337
2338 # _entries must be assigned now since infodict can change during iteration
2339 entries = info_dict.get('entries')
2340 if entries is None:
2341 raise EntryNotInPlaylist('There are no entries')
2342 elif isinstance(entries, list):
2343 self.is_exhausted = True
2344
2345 requested_entries = info_dict.get('requested_entries')
bc5c2f8a 2346 self.is_incomplete = requested_entries is not None
7e9a6125 2347 if self.is_incomplete:
2348 assert self.is_exhausted
bc5c2f8a 2349 self._entries = [self.MissingEntry] * max(requested_entries or [0])
7e9a6125 2350 for i, entry in zip(requested_entries, entries):
2351 self._entries[i - 1] = entry
2352 elif isinstance(entries, (list, PagedList, LazyList)):
2353 self._entries = entries
2354 else:
2355 self._entries = LazyList(entries)
7e88d7d7 2356
2357 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2358 (?P<start>[+-]?\d+)?
2359 (?P<range>[:-]
2360 (?P<end>[+-]?\d+|inf(?:inite)?)?
2361 (?::(?P<step>[+-]?\d+))?
2362 )?''')
2363
2364 @classmethod
2365 def parse_playlist_items(cls, string):
2366 for segment in string.split(','):
2367 if not segment:
2368 raise ValueError('There are two or more consecutive commas')
2369 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2370 if not mobj:
2371 raise ValueError(f'{segment!r} is not a valid specification')
2372 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2373 if int_or_none(step) == 0:
2374 raise ValueError(f'Step in {segment!r} cannot be zero')
2375 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2376
2377 def get_requested_items(self):
2378 playlist_items = self.ydl.params.get('playlist_items')
2379 playlist_start = self.ydl.params.get('playliststart', 1)
2380 playlist_end = self.ydl.params.get('playlistend')
2381 # For backwards compatibility, interpret -1 as whole list
2382 if playlist_end in (-1, None):
2383 playlist_end = ''
2384 if not playlist_items:
2385 playlist_items = f'{playlist_start}:{playlist_end}'
2386 elif playlist_start != 1 or playlist_end:
2387 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2388
2389 for index in self.parse_playlist_items(playlist_items):
2390 for i, entry in self[index]:
2391 yield i, entry
1ac4fd80 2392 if not entry:
2393 continue
7e88d7d7 2394 try:
d21056f4 2395 # The item may have just been added to archive. Don't break due to it
2396 if not self.ydl.params.get('lazy_playlist'):
2397 # TODO: Add auto-generated fields
2398 self.ydl._match_entry(entry, incomplete=True, silent=True)
7e88d7d7 2399 except (ExistingVideoReached, RejectedVideoReached):
2400 return
2401
7e9a6125 2402 def get_full_count(self):
2403 if self.is_exhausted and not self.is_incomplete:
7e88d7d7 2404 return len(self)
2405 elif isinstance(self._entries, InAdvancePagedList):
2406 if self._entries._pagesize == 1:
2407 return self._entries._pagecount
2408
7e88d7d7 2409 @functools.cached_property
2410 def _getter(self):
2411 if isinstance(self._entries, list):
2412 def get_entry(i):
2413 try:
2414 entry = self._entries[i]
2415 except IndexError:
2416 entry = self.MissingEntry
2417 if not self.is_incomplete:
2418 raise self.IndexError()
2419 if entry is self.MissingEntry:
bc5c2f8a 2420 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
7e88d7d7 2421 return entry
2422 else:
2423 def get_entry(i):
2424 try:
2425 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2426 except (LazyList.IndexError, PagedList.IndexError):
2427 raise self.IndexError()
2428 return get_entry
2429
2430 def __getitem__(self, idx):
2431 if isinstance(idx, int):
2432 idx = slice(idx, idx)
2433
2434 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2435 step = 1 if idx.step is None else idx.step
2436 if idx.start is None:
2437 start = 0 if step > 0 else len(self) - 1
2438 else:
2439 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2440
2441 # NB: Do not call len(self) when idx == [:]
2442 if idx.stop is None:
2443 stop = 0 if step < 0 else float('inf')
2444 else:
2445 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2446 stop += [-1, 1][step > 0]
2447
2448 for i in frange(start, stop, step):
2449 if i < 0:
2450 continue
2451 try:
7e9a6125 2452 entry = self._getter(i)
2453 except self.IndexError:
2454 self.is_exhausted = True
2455 if step > 0:
7e88d7d7 2456 break
7e9a6125 2457 continue
7e88d7d7 2458 yield i + 1, entry
2459
2460 def __len__(self):
2461 return len(tuple(self[:]))
2462
2463 class IndexError(IndexError):
2464 pass
2465
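# Illustrative parse of a playlist-items style specification (the spec string is made up):
#   >>> list(PlaylistEntries.parse_playlist_items('2,5:8,10::2'))
#   [2, slice(5, 8.0, None), slice(10, None, 2)]
# The indices are 1-based when the slices are later applied via __getitem__.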
2466
81c2f20b 2467def uppercase_escape(s):
676eb3f2 2468 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2469 return re.sub(
a612753d 2470 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2471 lambda m: unicode_escape(m.group(0))[0],
2472 s)
0fe2ff78
YCH
2473
2474
2475def lowercase_escape(s):
2476 unicode_escape = codecs.getdecoder('unicode_escape')
2477 return re.sub(
2478 r'\\u[0-9a-fA-F]{4}',
2479 lambda m: unicode_escape(m.group(0))[0],
2480 s)
b53466e1 2481
d05cfe06 2482
96b9e9cf 2483def parse_qs(url, **kwargs):
2484 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
4dfbf869 2485
2486
62e609ab
PH
2487def read_batch_urls(batch_fd):
2488 def fixup(url):
14f25df2 2489 if not isinstance(url, str):
62e609ab 2490 url = url.decode('utf-8', 'replace')
8c04f0be 2491 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2492 for bom in BOM_UTF8:
2493 if url.startswith(bom):
2494 url = url[len(bom):]
2495 url = url.lstrip()
2496 if not url or url.startswith(('#', ';', ']')):
62e609ab 2497 return False
8c04f0be 2498 # "#" cannot be stripped out since it is part of the URI
962ffcf8 2499 # However, it can be safely stripped out when it follows whitespace
8c04f0be 2500 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
2501
2502 with contextlib.closing(batch_fd) as fd:
2503 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2504
2505
2506def urlencode_postdata(*args, **kargs):
14f25df2 2507 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2508
2509
45b2ee6f 2510def update_url(url, *, query_update=None, **kwargs):
2511 """Replace URL components specified by kwargs
2512 @param url str or parse url tuple
2513 @param query_update update query
2514 @returns str
2515 """
2516 if isinstance(url, str):
2517 if not kwargs and not query_update:
2518 return url
2519 else:
2520 url = urllib.parse.urlparse(url)
2521 if query_update:
2522 assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
2523 kwargs['query'] = urllib.parse.urlencode({
2524 **urllib.parse.parse_qs(url.query),
2525 **query_update
2526 }, True)
2527 return urllib.parse.urlunparse(url._replace(**kwargs))
2528
2529
38f9ef31 2530def update_url_query(url, query):
45b2ee6f 2531 return update_url(url, query_update=query)
16392824 2532
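# Illustrative usage (the URL is hypothetical):
#   >>> update_url_query('https://example.com/path?a=1', {'b': '2'})
#   'https://example.com/path?a=1&b=2'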
8e60dc75 2533
10c87c15 2534def _multipart_encode_impl(data, boundary):
0c265486
YCH
2535 content_type = 'multipart/form-data; boundary=%s' % boundary
2536
2537 out = b''
2538 for k, v in data.items():
2539 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 2540 if isinstance(k, str):
0f06bcd7 2541 k = k.encode()
14f25df2 2542 if isinstance(v, str):
0f06bcd7 2543 v = v.encode()
0c265486
YCH
2544 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2545 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 2546 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
2547 if boundary.encode('ascii') in content:
2548 raise ValueError('Boundary overlaps with data')
2549 out += content
2550
2551 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2552
2553 return out, content_type
2554
2555
2556def multipart_encode(data, boundary=None):
2557 '''
2558 Encode a dict to RFC 7578-compliant form-data
2559
2560 data:
2561 A dict where keys and values can be either Unicode or bytes-like
2562 objects.
2563 boundary:
2564 If specified as a Unicode object, it's used as the boundary. Otherwise
2565 a random boundary is generated.
2566
2567 Reference: https://tools.ietf.org/html/rfc7578
2568 '''
2569 has_specified_boundary = boundary is not None
2570
2571 while True:
2572 if boundary is None:
2573 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2574
2575 try:
10c87c15 2576 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
2577 break
2578 except ValueError:
2579 if has_specified_boundary:
2580 raise
2581 boundary = None
2582
2583 return out, content_type
2584
2585
b079c26f
SS
2586def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
2587 if blocked_types is NO_DEFAULT:
2588 blocked_types = (str, bytes, collections.abc.Mapping)
2589 return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
2590
2591
2592def variadic(x, allowed_types=NO_DEFAULT):
4823ec9f 2593 if not isinstance(allowed_types, (tuple, type)):
2594 deprecation_warning('allowed_types should be a tuple or a type')
2595 allowed_types = tuple(allowed_types)
6f2287cb 2596 return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
304ad45a 2597
2598
c4f60dd7 2599def try_call(*funcs, expected_type=None, args=[], kwargs={}):
2600 for f in funcs:
a32a9a7e 2601 try:
c4f60dd7 2602 val = f(*args, **kwargs)
ab029d7e 2603 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
a32a9a7e
S
2604 pass
2605 else:
c4f60dd7 2606 if expected_type is None or isinstance(val, expected_type):
2607 return val
2608
2609
2610def try_get(src, getter, expected_type=None):
2611 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
329ca3be
S
2612
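# Illustrative usage (the dict is made up):
#   >>> data = {'items': [{'id': 'abc'}]}
#   >>> try_get(data, lambda x: x['items'][0]['id'], str)
#   'abc'
#   >>> try_get(data, lambda x: x['missing'][0], str)   # KeyError is swallowed -> None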
2613
90137ca4 2614def filter_dict(dct, cndn=lambda _, v: v is not None):
2615 return {k: v for k, v in dct.items() if cndn(k, v)}
2616
2617
6cc62232
S
2618def merge_dicts(*dicts):
2619 merged = {}
2620 for a_dict in dicts:
2621 for k, v in a_dict.items():
90137ca4 2622 if (v is not None and k not in merged
2623 or isinstance(v, str) and merged[k] == ''):
6cc62232
S
2624 merged[k] = v
2625 return merged
2626
2627
8e60dc75 2628def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
14f25df2 2629 return string if isinstance(string, str) else str(string, encoding, errors)
8e60dc75 2630
16392824 2631
a1a530b0
PH
2632US_RATINGS = {
2633 'G': 0,
2634 'PG': 10,
2635 'PG-13': 13,
2636 'R': 16,
2637 'NC': 18,
2638}
fac55558
PH
2639
2640
a8795327 2641TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
2642 'TV-Y': 0,
2643 'TV-Y7': 7,
2644 'TV-G': 0,
2645 'TV-PG': 0,
2646 'TV-14': 14,
2647 'TV-MA': 17,
a8795327
S
2648}
2649
2650
146c80e2 2651def parse_age_limit(s):
19a03940 2652 # isinstance(False, int) is True. So type() must be used instead
c487cf00 2653 if type(s) is int: # noqa: E721
a8795327 2654 return s if 0 <= s <= 21 else None
19a03940 2655 elif not isinstance(s, str):
d838b1bd 2656 return None
146c80e2 2657 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
2658 if m:
2659 return int(m.group('age'))
5c5fae6d 2660 s = s.upper()
a8795327
S
2661 if s in US_RATINGS:
2662 return US_RATINGS[s]
5a16c9d9 2663 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 2664 if m:
5a16c9d9 2665 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 2666 return None
146c80e2
S
2667
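# Illustrative values:
#   >>> parse_age_limit('18+')
#   18
#   >>> parse_age_limit('TV-MA')
#   17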
2668
fac55558 2669def strip_jsonp(code):
609a61e3 2670 return re.sub(
5552c9eb 2671 r'''(?sx)^
e9c671d5 2672 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
2673 (?:\s*&&\s*(?P=func_name))?
2674 \s*\(\s*(?P<callback_data>.*)\);?
2675 \s*?(?://[^\n]*)*$''',
2676 r'\g<callback_data>', code)
478c2c61
PH
2677
2678
8f53dc44 2679def js_to_json(code, vars={}, *, strict=False):
5c610515 2680 # vars is a dict of var, val pairs to substitute
0898c5c8 2681 STRING_QUOTES = '\'"`'
a71b812f 2682 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
c843e685 2683 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
86e5f3ed 2684 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
4195096e 2685 INTEGER_TABLE = (
86e5f3ed 2686 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
2687 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
4195096e
S
2688 )
2689
a71b812f
SS
2690 def process_escape(match):
2691 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
2692 escape = match.group(1) or match.group(2)
2693
2694 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
2695 else R'\u00' if escape == 'x'
2696 else '' if escape == '\n'
2697 else escape)
2698
0898c5c8
SS
2699 def template_substitute(match):
2700 evaluated = js_to_json(match.group(1), vars, strict=strict)
2701 if evaluated[0] == '"':
2702 return json.loads(evaluated)
2703 return evaluated
2704
e05f6939 2705 def fix_kv(m):
e7b6d122
PH
2706 v = m.group(0)
2707 if v in ('true', 'false', 'null'):
2708 return v
421ddcb8
C
2709 elif v in ('undefined', 'void 0'):
2710 return 'null'
8bdd16b4 2711 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
a71b812f
SS
2712 return ''
2713
2714 if v[0] in STRING_QUOTES:
0898c5c8
SS
2715 v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
2716 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
a71b812f
SS
2717 return f'"{escaped}"'
2718
2719 for regex, base in INTEGER_TABLE:
2720 im = re.match(regex, v)
2721 if im:
2722 i = int(im.group(1), base)
2723 return f'"{i}":' if v.endswith(':') else str(i)
2724
2725 if v in vars:
d5f043d1
C
2726 try:
2727 if not strict:
2728 json.loads(vars[v])
08e29b9f 2729 except json.JSONDecodeError:
d5f043d1
C
2730 return json.dumps(vars[v])
2731 else:
2732 return vars[v]
89ac4a19 2733
a71b812f
SS
2734 if not strict:
2735 return f'"{v}"'
5c610515 2736
a71b812f 2737 raise ValueError(f'Unknown value: {v}')
e05f6939 2738
8072ef2b 2739 def create_map(mobj):
2740 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
2741
52414d64 2742 code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code)
8072ef2b 2743 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
8f53dc44 2744 if not strict:
9d7ded64 2745 code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code)
f55523cf 2746 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
389896df 2747 code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
2748 code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
febff4c1 2749
a71b812f
SS
2750 return re.sub(rf'''(?sx)
2751 {STRING_RE}|
2752 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
421ddcb8 2753 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
a71b812f
SS
2754 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
2755 [0-9]+(?={SKIP_RE}:)|
8bdd16b4 2756 !+
a71b812f 2757 ''', fix_kv, code)
e05f6939
PH
2758
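# Illustrative conversion (the object literal is made up):
#   >>> js_to_json("{a: 1, 'b': true, c: undefined, d: 0x10,}")
#   '{"a": 1, "b": true, "c": null, "d": 16}'
# Unquoted keys are quoted, single quotes become double quotes, "undefined" maps to
# null, hex/octal literals are converted, and trailing commas are dropped.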
2759
478c2c61
PH
2760def qualities(quality_ids):
2761 """ Get a numeric quality value out of a list of possible values """
2762 def q(qid):
2763 try:
2764 return quality_ids.index(qid)
2765 except ValueError:
2766 return -1
2767 return q
2768
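# Illustrative usage (the quality list is made up):
#   >>> q = qualities(['240p', '360p', '720p', '1080p'])
#   >>> q('720p')
#   2
#   >>> q('4320p')   # unknown values sort below all known ones
#   -1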
acd69589 2769
119e40ef 2770POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
1e43a6f7 2771
2772
de6000d9 2773DEFAULT_OUTTMPL = {
2774 'default': '%(title)s [%(id)s].%(ext)s',
72755351 2775 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 2776}
2777OUTTMPL_TYPES = {
72755351 2778 'chapter': None,
de6000d9 2779 'subtitle': None,
2780 'thumbnail': None,
2781 'description': 'description',
2782 'annotation': 'annotations.xml',
2783 'infojson': 'info.json',
08438d2c 2784 'link': None,
3b603dbd 2785 'pl_video': None,
5112f26a 2786 'pl_thumbnail': None,
de6000d9 2787 'pl_description': 'description',
2788 'pl_infojson': 'info.json',
2789}
0a871f68 2790
143db31d 2791# As of [1], the format syntax is:
2792# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
2793# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 2794STR_FORMAT_RE_TMPL = r'''(?x)
2795 (?<!%)(?P<prefix>(?:%%)*)
143db31d 2796 %
524e2e4f 2797 (?P<has_key>\((?P<key>{0})\))?
752cda38 2798 (?P<format>
524e2e4f 2799 (?P<conversion>[#0\-+ ]+)?
2800 (?P<min_width>\d+)?
2801 (?P<precision>\.\d+)?
2802 (?P<len_mod>[hlL])? # unused in python
901130bb 2803 {1} # conversion type
752cda38 2804 )
143db31d 2805'''
2806
7d1eb38a 2807
ebe1b4e3 2808STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
a020a0dc 2809
7d1eb38a 2810
a020a0dc
PH
2811def limit_length(s, length):
2812 """ Add ellipses to overly long strings """
2813 if s is None:
2814 return None
2815 ELLIPSES = '...'
2816 if len(s) > length:
2817 return s[:length - len(ELLIPSES)] + ELLIPSES
2818 return s
48844745
PH
2819
2820
2821def version_tuple(v):
5f9b8394 2822 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
2823
2824
2825def is_outdated_version(version, limit, assume_new=True):
2826 if not version:
2827 return not assume_new
2828 try:
2829 return version_tuple(version) < version_tuple(limit)
2830 except ValueError:
2831 return not assume_new
732ea2f0
PH
2832
2833
2834def ytdl_is_updateable():
7a5c1cfe 2835 """ Returns if yt-dlp can be updated with -U """
735d865e 2836
69bec673 2837 from ..update import is_non_updateable
732ea2f0 2838
5d535b4a 2839 return not is_non_updateable()
7d4111ed
PH
2840
2841
2842def args_to_str(args):
2843 # Get a short string representation for a subprocess command
702ccf2d 2844 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
2845
2846
a44ca5a4 2847def error_to_str(err):
2848 return f'{type(err).__name__}: {err}'
2849
2850
2647c933 2851def mimetype2ext(mt, default=NO_DEFAULT):
2852 if not isinstance(mt, str):
2853 if default is not NO_DEFAULT:
2854 return default
eb9ee194
S
2855 return None
2856
2647c933 2857 MAP = {
2858 # video
f6861ec9 2859 '3gpp': '3gp',
2647c933 2860 'mp2t': 'ts',
2861 'mp4': 'mp4',
2862 'mpeg': 'mpeg',
2863 'mpegurl': 'm3u8',
2864 'quicktime': 'mov',
2865 'webm': 'webm',
2866 'vp9': 'vp9',
f659e643 2867 'video/ogg': 'ogv',
f6861ec9 2868 'x-flv': 'flv',
2647c933 2869 'x-m4v': 'm4v',
2870 'x-matroska': 'mkv',
2871 'x-mng': 'mng',
a0d8d704 2872 'x-mp4-fragmented': 'mp4',
2647c933 2873 'x-ms-asf': 'asf',
a0d8d704 2874 'x-ms-wmv': 'wmv',
2647c933 2875 'x-msvideo': 'avi',
2876
2877 # application (streaming playlists)
b4173f15 2878 'dash+xml': 'mpd',
b4173f15 2879 'f4m+xml': 'f4m',
f164b971 2880 'hds+xml': 'f4m',
2647c933 2881 'vnd.apple.mpegurl': 'm3u8',
e910fe2f 2882 'vnd.ms-sstr+xml': 'ism',
2647c933 2883 'x-mpegurl': 'm3u8',
2884
2885 # audio
2886 'audio/mp4': 'm4a',
2887 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
2888 # Using .mp3 as it's the most popular one
2889 'audio/mpeg': 'mp3',
d80ca5de 2890 'audio/webm': 'webm',
2647c933 2891 'audio/x-matroska': 'mka',
2892 'audio/x-mpegurl': 'm3u',
2893 'midi': 'mid',
2894 'ogg': 'ogg',
2895 'wav': 'wav',
2896 'wave': 'wav',
2897 'x-aac': 'aac',
2898 'x-flac': 'flac',
2899 'x-m4a': 'm4a',
2900 'x-realaudio': 'ra',
39e7107d 2901 'x-wav': 'wav',
9359f3d4 2902
2647c933 2903 # image
2904 'avif': 'avif',
2905 'bmp': 'bmp',
2906 'gif': 'gif',
2907 'jpeg': 'jpg',
2908 'png': 'png',
2909 'svg+xml': 'svg',
2910 'tiff': 'tif',
2911 'vnd.wap.wbmp': 'wbmp',
2912 'webp': 'webp',
2913 'x-icon': 'ico',
2914 'x-jng': 'jng',
2915 'x-ms-bmp': 'bmp',
2916
2917 # caption
2918 'filmstrip+json': 'fs',
2919 'smptett+xml': 'tt',
2920 'ttaf+xml': 'dfxp',
2921 'ttml+xml': 'ttml',
2922 'x-ms-sami': 'sami',
9359f3d4 2923
2647c933 2924 # misc
2925 'gzip': 'gz',
9359f3d4
F
2926 'json': 'json',
2927 'xml': 'xml',
2928 'zip': 'zip',
9359f3d4
F
2929 }
2930
2647c933 2931 mimetype = mt.partition(';')[0].strip().lower()
2932 _, _, subtype = mimetype.rpartition('/')
9359f3d4 2933
69bec673 2934 ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
2647c933 2935 if ext:
2936 return ext
2937 elif default is not NO_DEFAULT:
2938 return default
9359f3d4 2939 return subtype.replace('+', '.')
c460bdd5
PH
2940
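# Illustrative values:
#   >>> mimetype2ext('video/mp4; codecs="avc1.42E01E"')
#   'mp4'
#   >>> mimetype2ext('application/x-mpegurl')
#   'm3u8'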
2941
2814f12b
THD
2942def ext2mimetype(ext_or_url):
2943 if not ext_or_url:
2944 return None
2945 if '.' not in ext_or_url:
2946 ext_or_url = f'file.{ext_or_url}'
2947 return mimetypes.guess_type(ext_or_url)[0]
2948
2949
4f3c5e06 2950def parse_codecs(codecs_str):
2951 # http://tools.ietf.org/html/rfc6381
2952 if not codecs_str:
2953 return {}
a0566bbf 2954 split_codecs = list(filter(None, map(
dbf5416a 2955 str.strip, codecs_str.strip().strip(',').split(','))))
3fe75fdc 2956 vcodec, acodec, scodec, hdr = None, None, None, None
a0566bbf 2957 for full_codec in split_codecs:
d816f61f 2958 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
2959 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
2960 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
2961 if vcodec:
2962 continue
2963 vcodec = full_codec
2964 if parts[0] in ('dvh1', 'dvhe'):
2965 hdr = 'DV'
69bec673 2966 elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
d816f61f 2967 hdr = 'HDR10'
2968 elif parts[:2] == ['vp9', '2']:
2969 hdr = 'HDR10'
71082216 2970 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
d816f61f 2971 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
2972 acodec = acodec or full_codec
2973 elif parts[0] in ('stpp', 'wvtt'):
2974 scodec = scodec or full_codec
4f3c5e06 2975 else:
19a03940 2976 write_string(f'WARNING: Unknown codec {full_codec}\n')
3fe75fdc 2977 if vcodec or acodec or scodec:
4f3c5e06 2978 return {
2979 'vcodec': vcodec or 'none',
2980 'acodec': acodec or 'none',
176f1866 2981 'dynamic_range': hdr,
3fe75fdc 2982 **({'scodec': scodec} if scodec is not None else {}),
4f3c5e06 2983 }
b69fd25c 2984 elif len(split_codecs) == 2:
2985 return {
2986 'vcodec': split_codecs[0],
2987 'acodec': split_codecs[1],
2988 }
4f3c5e06 2989 return {}
2990
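# Illustrative value (a typical RFC 6381 codecs string; output assumes the implementation above):
#   >>> parse_codecs('avc1.42E01E, mp4a.40.2')
#   {'vcodec': 'avc1.42E01E', 'acodec': 'mp4a.40.2', 'dynamic_range': None}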
2991
fc61aff4
LL
2992def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
2993 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
2994
2995 allow_mkv = not preferences or 'mkv' in preferences
2996
2997 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
2998 return 'mkv' # TODO: any other format allows this?
2999
3000 # TODO: All codecs supported by parse_codecs isn't handled here
3001 COMPATIBLE_CODECS = {
3002 'mp4': {
71082216 3003 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
81b6102d 3004 'h264', 'aacl', 'ec-3', # Set in ISM
fc61aff4
LL
3005 },
3006 'webm': {
3007 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3008 'vp9x', 'vp8x', # in the webm spec
3009 },
3010 }
3011
812cdfa0 3012 sanitize_codec = functools.partial(
3013 try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
8f84770a 3014 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
fc61aff4
LL
3015
3016 for ext in preferences or COMPATIBLE_CODECS.keys():
3017 codec_set = COMPATIBLE_CODECS.get(ext, set())
3018 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3019 return ext
3020
3021 COMPATIBLE_EXTS = (
3022 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
fbb73833 3023 {'webm', 'weba'},
fc61aff4
LL
3024 )
3025 for ext in preferences or vexts:
3026 current_exts = {ext, *vexts, *aexts}
3027 if ext == 'mkv' or current_exts == {ext} or any(
3028 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3029 return ext
3030 return 'mkv' if allow_mkv else preferences[-1]
3031
3032
2647c933 3033def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
79298173 3034 getheader = url_handle.headers.get
2ccd1b10 3035
b55ee18f
PH
3036 cd = getheader('Content-Disposition')
3037 if cd:
3038 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3039 if m:
3040 e = determine_ext(m.group('filename'), default_ext=None)
3041 if e:
3042 return e
3043
2647c933 3044 meta_ext = getheader('x-amz-meta-name')
3045 if meta_ext:
3046 e = meta_ext.rpartition('.')[2]
3047 if e:
3048 return e
3049
3050 return mimetype2ext(getheader('Content-Type'), default=default)
05900629
PH
3051
3052
1e399778
YCH
3053def encode_data_uri(data, mime_type):
3054 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3055
3056
05900629 3057def age_restricted(content_limit, age_limit):
6ec6cb4e 3058 """ Returns True iff the content should be blocked """
05900629
PH
3059
3060 if age_limit is None: # No limit set
3061 return False
3062 if content_limit is None:
3063 return False # Content available for everyone
3064 return age_limit < content_limit
61ca9a80
PH
3065
3066
88f60feb 3067# List of known byte-order-marks (BOM)
a904a7f8
L
3068BOMS = [
3069 (b'\xef\xbb\xbf', 'utf-8'),
3070 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3071 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3072 (b'\xff\xfe', 'utf-16-le'),
3073 (b'\xfe\xff', 'utf-16-be'),
3074]
a904a7f8
L
3075
3076
61ca9a80
PH
3077def is_html(first_bytes):
3078 """ Detect whether a file contains HTML by examining its first bytes. """
3079
80e8493e 3080 encoding = 'utf-8'
61ca9a80 3081 for bom, enc in BOMS:
80e8493e 3082 while first_bytes.startswith(bom):
3083 encoding, first_bytes = enc, first_bytes[len(bom):]
61ca9a80 3084
80e8493e 3085 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
a055469f
PH
3086
3087
3088def determine_protocol(info_dict):
3089 protocol = info_dict.get('protocol')
3090 if protocol is not None:
3091 return protocol
3092
7de837a5 3093 url = sanitize_url(info_dict['url'])
a055469f
PH
3094 if url.startswith('rtmp'):
3095 return 'rtmp'
3096 elif url.startswith('mms'):
3097 return 'mms'
3098 elif url.startswith('rtsp'):
3099 return 'rtsp'
3100
3101 ext = determine_ext(url)
3102 if ext == 'm3u8':
deae7c17 3103 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
a055469f
PH
3104 elif ext == 'f4m':
3105 return 'f4m'
3106
14f25df2 3107 return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3108
3109
c5e3f849 3110def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3111 """ Render a list of rows, each as a list of values.
3112 Text after a \t will be right aligned """
ec11a9f4 3113 def width(string):
c5e3f849 3114 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3115
3116 def get_max_lens(table):
ec11a9f4 3117 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3118
3119 def filter_using_list(row, filterArray):
d16df59d 3120 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
76d321f6 3121
d16df59d 3122 max_lens = get_max_lens(data) if hide_empty else []
3123 header_row = filter_using_list(header_row, max_lens)
3124 data = [filter_using_list(row, max_lens) for row in data]
76d321f6 3125
cfb56d1a 3126 table = [header_row] + data
76d321f6 3127 max_lens = get_max_lens(table)
c5e3f849 3128 extra_gap += 1
76d321f6 3129 if delim:
c5e3f849 3130 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
1ed7953a 3131 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
ec11a9f4 3132 for row in table:
3133 for pos, text in enumerate(map(str, row)):
c5e3f849 3134 if '\t' in text:
3135 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3136 else:
3137 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3138 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3139 return ret
347de493
PH
3140
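# Illustrative usage sketch (header and rows are made up). Each column is padded to
# its widest cell, and cells beginning with a tab are right-aligned:
#   render_table(['ID', 'EXT', '\tFILESIZE'],
#                [['137', 'mp4', '\t123.45MiB'], ['140', 'm4a', '\t3.21MiB']])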
3141
8f18aca8 3142def _match_one(filter_part, dct, incomplete):
77b87f05 3143 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3144 STRING_OPERATORS = {
3145 '*=': operator.contains,
3146 '^=': lambda attr, value: attr.startswith(value),
3147 '$=': lambda attr, value: attr.endswith(value),
3148 '~=': lambda attr, value: re.search(value, attr),
3149 }
347de493 3150 COMPARISON_OPERATORS = {
a047eeb6 3151 **STRING_OPERATORS,
3152 '<=': operator.le, # "<=" must be defined above "<"
347de493 3153 '<': operator.lt,
347de493 3154 '>=': operator.ge,
a047eeb6 3155 '>': operator.gt,
347de493 3156 '=': operator.eq,
347de493 3157 }
a047eeb6 3158
6db9c4d5 3159 if isinstance(incomplete, bool):
3160 is_incomplete = lambda _: incomplete
3161 else:
3162 is_incomplete = lambda k: k in incomplete
3163
64fa820c 3164 operator_rex = re.compile(r'''(?x)
347de493 3165 (?P<key>[a-z_]+)
77b87f05 3166 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3167 (?:
a047eeb6 3168 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3169 (?P<strval>.+?)
347de493 3170 )
347de493 3171 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3172 m = operator_rex.fullmatch(filter_part.strip())
347de493 3173 if m:
18f96d12 3174 m = m.groupdict()
3175 unnegated_op = COMPARISON_OPERATORS[m['op']]
3176 if m['negation']:
77b87f05
MT
3177 op = lambda attr, value: not unnegated_op(attr, value)
3178 else:
3179 op = unnegated_op
18f96d12 3180 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3181 if m['quote']:
3182 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3183 actual_value = dct.get(m['key'])
3184 numeric_comparison = None
f9934b96 3185 if isinstance(actual_value, (int, float)):
e5a088dc
S
3186 # If the original field is a string and the matching comparison value is
3187 # a number, we should respect the origin of the original field
3188 # and process the comparison value as a string (see
18f96d12 3189 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3190 try:
18f96d12 3191 numeric_comparison = int(comparison_value)
347de493 3192 except ValueError:
18f96d12 3193 numeric_comparison = parse_filesize(comparison_value)
3194 if numeric_comparison is None:
3195 numeric_comparison = parse_filesize(f'{comparison_value}B')
3196 if numeric_comparison is None:
3197 numeric_comparison = parse_duration(comparison_value)
3198 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3199 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3200 if actual_value is None:
6db9c4d5 3201 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3202 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3203
3204 UNARY_OPERATORS = {
1cc47c66
S
3205 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3206 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3207 }
64fa820c 3208 operator_rex = re.compile(r'''(?x)
347de493 3209 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3210 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3211 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3212 if m:
3213 op = UNARY_OPERATORS[m.group('op')]
3214 actual_value = dct.get(m.group('key'))
6db9c4d5 3215 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3216 return True
347de493
PH
3217 return op(actual_value)
3218
3219 raise ValueError('Invalid filter part %r' % filter_part)
3220
3221
8f18aca8 3222def match_str(filter_str, dct, incomplete=False):
6db9c4d5 3223 """ Filter a dictionary with a simple string syntax.
3224 @returns Whether the filter passes
3225 @param incomplete Set of keys that are expected to be missing from dct.
3226 Can be True/False to indicate all/none of the keys may be missing.
3227 All conditions on incomplete keys pass if the key is missing
8f18aca8 3228 """
347de493 3229 return all(
8f18aca8 3230 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3231 for filter_part in re.split(r'(?<!\\)&', filter_str))
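# Example (doctest-style sketch; the metadata dicts are made up):
#   >>> match_str('like_count > 100 & dislike_count <? 50', {'like_count': 190, 'dislike_count': 10})
#   True
#   >>> match_str('duration < 30', {'duration': 90})
#   False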
347de493
PH
3232
3233
fe2ce85a 3234def match_filter_func(filters, breaking_filters=None):
3235 if not filters and not breaking_filters:
d1b5f70b 3236 return None
fe2ce85a 3237 breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
3238 filters = set(variadic(filters or []))
d1b5f70b 3239
492272fe 3240 interactive = '-' in filters
3241 if interactive:
3242 filters.remove('-')
3243
3244 def _match_func(info_dict, incomplete=False):
fe2ce85a 3245 ret = breaking_filters(info_dict, incomplete)
3246 if ret is not None:
3247 raise RejectedVideoReached(ret)
3248
492272fe 3249 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3250 return NO_DEFAULT if interactive and not incomplete else None
347de493 3251 else:
3bec830a 3252 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
b1a7cd05 3253 filter_str = ') | ('.join(map(str.strip, filters))
3254 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
347de493 3255 return _match_func
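# Example (sketch; the info dicts below are hypothetical):
#   >>> f = match_filter_func('duration < 60')
#   >>> f({'id': 'a', 'duration': 30}) is None   # passes the filter -> no skip reason
#   True
#   >>> f({'id': 'a', 'duration': 90})           # fails -> human-readable skip reason
#   'a does not pass filter (duration < 60), skipping ..'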
91410c9b
PH
3256
3257
f2df4071 3258class download_range_func:
b4e0d758 3259 def __init__(self, chapters, ranges, from_info=False):
3260 self.chapters, self.ranges, self.from_info = chapters, ranges, from_info
f2df4071 3261
3262 def __call__(self, info_dict, ydl):
0500ee3d 3263
5ec1b6b7 3264 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
56ba69e4 3265 else 'Cannot match chapters since chapter information is unavailable')
f2df4071 3266 for regex in self.chapters or []:
5ec1b6b7 3267 for i, chapter in enumerate(info_dict.get('chapters') or []):
3268 if re.search(regex, chapter['title']):
3269 warning = None
3270 yield {**chapter, 'index': i}
f2df4071 3271 if self.chapters and warning:
5ec1b6b7 3272 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3273
b4e0d758 3274 for start, end in self.ranges or []:
3275 yield {
3276 'start_time': self._handle_negative_timestamp(start, info_dict),
3277 'end_time': self._handle_negative_timestamp(end, info_dict),
3278 }
3279
3280 if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
3281 yield {
e59e2074 3282 'start_time': info_dict.get('start_time') or 0,
3283 'end_time': info_dict.get('end_time') or float('inf'),
b4e0d758 3284 }
e59e2074 3285 elif not self.ranges and not self.chapters:
3286 yield {}
b4e0d758 3287
3288 @staticmethod
3289 def _handle_negative_timestamp(time, info):
3290 return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time
5ec1b6b7 3291
f2df4071 3292 def __eq__(self, other):
3293 return (isinstance(other, download_range_func)
3294 and self.chapters == other.chapters and self.ranges == other.ranges)
5ec1b6b7 3295
71df9b7f 3296 def __repr__(self):
a5387729 3297 return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
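# Example (sketch): typically passed as the 'download_ranges' option of YoutubeDL,
# but it can also be called directly with an info dict:
#   >>> list(download_range_func(None, [(10, 20)])({'id': 'x', 'duration': 30}, None))
#   [{'start_time': 10, 'end_time': 20}]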
71df9b7f 3298
5ec1b6b7 3299
bf6427d2
YCH
3300def parse_dfxp_time_expr(time_expr):
3301 if not time_expr:
d631d5f9 3302 return
bf6427d2 3303
1d485a1a 3304 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
bf6427d2
YCH
3305 if mobj:
3306 return float(mobj.group('time_offset'))
3307
db2fe38b 3308 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3309 if mobj:
db2fe38b 3310 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
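# Example (illustrative values):
#   >>> parse_dfxp_time_expr('5.2s')
#   5.2
#   >>> parse_dfxp_time_expr('00:01:30.5')
#   90.5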
bf6427d2
YCH
3311
3312
c1c924ab 3313def srt_subtitles_timecode(seconds):
aa7785f8 3314 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3315
3316
3317def ass_subtitles_timecode(seconds):
3318 time = timetuple_from_msec(seconds * 1000)
3319 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
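# Example (illustrative value; 1h 1m 1.5s):
#   >>> srt_subtitles_timecode(3661.5)
#   '01:01:01,500'
#   >>> ass_subtitles_timecode(3661.5)
#   '1:01:01.50'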
bf6427d2
YCH
3320
3321
3322def dfxp2srt(dfxp_data):
3869028f
YCH
3323 '''
3324 @param dfxp_data A bytes-like object containing DFXP data
3325 @returns A str containing the converted SRT data
3326 '''
5b995f71 3327 LEGACY_NAMESPACES = (
3869028f
YCH
3328 (b'http://www.w3.org/ns/ttml', [
3329 b'http://www.w3.org/2004/11/ttaf1',
3330 b'http://www.w3.org/2006/04/ttaf1',
3331 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3332 ]),
3869028f
YCH
3333 (b'http://www.w3.org/ns/ttml#styling', [
3334 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
3335 ]),
3336 )
3337
3338 SUPPORTED_STYLING = [
3339 'color',
3340 'fontFamily',
3341 'fontSize',
3342 'fontStyle',
3343 'fontWeight',
3344 'textDecoration'
3345 ]
3346
4e335771 3347 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 3348 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 3349 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 3350 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 3351 })
bf6427d2 3352
5b995f71
RA
3353 styles = {}
3354 default_style = {}
3355
86e5f3ed 3356 class TTMLPElementParser:
5b995f71
RA
3357 _out = ''
3358 _unclosed_elements = []
3359 _applied_styles = []
bf6427d2 3360
2b14cb56 3361 def start(self, tag, attrib):
5b995f71
RA
3362 if tag in (_x('ttml:br'), 'br'):
3363 self._out += '\n'
3364 else:
3365 unclosed_elements = []
3366 style = {}
3367 element_style_id = attrib.get('style')
3368 if default_style:
3369 style.update(default_style)
3370 if element_style_id:
3371 style.update(styles.get(element_style_id, {}))
3372 for prop in SUPPORTED_STYLING:
3373 prop_val = attrib.get(_x('tts:' + prop))
3374 if prop_val:
3375 style[prop] = prop_val
3376 if style:
3377 font = ''
3378 for k, v in sorted(style.items()):
3379 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3380 continue
3381 if k == 'color':
3382 font += ' color="%s"' % v
3383 elif k == 'fontSize':
3384 font += ' size="%s"' % v
3385 elif k == 'fontFamily':
3386 font += ' face="%s"' % v
3387 elif k == 'fontWeight' and v == 'bold':
3388 self._out += '<b>'
3389 unclosed_elements.append('b')
3390 elif k == 'fontStyle' and v == 'italic':
3391 self._out += '<i>'
3392 unclosed_elements.append('i')
3393 elif k == 'textDecoration' and v == 'underline':
3394 self._out += '<u>'
3395 unclosed_elements.append('u')
3396 if font:
3397 self._out += '<font' + font + '>'
3398 unclosed_elements.append('font')
3399 applied_style = {}
3400 if self._applied_styles:
3401 applied_style.update(self._applied_styles[-1])
3402 applied_style.update(style)
3403 self._applied_styles.append(applied_style)
3404 self._unclosed_elements.append(unclosed_elements)
bf6427d2 3405
2b14cb56 3406 def end(self, tag):
5b995f71
RA
3407 if tag not in (_x('ttml:br'), 'br'):
3408 unclosed_elements = self._unclosed_elements.pop()
3409 for element in reversed(unclosed_elements):
3410 self._out += '</%s>' % element
3411 if unclosed_elements and self._applied_styles:
3412 self._applied_styles.pop()
bf6427d2 3413
2b14cb56 3414 def data(self, data):
5b995f71 3415 self._out += data
2b14cb56 3416
3417 def close(self):
5b995f71 3418 return self._out.strip()
2b14cb56 3419
6a765f13 3420 # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
3421 # This will not trigger false positives since only UTF-8 text is being replaced
3422 dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
3423
2b14cb56 3424 def parse_node(node):
3425 target = TTMLPElementParser()
3426 parser = xml.etree.ElementTree.XMLParser(target=target)
3427 parser.feed(xml.etree.ElementTree.tostring(node))
3428 return parser.close()
bf6427d2 3429
5b995f71
RA
3430 for k, v in LEGACY_NAMESPACES:
3431 for ns in v:
3432 dfxp_data = dfxp_data.replace(ns, k)
3433
3869028f 3434 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 3435 out = []
5b995f71 3436 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
3437
3438 if not paras:
3439 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 3440
5b995f71
RA
3441 repeat = False
3442 while True:
3443 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
3444 style_id = style.get('id') or style.get(_x('xml:id'))
3445 if not style_id:
3446 continue
5b995f71
RA
3447 parent_style_id = style.get('style')
3448 if parent_style_id:
3449 if parent_style_id not in styles:
3450 repeat = True
3451 continue
3452 styles[style_id] = styles[parent_style_id].copy()
3453 for prop in SUPPORTED_STYLING:
3454 prop_val = style.get(_x('tts:' + prop))
3455 if prop_val:
3456 styles.setdefault(style_id, {})[prop] = prop_val
3457 if repeat:
3458 repeat = False
3459 else:
3460 break
3461
3462 for p in ('body', 'div'):
3463 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3464 if ele is None:
3465 continue
3466 style = styles.get(ele.get('style'))
3467 if not style:
3468 continue
3469 default_style.update(style)
3470
bf6427d2 3471 for para, index in zip(paras, itertools.count(1)):
d631d5f9 3472 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 3473 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
3474 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3475 if begin_time is None:
3476 continue
7dff0363 3477 if not end_time:
d631d5f9
YCH
3478 if not dur:
3479 continue
3480 end_time = begin_time + dur
bf6427d2
YCH
3481 out.append('%d\n%s --> %s\n%s\n\n' % (
3482 index,
c1c924ab
YCH
3483 srt_subtitles_timecode(begin_time),
3484 srt_subtitles_timecode(end_time),
bf6427d2
YCH
3485 parse_node(para)))
3486
3487 return ''.join(out)
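# Example (a minimal TTML document; real inputs are full subtitle files):
#   >>> dfxp2srt(b'<tt xmlns="http://www.w3.org/ns/ttml"><body><div>'
#   ...          b'<p begin="1" end="2.5">Hello</p></div></body></tt>')
#   '1\n00:00:01,000 --> 00:00:02,500\nHello\n\n'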
3488
3489
c487cf00 3490def cli_option(params, command_option, param, separator=None):
66e289ba 3491 param = params.get(param)
c487cf00 3492 return ([] if param is None
3493 else [command_option, str(param)] if separator is None
3494 else [f'{command_option}{separator}{param}'])
66e289ba
S
3495
3496
3497def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3498 param = params.get(param)
c487cf00 3499 assert param in (True, False, None)
3500 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
66e289ba
S
3501
3502
3503def cli_valueless_option(params, command_option, param, expected_value=True):
c487cf00 3504 return [command_option] if params.get(param) == expected_value else []
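# Examples (sketches with hypothetical params dicts):
#   >>> cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy')
#   ['--proxy', '127.0.0.1:3128']
#   >>> cli_option({'retries': 10}, '--retries', 'retries', separator='=')
#   ['--retries=10']
#   >>> cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#   ['--no-check-certificate', 'true']
#   >>> cli_valueless_option({'skip_download': True}, '--no-download', 'skip_download')
#   ['--no-download']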
66e289ba
S
3505
3506
e92caff5 3507def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 3508 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 3509 if use_compat:
5b1ecbb3 3510 return argdict
3511 else:
3512 argdict = None
eab9b2bc 3513 if argdict is None:
5b1ecbb3 3514 return default
eab9b2bc 3515 assert isinstance(argdict, dict)
3516
e92caff5 3517 assert isinstance(keys, (list, tuple))
3518 for key_list in keys:
e92caff5 3519 arg_list = list(filter(
3520 lambda x: x is not None,
6606817a 3521 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 3522 if arg_list:
3523 return [arg for args in arg_list for arg in args]
3524 return default
66e289ba 3525
6251555f 3526
330690a2 3527def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3528 main_key, exe = main_key.lower(), exe.lower()
3529 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3530 keys = [f'{root_key}{k}' for k in (keys or [''])]
3531 if root_key in keys:
3532 if main_key != exe:
3533 keys.append((main_key, exe))
3534 keys.append('default')
3535 else:
3536 use_compat = False
3537 return cli_configuration_args(argdict, keys, default, use_compat)
3538
66e289ba 3539
86e5f3ed 3540class ISO639Utils:
39672624
YCH
3541 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3542 _lang_map = {
3543 'aa': 'aar',
3544 'ab': 'abk',
3545 'ae': 'ave',
3546 'af': 'afr',
3547 'ak': 'aka',
3548 'am': 'amh',
3549 'an': 'arg',
3550 'ar': 'ara',
3551 'as': 'asm',
3552 'av': 'ava',
3553 'ay': 'aym',
3554 'az': 'aze',
3555 'ba': 'bak',
3556 'be': 'bel',
3557 'bg': 'bul',
3558 'bh': 'bih',
3559 'bi': 'bis',
3560 'bm': 'bam',
3561 'bn': 'ben',
3562 'bo': 'bod',
3563 'br': 'bre',
3564 'bs': 'bos',
3565 'ca': 'cat',
3566 'ce': 'che',
3567 'ch': 'cha',
3568 'co': 'cos',
3569 'cr': 'cre',
3570 'cs': 'ces',
3571 'cu': 'chu',
3572 'cv': 'chv',
3573 'cy': 'cym',
3574 'da': 'dan',
3575 'de': 'deu',
3576 'dv': 'div',
3577 'dz': 'dzo',
3578 'ee': 'ewe',
3579 'el': 'ell',
3580 'en': 'eng',
3581 'eo': 'epo',
3582 'es': 'spa',
3583 'et': 'est',
3584 'eu': 'eus',
3585 'fa': 'fas',
3586 'ff': 'ful',
3587 'fi': 'fin',
3588 'fj': 'fij',
3589 'fo': 'fao',
3590 'fr': 'fra',
3591 'fy': 'fry',
3592 'ga': 'gle',
3593 'gd': 'gla',
3594 'gl': 'glg',
3595 'gn': 'grn',
3596 'gu': 'guj',
3597 'gv': 'glv',
3598 'ha': 'hau',
3599 'he': 'heb',
b7acc835 3600 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
3601 'hi': 'hin',
3602 'ho': 'hmo',
3603 'hr': 'hrv',
3604 'ht': 'hat',
3605 'hu': 'hun',
3606 'hy': 'hye',
3607 'hz': 'her',
3608 'ia': 'ina',
3609 'id': 'ind',
b7acc835 3610 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
3611 'ie': 'ile',
3612 'ig': 'ibo',
3613 'ii': 'iii',
3614 'ik': 'ipk',
3615 'io': 'ido',
3616 'is': 'isl',
3617 'it': 'ita',
3618 'iu': 'iku',
3619 'ja': 'jpn',
3620 'jv': 'jav',
3621 'ka': 'kat',
3622 'kg': 'kon',
3623 'ki': 'kik',
3624 'kj': 'kua',
3625 'kk': 'kaz',
3626 'kl': 'kal',
3627 'km': 'khm',
3628 'kn': 'kan',
3629 'ko': 'kor',
3630 'kr': 'kau',
3631 'ks': 'kas',
3632 'ku': 'kur',
3633 'kv': 'kom',
3634 'kw': 'cor',
3635 'ky': 'kir',
3636 'la': 'lat',
3637 'lb': 'ltz',
3638 'lg': 'lug',
3639 'li': 'lim',
3640 'ln': 'lin',
3641 'lo': 'lao',
3642 'lt': 'lit',
3643 'lu': 'lub',
3644 'lv': 'lav',
3645 'mg': 'mlg',
3646 'mh': 'mah',
3647 'mi': 'mri',
3648 'mk': 'mkd',
3649 'ml': 'mal',
3650 'mn': 'mon',
3651 'mr': 'mar',
3652 'ms': 'msa',
3653 'mt': 'mlt',
3654 'my': 'mya',
3655 'na': 'nau',
3656 'nb': 'nob',
3657 'nd': 'nde',
3658 'ne': 'nep',
3659 'ng': 'ndo',
3660 'nl': 'nld',
3661 'nn': 'nno',
3662 'no': 'nor',
3663 'nr': 'nbl',
3664 'nv': 'nav',
3665 'ny': 'nya',
3666 'oc': 'oci',
3667 'oj': 'oji',
3668 'om': 'orm',
3669 'or': 'ori',
3670 'os': 'oss',
3671 'pa': 'pan',
7bcd4813 3672 'pe': 'per',
39672624
YCH
3673 'pi': 'pli',
3674 'pl': 'pol',
3675 'ps': 'pus',
3676 'pt': 'por',
3677 'qu': 'que',
3678 'rm': 'roh',
3679 'rn': 'run',
3680 'ro': 'ron',
3681 'ru': 'rus',
3682 'rw': 'kin',
3683 'sa': 'san',
3684 'sc': 'srd',
3685 'sd': 'snd',
3686 'se': 'sme',
3687 'sg': 'sag',
3688 'si': 'sin',
3689 'sk': 'slk',
3690 'sl': 'slv',
3691 'sm': 'smo',
3692 'sn': 'sna',
3693 'so': 'som',
3694 'sq': 'sqi',
3695 'sr': 'srp',
3696 'ss': 'ssw',
3697 'st': 'sot',
3698 'su': 'sun',
3699 'sv': 'swe',
3700 'sw': 'swa',
3701 'ta': 'tam',
3702 'te': 'tel',
3703 'tg': 'tgk',
3704 'th': 'tha',
3705 'ti': 'tir',
3706 'tk': 'tuk',
3707 'tl': 'tgl',
3708 'tn': 'tsn',
3709 'to': 'ton',
3710 'tr': 'tur',
3711 'ts': 'tso',
3712 'tt': 'tat',
3713 'tw': 'twi',
3714 'ty': 'tah',
3715 'ug': 'uig',
3716 'uk': 'ukr',
3717 'ur': 'urd',
3718 'uz': 'uzb',
3719 've': 'ven',
3720 'vi': 'vie',
3721 'vo': 'vol',
3722 'wa': 'wln',
3723 'wo': 'wol',
3724 'xh': 'xho',
3725 'yi': 'yid',
e9a50fba 3726 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
3727 'yo': 'yor',
3728 'za': 'zha',
3729 'zh': 'zho',
3730 'zu': 'zul',
3731 }
3732
3733 @classmethod
3734 def short2long(cls, code):
3735 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3736 return cls._lang_map.get(code[:2])
3737
3738 @classmethod
3739 def long2short(cls, code):
3740 """Convert language code from ISO 639-2/T to ISO 639-1"""
3741 for short_name, long_name in cls._lang_map.items():
3742 if long_name == code:
3743 return short_name
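# Example: mapping between the two- and three-letter codes:
#   >>> ISO639Utils.short2long('fr')
#   'fra'
#   >>> ISO639Utils.long2short('deu')
#   'de'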
3744
3745
86e5f3ed 3746class ISO3166Utils:
4eb10f66
YCH
3747 # From http://data.okfn.org/data/core/country-list
3748 _country_map = {
3749 'AF': 'Afghanistan',
3750 'AX': 'Åland Islands',
3751 'AL': 'Albania',
3752 'DZ': 'Algeria',
3753 'AS': 'American Samoa',
3754 'AD': 'Andorra',
3755 'AO': 'Angola',
3756 'AI': 'Anguilla',
3757 'AQ': 'Antarctica',
3758 'AG': 'Antigua and Barbuda',
3759 'AR': 'Argentina',
3760 'AM': 'Armenia',
3761 'AW': 'Aruba',
3762 'AU': 'Australia',
3763 'AT': 'Austria',
3764 'AZ': 'Azerbaijan',
3765 'BS': 'Bahamas',
3766 'BH': 'Bahrain',
3767 'BD': 'Bangladesh',
3768 'BB': 'Barbados',
3769 'BY': 'Belarus',
3770 'BE': 'Belgium',
3771 'BZ': 'Belize',
3772 'BJ': 'Benin',
3773 'BM': 'Bermuda',
3774 'BT': 'Bhutan',
3775 'BO': 'Bolivia, Plurinational State of',
3776 'BQ': 'Bonaire, Sint Eustatius and Saba',
3777 'BA': 'Bosnia and Herzegovina',
3778 'BW': 'Botswana',
3779 'BV': 'Bouvet Island',
3780 'BR': 'Brazil',
3781 'IO': 'British Indian Ocean Territory',
3782 'BN': 'Brunei Darussalam',
3783 'BG': 'Bulgaria',
3784 'BF': 'Burkina Faso',
3785 'BI': 'Burundi',
3786 'KH': 'Cambodia',
3787 'CM': 'Cameroon',
3788 'CA': 'Canada',
3789 'CV': 'Cape Verde',
3790 'KY': 'Cayman Islands',
3791 'CF': 'Central African Republic',
3792 'TD': 'Chad',
3793 'CL': 'Chile',
3794 'CN': 'China',
3795 'CX': 'Christmas Island',
3796 'CC': 'Cocos (Keeling) Islands',
3797 'CO': 'Colombia',
3798 'KM': 'Comoros',
3799 'CG': 'Congo',
3800 'CD': 'Congo, the Democratic Republic of the',
3801 'CK': 'Cook Islands',
3802 'CR': 'Costa Rica',
3803 'CI': 'Côte d\'Ivoire',
3804 'HR': 'Croatia',
3805 'CU': 'Cuba',
3806 'CW': 'Curaçao',
3807 'CY': 'Cyprus',
3808 'CZ': 'Czech Republic',
3809 'DK': 'Denmark',
3810 'DJ': 'Djibouti',
3811 'DM': 'Dominica',
3812 'DO': 'Dominican Republic',
3813 'EC': 'Ecuador',
3814 'EG': 'Egypt',
3815 'SV': 'El Salvador',
3816 'GQ': 'Equatorial Guinea',
3817 'ER': 'Eritrea',
3818 'EE': 'Estonia',
3819 'ET': 'Ethiopia',
3820 'FK': 'Falkland Islands (Malvinas)',
3821 'FO': 'Faroe Islands',
3822 'FJ': 'Fiji',
3823 'FI': 'Finland',
3824 'FR': 'France',
3825 'GF': 'French Guiana',
3826 'PF': 'French Polynesia',
3827 'TF': 'French Southern Territories',
3828 'GA': 'Gabon',
3829 'GM': 'Gambia',
3830 'GE': 'Georgia',
3831 'DE': 'Germany',
3832 'GH': 'Ghana',
3833 'GI': 'Gibraltar',
3834 'GR': 'Greece',
3835 'GL': 'Greenland',
3836 'GD': 'Grenada',
3837 'GP': 'Guadeloupe',
3838 'GU': 'Guam',
3839 'GT': 'Guatemala',
3840 'GG': 'Guernsey',
3841 'GN': 'Guinea',
3842 'GW': 'Guinea-Bissau',
3843 'GY': 'Guyana',
3844 'HT': 'Haiti',
3845 'HM': 'Heard Island and McDonald Islands',
3846 'VA': 'Holy See (Vatican City State)',
3847 'HN': 'Honduras',
3848 'HK': 'Hong Kong',
3849 'HU': 'Hungary',
3850 'IS': 'Iceland',
3851 'IN': 'India',
3852 'ID': 'Indonesia',
3853 'IR': 'Iran, Islamic Republic of',
3854 'IQ': 'Iraq',
3855 'IE': 'Ireland',
3856 'IM': 'Isle of Man',
3857 'IL': 'Israel',
3858 'IT': 'Italy',
3859 'JM': 'Jamaica',
3860 'JP': 'Japan',
3861 'JE': 'Jersey',
3862 'JO': 'Jordan',
3863 'KZ': 'Kazakhstan',
3864 'KE': 'Kenya',
3865 'KI': 'Kiribati',
3866 'KP': 'Korea, Democratic People\'s Republic of',
3867 'KR': 'Korea, Republic of',
3868 'KW': 'Kuwait',
3869 'KG': 'Kyrgyzstan',
3870 'LA': 'Lao People\'s Democratic Republic',
3871 'LV': 'Latvia',
3872 'LB': 'Lebanon',
3873 'LS': 'Lesotho',
3874 'LR': 'Liberia',
3875 'LY': 'Libya',
3876 'LI': 'Liechtenstein',
3877 'LT': 'Lithuania',
3878 'LU': 'Luxembourg',
3879 'MO': 'Macao',
3880 'MK': 'Macedonia, the Former Yugoslav Republic of',
3881 'MG': 'Madagascar',
3882 'MW': 'Malawi',
3883 'MY': 'Malaysia',
3884 'MV': 'Maldives',
3885 'ML': 'Mali',
3886 'MT': 'Malta',
3887 'MH': 'Marshall Islands',
3888 'MQ': 'Martinique',
3889 'MR': 'Mauritania',
3890 'MU': 'Mauritius',
3891 'YT': 'Mayotte',
3892 'MX': 'Mexico',
3893 'FM': 'Micronesia, Federated States of',
3894 'MD': 'Moldova, Republic of',
3895 'MC': 'Monaco',
3896 'MN': 'Mongolia',
3897 'ME': 'Montenegro',
3898 'MS': 'Montserrat',
3899 'MA': 'Morocco',
3900 'MZ': 'Mozambique',
3901 'MM': 'Myanmar',
3902 'NA': 'Namibia',
3903 'NR': 'Nauru',
3904 'NP': 'Nepal',
3905 'NL': 'Netherlands',
3906 'NC': 'New Caledonia',
3907 'NZ': 'New Zealand',
3908 'NI': 'Nicaragua',
3909 'NE': 'Niger',
3910 'NG': 'Nigeria',
3911 'NU': 'Niue',
3912 'NF': 'Norfolk Island',
3913 'MP': 'Northern Mariana Islands',
3914 'NO': 'Norway',
3915 'OM': 'Oman',
3916 'PK': 'Pakistan',
3917 'PW': 'Palau',
3918 'PS': 'Palestine, State of',
3919 'PA': 'Panama',
3920 'PG': 'Papua New Guinea',
3921 'PY': 'Paraguay',
3922 'PE': 'Peru',
3923 'PH': 'Philippines',
3924 'PN': 'Pitcairn',
3925 'PL': 'Poland',
3926 'PT': 'Portugal',
3927 'PR': 'Puerto Rico',
3928 'QA': 'Qatar',
3929 'RE': 'Réunion',
3930 'RO': 'Romania',
3931 'RU': 'Russian Federation',
3932 'RW': 'Rwanda',
3933 'BL': 'Saint Barthélemy',
3934 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3935 'KN': 'Saint Kitts and Nevis',
3936 'LC': 'Saint Lucia',
3937 'MF': 'Saint Martin (French part)',
3938 'PM': 'Saint Pierre and Miquelon',
3939 'VC': 'Saint Vincent and the Grenadines',
3940 'WS': 'Samoa',
3941 'SM': 'San Marino',
3942 'ST': 'Sao Tome and Principe',
3943 'SA': 'Saudi Arabia',
3944 'SN': 'Senegal',
3945 'RS': 'Serbia',
3946 'SC': 'Seychelles',
3947 'SL': 'Sierra Leone',
3948 'SG': 'Singapore',
3949 'SX': 'Sint Maarten (Dutch part)',
3950 'SK': 'Slovakia',
3951 'SI': 'Slovenia',
3952 'SB': 'Solomon Islands',
3953 'SO': 'Somalia',
3954 'ZA': 'South Africa',
3955 'GS': 'South Georgia and the South Sandwich Islands',
3956 'SS': 'South Sudan',
3957 'ES': 'Spain',
3958 'LK': 'Sri Lanka',
3959 'SD': 'Sudan',
3960 'SR': 'Suriname',
3961 'SJ': 'Svalbard and Jan Mayen',
3962 'SZ': 'Swaziland',
3963 'SE': 'Sweden',
3964 'CH': 'Switzerland',
3965 'SY': 'Syrian Arab Republic',
3966 'TW': 'Taiwan, Province of China',
3967 'TJ': 'Tajikistan',
3968 'TZ': 'Tanzania, United Republic of',
3969 'TH': 'Thailand',
3970 'TL': 'Timor-Leste',
3971 'TG': 'Togo',
3972 'TK': 'Tokelau',
3973 'TO': 'Tonga',
3974 'TT': 'Trinidad and Tobago',
3975 'TN': 'Tunisia',
3976 'TR': 'Turkey',
3977 'TM': 'Turkmenistan',
3978 'TC': 'Turks and Caicos Islands',
3979 'TV': 'Tuvalu',
3980 'UG': 'Uganda',
3981 'UA': 'Ukraine',
3982 'AE': 'United Arab Emirates',
3983 'GB': 'United Kingdom',
3984 'US': 'United States',
3985 'UM': 'United States Minor Outlying Islands',
3986 'UY': 'Uruguay',
3987 'UZ': 'Uzbekistan',
3988 'VU': 'Vanuatu',
3989 'VE': 'Venezuela, Bolivarian Republic of',
3990 'VN': 'Viet Nam',
3991 'VG': 'Virgin Islands, British',
3992 'VI': 'Virgin Islands, U.S.',
3993 'WF': 'Wallis and Futuna',
3994 'EH': 'Western Sahara',
3995 'YE': 'Yemen',
3996 'ZM': 'Zambia',
3997 'ZW': 'Zimbabwe',
2f97cc61 3998 # Not ISO 3166 codes, but used for IP blocks
3999 'AP': 'Asia/Pacific Region',
4000 'EU': 'Europe',
4eb10f66
YCH
4001 }
4002
4003 @classmethod
4004 def short2full(cls, code):
4005 """Convert an ISO 3166-2 country code to the corresponding full name"""
4006 return cls._country_map.get(code.upper())
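# Example:
#   >>> ISO3166Utils.short2full('nl')
#   'Netherlands'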
4007
4008
86e5f3ed 4009class GeoUtils:
773f291d
S
4010 # Major IPv4 address blocks per country
4011 _country_ip_map = {
53896ca5 4012 'AD': '46.172.224.0/19',
773f291d
S
4013 'AE': '94.200.0.0/13',
4014 'AF': '149.54.0.0/17',
4015 'AG': '209.59.64.0/18',
4016 'AI': '204.14.248.0/21',
4017 'AL': '46.99.0.0/16',
4018 'AM': '46.70.0.0/15',
4019 'AO': '105.168.0.0/13',
53896ca5
S
4020 'AP': '182.50.184.0/21',
4021 'AQ': '23.154.160.0/24',
773f291d
S
4022 'AR': '181.0.0.0/12',
4023 'AS': '202.70.112.0/20',
53896ca5 4024 'AT': '77.116.0.0/14',
773f291d
S
4025 'AU': '1.128.0.0/11',
4026 'AW': '181.41.0.0/18',
53896ca5
S
4027 'AX': '185.217.4.0/22',
4028 'AZ': '5.197.0.0/16',
773f291d
S
4029 'BA': '31.176.128.0/17',
4030 'BB': '65.48.128.0/17',
4031 'BD': '114.130.0.0/16',
4032 'BE': '57.0.0.0/8',
53896ca5 4033 'BF': '102.178.0.0/15',
773f291d
S
4034 'BG': '95.42.0.0/15',
4035 'BH': '37.131.0.0/17',
4036 'BI': '154.117.192.0/18',
4037 'BJ': '137.255.0.0/16',
53896ca5 4038 'BL': '185.212.72.0/23',
773f291d
S
4039 'BM': '196.12.64.0/18',
4040 'BN': '156.31.0.0/16',
4041 'BO': '161.56.0.0/16',
4042 'BQ': '161.0.80.0/20',
53896ca5 4043 'BR': '191.128.0.0/12',
773f291d
S
4044 'BS': '24.51.64.0/18',
4045 'BT': '119.2.96.0/19',
4046 'BW': '168.167.0.0/16',
4047 'BY': '178.120.0.0/13',
4048 'BZ': '179.42.192.0/18',
4049 'CA': '99.224.0.0/11',
4050 'CD': '41.243.0.0/16',
53896ca5
S
4051 'CF': '197.242.176.0/21',
4052 'CG': '160.113.0.0/16',
773f291d 4053 'CH': '85.0.0.0/13',
53896ca5 4054 'CI': '102.136.0.0/14',
773f291d
S
4055 'CK': '202.65.32.0/19',
4056 'CL': '152.172.0.0/14',
53896ca5 4057 'CM': '102.244.0.0/14',
773f291d
S
4058 'CN': '36.128.0.0/10',
4059 'CO': '181.240.0.0/12',
4060 'CR': '201.192.0.0/12',
4061 'CU': '152.206.0.0/15',
4062 'CV': '165.90.96.0/19',
4063 'CW': '190.88.128.0/17',
53896ca5 4064 'CY': '31.153.0.0/16',
773f291d
S
4065 'CZ': '88.100.0.0/14',
4066 'DE': '53.0.0.0/8',
4067 'DJ': '197.241.0.0/17',
4068 'DK': '87.48.0.0/12',
4069 'DM': '192.243.48.0/20',
4070 'DO': '152.166.0.0/15',
4071 'DZ': '41.96.0.0/12',
4072 'EC': '186.68.0.0/15',
4073 'EE': '90.190.0.0/15',
4074 'EG': '156.160.0.0/11',
4075 'ER': '196.200.96.0/20',
4076 'ES': '88.0.0.0/11',
4077 'ET': '196.188.0.0/14',
4078 'EU': '2.16.0.0/13',
4079 'FI': '91.152.0.0/13',
4080 'FJ': '144.120.0.0/16',
53896ca5 4081 'FK': '80.73.208.0/21',
773f291d
S
4082 'FM': '119.252.112.0/20',
4083 'FO': '88.85.32.0/19',
4084 'FR': '90.0.0.0/9',
4085 'GA': '41.158.0.0/15',
4086 'GB': '25.0.0.0/8',
4087 'GD': '74.122.88.0/21',
4088 'GE': '31.146.0.0/16',
4089 'GF': '161.22.64.0/18',
4090 'GG': '62.68.160.0/19',
53896ca5
S
4091 'GH': '154.160.0.0/12',
4092 'GI': '95.164.0.0/16',
773f291d
S
4093 'GL': '88.83.0.0/19',
4094 'GM': '160.182.0.0/15',
4095 'GN': '197.149.192.0/18',
4096 'GP': '104.250.0.0/19',
4097 'GQ': '105.235.224.0/20',
4098 'GR': '94.64.0.0/13',
4099 'GT': '168.234.0.0/16',
4100 'GU': '168.123.0.0/16',
4101 'GW': '197.214.80.0/20',
4102 'GY': '181.41.64.0/18',
4103 'HK': '113.252.0.0/14',
4104 'HN': '181.210.0.0/16',
4105 'HR': '93.136.0.0/13',
4106 'HT': '148.102.128.0/17',
4107 'HU': '84.0.0.0/14',
4108 'ID': '39.192.0.0/10',
4109 'IE': '87.32.0.0/12',
4110 'IL': '79.176.0.0/13',
4111 'IM': '5.62.80.0/20',
4112 'IN': '117.192.0.0/10',
4113 'IO': '203.83.48.0/21',
4114 'IQ': '37.236.0.0/14',
4115 'IR': '2.176.0.0/12',
4116 'IS': '82.221.0.0/16',
4117 'IT': '79.0.0.0/10',
4118 'JE': '87.244.64.0/18',
4119 'JM': '72.27.0.0/17',
4120 'JO': '176.29.0.0/16',
53896ca5 4121 'JP': '133.0.0.0/8',
773f291d
S
4122 'KE': '105.48.0.0/12',
4123 'KG': '158.181.128.0/17',
4124 'KH': '36.37.128.0/17',
4125 'KI': '103.25.140.0/22',
4126 'KM': '197.255.224.0/20',
53896ca5 4127 'KN': '198.167.192.0/19',
773f291d
S
4128 'KP': '175.45.176.0/22',
4129 'KR': '175.192.0.0/10',
4130 'KW': '37.36.0.0/14',
4131 'KY': '64.96.0.0/15',
4132 'KZ': '2.72.0.0/13',
4133 'LA': '115.84.64.0/18',
4134 'LB': '178.135.0.0/16',
53896ca5 4135 'LC': '24.92.144.0/20',
773f291d
S
4136 'LI': '82.117.0.0/19',
4137 'LK': '112.134.0.0/15',
53896ca5 4138 'LR': '102.183.0.0/16',
773f291d
S
4139 'LS': '129.232.0.0/17',
4140 'LT': '78.56.0.0/13',
4141 'LU': '188.42.0.0/16',
4142 'LV': '46.109.0.0/16',
4143 'LY': '41.252.0.0/14',
4144 'MA': '105.128.0.0/11',
4145 'MC': '88.209.64.0/18',
4146 'MD': '37.246.0.0/16',
4147 'ME': '178.175.0.0/17',
4148 'MF': '74.112.232.0/21',
4149 'MG': '154.126.0.0/17',
4150 'MH': '117.103.88.0/21',
4151 'MK': '77.28.0.0/15',
4152 'ML': '154.118.128.0/18',
4153 'MM': '37.111.0.0/17',
4154 'MN': '49.0.128.0/17',
4155 'MO': '60.246.0.0/16',
4156 'MP': '202.88.64.0/20',
4157 'MQ': '109.203.224.0/19',
4158 'MR': '41.188.64.0/18',
4159 'MS': '208.90.112.0/22',
4160 'MT': '46.11.0.0/16',
4161 'MU': '105.16.0.0/12',
4162 'MV': '27.114.128.0/18',
53896ca5 4163 'MW': '102.70.0.0/15',
773f291d
S
4164 'MX': '187.192.0.0/11',
4165 'MY': '175.136.0.0/13',
4166 'MZ': '197.218.0.0/15',
4167 'NA': '41.182.0.0/16',
4168 'NC': '101.101.0.0/18',
4169 'NE': '197.214.0.0/18',
4170 'NF': '203.17.240.0/22',
4171 'NG': '105.112.0.0/12',
4172 'NI': '186.76.0.0/15',
4173 'NL': '145.96.0.0/11',
4174 'NO': '84.208.0.0/13',
4175 'NP': '36.252.0.0/15',
4176 'NR': '203.98.224.0/19',
4177 'NU': '49.156.48.0/22',
4178 'NZ': '49.224.0.0/14',
4179 'OM': '5.36.0.0/15',
4180 'PA': '186.72.0.0/15',
4181 'PE': '186.160.0.0/14',
4182 'PF': '123.50.64.0/18',
4183 'PG': '124.240.192.0/19',
4184 'PH': '49.144.0.0/13',
4185 'PK': '39.32.0.0/11',
4186 'PL': '83.0.0.0/11',
4187 'PM': '70.36.0.0/20',
4188 'PR': '66.50.0.0/16',
4189 'PS': '188.161.0.0/16',
4190 'PT': '85.240.0.0/13',
4191 'PW': '202.124.224.0/20',
4192 'PY': '181.120.0.0/14',
4193 'QA': '37.210.0.0/15',
53896ca5 4194 'RE': '102.35.0.0/16',
773f291d 4195 'RO': '79.112.0.0/13',
53896ca5 4196 'RS': '93.86.0.0/15',
773f291d 4197 'RU': '5.136.0.0/13',
53896ca5 4198 'RW': '41.186.0.0/16',
773f291d
S
4199 'SA': '188.48.0.0/13',
4200 'SB': '202.1.160.0/19',
4201 'SC': '154.192.0.0/11',
53896ca5 4202 'SD': '102.120.0.0/13',
773f291d 4203 'SE': '78.64.0.0/12',
53896ca5 4204 'SG': '8.128.0.0/10',
773f291d
S
4205 'SI': '188.196.0.0/14',
4206 'SK': '78.98.0.0/15',
53896ca5 4207 'SL': '102.143.0.0/17',
773f291d
S
4208 'SM': '89.186.32.0/19',
4209 'SN': '41.82.0.0/15',
53896ca5 4210 'SO': '154.115.192.0/18',
773f291d
S
4211 'SR': '186.179.128.0/17',
4212 'SS': '105.235.208.0/21',
4213 'ST': '197.159.160.0/19',
4214 'SV': '168.243.0.0/16',
4215 'SX': '190.102.0.0/20',
4216 'SY': '5.0.0.0/16',
4217 'SZ': '41.84.224.0/19',
4218 'TC': '65.255.48.0/20',
4219 'TD': '154.68.128.0/19',
4220 'TG': '196.168.0.0/14',
4221 'TH': '171.96.0.0/13',
4222 'TJ': '85.9.128.0/18',
4223 'TK': '27.96.24.0/21',
4224 'TL': '180.189.160.0/20',
4225 'TM': '95.85.96.0/19',
4226 'TN': '197.0.0.0/11',
4227 'TO': '175.176.144.0/21',
4228 'TR': '78.160.0.0/11',
4229 'TT': '186.44.0.0/15',
4230 'TV': '202.2.96.0/19',
4231 'TW': '120.96.0.0/11',
4232 'TZ': '156.156.0.0/14',
53896ca5
S
4233 'UA': '37.52.0.0/14',
4234 'UG': '102.80.0.0/13',
4235 'US': '6.0.0.0/8',
773f291d 4236 'UY': '167.56.0.0/13',
53896ca5 4237 'UZ': '84.54.64.0/18',
773f291d 4238 'VA': '212.77.0.0/19',
53896ca5 4239 'VC': '207.191.240.0/21',
773f291d 4240 'VE': '186.88.0.0/13',
53896ca5 4241 'VG': '66.81.192.0/20',
773f291d
S
4242 'VI': '146.226.0.0/16',
4243 'VN': '14.160.0.0/11',
4244 'VU': '202.80.32.0/20',
4245 'WF': '117.20.32.0/21',
4246 'WS': '202.4.32.0/19',
4247 'YE': '134.35.0.0/16',
4248 'YT': '41.242.116.0/22',
4249 'ZA': '41.0.0.0/11',
53896ca5
S
4250 'ZM': '102.144.0.0/13',
4251 'ZW': '102.177.192.0/18',
773f291d
S
4252 }
4253
4254 @classmethod
5f95927a
S
4255 def random_ipv4(cls, code_or_block):
4256 if len(code_or_block) == 2:
4257 block = cls._country_ip_map.get(code_or_block.upper())
4258 if not block:
4259 return None
4260 else:
4261 block = code_or_block
773f291d 4262 addr, preflen = block.split('/')
ac668111 4263 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4264 addr_max = addr_min | (0xffffffff >> int(preflen))
14f25df2 4265 return str(socket.inet_ntoa(
ac668111 4266 struct.pack('!L', random.randint(addr_min, addr_max))))
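# Example (the result is random, so exact output is not reproducible):
#   GeoUtils.random_ipv4('HK') returns some address inside 113.252.0.0/14, while
#   GeoUtils.random_ipv4('203.0.113.0/24') returns one inside that explicit block.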
773f291d
S
4267
4268
0a5445dd
YCH
4269# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4270# released into Public Domain
4271# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4272
4273def long_to_bytes(n, blocksize=0):
4274 """long_to_bytes(n:long, blocksize:int) : string
4275 Convert a long integer to a byte string.
4276
4277 If optional blocksize is given and greater than zero, pad the front of the
4278 byte string with binary zeros so that the length is a multiple of
4279 blocksize.
4280 """
4281 # after much testing, this algorithm was deemed to be the fastest
4282 s = b''
4283 n = int(n)
4284 while n > 0:
ac668111 4285 s = struct.pack('>I', n & 0xffffffff) + s
0a5445dd
YCH
4286 n = n >> 32
4287 # strip off leading zeros
4288 for i in range(len(s)):
4289 if s[i] != b'\000'[0]:
4290 break
4291 else:
4292 # only happens when n == 0
4293 s = b'\000'
4294 i = 0
4295 s = s[i:]
4296 # add back some pad bytes. this could be done more efficiently w.r.t. the
4297 # de-padding being done above, but sigh...
4298 if blocksize > 0 and len(s) % blocksize:
4299 s = (blocksize - len(s) % blocksize) * b'\000' + s
4300 return s
4301
4302
4303def bytes_to_long(s):
4304 """bytes_to_long(string) : long
4305 Convert a byte string to a long integer.
4306
4307 This is (essentially) the inverse of long_to_bytes().
4308 """
4309 acc = 0
4310 length = len(s)
4311 if length % 4:
4312 extra = (4 - length % 4)
4313 s = b'\000' * extra + s
4314 length = length + extra
4315 for i in range(0, length, 4):
ac668111 4316 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
0a5445dd
YCH
4317 return acc
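# Example: big-endian round trip:
#   >>> long_to_bytes(65537)
#   b'\x01\x00\x01'
#   >>> bytes_to_long(b'\x01\x00\x01')
#   65537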
4318
4319
5bc880b9
YCH
4320def ohdave_rsa_encrypt(data, exponent, modulus):
4321 '''
4322 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4323
4324 Input:
4325 data: data to encrypt, bytes-like object
4326 exponent, modulus: parameter e and N of RSA algorithm, both integer
4327 Output: hex string of encrypted data
4328
4329 Limitation: supports one block encryption only
4330 '''
4331
4332 payload = int(binascii.hexlify(data[::-1]), 16)
4333 encrypted = pow(payload, exponent, modulus)
4334 return '%x' % encrypted
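# Example (toy parameters purely for illustration; real callers pass the site-provided e and N):
#   >>> ohdave_rsa_encrypt(b'\x02', 3, 101)
#   '8'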
81bdc8fd
YCH
4335
4336
f48409c7
YCH
4337def pkcs1pad(data, length):
4338 """
4339 Padding input data with PKCS#1 scheme
4340
4341 @param {int[]} data input data
4342 @param {int} length target length
4343 @returns {int[]} padded data
4344 """
4345 if len(data) > length - 11:
4346 raise ValueError('Input data too long for PKCS#1 padding')
4347
4348 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4349 return [0, 2] + pseudo_random + [0] + data
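# Example (sketch; the padding bytes are random, so only the layout is shown):
#   pkcs1pad([0x01, 0x02], 16) -> [0, 2, <11 random bytes>, 0, 0x01, 0x02]  (16 ints total)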
4350
4351
7b2c3f47 4352def _base_n_table(n, table):
4353 if not table and not n:
4354 raise ValueError('Either table or n must be specified')
612f2be5 4355 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4356
44f14eb4 4357 if n and n != len(table):
612f2be5 4358 raise ValueError(f'base {n} exceeds table length {len(table)}')
4359 return table
59f898b7 4360
5eb6bdce 4361
7b2c3f47 4362def encode_base_n(num, n=None, table=None):
4363 """Convert given int to a base-n string"""
612f2be5 4364 table = _base_n_table(n, table)
7b2c3f47 4365 if not num:
5eb6bdce
YCH
4366 return table[0]
4367
7b2c3f47 4368 result, base = '', len(table)
81bdc8fd 4369 while num:
7b2c3f47 4370 result = table[num % base] + result
612f2be5 4371 num = num // base
7b2c3f47 4372 return result
4373
4374
4375def decode_base_n(string, n=None, table=None):
4376 """Convert given base-n string to int"""
4377 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4378 result, base = 0, len(table)
4379 for char in string:
4380 result = result * base + table[char]
4381 return result
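# Example: round-tripping through hex, plus an explicit binary table:
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> decode_base_n('ff', 16)
#   255
#   >>> encode_base_n(5, table='01')
#   '101'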
4382
4383
f52354a8 4384def decode_packed_codes(code):
06b3fe29 4385 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 4386 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
4387 base = int(base)
4388 count = int(count)
4389 symbols = symbols.split('|')
4390 symbol_table = {}
4391
4392 while count:
4393 count -= 1
5eb6bdce 4394 base_n_count = encode_base_n(count, base)
f52354a8
YCH
4395 symbol_table[base_n_count] = symbols[count] or base_n_count
4396
4397 return re.sub(
4398 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 4399 obfuscated_code)
e154c651 4400
4401
1ced2221
S
4402def caesar(s, alphabet, shift):
4403 if shift == 0:
4404 return s
4405 l = len(alphabet)
4406 return ''.join(
4407 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4408 for c in s)
4409
4410
4411def rot47(s):
4412 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
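# Example: shift within an explicit alphabet; rot47 covers the printable ASCII range:
#   >>> caesar('ace', 'abcdef', 2)
#   'cea'
#   >>> rot47('Hello')
#   'w6==@'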
4413
4414
e154c651 4415def parse_m3u8_attributes(attrib):
4416 info = {}
4417 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4418 if val.startswith('"'):
4419 val = val[1:-1]
4420 info[key] = val
4421 return info
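# Example: quoted values may contain commas and are returned without the quotes:
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.64001f"')
#   {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.64001f'}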
1143535d
YCH
4422
4423
4424def urshift(val, n):
4425 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
4426
4427
efa97bdc 4428def write_xattr(path, key, value):
6f7563be 4429 # Windows: Write xattrs to NTFS Alternate Data Streams:
4430 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4431 if compat_os_name == 'nt':
4432 assert ':' not in key
4433 assert os.path.exists(path)
efa97bdc
YCH
4434
4435 try:
6f7563be 4436 with open(f'{path}:{key}', 'wb') as f:
4437 f.write(value)
86e5f3ed 4438 except OSError as e:
efa97bdc 4439 raise XAttrMetadataError(e.errno, e.strerror)
6f7563be 4440 return
efa97bdc 4441
84e26038 4442 # UNIX Method 1. Use os.setxattr/xattrs/pyxattrs modules
efa97bdc 4443
6f7563be 4444 setxattr = None
84e26038 4445 if callable(getattr(os, 'setxattr', None)):
4446 setxattr = os.setxattr
4447 elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
6f7563be 4448 # Unicode arguments are not supported in pyxattr until version 0.5.0
4449 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4450 if version_tuple(xattr.__version__) >= (0, 5, 0):
4451 setxattr = xattr.set
4452 elif xattr:
4453 setxattr = xattr.setxattr
efa97bdc 4454
6f7563be 4455 if setxattr:
4456 try:
4457 setxattr(path, key, value)
4458 except OSError as e:
4459 raise XAttrMetadataError(e.errno, e.strerror)
4460 return
efa97bdc 4461
6f7563be 4462 # UNIX Method 2. Use setfattr/xattr executables
4463 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4464 else 'xattr' if check_executable('xattr', ['-h']) else None)
4465 if not exe:
4466 raise XAttrUnavailableError(
4467 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4468 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
efa97bdc 4469
0f06bcd7 4470 value = value.decode()
6f7563be 4471 try:
f0c9fb96 4472 _, stderr, returncode = Popen.run(
6f7563be 4473 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
e121e3ce 4474 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
6f7563be 4475 except OSError as e:
4476 raise XAttrMetadataError(e.errno, e.strerror)
f0c9fb96 4477 if returncode:
4478 raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
4479
4480
4481def random_birthday(year_field, month_field, day_field):
aa374bc7
AS
4482 start_date = datetime.date(1950, 1, 1)
4483 end_date = datetime.date(1995, 12, 31)
4484 offset = random.randint(0, (end_date - start_date).days)
4485 random_date = start_date + datetime.timedelta(offset)
0c265486 4486 return {
aa374bc7
AS
4487 year_field: str(random_date.year),
4488 month_field: str(random_date.month),
4489 day_field: str(random_date.day),
0c265486 4490 }
732044af 4491
c76eb41b 4492
8c53322c
L
4493def find_available_port(interface=''):
4494 try:
4495 with socket.socket() as sock:
4496 sock.bind((interface, 0))
4497 return sock.getsockname()[1]
4498 except OSError:
4499 return None
4500
4501
732044af 4502# Templates for internet shortcut files, which are plain text files.
e5a998f3 4503DOT_URL_LINK_TEMPLATE = '''\
732044af 4504[InternetShortcut]
4505URL=%(url)s
e5a998f3 4506'''
732044af 4507
e5a998f3 4508DOT_WEBLOC_LINK_TEMPLATE = '''\
732044af 4509<?xml version="1.0" encoding="UTF-8"?>
4510<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4511<plist version="1.0">
4512<dict>
4513\t<key>URL</key>
4514\t<string>%(url)s</string>
4515</dict>
4516</plist>
e5a998f3 4517'''
732044af 4518
e5a998f3 4519DOT_DESKTOP_LINK_TEMPLATE = '''\
732044af 4520[Desktop Entry]
4521Encoding=UTF-8
4522Name=%(filename)s
4523Type=Link
4524URL=%(url)s
4525Icon=text-html
e5a998f3 4526'''
732044af 4527
08438d2c 4528LINK_TEMPLATES = {
4529 'url': DOT_URL_LINK_TEMPLATE,
4530 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4531 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4532}
4533
732044af 4534
4535def iri_to_uri(iri):
4536 """
4537 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4538
4539 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4540 """
4541
14f25df2 4542 iri_parts = urllib.parse.urlparse(iri)
732044af 4543
4544 if '[' in iri_parts.netloc:
4545 raise ValueError('IPv6 URIs are not yet supported.')
4546 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4547
4548 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4549
4550 net_location = ''
4551 if iri_parts.username:
f9934b96 4552 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
732044af 4553 if iri_parts.password is not None:
f9934b96 4554 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
732044af 4555 net_location += '@'
4556
0f06bcd7 4557 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
732044af 4558 # The 'idna' encoding produces ASCII text.
4559 if iri_parts.port is not None and iri_parts.port != 80:
4560 net_location += ':' + str(iri_parts.port)
4561
f9934b96 4562 return urllib.parse.urlunparse(
732044af 4563 (iri_parts.scheme,
4564 net_location,
4565
f9934b96 4566 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
732044af 4567
4568 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
f9934b96 4569 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
732044af 4570
4571 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
f9934b96 4572 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
732044af 4573
f9934b96 4574 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
732044af 4575
4576 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4577
4578
4579def to_high_limit_path(path):
4580 if sys.platform in ['win32', 'cygwin']:
4581 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
e5a998f3 4582 return '\\\\?\\' + os.path.abspath(path)
732044af 4583
4584 return path
76d321f6 4585
c76eb41b 4586
7b2c3f47 4587def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
69bec673 4588 val = traversal.traverse_obj(obj, *variadic(field))
6f2287cb 4589 if not val if ignore is NO_DEFAULT else val in variadic(ignore):
e0ddbd02 4590 return default
7b2c3f47 4591 return template % func(val)
00dd0cd5 4592
4593
4594def clean_podcast_url(url):
91302ed3 4595 url = re.sub(r'''(?x)
00dd0cd5 4596 (?:
4597 (?:
4598 chtbl\.com/track|
4599 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
2af4eeb7
MAF
4600 play\.podtrac\.com|
4601 chrt\.fm/track|
4602 mgln\.ai/e
4603 )(?:/[^/.]+)?|
00dd0cd5 4604 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4605 flex\.acast\.com|
4606 pd(?:
4607 cn\.co| # https://podcorn.com/analytics-prefix/
4608 st\.fm # https://podsights.com/docs/
2af4eeb7
MAF
4609 )/e|
4610 [0-9]\.gum\.fm|
4611 pscrb\.fm/rss/p
00dd0cd5 4612 )/''', '', url)
91302ed3 4613 return re.sub(r'^\w+://(\w+://)', r'\1', url)
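# Example (a URL wrapped in several tracking redirectors):
#   >>> clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/pdst.fm/e/'
#   ...                   'chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3')
#   'https://traffic.megaphone.fm/HSW7835899191.mp3'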
ffcb8191
THD
4614
4615
4616_HEX_TABLE = '0123456789abcdef'
4617
4618
4619def random_uuidv4():
4620 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 4621
4622
4623def make_dir(path, to_screen=None):
4624 try:
4625 dn = os.path.dirname(path)
b25d6cb9
AI
4626 if dn:
4627 os.makedirs(dn, exist_ok=True)
0202b52a 4628 return True
86e5f3ed 4629 except OSError as err:
0202b52a 4630 if callable(to_screen):
69bec673 4631 to_screen(f'unable to create directory {err}')
0202b52a 4632 return False
f74980cb 4633
4634
4635def get_executable_path():
69bec673 4636 from ..update import _get_variant_and_executable_path
c487cf00 4637
b5899f4f 4638 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
f74980cb 4639
4640
8e40b9d1 4641def get_user_config_dirs(package_name):
8e40b9d1
M
4642 # .config (e.g. ~/.config/package_name)
4643 xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
773c272d 4644 yield os.path.join(xdg_config_home, package_name)
8e40b9d1
M
4645
4646 # appdata (%APPDATA%/package_name)
4647 appdata_dir = os.getenv('appdata')
4648 if appdata_dir:
773c272d 4649 yield os.path.join(appdata_dir, package_name)
8e40b9d1
M
4650
4651 # home (~/.package_name)
773c272d 4652 yield os.path.join(compat_expanduser('~'), f'.{package_name}')
8e40b9d1
M
4653
4654
4655def get_system_config_dirs(package_name):
8e40b9d1 4656 # /etc/package_name
773c272d 4657 yield os.path.join('/etc', package_name)
06167fbb 4658
4659
3e9b66d7 4660def time_seconds(**kwargs):
83c4970e
L
4661 """
4662 Returns the current time in seconds since the epoch (1970-01-01T00:00:00Z), offset by the given timedelta kwargs (e.g. hours=9 for UTC+9)
4663 """
4664 return time.time() + datetime.timedelta(**kwargs).total_seconds()
3e9b66d7
LNO
4665
4666
49fa4d9a
N
4667# create a JSON Web Signature (jws) with HS256 algorithm
4668# the resulting format is in JWS Compact Serialization
4669# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
4670# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
4671def jwt_encode_hs256(payload_data, key, headers={}):
4672 header_data = {
4673 'alg': 'HS256',
4674 'typ': 'JWT',
4675 }
4676 if headers:
4677 header_data.update(headers)
0f06bcd7 4678 header_b64 = base64.b64encode(json.dumps(header_data).encode())
4679 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
4680 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
49fa4d9a
N
4681 signature_b64 = base64.b64encode(h.digest())
4682 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
4683 return token
819e0531 4684
4685
16b0d7e6 4686# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
4687def jwt_decode_hs256(jwt):
4688 header_b64, payload_b64, signature_b64 = jwt.split('.')
2c98d998 4689 # add trailing ='s that may have been stripped, superfluous ='s are ignored
4690 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
16b0d7e6 4691 return payload_data
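# Example (round-trip with a made-up payload and key):
#   >>> token = jwt_encode_hs256({'user': 123}, 'secret-key')
#   >>> jwt_decode_hs256(token.decode())
#   {'user': 123}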
4692
4693
53973b4d 4694WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
4695
4696
7a32c70d 4697@functools.cache
819e0531 4698def supports_terminal_sequences(stream):
4699 if compat_os_name == 'nt':
8a82af35 4700 if not WINDOWS_VT_MODE:
819e0531 4701 return False
4702 elif not os.getenv('TERM'):
4703 return False
4704 try:
4705 return stream.isatty()
4706 except BaseException:
4707 return False
4708
4709
c53a18f0 4710def windows_enable_vt_mode():
4711 """Ref: https://bugs.python.org/issue30075 """
8a82af35 4712 if get_windows_version() < (10, 0, 10586):
53973b4d 4713 return
53973b4d 4714
c53a18f0 4715 import ctypes
4716 import ctypes.wintypes
4717 import msvcrt
4718
4719 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
4720
4721 dll = ctypes.WinDLL('kernel32', use_last_error=False)
4722 handle = os.open('CONOUT$', os.O_RDWR)
c53a18f0 4723 try:
4724 h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
4725 dw_original_mode = ctypes.wintypes.DWORD()
4726 success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
4727 if not success:
4728 raise Exception('GetConsoleMode failed')
4729
4730 success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
4731 dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
4732 if not success:
4733 raise Exception('SetConsoleMode failed')
c53a18f0 4734 finally:
4735 os.close(handle)
53973b4d 4736
f0795149 4737 global WINDOWS_VT_MODE
4738 WINDOWS_VT_MODE = True
4739 supports_terminal_sequences.cache_clear()
4740
53973b4d 4741
ec11a9f4 4742_terminal_sequences_re = re.compile('\033\\[[^m]+m')
4743
4744
4745def remove_terminal_sequences(string):
4746 return _terminal_sequences_re.sub('', string)
4747
4748
4749def number_of_digits(number):
4750 return len('%d' % number)
34921b43 4751
4752
4753def join_nonempty(*values, delim='-', from_dict=None):
4754 if from_dict is not None:
69bec673 4755 values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
34921b43 4756 return delim.join(map(str, filter(None, values)))
06e57990 4757
4758
27231526
ZM
4759def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
4760 """
4761 Find the largest format dimensions in terms of video width and, for each thumbnail:
4762 * Modify the URL: match the width with the provided regex and replace it with the largest format width
4763 * Update dimensions
4764
4765 This function is useful with video services that scale the provided thumbnails on demand
4766 """
4767 _keys = ('width', 'height')
4768 max_dimensions = max(
86e5f3ed 4769 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
27231526
ZM
4770 default=(0, 0))
4771 if not max_dimensions[0]:
4772 return thumbnails
4773 return [
4774 merge_dicts(
4775 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
4776 dict(zip(_keys, max_dimensions)), thumbnail)
4777 for thumbnail in thumbnails
4778 ]
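# Example (sketch; the width-matching regex is supplied by the caller):
#   >>> scale_thumbnails_to_max_format_width(
#   ...     [{'width': 1280, 'height': 720}],
#   ...     [{'url': 'https://example.invalid/320/thumb.jpg', 'width': 320, 'height': 180}],
#   ...     r'(?<=/)\d+(?=/)')
#   [{'url': 'https://example.invalid/1280/thumb.jpg', 'width': 1280, 'height': 720}]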
4779
4780
93c8410d
LNO
4781def parse_http_range(range):
4782 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
4783 if not range:
4784 return None, None, None
4785 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
4786 if not crg:
4787 return None, None, None
4788 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
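# Example: both "Content-Range"- and "Range"-style values are accepted:
#   >>> parse_http_range('bytes 500-999/1234')
#   (500, 999, 1234)
#   >>> parse_http_range('bytes=500-')
#   (500, None, None)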
4789
4790
6b9e832d 4791def read_stdin(what):
4792 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
4793 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
4794 return sys.stdin
4795
4796
a904a7f8
L
4797def determine_file_encoding(data):
4798 """
88f60feb 4799 Detect the text encoding used
a904a7f8
L
4800 @returns (encoding, bytes to skip)
4801 """
4802
88f60feb 4803 # BOM marks are given priority over declarations
a904a7f8 4804 for bom, enc in BOMS:
a904a7f8
L
4805 if data.startswith(bom):
4806 return enc, len(bom)
4807
88f60feb 4808 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
4809 # We ignore the endianness to get a good enough match
a904a7f8 4810 data = data.replace(b'\0', b'')
88f60feb 4811 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
4812 return mobj.group(1).decode() if mobj else None, 0
a904a7f8
L
4813
4814
06e57990 4815class Config:
4816 own_args = None
9e491463 4817 parsed_args = None
06e57990 4818 filename = None
4819 __initialized = False
4820
4821 def __init__(self, parser, label=None):
9e491463 4822 self.parser, self.label = parser, label
06e57990 4823 self._loaded_paths, self.configs = set(), []
4824
4825 def init(self, args=None, filename=None):
4826 assert not self.__initialized
284a60c5 4827 self.own_args, self.filename = args, filename
4828 return self.load_configs()
4829
4830 def load_configs(self):
65662dff 4831 directory = ''
284a60c5 4832 if self.filename:
4833 location = os.path.realpath(self.filename)
65662dff 4834 directory = os.path.dirname(location)
06e57990 4835 if location in self._loaded_paths:
4836 return False
4837 self._loaded_paths.add(location)
4838
284a60c5 4839 self.__initialized = True
4840 opts, _ = self.parser.parse_known_args(self.own_args)
4841 self.parsed_args = self.own_args
9e491463 4842 for location in opts.config_locations or []:
6b9e832d 4843 if location == '-':
1060f82f 4844 if location in self._loaded_paths:
4845 continue
4846 self._loaded_paths.add(location)
6b9e832d 4847 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
4848 continue
65662dff 4849 location = os.path.join(directory, expand_path(location))
06e57990 4850 if os.path.isdir(location):
4851 location = os.path.join(location, 'yt-dlp.conf')
4852 if not os.path.exists(location):
9e491463 4853 self.parser.error(f'config location {location} does not exist')
06e57990 4854 self.append_config(self.read_file(location), location)
4855 return True
4856
4857 def __str__(self):
4858 label = join_nonempty(
4859 self.label, 'config', f'"{self.filename}"' if self.filename else '',
4860 delim=' ')
4861 return join_nonempty(
4862 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
4863 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
4864 delim='\n')
4865
7a32c70d 4866 @staticmethod
06e57990 4867 def read_file(filename, default=[]):
4868 try:
a904a7f8 4869 optionf = open(filename, 'rb')
86e5f3ed 4870 except OSError:
06e57990 4871 return default # silently skip if file is not present
a904a7f8
L
4872 try:
4873 enc, skip = determine_file_encoding(optionf.read(512))
4874 optionf.seek(skip, io.SEEK_SET)
4875 except OSError:
4876 enc = None # silently skip read errors
06e57990 4877 try:
4878 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
a904a7f8 4879 contents = optionf.read().decode(enc or preferredencoding())
f9934b96 4880 res = shlex.split(contents, comments=True)
44a6fcff 4881 except Exception as err:
4882 raise ValueError(f'Unable to parse "{filename}": {err}')
06e57990 4883 finally:
4884 optionf.close()
4885 return res
4886
7a32c70d 4887 @staticmethod
06e57990 4888 def hide_login_info(opts):
86e5f3ed 4889 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
06e57990 4890 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
4891
4892 def _scrub_eq(o):
4893 m = eqre.match(o)
4894 if m:
4895 return m.group('key') + '=PRIVATE'
4896 else:
4897 return o
4898
4899 opts = list(map(_scrub_eq, opts))
4900 for idx, opt in enumerate(opts):
4901 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
4902 opts[idx + 1] = 'PRIVATE'
4903 return opts
4904
4905 def append_config(self, *args, label=None):
9e491463 4906 config = type(self)(self.parser, label)
06e57990 4907 config._loaded_paths = self._loaded_paths
4908 if config.init(*args):
4909 self.configs.append(config)
4910
7a32c70d 4911 @property
06e57990 4912 def all_args(self):
4913 for config in reversed(self.configs):
4914 yield from config.all_args
9e491463 4915 yield from self.parsed_args or []
4916
4917 def parse_known_args(self, **kwargs):
4918 return self.parser.parse_known_args(self.all_args, **kwargs)
06e57990 4919
4920 def parse_args(self):
9e491463 4921 return self.parser.parse_args(self.all_args)
da42679b
LNO
4922
4923
8b7539d2 4924def merge_headers(*dicts):
08d30158 4925 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
76aa9913 4926 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
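# Example: later dicts win and keys are title-cased:
#   >>> merge_headers({'user-agent': 'UA-1', 'X-Foo': 'bar'}, {'User-Agent': 'UA-2'})
#   {'User-Agent': 'UA-2', 'X-Foo': 'bar'}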
28787f16 4927
4928
b1f94422 4929def cached_method(f):
4930 """Cache a method"""
4931 signature = inspect.signature(f)
4932
7a32c70d 4933 @functools.wraps(f)
b1f94422 4934 def wrapper(self, *args, **kwargs):
4935 bound_args = signature.bind(self, *args, **kwargs)
4936 bound_args.apply_defaults()
d5d1df8a 4937 key = tuple(bound_args.arguments.values())[1:]
b1f94422 4938
6368e2e6 4939 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
b1f94422 4940 if key not in cache:
4941 cache[key] = f(self, *args, **kwargs)
4942 return cache[key]
4943 return wrapper
4944
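# Illustrative sketch (not part of the module): cached_method() above memoizes per
# instance, keyed on the bound arguments excluding self. The Client class below is
# hypothetical.
def _example_cached_method():
    class Client:
        calls = 0

        @cached_method
        def fetch(self, key):
            self.calls += 1
            return key * 2

    client = Client()
    assert client.fetch(3) == 6
    assert client.fetch(3) == 6  # served from the per-instance _cached_method__cache
    assert client.calls == 1
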
4945
28787f16 4946class classproperty:
83cc7b8a 4947 """property access for class methods with optional caching"""
4948 def __new__(cls, func=None, *args, **kwargs):
4949 if not func:
4950 return functools.partial(cls, *args, **kwargs)
4951 return super().__new__(cls)
c487cf00 4952
83cc7b8a 4953 def __init__(self, func, *, cache=False):
c487cf00 4954 functools.update_wrapper(self, func)
4955 self.func = func
83cc7b8a 4956 self._cache = {} if cache else None
28787f16 4957
4958 def __get__(self, _, cls):
83cc7b8a 4959 if self._cache is None:
4960 return self.func(cls)
4961 elif cls not in self._cache:
4962 self._cache[cls] = self.func(cls)
4963 return self._cache[cls]
19a03940 4964
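# Illustrative sketch (not part of the module): classproperty above exposes a function
# as a property on the class itself; with cache=True the value is computed once per
# class. The Example class below is hypothetical.
def _example_classproperty():
    class Example:
        computed = 0

        @classproperty(cache=True)
        def description(cls):
            cls.computed += 1
            return f'{cls.__name__} v1'

    assert Example.description == 'Example v1'
    assert Example.description == 'Example v1'  # cached: the function ran only once
    assert Example.computed == 1
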
4965
a5387729 4966class function_with_repr:
b2e0343b 4967 def __init__(self, func, repr_=None):
a5387729 4968 functools.update_wrapper(self, func)
b2e0343b 4969 self.func, self.__repr = func, repr_
a5387729 4970
4971 def __call__(self, *args, **kwargs):
4972 return self.func(*args, **kwargs)
4973
4974 def __repr__(self):
b2e0343b 4975 if self.__repr:
4976 return self.__repr
a5387729 4977 return f'{self.func.__module__}.{self.func.__qualname__}'
4978
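# Illustrative sketch (not part of the module): function_with_repr above wraps a callable
# so that repr() yields either the supplied repr_ string or 'module.qualname'. The lambda
# and its repr string below are made up.
def _example_function_with_repr():
    double = function_with_repr(lambda x: x * 2, repr_='double()')
    assert double(21) == 42
    assert repr(double) == 'double()'
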
4979
64fa820c 4980class Namespace(types.SimpleNamespace):
591bb9d3 4981 """Immutable namespace"""
591bb9d3 4982
7896214c 4983 def __iter__(self):
64fa820c 4984 return iter(self.__dict__.values())
7896214c 4985
7a32c70d 4986 @property
64fa820c 4987 def items_(self):
4988 return self.__dict__.items()
9b8ee23b 4989
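# Illustrative sketch (not part of the module): iterating a Namespace yields the stored
# values, while items_ exposes the name/value pairs (as used by MEDIA_EXTENSIONS below).
# The attribute names here are made up.
def _example_namespace():
    ns = Namespace(video='mp4', audio='m4a')
    assert list(ns) == ['mp4', 'm4a']  # __iter__ -> values
    assert dict(ns.items_) == {'video': 'mp4', 'audio': 'm4a'}
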
4990
8dc59305 4991MEDIA_EXTENSIONS = Namespace(
4992 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
4993 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
4994 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
fbb73833 4995 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
8dc59305 4996 thumbnails=('jpg', 'png', 'webp'),
4997 storyboards=('mhtml', ),
4998 subtitles=('srt', 'vtt', 'ass', 'lrc'),
4999 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5000)
5001MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5002MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5003
5004KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5005
5006
be5c1ae8 5007class RetryManager:
5008 """Usage:
5009 for retry in RetryManager(...):
5010 try:
5011 ...
5012 except SomeException as err:
5013 retry.error = err
5014 continue
5015 """
5016 attempt, _error = 0, None
5017
5018 def __init__(self, _retries, _error_callback, **kwargs):
5019 self.retries = _retries or 0
5020 self.error_callback = functools.partial(_error_callback, **kwargs)
5021
5022 def _should_retry(self):
5023 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5024
7a32c70d 5025 @property
be5c1ae8 5026 def error(self):
5027 if self._error is NO_DEFAULT:
5028 return None
5029 return self._error
5030
7a32c70d 5031 @error.setter
be5c1ae8 5032 def error(self, value):
5033 self._error = value
5034
5035 def __iter__(self):
5036 while self._should_retry():
5037 self.error = NO_DEFAULT
5038 self.attempt += 1
5039 yield self
5040 if self.error:
5041 self.error_callback(self.error, self.attempt, self.retries)
5042
7a32c70d 5043 @staticmethod
be5c1ae8 5044 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5045 """Utility function for reporting retries"""
5046 if count > retries:
5047 if error:
5048 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5049 raise e
5050
5051 if not count:
5052 return warn(e)
5053 elif isinstance(e, ExtractorError):
3ce29336 5054 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
be5c1ae8 5055 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5056
5057 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5058 if delay:
5059 info(f'Sleeping {delay:.2f} seconds ...')
5060 time.sleep(delay)
5061
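# Illustrative sketch (not part of the module): iterating a RetryManager yields one
# attempt per loop; assigning retry.error marks the attempt as failed, and the error
# callback fires only once all retries are exhausted. The callback below is a made-up
# stand-in for report_retry().
def _example_retry_manager():
    failures = []

    def _on_giving_up(err, count, retries):
        failures.append(f'{err} after {count} attempts ({retries} retries allowed)')

    for retry in RetryManager(2, _on_giving_up):
        try:
            raise OSError('network down')  # an operation that always fails
        except OSError as err:
            retry.error = err

    assert failures == ['network down after 3 attempts (2 retries allowed)']
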
5062
0647d925 5063def make_archive_id(ie, video_id):
5064 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5065 return f'{ie_key.lower()} {video_id}'
5066
5067
a1c5bd82 5068def truncate_string(s, left, right=0):
5069 assert left > 3 and right >= 0
5070 if s is None or len(s) <= left + right:
5071 return s
71df9b7f 5072 return f'{s[:left-3]}...{s[-right:] if right else ""}'
a1c5bd82 5073
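# Illustrative sketch (not part of the module): make_archive_id() above builds a
# lowercase '<extractor key> <video id>' string, and truncate_string() shortens strings
# longer than left+right characters, keeping the head and (optionally) the tail around
# '...'. The IDs and strings below are made up.
def _example_small_helpers():
    assert make_archive_id('Youtube', 'abc123') == 'youtube abc123'
    assert truncate_string('abcdefghij', left=6, right=2) == 'abc...ij'
    assert truncate_string('short', left=10) == 'short'  # returned unchanged
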
5074
5314b521 5075def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
5076 assert 'all' in alias_dict, '"all" alias is required'
5077 requested = list(start or [])
5078 for val in options:
5079 discard = val.startswith('-')
5080 if discard:
5081 val = val[1:]
5082
5083 if val in alias_dict:
5084 val = alias_dict[val] if not discard else [
5085 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
5086 # NB: Do not allow regex in aliases for performance
5087 requested = orderedSet_from_options(val, alias_dict, start=requested)
5088 continue
5089
5090 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
5091 else [val] if val in alias_dict['all'] else None)
5092 if current is None:
5093 raise ValueError(val)
5094
5095 if discard:
5096 for item in current:
5097 while item in requested:
5098 requested.remove(item)
5099 else:
5100 requested.extend(current)
5101
5102 return orderedSet(requested)
5103
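# Illustrative sketch (not part of the module): orderedSet_from_options() above expands
# aliases (the mandatory 'all' alias lists every known value) and a leading '-' discards
# entries. The alias table below is made up.
def _example_ordered_set_from_options():
    aliases = {'all': ['mp3', 'm4a', 'flac'], 'lossless': ['flac']}
    assert orderedSet_from_options(['all', '-lossless'], aliases) == ['mp3', 'm4a']
    assert orderedSet_from_options(['m4a', 'm4a'], aliases) == ['m4a']  # duplicates collapse
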
5104
eedda525 5105# TODO: Rewrite
d0d74b71 5106class FormatSorter:
5107 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
5108
5109 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
5110 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
5111 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
5112 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
5113 'height', 'width', 'proto', 'vext', 'abr', 'aext',
5114 'fps', 'fs_approx', 'source', 'id')
5115
5116 settings = {
5117 'vcodec': {'type': 'ordered', 'regex': True,
5118 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
5119 'acodec': {'type': 'ordered', 'regex': True,
71082216 5120 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
d0d74b71 5121 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
5122 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
5123 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
5124 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
5125 'vext': {'type': 'ordered', 'field': 'video_ext',
29ca4082 5126 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
5127 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
fbb73833 5128 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
5129 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
5130 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
d0d74b71 5131 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
5132 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
5133 'field': ('vcodec', 'acodec'),
5134 'function': lambda it: int(any(v != 'none' for v in it))},
5135 'ie_pref': {'priority': True, 'type': 'extractor'},
5136 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
5137 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
5138 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
5139 'quality': {'convert': 'float', 'default': -1},
5140 'filesize': {'convert': 'bytes'},
5141 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
5142 'id': {'convert': 'string', 'field': 'format_id'},
5143 'height': {'convert': 'float_none'},
5144 'width': {'convert': 'float_none'},
5145 'fps': {'convert': 'float_none'},
5146 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
5147 'tbr': {'convert': 'float_none'},
5148 'vbr': {'convert': 'float_none'},
5149 'abr': {'convert': 'float_none'},
5150 'asr': {'convert': 'float_none'},
5151 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
5152
5153 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
812cdfa0 5154 'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
eedda525 5155 'function': lambda it: next(filter(None, it), None)},
812cdfa0 5156 'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
eedda525 5157 'function': lambda it: next(filter(None, it), None)},
d0d74b71 5158 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
5159 'res': {'type': 'multiple', 'field': ('height', 'width'),
5160 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
5161
5162 # Actual field names
5163 'format_id': {'type': 'alias', 'field': 'id'},
5164 'preference': {'type': 'alias', 'field': 'ie_pref'},
5165 'language_preference': {'type': 'alias', 'field': 'lang'},
5166 'source_preference': {'type': 'alias', 'field': 'source'},
5167 'protocol': {'type': 'alias', 'field': 'proto'},
5168 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
5169 'audio_channels': {'type': 'alias', 'field': 'channels'},
5170
5171 # Deprecated
5172 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
5173 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
5174 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
5175 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
5176 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
5177 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
5178 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
5179 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
5180 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
5181 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
5182 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
5183 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
5184 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
5185 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
5186 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5187 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5188 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5189 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5190 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5191 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5192 }
5193
5194 def __init__(self, ydl, field_preference):
5195 self.ydl = ydl
5196 self._order = []
5197 self.evaluate_params(self.ydl.params, field_preference)
5198 if ydl.params.get('verbose'):
5199 self.print_verbose_info(self.ydl.write_debug)
5200
5201 def _get_field_setting(self, field, key):
5202 if field not in self.settings:
5203 if key in ('forced', 'priority'):
5204 return False
5205 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
5206 'deprecated and may be removed in a future version')
5207 self.settings[field] = {}
5208 propObj = self.settings[field]
5209 if key not in propObj:
5210 type = propObj.get('type')
5211 if key == 'field':
5212 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
5213 elif key == 'convert':
5214 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
5215 else:
5216 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
5217 propObj[key] = default
5218 return propObj[key]
5219
5220 def _resolve_field_value(self, field, value, convertNone=False):
5221 if value is None:
5222 if not convertNone:
5223 return None
5224 else:
5225 value = value.lower()
5226 conversion = self._get_field_setting(field, 'convert')
5227 if conversion == 'ignore':
5228 return None
5229 if conversion == 'string':
5230 return value
5231 elif conversion == 'float_none':
5232 return float_or_none(value)
5233 elif conversion == 'bytes':
5234 return parse_bytes(value)
5235 elif conversion == 'order':
5236 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
5237 use_regex = self._get_field_setting(field, 'regex')
5238 list_length = len(order_list)
5239 empty_pos = order_list.index('') if '' in order_list else list_length + 1
5240 if use_regex and value is not None:
5241 for i, regex in enumerate(order_list):
5242 if regex and re.match(regex, value):
5243 return list_length - i
5244 return list_length - empty_pos # not in list
5245 else: # not regex or value = None
5246 return list_length - (order_list.index(value) if value in order_list else empty_pos)
5247 else:
5248 if value.isnumeric():
5249 return float(value)
5250 else:
5251 self.settings[field]['convert'] = 'string'
5252 return value
5253
5254 def evaluate_params(self, params, sort_extractor):
5255 self._use_free_order = params.get('prefer_free_formats', False)
5256 self._sort_user = params.get('format_sort', [])
5257 self._sort_extractor = sort_extractor
5258
5259 def add_item(field, reverse, closest, limit_text):
5260 field = field.lower()
5261 if field in self._order:
5262 return
5263 self._order.append(field)
5264 limit = self._resolve_field_value(field, limit_text)
5265 data = {
5266 'reverse': reverse,
5267 'closest': False if limit is None else closest,
5268 'limit_text': limit_text,
5269 'limit': limit}
5270 if field in self.settings:
5271 self.settings[field].update(data)
5272 else:
5273 self.settings[field] = data
5274
5275 sort_list = (
5276 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5277 + (tuple() if params.get('format_sort_force', False)
5278 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5279 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5280
5281 for item in sort_list:
5282 match = re.match(self.regex, item)
5283 if match is None:
5284 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
5285 field = match.group('field')
5286 if field is None:
5287 continue
5288 if self._get_field_setting(field, 'type') == 'alias':
5289 alias, field = field, self._get_field_setting(field, 'field')
5290 if self._get_field_setting(alias, 'deprecated'):
5291 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5292 f'be removed in a future version. Please use {field} instead')
5293 reverse = match.group('reverse') is not None
5294 closest = match.group('separator') == '~'
5295 limit_text = match.group('limit')
5296
5297 has_limit = limit_text is not None
5298 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5299 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5300
5301 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
5302 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
5303 limit_count = len(limits)
5304 for (i, f) in enumerate(fields):
5305 add_item(f, reverse, closest,
5306 limits[i] if i < limit_count
5307 else limits[0] if has_limit and not has_multiple_limits
5308 else None)
5309
5310 def print_verbose_info(self, write_debug):
5311 if self._sort_user:
5312 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
5313 if self._sort_extractor:
5314 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
5315 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
5316 '+' if self._get_field_setting(field, 'reverse') else '', field,
5317 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
5318 self._get_field_setting(field, 'limit_text'),
5319 self._get_field_setting(field, 'limit'))
5320 if self._get_field_setting(field, 'limit_text') is not None else '')
5321 for field in self._order if self._get_field_setting(field, 'visible')]))
5322
5323 def _calculate_field_preference_from_value(self, format, field, type, value):
5324 reverse = self._get_field_setting(field, 'reverse')
5325 closest = self._get_field_setting(field, 'closest')
5326 limit = self._get_field_setting(field, 'limit')
5327
5328 if type == 'extractor':
5329 maximum = self._get_field_setting(field, 'max')
5330 if value is None or (maximum is not None and value >= maximum):
5331 value = -1
5332 elif type == 'boolean':
5333 in_list = self._get_field_setting(field, 'in_list')
5334 not_in_list = self._get_field_setting(field, 'not_in_list')
5335 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
5336 elif type == 'ordered':
5337 value = self._resolve_field_value(field, value, True)
5338
5339 # try to convert to number
5340 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
5341 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
5342 if is_num:
5343 value = val_num
5344
5345 return ((-10, 0) if value is None
5346 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
5347 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
5348 else (0, value, 0) if not reverse and (limit is None or value <= limit)
5349 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
5350 else (-1, value, 0))
5351
5352 def _calculate_field_preference(self, format, field):
5353 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
5354 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
5355 if type == 'multiple':
5356 type = 'field' # Only 'field' is allowed in multiple for now
5357 actual_fields = self._get_field_setting(field, 'field')
5358
5359 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
5360 else:
5361 value = get_value(field)
5362 return self._calculate_field_preference_from_value(format, field, type, value)
5363
5364 def calculate_preference(self, format):
5365 # Determine missing protocol
5366 if not format.get('protocol'):
5367 format['protocol'] = determine_protocol(format)
5368
5369 # Determine missing ext
5370 if not format.get('ext') and 'url' in format:
5371 format['ext'] = determine_ext(format['url'])
5372 if format.get('vcodec') == 'none':
5373 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
5374 format['video_ext'] = 'none'
5375 else:
5376 format['video_ext'] = format['ext']
5377 format['audio_ext'] = 'none'
5378 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
5379 # format['preference'] = -1000
5380
5424dbaf 5380
5381 if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
5382 # HEVC-over-FLV is not covered by FLV's original spec
5383 # ref. https://trac.ffmpeg.org/ticket/6389
5384 # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
5385 format['preference'] = -100
5386
d0d74b71 5387 # Determine missing bitrates
eedda525 5388 if format.get('vcodec') == 'none':
5389 format['vbr'] = 0
5390 if format.get('acodec') == 'none':
5391 format['abr'] = 0
5392 if not format.get('vbr') and format.get('vcodec') != 'none':
5393 format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
5394 if not format.get('abr') and format.get('acodec') != 'none':
5395 format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
5396 if not format.get('tbr'):
5397 format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None
d0d74b71 5398
5399 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1b392f90 5400
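# Illustrative sketch (not part of the module): FormatSorter.regex above parses each
# token of a format-sort string into field/reverse/separator/limit groups, which
# evaluate_params() then turns into per-field settings ('+' reverses the preference,
# ':' sets a limit, '~' prefers values closest to the limit).
def _example_format_sort_token():
    m = re.match(FormatSorter.regex, '+res:1080')
    assert m.group('field') == 'res'
    assert m.group('reverse') == '+'
    assert m.group('separator') == ':'
    assert m.group('limit') == '1080'
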
5401
5402# XXX: Temporary
5403class _YDLLogger:
5404 def __init__(self, ydl=None):
5405 self._ydl = ydl
5406
5407 def debug(self, message):
5408 if self._ydl:
5409 self._ydl.write_debug(message)
5410
5411 def info(self, message):
5412 if self._ydl:
5413 self._ydl.to_screen(message)
5414
5415 def warning(self, message, *, once=False):
5416 if self._ydl:
3d2623a8 5417 self._ydl.report_warning(message, once)
1b392f90 5418
5419 def error(self, message, *, is_error=True):
5420 if self._ydl:
5421 self._ydl.report_error(message, is_error=is_error)
5422
5423 def stdout(self, message):
5424 if self._ydl:
5425 self._ydl.to_stdout(message)
5426
5427 def stderr(self, message):
5428 if self._ydl:
5429 self._ydl.to_stderr(message)