yt-dlp.git blame: yt_dlp/utils/_utils.py
[core] Fix support for upcoming Python 3.12 (#8130)
6929b41a 1import asyncio
15dfb392 2import atexit
1e399778 3import base64
5bc880b9 4import binascii
912b38b4 5import calendar
676eb3f2 6import codecs
c380cc28 7import collections
ab029d7e 8import collections.abc
62e609ab 9import contextlib
c496ca96 10import datetime
0c265486 11import email.header
f8271158 12import email.utils
f45c185f 13import errno
14import hashlib
15import hmac
ac668111 16import html.entities
17import html.parser
b1f94422 18import inspect
03f9daab 19import io
79a2e94e 20import itertools
f4bfd65f 21import json
d77c3dfd 22import locale
02dbf93f 23import math
f8271158 24import mimetypes
db3ad8a6 25import netrc
347de493 26import operator
d77c3dfd 27import os
c496ca96 28import platform
773f291d 29import random
d77c3dfd 30import re
f8271158 31import shlex
c496ca96 32import socket
79a2e94e 33import ssl
ac668111 34import struct
1c088fa8 35import subprocess
d77c3dfd 36import sys
181c8655 37import tempfile
c380cc28 38import time
01951dda 39import traceback
64fa820c 40import types
989a01c2 41import unicodedata
14f25df2 42import urllib.error
f8271158 43import urllib.parse
ac668111 44import urllib.request
bcf89ce6 45import xml.etree.ElementTree
d77c3dfd 46
69bec673 47from . import traversal
48
49from ..compat import functools # isort: split
50from ..compat import (
36e6f62c 51 compat_etree_fromstring,
51098426 52 compat_expanduser,
f8271158 53 compat_HTMLParseError,
efa97bdc 54 compat_os_name,
702ccf2d 55 compat_shlex_quote,
8c25f81b 56)
c365dba8 57from ..dependencies import websockets, xattr
51fb4995 58
46f1370e 59__name__ = __name__.rsplit('.', 1)[0] # Pretend to be the parent module
60
61# This is not clearly defined otherwise
62compiled_regex_type = type(re.compile(''))
63
f7a147e3 64
4823ec9f 65class NO_DEFAULT:
66 pass
67
68
69def IDENTITY(x):
70 return x
71
bf42a990 72
73ENGLISH_MONTH_NAMES = [
74 'January', 'February', 'March', 'April', 'May', 'June',
75 'July', 'August', 'September', 'October', 'November', 'December']
76
77MONTH_NAMES = {
78 'en': ENGLISH_MONTH_NAMES,
79 'fr': [
80 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
81 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
78545664 82 # these follow the genitive grammatical case (dopełniacz)
83 # some websites might be using nominative, which will require another month list
84 # https://en.wikibooks.org/wiki/Polish/Noun_cases
85 'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
86 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
f6717dec 87}
a942d6cb 88
8f53dc44 89# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
90TIMEZONE_NAMES = {
91 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
92 'AST': -4, 'ADT': -3, # Atlantic (used in Canada)
93 'EST': -5, 'EDT': -4, # Eastern
94 'CST': -6, 'CDT': -5, # Central
95 'MST': -7, 'MDT': -6, # Mountain
96 'PST': -8, 'PDT': -7 # Pacific
97}
98
c587cbb7 99# needed for sanitizing filenames in restricted mode
c8827027 100ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
101 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
102 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 103
104DATE_FORMATS = (
105 '%d %B %Y',
106 '%d %b %Y',
107 '%B %d %Y',
108 '%B %dst %Y',
109 '%B %dnd %Y',
9d30c213 110 '%B %drd %Y',
cb655f34 111 '%B %dth %Y',
46f59e89 112 '%b %d %Y',
113 '%b %dst %Y',
114 '%b %dnd %Y',
9d30c213 115 '%b %drd %Y',
cb655f34 116 '%b %dth %Y',
117 '%b %dst %Y %I:%M',
118 '%b %dnd %Y %I:%M',
9d30c213 119 '%b %drd %Y %I:%M',
120 '%b %dth %Y %I:%M',
121 '%Y %m %d',
122 '%Y-%m-%d',
bccdbd22 123 '%Y.%m.%d.',
46f59e89 124 '%Y/%m/%d',
81c13222 125 '%Y/%m/%d %H:%M',
46f59e89 126 '%Y/%m/%d %H:%M:%S',
127 '%Y%m%d%H%M',
128 '%Y%m%d%H%M%S',
4f3fa23e 129 '%Y%m%d',
0c1c6f4b 130 '%Y-%m-%d %H:%M',
131 '%Y-%m-%d %H:%M:%S',
132 '%Y-%m-%d %H:%M:%S.%f',
5014558a 133 '%Y-%m-%d %H:%M:%S:%f',
134 '%d.%m.%Y %H:%M',
135 '%d.%m.%Y %H.%M',
136 '%Y-%m-%dT%H:%M:%SZ',
137 '%Y-%m-%dT%H:%M:%S.%fZ',
138 '%Y-%m-%dT%H:%M:%S.%f0Z',
139 '%Y-%m-%dT%H:%M:%S',
140 '%Y-%m-%dT%H:%M:%S.%f',
141 '%Y-%m-%dT%H:%M',
142 '%b %d %Y at %H:%M',
143 '%b %d %Y at %H:%M:%S',
144 '%B %d %Y at %H:%M',
145 '%B %d %Y at %H:%M:%S',
a63d9bd0 146 '%H:%M %d-%b-%Y',
147)
148
149DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
150DATE_FORMATS_DAY_FIRST.extend([
151 '%d-%m-%Y',
152 '%d.%m.%Y',
153 '%d.%m.%y',
154 '%d/%m/%Y',
155 '%d/%m/%y',
156 '%d/%m/%Y %H:%M:%S',
47304e07 157 '%d-%m-%Y %H:%M',
4cbfa570 158 '%H:%M %d/%m/%Y',
159])
160
161DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
162DATE_FORMATS_MONTH_FIRST.extend([
163 '%m-%d-%Y',
164 '%m.%d.%Y',
165 '%m/%d/%Y',
166 '%m/%d/%y',
167 '%m/%d/%Y %H:%M:%S',
168])
169
06b3fe29 170PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
0f60ba6e 171JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
06b3fe29 172
1d485a1a 173NUMBER_RE = r'\d+(?:\.\d+)?'
174
7105440c 175
0b9c08b4 176@functools.cache
d77c3dfd 177def preferredencoding():
59ae15a5 178 """Get preferred encoding.
d77c3dfd 179
180 Returns the best encoding scheme for the system, based on
181 locale.getpreferredencoding() and some further tweaks.
182 """
183 try:
184 pref = locale.getpreferredencoding()
28e614de 185 'TEST'.encode(pref)
70a1165b 186 except Exception:
59ae15a5 187 pref = 'UTF-8'
bae611f2 188
59ae15a5 189 return pref
d77c3dfd 190
f4bfd65f 191
181c8655 192def write_json_file(obj, fn):
1394646a 193 """ Encode obj as JSON and write it to fn, atomically if possible """
181c8655 194
cfb0511d 195 tf = tempfile.NamedTemporaryFile(
196 prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
197 suffix='.tmp', delete=False, mode='w', encoding='utf-8')
198
199 try:
200 with tf:
45d86abe 201 json.dump(obj, tf, ensure_ascii=False)
202 if sys.platform == 'win32':
203 # Need to remove existing file on Windows, else os.rename raises
204 # WindowsError or FileExistsError.
19a03940 205 with contextlib.suppress(OSError):
1394646a 206 os.unlink(fn)
19a03940 207 with contextlib.suppress(OSError):
208 mask = os.umask(0)
209 os.umask(mask)
210 os.chmod(tf.name, 0o666 & ~mask)
181c8655 211 os.rename(tf.name, fn)
70a1165b 212 except Exception:
19a03940 213 with contextlib.suppress(OSError):
181c8655 214 os.remove(tf.name)
215 raise
216
217
cfb0511d 218def find_xpath_attr(node, xpath, key, val=None):
219 """ Find the xpath xpath[@key=val] """
220 assert re.match(r'^[a-zA-Z_-]+$', key)
86e5f3ed 221 expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
cfb0511d 222 return node.find(expr)
59ae56fa 223
224# On python2.6 the xml.etree.ElementTree.Element methods don't support
225# the namespace parameter
226
227
228def xpath_with_ns(path, ns_map):
229 components = [c.split(':') for c in path.split('/')]
230 replaced = []
231 for c in components:
232 if len(c) == 1:
233 replaced.append(c[0])
234 else:
235 ns, tag = c
236 replaced.append('{%s}%s' % (ns_map[ns], tag))
237 return '/'.join(replaced)
238
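# Example usage (illustrative sketch; the MRSS namespace URI is just a sample value):
#   xpath_with_ns('media:group/media:title', {'media': 'http://search.yahoo.com/mrss/'})
#   # -> '{http://search.yahoo.com/mrss/}group/{http://search.yahoo.com/mrss/}title'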
d77c3dfd 239
a41fb80c 240def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
578c0745 241 def _find_xpath(xpath):
f9934b96 242 return node.find(xpath)
578c0745 243
14f25df2 244 if isinstance(xpath, str):
245 n = _find_xpath(xpath)
246 else:
247 for xp in xpath:
248 n = _find_xpath(xp)
249 if n is not None:
250 break
d74bebd5 251
8e636da4 252 if n is None:
253 if default is not NO_DEFAULT:
254 return default
255 elif fatal:
256 name = xpath if name is None else name
257 raise ExtractorError('Could not find XML element %s' % name)
258 else:
259 return None
260 return n
261
262
263def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
264 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
265 if n is None or n == default:
266 return n
267 if n.text is None:
268 if default is not NO_DEFAULT:
269 return default
270 elif fatal:
271 name = xpath if name is None else name
272 raise ExtractorError('Could not find XML element\'s text %s' % name)
273 else:
274 return None
275 return n.text
276
277
278def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
279 n = find_xpath_attr(node, xpath, key)
280 if n is None:
281 if default is not NO_DEFAULT:
282 return default
283 elif fatal:
86e5f3ed 284 name = f'{xpath}[@{key}]' if name is None else name
285 raise ExtractorError('Could not find XML attribute %s' % name)
286 else:
287 return None
288 return n.attrib[key]
289
290
c487cf00 291def get_element_by_id(id, html, **kwargs):
43e8fafd 292 """Return the content of the tag with the specified ID in the passed HTML document"""
c487cf00 293 return get_element_by_attribute('id', id, html, **kwargs)
43e8fafd 294
12ea2f30 295
c487cf00 296def get_element_html_by_id(id, html, **kwargs):
6f32a0b5 297 """Return the html of the tag with the specified ID in the passed HTML document"""
c487cf00 298 return get_element_html_by_attribute('id', id, html, **kwargs)
299
300
84c237fb 301def get_element_by_class(class_name, html):
302 """Return the content of the first tag with the specified class in the passed HTML document"""
303 retval = get_elements_by_class(class_name, html)
304 return retval[0] if retval else None
305
306
307def get_element_html_by_class(class_name, html):
308 """Return the html of the first tag with the specified class in the passed HTML document"""
309 retval = get_elements_html_by_class(class_name, html)
310 return retval[0] if retval else None
311
312
c487cf00 313def get_element_by_attribute(attribute, value, html, **kwargs):
314 retval = get_elements_by_attribute(attribute, value, html, **kwargs)
315 return retval[0] if retval else None
316
317
c487cf00 318def get_element_html_by_attribute(attribute, value, html, **kargs):
319 retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
320 return retval[0] if retval else None
321
322
c487cf00 323def get_elements_by_class(class_name, html, **kargs):
324 """Return the content of all tags with the specified class in the passed HTML document as a list"""
325 return get_elements_by_attribute(
64fa820c 326 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
327 html, escape_value=False)
328
329
330def get_elements_html_by_class(class_name, html):
331 """Return the html of all tags with the specified class in the passed HTML document as a list"""
332 return get_elements_html_by_attribute(
64fa820c 333 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
334 html, escape_value=False)
335
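# Example usage of the class helpers (illustrative sketch with made-up markup):
#   get_element_by_class('title', '<div class="title main">Foo</div>')       # -> 'Foo'
#   get_element_html_by_class('title', '<div class="title main">Foo</div>')  # -> the whole <div>...</div>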
336
337def get_elements_by_attribute(*args, **kwargs):
43e8fafd 338 """Return the content of the tag with the specified attribute in the passed HTML document"""
339 return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
340
341
342def get_elements_html_by_attribute(*args, **kwargs):
343 """Return the html of the tag with the specified attribute in the passed HTML document"""
344 return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
345
346
4c9a1a3b 347def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
348 """
349 Return the text (content) and the html (whole) of the tag with the specified
350 attribute in the passed HTML document
351 """
352 if not value:
353 return
9e6dd238 354
86e5f3ed 355 quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
0254f162 356
357 value = re.escape(value) if escape_value else value
358
86e5f3ed 359 partial_element_re = rf'''(?x)
4c9a1a3b 360 <(?P<tag>{tag})
0254f162 361 (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
86e5f3ed 362 \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
363 '''
38285056 364
365 for m in re.finditer(partial_element_re, html):
366 content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
a921f407 367
368 yield (
369 unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
370 whole
371 )
a921f407 372
c5229f39 373
ac668111 374class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
375 """
376 HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
377 closing tag for the first opening tag it has encountered, and can be used
378 as a context manager
379 """
380
381 class HTMLBreakOnClosingTagException(Exception):
382 pass
383
384 def __init__(self):
385 self.tagstack = collections.deque()
ac668111 386 html.parser.HTMLParser.__init__(self)
387
388 def __enter__(self):
389 return self
390
391 def __exit__(self, *_):
392 self.close()
393
394 def close(self):
395 # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
396 # so data remains buffered; we no longer have any interest in it, thus
397 # override this method to discard it
398 pass
399
400 def handle_starttag(self, tag, _):
401 self.tagstack.append(tag)
402
403 def handle_endtag(self, tag):
404 if not self.tagstack:
405 raise compat_HTMLParseError('no tags in the stack')
406 while self.tagstack:
407 inner_tag = self.tagstack.pop()
408 if inner_tag == tag:
409 break
410 else:
411 raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
412 if not self.tagstack:
413 raise self.HTMLBreakOnClosingTagException()
414
415
46d09f87 416# XXX: This should be far less strict
417def get_element_text_and_html_by_tag(tag, html):
418 """
419 For the first element with the specified tag in the passed HTML document
420 return its content (text) and the whole element (html)
421 """
422 def find_or_raise(haystack, needle, exc):
423 try:
424 return haystack.index(needle)
425 except ValueError:
426 raise exc
427 closing_tag = f'</{tag}>'
428 whole_start = find_or_raise(
429 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
430 content_start = find_or_raise(
431 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
432 content_start += whole_start + 1
433 with HTMLBreakOnClosingTagParser() as parser:
434 parser.feed(html[whole_start:content_start])
435 if not parser.tagstack or parser.tagstack[0] != tag:
436 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
437 offset = content_start
438 while offset < len(html):
439 next_closing_tag_start = find_or_raise(
440 html[offset:], closing_tag,
441 compat_HTMLParseError(f'closing {tag} tag not found'))
442 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
443 try:
444 parser.feed(html[offset:offset + next_closing_tag_end])
445 offset += next_closing_tag_end
446 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
447 return html[content_start:offset + next_closing_tag_start], \
448 html[whole_start:offset + next_closing_tag_end]
449 raise compat_HTMLParseError('unexpected end of html')
450
451
ac668111 452class HTMLAttributeParser(html.parser.HTMLParser):
8bb56eee 453 """Trivial HTML parser to gather the attributes for a single element"""
b6e0c7d2 454
8bb56eee 455 def __init__(self):
c5229f39 456 self.attrs = {}
ac668111 457 html.parser.HTMLParser.__init__(self)
458
459 def handle_starttag(self, tag, attrs):
460 self.attrs = dict(attrs)
7053aa3a 461 raise compat_HTMLParseError('done')
8bb56eee 462
c5229f39 463
ac668111 464class HTMLListAttrsParser(html.parser.HTMLParser):
465 """HTML parser to gather the attributes for the elements of a list"""
466
467 def __init__(self):
ac668111 468 html.parser.HTMLParser.__init__(self)
469 self.items = []
470 self._level = 0
471
472 def handle_starttag(self, tag, attrs):
473 if tag == 'li' and self._level == 0:
474 self.items.append(dict(attrs))
475 self._level += 1
476
477 def handle_endtag(self, tag):
478 self._level -= 1
479
480
481def extract_attributes(html_element):
482 """Given a string for an HTML element such as
483 <el
484 a="foo" B="bar" c="&98;az" d=boz
485 empty= noval entity="&amp;"
486 sq='"' dq="'"
487 >
488 Decode and return a dictionary of attributes.
489 {
490 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
491 'empty': '', 'noval': None, 'entity': '&',
492 'sq': '"', 'dq': '\''
493 }.
494 """
495 parser = HTMLAttributeParser()
19a03940 496 with contextlib.suppress(compat_HTMLParseError):
497 parser.feed(html_element)
498 parser.close()
8bb56eee 499 return parser.attrs
9e6dd238 500
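# Example (illustrative sketch with made-up markup):
#   extract_attributes('<a href="/watch?v=abc" class=btn disabled>')
#   # -> {'href': '/watch?v=abc', 'class': 'btn', 'disabled': None}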
c5229f39 501
502def parse_list(webpage):
503 """Given a string for a series of HTML <li> elements,
504 return a dictionary of their attributes"""
505 parser = HTMLListAttrsParser()
506 parser.feed(webpage)
507 parser.close()
508 return parser.items
509
510
9e6dd238 511def clean_html(html):
59ae15a5 512 """Clean an HTML snippet into a readable string"""
513
514 if html is None: # Convenience for sanitizing descriptions etc.
515 return html
516
49185227 517 html = re.sub(r'\s+', ' ', html)
518 html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
519 html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
520 # Strip html tags
521 html = re.sub('<.*?>', '', html)
522 # Replace html entities
523 html = unescapeHTML(html)
7decf895 524 return html.strip()
525
526
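# Example (illustrative sketch; whitespace collapsing shown approximately):
#   clean_html('<p>Hello <br/>world</p> <p>&amp; more</p>')
#   # -> 'Hello\nworld\n& more'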
b7c47b74 527class LenientJSONDecoder(json.JSONDecoder):
cc090836 528 # TODO: Write tests
529 def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
b7c47b74 530 self.transform_source, self.ignore_extra = transform_source, ignore_extra
cc090836 531 self._close_attempts = 2 * close_objects
b7c47b74 532 super().__init__(*args, **kwargs)
533
cc090836 534 @staticmethod
535 def _close_object(err):
536 doc = err.doc[:err.pos]
537 # We need to add comma first to get the correct error message
538 if err.msg.startswith('Expecting \',\''):
539 return doc + ','
540 elif not doc.endswith(','):
541 return
542
543 if err.msg.startswith('Expecting property name'):
544 return doc[:-1] + '}'
545 elif err.msg.startswith('Expecting value'):
546 return doc[:-1] + ']'
547
b7c47b74 548 def decode(self, s):
549 if self.transform_source:
550 s = self.transform_source(s)
cc090836 551 for attempt in range(self._close_attempts + 1):
552 try:
553 if self.ignore_extra:
554 return self.raw_decode(s.lstrip())[0]
555 return super().decode(s)
556 except json.JSONDecodeError as e:
557 if e.pos is None:
558 raise
559 elif attempt < self._close_attempts:
560 s = self._close_object(e)
561 if s is not None:
562 continue
2fa669f7 563 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
cc090836 564 assert False, 'Too many attempts to decode JSON'
b7c47b74 565
566
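# Example (illustrative sketch): tolerate trailing garbage after the JSON payload
#   LenientJSONDecoder(ignore_extra=True).decode('{"status": "ok"} </script>')
#   # -> {'status': 'ok'}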
d77c3dfd 567def sanitize_open(filename, open_mode):
568 """Try to open the given filename, and slightly tweak it if this fails.
569
570 Attempts to open the given filename. If this fails, it tries to change
571 the filename slightly, step by step, until it's either able to open it
572 or it fails and raises a final exception, like the standard open()
573 function.
574
575 It returns the tuple (stream, definitive_file_name).
576 """
0edb3e33 577 if filename == '-':
578 if sys.platform == 'win32':
579 import msvcrt
be5c1ae8 580
62b58c09 581 # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
daef7911 582 with contextlib.suppress(io.UnsupportedOperation):
583 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
0edb3e33 584 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
59ae15a5 585
0edb3e33 586 for attempt in range(2):
587 try:
588 try:
89737671 589 if sys.platform == 'win32':
b506289f 590 # FIXME: An exclusive lock also locks the file from being read.
591 # Since windows locks are mandatory, don't lock the file on windows (for now).
592 # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
89737671 593 raise LockingUnsupportedError()
0edb3e33 594 stream = locked_file(filename, open_mode, block=False).__enter__()
8a82af35 595 except OSError:
0edb3e33 596 stream = open(filename, open_mode)
8a82af35 597 return stream, filename
86e5f3ed 598 except OSError as err:
0edb3e33 599 if attempt or err.errno in (errno.EACCES,):
600 raise
601 old_filename, filename = filename, sanitize_path(filename)
602 if old_filename == filename:
603 raise
604
605
606def timeconvert(timestr):
607 """Convert RFC 2822 defined time string into system timestamp"""
608 timestamp = None
609 timetuple = email.utils.parsedate_tz(timestr)
610 if timetuple is not None:
611 timestamp = email.utils.mktime_tz(timetuple)
612 return timestamp
1c469a94 613
5f6a1245 614
5c3895ff 615def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
59ae15a5 616 """Sanitizes a string so it could be used as part of a filename.
5c3895ff 617 @param restricted Use a stricter subset of allowed characters
618 @param is_id Whether this is an ID that should be kept unchanged if possible.
619 If unset, yt-dlp's new sanitization rules are in effect
59ae15a5 620 """
5c3895ff 621 if s == '':
622 return ''
623
59ae15a5 624 def replace_insane(char):
625 if restricted and char in ACCENT_CHARS:
626 return ACCENT_CHARS[char]
91dd88b9 627 elif not restricted and char == '\n':
5c3895ff 628 return '\0 '
989a01c2 629 elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
630 # Replace with their full-width unicode counterparts
631 return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
91dd88b9 632 elif char == '?' or ord(char) < 32 or ord(char) == 127:
633 return ''
634 elif char == '"':
635 return '' if restricted else '\''
636 elif char == ':':
5c3895ff 637 return '\0_\0-' if restricted else '\0 \0-'
59ae15a5 638 elif char in '\\/|*<>':
5c3895ff 639 return '\0_'
640 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
641 return '\0_'
642 return char
643
db4678e4 644 # Replace look-alike Unicode glyphs
645 if restricted and (is_id is NO_DEFAULT or not is_id):
989a01c2 646 s = unicodedata.normalize('NFKC', s)
5c3895ff 647 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
28e614de 648 result = ''.join(map(replace_insane, s))
5c3895ff 649 if is_id is NO_DEFAULT:
ae61d108 650 result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result) # Remove repeated substitute chars
651 STRIP_RE = r'(?:\0.|[ _-])*'
5c3895ff 652 result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
653 result = result.replace('\0', '') or '_'
654
655 if not is_id:
656 while '__' in result:
657 result = result.replace('__', '_')
658 result = result.strip('_')
659 # Common case of "Foreign band name - English song title"
660 if restricted and result.startswith('-_'):
661 result = result[2:]
662 if result.startswith('-'):
663 result = '_' + result[len('-'):]
a7440261 664 result = result.lstrip('.')
665 if not result:
666 result = '_'
59ae15a5 667 return result
d77c3dfd 668
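# Example (illustrative sketch; exact substitutions depend on the arguments):
#   sanitize_filename('A/B: C|D', restricted=True)   # -> 'A_B_-_C_D'
#   sanitize_filename('A/B: C|D')                    # '/', ':' and '|' become full-width look-alikes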
5f6a1245 669
c2934512 670def sanitize_path(s, force=False):
a2aaf4db 671 """Sanitizes and normalizes path on Windows"""
836e06d2 672 # XXX: this handles drive relative paths (c:sth) incorrectly
c2934512 673 if sys.platform == 'win32':
c4218ac3 674 force = False
c2934512 675 drive_or_unc, _ = os.path.splitdrive(s)
c2934512 676 elif force:
677 drive_or_unc = ''
678 else:
a2aaf4db 679 return s
c2934512 680
681 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
682 if drive_or_unc:
683 norm_path.pop(0)
684 sanitized_path = [
ec85ded8 685 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
a2aaf4db 686 for path_part in norm_path]
687 if drive_or_unc:
688 sanitized_path.insert(0, drive_or_unc + os.path.sep)
4abea8ca 689 elif force and s and s[0] == os.path.sep:
c4218ac3 690 sanitized_path.insert(0, os.path.sep)
691 # TODO: Fix behavioral differences <3.12
692 # The workaround using `normpath` only superficially passes tests
693 # Ref: https://github.com/python/cpython/pull/100351
694 return os.path.normpath(os.path.join(*sanitized_path))
695
696
8f97a15d 697def sanitize_url(url, *, scheme='http'):
698 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
699 # the number of unwanted failures due to missing protocol
21633673 700 if url is None:
701 return
702 elif url.startswith('//'):
8f97a15d 703 return f'{scheme}:{url}'
704 # Fix some common typos seen so far
705 COMMON_TYPOS = (
067aa17e 706 # https://github.com/ytdl-org/youtube-dl/issues/15649
707 (r'^httpss://', r'https://'),
708 # https://bx1.be/lives/direct-tv/
709 (r'^rmtp([es]?)://', r'rtmp\1://'),
710 )
711 for mistake, fixup in COMMON_TYPOS:
712 if re.match(mistake, url):
713 return re.sub(mistake, fixup, url)
bc6b9bcd 714 return url
715
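# Example (illustrative sketch):
#   sanitize_url('//cdn.example.com/video.mp4')   # -> 'http://cdn.example.com/video.mp4'
#   sanitize_url('httpss://example.com/watch')    # -> 'https://example.com/watch'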
716
5435dcf9 717def extract_basic_auth(url):
14f25df2 718 parts = urllib.parse.urlsplit(url)
719 if parts.username is None:
720 return url, None
14f25df2 721 url = urllib.parse.urlunsplit(parts._replace(netloc=(
722 parts.hostname if parts.port is None
723 else '%s:%d' % (parts.hostname, parts.port))))
724 auth_payload = base64.b64encode(
0f06bcd7 725 ('%s:%s' % (parts.username, parts.password or '')).encode())
726 return url, f'Basic {auth_payload.decode()}'
727
728
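# Example (illustrative sketch with dummy credentials):
#   extract_basic_auth('https://user:pass@example.com/feed')
#   # -> ('https://example.com/feed', 'Basic dXNlcjpwYXNz')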
51098426 729def expand_path(s):
2fa669f7 730 """Expand shell variables and ~"""
731 return os.path.expandvars(compat_expanduser(s))
732
733
7e9a6125 734def orderedSet(iterable, *, lazy=False):
735 """Remove all duplicates from the input iterable"""
736 def _iter():
737 seen = [] # Do not use set since the items can be unhashable
738 for x in iterable:
739 if x not in seen:
740 seen.append(x)
741 yield x
742
743 return _iter() if lazy else list(_iter())
d77c3dfd 744
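# Example (illustrative sketch):
#   orderedSet([3, 1, 3, 2, 1])       # -> [3, 1, 2]
#   orderedSet('abca', lazy=True)     # generator yielding 'a', 'b', 'c'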
912b38b4 745
55b2f099 746def _htmlentity_transform(entity_with_semicolon):
4e408e47 747 """Transforms an HTML entity to a character."""
748 entity = entity_with_semicolon[:-1]
749
4e408e47 750 # Known non-numeric HTML entity
ac668111 751 if entity in html.entities.name2codepoint:
752 return chr(html.entities.name2codepoint[entity])
4e408e47 753
754 # TODO: HTML5 allows entities without a semicolon.
755 # E.g. '&Eacuteric' should be decoded as 'Éric'.
ac668111 756 if entity_with_semicolon in html.entities.html5:
757 return html.entities.html5[entity_with_semicolon]
55b2f099 758
91757b0f 759 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
760 if mobj is not None:
761 numstr = mobj.group(1)
28e614de 762 if numstr.startswith('x'):
4e408e47 763 base = 16
28e614de 764 numstr = '0%s' % numstr
765 else:
766 base = 10
067aa17e 767 # See https://github.com/ytdl-org/youtube-dl/issues/7518
19a03940 768 with contextlib.suppress(ValueError):
ac668111 769 return chr(int(numstr, base))
770
771 # Unknown entity in name, return its literal representation
7a3f0c00 772 return '&%s;' % entity
773
774
d77c3dfd 775def unescapeHTML(s):
776 if s is None:
777 return None
19a03940 778 assert isinstance(s, str)
d77c3dfd 779
4e408e47 780 return re.sub(
95f3f7c2 781 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 782
8bf48f23 783
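# Example (illustrative sketch):
#   unescapeHTML('Caf&eacute; &amp; bar &#39;x&#39;')   # -> "Café & bar 'x'"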
cdb19aa4 784def escapeHTML(text):
785 return (
786 text
787 .replace('&', '&amp;')
788 .replace('<', '&lt;')
789 .replace('>', '&gt;')
790 .replace('"', '&quot;')
791 .replace("'", '&#39;')
792 )
793
794
795class netrc_from_content(netrc.netrc):
796 def __init__(self, content):
797 self.hosts, self.macros = {}, {}
798 with io.StringIO(content) as stream:
799 self._parse('-', stream, False)
800
801
d3c93ec2 802class Popen(subprocess.Popen):
803 if sys.platform == 'win32':
804 _startupinfo = subprocess.STARTUPINFO()
805 _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
806 else:
807 _startupinfo = None
808
809 @staticmethod
810 def _fix_pyinstaller_ld_path(env):
811 """Restore LD_LIBRARY_PATH when using PyInstaller
812 Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
813 https://github.com/yt-dlp/yt-dlp/issues/4573
814 """
815 if not hasattr(sys, '_MEIPASS'):
816 return
817
818 def _fix(key):
819 orig = env.get(f'{key}_ORIG')
820 if orig is None:
821 env.pop(key, None)
822 else:
823 env[key] = orig
824
825 _fix('LD_LIBRARY_PATH') # Linux
826 _fix('DYLD_LIBRARY_PATH') # macOS
827
828 def __init__(self, *args, env=None, text=False, **kwargs):
829 if env is None:
830 env = os.environ.copy()
831 self._fix_pyinstaller_ld_path(env)
832
da8e2912 833 self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
f0c9fb96 834 if text is True:
835 kwargs['universal_newlines'] = True # For 3.6 compatibility
836 kwargs.setdefault('encoding', 'utf-8')
837 kwargs.setdefault('errors', 'replace')
82ea226c 838 super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
d3c93ec2 839
840 def communicate_or_kill(self, *args, **kwargs):
8a82af35 841 try:
842 return self.communicate(*args, **kwargs)
843 except BaseException: # Including KeyboardInterrupt
f0c9fb96 844 self.kill(timeout=None)
8a82af35 845 raise
d3c93ec2 846
f0c9fb96 847 def kill(self, *, timeout=0):
848 super().kill()
849 if timeout != 0:
850 self.wait(timeout=timeout)
851
852 @classmethod
992dc6b4 853 def run(cls, *args, timeout=None, **kwargs):
f0c9fb96 854 with cls(*args, **kwargs) as proc:
da8e2912 855 default = '' if proc.__text_mode else b''
992dc6b4 856 stdout, stderr = proc.communicate_or_kill(timeout=timeout)
914491b8 857 return stdout or default, stderr or default, proc.returncode
f0c9fb96 858
d3c93ec2 859
f07b74fc 860def encodeArgument(s):
cfb0511d 861 # Legacy code that uses byte strings
862 # Uncomment the following line after fixing all post processors
14f25df2 863 # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
cfb0511d 864 return s if isinstance(s, str) else s.decode('ascii')
865
866
aa7785f8 867_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
868
869
870def timetuple_from_msec(msec):
871 secs, msec = divmod(msec, 1000)
872 mins, secs = divmod(secs, 60)
873 hrs, mins = divmod(mins, 60)
874 return _timetuple(hrs, mins, secs, msec)
875
876
cdb19aa4 877def formatSeconds(secs, delim=':', msec=False):
aa7785f8 878 time = timetuple_from_msec(secs * 1000)
879 if time.hours:
880 ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
881 elif time.minutes:
882 ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
4539dd30 883 else:
aa7785f8 884 ret = '%d' % time.seconds
885 return '%s.%03d' % (ret, time.milliseconds) if msec else ret
4539dd30 886
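# Example (illustrative sketch):
#   timetuple_from_msec(3725500)      # -> Time(hours=1, minutes=2, seconds=5, milliseconds=500)
#   formatSeconds(3725)               # -> '1:02:05'
#   formatSeconds(3725.5, msec=True)  # -> '1:02:05.500'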
a0ddb8a2 887
5873d4cc 888def bug_reports_message(before=';'):
69bec673 889 from ..update import REPOSITORY
57e0f077 890
891 msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
892 'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')
893
894 before = before.rstrip()
895 if not before or before.endswith(('.', '!', '?')):
896 msg = msg[0].title() + msg[1:]
897
898 return (before + ' ' if before else '') + msg
899
900
901class YoutubeDLError(Exception):
902 """Base exception for YoutubeDL errors."""
aa9369a2 903 msg = None
904
905 def __init__(self, msg=None):
906 if msg is not None:
907 self.msg = msg
908 elif self.msg is None:
909 self.msg = type(self).__name__
910 super().__init__(self.msg)
911
912
913class ExtractorError(YoutubeDLError):
1c256f70 914 """Error during info extraction."""
5f6a1245 915
1151c407 916 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
9a82b238 917 """ tb, if given, is the original traceback (so that it can be printed out).
7a5c1cfe 918 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
9a82b238 919 """
c365dba8 920 from ..networking.exceptions import network_exceptions
3158150c 921 if sys.exc_info()[0] in network_exceptions:
9a82b238 922 expected = True
d5979c5d 923
7265a219 924 self.orig_msg = str(msg)
1c256f70 925 self.traceback = tb
1151c407 926 self.expected = expected
2eabb802 927 self.cause = cause
d11271dd 928 self.video_id = video_id
1151c407 929 self.ie = ie
930 self.exc_info = sys.exc_info() # preserve original exception
5df14442 931 if isinstance(self.exc_info[1], ExtractorError):
932 self.exc_info = self.exc_info[1].exc_info
9bcfe33b 933 super().__init__(self.__msg)
1151c407 934
9bcfe33b 935 @property
936 def __msg(self):
937 return ''.join((
938 format_field(self.ie, None, '[%s] '),
939 format_field(self.video_id, None, '%s: '),
940 self.orig_msg,
941 format_field(self.cause, None, ' (caused by %r)'),
942 '' if self.expected else bug_reports_message()))
1c256f70 943
01951dda 944 def format_traceback(self):
497d2fab 945 return join_nonempty(
946 self.traceback and ''.join(traceback.format_tb(self.traceback)),
e491d06d 947 self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
497d2fab 948 delim='\n') or None
01951dda 949
9bcfe33b 950 def __setattr__(self, name, value):
951 super().__setattr__(name, value)
952 if getattr(self, 'msg', None) and name not in ('msg', 'args'):
953 self.msg = self.__msg or type(self).__name__
954 self.args = (self.msg, ) # Cannot be property
955
1c256f70 956
957class UnsupportedError(ExtractorError):
958 def __init__(self, url):
86e5f3ed 959 super().__init__(
960 'Unsupported URL: %s' % url, expected=True)
961 self.url = url
962
963
964class RegexNotFoundError(ExtractorError):
965 """Error when a regex didn't match"""
966 pass
967
968
969class GeoRestrictedError(ExtractorError):
970 """Geographic restriction Error exception.
971
972 This exception may be thrown when a video is not available from your
973 geographic location due to geographic restrictions imposed by a website.
974 """
b6e0c7d2 975
0db3bae8 976 def __init__(self, msg, countries=None, **kwargs):
977 kwargs['expected'] = True
86e5f3ed 978 super().__init__(msg, **kwargs)
979 self.countries = countries
980
981
693f0600 982class UserNotLive(ExtractorError):
983 """Error when a channel/user is not live"""
984
985 def __init__(self, msg=None, **kwargs):
986 kwargs['expected'] = True
987 super().__init__(msg or 'The channel is not currently live', **kwargs)
988
989
bf5b9d85 990class DownloadError(YoutubeDLError):
59ae15a5 991 """Download Error exception.
d77c3dfd 992
993 This exception may be thrown by FileDownloader objects if they are not
994 configured to continue on errors. They will contain the appropriate
995 error message.
996 """
5f6a1245 997
998 def __init__(self, msg, exc_info=None):
999 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
86e5f3ed 1000 super().__init__(msg)
8cc83b8d 1001 self.exc_info = exc_info
1002
1003
498f5606 1004class EntryNotInPlaylist(YoutubeDLError):
1005 """Entry not in playlist exception.
1006
1007 This exception will be thrown by YoutubeDL when a requested entry
1008 is not found in the playlist info_dict
1009 """
aa9369a2 1010 msg = 'Entry not found in info'
498f5606 1011
1012
bf5b9d85 1013class SameFileError(YoutubeDLError):
59ae15a5 1014 """Same File exception.
d77c3dfd 1015
1016 This exception will be thrown by FileDownloader objects if they detect
1017 multiple files would have to be downloaded to the same file on disk.
1018 """
aa9369a2 1019 msg = 'Fixed output name but more than one file to download'
1020
1021 def __init__(self, filename=None):
1022 if filename is not None:
1023 self.msg += f': {filename}'
1024 super().__init__(self.msg)
1025
1026
bf5b9d85 1027class PostProcessingError(YoutubeDLError):
59ae15a5 1028 """Post Processing exception.
d77c3dfd 1029
1030 This exception may be raised by PostProcessor's .run() method to
1031 indicate an error in the postprocessing task.
1032 """
5f6a1245 1033
5f6a1245 1034
48f79687 1035class DownloadCancelled(YoutubeDLError):
1036 """ Exception raised when the download queue should be interrupted """
1037 msg = 'The download was cancelled'
8b0d7497 1038
8b0d7497 1039
48f79687 1040class ExistingVideoReached(DownloadCancelled):
1041 """ --break-on-existing triggered """
1042 msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1043
48f79687 1044
1045class RejectedVideoReached(DownloadCancelled):
fe2ce85a 1046 """ --break-match-filter triggered """
1047 msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
51d9739f 1048
1049
48f79687 1050class MaxDownloadsReached(DownloadCancelled):
59ae15a5 1051 """ --max-downloads limit has been reached. """
48f79687 1052 msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1053
1054
f2ebc5c7 1055class ReExtractInfo(YoutubeDLError):
1056 """ Video info needs to be re-extracted. """
1057
1058 def __init__(self, msg, expected=False):
1059 super().__init__(msg)
1060 self.expected = expected
1061
1062
1063class ThrottledDownload(ReExtractInfo):
48f79687 1064 """ Download speed below --throttled-rate. """
aa9369a2 1065 msg = 'The download speed is below throttle limit'
d77c3dfd 1066
43b22906 1067 def __init__(self):
1068 super().__init__(self.msg, expected=False)
f2ebc5c7 1069
d77c3dfd 1070
bf5b9d85 1071class UnavailableVideoError(YoutubeDLError):
59ae15a5 1072 """Unavailable Format exception.
d77c3dfd 1073
1074 This exception will be thrown when a video is requested
1075 in a format that is not available for that video.
1076 """
aa9369a2 1077 msg = 'Unable to download video'
1078
1079 def __init__(self, err=None):
1080 if err is not None:
1081 self.msg += f': {err}'
1082 super().__init__(self.msg)
1083
1084
bf5b9d85 1085class ContentTooShortError(YoutubeDLError):
59ae15a5 1086 """Content Too Short exception.
d77c3dfd 1087
1088 This exception may be raised by FileDownloader objects when a file they
1089 download is too small for what the server announced first, indicating
1090 the connection was probably interrupted.
1091 """
d77c3dfd 1092
59ae15a5 1093 def __init__(self, downloaded, expected):
86e5f3ed 1094 super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
2c7ed247 1095 # Both in bytes
1096 self.downloaded = downloaded
1097 self.expected = expected
d77c3dfd 1098
5f6a1245 1099
bf5b9d85 1100class XAttrMetadataError(YoutubeDLError):
efa97bdc 1101 def __init__(self, code=None, msg='Unknown error'):
86e5f3ed 1102 super().__init__(msg)
efa97bdc 1103 self.code = code
bd264412 1104 self.msg = msg
1105
1106 # Parsing code and msg
3089bc74 1107 if (self.code in (errno.ENOSPC, errno.EDQUOT)
a0566bbf 1108 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
1109 self.reason = 'NO_SPACE'
1110 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1111 self.reason = 'VALUE_TOO_LONG'
1112 else:
1113 self.reason = 'NOT_SUPPORTED'
1114
1115
bf5b9d85 1116class XAttrUnavailableError(YoutubeDLError):
1117 pass
1118
1119
941e881e 1120def is_path_like(f):
1121 return isinstance(f, (str, bytes, os.PathLike))
1122
1123
1124def extract_timezone(date_str):
1125 m = re.search(
f137e4c2 1126 r'''(?x)
1127 ^.{8,}? # >=8 char non-TZ prefix, if present
1128 (?P<tz>Z| # just the UTC Z, or
1129 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1130 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1131 [ ]? # optional space
1132 (?P<sign>\+|-) # +/-
1133 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1134 $)
1135 ''', date_str)
46f59e89 1136 if not m:
8f53dc44 1137 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1138 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1139 if timezone is not None:
1140 date_str = date_str[:-len(m.group('tz'))]
1141 timezone = datetime.timedelta(hours=timezone or 0)
1142 else:
1143 date_str = date_str[:-len(m.group('tz'))]
1144 if not m.group('sign'):
1145 timezone = datetime.timedelta()
1146 else:
1147 sign = 1 if m.group('sign') == '+' else -1
1148 timezone = datetime.timedelta(
1149 hours=sign * int(m.group('hours')),
1150 minutes=sign * int(m.group('minutes')))
1151 return timezone, date_str
1152
1153
08b38d54 1154def parse_iso8601(date_str, delimiter='T', timezone=None):
1155 """ Return a UNIX timestamp from the given date """
1156
1157 if date_str is None:
1158 return None
1159
1160 date_str = re.sub(r'\.[0-9]+', '', date_str)
1161
08b38d54 1162 if timezone is None:
1163 timezone, date_str = extract_timezone(date_str)
1164
19a03940 1165 with contextlib.suppress(ValueError):
86e5f3ed 1166 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1167 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1168 return calendar.timegm(dt.timetuple())
1169
1170
1171def date_formats(day_first=True):
1172 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1173
1174
42bdd9d0 1175def unified_strdate(date_str, day_first=True):
bf50b038 1176 """Return a string with the date in the format YYYYMMDD"""
1177
1178 if date_str is None:
1179 return None
bf50b038 1180 upload_date = None
5f6a1245 1181 # Replace commas
026fcc04 1182 date_str = date_str.replace(',', ' ')
42bdd9d0 1183 # Remove AM/PM + timezone
9bb8e0a3 1184 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1185 _, date_str = extract_timezone(date_str)
42bdd9d0 1186
46f59e89 1187 for expression in date_formats(day_first):
19a03940 1188 with contextlib.suppress(ValueError):
bf50b038 1189 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1190 if upload_date is None:
1191 timetuple = email.utils.parsedate_tz(date_str)
1192 if timetuple:
19a03940 1193 with contextlib.suppress(ValueError):
c6b9cf05 1194 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
6a750402 1195 if upload_date is not None:
14f25df2 1196 return str(upload_date)
bf50b038 1197
5f6a1245 1198
46f59e89 1199def unified_timestamp(date_str, day_first=True):
ad54c913 1200 if not isinstance(date_str, str):
1201 return None
1202
8f53dc44 1203 date_str = re.sub(r'\s+', ' ', re.sub(
1204 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
46f59e89 1205
7dc2a74e 1206 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1207 timezone, date_str = extract_timezone(date_str)
1208
1209 # Remove AM/PM + timezone
1210 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1211
1212 # Remove unrecognized timezones from ISO 8601 alike timestamps
1213 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1214 if m:
1215 date_str = date_str[:-len(m.group('tz'))]
1216
1217 # Python only supports microseconds, so remove nanoseconds
1218 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1219 if m:
1220 date_str = m.group(1)
1221
46f59e89 1222 for expression in date_formats(day_first):
19a03940 1223 with contextlib.suppress(ValueError):
7dc2a74e 1224 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89 1225 return calendar.timegm(dt.timetuple())
8f53dc44 1226
1227 timetuple = email.utils.parsedate_tz(date_str)
1228 if timetuple:
8f53dc44 1229 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1230
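# Example (illustrative sketch; the sample inputs are made up):
#   unified_strdate('25/01/2023')              # -> '20230125'
#   unified_timestamp('2023-01-25T12:00:00Z')  # -> 1674648000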
1231
28e614de 1232def determine_ext(url, default_ext='unknown_video'):
85750f89 1233 if url is None or '.' not in url:
f4776371 1234 return default_ext
9cb9a5df 1235 guess = url.partition('?')[0].rpartition('.')[2]
1236 if re.match(r'^[A-Za-z0-9]+$', guess):
1237 return guess
1238 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1239 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1240 return guess.rstrip('/')
73e79f2a 1241 else:
cbdbb766 1242 return default_ext
73e79f2a 1243
5f6a1245 1244
1245def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1246 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
d4051a8e 1247
5f6a1245 1248
9e62f283 1249def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
3d38b2d6 1250 R"""
1251 Return a datetime object from a string.
1252 Supported format:
1253 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1254
1255 @param format strftime format of DATE
1256 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1257 auto: round to the unit provided in date_str (if applicable).
9e62f283 1258 """
1259 auto_precision = False
1260 if precision == 'auto':
1261 auto_precision = True
1262 precision = 'microsecond'
836e06d2 1263 today = datetime_round(datetime.datetime.now(datetime.timezone.utc), precision)
f8795e10 1264 if date_str in ('now', 'today'):
37254abc 1265 return today
1266 if date_str == 'yesterday':
1267 return today - datetime.timedelta(days=1)
9e62f283 1268 match = re.match(
3d38b2d6 1269 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
9e62f283 1270 date_str)
37254abc 1271 if match is not None:
9e62f283 1272 start_time = datetime_from_str(match.group('start'), precision, format)
1273 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
37254abc 1274 unit = match.group('unit')
9e62f283 1275 if unit == 'month' or unit == 'year':
1276 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
37254abc 1277 unit = 'day'
9e62f283 1278 else:
1279 if unit == 'week':
1280 unit = 'day'
1281 time *= 7
1282 delta = datetime.timedelta(**{unit + 's': time})
1283 new_date = start_time + delta
1284 if auto_precision:
1285 return datetime_round(new_date, unit)
1286 return new_date
1287
1288 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1289
1290
d49f8db3 1291def date_from_str(date_str, format='%Y%m%d', strict=False):
3d38b2d6 1292 R"""
1293 Return a date object from a string using datetime_from_str
9e62f283 1294
3d38b2d6 1295 @param strict Restrict allowed patterns to "YYYYMMDD" and
1296 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
9e62f283 1297 """
3d38b2d6 1298 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1299 raise ValueError(f'Invalid date format "{date_str}"')
9e62f283 1300 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1301
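# Example (illustrative sketch):
#   date_from_str('20230115')       # -> datetime.date(2023, 1, 15)
#   date_from_str('today-2weeks')   # the date two weeks before today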
1302
1303def datetime_add_months(dt, months):
1304 """Increment/Decrement a datetime object by months."""
1305 month = dt.month + months - 1
1306 year = dt.year + month // 12
1307 month = month % 12 + 1
1308 day = min(dt.day, calendar.monthrange(year, month)[1])
1309 return dt.replace(year, month, day)
1310
1311
1312def datetime_round(dt, precision='day'):
1313 """
1314 Round a datetime object's time to a specific precision
1315 """
1316 if precision == 'microsecond':
1317 return dt
1318
1319 unit_seconds = {
1320 'day': 86400,
1321 'hour': 3600,
1322 'minute': 60,
1323 'second': 1,
1324 }
1325 roundto = lambda x, n: ((x + n / 2) // n) * n
1326 timestamp = roundto(calendar.timegm(dt.timetuple()), unit_seconds[precision])
1327 return datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
1328
1329
e63fc1be 1330def hyphenate_date(date_str):
1331 """
1332 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1333 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1334 if match is not None:
1335 return '-'.join(match.groups())
1336 else:
1337 return date_str
1338
5f6a1245 1339
86e5f3ed 1340class DateRange:
bd558525 1341 """Represents a time interval between two dates"""
5f6a1245 1342
1343 def __init__(self, start=None, end=None):
1344 """start and end must be strings in the format accepted by date"""
1345 if start is not None:
d49f8db3 1346 self.start = date_from_str(start, strict=True)
1347 else:
1348 self.start = datetime.datetime.min.date()
1349 if end is not None:
d49f8db3 1350 self.end = date_from_str(end, strict=True)
1351 else:
1352 self.end = datetime.datetime.max.date()
37254abc 1353 if self.start > self.end:
bd558525 1354 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
5f6a1245 1355
1356 @classmethod
1357 def day(cls, day):
1358 """Returns a range that only contains the given day"""
1359 return cls(day, day)
1360
1361 def __contains__(self, date):
1362 """Check if the date is in the range"""
1363 if not isinstance(date, datetime.date):
1364 date = date_from_str(date)
1365 return self.start <= date <= self.end
5f6a1245 1366
46f1370e 1367 def __repr__(self):
1368 return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'
c496ca96 1369
f2df4071 1370 def __eq__(self, other):
1371 return (isinstance(other, DateRange)
1372 and self.start == other.start and self.end == other.end)
1373
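# Example (illustrative sketch):
#   '20230615' in DateRange('20230601', '20230630')   # -> True
#   DateRange.day('20230601')                          # a range containing only that day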
c496ca96 1374
b1f94422 1375@functools.cache
1376def system_identifier():
1377 python_implementation = platform.python_implementation()
1378 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1379 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
dab284f8 1380 libc_ver = []
1381 with contextlib.suppress(OSError): # We may not have access to the executable
1382 libc_ver = platform.libc_ver()
b1f94422 1383
17fc3dc4 1384 return 'Python %s (%s %s %s) - %s (%s%s)' % (
b1f94422 1385 platform.python_version(),
1386 python_implementation,
17fc3dc4 1387 platform.machine(),
b1f94422 1388 platform.architecture()[0],
1389 platform.platform(),
1390 ssl.OPENSSL_VERSION,
1391 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
b1f94422 1392 )
1393
1394
0b9c08b4 1395@functools.cache
49fa4d9a 1396def get_windows_version():
8a82af35 1397 ''' Get Windows version. Returns () if it's not running on Windows '''
1398 if compat_os_name == 'nt':
1399 return version_tuple(platform.win32_ver()[1])
1400 else:
8a82af35 1401 return ()
1402
1403
734f90bb 1404def write_string(s, out=None, encoding=None):
19a03940 1405 assert isinstance(s, str)
1406 out = out or sys.stderr
1407 # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
1408 if not out:
1409 return
7459e3a2 1410
fe1daad3 1411 if compat_os_name == 'nt' and supports_terminal_sequences(out):
3fe75fdc 1412 s = re.sub(r'([\r\n]+)', r' \1', s)
59f943cd 1413
8a82af35 1414 enc, buffer = None, out
cfb0511d 1415 if 'b' in getattr(out, 'mode', ''):
c487cf00 1416 enc = encoding or preferredencoding()
104aa738 1417 elif hasattr(out, 'buffer'):
8a82af35 1418 buffer = out.buffer
104aa738 1419 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
c487cf00 1420
8a82af35 1421 buffer.write(s.encode(enc, 'ignore') if enc else s)
1422 out.flush()
1423
1424
3d2623a8 1425# TODO: Use global logger
da4db748 1426def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
69bec673 1427 from .. import _IN_CLI
da4db748 1428 if _IN_CLI:
1429 if msg in deprecation_warning._cache:
1430 return
1431 deprecation_warning._cache.add(msg)
1432 if printer:
1433 return printer(f'{msg}{bug_reports_message()}', **kwargs)
1434 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
1435 else:
1436 import warnings
1437 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
1438
1439
1440deprecation_warning._cache = set()
1441
1442
1443def bytes_to_intlist(bs):
1444 if not bs:
1445 return []
1446 if isinstance(bs[0], int): # Python 3
1447 return list(bs)
1448 else:
1449 return [ord(c) for c in bs]
1450
c257baff 1451
cba892fa 1452def intlist_to_bytes(xs):
1453 if not xs:
1454 return b''
ac668111 1455 return struct.pack('%dB' % len(xs), *xs)
1456
1457
8a82af35 1458class LockingUnsupportedError(OSError):
1890fc63 1459 msg = 'File locking is not supported'
0edb3e33 1460
1461 def __init__(self):
1462 super().__init__(self.msg)
1463
1464
1465# Cross-platform file locking
1466if sys.platform == 'win32':
fe0918bb 1467 import ctypes
1468 import ctypes.wintypes
1469 import msvcrt
1470
1471 class OVERLAPPED(ctypes.Structure):
1472 _fields_ = [
1473 ('Internal', ctypes.wintypes.LPVOID),
1474 ('InternalHigh', ctypes.wintypes.LPVOID),
1475 ('Offset', ctypes.wintypes.DWORD),
1476 ('OffsetHigh', ctypes.wintypes.DWORD),
1477 ('hEvent', ctypes.wintypes.HANDLE),
1478 ]
1479
37e325b9 1480 kernel32 = ctypes.WinDLL('kernel32')
1481 LockFileEx = kernel32.LockFileEx
1482 LockFileEx.argtypes = [
1483 ctypes.wintypes.HANDLE, # hFile
1484 ctypes.wintypes.DWORD, # dwFlags
1485 ctypes.wintypes.DWORD, # dwReserved
1486 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1487 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1488 ctypes.POINTER(OVERLAPPED) # Overlapped
1489 ]
1490 LockFileEx.restype = ctypes.wintypes.BOOL
1491 UnlockFileEx = kernel32.UnlockFileEx
1492 UnlockFileEx.argtypes = [
1493 ctypes.wintypes.HANDLE, # hFile
1494 ctypes.wintypes.DWORD, # dwReserved
1495 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1496 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1497 ctypes.POINTER(OVERLAPPED) # Overlapped
1498 ]
1499 UnlockFileEx.restype = ctypes.wintypes.BOOL
1500 whole_low = 0xffffffff
1501 whole_high = 0x7fffffff
1502
747c0bd1 1503 def _lock_file(f, exclusive, block):
1504 overlapped = OVERLAPPED()
1505 overlapped.Offset = 0
1506 overlapped.OffsetHigh = 0
1507 overlapped.hEvent = 0
1508 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
747c0bd1 1509
1510 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
1511 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
1512 0, whole_low, whole_high, f._lock_file_overlapped_p):
2cb19820 1513 # NB: No argument form of "ctypes.FormatError" does not work on PyPy
1514 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
1515
1516 def _unlock_file(f):
1517 assert f._lock_file_overlapped_p
1518 handle = msvcrt.get_osfhandle(f.fileno())
747c0bd1 1519 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
1520 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1521
1522else:
1523 try:
1524 import fcntl
c1c9a79c 1525
a3125791 1526 def _lock_file(f, exclusive, block):
b63837bc 1527 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
1528 if not block:
1529 flags |= fcntl.LOCK_NB
acea8d7c 1530 try:
b63837bc 1531 fcntl.flock(f, flags)
1532 except BlockingIOError:
1533 raise
1534 except OSError: # AOSP does not have flock()
b63837bc 1535 fcntl.lockf(f, flags)
c1c9a79c 1536
399a76e6 1537 def _unlock_file(f):
1538 with contextlib.suppress(OSError):
1539 return fcntl.flock(f, fcntl.LOCK_UN)
1540 with contextlib.suppress(OSError):
1541 return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock()
1542 return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking
a3125791 1543
399a76e6 1544 except ImportError:
399a76e6 1545
a3125791 1546 def _lock_file(f, exclusive, block):
0edb3e33 1547 raise LockingUnsupportedError()
1548
1549 def _unlock_file(f):
0edb3e33 1550 raise LockingUnsupportedError()
1551
1552
86e5f3ed 1553class locked_file:
0edb3e33 1554 locked = False
747c0bd1 1555
a3125791 1556 def __init__(self, filename, mode, block=True, encoding=None):
fcfa8853
JK
1557 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
1558 raise NotImplementedError(mode)
1559 self.mode, self.block = mode, block
1560
1561 writable = any(f in mode for f in 'wax+')
1562 readable = any(f in mode for f in 'r+')
1563 flags = functools.reduce(operator.ior, (
1564 getattr(os, 'O_CLOEXEC', 0), # UNIX only
1565 getattr(os, 'O_BINARY', 0), # Windows only
1566 getattr(os, 'O_NOINHERIT', 0), # Windows only
1567 os.O_CREAT if writable else 0, # O_TRUNC only after locking
1568 os.O_APPEND if 'a' in mode else 0,
1569 os.O_EXCL if 'x' in mode else 0,
1570 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
1571 ))
1572
98804d03 1573 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
c1c9a79c
PH
1574
1575 def __enter__(self):
a3125791 1576 exclusive = 'r' not in self.mode
c1c9a79c 1577 try:
a3125791 1578 _lock_file(self.f, exclusive, self.block)
0edb3e33 1579 self.locked = True
86e5f3ed 1580 except OSError:
c1c9a79c
PH
1581 self.f.close()
1582 raise
fcfa8853 1583 if 'w' in self.mode:
131e14dc
JK
1584 try:
1585 self.f.truncate()
1586 except OSError as e:
1890fc63 1587 if e.errno not in (
1588 errno.ESPIPE, # Illegal seek - expected for FIFO
1589 errno.EINVAL, # Invalid argument - expected for /dev/null
1590 ):
1591 raise
c1c9a79c
PH
1592 return self
1593
0edb3e33 1594 def unlock(self):
1595 if not self.locked:
1596 return
c1c9a79c 1597 try:
0edb3e33 1598 _unlock_file(self.f)
c1c9a79c 1599 finally:
0edb3e33 1600 self.locked = False
c1c9a79c 1601
0edb3e33 1602 def __exit__(self, *_):
1603 try:
1604 self.unlock()
1605 finally:
1606 self.f.close()
4eb7f1d1 1607
0edb3e33 1608 open = __enter__
1609 close = __exit__
a3125791 1610
0edb3e33 1611 def __getattr__(self, attr):
1612 return getattr(self.f, attr)
a3125791 1613
0edb3e33 1614 def __iter__(self):
1615 return iter(self.f)
a3125791 1616
4eb7f1d1 1617
0b9c08b4 1618@functools.cache
4644ac55
S
1619def get_filesystem_encoding():
1620 encoding = sys.getfilesystemencoding()
1621 return encoding if encoding is not None else 'utf-8'
1622
1623
4eb7f1d1 1624def shell_quote(args):
a6a173c2 1625 quoted_args = []
4644ac55 1626 encoding = get_filesystem_encoding()
a6a173c2
JMF
1627 for a in args:
1628 if isinstance(a, bytes):
1629 # We may get a filename encoded with 'encodeFilename'
1630 a = a.decode(encoding)
aefce8e6 1631 quoted_args.append(compat_shlex_quote(a))
28e614de 1632 return ' '.join(quoted_args)
9d4660ca
PH
1633
1634
1635def smuggle_url(url, data):
1636 """ Pass additional data in a URL for internal use. """
1637
81953d1a
RA
1638 url, idata = unsmuggle_url(url, {})
1639 data.update(idata)
14f25df2 1640 sdata = urllib.parse.urlencode(
28e614de
PH
1641 {'__youtubedl_smuggle': json.dumps(data)})
1642 return url + '#' + sdata
9d4660ca
PH
1643
1644
79f82953 1645def unsmuggle_url(smug_url, default=None):
83e865a3 1646 if '#__youtubedl_smuggle' not in smug_url:
79f82953 1647 return smug_url, default
28e614de 1648 url, _, sdata = smug_url.rpartition('#')
14f25df2 1649 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
1650 data = json.loads(jsond)
1651 return url, data
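# Usage sketch for smuggle_url/unsmuggle_url (the URL and payload below are made-up examples):
#   smuggled = smuggle_url('https://example.com/video', {'referrer': 'https://example.com/list'})
#   # -> 'https://example.com/video#__youtubedl_smuggle=...'
#   unsmuggle_url(smuggled)
#   # -> ('https://example.com/video', {'referrer': 'https://example.com/list'})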
02dbf93f
PH
1652
1653
e0fd9573 1654def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
1655 """ Formats numbers with decimal sufixes like K, M, etc """
1656 num, factor = float_or_none(num), float(factor)
4c3f8c3f 1657 if num is None or num < 0:
e0fd9573 1658 return None
eeb2a770 1659 POSSIBLE_SUFFIXES = 'kMGTPEZY'
1660 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
1661 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
abbeeebc 1662 if factor == 1024:
1663 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 1664 converted = num / (factor ** exponent)
abbeeebc 1665 return fmt % (converted, suffix)
e0fd9573 1666
1667
02dbf93f 1668def format_bytes(bytes):
f02d24d8 1669 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
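# Illustrative values (derived from the implementations above; inputs are arbitrary examples):
#   format_decimal_suffix(123456)  # -> '123k'       (decimal, factor 1000)
#   format_bytes(123456)           # -> '120.56KiB'  (binary, factor 1024)
#   format_bytes(None)             # -> 'N/A'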
f53c966a 1670
1c088fa8 1671
64c464a1 1672def lookup_unit_table(unit_table, s, strict=False):
1673 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
fb47597b 1674 units_re = '|'.join(re.escape(u) for u in unit_table)
64c464a1 1675 m = (re.fullmatch if strict else re.match)(
1676 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
fb47597b
S
1677 if not m:
1678 return None
64c464a1 1679
1680 num = float(m.group('num').replace(',', '.'))
fb47597b 1681 mult = unit_table[m.group('unit')]
64c464a1 1682 return round(num * mult)
1683
1684
1685def parse_bytes(s):
1686 """Parse a string indicating a byte quantity into an integer"""
1687 return lookup_unit_table(
1688 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
1689 s.upper(), strict=True)
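# Sketch of expected results (strict parsing, binary multipliers; example strings are assumptions):
#   parse_bytes('500K')   # -> 512000    (500 * 1024)
#   parse_bytes('10.5M')  # -> 11010048  (10.5 * 1024 ** 2)
#   parse_bytes('oops')   # -> None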
fb47597b
S
1690
1691
be64b5b0
PH
1692def parse_filesize(s):
1693 if s is None:
1694 return None
1695
dfb1b146 1696 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
1697 # but we support those too
1698 _UNIT_TABLE = {
1699 'B': 1,
1700 'b': 1,
70852b47 1701 'bytes': 1,
be64b5b0
PH
1702 'KiB': 1024,
1703 'KB': 1000,
1704 'kB': 1024,
1705 'Kb': 1000,
13585d76 1706 'kb': 1000,
70852b47
YCH
1707 'kilobytes': 1000,
1708 'kibibytes': 1024,
be64b5b0
PH
1709 'MiB': 1024 ** 2,
1710 'MB': 1000 ** 2,
1711 'mB': 1024 ** 2,
1712 'Mb': 1000 ** 2,
13585d76 1713 'mb': 1000 ** 2,
70852b47
YCH
1714 'megabytes': 1000 ** 2,
1715 'mebibytes': 1024 ** 2,
be64b5b0
PH
1716 'GiB': 1024 ** 3,
1717 'GB': 1000 ** 3,
1718 'gB': 1024 ** 3,
1719 'Gb': 1000 ** 3,
13585d76 1720 'gb': 1000 ** 3,
70852b47
YCH
1721 'gigabytes': 1000 ** 3,
1722 'gibibytes': 1024 ** 3,
be64b5b0
PH
1723 'TiB': 1024 ** 4,
1724 'TB': 1000 ** 4,
1725 'tB': 1024 ** 4,
1726 'Tb': 1000 ** 4,
13585d76 1727 'tb': 1000 ** 4,
70852b47
YCH
1728 'terabytes': 1000 ** 4,
1729 'tebibytes': 1024 ** 4,
be64b5b0
PH
1730 'PiB': 1024 ** 5,
1731 'PB': 1000 ** 5,
1732 'pB': 1024 ** 5,
1733 'Pb': 1000 ** 5,
13585d76 1734 'pb': 1000 ** 5,
70852b47
YCH
1735 'petabytes': 1000 ** 5,
1736 'pebibytes': 1024 ** 5,
be64b5b0
PH
1737 'EiB': 1024 ** 6,
1738 'EB': 1000 ** 6,
1739 'eB': 1024 ** 6,
1740 'Eb': 1000 ** 6,
13585d76 1741 'eb': 1000 ** 6,
70852b47
YCH
1742 'exabytes': 1000 ** 6,
1743 'exbibytes': 1024 ** 6,
be64b5b0
PH
1744 'ZiB': 1024 ** 7,
1745 'ZB': 1000 ** 7,
1746 'zB': 1024 ** 7,
1747 'Zb': 1000 ** 7,
13585d76 1748 'zb': 1000 ** 7,
70852b47
YCH
1749 'zettabytes': 1000 ** 7,
1750 'zebibytes': 1024 ** 7,
be64b5b0
PH
1751 'YiB': 1024 ** 8,
1752 'YB': 1000 ** 8,
1753 'yB': 1024 ** 8,
1754 'Yb': 1000 ** 8,
13585d76 1755 'yb': 1000 ** 8,
70852b47
YCH
1756 'yottabytes': 1000 ** 8,
1757 'yobibytes': 1024 ** 8,
be64b5b0
PH
1758 }
1759
fb47597b
S
1760 return lookup_unit_table(_UNIT_TABLE, s)
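# Rough expectations (example strings are assumptions; both SI and binary units are accepted):
#   parse_filesize('1.5GB')    # -> 1500000000  (decimal GB)
#   parse_filesize('1,5 MiB')  # -> 1572864     (comma accepted as decimal separator)
#   parse_filesize('unknown')  # -> None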
1761
1762
1763def parse_count(s):
1764 if s is None:
be64b5b0
PH
1765 return None
1766
352d5da8 1767 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
1768
1769 if re.match(r'^[\d,.]+$', s):
1770 return str_to_int(s)
1771
1772 _UNIT_TABLE = {
1773 'k': 1000,
1774 'K': 1000,
1775 'm': 1000 ** 2,
1776 'M': 1000 ** 2,
1777 'kk': 1000 ** 2,
1778 'KK': 1000 ** 2,
352d5da8 1779 'b': 1000 ** 3,
1780 'B': 1000 ** 3,
fb47597b 1781 }
be64b5b0 1782
352d5da8 1783 ret = lookup_unit_table(_UNIT_TABLE, s)
1784 if ret is not None:
1785 return ret
1786
1787 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
1788 if mobj:
1789 return str_to_int(mobj.group(1))
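# Sketch (example strings are assumptions; the suffixes here are decimal):
#   parse_count('1.2M views')  # -> 1200000
#   parse_count('12,345')      # -> 12345
#   parse_count('no views')    # -> None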
be64b5b0 1790
2f7ae819 1791
5d45484c 1792def parse_resolution(s, *, lenient=False):
b871d7e9
S
1793 if s is None:
1794 return {}
1795
5d45484c
LNO
1796 if lenient:
1797 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
1798 else:
1799 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
1800 if mobj:
1801 return {
1802 'width': int(mobj.group('w')),
1803 'height': int(mobj.group('h')),
1804 }
1805
17ec8bcf 1806 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
1807 if mobj:
1808 return {'height': int(mobj.group(1))}
1809
1810 mobj = re.search(r'\b([48])[kK]\b', s)
1811 if mobj:
1812 return {'height': int(mobj.group(1)) * 540}
1813
1814 return {}
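# Expected shapes (inputs are illustrative):
#   parse_resolution('1920x1080')  # -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')       # -> {'height': 720}
#   parse_resolution('4K')         # -> {'height': 2160}
#   parse_resolution('unknown')    # -> {}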
1815
1816
0dc41787 1817def parse_bitrate(s):
14f25df2 1818 if not isinstance(s, str):
0dc41787
S
1819 return
1820 mobj = re.search(r'\b(\d+)\s*kbps', s)
1821 if mobj:
1822 return int(mobj.group(1))
1823
1824
a942d6cb 1825def month_by_name(name, lang='en'):
caefb1de
PH
1826 """ Return the number of a month by (locale-independently) English name """
1827
f6717dec 1828 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 1829
caefb1de 1830 try:
f6717dec 1831 return month_names.index(name) + 1
7105440c
YCH
1832 except ValueError:
1833 return None
1834
1835
1836def month_by_abbreviation(abbrev):
1837 """ Return the number of a month by (locale-independently) English
1838 abbreviations """
1839
1840 try:
1841 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
1842 except ValueError:
1843 return None
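# Examples (values follow from the month tables above):
#   month_by_name('January')          # -> 1
#   month_by_name('juin', lang='fr')  # -> 6
#   month_by_abbreviation('Mar')      # -> 3
#   month_by_abbreviation('???')      # -> None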
18258362
JMF
1844
1845
5aafe895 1846def fix_xml_ampersands(xml_str):
18258362 1847 """Replace every '&' that is not part of an entity with '&amp;' in XML"""
5aafe895
PH
1848 return re.sub(
1849 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 1850 '&amp;',
5aafe895 1851 xml_str)
e3946f98
PH
1852
1853
1854def setproctitle(title):
14f25df2 1855 assert isinstance(title, str)
c1c05c67 1856
fe0918bb 1857 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
1858 try:
1859 import ctypes
1860 except ImportError:
c1c05c67
YCH
1861 return
1862
e3946f98 1863 try:
611c1dd9 1864 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
1865 except OSError:
1866 return
2f49bcd6
RC
1867 except TypeError:
1868 # LoadLibrary in Windows Python 2.7.13 only expects
1869 # a bytestring, but since unicode_literals turns
1870 # every string into a unicode string, it fails.
1871 return
0f06bcd7 1872 title_bytes = title.encode()
6eefe533
PH
1873 buf = ctypes.create_string_buffer(len(title_bytes))
1874 buf.value = title_bytes
e3946f98 1875 try:
6eefe533 1876 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
1877 except AttributeError:
1878 return # Strange libc, just skip this
d7dda168
PH
1879
1880
1881def remove_start(s, start):
46bc9b7d 1882 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
1883
1884
2b9faf55 1885def remove_end(s, end):
46bc9b7d 1886 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
1887
1888
31b2051e
S
1889def remove_quotes(s):
1890 if s is None or len(s) < 2:
1891 return s
1892 for quote in ('"', "'", ):
1893 if s[0] == quote and s[-1] == quote:
1894 return s[1:-1]
1895 return s
1896
1897
b6e0c7d2 1898def get_domain(url):
ebf99aaf 1899 """
1900 This implementation is inconsistent, but is kept for compatibility.
1901 Use this only for "webpage_url_domain"
1902 """
1903 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
b6e0c7d2
U
1904
1905
29eb5174 1906def url_basename(url):
14f25df2 1907 path = urllib.parse.urlparse(url).path
28e614de 1908 return path.strip('/').split('/')[-1]
aa94a6d3
PH
1909
1910
02dc0a36 1911def base_url(url):
7657ec7e 1912 return re.match(r'https?://[^?#]+/', url).group()
02dc0a36
S
1913
1914
e34c3361 1915def urljoin(base, path):
4b5de77b 1916 if isinstance(path, bytes):
0f06bcd7 1917 path = path.decode()
14f25df2 1918 if not isinstance(path, str) or not path:
e34c3361 1919 return None
fad4ceb5 1920 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 1921 return path
4b5de77b 1922 if isinstance(base, bytes):
0f06bcd7 1923 base = base.decode()
14f25df2 1924 if not isinstance(base, str) or not re.match(
4b5de77b 1925 r'^(?:https?:)?//', base):
e34c3361 1926 return None
14f25df2 1927 return urllib.parse.urljoin(base, path)
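# Behaviour sketch (example URLs are assumptions):
#   urljoin('https://example.com/a/', 'b.mp4')                    # -> 'https://example.com/a/b.mp4'
#   urljoin('https://example.com/a/', '//cdn.example.com/b.mp4')  # -> '//cdn.example.com/b.mp4' (already absolute)
#   urljoin('ftp://example.com/', 'b.mp4')                        # -> None (base must be http(s) or protocol-relative)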
e34c3361
S
1928
1929
9732d77e 1930def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
9e907ebd 1931 if get_attr and v is not None:
1932 v = getattr(v, get_attr, None)
1812afb7
S
1933 try:
1934 return int(v) * invscale // scale
31c49255 1935 except (ValueError, TypeError, OverflowError):
af98f8ff 1936 return default
9732d77e 1937
9572013d 1938
40a90862 1939def str_or_none(v, default=None):
14f25df2 1940 return default if v is None else str(v)
40a90862 1941
9732d77e
PH
1942
1943def str_to_int(int_str):
48d4681e 1944 """ A more relaxed version of int_or_none """
f9934b96 1945 if isinstance(int_str, int):
348c6bf1 1946 return int_str
14f25df2 1947 elif isinstance(int_str, str):
42db58ec
S
1948 int_str = re.sub(r'[,\.\+]', '', int_str)
1949 return int_or_none(int_str)
608d11f5
PH
1950
1951
9732d77e 1952def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
1953 if v is None:
1954 return default
1955 try:
1956 return float(v) * invscale / scale
5e1271c5 1957 except (ValueError, TypeError):
caf80631 1958 return default
43f775e4
PH
1959
1960
c7e327c4
S
1961def bool_or_none(v, default=None):
1962 return v if isinstance(v, bool) else default
1963
1964
53cd37ba 1965def strip_or_none(v, default=None):
14f25df2 1966 return v.strip() if isinstance(v, str) else default
b72b4431
S
1967
1968
af03000a 1969def url_or_none(url):
14f25df2 1970 if not url or not isinstance(url, str):
af03000a
S
1971 return None
1972 url = url.strip()
29f7c58a 1973 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
1974
1975
ad54c913 1976def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
e29663c6 1977 datetime_object = None
1978 try:
f9934b96 1979 if isinstance(timestamp, (int, float)): # unix timestamp
d509c1f5 1980 # Using naive datetime here can break timestamp() in Windows
1981 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
a35af430 1982 # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
1983 # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
1984 datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
1985 + datetime.timedelta(seconds=timestamp))
14f25df2 1986 elif isinstance(timestamp, str): # assume YYYYMMDD
e29663c6 1987 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
9665f15a 1988 date_format = re.sub( # Support %s on windows
1989 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
e29663c6 1990 return datetime_object.strftime(date_format)
1991 except (ValueError, TypeError, AttributeError):
1992 return default
1993
1994
608d11f5 1995def parse_duration(s):
f9934b96 1996 if not isinstance(s, str):
608d11f5 1997 return None
ca7b3246 1998 s = s.strip()
38d79fd1 1999 if not s:
2000 return None
ca7b3246 2001
acaff495 2002 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2003 m = re.match(r'''(?x)
2004 (?P<before_secs>
2005 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2006 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2007 (?P<ms>[.:][0-9]+)?Z?$
2008 ''', s)
acaff495 2009 if m:
8bd1c00b 2010 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2011 else:
2012 m = re.match(
056653bb
S
2013 r'''(?ix)(?:P?
2014 (?:
1c1b2f96 2015 [0-9]+\s*y(?:ears?)?,?\s*
056653bb
S
2016 )?
2017 (?:
1c1b2f96 2018 [0-9]+\s*m(?:onths?)?,?\s*
056653bb
S
2019 )?
2020 (?:
1c1b2f96 2021 [0-9]+\s*w(?:eeks?)?,?\s*
056653bb 2022 )?
8f4b58d7 2023 (?:
1c1b2f96 2024 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
8f4b58d7 2025 )?
056653bb 2026 T)?
acaff495 2027 (?:
af868732 2028 (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
acaff495 2029 )?
2030 (?:
1c1b2f96 2031 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
acaff495 2032 )?
2033 (?:
2034 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2035 )?Z?$''', s)
acaff495 2036 if m:
2037 days, hours, mins, secs, ms = m.groups()
2038 else:
15846398 2039 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2040 if m:
2041 hours, mins = m.groups()
2042 else:
2043 return None
2044
acaff495 2045 if ms:
19a03940 2046 ms = ms.replace(':', '.')
2047 return sum(float(part or 0) * mult for part, mult in (
2048 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2049
2050
e65e4c88 2051def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2052 name, real_ext = os.path.splitext(filename)
e65e4c88 2053 return (
86e5f3ed 2054 f'{name}.{ext}{real_ext}'
e65e4c88 2055 if not expected_real_ext or real_ext[1:] == expected_real_ext
86e5f3ed 2056 else f'{filename}.{ext}')
d70ad093
PH
2057
2058
b3ed15b7
S
2059def replace_extension(filename, ext, expected_real_ext=None):
2060 name, real_ext = os.path.splitext(filename)
86e5f3ed 2061 return '{}.{}'.format(
b3ed15b7
S
2062 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2063 ext)
2064
2065
d70ad093
PH
2066def check_executable(exe, args=[]):
2067 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2068 args can be a list of arguments for a short output (like -version) """
2069 try:
f0c9fb96 2070 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
d70ad093
PH
2071 except OSError:
2072 return False
2073 return exe
b7ab0590
PH
2074
2075
7aaf4cd2 2076def _get_exe_version_output(exe, args):
95807118 2077 try:
b64d04c1 2078 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2079 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2080 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
1cdda329 2081 stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
2082 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2083 if ret:
2084 return None
95807118
PH
2085 except OSError:
2086 return False
f0c9fb96 2087 return stdout
cae97f65
PH
2088
2089
2090def detect_exe_version(output, version_re=None, unrecognized='present'):
14f25df2 2091 assert isinstance(output, str)
cae97f65
PH
2092 if version_re is None:
2093 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2094 m = re.search(version_re, output)
95807118
PH
2095 if m:
2096 return m.group(1)
2097 else:
2098 return unrecognized
2099
2100
9af98e17 2101def get_exe_version(exe, args=['--version'],
1cdda329 2102 version_re=None, unrecognized=('present', 'broken')):
9af98e17 2103 """ Returns the version of the specified executable,
2104 or False if the executable is not present """
1cdda329 2105 unrecognized = variadic(unrecognized)
2106 assert len(unrecognized) in (1, 2)
9af98e17 2107 out = _get_exe_version_output(exe, args)
1cdda329 2108 if out is None:
2109 return unrecognized[-1]
2110 return out and detect_exe_version(out, version_re, unrecognized[0])
9af98e17 2111
2112
7e88d7d7 2113def frange(start=0, stop=None, step=1):
2114 """Float range"""
2115 if stop is None:
2116 start, stop = 0, start
2117 sign = [-1, 1][step > 0] if step else 0
2118 while sign * start < sign * stop:
2119 yield start
2120 start += step
2121
2122
cb89cfc1 2123class LazyList(collections.abc.Sequence):
0f06bcd7 2124 """Lazy immutable list from an iterable
2125 Note that slices of a LazyList are lists and not LazyLists"""
483336e7 2126
8e5fecc8 2127 class IndexError(IndexError):
2128 pass
2129
282f5709 2130 def __init__(self, iterable, *, reverse=False, _cache=None):
0f06bcd7 2131 self._iterable = iter(iterable)
2132 self._cache = [] if _cache is None else _cache
2133 self._reversed = reverse
483336e7 2134
2135 def __iter__(self):
0f06bcd7 2136 if self._reversed:
28419ca2 2137 # We need to consume the entire iterable to iterate in reverse
981052c9 2138 yield from self.exhaust()
28419ca2 2139 return
0f06bcd7 2140 yield from self._cache
2141 for item in self._iterable:
2142 self._cache.append(item)
483336e7 2143 yield item
2144
0f06bcd7 2145 def _exhaust(self):
2146 self._cache.extend(self._iterable)
2147 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2148 return self._cache
28419ca2 2149
981052c9 2150 def exhaust(self):
0f06bcd7 2151 """Evaluate the entire iterable"""
2152 return self._exhaust()[::-1 if self._reversed else 1]
981052c9 2153
28419ca2 2154 @staticmethod
0f06bcd7 2155 def _reverse_index(x):
f2df4071 2156 return None if x is None else ~x
483336e7 2157
2158 def __getitem__(self, idx):
2159 if isinstance(idx, slice):
0f06bcd7 2160 if self._reversed:
2161 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
e0f2b4b4 2162 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2163 elif isinstance(idx, int):
0f06bcd7 2164 if self._reversed:
2165 idx = self._reverse_index(idx)
e0f2b4b4 2166 start, stop, step = idx, idx, 0
483336e7 2167 else:
2168 raise TypeError('indices must be integers or slices')
e0f2b4b4 2169 if ((start or 0) < 0 or (stop or 0) < 0
2170 or (start is None and step < 0)
2171 or (stop is None and step > 0)):
483336e7 2172 # We need to consume the entire iterable to be able to slice from the end
2173 # Obviously, never use this with infinite iterables
0f06bcd7 2174 self._exhaust()
8e5fecc8 2175 try:
0f06bcd7 2176 return self._cache[idx]
8e5fecc8 2177 except IndexError as e:
2178 raise self.IndexError(e) from e
0f06bcd7 2179 n = max(start or 0, stop or 0) - len(self._cache) + 1
28419ca2 2180 if n > 0:
0f06bcd7 2181 self._cache.extend(itertools.islice(self._iterable, n))
8e5fecc8 2182 try:
0f06bcd7 2183 return self._cache[idx]
8e5fecc8 2184 except IndexError as e:
2185 raise self.IndexError(e) from e
483336e7 2186
2187 def __bool__(self):
2188 try:
0f06bcd7 2189 self[-1] if self._reversed else self[0]
8e5fecc8 2190 except self.IndexError:
483336e7 2191 return False
2192 return True
2193
2194 def __len__(self):
0f06bcd7 2195 self._exhaust()
2196 return len(self._cache)
483336e7 2197
282f5709 2198 def __reversed__(self):
0f06bcd7 2199 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
282f5709 2200
2201 def __copy__(self):
0f06bcd7 2202 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
282f5709 2203
28419ca2 2204 def __repr__(self):
2205 # repr and str should mimic a list. So we exhaust the iterable
2206 return repr(self.exhaust())
2207
2208 def __str__(self):
2209 return repr(self.exhaust())
2210
483336e7 2211
7be9ccff 2212class PagedList:
c07a39ae 2213
2214 class IndexError(IndexError):
2215 pass
2216
dd26ced1
PH
2217 def __len__(self):
2218 # This is only useful for tests
2219 return len(self.getslice())
2220
7be9ccff 2221 def __init__(self, pagefunc, pagesize, use_cache=True):
2222 self._pagefunc = pagefunc
2223 self._pagesize = pagesize
f1d13090 2224 self._pagecount = float('inf')
7be9ccff 2225 self._use_cache = use_cache
2226 self._cache = {}
2227
2228 def getpage(self, pagenum):
d8cf8d97 2229 page_results = self._cache.get(pagenum)
2230 if page_results is None:
f1d13090 2231 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
7be9ccff 2232 if self._use_cache:
2233 self._cache[pagenum] = page_results
2234 return page_results
2235
2236 def getslice(self, start=0, end=None):
2237 return list(self._getslice(start, end))
2238
2239 def _getslice(self, start, end):
55575225 2240 raise NotImplementedError('This method must be implemented by subclasses')
2241
2242 def __getitem__(self, idx):
f1d13090 2243 assert self._use_cache, 'Indexing PagedList requires cache'
55575225 2244 if not isinstance(idx, int) or idx < 0:
2245 raise TypeError('indices must be non-negative integers')
2246 entries = self.getslice(idx, idx + 1)
d8cf8d97 2247 if not entries:
c07a39ae 2248 raise self.IndexError()
d8cf8d97 2249 return entries[0]
55575225 2250
9c44d242
PH
2251
2252class OnDemandPagedList(PagedList):
a44ca5a4 2253 """Download pages until a page with fewer than the maximum number of results"""
86e5f3ed 2254
7be9ccff 2255 def _getslice(self, start, end):
b7ab0590
PH
2256 for pagenum in itertools.count(start // self._pagesize):
2257 firstid = pagenum * self._pagesize
2258 nextfirstid = pagenum * self._pagesize + self._pagesize
2259 if start >= nextfirstid:
2260 continue
2261
b7ab0590
PH
2262 startv = (
2263 start % self._pagesize
2264 if firstid <= start < nextfirstid
2265 else 0)
b7ab0590
PH
2266 endv = (
2267 ((end - 1) % self._pagesize) + 1
2268 if (end is not None and firstid <= end <= nextfirstid)
2269 else None)
2270
f1d13090 2271 try:
2272 page_results = self.getpage(pagenum)
2273 except Exception:
2274 self._pagecount = pagenum - 1
2275 raise
b7ab0590
PH
2276 if startv != 0 or endv is not None:
2277 page_results = page_results[startv:endv]
7be9ccff 2278 yield from page_results
b7ab0590
PH
2279
2280 # A little optimization - if the current page is not "full", i.e. does
2281 # not contain page_size videos, then we can assume that this page
2282 # is the last one - there are no more ids on further pages -
2283 # i.e. no need to query again.
2284 if len(page_results) + startv < self._pagesize:
2285 break
2286
2287 # If we got the whole page, but the next page is not interesting,
2288 # break out early as well
2289 if end == nextfirstid:
2290 break
81c2f20b
PH
2291
2292
9c44d242 2293class InAdvancePagedList(PagedList):
a44ca5a4 2294 """PagedList with total number of pages known in advance"""
86e5f3ed 2295
9c44d242 2296 def __init__(self, pagefunc, pagecount, pagesize):
7be9ccff 2297 PagedList.__init__(self, pagefunc, pagesize, True)
f1d13090 2298 self._pagecount = pagecount
9c44d242 2299
7be9ccff 2300 def _getslice(self, start, end):
9c44d242 2301 start_page = start // self._pagesize
d37707bd 2302 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
9c44d242
PH
2303 skip_elems = start - start_page * self._pagesize
2304 only_more = None if end is None else end - start
2305 for pagenum in range(start_page, end_page):
7be9ccff 2306 page_results = self.getpage(pagenum)
9c44d242 2307 if skip_elems:
7be9ccff 2308 page_results = page_results[skip_elems:]
9c44d242
PH
2309 skip_elems = None
2310 if only_more is not None:
7be9ccff 2311 if len(page_results) < only_more:
2312 only_more -= len(page_results)
9c44d242 2313 else:
7be9ccff 2314 yield from page_results[:only_more]
9c44d242 2315 break
7be9ccff 2316 yield from page_results
9c44d242
PH
2317
2318
7e88d7d7 2319class PlaylistEntries:
2320 MissingEntry = object()
2321 is_exhausted = False
2322
2323 def __init__(self, ydl, info_dict):
7e9a6125 2324 self.ydl = ydl
2325
2326 # _entries must be assigned now since infodict can change during iteration
2327 entries = info_dict.get('entries')
2328 if entries is None:
2329 raise EntryNotInPlaylist('There are no entries')
2330 elif isinstance(entries, list):
2331 self.is_exhausted = True
2332
2333 requested_entries = info_dict.get('requested_entries')
bc5c2f8a 2334 self.is_incomplete = requested_entries is not None
7e9a6125 2335 if self.is_incomplete:
2336 assert self.is_exhausted
bc5c2f8a 2337 self._entries = [self.MissingEntry] * max(requested_entries or [0])
7e9a6125 2338 for i, entry in zip(requested_entries, entries):
2339 self._entries[i - 1] = entry
2340 elif isinstance(entries, (list, PagedList, LazyList)):
2341 self._entries = entries
2342 else:
2343 self._entries = LazyList(entries)
7e88d7d7 2344
2345 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2346 (?P<start>[+-]?\d+)?
2347 (?P<range>[:-]
2348 (?P<end>[+-]?\d+|inf(?:inite)?)?
2349 (?::(?P<step>[+-]?\d+))?
2350 )?''')
2351
2352 @classmethod
2353 def parse_playlist_items(cls, string):
2354 for segment in string.split(','):
2355 if not segment:
2356 raise ValueError('There are two or more consecutive commas')
2357 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2358 if not mobj:
2359 raise ValueError(f'{segment!r} is not a valid specification')
2360 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2361 if int_or_none(step) == 0:
2362 raise ValueError(f'Step in {segment!r} cannot be zero')
2363 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2364
2365 def get_requested_items(self):
2366 playlist_items = self.ydl.params.get('playlist_items')
2367 playlist_start = self.ydl.params.get('playliststart', 1)
2368 playlist_end = self.ydl.params.get('playlistend')
2369 # For backwards compatibility, interpret -1 as whole list
2370 if playlist_end in (-1, None):
2371 playlist_end = ''
2372 if not playlist_items:
2373 playlist_items = f'{playlist_start}:{playlist_end}'
2374 elif playlist_start != 1 or playlist_end:
2375 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2376
2377 for index in self.parse_playlist_items(playlist_items):
2378 for i, entry in self[index]:
2379 yield i, entry
1ac4fd80 2380 if not entry:
2381 continue
7e88d7d7 2382 try:
d21056f4 2383 # The item may have just been added to archive. Don't break due to it
2384 if not self.ydl.params.get('lazy_playlist'):
2385 # TODO: Add auto-generated fields
2386 self.ydl._match_entry(entry, incomplete=True, silent=True)
7e88d7d7 2387 except (ExistingVideoReached, RejectedVideoReached):
2388 return
2389
7e9a6125 2390 def get_full_count(self):
2391 if self.is_exhausted and not self.is_incomplete:
7e88d7d7 2392 return len(self)
2393 elif isinstance(self._entries, InAdvancePagedList):
2394 if self._entries._pagesize == 1:
2395 return self._entries._pagecount
2396
7e88d7d7 2397 @functools.cached_property
2398 def _getter(self):
2399 if isinstance(self._entries, list):
2400 def get_entry(i):
2401 try:
2402 entry = self._entries[i]
2403 except IndexError:
2404 entry = self.MissingEntry
2405 if not self.is_incomplete:
2406 raise self.IndexError()
2407 if entry is self.MissingEntry:
bc5c2f8a 2408 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
7e88d7d7 2409 return entry
2410 else:
2411 def get_entry(i):
2412 try:
2413 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2414 except (LazyList.IndexError, PagedList.IndexError):
2415 raise self.IndexError()
2416 return get_entry
2417
2418 def __getitem__(self, idx):
2419 if isinstance(idx, int):
2420 idx = slice(idx, idx)
2421
2422 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2423 step = 1 if idx.step is None else idx.step
2424 if idx.start is None:
2425 start = 0 if step > 0 else len(self) - 1
2426 else:
2427 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2428
2429 # NB: Do not call len(self) when idx == [:]
2430 if idx.stop is None:
2431 stop = 0 if step < 0 else float('inf')
2432 else:
2433 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2434 stop += [-1, 1][step > 0]
2435
2436 for i in frange(start, stop, step):
2437 if i < 0:
2438 continue
2439 try:
7e9a6125 2440 entry = self._getter(i)
2441 except self.IndexError:
2442 self.is_exhausted = True
2443 if step > 0:
7e88d7d7 2444 break
7e9a6125 2445 continue
7e88d7d7 2446 yield i + 1, entry
2447
2448 def __len__(self):
2449 return len(tuple(self[:]))
2450
2451 class IndexError(IndexError):
2452 pass
2453
2454
81c2f20b 2455def uppercase_escape(s):
676eb3f2 2456 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2457 return re.sub(
a612753d 2458 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2459 lambda m: unicode_escape(m.group(0))[0],
2460 s)
0fe2ff78
YCH
2461
2462
2463def lowercase_escape(s):
2464 unicode_escape = codecs.getdecoder('unicode_escape')
2465 return re.sub(
2466 r'\\u[0-9a-fA-F]{4}',
2467 lambda m: unicode_escape(m.group(0))[0],
2468 s)
b53466e1 2469
d05cfe06 2470
96b9e9cf 2471def parse_qs(url, **kwargs):
2472 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
4dfbf869 2473
2474
62e609ab
PH
2475def read_batch_urls(batch_fd):
2476 def fixup(url):
14f25df2 2477 if not isinstance(url, str):
62e609ab 2478 url = url.decode('utf-8', 'replace')
8c04f0be 2479 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2480 for bom in BOM_UTF8:
2481 if url.startswith(bom):
2482 url = url[len(bom):]
2483 url = url.lstrip()
2484 if not url or url.startswith(('#', ';', ']')):
62e609ab 2485 return False
8c04f0be 2486 # "#" cannot be stripped out since it is part of the URI
962ffcf8 2487 # However, it can be safely stripped out if it follows a whitespace
8c04f0be 2488 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
2489
2490 with contextlib.closing(batch_fd) as fd:
2491 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2492
2493
2494def urlencode_postdata(*args, **kargs):
14f25df2 2495 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2496
2497
45b2ee6f 2498def update_url(url, *, query_update=None, **kwargs):
2499 """Replace URL components specified by kwargs
2500 @param url str or parse url tuple
2501 @param query_update update query
2502 @returns str
2503 """
2504 if isinstance(url, str):
2505 if not kwargs and not query_update:
2506 return url
2507 else:
2508 url = urllib.parse.urlparse(url)
2509 if query_update:
2510 assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
2511 kwargs['query'] = urllib.parse.urlencode({
2512 **urllib.parse.parse_qs(url.query),
2513 **query_update
2514 }, True)
2515 return urllib.parse.urlunparse(url._replace(**kwargs))
2516
2517
38f9ef31 2518def update_url_query(url, query):
45b2ee6f 2519 return update_url(url, query_update=query)
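# Sketch (example URLs are assumptions):
#   update_url('https://example.com/watch?a=1', query_update={'b': '2'})
#   # -> 'https://example.com/watch?a=1&b=2'
#   update_url('https://example.com/watch?a=1', netloc='cdn.example.com')
#   # -> 'https://cdn.example.com/watch?a=1'
#   update_url_query('https://example.com/watch', {'v': 'abc'})
#   # -> 'https://example.com/watch?v=abc'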
16392824 2520
8e60dc75 2521
10c87c15 2522def _multipart_encode_impl(data, boundary):
0c265486
YCH
2523 content_type = 'multipart/form-data; boundary=%s' % boundary
2524
2525 out = b''
2526 for k, v in data.items():
2527 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 2528 if isinstance(k, str):
0f06bcd7 2529 k = k.encode()
14f25df2 2530 if isinstance(v, str):
0f06bcd7 2531 v = v.encode()
0c265486
YCH
2532 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2533 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 2534 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
2535 if boundary.encode('ascii') in content:
2536 raise ValueError('Boundary overlaps with data')
2537 out += content
2538
2539 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2540
2541 return out, content_type
2542
2543
2544def multipart_encode(data, boundary=None):
2545 '''
2546 Encode a dict to RFC 7578-compliant form-data
2547
2548 data:
2549 A dict where keys and values can be either Unicode or bytes-like
2550 objects.
2551 boundary:
2552 If specified as a Unicode object, it is used as the boundary. Otherwise
2553 a random boundary is generated.
2554
2555 Reference: https://tools.ietf.org/html/rfc7578
2556 '''
2557 has_specified_boundary = boundary is not None
2558
2559 while True:
2560 if boundary is None:
2561 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2562
2563 try:
10c87c15 2564 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
2565 break
2566 except ValueError:
2567 if has_specified_boundary:
2568 raise
2569 boundary = None
2570
2571 return out, content_type
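# Sketch (field names and values are made up; the boundary is random unless given):
#   body, content_type = multipart_encode({'username': 'user', b'file': b'\x00\x01'})
#   content_type  # -> 'multipart/form-data; boundary=---------------<random digits>'
#   # body is the ready-to-send bytes payload, terminated by the closing boundary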
2572
2573
b079c26f
SS
2574def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
2575 if blocked_types is NO_DEFAULT:
2576 blocked_types = (str, bytes, collections.abc.Mapping)
2577 return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
2578
2579
2580def variadic(x, allowed_types=NO_DEFAULT):
4823ec9f 2581 if not isinstance(allowed_types, (tuple, type)):
2582 deprecation_warning('allowed_types should be a tuple or a type')
2583 allowed_types = tuple(allowed_types)
6f2287cb 2584 return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
304ad45a 2585
2586
c4f60dd7 2587def try_call(*funcs, expected_type=None, args=[], kwargs={}):
2588 for f in funcs:
a32a9a7e 2589 try:
c4f60dd7 2590 val = f(*args, **kwargs)
ab029d7e 2591 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
a32a9a7e
S
2592 pass
2593 else:
c4f60dd7 2594 if expected_type is None or isinstance(val, expected_type):
2595 return val
2596
2597
2598def try_get(src, getter, expected_type=None):
2599 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
329ca3be
S
2600
2601
90137ca4 2602def filter_dict(dct, cndn=lambda _, v: v is not None):
2603 return {k: v for k, v in dct.items() if cndn(k, v)}
2604
2605
6cc62232
S
2606def merge_dicts(*dicts):
2607 merged = {}
2608 for a_dict in dicts:
2609 for k, v in a_dict.items():
90137ca4 2610 if (v is not None and k not in merged
2611 or isinstance(v, str) and merged[k] == ''):
6cc62232
S
2612 merged[k] = v
2613 return merged
2614
2615
8e60dc75 2616def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
14f25df2 2617 return string if isinstance(string, str) else str(string, encoding, errors)
8e60dc75 2618
16392824 2619
a1a530b0
PH
2620US_RATINGS = {
2621 'G': 0,
2622 'PG': 10,
2623 'PG-13': 13,
2624 'R': 16,
2625 'NC': 18,
2626}
fac55558
PH
2627
2628
a8795327 2629TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
2630 'TV-Y': 0,
2631 'TV-Y7': 7,
2632 'TV-G': 0,
2633 'TV-PG': 0,
2634 'TV-14': 14,
2635 'TV-MA': 17,
a8795327
S
2636}
2637
2638
146c80e2 2639def parse_age_limit(s):
19a03940 2640 # isinstance(False, int) is True. So type() must be used instead
c487cf00 2641 if type(s) is int: # noqa: E721
a8795327 2642 return s if 0 <= s <= 21 else None
19a03940 2643 elif not isinstance(s, str):
d838b1bd 2644 return None
146c80e2 2645 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
2646 if m:
2647 return int(m.group('age'))
5c5fae6d 2648 s = s.upper()
a8795327
S
2649 if s in US_RATINGS:
2650 return US_RATINGS[s]
5a16c9d9 2651 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 2652 if m:
5a16c9d9 2653 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 2654 return None
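# Examples of accepted forms (inputs are illustrative):
#   parse_age_limit(16)       # -> 16
#   parse_age_limit('18+')    # -> 18
#   parse_age_limit('PG-13')  # -> 13
#   parse_age_limit('TV-MA')  # -> 17
#   parse_age_limit('N/A')    # -> None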
146c80e2
S
2655
2656
fac55558 2657def strip_jsonp(code):
609a61e3 2658 return re.sub(
5552c9eb 2659 r'''(?sx)^
e9c671d5 2660 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
2661 (?:\s*&&\s*(?P=func_name))?
2662 \s*\(\s*(?P<callback_data>.*)\);?
2663 \s*?(?://[^\n]*)*$''',
2664 r'\g<callback_data>', code)
478c2c61
PH
2665
2666
8f53dc44 2667def js_to_json(code, vars={}, *, strict=False):
5c610515 2668 # vars is a dict of var, val pairs to substitute
0898c5c8 2669 STRING_QUOTES = '\'"`'
a71b812f 2670 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
c843e685 2671 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
86e5f3ed 2672 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
4195096e 2673 INTEGER_TABLE = (
86e5f3ed 2674 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
2675 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
4195096e
S
2676 )
2677
a71b812f
SS
2678 def process_escape(match):
2679 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
2680 escape = match.group(1) or match.group(2)
2681
2682 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
2683 else R'\u00' if escape == 'x'
2684 else '' if escape == '\n'
2685 else escape)
2686
0898c5c8
SS
2687 def template_substitute(match):
2688 evaluated = js_to_json(match.group(1), vars, strict=strict)
2689 if evaluated[0] == '"':
2690 return json.loads(evaluated)
2691 return evaluated
2692
e05f6939 2693 def fix_kv(m):
e7b6d122
PH
2694 v = m.group(0)
2695 if v in ('true', 'false', 'null'):
2696 return v
421ddcb8
C
2697 elif v in ('undefined', 'void 0'):
2698 return 'null'
8bdd16b4 2699 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
a71b812f
SS
2700 return ''
2701
2702 if v[0] in STRING_QUOTES:
0898c5c8
SS
2703 v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
2704 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
a71b812f
SS
2705 return f'"{escaped}"'
2706
2707 for regex, base in INTEGER_TABLE:
2708 im = re.match(regex, v)
2709 if im:
2710 i = int(im.group(1), base)
2711 return f'"{i}":' if v.endswith(':') else str(i)
2712
2713 if v in vars:
d5f043d1
C
2714 try:
2715 if not strict:
2716 json.loads(vars[v])
08e29b9f 2717 except json.JSONDecodeError:
d5f043d1
C
2718 return json.dumps(vars[v])
2719 else:
2720 return vars[v]
89ac4a19 2721
a71b812f
SS
2722 if not strict:
2723 return f'"{v}"'
5c610515 2724
a71b812f 2725 raise ValueError(f'Unknown value: {v}')
e05f6939 2726
8072ef2b 2727 def create_map(mobj):
2728 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
2729
8072ef2b 2730 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
8f53dc44 2731 if not strict:
2732 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
f55523cf 2733 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
389896df 2734 code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
2735 code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
febff4c1 2736
a71b812f
SS
2737 return re.sub(rf'''(?sx)
2738 {STRING_RE}|
2739 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
421ddcb8 2740 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
a71b812f
SS
2741 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
2742 [0-9]+(?={SKIP_RE}:)|
8bdd16b4 2743 !+
a71b812f 2744 ''', fix_kv, code)
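# Sketch of the transformation (the JS snippets are made-up examples):
#   js_to_json('{abc: 0x10, str: "a", flag: true, /* comment */ }')
#   # -> a string that json.loads() accepts, equivalent to {"abc": 16, "str": "a", "flag": true}
#   js_to_json('{duration: videoSeconds}', vars={'videoSeconds': '120'})
#   # -> '{"duration": 120}'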
e05f6939
PH
2745
2746
478c2c61
PH
2747def qualities(quality_ids):
2748 """ Get a numeric quality value out of a list of possible values """
2749 def q(qid):
2750 try:
2751 return quality_ids.index(qid)
2752 except ValueError:
2753 return -1
2754 return q
2755
acd69589 2756
119e40ef 2757POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
1e43a6f7 2758
2759
de6000d9 2760DEFAULT_OUTTMPL = {
2761 'default': '%(title)s [%(id)s].%(ext)s',
72755351 2762 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 2763}
2764OUTTMPL_TYPES = {
72755351 2765 'chapter': None,
de6000d9 2766 'subtitle': None,
2767 'thumbnail': None,
2768 'description': 'description',
2769 'annotation': 'annotations.xml',
2770 'infojson': 'info.json',
08438d2c 2771 'link': None,
3b603dbd 2772 'pl_video': None,
5112f26a 2773 'pl_thumbnail': None,
de6000d9 2774 'pl_description': 'description',
2775 'pl_infojson': 'info.json',
2776}
0a871f68 2777
143db31d 2778# As of [1] format syntax is:
2779# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
2780# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 2781STR_FORMAT_RE_TMPL = r'''(?x)
2782 (?<!%)(?P<prefix>(?:%%)*)
143db31d 2783 %
524e2e4f 2784 (?P<has_key>\((?P<key>{0})\))?
752cda38 2785 (?P<format>
524e2e4f 2786 (?P<conversion>[#0\-+ ]+)?
2787 (?P<min_width>\d+)?
2788 (?P<precision>\.\d+)?
2789 (?P<len_mod>[hlL])? # unused in python
901130bb 2790 {1} # conversion type
752cda38 2791 )
143db31d 2792'''
2793
7d1eb38a 2794
ebe1b4e3 2795STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
a020a0dc 2796
7d1eb38a 2797
a020a0dc
PH
2798def limit_length(s, length):
2799 """ Add ellipses to overly long strings """
2800 if s is None:
2801 return None
2802 ELLIPSES = '...'
2803 if len(s) > length:
2804 return s[:length - len(ELLIPSES)] + ELLIPSES
2805 return s
48844745
PH
2806
2807
2808def version_tuple(v):
5f9b8394 2809 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
2810
2811
2812def is_outdated_version(version, limit, assume_new=True):
2813 if not version:
2814 return not assume_new
2815 try:
2816 return version_tuple(version) < version_tuple(limit)
2817 except ValueError:
2818 return not assume_new
732ea2f0
PH
2819
2820
2821def ytdl_is_updateable():
7a5c1cfe 2822 """ Returns whether yt-dlp can be updated with -U """
735d865e 2823
69bec673 2824 from ..update import is_non_updateable
732ea2f0 2825
5d535b4a 2826 return not is_non_updateable()
7d4111ed
PH
2827
2828
2829def args_to_str(args):
2830 # Get a short string representation for a subprocess command
702ccf2d 2831 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
2832
2833
a44ca5a4 2834def error_to_str(err):
2835 return f'{type(err).__name__}: {err}'
2836
2837
2647c933 2838def mimetype2ext(mt, default=NO_DEFAULT):
2839 if not isinstance(mt, str):
2840 if default is not NO_DEFAULT:
2841 return default
eb9ee194
S
2842 return None
2843
2647c933 2844 MAP = {
2845 # video
f6861ec9 2846 '3gpp': '3gp',
2647c933 2847 'mp2t': 'ts',
2848 'mp4': 'mp4',
2849 'mpeg': 'mpeg',
2850 'mpegurl': 'm3u8',
2851 'quicktime': 'mov',
2852 'webm': 'webm',
2853 'vp9': 'vp9',
f659e643 2854 'video/ogg': 'ogv',
f6861ec9 2855 'x-flv': 'flv',
2647c933 2856 'x-m4v': 'm4v',
2857 'x-matroska': 'mkv',
2858 'x-mng': 'mng',
a0d8d704 2859 'x-mp4-fragmented': 'mp4',
2647c933 2860 'x-ms-asf': 'asf',
a0d8d704 2861 'x-ms-wmv': 'wmv',
2647c933 2862 'x-msvideo': 'avi',
2863
2864 # application (streaming playlists)
b4173f15 2865 'dash+xml': 'mpd',
b4173f15 2866 'f4m+xml': 'f4m',
f164b971 2867 'hds+xml': 'f4m',
2647c933 2868 'vnd.apple.mpegurl': 'm3u8',
e910fe2f 2869 'vnd.ms-sstr+xml': 'ism',
2647c933 2870 'x-mpegurl': 'm3u8',
2871
2872 # audio
2873 'audio/mp4': 'm4a',
2874 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
2875 # Using .mp3 as it's the most popular one
2876 'audio/mpeg': 'mp3',
d80ca5de 2877 'audio/webm': 'webm',
2647c933 2878 'audio/x-matroska': 'mka',
2879 'audio/x-mpegurl': 'm3u',
2880 'midi': 'mid',
2881 'ogg': 'ogg',
2882 'wav': 'wav',
2883 'wave': 'wav',
2884 'x-aac': 'aac',
2885 'x-flac': 'flac',
2886 'x-m4a': 'm4a',
2887 'x-realaudio': 'ra',
39e7107d 2888 'x-wav': 'wav',
9359f3d4 2889
2647c933 2890 # image
2891 'avif': 'avif',
2892 'bmp': 'bmp',
2893 'gif': 'gif',
2894 'jpeg': 'jpg',
2895 'png': 'png',
2896 'svg+xml': 'svg',
2897 'tiff': 'tif',
2898 'vnd.wap.wbmp': 'wbmp',
2899 'webp': 'webp',
2900 'x-icon': 'ico',
2901 'x-jng': 'jng',
2902 'x-ms-bmp': 'bmp',
2903
2904 # caption
2905 'filmstrip+json': 'fs',
2906 'smptett+xml': 'tt',
2907 'ttaf+xml': 'dfxp',
2908 'ttml+xml': 'ttml',
2909 'x-ms-sami': 'sami',
9359f3d4 2910
2647c933 2911 # misc
2912 'gzip': 'gz',
9359f3d4
F
2913 'json': 'json',
2914 'xml': 'xml',
2915 'zip': 'zip',
9359f3d4
F
2916 }
2917
2647c933 2918 mimetype = mt.partition(';')[0].strip().lower()
2919 _, _, subtype = mimetype.rpartition('/')
9359f3d4 2920
69bec673 2921 ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
2647c933 2922 if ext:
2923 return ext
2924 elif default is not NO_DEFAULT:
2925 return default
9359f3d4 2926 return subtype.replace('+', '.')
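# Expected mappings (the MIME strings are examples):
#   mimetype2ext('video/mp4')                    # -> 'mp4'
#   mimetype2ext('application/x-mpegurl')        # -> 'm3u8'
#   mimetype2ext('audio/mpeg;codecs="mp3"')      # -> 'mp3'  (parameters are ignored)
#   mimetype2ext('text/vtt')                     # -> 'vtt'  (fallback: the subtype itself)
#   mimetype2ext(None, default='unknown_video')  # -> 'unknown_video'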
c460bdd5
PH
2927
2928
2814f12b
THD
2929def ext2mimetype(ext_or_url):
2930 if not ext_or_url:
2931 return None
2932 if '.' not in ext_or_url:
2933 ext_or_url = f'file.{ext_or_url}'
2934 return mimetypes.guess_type(ext_or_url)[0]
2935
2936
4f3c5e06 2937def parse_codecs(codecs_str):
2938 # http://tools.ietf.org/html/rfc6381
2939 if not codecs_str:
2940 return {}
a0566bbf 2941 split_codecs = list(filter(None, map(
dbf5416a 2942 str.strip, codecs_str.strip().strip(',').split(','))))
3fe75fdc 2943 vcodec, acodec, scodec, hdr = None, None, None, None
a0566bbf 2944 for full_codec in split_codecs:
d816f61f 2945 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
2946 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
2947 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
2948 if vcodec:
2949 continue
2950 vcodec = full_codec
2951 if parts[0] in ('dvh1', 'dvhe'):
2952 hdr = 'DV'
69bec673 2953 elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
d816f61f 2954 hdr = 'HDR10'
2955 elif parts[:2] == ['vp9', '2']:
2956 hdr = 'HDR10'
71082216 2957 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
d816f61f 2958 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
2959 acodec = acodec or full_codec
2960 elif parts[0] in ('stpp', 'wvtt'):
2961 scodec = scodec or full_codec
4f3c5e06 2962 else:
19a03940 2963 write_string(f'WARNING: Unknown codec {full_codec}\n')
3fe75fdc 2964 if vcodec or acodec or scodec:
4f3c5e06 2965 return {
2966 'vcodec': vcodec or 'none',
2967 'acodec': acodec or 'none',
176f1866 2968 'dynamic_range': hdr,
3fe75fdc 2969 **({'scodec': scodec} if scodec is not None else {}),
4f3c5e06 2970 }
b69fd25c 2971 elif len(split_codecs) == 2:
2972 return {
2973 'vcodec': split_codecs[0],
2974 'acodec': split_codecs[1],
2975 }
4f3c5e06 2976 return {}
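# Typical result shape (codec strings resemble those found in common manifests):
#   parse_codecs('avc1.64001f, mp4a.40.2')
#   # -> {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
#   parse_codecs('dvh1.05.01')
#   # -> {'vcodec': 'dvh1.05.01', 'acodec': 'none', 'dynamic_range': 'DV'}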
2977
2978
fc61aff4
LL
2979def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
2980 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
2981
2982 allow_mkv = not preferences or 'mkv' in preferences
2983
2984 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
2985 return 'mkv' # TODO: any other format allows this?
2986
2987 # TODO: Not all codecs supported by parse_codecs are handled here
2988 COMPATIBLE_CODECS = {
2989 'mp4': {
71082216 2990 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
81b6102d 2991 'h264', 'aacl', 'ec-3', # Set in ISM
fc61aff4
LL
2992 },
2993 'webm': {
2994 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
2995 'vp9x', 'vp8x', # in the webm spec
2996 },
2997 }
2998
812cdfa0 2999 sanitize_codec = functools.partial(
3000 try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
8f84770a 3001 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
fc61aff4
LL
3002
3003 for ext in preferences or COMPATIBLE_CODECS.keys():
3004 codec_set = COMPATIBLE_CODECS.get(ext, set())
3005 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3006 return ext
3007
3008 COMPATIBLE_EXTS = (
3009 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
fbb73833 3010 {'webm', 'weba'},
fc61aff4
LL
3011 )
3012 for ext in preferences or vexts:
3013 current_exts = {ext, *vexts, *aexts}
3014 if ext == 'mkv' or current_exts == {ext} or any(
3015 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3016 return ext
3017 return 'mkv' if allow_mkv else preferences[-1]
3018
3019
2647c933 3020def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
79298173 3021 getheader = url_handle.headers.get
2ccd1b10 3022
b55ee18f
PH
3023 cd = getheader('Content-Disposition')
3024 if cd:
3025 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3026 if m:
3027 e = determine_ext(m.group('filename'), default_ext=None)
3028 if e:
3029 return e
3030
2647c933 3031 meta_ext = getheader('x-amz-meta-name')
3032 if meta_ext:
3033 e = meta_ext.rpartition('.')[2]
3034 if e:
3035 return e
3036
3037 return mimetype2ext(getheader('Content-Type'), default=default)
05900629
PH
3038
3039
1e399778
YCH
3040def encode_data_uri(data, mime_type):
3041 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3042
3043
05900629 3044def age_restricted(content_limit, age_limit):
6ec6cb4e 3045 """ Returns True iff the content should be blocked """
05900629
PH
3046
3047 if age_limit is None: # No limit set
3048 return False
3049 if content_limit is None:
3050 return False # Content available for everyone
3051 return age_limit < content_limit
61ca9a80
PH
3052
3053
88f60feb 3054# List of known byte-order-marks (BOM)
a904a7f8
L
3055BOMS = [
3056 (b'\xef\xbb\xbf', 'utf-8'),
3057 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3058 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3059 (b'\xff\xfe', 'utf-16-le'),
3060 (b'\xfe\xff', 'utf-16-be'),
3061]
a904a7f8
L
3062
3063
61ca9a80
PH
3064def is_html(first_bytes):
3065 """ Detect whether a file contains HTML by examining its first bytes. """
3066
80e8493e 3067 encoding = 'utf-8'
61ca9a80 3068 for bom, enc in BOMS:
80e8493e 3069 while first_bytes.startswith(bom):
3070 encoding, first_bytes = enc, first_bytes[len(bom):]
61ca9a80 3071
80e8493e 3072 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
a055469f
PH
3073
3074
3075def determine_protocol(info_dict):
3076 protocol = info_dict.get('protocol')
3077 if protocol is not None:
3078 return protocol
3079
7de837a5 3080 url = sanitize_url(info_dict['url'])
a055469f
PH
3081 if url.startswith('rtmp'):
3082 return 'rtmp'
3083 elif url.startswith('mms'):
3084 return 'mms'
3085 elif url.startswith('rtsp'):
3086 return 'rtsp'
3087
3088 ext = determine_ext(url)
3089 if ext == 'm3u8':
deae7c17 3090 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
a055469f
PH
3091 elif ext == 'f4m':
3092 return 'f4m'
3093
14f25df2 3094 return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3095
3096
c5e3f849 3097def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3098 """ Render a list of rows, each as a list of values.
3099 Text after a \t will be right-aligned """
ec11a9f4 3100 def width(string):
c5e3f849 3101 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3102
3103 def get_max_lens(table):
ec11a9f4 3104 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3105
3106 def filter_using_list(row, filterArray):
d16df59d 3107 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
76d321f6 3108
d16df59d 3109 max_lens = get_max_lens(data) if hide_empty else []
3110 header_row = filter_using_list(header_row, max_lens)
3111 data = [filter_using_list(row, max_lens) for row in data]
76d321f6 3112
cfb56d1a 3113 table = [header_row] + data
76d321f6 3114 max_lens = get_max_lens(table)
c5e3f849 3115 extra_gap += 1
76d321f6 3116 if delim:
c5e3f849 3117 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
1ed7953a 3118 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
ec11a9f4 3119 for row in table:
3120 for pos, text in enumerate(map(str, row)):
c5e3f849 3121 if '\t' in text:
3122 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3123 else:
3124 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3125 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3126 return ret
347de493
PH
3127
3128
8f18aca8 3129def _match_one(filter_part, dct, incomplete):
77b87f05 3130 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3131 STRING_OPERATORS = {
3132 '*=': operator.contains,
3133 '^=': lambda attr, value: attr.startswith(value),
3134 '$=': lambda attr, value: attr.endswith(value),
3135 '~=': lambda attr, value: re.search(value, attr),
3136 }
347de493 3137 COMPARISON_OPERATORS = {
a047eeb6 3138 **STRING_OPERATORS,
3139 '<=': operator.le, # "<=" must be defined above "<"
347de493 3140 '<': operator.lt,
347de493 3141 '>=': operator.ge,
a047eeb6 3142 '>': operator.gt,
347de493 3143 '=': operator.eq,
347de493 3144 }
a047eeb6 3145
6db9c4d5 3146 if isinstance(incomplete, bool):
3147 is_incomplete = lambda _: incomplete
3148 else:
3149 is_incomplete = lambda k: k in incomplete
3150
64fa820c 3151 operator_rex = re.compile(r'''(?x)
347de493 3152 (?P<key>[a-z_]+)
77b87f05 3153 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3154 (?:
a047eeb6 3155 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3156 (?P<strval>.+?)
347de493 3157 )
347de493 3158 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3159 m = operator_rex.fullmatch(filter_part.strip())
347de493 3160 if m:
18f96d12 3161 m = m.groupdict()
3162 unnegated_op = COMPARISON_OPERATORS[m['op']]
3163 if m['negation']:
77b87f05
MT
3164 op = lambda attr, value: not unnegated_op(attr, value)
3165 else:
3166 op = unnegated_op
18f96d12 3167 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3168 if m['quote']:
3169 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3170 actual_value = dct.get(m['key'])
3171 numeric_comparison = None
f9934b96 3172 if isinstance(actual_value, (int, float)):
e5a088dc
S
3173 # If the original field is a string and the matching comparison value is
3174 # a number we should respect the origin of the original field
3175 # and process comparison value as a string (see
18f96d12 3176 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3177 try:
18f96d12 3178 numeric_comparison = int(comparison_value)
347de493 3179 except ValueError:
18f96d12 3180 numeric_comparison = parse_filesize(comparison_value)
3181 if numeric_comparison is None:
3182 numeric_comparison = parse_filesize(f'{comparison_value}B')
3183 if numeric_comparison is None:
3184 numeric_comparison = parse_duration(comparison_value)
3185 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3186 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3187 if actual_value is None:
6db9c4d5 3188 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3189 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3190
3191 UNARY_OPERATORS = {
1cc47c66
S
3192 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3193 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3194 }
64fa820c 3195 operator_rex = re.compile(r'''(?x)
347de493 3196 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3197 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3198 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3199 if m:
3200 op = UNARY_OPERATORS[m.group('op')]
3201 actual_value = dct.get(m.group('key'))
6db9c4d5 3202 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3203 return True
347de493
PH
3204 return op(actual_value)
3205
3206 raise ValueError('Invalid filter part %r' % filter_part)
3207
3208
8f18aca8 3209def match_str(filter_str, dct, incomplete=False):
6db9c4d5 3210 """ Filter a dictionary with a simple string syntax.
3211 @returns Whether the filter passes
3212 @param incomplete Set of keys that are expected to be missing from dct.
3213 Can be True/False to indicate that all/none of the keys may be missing.
3214 All conditions on incomplete keys pass if the key is missing
8f18aca8 3215 """
347de493 3216 return all(
8f18aca8 3217 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3218 for filter_part in re.split(r'(?<!\\)&', filter_str))
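# Editor's note -- illustrative usage of match_str (not part of the original source);
# expected results were worked out by hand from the operators defined above:
#   >>> match_str('like_count > 100 & duration < 600', {'like_count': 190, 'duration': 300})
#   True
#   >>> match_str('is_live', {'is_live': False})   # unary form: boolean fields must be True
#   False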
347de493
PH
3219
3220
fe2ce85a 3221def match_filter_func(filters, breaking_filters=None):
3222 if not filters and not breaking_filters:
d1b5f70b 3223 return None
fe2ce85a 3224 breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
3225 filters = set(variadic(filters or []))
d1b5f70b 3226
492272fe 3227 interactive = '-' in filters
3228 if interactive:
3229 filters.remove('-')
3230
3231 def _match_func(info_dict, incomplete=False):
fe2ce85a 3232 ret = breaking_filters(info_dict, incomplete)
3233 if ret is not None:
3234 raise RejectedVideoReached(ret)
3235
492272fe 3236 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3237 return NO_DEFAULT if interactive and not incomplete else None
347de493 3238 else:
3bec830a 3239 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
b1a7cd05 3240 filter_str = ') | ('.join(map(str.strip, filters))
3241 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
347de493 3242 return _match_func
91410c9b
PH
3243
3244
f2df4071 3245class download_range_func:
b4e0d758 3246 def __init__(self, chapters, ranges, from_info=False):
3247 self.chapters, self.ranges, self.from_info = chapters, ranges, from_info
f2df4071 3248
3249 def __call__(self, info_dict, ydl):
0500ee3d 3250
5ec1b6b7 3251 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
56ba69e4 3252 else 'Cannot match chapters since chapter information is unavailable')
f2df4071 3253 for regex in self.chapters or []:
5ec1b6b7 3254 for i, chapter in enumerate(info_dict.get('chapters') or []):
3255 if re.search(regex, chapter['title']):
3256 warning = None
3257 yield {**chapter, 'index': i}
f2df4071 3258 if self.chapters and warning:
5ec1b6b7 3259 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3260
b4e0d758 3261 for start, end in self.ranges or []:
3262 yield {
3263 'start_time': self._handle_negative_timestamp(start, info_dict),
3264 'end_time': self._handle_negative_timestamp(end, info_dict),
3265 }
3266
3267 if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
3268 yield {
e59e2074 3269 'start_time': info_dict.get('start_time') or 0,
3270 'end_time': info_dict.get('end_time') or float('inf'),
b4e0d758 3271 }
e59e2074 3272 elif not self.ranges and not self.chapters:
3273 yield {}
b4e0d758 3274
3275 @staticmethod
3276 def _handle_negative_timestamp(time, info):
3277 return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time
5ec1b6b7 3278
f2df4071 3279 def __eq__(self, other):
3280 return (isinstance(other, download_range_func)
3281 and self.chapters == other.chapters and self.ranges == other.ranges)
5ec1b6b7 3282
71df9b7f 3283 def __repr__(self):
a5387729 3284 return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
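# Editor's note -- illustrative call (assumption: no chapter regexes are given, so only the
# raw time ranges are yielded and the `ydl` argument is never used):
#   >>> list(download_range_func(None, [(10, 20)])({}, ydl=None))
#   [{'start_time': 10, 'end_time': 20}]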
71df9b7f 3285
5ec1b6b7 3286
bf6427d2
YCH
3287def parse_dfxp_time_expr(time_expr):
3288 if not time_expr:
d631d5f9 3289 return
bf6427d2 3290
1d485a1a 3291 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
bf6427d2
YCH
3292 if mobj:
3293 return float(mobj.group('time_offset'))
3294
db2fe38b 3295 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3296 if mobj:
db2fe38b 3297 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
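# Editor's note -- illustrative usage (values follow from the two regexes above):
#   >>> parse_dfxp_time_expr('5.1s')
#   5.1
#   >>> parse_dfxp_time_expr('00:01:02.500')
#   62.5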
bf6427d2
YCH
3298
3299
c1c924ab 3300def srt_subtitles_timecode(seconds):
aa7785f8 3301 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3302
3303
3304def ass_subtitles_timecode(seconds):
3305 time = timetuple_from_msec(seconds * 1000)
3306 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
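# Editor's note -- illustrative usage; assumes timetuple_from_msec() defined earlier in this module:
#   >>> srt_subtitles_timecode(62.5)
#   '00:01:02,500'
#   >>> ass_subtitles_timecode(62.5)
#   '0:01:02.50'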
bf6427d2
YCH
3307
3308
3309def dfxp2srt(dfxp_data):
3869028f
YCH
3310 '''
3311 @param dfxp_data A bytes-like object containing DFXP data
3312 @returns A unicode object containing converted SRT data
3313 '''
5b995f71 3314 LEGACY_NAMESPACES = (
3869028f
YCH
3315 (b'http://www.w3.org/ns/ttml', [
3316 b'http://www.w3.org/2004/11/ttaf1',
3317 b'http://www.w3.org/2006/04/ttaf1',
3318 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3319 ]),
3869028f
YCH
3320 (b'http://www.w3.org/ns/ttml#styling', [
3321 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
3322 ]),
3323 )
3324
3325 SUPPORTED_STYLING = [
3326 'color',
3327 'fontFamily',
3328 'fontSize',
3329 'fontStyle',
3330 'fontWeight',
3331 'textDecoration'
3332 ]
3333
4e335771 3334 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 3335 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 3336 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 3337 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 3338 })
bf6427d2 3339
5b995f71
RA
3340 styles = {}
3341 default_style = {}
3342
86e5f3ed 3343 class TTMLPElementParser:
5b995f71
RA
3344 _out = ''
3345 _unclosed_elements = []
3346 _applied_styles = []
bf6427d2 3347
2b14cb56 3348 def start(self, tag, attrib):
5b995f71
RA
3349 if tag in (_x('ttml:br'), 'br'):
3350 self._out += '\n'
3351 else:
3352 unclosed_elements = []
3353 style = {}
3354 element_style_id = attrib.get('style')
3355 if default_style:
3356 style.update(default_style)
3357 if element_style_id:
3358 style.update(styles.get(element_style_id, {}))
3359 for prop in SUPPORTED_STYLING:
3360 prop_val = attrib.get(_x('tts:' + prop))
3361 if prop_val:
3362 style[prop] = prop_val
3363 if style:
3364 font = ''
3365 for k, v in sorted(style.items()):
3366 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3367 continue
3368 if k == 'color':
3369 font += ' color="%s"' % v
3370 elif k == 'fontSize':
3371 font += ' size="%s"' % v
3372 elif k == 'fontFamily':
3373 font += ' face="%s"' % v
3374 elif k == 'fontWeight' and v == 'bold':
3375 self._out += '<b>'
3376 unclosed_elements.append('b')
3377 elif k == 'fontStyle' and v == 'italic':
3378 self._out += '<i>'
3379 unclosed_elements.append('i')
3380 elif k == 'textDecoration' and v == 'underline':
3381 self._out += '<u>'
3382 unclosed_elements.append('u')
3383 if font:
3384 self._out += '<font' + font + '>'
3385 unclosed_elements.append('font')
3386 applied_style = {}
3387 if self._applied_styles:
3388 applied_style.update(self._applied_styles[-1])
3389 applied_style.update(style)
3390 self._applied_styles.append(applied_style)
3391 self._unclosed_elements.append(unclosed_elements)
bf6427d2 3392
2b14cb56 3393 def end(self, tag):
5b995f71
RA
3394 if tag not in (_x('ttml:br'), 'br'):
3395 unclosed_elements = self._unclosed_elements.pop()
3396 for element in reversed(unclosed_elements):
3397 self._out += '</%s>' % element
3398 if unclosed_elements and self._applied_styles:
3399 self._applied_styles.pop()
bf6427d2 3400
2b14cb56 3401 def data(self, data):
5b995f71 3402 self._out += data
2b14cb56 3403
3404 def close(self):
5b995f71 3405 return self._out.strip()
2b14cb56 3406
6a765f13 3407 # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
3408 # This will not trigger false positives since only UTF-8 text is being replaced
3409 dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
3410
2b14cb56 3411 def parse_node(node):
3412 target = TTMLPElementParser()
3413 parser = xml.etree.ElementTree.XMLParser(target=target)
3414 parser.feed(xml.etree.ElementTree.tostring(node))
3415 return parser.close()
bf6427d2 3416
5b995f71
RA
3417 for k, v in LEGACY_NAMESPACES:
3418 for ns in v:
3419 dfxp_data = dfxp_data.replace(ns, k)
3420
3869028f 3421 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 3422 out = []
5b995f71 3423 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
3424
3425 if not paras:
3426 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 3427
5b995f71
RA
3428 repeat = False
3429 while True:
3430 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
3431 style_id = style.get('id') or style.get(_x('xml:id'))
3432 if not style_id:
3433 continue
5b995f71
RA
3434 parent_style_id = style.get('style')
3435 if parent_style_id:
3436 if parent_style_id not in styles:
3437 repeat = True
3438 continue
3439 styles[style_id] = styles[parent_style_id].copy()
3440 for prop in SUPPORTED_STYLING:
3441 prop_val = style.get(_x('tts:' + prop))
3442 if prop_val:
3443 styles.setdefault(style_id, {})[prop] = prop_val
3444 if repeat:
3445 repeat = False
3446 else:
3447 break
3448
3449 for p in ('body', 'div'):
3450 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3451 if ele is None:
3452 continue
3453 style = styles.get(ele.get('style'))
3454 if not style:
3455 continue
3456 default_style.update(style)
3457
bf6427d2 3458 for para, index in zip(paras, itertools.count(1)):
d631d5f9 3459 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 3460 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
3461 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3462 if begin_time is None:
3463 continue
7dff0363 3464 if not end_time:
d631d5f9
YCH
3465 if not dur:
3466 continue
3467 end_time = begin_time + dur
bf6427d2
YCH
3468 out.append('%d\n%s --> %s\n%s\n\n' % (
3469 index,
c1c924ab
YCH
3470 srt_subtitles_timecode(begin_time),
3471 srt_subtitles_timecode(end_time),
bf6427d2
YCH
3472 parse_node(para)))
3473
3474 return ''.join(out)
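# Editor's note -- minimal illustrative input (a single unstyled paragraph); the exact
# output was traced by hand through parse_node() and srt_subtitles_timecode():
#   >>> dfxp2srt(b'<tt xmlns="http://www.w3.org/ns/ttml"><body><div>'
#   ...          b'<p begin="0s" end="1.5s">Hello</p></div></body></tt>')
#   '1\n00:00:00,000 --> 00:00:01,500\nHello\n\n'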
3475
3476
c487cf00 3477def cli_option(params, command_option, param, separator=None):
66e289ba 3478 param = params.get(param)
c487cf00 3479 return ([] if param is None
3480 else [command_option, str(param)] if separator is None
3481 else [f'{command_option}{separator}{param}'])
66e289ba
S
3482
3483
3484def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3485 param = params.get(param)
c487cf00 3486 assert param in (True, False, None)
3487 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
66e289ba
S
3488
3489
3490def cli_valueless_option(params, command_option, param, expected_value=True):
c487cf00 3491 return [command_option] if params.get(param) == expected_value else []
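# Editor's note -- illustrative usage of the cli_* helpers (the parameter names are made up):
#   >>> cli_option({'proxy': 'http://127.0.0.1:3128'}, '--proxy', 'proxy')
#   ['--proxy', 'http://127.0.0.1:3128']
#   >>> cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#   ['--no-check-certificate', 'true']
#   >>> cli_valueless_option({'quiet': False}, '--quiet', 'quiet')
#   []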
66e289ba
S
3492
3493
e92caff5 3494def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 3495 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 3496 if use_compat:
5b1ecbb3 3497 return argdict
3498 else:
3499 argdict = None
eab9b2bc 3500 if argdict is None:
5b1ecbb3 3501 return default
eab9b2bc 3502 assert isinstance(argdict, dict)
3503
e92caff5 3504 assert isinstance(keys, (list, tuple))
3505 for key_list in keys:
e92caff5 3506 arg_list = list(filter(
3507 lambda x: x is not None,
6606817a 3508 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 3509 if arg_list:
3510 return [arg for args in arg_list for arg in args]
3511 return default
66e289ba 3512
6251555f 3513
330690a2 3514def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3515 main_key, exe = main_key.lower(), exe.lower()
3516 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3517 keys = [f'{root_key}{k}' for k in (keys or [''])]
3518 if root_key in keys:
3519 if main_key != exe:
3520 keys.append((main_key, exe))
3521 keys.append('default')
3522 else:
3523 use_compat = False
3524 return cli_configuration_args(argdict, keys, default, use_compat)
3525
66e289ba 3526
86e5f3ed 3527class ISO639Utils:
39672624
YCH
3528 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3529 _lang_map = {
3530 'aa': 'aar',
3531 'ab': 'abk',
3532 'ae': 'ave',
3533 'af': 'afr',
3534 'ak': 'aka',
3535 'am': 'amh',
3536 'an': 'arg',
3537 'ar': 'ara',
3538 'as': 'asm',
3539 'av': 'ava',
3540 'ay': 'aym',
3541 'az': 'aze',
3542 'ba': 'bak',
3543 'be': 'bel',
3544 'bg': 'bul',
3545 'bh': 'bih',
3546 'bi': 'bis',
3547 'bm': 'bam',
3548 'bn': 'ben',
3549 'bo': 'bod',
3550 'br': 'bre',
3551 'bs': 'bos',
3552 'ca': 'cat',
3553 'ce': 'che',
3554 'ch': 'cha',
3555 'co': 'cos',
3556 'cr': 'cre',
3557 'cs': 'ces',
3558 'cu': 'chu',
3559 'cv': 'chv',
3560 'cy': 'cym',
3561 'da': 'dan',
3562 'de': 'deu',
3563 'dv': 'div',
3564 'dz': 'dzo',
3565 'ee': 'ewe',
3566 'el': 'ell',
3567 'en': 'eng',
3568 'eo': 'epo',
3569 'es': 'spa',
3570 'et': 'est',
3571 'eu': 'eus',
3572 'fa': 'fas',
3573 'ff': 'ful',
3574 'fi': 'fin',
3575 'fj': 'fij',
3576 'fo': 'fao',
3577 'fr': 'fra',
3578 'fy': 'fry',
3579 'ga': 'gle',
3580 'gd': 'gla',
3581 'gl': 'glg',
3582 'gn': 'grn',
3583 'gu': 'guj',
3584 'gv': 'glv',
3585 'ha': 'hau',
3586 'he': 'heb',
b7acc835 3587 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
3588 'hi': 'hin',
3589 'ho': 'hmo',
3590 'hr': 'hrv',
3591 'ht': 'hat',
3592 'hu': 'hun',
3593 'hy': 'hye',
3594 'hz': 'her',
3595 'ia': 'ina',
3596 'id': 'ind',
b7acc835 3597 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
3598 'ie': 'ile',
3599 'ig': 'ibo',
3600 'ii': 'iii',
3601 'ik': 'ipk',
3602 'io': 'ido',
3603 'is': 'isl',
3604 'it': 'ita',
3605 'iu': 'iku',
3606 'ja': 'jpn',
3607 'jv': 'jav',
3608 'ka': 'kat',
3609 'kg': 'kon',
3610 'ki': 'kik',
3611 'kj': 'kua',
3612 'kk': 'kaz',
3613 'kl': 'kal',
3614 'km': 'khm',
3615 'kn': 'kan',
3616 'ko': 'kor',
3617 'kr': 'kau',
3618 'ks': 'kas',
3619 'ku': 'kur',
3620 'kv': 'kom',
3621 'kw': 'cor',
3622 'ky': 'kir',
3623 'la': 'lat',
3624 'lb': 'ltz',
3625 'lg': 'lug',
3626 'li': 'lim',
3627 'ln': 'lin',
3628 'lo': 'lao',
3629 'lt': 'lit',
3630 'lu': 'lub',
3631 'lv': 'lav',
3632 'mg': 'mlg',
3633 'mh': 'mah',
3634 'mi': 'mri',
3635 'mk': 'mkd',
3636 'ml': 'mal',
3637 'mn': 'mon',
3638 'mr': 'mar',
3639 'ms': 'msa',
3640 'mt': 'mlt',
3641 'my': 'mya',
3642 'na': 'nau',
3643 'nb': 'nob',
3644 'nd': 'nde',
3645 'ne': 'nep',
3646 'ng': 'ndo',
3647 'nl': 'nld',
3648 'nn': 'nno',
3649 'no': 'nor',
3650 'nr': 'nbl',
3651 'nv': 'nav',
3652 'ny': 'nya',
3653 'oc': 'oci',
3654 'oj': 'oji',
3655 'om': 'orm',
3656 'or': 'ori',
3657 'os': 'oss',
3658 'pa': 'pan',
7bcd4813 3659 'pe': 'per',
39672624
YCH
3660 'pi': 'pli',
3661 'pl': 'pol',
3662 'ps': 'pus',
3663 'pt': 'por',
3664 'qu': 'que',
3665 'rm': 'roh',
3666 'rn': 'run',
3667 'ro': 'ron',
3668 'ru': 'rus',
3669 'rw': 'kin',
3670 'sa': 'san',
3671 'sc': 'srd',
3672 'sd': 'snd',
3673 'se': 'sme',
3674 'sg': 'sag',
3675 'si': 'sin',
3676 'sk': 'slk',
3677 'sl': 'slv',
3678 'sm': 'smo',
3679 'sn': 'sna',
3680 'so': 'som',
3681 'sq': 'sqi',
3682 'sr': 'srp',
3683 'ss': 'ssw',
3684 'st': 'sot',
3685 'su': 'sun',
3686 'sv': 'swe',
3687 'sw': 'swa',
3688 'ta': 'tam',
3689 'te': 'tel',
3690 'tg': 'tgk',
3691 'th': 'tha',
3692 'ti': 'tir',
3693 'tk': 'tuk',
3694 'tl': 'tgl',
3695 'tn': 'tsn',
3696 'to': 'ton',
3697 'tr': 'tur',
3698 'ts': 'tso',
3699 'tt': 'tat',
3700 'tw': 'twi',
3701 'ty': 'tah',
3702 'ug': 'uig',
3703 'uk': 'ukr',
3704 'ur': 'urd',
3705 'uz': 'uzb',
3706 've': 'ven',
3707 'vi': 'vie',
3708 'vo': 'vol',
3709 'wa': 'wln',
3710 'wo': 'wol',
3711 'xh': 'xho',
3712 'yi': 'yid',
e9a50fba 3713 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
3714 'yo': 'yor',
3715 'za': 'zha',
3716 'zh': 'zho',
3717 'zu': 'zul',
3718 }
3719
3720 @classmethod
3721 def short2long(cls, code):
3722 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3723 return cls._lang_map.get(code[:2])
3724
3725 @classmethod
3726 def long2short(cls, code):
3727 """Convert language code from ISO 639-2/T to ISO 639-1"""
3728 for short_name, long_name in cls._lang_map.items():
3729 if long_name == code:
3730 return short_name
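# Editor's note -- illustrative usage:
#   >>> ISO639Utils.short2long('en')
#   'eng'
#   >>> ISO639Utils.long2short('deu')
#   'de'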
3731
3732
86e5f3ed 3733class ISO3166Utils:
4eb10f66
YCH
3734 # From http://data.okfn.org/data/core/country-list
3735 _country_map = {
3736 'AF': 'Afghanistan',
3737 'AX': 'Åland Islands',
3738 'AL': 'Albania',
3739 'DZ': 'Algeria',
3740 'AS': 'American Samoa',
3741 'AD': 'Andorra',
3742 'AO': 'Angola',
3743 'AI': 'Anguilla',
3744 'AQ': 'Antarctica',
3745 'AG': 'Antigua and Barbuda',
3746 'AR': 'Argentina',
3747 'AM': 'Armenia',
3748 'AW': 'Aruba',
3749 'AU': 'Australia',
3750 'AT': 'Austria',
3751 'AZ': 'Azerbaijan',
3752 'BS': 'Bahamas',
3753 'BH': 'Bahrain',
3754 'BD': 'Bangladesh',
3755 'BB': 'Barbados',
3756 'BY': 'Belarus',
3757 'BE': 'Belgium',
3758 'BZ': 'Belize',
3759 'BJ': 'Benin',
3760 'BM': 'Bermuda',
3761 'BT': 'Bhutan',
3762 'BO': 'Bolivia, Plurinational State of',
3763 'BQ': 'Bonaire, Sint Eustatius and Saba',
3764 'BA': 'Bosnia and Herzegovina',
3765 'BW': 'Botswana',
3766 'BV': 'Bouvet Island',
3767 'BR': 'Brazil',
3768 'IO': 'British Indian Ocean Territory',
3769 'BN': 'Brunei Darussalam',
3770 'BG': 'Bulgaria',
3771 'BF': 'Burkina Faso',
3772 'BI': 'Burundi',
3773 'KH': 'Cambodia',
3774 'CM': 'Cameroon',
3775 'CA': 'Canada',
3776 'CV': 'Cape Verde',
3777 'KY': 'Cayman Islands',
3778 'CF': 'Central African Republic',
3779 'TD': 'Chad',
3780 'CL': 'Chile',
3781 'CN': 'China',
3782 'CX': 'Christmas Island',
3783 'CC': 'Cocos (Keeling) Islands',
3784 'CO': 'Colombia',
3785 'KM': 'Comoros',
3786 'CG': 'Congo',
3787 'CD': 'Congo, the Democratic Republic of the',
3788 'CK': 'Cook Islands',
3789 'CR': 'Costa Rica',
3790 'CI': 'Côte d\'Ivoire',
3791 'HR': 'Croatia',
3792 'CU': 'Cuba',
3793 'CW': 'Curaçao',
3794 'CY': 'Cyprus',
3795 'CZ': 'Czech Republic',
3796 'DK': 'Denmark',
3797 'DJ': 'Djibouti',
3798 'DM': 'Dominica',
3799 'DO': 'Dominican Republic',
3800 'EC': 'Ecuador',
3801 'EG': 'Egypt',
3802 'SV': 'El Salvador',
3803 'GQ': 'Equatorial Guinea',
3804 'ER': 'Eritrea',
3805 'EE': 'Estonia',
3806 'ET': 'Ethiopia',
3807 'FK': 'Falkland Islands (Malvinas)',
3808 'FO': 'Faroe Islands',
3809 'FJ': 'Fiji',
3810 'FI': 'Finland',
3811 'FR': 'France',
3812 'GF': 'French Guiana',
3813 'PF': 'French Polynesia',
3814 'TF': 'French Southern Territories',
3815 'GA': 'Gabon',
3816 'GM': 'Gambia',
3817 'GE': 'Georgia',
3818 'DE': 'Germany',
3819 'GH': 'Ghana',
3820 'GI': 'Gibraltar',
3821 'GR': 'Greece',
3822 'GL': 'Greenland',
3823 'GD': 'Grenada',
3824 'GP': 'Guadeloupe',
3825 'GU': 'Guam',
3826 'GT': 'Guatemala',
3827 'GG': 'Guernsey',
3828 'GN': 'Guinea',
3829 'GW': 'Guinea-Bissau',
3830 'GY': 'Guyana',
3831 'HT': 'Haiti',
3832 'HM': 'Heard Island and McDonald Islands',
3833 'VA': 'Holy See (Vatican City State)',
3834 'HN': 'Honduras',
3835 'HK': 'Hong Kong',
3836 'HU': 'Hungary',
3837 'IS': 'Iceland',
3838 'IN': 'India',
3839 'ID': 'Indonesia',
3840 'IR': 'Iran, Islamic Republic of',
3841 'IQ': 'Iraq',
3842 'IE': 'Ireland',
3843 'IM': 'Isle of Man',
3844 'IL': 'Israel',
3845 'IT': 'Italy',
3846 'JM': 'Jamaica',
3847 'JP': 'Japan',
3848 'JE': 'Jersey',
3849 'JO': 'Jordan',
3850 'KZ': 'Kazakhstan',
3851 'KE': 'Kenya',
3852 'KI': 'Kiribati',
3853 'KP': 'Korea, Democratic People\'s Republic of',
3854 'KR': 'Korea, Republic of',
3855 'KW': 'Kuwait',
3856 'KG': 'Kyrgyzstan',
3857 'LA': 'Lao People\'s Democratic Republic',
3858 'LV': 'Latvia',
3859 'LB': 'Lebanon',
3860 'LS': 'Lesotho',
3861 'LR': 'Liberia',
3862 'LY': 'Libya',
3863 'LI': 'Liechtenstein',
3864 'LT': 'Lithuania',
3865 'LU': 'Luxembourg',
3866 'MO': 'Macao',
3867 'MK': 'Macedonia, the Former Yugoslav Republic of',
3868 'MG': 'Madagascar',
3869 'MW': 'Malawi',
3870 'MY': 'Malaysia',
3871 'MV': 'Maldives',
3872 'ML': 'Mali',
3873 'MT': 'Malta',
3874 'MH': 'Marshall Islands',
3875 'MQ': 'Martinique',
3876 'MR': 'Mauritania',
3877 'MU': 'Mauritius',
3878 'YT': 'Mayotte',
3879 'MX': 'Mexico',
3880 'FM': 'Micronesia, Federated States of',
3881 'MD': 'Moldova, Republic of',
3882 'MC': 'Monaco',
3883 'MN': 'Mongolia',
3884 'ME': 'Montenegro',
3885 'MS': 'Montserrat',
3886 'MA': 'Morocco',
3887 'MZ': 'Mozambique',
3888 'MM': 'Myanmar',
3889 'NA': 'Namibia',
3890 'NR': 'Nauru',
3891 'NP': 'Nepal',
3892 'NL': 'Netherlands',
3893 'NC': 'New Caledonia',
3894 'NZ': 'New Zealand',
3895 'NI': 'Nicaragua',
3896 'NE': 'Niger',
3897 'NG': 'Nigeria',
3898 'NU': 'Niue',
3899 'NF': 'Norfolk Island',
3900 'MP': 'Northern Mariana Islands',
3901 'NO': 'Norway',
3902 'OM': 'Oman',
3903 'PK': 'Pakistan',
3904 'PW': 'Palau',
3905 'PS': 'Palestine, State of',
3906 'PA': 'Panama',
3907 'PG': 'Papua New Guinea',
3908 'PY': 'Paraguay',
3909 'PE': 'Peru',
3910 'PH': 'Philippines',
3911 'PN': 'Pitcairn',
3912 'PL': 'Poland',
3913 'PT': 'Portugal',
3914 'PR': 'Puerto Rico',
3915 'QA': 'Qatar',
3916 'RE': 'Réunion',
3917 'RO': 'Romania',
3918 'RU': 'Russian Federation',
3919 'RW': 'Rwanda',
3920 'BL': 'Saint Barthélemy',
3921 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3922 'KN': 'Saint Kitts and Nevis',
3923 'LC': 'Saint Lucia',
3924 'MF': 'Saint Martin (French part)',
3925 'PM': 'Saint Pierre and Miquelon',
3926 'VC': 'Saint Vincent and the Grenadines',
3927 'WS': 'Samoa',
3928 'SM': 'San Marino',
3929 'ST': 'Sao Tome and Principe',
3930 'SA': 'Saudi Arabia',
3931 'SN': 'Senegal',
3932 'RS': 'Serbia',
3933 'SC': 'Seychelles',
3934 'SL': 'Sierra Leone',
3935 'SG': 'Singapore',
3936 'SX': 'Sint Maarten (Dutch part)',
3937 'SK': 'Slovakia',
3938 'SI': 'Slovenia',
3939 'SB': 'Solomon Islands',
3940 'SO': 'Somalia',
3941 'ZA': 'South Africa',
3942 'GS': 'South Georgia and the South Sandwich Islands',
3943 'SS': 'South Sudan',
3944 'ES': 'Spain',
3945 'LK': 'Sri Lanka',
3946 'SD': 'Sudan',
3947 'SR': 'Suriname',
3948 'SJ': 'Svalbard and Jan Mayen',
3949 'SZ': 'Swaziland',
3950 'SE': 'Sweden',
3951 'CH': 'Switzerland',
3952 'SY': 'Syrian Arab Republic',
3953 'TW': 'Taiwan, Province of China',
3954 'TJ': 'Tajikistan',
3955 'TZ': 'Tanzania, United Republic of',
3956 'TH': 'Thailand',
3957 'TL': 'Timor-Leste',
3958 'TG': 'Togo',
3959 'TK': 'Tokelau',
3960 'TO': 'Tonga',
3961 'TT': 'Trinidad and Tobago',
3962 'TN': 'Tunisia',
3963 'TR': 'Turkey',
3964 'TM': 'Turkmenistan',
3965 'TC': 'Turks and Caicos Islands',
3966 'TV': 'Tuvalu',
3967 'UG': 'Uganda',
3968 'UA': 'Ukraine',
3969 'AE': 'United Arab Emirates',
3970 'GB': 'United Kingdom',
3971 'US': 'United States',
3972 'UM': 'United States Minor Outlying Islands',
3973 'UY': 'Uruguay',
3974 'UZ': 'Uzbekistan',
3975 'VU': 'Vanuatu',
3976 'VE': 'Venezuela, Bolivarian Republic of',
3977 'VN': 'Viet Nam',
3978 'VG': 'Virgin Islands, British',
3979 'VI': 'Virgin Islands, U.S.',
3980 'WF': 'Wallis and Futuna',
3981 'EH': 'Western Sahara',
3982 'YE': 'Yemen',
3983 'ZM': 'Zambia',
3984 'ZW': 'Zimbabwe',
2f97cc61 3985 # Not ISO 3166 codes, but used for IP blocks
3986 'AP': 'Asia/Pacific Region',
3987 'EU': 'Europe',
4eb10f66
YCH
3988 }
3989
3990 @classmethod
3991 def short2full(cls, code):
3992 """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name"""
3993 return cls._country_map.get(code.upper())
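# Editor's note -- illustrative usage:
#   >>> ISO3166Utils.short2full('DE')
#   'Germany'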
3994
3995
86e5f3ed 3996class GeoUtils:
773f291d
S
3997 # Major IPv4 address blocks per country
3998 _country_ip_map = {
53896ca5 3999 'AD': '46.172.224.0/19',
773f291d
S
4000 'AE': '94.200.0.0/13',
4001 'AF': '149.54.0.0/17',
4002 'AG': '209.59.64.0/18',
4003 'AI': '204.14.248.0/21',
4004 'AL': '46.99.0.0/16',
4005 'AM': '46.70.0.0/15',
4006 'AO': '105.168.0.0/13',
53896ca5
S
4007 'AP': '182.50.184.0/21',
4008 'AQ': '23.154.160.0/24',
773f291d
S
4009 'AR': '181.0.0.0/12',
4010 'AS': '202.70.112.0/20',
53896ca5 4011 'AT': '77.116.0.0/14',
773f291d
S
4012 'AU': '1.128.0.0/11',
4013 'AW': '181.41.0.0/18',
53896ca5
S
4014 'AX': '185.217.4.0/22',
4015 'AZ': '5.197.0.0/16',
773f291d
S
4016 'BA': '31.176.128.0/17',
4017 'BB': '65.48.128.0/17',
4018 'BD': '114.130.0.0/16',
4019 'BE': '57.0.0.0/8',
53896ca5 4020 'BF': '102.178.0.0/15',
773f291d
S
4021 'BG': '95.42.0.0/15',
4022 'BH': '37.131.0.0/17',
4023 'BI': '154.117.192.0/18',
4024 'BJ': '137.255.0.0/16',
53896ca5 4025 'BL': '185.212.72.0/23',
773f291d
S
4026 'BM': '196.12.64.0/18',
4027 'BN': '156.31.0.0/16',
4028 'BO': '161.56.0.0/16',
4029 'BQ': '161.0.80.0/20',
53896ca5 4030 'BR': '191.128.0.0/12',
773f291d
S
4031 'BS': '24.51.64.0/18',
4032 'BT': '119.2.96.0/19',
4033 'BW': '168.167.0.0/16',
4034 'BY': '178.120.0.0/13',
4035 'BZ': '179.42.192.0/18',
4036 'CA': '99.224.0.0/11',
4037 'CD': '41.243.0.0/16',
53896ca5
S
4038 'CF': '197.242.176.0/21',
4039 'CG': '160.113.0.0/16',
773f291d 4040 'CH': '85.0.0.0/13',
53896ca5 4041 'CI': '102.136.0.0/14',
773f291d
S
4042 'CK': '202.65.32.0/19',
4043 'CL': '152.172.0.0/14',
53896ca5 4044 'CM': '102.244.0.0/14',
773f291d
S
4045 'CN': '36.128.0.0/10',
4046 'CO': '181.240.0.0/12',
4047 'CR': '201.192.0.0/12',
4048 'CU': '152.206.0.0/15',
4049 'CV': '165.90.96.0/19',
4050 'CW': '190.88.128.0/17',
53896ca5 4051 'CY': '31.153.0.0/16',
773f291d
S
4052 'CZ': '88.100.0.0/14',
4053 'DE': '53.0.0.0/8',
4054 'DJ': '197.241.0.0/17',
4055 'DK': '87.48.0.0/12',
4056 'DM': '192.243.48.0/20',
4057 'DO': '152.166.0.0/15',
4058 'DZ': '41.96.0.0/12',
4059 'EC': '186.68.0.0/15',
4060 'EE': '90.190.0.0/15',
4061 'EG': '156.160.0.0/11',
4062 'ER': '196.200.96.0/20',
4063 'ES': '88.0.0.0/11',
4064 'ET': '196.188.0.0/14',
4065 'EU': '2.16.0.0/13',
4066 'FI': '91.152.0.0/13',
4067 'FJ': '144.120.0.0/16',
53896ca5 4068 'FK': '80.73.208.0/21',
773f291d
S
4069 'FM': '119.252.112.0/20',
4070 'FO': '88.85.32.0/19',
4071 'FR': '90.0.0.0/9',
4072 'GA': '41.158.0.0/15',
4073 'GB': '25.0.0.0/8',
4074 'GD': '74.122.88.0/21',
4075 'GE': '31.146.0.0/16',
4076 'GF': '161.22.64.0/18',
4077 'GG': '62.68.160.0/19',
53896ca5
S
4078 'GH': '154.160.0.0/12',
4079 'GI': '95.164.0.0/16',
773f291d
S
4080 'GL': '88.83.0.0/19',
4081 'GM': '160.182.0.0/15',
4082 'GN': '197.149.192.0/18',
4083 'GP': '104.250.0.0/19',
4084 'GQ': '105.235.224.0/20',
4085 'GR': '94.64.0.0/13',
4086 'GT': '168.234.0.0/16',
4087 'GU': '168.123.0.0/16',
4088 'GW': '197.214.80.0/20',
4089 'GY': '181.41.64.0/18',
4090 'HK': '113.252.0.0/14',
4091 'HN': '181.210.0.0/16',
4092 'HR': '93.136.0.0/13',
4093 'HT': '148.102.128.0/17',
4094 'HU': '84.0.0.0/14',
4095 'ID': '39.192.0.0/10',
4096 'IE': '87.32.0.0/12',
4097 'IL': '79.176.0.0/13',
4098 'IM': '5.62.80.0/20',
4099 'IN': '117.192.0.0/10',
4100 'IO': '203.83.48.0/21',
4101 'IQ': '37.236.0.0/14',
4102 'IR': '2.176.0.0/12',
4103 'IS': '82.221.0.0/16',
4104 'IT': '79.0.0.0/10',
4105 'JE': '87.244.64.0/18',
4106 'JM': '72.27.0.0/17',
4107 'JO': '176.29.0.0/16',
53896ca5 4108 'JP': '133.0.0.0/8',
773f291d
S
4109 'KE': '105.48.0.0/12',
4110 'KG': '158.181.128.0/17',
4111 'KH': '36.37.128.0/17',
4112 'KI': '103.25.140.0/22',
4113 'KM': '197.255.224.0/20',
53896ca5 4114 'KN': '198.167.192.0/19',
773f291d
S
4115 'KP': '175.45.176.0/22',
4116 'KR': '175.192.0.0/10',
4117 'KW': '37.36.0.0/14',
4118 'KY': '64.96.0.0/15',
4119 'KZ': '2.72.0.0/13',
4120 'LA': '115.84.64.0/18',
4121 'LB': '178.135.0.0/16',
53896ca5 4122 'LC': '24.92.144.0/20',
773f291d
S
4123 'LI': '82.117.0.0/19',
4124 'LK': '112.134.0.0/15',
53896ca5 4125 'LR': '102.183.0.0/16',
773f291d
S
4126 'LS': '129.232.0.0/17',
4127 'LT': '78.56.0.0/13',
4128 'LU': '188.42.0.0/16',
4129 'LV': '46.109.0.0/16',
4130 'LY': '41.252.0.0/14',
4131 'MA': '105.128.0.0/11',
4132 'MC': '88.209.64.0/18',
4133 'MD': '37.246.0.0/16',
4134 'ME': '178.175.0.0/17',
4135 'MF': '74.112.232.0/21',
4136 'MG': '154.126.0.0/17',
4137 'MH': '117.103.88.0/21',
4138 'MK': '77.28.0.0/15',
4139 'ML': '154.118.128.0/18',
4140 'MM': '37.111.0.0/17',
4141 'MN': '49.0.128.0/17',
4142 'MO': '60.246.0.0/16',
4143 'MP': '202.88.64.0/20',
4144 'MQ': '109.203.224.0/19',
4145 'MR': '41.188.64.0/18',
4146 'MS': '208.90.112.0/22',
4147 'MT': '46.11.0.0/16',
4148 'MU': '105.16.0.0/12',
4149 'MV': '27.114.128.0/18',
53896ca5 4150 'MW': '102.70.0.0/15',
773f291d
S
4151 'MX': '187.192.0.0/11',
4152 'MY': '175.136.0.0/13',
4153 'MZ': '197.218.0.0/15',
4154 'NA': '41.182.0.0/16',
4155 'NC': '101.101.0.0/18',
4156 'NE': '197.214.0.0/18',
4157 'NF': '203.17.240.0/22',
4158 'NG': '105.112.0.0/12',
4159 'NI': '186.76.0.0/15',
4160 'NL': '145.96.0.0/11',
4161 'NO': '84.208.0.0/13',
4162 'NP': '36.252.0.0/15',
4163 'NR': '203.98.224.0/19',
4164 'NU': '49.156.48.0/22',
4165 'NZ': '49.224.0.0/14',
4166 'OM': '5.36.0.0/15',
4167 'PA': '186.72.0.0/15',
4168 'PE': '186.160.0.0/14',
4169 'PF': '123.50.64.0/18',
4170 'PG': '124.240.192.0/19',
4171 'PH': '49.144.0.0/13',
4172 'PK': '39.32.0.0/11',
4173 'PL': '83.0.0.0/11',
4174 'PM': '70.36.0.0/20',
4175 'PR': '66.50.0.0/16',
4176 'PS': '188.161.0.0/16',
4177 'PT': '85.240.0.0/13',
4178 'PW': '202.124.224.0/20',
4179 'PY': '181.120.0.0/14',
4180 'QA': '37.210.0.0/15',
53896ca5 4181 'RE': '102.35.0.0/16',
773f291d 4182 'RO': '79.112.0.0/13',
53896ca5 4183 'RS': '93.86.0.0/15',
773f291d 4184 'RU': '5.136.0.0/13',
53896ca5 4185 'RW': '41.186.0.0/16',
773f291d
S
4186 'SA': '188.48.0.0/13',
4187 'SB': '202.1.160.0/19',
4188 'SC': '154.192.0.0/11',
53896ca5 4189 'SD': '102.120.0.0/13',
773f291d 4190 'SE': '78.64.0.0/12',
53896ca5 4191 'SG': '8.128.0.0/10',
773f291d
S
4192 'SI': '188.196.0.0/14',
4193 'SK': '78.98.0.0/15',
53896ca5 4194 'SL': '102.143.0.0/17',
773f291d
S
4195 'SM': '89.186.32.0/19',
4196 'SN': '41.82.0.0/15',
53896ca5 4197 'SO': '154.115.192.0/18',
773f291d
S
4198 'SR': '186.179.128.0/17',
4199 'SS': '105.235.208.0/21',
4200 'ST': '197.159.160.0/19',
4201 'SV': '168.243.0.0/16',
4202 'SX': '190.102.0.0/20',
4203 'SY': '5.0.0.0/16',
4204 'SZ': '41.84.224.0/19',
4205 'TC': '65.255.48.0/20',
4206 'TD': '154.68.128.0/19',
4207 'TG': '196.168.0.0/14',
4208 'TH': '171.96.0.0/13',
4209 'TJ': '85.9.128.0/18',
4210 'TK': '27.96.24.0/21',
4211 'TL': '180.189.160.0/20',
4212 'TM': '95.85.96.0/19',
4213 'TN': '197.0.0.0/11',
4214 'TO': '175.176.144.0/21',
4215 'TR': '78.160.0.0/11',
4216 'TT': '186.44.0.0/15',
4217 'TV': '202.2.96.0/19',
4218 'TW': '120.96.0.0/11',
4219 'TZ': '156.156.0.0/14',
53896ca5
S
4220 'UA': '37.52.0.0/14',
4221 'UG': '102.80.0.0/13',
4222 'US': '6.0.0.0/8',
773f291d 4223 'UY': '167.56.0.0/13',
53896ca5 4224 'UZ': '84.54.64.0/18',
773f291d 4225 'VA': '212.77.0.0/19',
53896ca5 4226 'VC': '207.191.240.0/21',
773f291d 4227 'VE': '186.88.0.0/13',
53896ca5 4228 'VG': '66.81.192.0/20',
773f291d
S
4229 'VI': '146.226.0.0/16',
4230 'VN': '14.160.0.0/11',
4231 'VU': '202.80.32.0/20',
4232 'WF': '117.20.32.0/21',
4233 'WS': '202.4.32.0/19',
4234 'YE': '134.35.0.0/16',
4235 'YT': '41.242.116.0/22',
4236 'ZA': '41.0.0.0/11',
53896ca5
S
4237 'ZM': '102.144.0.0/13',
4238 'ZW': '102.177.192.0/18',
773f291d
S
4239 }
4240
4241 @classmethod
5f95927a
S
4242 def random_ipv4(cls, code_or_block):
4243 if len(code_or_block) == 2:
4244 block = cls._country_ip_map.get(code_or_block.upper())
4245 if not block:
4246 return None
4247 else:
4248 block = code_or_block
773f291d 4249 addr, preflen = block.split('/')
ac668111 4250 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4251 addr_max = addr_min | (0xffffffff >> int(preflen))
14f25df2 4252 return str(socket.inet_ntoa(
ac668111 4253 struct.pack('!L', random.randint(addr_min, addr_max))))
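# Editor's note -- illustrative usage (the result is random, so no fixed output is shown):
#   GeoUtils.random_ipv4('DE')          # some address inside 53.0.0.0/8
#   GeoUtils.random_ipv4('8.8.8.0/24')  # an explicit CIDR block also works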
773f291d
S
4254
4255
0a5445dd
YCH
4256# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4257# released into Public Domain
4258# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4259
4260def long_to_bytes(n, blocksize=0):
4261 """long_to_bytes(n:long, blocksize:int) : string
4262 Convert a long integer to a byte string.
4263
4264 If optional blocksize is given and greater than zero, pad the front of the
4265 byte string with binary zeros so that the length is a multiple of
4266 blocksize.
4267 """
4268 # after much testing, this algorithm was deemed to be the fastest
4269 s = b''
4270 n = int(n)
4271 while n > 0:
ac668111 4272 s = struct.pack('>I', n & 0xffffffff) + s
0a5445dd
YCH
4273 n = n >> 32
4274 # strip off leading zeros
4275 for i in range(len(s)):
4276 if s[i] != b'\000'[0]:
4277 break
4278 else:
4279 # only happens when n == 0
4280 s = b'\000'
4281 i = 0
4282 s = s[i:]
4283 # add back some pad bytes. this could be done more efficiently w.r.t. the
4284 # de-padding being done above, but sigh...
4285 if blocksize > 0 and len(s) % blocksize:
4286 s = (blocksize - len(s) % blocksize) * b'\000' + s
4287 return s
4288
4289
4290def bytes_to_long(s):
4291 """bytes_to_long(string) : long
4292 Convert a byte string to a long integer.
4293
4294 This is (essentially) the inverse of long_to_bytes().
4295 """
4296 acc = 0
4297 length = len(s)
4298 if length % 4:
4299 extra = (4 - length % 4)
4300 s = b'\000' * extra + s
4301 length = length + extra
4302 for i in range(0, length, 4):
ac668111 4303 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
0a5445dd
YCH
4304 return acc
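# Editor's note -- illustrative round trip:
#   >>> long_to_bytes(65537)
#   b'\x01\x00\x01'
#   >>> bytes_to_long(b'\x01\x00\x01')
#   65537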
4305
4306
5bc880b9
YCH
4307def ohdave_rsa_encrypt(data, exponent, modulus):
4308 '''
4309 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4310
4311 Input:
4312 data: data to encrypt, bytes-like object
4313 exponent, modulus: parameter e and N of RSA algorithm, both integer
4314 Output: hex string of encrypted data
4315
4316 Limitation: supports one block encryption only
4317 '''
4318
4319 payload = int(binascii.hexlify(data[::-1]), 16)
4320 encrypted = pow(payload, exponent, modulus)
4321 return '%x' % encrypted
81bdc8fd
YCH
4322
4323
f48409c7
YCH
4324def pkcs1pad(data, length):
4325 """
4326 Padding input data with PKCS#1 scheme
4327
4328 @param {int[]} data input data
4329 @param {int} length target length
4330 @returns {int[]} padded data
4331 """
4332 if len(data) > length - 11:
4333 raise ValueError('Input data too long for PKCS#1 padding')
4334
4335 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4336 return [0, 2] + pseudo_random + [0] + data
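# Editor's note -- illustrative usage (the middle bytes are random, so only the framing is shown):
#   >>> padded = pkcs1pad(list(b'secret'), 32)
#   >>> len(padded), padded[:2], padded[-7:]
#   (32, [0, 2], [0, 115, 101, 99, 114, 101, 116])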
4337
4338
7b2c3f47 4339def _base_n_table(n, table):
4340 if not table and not n:
4341 raise ValueError('Either table or n must be specified')
612f2be5 4342 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4343
44f14eb4 4344 if n and n != len(table):
612f2be5 4345 raise ValueError(f'base {n} exceeds table length {len(table)}')
4346 return table
59f898b7 4347
5eb6bdce 4348
7b2c3f47 4349def encode_base_n(num, n=None, table=None):
4350 """Convert given int to a base-n string"""
612f2be5 4351 table = _base_n_table(n, table)
7b2c3f47 4352 if not num:
5eb6bdce
YCH
4353 return table[0]
4354
7b2c3f47 4355 result, base = '', len(table)
81bdc8fd 4356 while num:
7b2c3f47 4357 result = table[num % base] + result
612f2be5 4358 num = num // base
7b2c3f47 4359 return result
4360
4361
4362def decode_base_n(string, n=None, table=None):
4363 """Convert given base-n string to int"""
4364 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4365 result, base = 0, len(table)
4366 for char in string:
4367 result = result * base + table[char]
4368 return result
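# Editor's note -- illustrative usage with the default table:
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> decode_base_n('ff', 16)
#   255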
4369
4370
f52354a8 4371def decode_packed_codes(code):
06b3fe29 4372 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 4373 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
4374 base = int(base)
4375 count = int(count)
4376 symbols = symbols.split('|')
4377 symbol_table = {}
4378
4379 while count:
4380 count -= 1
5eb6bdce 4381 base_n_count = encode_base_n(count, base)
f52354a8
YCH
4382 symbol_table[base_n_count] = symbols[count] or base_n_count
4383
4384 return re.sub(
4385 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 4386 obfuscated_code)
e154c651 4387
4388
1ced2221
S
4389def caesar(s, alphabet, shift):
4390 if shift == 0:
4391 return s
4392 l = len(alphabet)
4393 return ''.join(
4394 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4395 for c in s)
4396
4397
4398def rot47(s):
4399 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
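# Editor's note -- illustrative usage:
#   >>> caesar('abc', 'abcdefghijklmnopqrstuvwxyz', 2)
#   'cde'
#   >>> rot47(rot47('yt-dlp')) == 'yt-dlp'   # shifting by 47 twice over 94 characters is a no-op
#   True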
4400
4401
e154c651 4402def parse_m3u8_attributes(attrib):
4403 info = {}
4404 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4405 if val.startswith('"'):
4406 val = val[1:-1]
4407 info[key] = val
4408 return info
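# Editor's note -- illustrative usage (quoted values may contain commas):
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2"')
#   {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}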
1143535d
YCH
4409
4410
4411def urshift(val, n):
4412 return val >> n if val >= 0 else (val + 0x100000000) >> n
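# Editor's note -- illustrative usage (emulates a 32-bit unsigned right shift):
#   >>> urshift(-1, 1) == 0x7fffffff
#   True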
d3f8e038
YCH
4413
4414
efa97bdc 4415def write_xattr(path, key, value):
6f7563be 4416 # Windows: Write xattrs to NTFS Alternate Data Streams:
4417 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4418 if compat_os_name == 'nt':
4419 assert ':' not in key
4420 assert os.path.exists(path)
efa97bdc
YCH
4421
4422 try:
6f7563be 4423 with open(f'{path}:{key}', 'wb') as f:
4424 f.write(value)
86e5f3ed 4425 except OSError as e:
efa97bdc 4426 raise XAttrMetadataError(e.errno, e.strerror)
6f7563be 4427 return
efa97bdc 4428
6f7563be 4429 # UNIX Method 1. Use xattrs/pyxattrs modules
efa97bdc 4430
6f7563be 4431 setxattr = None
4432 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4433 # Unicode arguments are not supported in pyxattr until version 0.5.0
4434 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4435 if version_tuple(xattr.__version__) >= (0, 5, 0):
4436 setxattr = xattr.set
4437 elif xattr:
4438 setxattr = xattr.setxattr
efa97bdc 4439
6f7563be 4440 if setxattr:
4441 try:
4442 setxattr(path, key, value)
4443 except OSError as e:
4444 raise XAttrMetadataError(e.errno, e.strerror)
4445 return
efa97bdc 4446
6f7563be 4447 # UNIX Method 2. Use setfattr/xattr executables
4448 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4449 else 'xattr' if check_executable('xattr', ['-h']) else None)
4450 if not exe:
4451 raise XAttrUnavailableError(
4452 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4453 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
efa97bdc 4454
0f06bcd7 4455 value = value.decode()
6f7563be 4456 try:
f0c9fb96 4457 _, stderr, returncode = Popen.run(
6f7563be 4458 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
e121e3ce 4459 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
6f7563be 4460 except OSError as e:
4461 raise XAttrMetadataError(e.errno, e.strerror)
f0c9fb96 4462 if returncode:
4463 raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
4464
4465
4466def random_birthday(year_field, month_field, day_field):
aa374bc7
AS
4467 start_date = datetime.date(1950, 1, 1)
4468 end_date = datetime.date(1995, 12, 31)
4469 offset = random.randint(0, (end_date - start_date).days)
4470 random_date = start_date + datetime.timedelta(offset)
0c265486 4471 return {
aa374bc7
AS
4472 year_field: str(random_date.year),
4473 month_field: str(random_date.month),
4474 day_field: str(random_date.day),
0c265486 4475 }
732044af 4476
c76eb41b 4477
8c53322c
L
4478def find_available_port(interface=''):
4479 try:
4480 with socket.socket() as sock:
4481 sock.bind((interface, 0))
4482 return sock.getsockname()[1]
4483 except OSError:
4484 return None
4485
4486
732044af 4487# Templates for internet shortcut files, which are plain text files.
e5a998f3 4488DOT_URL_LINK_TEMPLATE = '''\
732044af 4489[InternetShortcut]
4490URL=%(url)s
e5a998f3 4491'''
732044af 4492
e5a998f3 4493DOT_WEBLOC_LINK_TEMPLATE = '''\
732044af 4494<?xml version="1.0" encoding="UTF-8"?>
4495<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4496<plist version="1.0">
4497<dict>
4498\t<key>URL</key>
4499\t<string>%(url)s</string>
4500</dict>
4501</plist>
e5a998f3 4502'''
732044af 4503
e5a998f3 4504DOT_DESKTOP_LINK_TEMPLATE = '''\
732044af 4505[Desktop Entry]
4506Encoding=UTF-8
4507Name=%(filename)s
4508Type=Link
4509URL=%(url)s
4510Icon=text-html
e5a998f3 4511'''
732044af 4512
08438d2c 4513LINK_TEMPLATES = {
4514 'url': DOT_URL_LINK_TEMPLATE,
4515 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4516 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4517}
4518
732044af 4519
4520def iri_to_uri(iri):
4521 """
4522 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4523
4524 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4525 """
4526
14f25df2 4527 iri_parts = urllib.parse.urlparse(iri)
732044af 4528
4529 if '[' in iri_parts.netloc:
4530 raise ValueError('IPv6 URIs are not, yet, supported.')
4531 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4532
4533 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4534
4535 net_location = ''
4536 if iri_parts.username:
f9934b96 4537 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
732044af 4538 if iri_parts.password is not None:
f9934b96 4539 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
732044af 4540 net_location += '@'
4541
0f06bcd7 4542 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
732044af 4543 # The 'idna' encoding produces ASCII text.
4544 if iri_parts.port is not None and iri_parts.port != 80:
4545 net_location += ':' + str(iri_parts.port)
4546
f9934b96 4547 return urllib.parse.urlunparse(
732044af 4548 (iri_parts.scheme,
4549 net_location,
4550
f9934b96 4551 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
732044af 4552
4553 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
f9934b96 4554 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
732044af 4555
4556 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
f9934b96 4557 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
732044af 4558
f9934b96 4559 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
732044af 4560
4561 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
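# Editor's note -- illustrative usage (output worked out by hand: the host is punycoded,
# everything else is percent-encoded as UTF-8):
#   >>> iri_to_uri('https://日本語.example/パス?q=値')
#   'https://xn--wgv71a119e.example/%E3%83%91%E3%82%B9?q=%E5%80%A4'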
4562
4563
4564def to_high_limit_path(path):
4565 if sys.platform in ['win32', 'cygwin']:
4566 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
e5a998f3 4567 return '\\\\?\\' + os.path.abspath(path)
732044af 4568
4569 return path
76d321f6 4570
c76eb41b 4571
7b2c3f47 4572def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
69bec673 4573 val = traversal.traverse_obj(obj, *variadic(field))
6f2287cb 4574 if not val if ignore is NO_DEFAULT else val in variadic(ignore):
e0ddbd02 4575 return default
7b2c3f47 4576 return template % func(val)
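# Editor's note -- illustrative usage:
#   >>> format_field({'height': 1080}, 'height', '%sp')
#   '1080p'
#   >>> format_field({}, 'height', '%sp', default='unknown')
#   'unknown'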
00dd0cd5 4577
4578
4579def clean_podcast_url(url):
91302ed3 4580 url = re.sub(r'''(?x)
00dd0cd5 4581 (?:
4582 (?:
4583 chtbl\.com/track|
4584 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
2af4eeb7
MAF
4585 play\.podtrac\.com|
4586 chrt\.fm/track|
4587 mgln\.ai/e
4588 )(?:/[^/.]+)?|
00dd0cd5 4589 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4590 flex\.acast\.com|
4591 pd(?:
4592 cn\.co| # https://podcorn.com/analytics-prefix/
4593 st\.fm # https://podsights.com/docs/
2af4eeb7
MAF
4594 )/e|
4595 [0-9]\.gum\.fm|
4596 pscrb\.fm/rss/p
00dd0cd5 4597 )/''', '', url)
91302ed3 4598 return re.sub(r'^\w+://(\w+://)', r'\1', url)
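# Editor's note -- illustrative usage (the tracking prefix and its id segment are stripped;
# the host and path here are made up):
#   >>> clean_podcast_url('https://chtbl.com/track/12345/traffic.example.com/episode.mp3')
#   'https://traffic.example.com/episode.mp3'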
ffcb8191
THD
4599
4600
4601_HEX_TABLE = '0123456789abcdef'
4602
4603
4604def random_uuidv4():
4605 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 4606
4607
4608def make_dir(path, to_screen=None):
4609 try:
4610 dn = os.path.dirname(path)
b25d6cb9
AI
4611 if dn:
4612 os.makedirs(dn, exist_ok=True)
0202b52a 4613 return True
86e5f3ed 4614 except OSError as err:
0202b52a 4615 if callable(to_screen):
69bec673 4616 to_screen(f'unable to create directory {err}')
0202b52a 4617 return False
f74980cb 4618
4619
4620def get_executable_path():
69bec673 4621 from ..update import _get_variant_and_executable_path
c487cf00 4622
b5899f4f 4623 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
f74980cb 4624
4625
8e40b9d1 4626def get_user_config_dirs(package_name):
8e40b9d1
M
4627 # .config (e.g. ~/.config/package_name)
4628 xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
773c272d 4629 yield os.path.join(xdg_config_home, package_name)
8e40b9d1
M
4630
4631 # appdata (%APPDATA%/package_name)
4632 appdata_dir = os.getenv('appdata')
4633 if appdata_dir:
773c272d 4634 yield os.path.join(appdata_dir, package_name)
8e40b9d1
M
4635
4636 # home (~/.package_name)
773c272d 4637 yield os.path.join(compat_expanduser('~'), f'.{package_name}')
8e40b9d1
M
4638
4639
4640def get_system_config_dirs(package_name):
8e40b9d1 4641 # /etc/package_name
773c272d 4642 yield os.path.join('/etc', package_name)
06167fbb 4643
4644
3e9b66d7 4645def time_seconds(**kwargs):
83c4970e
L
4646 """
4647 Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
4648 """
4649 return time.time() + datetime.timedelta(**kwargs).total_seconds()
3e9b66d7
LNO
4650
4651
49fa4d9a
N
4652# create a JSON Web Signature (jws) with HS256 algorithm
4653# the resulting format is in JWS Compact Serialization
4654# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
4655# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
4656def jwt_encode_hs256(payload_data, key, headers={}):
4657 header_data = {
4658 'alg': 'HS256',
4659 'typ': 'JWT',
4660 }
4661 if headers:
4662 header_data.update(headers)
0f06bcd7 4663 header_b64 = base64.b64encode(json.dumps(header_data).encode())
4664 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
4665 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
49fa4d9a
N
4666 signature_b64 = base64.b64encode(h.digest())
4667 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
4668 return token
819e0531 4669
4670
16b0d7e6 4671 # can be extended in the future to verify the signature and parse the header, returning the algorithm used if it's not HS256
4672def jwt_decode_hs256(jwt):
4673 header_b64, payload_b64, signature_b64 = jwt.split('.')
2c98d998 4674 # add trailing ='s that may have been stripped, superfluous ='s are ignored
4675 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
16b0d7e6 4676 return payload_data
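# Editor's note -- illustrative round trip (jwt_decode_hs256 does not verify the signature):
#   >>> token = jwt_encode_hs256({'user': 'test'}, 'secret-key')
#   >>> jwt_decode_hs256(token.decode())
#   {'user': 'test'}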
4677
4678
53973b4d 4679WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
4680
4681
7a32c70d 4682@functools.cache
819e0531 4683def supports_terminal_sequences(stream):
4684 if compat_os_name == 'nt':
8a82af35 4685 if not WINDOWS_VT_MODE:
819e0531 4686 return False
4687 elif not os.getenv('TERM'):
4688 return False
4689 try:
4690 return stream.isatty()
4691 except BaseException:
4692 return False
4693
4694
c53a18f0 4695def windows_enable_vt_mode():
4696 """Ref: https://bugs.python.org/issue30075 """
8a82af35 4697 if get_windows_version() < (10, 0, 10586):
53973b4d 4698 return
53973b4d 4699
c53a18f0 4700 import ctypes
4701 import ctypes.wintypes
4702 import msvcrt
4703
4704 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
4705
4706 dll = ctypes.WinDLL('kernel32', use_last_error=False)
4707 handle = os.open('CONOUT$', os.O_RDWR)
c53a18f0 4708 try:
4709 h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
4710 dw_original_mode = ctypes.wintypes.DWORD()
4711 success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
4712 if not success:
4713 raise Exception('GetConsoleMode failed')
4714
4715 success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
4716 dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
4717 if not success:
4718 raise Exception('SetConsoleMode failed')
c53a18f0 4719 finally:
4720 os.close(handle)
53973b4d 4721
f0795149 4722 global WINDOWS_VT_MODE
4723 WINDOWS_VT_MODE = True
4724 supports_terminal_sequences.cache_clear()
4725
53973b4d 4726
ec11a9f4 4727_terminal_sequences_re = re.compile('\033\\[[^m]+m')
4728
4729
4730def remove_terminal_sequences(string):
4731 return _terminal_sequences_re.sub('', string)
4732
4733
4734def number_of_digits(number):
4735 return len('%d' % number)
34921b43 4736
4737
4738def join_nonempty(*values, delim='-', from_dict=None):
4739 if from_dict is not None:
69bec673 4740 values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
34921b43 4741 return delim.join(map(str, filter(None, values)))
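# Editor's note -- illustrative usage (falsy values are dropped, the rest are stringified):
#   >>> join_nonempty('mp4', None, 1080, '')
#   'mp4-1080'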
06e57990 4742
4743
27231526
ZM
4744def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
4745 """
4746 Find the largest format dimensions in terms of video width and, for each thumbnail:
4747 * Modify the URL: Match the width with the provided regex and replace with the former width
4748 * Update dimensions
4749
4750 This function is useful with video services that scale the provided thumbnails on demand
4751 """
4752 _keys = ('width', 'height')
4753 max_dimensions = max(
86e5f3ed 4754 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
27231526
ZM
4755 default=(0, 0))
4756 if not max_dimensions[0]:
4757 return thumbnails
4758 return [
4759 merge_dicts(
4760 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
4761 dict(zip(_keys, max_dimensions)), thumbnail)
4762 for thumbnail in thumbnails
4763 ]
4764
4765
93c8410d
LNO
4766def parse_http_range(range):
4767 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
4768 if not range:
4769 return None, None, None
4770 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
4771 if not crg:
4772 return None, None, None
4773 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
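# Editor's note -- illustrative usage:
#   >>> parse_http_range('bytes 0-499/1234')
#   (0, 499, 1234)
#   >>> parse_http_range('bytes=500-')
#   (500, None, None)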
4774
4775
6b9e832d 4776def read_stdin(what):
4777 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
4778 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
4779 return sys.stdin
4780
4781
a904a7f8
L
4782def determine_file_encoding(data):
4783 """
88f60feb 4784 Detect the text encoding used
a904a7f8
L
4785 @returns (encoding, bytes to skip)
4786 """
4787
88f60feb 4788 # BOM marks are given priority over declarations
a904a7f8 4789 for bom, enc in BOMS:
a904a7f8
L
4790 if data.startswith(bom):
4791 return enc, len(bom)
4792
88f60feb 4793 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
4794 # We ignore the endianness to get a good enough match
a904a7f8 4795 data = data.replace(b'\0', b'')
88f60feb 4796 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
4797 return mobj.group(1).decode() if mobj else None, 0
a904a7f8
L
4798
4799
06e57990 4800class Config:
4801 own_args = None
9e491463 4802 parsed_args = None
06e57990 4803 filename = None
4804 __initialized = False
4805
4806 def __init__(self, parser, label=None):
9e491463 4807 self.parser, self.label = parser, label
06e57990 4808 self._loaded_paths, self.configs = set(), []
4809
4810 def init(self, args=None, filename=None):
4811 assert not self.__initialized
284a60c5 4812 self.own_args, self.filename = args, filename
4813 return self.load_configs()
4814
4815 def load_configs(self):
65662dff 4816 directory = ''
284a60c5 4817 if self.filename:
4818 location = os.path.realpath(self.filename)
65662dff 4819 directory = os.path.dirname(location)
06e57990 4820 if location in self._loaded_paths:
4821 return False
4822 self._loaded_paths.add(location)
4823
284a60c5 4824 self.__initialized = True
4825 opts, _ = self.parser.parse_known_args(self.own_args)
4826 self.parsed_args = self.own_args
9e491463 4827 for location in opts.config_locations or []:
6b9e832d 4828 if location == '-':
1060f82f 4829 if location in self._loaded_paths:
4830 continue
4831 self._loaded_paths.add(location)
6b9e832d 4832 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
4833 continue
65662dff 4834 location = os.path.join(directory, expand_path(location))
06e57990 4835 if os.path.isdir(location):
4836 location = os.path.join(location, 'yt-dlp.conf')
4837 if not os.path.exists(location):
9e491463 4838 self.parser.error(f'config location {location} does not exist')
06e57990 4839 self.append_config(self.read_file(location), location)
4840 return True
4841
4842 def __str__(self):
4843 label = join_nonempty(
4844 self.label, 'config', f'"{self.filename}"' if self.filename else '',
4845 delim=' ')
4846 return join_nonempty(
4847 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
4848 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
4849 delim='\n')
4850
7a32c70d 4851 @staticmethod
06e57990 4852 def read_file(filename, default=[]):
4853 try:
a904a7f8 4854 optionf = open(filename, 'rb')
86e5f3ed 4855 except OSError:
06e57990 4856 return default # silently skip if file is not present
a904a7f8
L
4857 try:
4858 enc, skip = determine_file_encoding(optionf.read(512))
4859 optionf.seek(skip, io.SEEK_SET)
4860 except OSError:
4861 enc = None # silently skip read errors
06e57990 4862 try:
4863 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
a904a7f8 4864 contents = optionf.read().decode(enc or preferredencoding())
f9934b96 4865 res = shlex.split(contents, comments=True)
44a6fcff 4866 except Exception as err:
4867 raise ValueError(f'Unable to parse "{filename}": {err}')
06e57990 4868 finally:
4869 optionf.close()
4870 return res
4871
7a32c70d 4872 @staticmethod
06e57990 4873 def hide_login_info(opts):
86e5f3ed 4874 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
06e57990 4875 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
4876
4877 def _scrub_eq(o):
4878 m = eqre.match(o)
4879 if m:
4880 return m.group('key') + '=PRIVATE'
4881 else:
4882 return o
4883
4884 opts = list(map(_scrub_eq, opts))
4885 for idx, opt in enumerate(opts):
4886 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
4887 opts[idx + 1] = 'PRIVATE'
4888 return opts
4889
4890 def append_config(self, *args, label=None):
9e491463 4891 config = type(self)(self.parser, label)
06e57990 4892 config._loaded_paths = self._loaded_paths
4893 if config.init(*args):
4894 self.configs.append(config)
4895
7a32c70d 4896 @property
06e57990 4897 def all_args(self):
4898 for config in reversed(self.configs):
4899 yield from config.all_args
9e491463 4900 yield from self.parsed_args or []
4901
4902 def parse_known_args(self, **kwargs):
4903 return self.parser.parse_known_args(self.all_args, **kwargs)
06e57990 4904
4905 def parse_args(self):
9e491463 4906 return self.parser.parse_args(self.all_args)
da42679b
LNO
4907
4908
d5d1df8a 4909class WebSocketsWrapper:
da42679b 4910 """Wraps websockets module to use in non-async scopes"""
abfecb7b 4911 pool = None
da42679b 4912
3cea3edd 4913 def __init__(self, url, headers=None, connect=True):
059bc4db 4914 self.loop = asyncio.new_event_loop()
9cd08050 4915 # XXX: "loop" is deprecated
4916 self.conn = websockets.connect(
4917 url, extra_headers=headers, ping_interval=None,
4918 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
3cea3edd
LNO
4919 if connect:
4920 self.__enter__()
15dfb392 4921 atexit.register(self.__exit__, None, None, None)
da42679b
LNO
4922
4923 def __enter__(self):
3cea3edd 4924 if not self.pool:
9cd08050 4925 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
da42679b
LNO
4926 return self
4927
4928 def send(self, *args):
4929 self.run_with_loop(self.pool.send(*args), self.loop)
4930
4931 def recv(self, *args):
4932 return self.run_with_loop(self.pool.recv(*args), self.loop)
4933
4934 def __exit__(self, type, value, traceback):
4935 try:
4936 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
4937 finally:
4938 self.loop.close()
15dfb392 4939 self._cancel_all_tasks(self.loop)
da42679b
LNO
4940
4941 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
4942 # for contributors: if any new library that uses asyncio needs to be run in non-async code, move these functions out of this class
7a32c70d 4943 @staticmethod
da42679b 4944 def run_with_loop(main, loop):
059bc4db 4945 if not asyncio.iscoroutine(main):
da42679b 4946 raise ValueError(f'a coroutine was expected, got {main!r}')
4947
4948 try:
4949 return loop.run_until_complete(main)
4950 finally:
4951 loop.run_until_complete(loop.shutdown_asyncgens())
4952 if hasattr(loop, 'shutdown_default_executor'):
4953 loop.run_until_complete(loop.shutdown_default_executor())
4954
7a32c70d 4955 @staticmethod
da42679b 4956 def _cancel_all_tasks(loop):
059bc4db 4957 to_cancel = asyncio.all_tasks(loop)
da42679b 4958
4959 if not to_cancel:
4960 return
4961
4962 for task in to_cancel:
4963 task.cancel()
4964
9cd08050 4965 # XXX: "loop" is removed in python 3.10+
da42679b 4966 loop.run_until_complete(
059bc4db 4967 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
da42679b 4968
4969 for task in to_cancel:
4970 if task.cancelled():
4971 continue
4972 if task.exception() is not None:
4973 loop.call_exception_handler({
4974 'message': 'unhandled exception during asyncio.run() shutdown',
4975 'exception': task.exception(),
4976 'task': task,
4977 })
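A minimal usage sketch; the URL, headers and payload are made up:

    ws = WebSocketsWrapper('wss://example.invalid/live', headers={'Origin': 'https://example.invalid'})
    ws.send('{"type": "subscribe"}')
    message = ws.recv()
    ws.__exit__(None, None, None)   # closes the connection and shuts the event loop down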
4978
4979
8b7539d2 4980def merge_headers(*dicts):
08d30158 4981 """Merge dicts of HTTP headers case-insensitively, prioritizing the latter ones"""
76aa9913 4982 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
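For instance, with made-up values:

    assert merge_headers(
        {'user-agent': 'yt-dlp', 'accept': '*/*'},
        {'User-Agent': 'Mozilla/5.0'},
    ) == {'User-Agent': 'Mozilla/5.0', 'Accept': '*/*'}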
28787f16 4983
4984
b1f94422 4985def cached_method(f):
4986 """Cache a method"""
4987 signature = inspect.signature(f)
4988
7a32c70d 4989 @functools.wraps(f)
b1f94422 4990 def wrapper(self, *args, **kwargs):
4991 bound_args = signature.bind(self, *args, **kwargs)
4992 bound_args.apply_defaults()
d5d1df8a 4993 key = tuple(bound_args.arguments.values())[1:]
b1f94422 4994
6368e2e6 4995 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
b1f94422 4996 if key not in cache:
4997 cache[key] = f(self, *args, **kwargs)
4998 return cache[key]
4999 return wrapper
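A minimal sketch with a hypothetical class; the cache lives on the instance and is keyed by the bound arguments (excluding `self`):

    class Client:
        @cached_method
        def fetch(self, url, timeout=10):
            print('fetching', url)          # runs once per distinct (url, timeout)
            return f'<data from {url}>'

    c = Client()
    c.fetch('https://example.invalid')      # performs the work
    c.fetch('https://example.invalid')      # returned from the per-instance cache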
5000
5001
28787f16 5002class classproperty:
83cc7b8a 5003 """property access for class methods with optional caching"""
5004 def __new__(cls, func=None, *args, **kwargs):
5005 if not func:
5006 return functools.partial(cls, *args, **kwargs)
5007 return super().__new__(cls)
c487cf00 5008
83cc7b8a 5009 def __init__(self, func, *, cache=False):
c487cf00 5010 functools.update_wrapper(self, func)
5011 self.func = func
83cc7b8a 5012 self._cache = {} if cache else None
28787f16 5013
5014 def __get__(self, _, cls):
83cc7b8a 5015 if self._cache is None:
5016 return self.func(cls)
5017 elif cls not in self._cache:
5018 self._cache[cls] = self.func(cls)
5019 return self._cache[cls]
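A small sketch with hypothetical classes; with `cache=True` the value is computed once per class and then reused:

    class Base:
        @classproperty(cache=True)
        def KEY(cls):
            return cls.__name__.lower()     # evaluated on first access per class

    class Child(Base):
        pass

    assert Base.KEY == 'base' and Child.KEY == 'child'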
19a03940 5020
5021
a5387729 5022class function_with_repr:
b2e0343b 5023 def __init__(self, func, repr_=None):
a5387729 5024 functools.update_wrapper(self, func)
b2e0343b 5025 self.func, self.__repr = func, repr_
a5387729 5026
5027 def __call__(self, *args, **kwargs):
5028 return self.func(*args, **kwargs)
5029
5030 def __repr__(self):
b2e0343b 5031 if self.__repr:
5032 return self.__repr
a5387729 5033 return f'{self.func.__module__}.{self.func.__qualname__}'
5034
5035
64fa820c 5036class Namespace(types.SimpleNamespace):
591bb9d3 5037 """Immutable namespace"""
591bb9d3 5038
7896214c 5039 def __iter__(self):
64fa820c 5040 return iter(self.__dict__.values())
7896214c 5041
7a32c70d 5042 @property
64fa820c 5043 def items_(self):
5044 return self.__dict__.items()
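For example, with a throwaway namespace (the real instances below, such as MEDIA_EXTENSIONS, follow the same pattern):

    ns = Namespace(video='mp4', audio='m4a')
    assert list(ns) == ['mp4', 'm4a']                         # iteration yields the values
    assert dict(ns.items_) == {'video': 'mp4', 'audio': 'm4a'}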
9b8ee23b 5045
5046
8dc59305 5047MEDIA_EXTENSIONS = Namespace(
5048 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5049 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5050 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
fbb73833 5051 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
8dc59305 5052 thumbnails=('jpg', 'png', 'webp'),
5053 storyboards=('mhtml', ),
5054 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5055 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5056)
5057MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5058MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5059
5060KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5061
5062
be5c1ae8 5063class RetryManager:
5064 """Usage:
5065 for retry in RetryManager(...):
5066 try:
5067 ...
5068 except SomeException as err:
5069 retry.error = err
5070 continue
5071 """
5072 attempt, _error = 0, None
5073
5074 def __init__(self, _retries, _error_callback, **kwargs):
5075 self.retries = _retries or 0
5076 self.error_callback = functools.partial(_error_callback, **kwargs)
5077
5078 def _should_retry(self):
5079 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5080
7a32c70d 5081 @property
be5c1ae8 5082 def error(self):
5083 if self._error is NO_DEFAULT:
5084 return None
5085 return self._error
5086
7a32c70d 5087 @error.setter
be5c1ae8 5088 def error(self, value):
5089 self._error = value
5090
5091 def __iter__(self):
5092 while self._should_retry():
5093 self.error = NO_DEFAULT
5094 self.attempt += 1
5095 yield self
5096 if self.error:
5097 self.error_callback(self.error, self.attempt, self.retries)
5098
7a32c70d 5099 @staticmethod
be5c1ae8 5100 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5101 """Utility function for reporting retries"""
5102 if count > retries:
5103 if error:
5104 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5105 raise e
5106
5107 if not count:
5108 return warn(e)
5109 elif isinstance(e, ExtractorError):
3ce29336 5110 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
be5c1ae8 5111 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5112
5113 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5114 if delay:
5115 info(f'Sleeping {delay:.2f} seconds ...')
5116 time.sleep(delay)
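A sketch of wiring report_retry in as the error callback; `do_request` is a stand-in for the real work and `print` stands in for the logger callbacks:

    def fetch_with_retries(url):
        for retry in RetryManager(3, RetryManager.report_retry,
                                  sleep_func=1, info=print, warn=print):
            try:
                return do_request(url)
            except OSError as err:
                retry.error = err            # mark the attempt as failed so the manager retries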
5117
5118
0647d925 5119def make_archive_id(ie, video_id):
5120 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5121 return f'{ie_key.lower()} {video_id}'
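For example:

    assert make_archive_id('Youtube', 'dQw4w9WgXcQ') == 'youtube dQw4w9WgXcQ'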
5122
5123
a1c5bd82 5124def truncate_string(s, left, right=0):
5125 assert left > 3 and right >= 0
5126 if s is None or len(s) <= left + right:
5127 return s
71df9b7f 5128 return f'{s[:left-3]}...{s[-right:] if right else ""}'
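For example:

    assert truncate_string('abcdefghij', 5) == 'ab...'
    assert truncate_string('abcdefghij', 6, 2) == 'abc...ij'
    assert truncate_string('short', 10) == 'short'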
a1c5bd82 5129
5130
5314b521 5131def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
5132 assert 'all' in alias_dict, '"all" alias is required'
5133 requested = list(start or [])
5134 for val in options:
5135 discard = val.startswith('-')
5136 if discard:
5137 val = val[1:]
5138
5139 if val in alias_dict:
5140 val = alias_dict[val] if not discard else [
5141 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
5142 # NB: Do not allow regex in aliases for performance
5143 requested = orderedSet_from_options(val, alias_dict, start=requested)
5144 continue
5145
5146 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
5147 else [val] if val in alias_dict['all'] else None)
5148 if current is None:
5149 raise ValueError(val)
5150
5151 if discard:
5152 for item in current:
5153 while item in requested:
5154 requested.remove(item)
5155 else:
5156 requested.extend(current)
5157
5158 return orderedSet(requested)
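A small example of the alias and "-" (discard) syntax this implements, with a made-up alias table:

    aliases = {'all': ['formats', 'thumbnails', 'subtitles']}
    assert orderedSet_from_options(['all', '-thumbnails'], aliases) == ['formats', 'subtitles']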
5159
5160
eedda525 5161# TODO: Rewrite
d0d74b71 5162class FormatSorter:
5163 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
5164
5165 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
5166 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
5167 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
5168 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
5169 'height', 'width', 'proto', 'vext', 'abr', 'aext',
5170 'fps', 'fs_approx', 'source', 'id')
5171
5172 settings = {
5173 'vcodec': {'type': 'ordered', 'regex': True,
5174 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
5175 'acodec': {'type': 'ordered', 'regex': True,
71082216 5176 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
d0d74b71 5177 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
5178 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
5179 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
5180 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
5181 'vext': {'type': 'ordered', 'field': 'video_ext',
29ca4082 5182 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
5183 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
fbb73833 5184 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
5185 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
5186 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
d0d74b71 5187 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
5188 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
5189 'field': ('vcodec', 'acodec'),
5190 'function': lambda it: int(any(v != 'none' for v in it))},
5191 'ie_pref': {'priority': True, 'type': 'extractor'},
5192 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
5193 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
5194 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
5195 'quality': {'convert': 'float', 'default': -1},
5196 'filesize': {'convert': 'bytes'},
5197 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
5198 'id': {'convert': 'string', 'field': 'format_id'},
5199 'height': {'convert': 'float_none'},
5200 'width': {'convert': 'float_none'},
5201 'fps': {'convert': 'float_none'},
5202 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
5203 'tbr': {'convert': 'float_none'},
5204 'vbr': {'convert': 'float_none'},
5205 'abr': {'convert': 'float_none'},
5206 'asr': {'convert': 'float_none'},
5207 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
5208
5209 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
812cdfa0 5210 'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
eedda525 5211 'function': lambda it: next(filter(None, it), None)},
812cdfa0 5212 'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
eedda525 5213 'function': lambda it: next(filter(None, it), None)},
d0d74b71 5214 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
5215 'res': {'type': 'multiple', 'field': ('height', 'width'),
5216 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
5217
5218 # Actual field names
5219 'format_id': {'type': 'alias', 'field': 'id'},
5220 'preference': {'type': 'alias', 'field': 'ie_pref'},
5221 'language_preference': {'type': 'alias', 'field': 'lang'},
5222 'source_preference': {'type': 'alias', 'field': 'source'},
5223 'protocol': {'type': 'alias', 'field': 'proto'},
5224 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
5225 'audio_channels': {'type': 'alias', 'field': 'channels'},
5226
5227 # Deprecated
5228 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
5229 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
5230 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
5231 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
5232 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
5233 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
5234 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
5235 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
5236 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
5237 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
5238 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
5239 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
5240 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
5241 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
5242 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5243 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5244 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5245 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5246 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5247 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5248 }
5249
5250 def __init__(self, ydl, field_preference):
5251 self.ydl = ydl
5252 self._order = []
5253 self.evaluate_params(self.ydl.params, field_preference)
5254 if ydl.params.get('verbose'):
5255 self.print_verbose_info(self.ydl.write_debug)
5256
5257 def _get_field_setting(self, field, key):
5258 if field not in self.settings:
5259 if key in ('forced', 'priority'):
5260 return False
5261 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
5262 'deprecated and may be removed in a future version')
5263 self.settings[field] = {}
5264 propObj = self.settings[field]
5265 if key not in propObj:
5266 type = propObj.get('type')
5267 if key == 'field':
5268 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
5269 elif key == 'convert':
5270 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
5271 else:
5272 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
5273 propObj[key] = default
5274 return propObj[key]
5275
5276 def _resolve_field_value(self, field, value, convertNone=False):
5277 if value is None:
5278 if not convertNone:
5279 return None
5280 else:
5281 value = value.lower()
5282 conversion = self._get_field_setting(field, 'convert')
5283 if conversion == 'ignore':
5284 return None
5285 if conversion == 'string':
5286 return value
5287 elif conversion == 'float_none':
5288 return float_or_none(value)
5289 elif conversion == 'bytes':
5290 return parse_bytes(value)
5291 elif conversion == 'order':
5292 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
5293 use_regex = self._get_field_setting(field, 'regex')
5294 list_length = len(order_list)
5295 empty_pos = order_list.index('') if '' in order_list else list_length + 1
5296 if use_regex and value is not None:
5297 for i, regex in enumerate(order_list):
5298 if regex and re.match(regex, value):
5299 return list_length - i
5300 return list_length - empty_pos # not in list
5301 else: # not regex or value = None
5302 return list_length - (order_list.index(value) if value in order_list else empty_pos)
5303 else:
5304 if value.isnumeric():
5305 return float(value)
5306 else:
5307 self.settings[field]['convert'] = 'string'
5308 return value
5309
5310 def evaluate_params(self, params, sort_extractor):
5311 self._use_free_order = params.get('prefer_free_formats', False)
5312 self._sort_user = params.get('format_sort', [])
5313 self._sort_extractor = sort_extractor
5314
5315 def add_item(field, reverse, closest, limit_text):
5316 field = field.lower()
5317 if field in self._order:
5318 return
5319 self._order.append(field)
5320 limit = self._resolve_field_value(field, limit_text)
5321 data = {
5322 'reverse': reverse,
5323 'closest': False if limit is None else closest,
5324 'limit_text': limit_text,
5325 'limit': limit}
5326 if field in self.settings:
5327 self.settings[field].update(data)
5328 else:
5329 self.settings[field] = data
5330
5331 sort_list = (
5332 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5333 + (tuple() if params.get('format_sort_force', False)
5334 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5335 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5336
5337 for item in sort_list:
5338 match = re.match(self.regex, item)
5339 if match is None:
5340 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
5341 field = match.group('field')
5342 if field is None:
5343 continue
5344 if self._get_field_setting(field, 'type') == 'alias':
5345 alias, field = field, self._get_field_setting(field, 'field')
5346 if self._get_field_setting(alias, 'deprecated'):
5347 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5348 f'be removed in a future version. Please use {field} instead')
5349 reverse = match.group('reverse') is not None
5350 closest = match.group('separator') == '~'
5351 limit_text = match.group('limit')
5352
5353 has_limit = limit_text is not None
5354 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5355 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5356
5357 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
5358 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
5359 limit_count = len(limits)
5360 for (i, f) in enumerate(fields):
5361 add_item(f, reverse, closest,
5362 limits[i] if i < limit_count
5363 else limits[0] if has_limit and not has_multiple_limits
5364 else None)
5365
5366 def print_verbose_info(self, write_debug):
5367 if self._sort_user:
5368 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
5369 if self._sort_extractor:
5370 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
5371 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
5372 '+' if self._get_field_setting(field, 'reverse') else '', field,
5373 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
5374 self._get_field_setting(field, 'limit_text'),
5375 self._get_field_setting(field, 'limit'))
5376 if self._get_field_setting(field, 'limit_text') is not None else '')
5377 for field in self._order if self._get_field_setting(field, 'visible')]))
5378
5379 def _calculate_field_preference_from_value(self, format, field, type, value):
5380 reverse = self._get_field_setting(field, 'reverse')
5381 closest = self._get_field_setting(field, 'closest')
5382 limit = self._get_field_setting(field, 'limit')
5383
5384 if type == 'extractor':
5385 maximum = self._get_field_setting(field, 'max')
5386 if value is None or (maximum is not None and value >= maximum):
5387 value = -1
5388 elif type == 'boolean':
5389 in_list = self._get_field_setting(field, 'in_list')
5390 not_in_list = self._get_field_setting(field, 'not_in_list')
5391 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
5392 elif type == 'ordered':
5393 value = self._resolve_field_value(field, value, True)
5394
5395 # try to convert to number
5396 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
5397 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
5398 if is_num:
5399 value = val_num
5400
5401 return ((-10, 0) if value is None
5402 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
5403 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
5404 else (0, value, 0) if not reverse and (limit is None or value <= limit)
5405 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
5406 else (-1, value, 0))
5407
5408 def _calculate_field_preference(self, format, field):
5409 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
5410 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
5411 if type == 'multiple':
5412 type = 'field' # Only 'field' is allowed in multiple for now
5413 actual_fields = self._get_field_setting(field, 'field')
5414
5415 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
5416 else:
5417 value = get_value(field)
5418 return self._calculate_field_preference_from_value(format, field, type, value)
5419
5420 def calculate_preference(self, format):
5421 # Determine missing protocol
5422 if not format.get('protocol'):
5423 format['protocol'] = determine_protocol(format)
5424
5425 # Determine missing ext
5426 if not format.get('ext') and 'url' in format:
5427 format['ext'] = determine_ext(format['url'])
5428 if format.get('vcodec') == 'none':
5429 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
5430 format['video_ext'] = 'none'
5431 else:
5432 format['video_ext'] = format['ext']
5433 format['audio_ext'] = 'none'
5434 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
5435 # format['preference'] = -1000
5436
5424dbaf 5437 if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
5438 # HEVC-over-FLV is out-of-spec per the original FLV specification
5439 # ref. https://trac.ffmpeg.org/ticket/6389
5440 # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
5441 format['preference'] = -100
5442
d0d74b71 5443 # Determine missing bitrates
eedda525 5444 if format.get('vcodec') == 'none':
5445 format['vbr'] = 0
5446 if format.get('acodec') == 'none':
5447 format['abr'] = 0
5448 if not format.get('vbr') and format.get('vcodec') != 'none':
5449 format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
5450 if not format.get('abr') and format.get('acodec') != 'none':
5451 format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
5452 if not format.get('tbr'):
5453 format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None
d0d74b71 5454
5455 return tuple(self._calculate_field_preference(format, field) for field in self._order)
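A rough sketch of driving the sorter; `ydl` is the YoutubeDL instance and `formats` a list of format dicts. In the sort-string syntax handled above, a ":" limit such as 'res:1080' caps the preferred value and a leading "+" such as '+size' prefers smaller values:

    sorter = FormatSorter(ydl, ['res:1080', '+size'])
    formats.sort(key=sorter.calculate_preference)   # ascending sort leaves the most preferred format last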
1b392f90 5456
5457
5458# XXX: Temporary
5459class _YDLLogger:
5460 def __init__(self, ydl=None):
5461 self._ydl = ydl
5462
5463 def debug(self, message):
5464 if self._ydl:
5465 self._ydl.write_debug(message)
5466
5467 def info(self, message):
5468 if self._ydl:
5469 self._ydl.to_screen(message)
5470
5471 def warning(self, message, *, once=False):
5472 if self._ydl:
3d2623a8 5473 self._ydl.report_warning(message, once)
1b392f90 5474
5475 def error(self, message, *, is_error=True):
5476 if self._ydl:
5477 self._ydl.report_error(message, is_error=is_error)
5478
5479 def stdout(self, message):
5480 if self._ydl:
5481 self._ydl.to_stdout(message)
5482
5483 def stderr(self, message):
5484 if self._ydl:
5485 self._ydl.to_stderr(message)
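A short sketch; with no YoutubeDL instance attached, every call is a no-op:

    logger = _YDLLogger()            # or _YDLLogger(ydl) to route through YoutubeDL's reporting
    logger.warning('something looks off', once=True)    # silently ignored without a ydl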