]> jfr.im git - yt-dlp.git/blame - yt_dlp/utils/_utils.py
[networking] Remove dot segments during URL normalization (#7662)
[yt-dlp.git] / yt_dlp / utils / _utils.py
CommitLineData
6929b41a 1import asyncio
15dfb392 2import atexit
1e399778 3import base64
5bc880b9 4import binascii
912b38b4 5import calendar
676eb3f2 6import codecs
c380cc28 7import collections
ab029d7e 8import collections.abc
62e609ab 9import contextlib
c496ca96 10import datetime
0c265486 11import email.header
f8271158 12import email.utils
f45c185f 13import errno
49fa4d9a
N
14import hashlib
15import hmac
ac668111 16import html.entities
17import html.parser
b1f94422 18import inspect
03f9daab 19import io
79a2e94e 20import itertools
f4bfd65f 21import json
d77c3dfd 22import locale
02dbf93f 23import math
f8271158 24import mimetypes
db3ad8a6 25import netrc
347de493 26import operator
d77c3dfd 27import os
c496ca96 28import platform
773f291d 29import random
d77c3dfd 30import re
f8271158 31import shlex
c496ca96 32import socket
79a2e94e 33import ssl
ac668111 34import struct
1c088fa8 35import subprocess
d77c3dfd 36import sys
181c8655 37import tempfile
c380cc28 38import time
01951dda 39import traceback
64fa820c 40import types
989a01c2 41import unicodedata
14f25df2 42import urllib.error
f8271158 43import urllib.parse
ac668111 44import urllib.request
bcf89ce6 45import xml.etree.ElementTree
d77c3dfd 46
69bec673 47from . import traversal
48
49from ..compat import functools # isort: split
50from ..compat import (
36e6f62c 51 compat_etree_fromstring,
51098426 52 compat_expanduser,
f8271158 53 compat_HTMLParseError,
efa97bdc 54 compat_os_name,
702ccf2d 55 compat_shlex_quote,
8c25f81b 56)
c365dba8 57from ..dependencies import websockets, xattr
51fb4995 58
46f1370e 59__name__ = __name__.rsplit('.', 1)[0] # Pretend to be the parent module
60
468e2e92
FV
# The type of a compiled regular expression object, obtained by compiling an
# empty pattern. This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
63
f7a147e3 64
class NO_DEFAULT:
    """Sentinel used as a parameter default to mean "no default was supplied".

    Functions in this module compare against it by identity
    (``default is not NO_DEFAULT``), so ``None`` stays usable as a real value.
    """
67
68
def IDENTITY(x):
    """Return the argument unchanged"""
    return x
71
bf42a990 72
7105440c
YCH
# English month names, January first
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names keyed by language code; every list is ordered January..December
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}
a942d6cb 88
# Timezone abbreviation -> offset from UTC in whole hours
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}
98
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement; the replacement may be
# more than one character long (e.g. 'Æ' -> 'AE', 'ß' -> 'ss')
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 103
46f59e89
S
# strptime-style formats that are unambiguous about day/month order
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# The common formats plus numeric formats read as day before month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# The common formats plus numeric formats read as month before day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
169
# Matches text of the form }('<payload>',<int>,<int>,'<symbols>'.split('|')
# and captures the payload, both integers and the symbol string
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element; the JSON object or
# array inside it is captured in the `json_ld` group
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# An unsigned integer or decimal number
NUMBER_RE = r'\d+(?:\.\d+)?'
174
7105440c 175
@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the encoding reported by locale.getpreferredencoding() when it
    can actually be used to encode text, and 'UTF-8' otherwise.
    The result is cached after the first call.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Probe the codec: an unknown or broken name raises here
        'TEST'.encode(encoding)
    except Exception:
        return 'UTF-8'
    return encoding
d77c3dfd 190
f4bfd65f 191
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The data is first written to a temporary file next to fn, which is then
    moved over fn. If anything fails, the temporary file is removed and the
    exception is re-raised; an existing fn is left untouched in that case.
    """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        with contextlib.suppress(OSError):
            # The temporary file is created with restrictive permissions;
            # give it the permissions a regular new file would get.
            # os.umask can only be read by setting it, so restore it right away
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        # os.replace overwrites an existing destination on every platform, so
        # there is no need to delete fn first (which would lose the old file
        # if the move failed afterwards)
        os.replace(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
216
217
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val]

    With val=None the element only needs to have the attribute `key`.
    Returns the first matching element, or None.
    """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = '[@%s]' % key
    else:
        predicate = f"[@{key}='{val}']"
    return node.find(xpath + predicate)
59ae56fa 223
d7e66d39
JMF
224# On python2.6 the xml.etree.ElementTree.Element methods don't support
225# the namespace parameter
5f6a1245
JW
226
227
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand `prefix:tag` steps of path into `{namespace}tag` using ns_map

    Steps without a prefix are kept as they are.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(map(expand, path.split('/')))
238
d77c3dfd 239
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element below node matching xpath

    @param xpath    A single xpath string, or an iterable of xpaths that are
                    tried in order until one matches
    @param name     Name used in the error message (defaults to xpath)
    @param fatal    Raise ExtractorError when nothing matches and no default
                    was given
    @param default  Value to return when nothing matches
    """
    candidates = [xpath] if isinstance(xpath, str) else xpath

    # Start from None so that an empty iterable of xpaths is "not found"
    # instead of leaving n unbound
    n = None
    for xp in candidates:
        n = node.find(xp)
        if n is not None:
            break

    if n is not None:
        return n
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        # Wrap in a tuple so that a tuple of xpaths is formatted as a whole
        raise ExtractorError('Could not find XML element %s' % (name,))
    return None
261
262
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element found by xpath_element

    A missing element or missing text yields default if one was given,
    raises ExtractorError if fatal is set, and returns None otherwise.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element

    text = element.text
    if text is not None:
        return text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
a41fb80c
S
276
277
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return the value of attribute key on the first element matching xpath

    When no element with that attribute exists: return default if one was
    given, raise ExtractorError if fatal is set, otherwise return None.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]

    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = f'{xpath}[@{key}]' if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
bf0ff932
PH
289
290
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    content = get_element_by_attribute('id', id, html, **kwargs)
    return content
43e8fafd 294
12ea2f30 295
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    whole = get_element_html_by_attribute('id', id, html, **kwargs)
    return whole
6f32a0b5
ZM
299
300
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
305
306
6f32a0b5
ZM
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_html_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
311
312
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag with the specified attribute in the passed HTML document"""
    matches = get_elements_by_attribute(attribute, value, html, **kwargs)
    if not matches:
        return None
    return matches[0]
316
317
def get_element_html_by_attribute(attribute, value, html, **kargs):
    """Return the html of the first tag with the specified attribute in the passed HTML document"""
    matches = get_elements_html_by_attribute(attribute, value, html, **kargs)
    if not matches:
        return None
    return matches[0]
321
322
def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # NOTE(review): extra keyword arguments are accepted but not forwarded
    # Match the class name as a whole word anywhere inside the class attribute
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
328
329
6f32a0b5
ZM
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    # Match the class name as a whole word anywhere inside the class attribute
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_html_by_attribute('class', class_value_re, html, escape_value=False)
335
336
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document as a list"""
    found = get_elements_text_and_html_by_attribute(*args, **kwargs)
    return [text for text, _whole in found]
340
341
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document as a list"""
    found = get_elements_text_and_html_by_attribute(*args, **kwargs)
    return [element_html for _text, element_html in found]
345
346
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Yield the text (content) and the html (whole) of every tag with the specified
    attribute in the passed HTML document

    @param tag           Regex that the tag name must match
    @param escape_value  Treat value as a literal string; when False, value
                         is inserted into the pattern as a regex
    Yields nothing when value is empty.
    """
    if not value:
        return

    # The quotes around the attribute value are optional ('?') unless the value
    # begins with whitespace or one of "'`=<>
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches an opening tag up to and including the wanted attribute.
    # (?-x:...) turns verbose mode off for the value so that whitespace
    # inside it is matched literally
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
    '''

    for m in re.finditer(partial_element_re, html):
        # Parse from the start of the matched tag to find its closing tag
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip one pair of matching quotes wrapping the whole content, then unescape
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
a921f407 372
c5229f39 373
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        """Raised when the closing tag of the outermost element is reached"""

    def __init__(self):
        # Currently open tags, innermost last
        self.tagstack = collections.deque()
        super().__init__()

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Unwind to the matching opening tag, dropping unclosed inner tags
        while self.tagstack:
            if self.tagstack.pop() == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()
414
415
46d09f87 416# XXX: This should be far less strict
6f32a0b5
ZM
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)

    Raises compat_HTMLParseError if the opening or the matching closing tag
    cannot be found.
    """
    def find_or_raise(haystack, needle, exc):
        # Like str.index, but raises the supplied exception when not found
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    # Convert to an absolute index just past the '>' of the opening tag
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Feed the document one candidate closing tag at a time; the parser
        # raises once the closing tag that balances the opening tag is reached
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
450
451
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        super().__init__()
        # Attributes of the first start tag seen; empty until one is parsed
        self.attrs = {}

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        # Abort parsing: only the first start tag is of interest
        raise compat_HTMLParseError('done')
8bb56eee 462
c5229f39 463
class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        super().__init__()
        # One attribute dict per top-level <li>
        self.items = []
        # Nesting depth: start tags seen minus end tags seen
        self._level = 0

    def handle_starttag(self, tag, attrs):
        at_top_level = self._level == 0
        if at_top_level and tag == 'li':
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
479
480
8bb56eee
BF
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    except compat_HTMLParseError:
        # Raised by the parser itself to stop after the first start tag
        pass
    return parser.attrs
9e6dd238 500
c5229f39 501
73673ccf
FF
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list with one dictionary of attributes per element"""
    list_parser = HTMLListAttrsParser()
    list_parser.feed(webpage)
    list_parser.close()
    return list_parser.items
509
510
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    substitutions = (
        # Collapse every run of whitespace into a single space
        (r'\s+', ' '),
        # <br> and paragraph boundaries become newlines
        (r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n'),
        (r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n'),
        # Strip html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)
    # Replace html entities
    return unescapeHTML(html).strip()
9e6dd238
FV
525
526
class LenientJSONDecoder(json.JSONDecoder):
    """JSON decoder that can tolerate some malformed input

    @param transform_source  Callable applied to the input before decoding
    @param ignore_extra      Ignore leading whitespace and anything after
                             the first complete JSON value
    @param close_objects     How many unterminated objects/arrays to try to
                             close at the position where decoding failed
    Remaining arguments are passed to json.JSONDecoder.
    """
    # TODO: Write tests

    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        # Closing one object can take two attempts: one to add a comma (to
        # learn what kind of container is open) and one to close it
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        """Return the document truncated at the error and repaired, or None if it cannot be repaired"""
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        """Decode s; raises json.JSONDecodeError with a snippet of the offending text on failure"""
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                if attempt < self._close_attempts:
                    # Keep s intact when the repair fails, so that the error
                    # below can still be reported
                    repaired = self._close_object(e)
                    if repaired is not None:
                        s = repaired
                        continue
                # e.doc is the text that e.pos refers to. Clamp the start of
                # the slice so that it does not wrap around to the end
                snippet = e.doc[max(0, e.pos - 10):e.pos + 10]
                raise type(e)(f'{e.msg} in {snippet!r}', e.doc, e.pos)
        assert False, 'Too many attempts to decode JSON'
b7c47b74 565
566
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # First attempt uses the name as given, the second the sanitized name
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    # NOTE(review): presumably an OSError subclass, so that the
                    # fallback below opens the file without a lock - confirm
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking failed or is unsupported; open without a lock
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Give up on the second attempt or on permission errors
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                # Sanitizing changed nothing, so retrying cannot help
                raise
d77c3dfd
FV
604
605
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None if the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 613
5f6a1245 614
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Substitute characters are prefixed with '\0' so that they can be told
        # apart from characters of the input; the markers are removed below
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the markers; fall back to '_' if nothing is left
    result = result.replace('\0', '') or '_'

    # Only runs when is_id was explicitly passed as a false value: the
    # NO_DEFAULT class itself is truthy
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
d77c3dfd 668
5f6a1245 669
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows

    On other platforms the path is returned unchanged unless force is set.
    """
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the first component left by the separator after the drive
        norm_path.pop(0)
    # In every component except '.' and '..', replace the characters
    # / < > : " | \ ? * and a trailing whitespace or dot with '#'
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Keep absolute paths absolute
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
691
692
def sanitize_url(url, *, scheme='http'):
    """Add a scheme to protocol-relative URLs and fix known scheme typos

    Returns None for None, and other URLs unchanged.
    """
    if url is None:
        return None
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    typo_fixes = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for pattern, replacement in typo_fixes:
        fixed, count = re.subn(pattern, replacement, url)
        if count:
            return fixed
    return url
17bcc626
S
711
712
def extract_basic_auth(url):
    """Split the credentials embedded in url from the rest of it

    Returns the tuple (url_without_credentials, authorization), where
    authorization is the value for a `Basic` Authorization header, or None
    if url has no username part (url is then returned unchanged).
    """
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None

    host = parts.hostname or ''
    if ':' in host:
        # hostname has the brackets of IPv6 literals stripped; they are
        # needed again to tell the address apart from the port
        host = f'[{host}]'
    netloc = host if parts.port is None else '%s:%d' % (host, parts.port)
    url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))

    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
5435dcf9
HH
723
724
def expand_path(s):
    """Expand shell variables and ~"""
    # ~ is expanded first, then the environment variables in the result
    return os.path.expandvars(compat_expanduser(s))
728
729
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable, keeping first occurrences in order

    Returns a list, or a generator if lazy is set.
    """
    def unique_items():
        emitted = []  # Do not use set since the items can be unhashable
        for item in iterable:
            if item in emitted:
                continue
            emitted.append(item)
            yield item

    deduplicated = unique_items()
    return deduplicated if lazy else list(deduplicated)
d77c3dfd 740
912b38b4 741
55b2f099 742def _htmlentity_transform(entity_with_semicolon):
4e408e47 743 """Transforms an HTML entity to a character."""
55b2f099
YCH
744 entity = entity_with_semicolon[:-1]
745
4e408e47 746 # Known non-numeric HTML entity
ac668111 747 if entity in html.entities.name2codepoint:
748 return chr(html.entities.name2codepoint[entity])
4e408e47 749
62b58c09
L
750 # TODO: HTML5 allows entities without a semicolon.
751 # E.g. '&Eacuteric' should be decoded as 'Éric'.
ac668111 752 if entity_with_semicolon in html.entities.html5:
753 return html.entities.html5[entity_with_semicolon]
55b2f099 754
91757b0f 755 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
4e408e47
PH
756 if mobj is not None:
757 numstr = mobj.group(1)
28e614de 758 if numstr.startswith('x'):
4e408e47 759 base = 16
28e614de 760 numstr = '0%s' % numstr
4e408e47
PH
761 else:
762 base = 10
067aa17e 763 # See https://github.com/ytdl-org/youtube-dl/issues/7518
19a03940 764 with contextlib.suppress(ValueError):
ac668111 765 return chr(int(numstr, base))
4e408e47
PH
766
767 # Unknown entity in name, return its literal representation
7a3f0c00 768 return '&%s;' % entity
4e408e47
PH
769
770
def unescapeHTML(s):
    """Replace the HTML entities in s with the characters they stand for

    Returns None for None. Unknown entities are left as they are.
    """
    if s is None:
        return None
    assert isinstance(s, str)

    def decode_entity(match):
        # Group 1 is the entity without '&' but with the trailing ';'
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^&;]+;)', decode_entity, s)
d77c3dfd 778
8bf48f23 779
def escapeHTML(text):
    """Escape the characters & < > " ' in text as HTML entities"""
    # '&' must come first so that the entities added afterwards are not escaped again
    replacements = (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    )
    for char, entity in replacements:
        text = text.replace(char, entity)
    return text
789
790
db3ad8a6
ND
class netrc_from_content(netrc.netrc):
    """netrc.netrc that is parsed from a string instead of being read from a file"""

    def __init__(self, content):
        self.hosts = {}
        self.macros = {}
        # '-' is passed as the file name
        with io.StringIO(content) as content_stream:
            self._parse('-', content_stream, False)
796
797
class Popen(subprocess.Popen):
    """subprocess.Popen with adjusted defaults and a few convenience helpers"""

    # On Windows every process is started with the STARTF_USESHOWWINDOW flag set
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573

        env is modified in place. Nothing is done unless sys._MEIPASS exists.
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            # Restore the value saved in <key>_ORIG, or remove the variable
            # if there is no saved value
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        """
        @param env   Environment of the process; defaults to a copy of os.environ
        @param text  If True, use text mode with the encoding defaulting to
                     utf-8 and the error handler defaulting to 'replace'
        Remaining arguments are passed to subprocess.Popen.
        """
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Remember whether the pipes carry str or bytes; used by run()
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        """Like communicate(), but kill the process and wait for it if anything is raised"""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        """Kill the process; unless timeout is 0, also wait for it with that timeout"""
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run a process to completion and return (stdout, stderr, returncode)

        Output that was not captured is returned as an empty str or bytes,
        depending on whether text mode is in use.
        """
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
f0c9fb96 854
d3c93ec2 855
def encodeArgument(s):
    """Return s as str, decoding it as ASCII if it is not a str already"""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
f07b74fc
PH
861
862
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a duration in milliseconds into hours, minutes, seconds and milliseconds"""
    total_seconds, milliseconds = divmod(msec, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return _timetuple(hours=hours, minutes=minutes, seconds=seconds, milliseconds=milliseconds)
871
872
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as [H:][MM:]SS, omitting leading zero fields

    @param delim  Separator between the fields
    @param msec   Append the milliseconds as '.mmm'
    """
    parts = timetuple_from_msec(secs * 1000)
    if parts.hours:
        ret = '%d%s%02d%s%02d' % (parts.hours, delim, parts.minutes, delim, parts.seconds)
    elif parts.minutes:
        ret = '%d%s%02d' % (parts.minutes, delim, parts.seconds)
    else:
        ret = '%d' % parts.seconds

    if not msec:
        return ret
    return '%s.%03d' % (ret, parts.milliseconds)
4539dd30 882
a0ddb8a2 883
def bug_reports_message(before=';'):
    """Return the standard request to report a bug, to be appended to a message

    @param before  Text that precedes the request. If it is blank or ends a
                   sentence, the request starts with a capital letter
    """
    from ..update import REPOSITORY

    report = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
              'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    prefix = before.rstrip()
    starts_sentence = not prefix or prefix.endswith(('.', '!', '?'))
    if starts_sentence:
        report = report[0].title() + report[1:]

    if not prefix:
        return report
    return prefix + ' ' + report
08f2a92c
JMF
895
896
bf5b9d85
PM
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Subclasses may set a default message here
    msg = None

    def __init__(self, msg=None):
        if msg is None:
            # Fall back to the class name if the class has no default message
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
bf5b9d85
PM
907
908
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        from ..networking.exceptions import network_exceptions

        # An error raised while a network exception is being handled is always "expected"
        active_exc_type = sys.exc_info()[0]
        if active_exc_type in network_exceptions:
            expected = True

        # NB: Every assignment below passes through __setattr__
        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        handled = self.exc_info[1]
        if isinstance(handled, ExtractorError):
            self.exc_info = handled.exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # "[ie] video_id: orig_msg (caused by ...)" followed by the bug report hint unless expected
        parts = [
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message(),
        ]
        return ''.join(parts)

    def format_traceback(self):
        """Return the formatted traceback(s) of `traceback` and `cause`, or None if there are none"""
        own_tb = self.traceback and ''.join(traceback.format_tb(self.traceback))
        cause_tb = self.cause and ''.join(
            traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:])
        return join_nonempty(own_tb, cause_tb, delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        if name in ('msg', 'args'):
            return
        if not getattr(self, 'msg', None):
            return
        # Once a message exists, keep `msg` and `args` in sync with the changed attribute
        self.msg = self.__msg or type(self).__name__
        self.args = (self.msg, )  # Cannot be property
951
1c256f70 952
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected extractor error for a URL that is not supported; the URL is kept in `url`"""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
958
959
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
963
964
773f291d
S
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Always an expected error, whatever the caller passed
        super().__init__(msg, **{**kwargs, 'expected': True})
        self.countries = countries
976
977
class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        # Always an expected error, whatever the caller passed
        message = msg or 'The channel is not currently live'
        super().__init__(message, **{**kwargs, 'expected': True})
984
985
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """
        @param msg       The error message
        @param exc_info  The original exception that caused the trouble
                         (as returned by sys.exc_info()), if any
        """
        super().__init__(msg)
        self.exc_info = exc_info
d77c3dfd
FV
998
999
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    Thrown by YoutubeDL when a requested entry
    cannot be found in the playlist info_dict
    """
    msg = 'Entry not found in info'
498f5606 1007
1008
class SameFileError(YoutubeDLError):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            # Extends the message on this instance only; the class default is untouched
            self.msg = f'{self.msg}: {filename}'
        super().__init__(self.msg)
d77c3dfd
FV
1021
1022
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    May be raised by the .run() method of a PostProcessor
    to signal an error in the postprocessing task.
    """
5f6a1245 1029
5f6a1245 1030
class DownloadCancelled(YoutubeDLError):
    """ Raised when the download queue should be interrupted """
    msg = 'The download was cancelled'
8b0d7497 1034
8b0d7497 1035
class ExistingVideoReached(DownloadCancelled):
    """ Raised when --break-on-existing is triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1039
48f79687 1040
class RejectedVideoReached(DownloadCancelled):
    """ Raised when --break-match-filter is triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
51d9739f 1044
1045
class MaxDownloadsReached(DownloadCancelled):
    """ Raised when the --max-downloads limit has been reached """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1049
1050
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        # Stored as given; not interpreted by this class
        self.expected = expected
1057
1058
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Fixed message; never marked as expected
        super().__init__(self.msg, expected=False)
f2ebc5c7 1065
d77c3dfd 1066
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            # Extends the message on this instance only; the class default is untouched
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
d77c3dfd
FV
1079
1080
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    May be raised by FileDownloader objects when a downloaded file
    is smaller than what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        message = f'Downloaded {downloaded} bytes, expected {expected} bytes'
        super().__init__(message)
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 1094
5f6a1245 1095
class XAttrMetadataError(YoutubeDLError):
    """
    Error with an xattr failure classified into `reason`:
    'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Classify by errno first, then by well-known fragments of the message
        out_of_space = (
            self.code in (errno.ENOSPC, errno.EDQUOT)
            or 'No space left' in self.msg
            or 'Disk quota exceeded' in self.msg)
        if out_of_space:
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1110
1111
class XAttrUnavailableError(YoutubeDLError):
    """Error type carrying no behaviour beyond YoutubeDLError"""
1114
1115
def is_path_like(f):
    """Whether `f` is a `str`, `bytes` or `os.PathLike` object"""
    path_types = (str, bytes, os.PathLike)
    return isinstance(f, path_types)
1118
1119
46f59e89
S
def extract_timezone(date_str):
    """
    Split a trailing timezone off a date string

    A numeric offset (`Z`, `+hh:mm`, `-hhmm`, ...) is tried first,
    then a trailing name that is looked up in TIMEZONE_NAMES.

    @returns  (timezone, date_str): the offset as a `datetime.timedelta`
              (zero if none was recognized) and the string with the
              recognized timezone removed
    """
    offset_match = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)

    if offset_match:
        date_str = date_str[:-len(offset_match.group('tz'))]
        if not offset_match.group('sign'):
            # Bare "Z"
            return datetime.timedelta(), date_str
        sign = 1 if offset_match.group('sign') == '+' else -1
        timezone = datetime.timedelta(
            hours=sign * int(offset_match.group('hours')),
            minutes=sign * int(offset_match.group('minutes')))
        return timezone, date_str

    # Fall back to a timezone name following the time
    name_match = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    hours = TIMEZONE_NAMES.get(name_match and name_match.group('tz').strip())
    if hours is not None:
        # Only strip names that are actually known
        date_str = date_str[:-len(name_match.group('tz'))]
    return datetime.timedelta(hours=hours or 0), date_str
1148
1149
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """
    Return a UNIX timestamp from the given date

    @param delimiter  Separator between the date and the time part
    @param timezone   UTC offset as a `datetime.timedelta`. If None, it is
                      extracted from `date_str`
    @returns          The timestamp, or None if `date_str` is None or cannot be parsed
    """
    if date_str is None:
        return None

    # Fractional seconds are not supported by the format below, so drop them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
    try:
        utc_dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(utc_dt.timetuple())
    except ValueError:
        return None
912b38b4
PH
1165
1166
46f59e89
S
def date_formats(day_first=True):
    """Return the day-first or the month-first collection of date formats"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1169
1170
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    upload_date = None
    # NB: All formats are tried; a later matching format overrides an earlier one
    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if upload_date is None:
        return None
    return str(upload_date)
bf50b038 1193
5f6a1245 1194
def unified_timestamp(date_str, day_first=True):
    """
    Return a UNIX timestamp parsed from a date string in one of many formats

    @param date_str   The date string. Anything that is not a `str` gives None
    @param day_first  Whether ambiguous dates are read as day-first (see `date_formats`)
    @returns          The timestamp, or None if the string cannot be parsed
    """
    if not isinstance(date_str, str):
        return None

    # Drop separators and weekday names, then collapse whitespace
    # ("sun" is included so that Sunday is handled like every other weekday)
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?|sun)(day)?', '', date_str))

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    # The first format that parses wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Last resort: RFC 2822 style dates
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
46f59e89
S
1226
1227
def determine_ext(url, default_ext='unknown_video'):
    """
    Guess the file extension from a URL

    @returns  The text after the last "." of the part before "?" if it is
              alphanumeric or (ignoring trailing slashes) a known extension,
              `default_ext` otherwise
    """
    if url is None or '.' not in url:
        return default_ext

    without_query = url.partition('?')[0]
    guess = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = guess.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
73e79f2a 1239
5f6a1245 1240
824fa511
S
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return `filename` with its extension replaced by `<sub_lang>.<sub_format>`"""
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
d4051a8e 1243
5f6a1245 1244
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'

    # Naive UTC "now". `datetime.utcnow()` is deprecated since Python 3.12,
    # so take an aware UTC datetime and drop its tzinfo instead
    utc_now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    today = datetime_round(utc_now, precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        # The part before the offset is itself resolved recursively
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            # Months and years have no fixed length, so they cannot be a timedelta
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1285
1286
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    @raises        ValueError if `strict` is set and `date_str` is not an allowed pattern
    """
    if strict:
        allowed = re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str)
        if not allowed:
            raise ValueError(f'Invalid date format "{date_str}"')
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
1297
1298
def datetime_add_months(dt, months):
    """
    Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the resulting month.
    """
    # Work with a zero-based month index so that divmod yields the year carry
    year_carry, month_index = divmod(dt.month - 1 + months, 12)
    year, month = dt.year + year_carry, month_index + 1
    last_day = calendar.monthrange(year, month)[1]
    return dt.replace(year, month, min(dt.day, last_day))
1306
1307
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    @param precision  microsecond|second|minute|hour|day
    @returns          `dt` itself for 'microsecond'. Otherwise a naive datetime
                      rounded to the nearest unit (halves round up)
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    roundto = lambda x, n: ((x + n / 2) // n) * n
    # The fields of `dt` are read as UTC; timetuple() carries no microseconds
    timestamp = calendar.timegm(dt.timetuple())
    rounded = roundto(timestamp, unit_seconds[precision])
    # `datetime.utcfromtimestamp()` is deprecated since Python 3.12,
    # so build an aware UTC datetime and drop its tzinfo instead
    return datetime.datetime.fromtimestamp(rounded, datetime.timezone.utc).replace(tzinfo=None)
5f6a1245
JW
1324
1325
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not in 'YYYYMMDD' format are returned unchanged
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    return '-'.join(match.groups())
1334
5f6a1245 1335
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """
        start and end must be strings in the format accepted by date_from_str
        in strict mode. A missing bound is the earliest/latest representable date
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start, strict=True))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end, strict=True))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed (non-strictly) first
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __eq__(self, other):
        if not isinstance(other, DateRange):
            return False
        return self.start == other.start and self.end == other.end
1369
c496ca96 1370
@functools.cache
def system_identifier():
    """Return a one-line description of the Python runtime, platform, OpenSSL and libc (cached)"""
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    libc_ver = []
    try:
        libc_ver = platform.libc_ver()
    except OSError:  # We may not have access to the executable
        pass

    fields = (
        platform.python_version(),
        implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
    return 'Python %s (%s %s %s) - %s (%s%s)' % fields
c257baff
PH
1389
1390
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
49fa4d9a
N
1398
1399
def write_string(s, out=None, encoding=None):
    """
    Write the string `s` to `out` (default: `sys.stderr`) and flush it

    If `out` is opened in binary mode or exposes a `buffer`, `s` is encoded
    (ignoring unencodable characters) and written as bytes; otherwise it is written as-is.
    """
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    if 'b' in getattr(out, 'mode', ''):
        target = out
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        target = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
    else:
        target, enc = out, None

    payload = s.encode(enc, 'ignore') if enc else s
    target.write(payload)
    out.flush()
1419
1420
# TODO: Use global logger
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """
    Report a deprecation

    In the CLI, each distinct message is reported only once, through `printer`
    if given and `write_string` otherwise (`kwargs` are forwarded to it).
    Outside the CLI, a `DeprecationWarning` is issued through `warnings`.
    """
    from .. import _IN_CLI

    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return

    seen = deprecation_warning._cache
    if msg in seen:
        return
    seen.add(msg)

    if printer:
        return printer(f'{msg}{bug_reports_message()}', **kwargs)
    return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


# Messages that were already reported in the CLI
deprecation_warning._cache = set()
1437
1438
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the items of `bs` as a list of integers (`ord()` is applied to items that are not `int`)"""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        return [ord(char) for char in bs]
    return list(bs)  # Python 3
1446
c257baff 1447
def intlist_to_bytes(xs):
    """Pack a sequence of integers into `bytes`, one unsigned byte each"""
    if not xs:
        return b''
    count = len(xs)
    return struct.pack('%dB' % count, *xs)
c38b1e77
PH
1452
1453
class LockingUnsupportedError(OSError):
    """OSError with a fixed message, raised when file locking is not available"""
    msg = 'File locking is not supported'

    def __init__(self):
        message = self.msg
        super().__init__(message)
1459
1460
c1c9a79c
PH
# Cross-platform file locking
#
# Defines `_lock_file(f, exclusive, block)` and `_unlock_file(f)`:
#   * Windows: LockFileEx/UnlockFileEx through ctypes
#   * Elsewhere: fcntl.flock with fcntl.lockf as fallback
#   * Without fcntl: both raise LockingUnsupportedError
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high DWORDs of the number of bytes to lock, i.e. "the whole file"
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock the whole of file `f`; raises BlockingIOError if LockFileEx fails"""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object since unlocking needs the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # 0x2: LOCKFILE_EXCLUSIVE_LOCK, 0x1: LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Unlock file `f` previously locked with `_lock_file`; raises OSError if UnlockFileEx fails"""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError(ctypes.GetLastError()))

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock file `f` (exclusively or shared); without `block`, the lock is requested with LOCK_NB"""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # Already locked by someone else: do not retry with lockf
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Unlock file `f`, trying each unlocking method until one succeeds"""
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
c1c9a79c
PH
1547
1548
class locked_file:
    """
    A file that is locked for as long as it is in use

    The file is opened on construction, but locked (and, for "w" modes,
    truncated) only on `__enter__`/`open`. Reading modes take a shared lock,
    all other modes an exclusive one. Unknown attributes and iteration are
    delegated to the underlying file object `f`.
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        """
        @param mode   One of r, rb, a, ab, w, wb
        @param block  Whether to wait for the lock if it is held by someone else
        @raises       NotImplementedError for any other mode
        """
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        fd = os.open(filename, flags, 0o666)
        try:
            self.f = os.fdopen(fd, mode, encoding=encoding)
        except BaseException:
            # Do not leak the descriptor when the file object cannot be created
            # (e.g. an encoding was given for a binary mode).
            # The descriptor may already be closed at this point, hence the suppress
            with contextlib.suppress(OSError):
                os.close(fd)
            raise

    def __enter__(self):
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncate only now that the lock is held
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    # __exit__ is not called when __enter__ fails,
                    # so release the lock and the file here
                    with contextlib.suppress(OSError):
                        self.__exit__()
                    raise
        return self

    def unlock(self):
        """Release the lock, if it is held. The file stays open"""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
a3125791 1612
4eb7f1d1 1613
@functools.cache
def get_filesystem_encoding():
    """Return `sys.getfilesystemencoding()`, or 'utf-8' if that is None (cached)"""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1618
1619
def shell_quote(args):
    """Quote each argument with `compat_shlex_quote` and join them with spaces"""
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join(compat_shlex_quote(as_text(a)) for a in args)
9d4660ca
PH
1629
1630
def smuggle_url(url, data):
    """
    Pass additional data in a URL for internal use.

    The data is JSON-encoded into the URL fragment. If `url` already carries
    smuggled data, it is merged in and takes precedence over `data`.

    @param data  A JSON-serializable dict. It is not modified
    @returns     The URL with the merged data smuggled into it
    """
    url, smuggled = unsmuggle_url(url, {})
    # Merge into a new dict so that the caller's `data` is left untouched
    merged = {**data, **smuggled}
    sdata = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    """
    Split a URL produced by `smuggle_url` into the plain URL and its data

    @returns  (url, data). For a URL without smuggled data, this is
              (smug_url, default)
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
02dbf93f
PH
1648
1649
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """
    Formats numbers with decimal suffixes like K, M, etc

    @param num     The number to format (converted with `float_or_none`)
    @param fmt     %-format taking the scaled number and the suffix
    @param factor  Ratio between consecutive suffixes. With 1024, binary suffixes (Ki, Mi, ...) are used
    @returns       The formatted string, or None if `num` is not a number or is negative
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        exponent = min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
        # Numbers below 1/factor give a negative exponent, which would
        # index the suffix list from its end. There are no sub-unit suffixes
        exponent = max(exponent, 0)
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
e0fd9573 1662
1663
def format_bytes(bytes):
    """Format a byte count with binary suffixes and two decimals, or 'N/A' if it cannot be formatted"""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
f53c966a 1666
1c088fa8 1667
def lookup_unit_table(unit_table, s, strict=False):
    """
    Parse "<number> <unit>" and scale the number by the unit's multiplier

    @param unit_table  Mapping of unit name to multiplier
    @param strict      Require the whole string to match and only "." as decimal
                       separator. Otherwise, only the start of the string has to
                       match and "," is accepted as decimal separator too
    @returns           The rounded result, or None if `s` does not match
    """
    if strict:
        num_re, matcher = NUMBER_RE, re.fullmatch
    else:
        num_re, matcher = NUMBER_RE.replace(R'\.', '[,.]'), re.match
    units_re = '|'.join(map(re.escape, unit_table))

    m = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not m:
        return None

    number = float(m.group('num').replace(',', '.'))
    multiplier = unit_table[m.group('unit')]
    return round(number * multiplier)
1679
1680
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    # '' -> 1, K -> 1024, M -> 1024**2, ...
    unit_table = {
        unit: 1024 ** power
        for power, unit in enumerate(['', *'KMGTPEZY'])
    }
    return lookup_unit_table(unit_table, s.upper(), strict=True)
fb47597b
S
1686
1687
be64b5b0
PH
def parse_filesize(s):
    """Parse a file size like "5.5 MiB" into a number of bytes. Returns None if `s` is None or unparsable"""
    if s is None:
        return None

    # (symbol, decimal name, binary name) in increasing order of magnitude
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    for power, (symbol, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = symbol.lower()
        _UNIT_TABLE.update({
            f'{symbol}iB': binary,
            f'{symbol}B': decimal,
            f'{lower}B': binary,
            f'{symbol}b': decimal,
            f'{lower}b': decimal,
            f'{decimal_name}bytes': decimal,
            f'{binary_name}bytes': binary,
        })

    return lookup_unit_table(_UNIT_TABLE, s)
1757
1758
def parse_count(s):
    """Parse a count like "1,234", "1.5k" or "3M views" into an integer. Returns None if `s` is None or unparsable"""
    if s is None:
        return None

    # Drop a leading non-numeric label
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    # Plain number
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    # Number with a unit
    scaled = lookup_unit_table(_UNIT_TABLE, s)
    if scaled is not None:
        return scaled

    # Number followed by something else
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    if not mobj:
        return None
    return str_to_int(mobj.group(1))
be64b5b0 1786
2f7ae819 1787
def parse_resolution(s, *, lenient=False):
    """Extract width/height information from a string.

    Returns a dict with 'width' and 'height' for "WxH"-like input, a dict
    with only 'height' for "720p"/"1080i"/"4k"-like input, and an empty
    dict when nothing matches or `s` is None.

    With lenient=True the "WxH" pattern may be adjacent to other
    alphanumeric characters.
    """
    if s is None:
        return {}

    dimensions_re = (
        r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
        else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    dimensions = re.search(dimensions_re, s)
    if dimensions:
        return {
            'width': int(dimensions.group('w')),
            'height': int(dimensions.group('h')),
        }

    scan_lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scan_lines:
        return {'height': int(scan_lines.group(1))}

    # "4k"/"8k" => 2160/4320 lines
    k_label = re.search(r'\b([48])[kK]\b', s)
    if k_label:
        return {'height': int(k_label.group(1)) * 540}

    return {}
1811
1812
def parse_bitrate(s):
    """Return the integer preceding "kbps" in `s`, or None if absent or `s` is not a str."""
    if isinstance(s, str):
        found = re.search(r'\b(\d+)\s*kbps', s)
        if found:
            return int(found.group(1))
    return None
1819
1820
def month_by_name(name, lang='en'):
    """ Return the number (1-12) of a month given its full name, or None if unknown.

    Names are looked up in MONTH_NAMES[lang]; languages missing from the
    table fall back to the 'en' list. """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name in names:
        return names.index(name) + 1
    return None
1830
1831
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations; None when the abbreviation is not recognised """
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    if abbrev in abbreviations:
        return abbreviations.index(abbrev) + 1
    return None
18258362
JMF
1840
1841
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start a known entity by '&amp;' in XML"""
    # Ampersands introducing amp/lt/gt/apos/quot or a numeric reference are kept
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
e3946f98
PH
1848
1849
def setproctitle(title):
    """Best-effort attempt to set the process name via prctl() from libc.so.6.

    Silently does nothing when ctypes cannot be imported, when 'libc.so.6'
    cannot be loaded, or when the loaded libc does not expose prctl.

    @param title  str; an AssertionError is raised for any other type
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return

    # Initialising the buffer from the bytes themselves allocates
    # len(title) + 1 bytes, so the string handed to prctl is always
    # NUL-terminated. A buffer of exactly len(title) bytes has no terminator
    # and lets prctl read past its end for short titles.
    buf = ctypes.create_string_buffer(title.encode())
    PR_SET_NAME = 15
    try:
        libc.prctl(PR_SET_NAME, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1875
1876
def remove_start(s, start):
    """Return `s` without the prefix `start`; `s` is returned unchanged (None included) otherwise."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
29eb5174
PH
1879
1880
def remove_end(s, end):
    """Return `s` without the suffix `end`; `s` is returned unchanged (None included) otherwise.

    An empty `end` leaves `s` untouched: `s[:-0]` would otherwise evaluate
    to `s[:0]` and wipe the whole string.
    """
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
2b9faf55
PH
1883
1884
31b2051e
S
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding `s`, if present."""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1892
1893
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"

    Returns the network location of `url` with a leading 'www.' removed,
    or None when the URL has no network location.
    """
    netloc = urllib.parse.urlparse(url).netloc
    if netloc.startswith('www.'):
        netloc = netloc[len('www.'):]
    return netloc or None
b6e0c7d2
U
1900
1901
def url_basename(url):
    """Return the last segment of the path of `url`, ignoring surrounding slashes."""
    segments = urllib.parse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1905
1906
def base_url(url):
    """Return the http(s) `url` up to and including the last '/' before any query or fragment.

    Raises AttributeError when `url` does not match.
    """
    matched = re.match(r'https?://[^?#]+/', url)
    return matched.group()
02dc0a36
S
1909
1910
def urljoin(base, path):
    """Join `path` onto `base`, tolerating bytes and invalid input.

    @param base  str or bytes; must start with "http(s)://" or "//" unless
                 `path` is already absolute
    @param path  str or bytes
    @returns     `path` itself when it is absolute or protocol-relative,
                 the joined URL otherwise, or None for unusable input
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, str) or not path:
        return None
    # Scheme characters per RFC 3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
    # The hyphen is escaped so that "+-." is not read as a character range,
    # which would also admit ",".
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+\-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
e34c3361
S
1924
1925
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert `v` to int and rescale it as `int(v) * invscale // scale`.

    If `get_attr` is given and `v` is not None, the attribute of that name
    is read from `v` first. `default` is returned when conversion fails.
    """
    if v is not None and get_attr:
        v = getattr(v, get_attr, None)
    try:
        converted = int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
    return converted
9732d77e 1933
9572013d 1934
def str_or_none(v, default=None):
    """Return str(v), or `default` when `v` is None."""
    if v is None:
        return default
    return str(v)
40a90862 1937
9732d77e
PH
1938
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    ints are returned as-is; ',', '.' and '+' are removed from strings
    before the conversion is delegated to int_or_none. """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        int_str = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(int_str)
608d11f5
PH
1946
1947
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert `v` to float and rescale it as `float(v) * invscale / scale`.

    `default` is returned when `v` is None or cannot be converted. That
    includes values too large for a float, matching how int_or_none treats
    OverflowError.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError, OverflowError):
        return default
43f775e4
PH
1955
1956
c7e327c4
S
def bool_or_none(v, default=None):
    """Return `v` if it is a bool, `default` otherwise."""
    if isinstance(v, bool):
        return v
    return default
1959
1960
def strip_or_none(v, default=None):
    """Return `v` stripped of surrounding whitespace if it is a str, `default` otherwise."""
    if isinstance(v, str):
        return v.strip()
    return default
b72b4431
S
1963
1964
def url_or_none(url):
    """Return the stripped `url` if it looks like a supported URL, None otherwise.

    Accepted are protocol-relative URLs ("//...") and the http(s), rtmp*,
    rtsp*, mms and ftp(s) schemes.
    """
    if not (url and isinstance(url, str)):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
af03000a
S
1970
1971
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """Format a timestamp with `date_format`, returning `default` on failure.

    @param timestamp    int/float unix timestamp, or a 'YYYYMMDD' str
    @param date_format  strftime format; "%s" is replaced by the integer
                        unix timestamp before formatting
    @param default      returned for unsupported, malformed or out-of-range input
    """
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
                               + datetime.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        date_format = re.sub(  # Support %s on windows
            r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
        return datetime_object.strftime(date_format)
    # OverflowError: timedelta and date arithmetic raise it for timestamps
    # beyond the representable range
    except (ValueError, TypeError, AttributeError, OverflowError):
        return default
1989
1990
def parse_duration(s):
    """Parse a duration string into a number of seconds (float).

    Three formats are tried in order: clock style ("[[D:]H:]M:S[.ms]"),
    unit style ("1h 2m 3s", optionally ISO-8601-like with P/T markers), and
    a single fractional amount of hours or minutes ("1.5 hours").
    Returns None for non-str, empty or unrecognised input.
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None

    clock = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if clock:
        days, hours, mins, secs, ms = clock.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # Years, months and weeks are accepted by the pattern but not captured
        units = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if units:
            days, hours, mins, secs, ms = units.groups()
        else:
            fractional = re.match(
                r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not fractional:
                return None
            hours, mins = fractional.groups()

    if ms:
        # The clock format allows ':' as the fraction separator
        ms = ms.replace(':', '.')
    weighted_parts = ((days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1))
    return sum(float(part or 0) * mult for part, mult in weighted_parts)
91d7d0b3
JMF
2045
2046
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the real extension of `filename`.

    When `expected_real_ext` is given and does not equal the actual
    extension, `ext` is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{name}.{ext}{real_ext}'
d70ad093
PH
2053
2054
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of `filename` with `ext`.

    When `expected_real_ext` is given and does not equal the actual
    extension, `ext` is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{}.{}'.format(stem, ext)
2060
2061
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False when running the binary raises OSError. """
    command = [exe] + args
    try:
        Popen.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    else:
        return exe
b7ab0590
PH
2070
2071
def _get_exe_version_output(exe, args):
    """Run `exe` with `args` and return its combined stdout/stderr text.

    Returns None when the process exits with a non-zero code and False
    when running it raises OSError.
    """
    command = [encodeArgument(exe)] + args
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, returncode = Popen.run(
            command, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    if returncode:
        return None
    return stdout
cae97f65
PH
2084
2085
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the `output` of an executable.

    `version_re` must expose the version as group 1 and defaults to matching
    "version <something>". `unrecognized` is returned when nothing matches.
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2095
2096
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    `unrecognized` holds one or two fallback values: the first is used when
    the output contains no recognisable version, the last when the
    executable exits with an error. """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        return unrecognized[-1]
    if not output:
        # Falsy output (False or empty) is passed through untouched
        return output
    return detect_exe_version(output, version_re, unrecognized[0])
9af98e17 2107
2108
def frange(start=0, stop=None, step=1):
    """Float range

    Like range(), but accepts non-integer bounds and steps.
    A zero step yields nothing."""
    if stop is None:
        start, stop = 0, start
    if not step:
        direction = 0
    elif step > 0:
        direction = 1
    else:
        direction = -1
    current = start
    while direction * current < direction * stop:
        yield current
        current += step
2117
2118
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList

    Items are pulled from the iterable only as far as needed and are kept
    in a cache. Negative indices, open-ended slices, len(), repr() and
    reversed iteration consume the whole iterable."""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        self._cache = _cache if _cache is not None else []
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self._cache
        for element in self._iterable:
            self._cache.append(element)
            yield element

    def _exhaust(self):
        """Pull everything into the cache and return the cache itself"""
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        everything = self._exhaust()
        return everything[::-1] if self._reversed else everything[::1]

    @staticmethod
    def _reverse_index(x):
        # ~x maps index i from the front to the same position from the back
        return None if x is None else ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')

        needs_everything = (
            (start or 0) < 0 or (stop or 0) < 0
            or (start is None and step < 0)
            or (stop is None and step > 0))
        if needs_everything:
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
        else:
            missing = max(start or 0, stop or 0) - len(self._cache) + 1
            if missing > 0:
                self._cache.extend(itertools.islice(self._iterable, missing))

        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2206
483336e7 2207
class PagedList:
    """Base class for lists whose items are fetched page by page.

    `pagefunc(pagenum)` must return an iterable with the items of that page.
    Subclasses implement `_getslice`.
    """

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the items of page `pagenum` as a list, from the cache if possible"""
        cached = self._cache.get(pagenum)
        if cached is not None:
            return cached
        # Pages beyond the known page count are empty and never requested
        if pagenum > self._pagecount:
            page_results = []
        else:
            page_results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        found = self.getslice(idx, idx + 1)
        if not found:
            raise self.IndexError()
        return found[0]
55575225 2246
9c44d242
PH
2247
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        pagesize = self._pagesize
        for pagenum in itertools.count(start // pagesize):
            firstid = pagenum * pagesize
            nextfirstid = pagenum * pagesize + pagesize
            if start >= nextfirstid:
                continue

            # Offsets of the requested range within this page
            startv = start % pagesize if firstid <= start < nextfirstid else 0
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % pagesize) + 1
            else:
                endv = None

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember the last page that could be fetched
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
81c2f20b
PH
2287
2288
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        # Caching is always enabled for this kind of list
        super().__init__(pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        first_page = start // self._pagesize
        if end is None:
            stop_page = self._pagecount
        else:
            stop_page = min(self._pagecount, end // self._pagesize + 1)
        # Items to drop from the front of the first page
        skip_elems = start - first_page * self._pagesize
        # Number of items still to be yielded; None means no limit
        only_more = None if end is None else end - start
        for pagenum in range(first_page, stop_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break
            yield from page_results
9c44d242
PH
2313
2314
class PlaylistEntries:
    """Access to the 'entries' of a playlist info dict.

    Indexing and slicing use 1-based positions and produce
    (position, entry) pairs.
    """
    MissingEntry = object()
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        if isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Place each given entry at its requested position; positions
            # that were not requested stay MissingEntry
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for position, entry in zip(requested_entries, entries):
                self._entries[position - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int or a slice for every comma-separated item of `string`

        Raises ValueError for empty items, malformed items and zero steps."""
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (position, entry) for the items selected by the ydl params"""
        params = self.ydl.params
        playlist_items = params.get('playlist_items')
        playlist_start = params.get('playliststart', 1)
        playlist_end = params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Return the total number of entries if it is known, None otherwise"""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        if isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        """Function fetching the entry at a 0-based index.

        It raises self.IndexError past the end of the entries."""
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = idx.step if idx.step is not None else 1
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        elif idx.start >= 0:
            start = idx.start - 1
        else:
            start = len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        elif idx.stop >= 0:
            stop = idx.stop - 1
        else:
            stop = len(self) + idx.stop
        # Make the end position inclusive in the direction of travel
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2449
2450
def uppercase_escape(s):
    r"""Decode every "\UXXXXXXXX" escape sequence found in `s`."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
0fe2ff78
YCH
2457
2458
def lowercase_escape(s):
    r"""Decode every "\uXXXX" escape sequence found in `s`."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
b53466e1 2465
d05cfe06 2466
def parse_qs(url, **kwargs):
    """Parse the query string of `url`; kwargs are forwarded to urllib.parse.parse_qs."""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
4dfbf869 2469
2470
62e609ab
PH
def read_batch_urls(batch_fd):
    """Read the URLs listed, one per line, in the file-like object `batch_fd`.

    Blank lines and lines starting with '#', ';' or ']' are skipped, a
    leading BOM is removed and trailing " #..." comments are cut off.
    `batch_fd` is closed afterwards.

    @returns list of str
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # maxsplit is passed by keyword: positional use is deprecated since Python 3.13
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2488
2489
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments with urllib.parse.urlencode and return ASCII bytes."""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
2492
2493
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
       @param url           str or parse url tuple
       @param query_update  update query
       @returns             str
    """
    if isinstance(url, str):
        if not (kwargs or query_update):
            # Nothing to change
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        # Keys in query_update override the ones already in the URL
        merged_query = {
            **urllib.parse.parse_qs(url.query),
            **query_update
        }
        kwargs['query'] = urllib.parse.urlencode(merged_query, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
2512
2513
def update_url_query(url, query):
    """Return `url` with its query parameters updated from `query` (see update_url)."""
    updated = update_url(url, query_update=query)
    return updated
16392824 2516
8e60dc75 2517
10c87c15 2518def _multipart_encode_impl(data, boundary):
0c265486
YCH
2519 content_type = 'multipart/form-data; boundary=%s' % boundary
2520
2521 out = b''
2522 for k, v in data.items():
2523 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 2524 if isinstance(k, str):
0f06bcd7 2525 k = k.encode()
14f25df2 2526 if isinstance(v, str):
0f06bcd7 2527 v = v.encode()
0c265486
YCH
2528 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2529 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 2530 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
2531 if boundary.encode('ascii') in content:
2532 raise ValueError('Boundary overlaps with data')
2533 out += content
2534
2535 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2536
2537 return out, content_type
2538
2539
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple of the encoded body and the Content-Type value.
    A ValueError is raised when a caller-specified boundary occurs in the data.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A caller-specified boundary is never replaced; a collision propagates
        return _multipart_encode_impl(data, boundary)

    # Keep drawing random boundaries until one does not collide with the data
    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            continue
2568
2569
b079c26f
SS
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Tell whether `x` is an instance of `allowed_types` but not of `blocked_types`.

    By default str, bytes and mappings are blocked.
    """
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if isinstance(x, allowed_types):
        return not isinstance(x, blocked_types)
    return False
2574
2575
def variadic(x, allowed_types=NO_DEFAULT):
    """Return `x` unchanged if is_iterable_like accepts it, otherwise wrap it in a 1-tuple.

    `allowed_types` is handed to is_iterable_like as its `blocked_types`.
    Passing something other than a tuple or a type is deprecated.
    """
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
304ad45a 2581
2582
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each of `funcs` with `args`/`kwargs` and return the first usable result.

    A call raising AttributeError, KeyError, TypeError, IndexError,
    ValueError or ZeroDivisionError is skipped, as is a result that is not
    an instance of `expected_type` (when given). Returns None if nothing
    qualifies.
    """
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
    return None
2592
2593
def try_get(src, getter, expected_type=None):
    """Apply one getter, or several in turn, to `src` via try_call and return the first usable result."""
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
329ca3be
S
2596
2597
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of `dct` for which `cndn(key, value)` is true.

    By default items whose value is None are dropped.
    """
    kept = {}
    for key, value in dct.items():
        if cndn(key, value):
            kept[key] = value
    return kept
2600
2601
6cc62232
S
def merge_dicts(*dicts):
    """Merge dicts, with earlier dicts taking precedence.

    None values never create a key, and an empty-string value already
    merged may be overwritten by a later str value.
    """
    merged = {}
    for a_dict in dicts:
        for key, value in a_dict.items():
            if key not in merged:
                if value is not None:
                    merged[key] = value
            elif isinstance(value, str) and merged[key] == '':
                merged[key] = value
    return merged
2610
2611
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as str, decoding it with `encoding`/`errors` if it is not one already."""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
8e60dc75 2614
16392824 2615
a1a530b0
PH
# Age limit associated with each US rating label (see parse_age_limit)
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
2623
2624
# Age limit associated with each "TV-*" label (see parse_age_limit)
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2633
2634
def parse_age_limit(s):
    """Convert an age rating into a numeric age limit, or None if unrecognised.

    Accepted are ints between 0 and 21, strings such as "18" or "18+",
    the keys of US_RATINGS and "TV-*" labels from TV_PARENTAL_GUIDELINES
    (case-insensitive).
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    label = s.upper()
    if label in US_RATINGS:
        return US_RATINGS[label]

    # "TV-14", "TV_14" and "TV14" are all accepted
    tv_label = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), label)
    if tv_label:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_label.group(1)]
    return None
146c80e2
S
2651
2652
def strip_jsonp(code):
    """Return the argument of a JSONP-style callback invocation.

    Handles an optional "window." prefix, the "cb && cb(...)" form, a
    trailing semicolon and trailing "//" comments. Input that does not
    match is returned unchanged.
    """
    jsonp_re = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_re, r'\g<callback_data>', code)
478c2c61
PH
2661
2662
def js_to_json(code, vars={}, *, strict=False):
    """Convert a JavaScript object/array literal into a JSON string.

    @param code    JavaScript source text
    @param vars    dict of var, val pairs to substitute for identifiers
    @param strict  when true, an unknown identifier raises ValueError
                   instead of being turned into a string
    @returns       str that is meant to be passed to json.loads
    """
    STRING_QUOTES = '\'"`'
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (pattern, base) pairs for hexadecimal and octal integer literals
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # Group 1 is a bare double quote, group 2 the character after a backslash
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        if escape in JSON_PASSTHROUGH_ESCAPES:
            return Rf'\{escape}'
        if escape == 'x':
            return R'\u00'
        if escape == '\n':
            return ''
        return escape

    def template_substitute(match):
        # Expand one ${...} placeholder of a template string
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v in ('undefined', 'void 0'):
            return 'null'
        # Comments, negations and trailing commas are dropped
        if v == ',' or v.startswith(('/*', '//', '!')):
            return ''

        if v[0] in STRING_QUOTES:
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # An integer used as a key must become a JSON string
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
e05f6939
PH
2741
2742
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps an id to its position in quality_ids,
    or to -1 if the id is not listed.
    """
    def q(qid):
        with contextlib.suppress(ValueError):
            return quality_ids.index(qid)
        return -1
    return q
2751
acd69589 2752
# NOTE(review): presumably the stages at which a postprocessor may be scheduled - confirm against callers
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Output template strings keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types.
# NOTE(review): the values look like fixed filename suffixes for each type
# (None meaning there is none) - confirm against callers
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# `{0}` is the placeholder for the mapping-key pattern and `{1}` for the
# conversion-type pattern; both have to be filled in before compiling
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# The conversion type characters of %-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
a020a0dc 2792
7d1eb38a 2793
a020a0dc
PH
def limit_length(s, length):
    """ Add ellipses to overly long strings

    None and strings of at most `length` characters are returned as-is.
    """
    ELLIPSES = '...'
    if s is None or len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
2802
2803
def version_tuple(v):
    """Split a version string on `.` and `-` into a tuple of ints.

    Raises ValueError if any component is not an integer.
    """
    return tuple(map(int, re.split(r'[-.]', v)))
48844745
PH
2806
2807
def is_outdated_version(version, limit, assume_new=True):
    """Return whether `version` is older than `limit`.

    A missing or unparsable version is treated as new when `assume_new`
    is true, and as outdated otherwise.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum
732ea2f0
PH
2815
2816
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported lazily, only when the function is actually called
    from ..update import is_non_updateable as _is_non_updateable

    updateable = not _is_non_updateable()
    return updateable
7d4111ed
PH
2823
2824
def args_to_str(args):
    """Get a short string representation for a subprocess command"""
    quoted_args = map(compat_shlex_quote, args)
    return ' '.join(quoted_args)
2ccd1b10
PH
2828
2829
def error_to_str(err):
    """Format an exception as `<ExceptionClassName>: <message>`"""
    return '{}: {}'.format(type(err).__name__, err)
2832
2833
def mimetype2ext(mt, default=NO_DEFAULT):
    """Map a MIME type (parameters after `;` are ignored) to a file extension.

    The full type, then the subtype, then the part of the subtype after the
    last `+` are looked up in a table of known types. If nothing matches,
    `default` is returned when given; otherwise the subtype with `+`
    replaced by `.` is used. A non-string `mt` yields `default`, or None
    when no default was given.
    """
    if not isinstance(mt, str):
        return None if default is NO_DEFAULT else default

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    mimetype = mt.partition(';')[0].strip().lower()
    subtype = mimetype.rpartition('/')[2]
    suffix = subtype.rsplit('+')[-1]

    ext = traversal.traverse_obj(MAP, mimetype, subtype, suffix)
    if ext:
        return ext
    if default is NO_DEFAULT:
        return subtype.replace('+', '.')
    return default
c460bdd5
PH
2922
2923
2814f12b
THD
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a bare extension or for a filename/URL.

    Returns None for empty input or when the type cannot be guessed.
    """
    if not ext_or_url:
        return None
    # A value without any dot is taken to be a bare extension
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mimetype, _ = mimetypes.guess_type(filename)
    return mimetype
2930
2931
def parse_codecs(codecs_str):
    """Split a comma-separated codecs string into video/audio/subtitle codecs.

    @returns    {} for empty input; otherwise a dict with 'vcodec', 'acodec'
                ('none' when absent) and 'dynamic_range', plus 'scodec' if a
                subtitle codec was found. If no codec is recognised but
                exactly two were given, they are taken as video and audio.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                    'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    split_codecs = [
        codec for codec in map(str.strip, codecs_str.strip().strip(',').split(','))
        if codec]

    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Leading zeros are dropped for matching only (e.g. `vp09.02` -> `vp9.2`)
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            if vcodec:
                # Only the first video codec is considered
                continue
            vcodec = full_codec
            if family in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif family == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            acodec = acodec or full_codec
        elif family in SUBTITLE_CODECS:
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if vcodec or acodec or scodec:
        result = {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
        if scodec is not None:
            result['scodec'] = scodec
        return result
    if len(split_codecs) == 2:
        guessed_vcodec, guessed_acodec = split_codecs
        return {
            'vcodec': guessed_vcodec,
            'acodec': guessed_acodec,
        }
    return {}
2972
2973
fc61aff4
LL
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Choose a container extension able to hold the given streams.

    Codecs are checked first, then the source extensions; 'mkv' (or the last
    preference when 'mkv' is not among the preferences) is the fallback.
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    has_multiple_streams = max(len(acodecs), len(vcodecs)) > 1
    if allow_mkv and has_multiple_streams:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    def sanitize_codec(codecs):
        # First codec only: keep the part before the first dot, drop zeros, lowercase
        return try_get(codecs, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())

    wanted_codecs = {sanitize_codec(vcodecs), sanitize_codec(acodecs)}

    for ext in preferences or COMPATIBLE_CODECS:
        if ext == 'mkv' or wanted_codecs <= COMPATIBLE_CODECS.get(ext, set()):
            return ext

    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext}:
            return ext
        if any(current_exts <= ext_set for ext_set in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
3013
3014
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Detect a file extension from the headers of a response.

    Tried in order: the filename of an `attachment` Content-Disposition,
    the `x-amz-meta-name` header and finally the Content-Type.
    """
    headers = url_handle.headers

    content_disposition = headers.get('Content-Disposition')
    if content_disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', content_disposition)
        ext = mobj and determine_ext(mobj.group('filename'), default_ext=None)
        if ext:
            return ext

    meta_name = headers.get('x-amz-meta-name')
    if meta_name:
        ext = meta_name.rpartition('.')[2]
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'), default=default)
05900629
PH
3033
3034
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Build a base64 `data:` URI from bytes and a MIME type"""
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3037
3038
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked if no limit is set, or if the content is available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
3047
3048
# List of known byte-order-marks (BOM) as (marker bytes, encoding name) pairs.
# The UTF-32-LE marker starts with the UTF-16-LE one, so it has to be listed
# before it for code that tries the entries in order
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
a904a7f8
L
3057
3058
61ca9a80
PH
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        # Strip every leading occurrence of the marker, remembering its encoding
        while first_bytes.startswith(bom):
            first_bytes = first_bytes[len(bom):]
            encoding = bom_encoding

    text = first_bytes.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
a055469f
PH
3068
3069
def determine_protocol(info_dict):
    """Return the 'protocol' of info_dict, deriving it from the URL if unset"""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3090
3091
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned

    @param delim        If truthy, a row made of this string is inserted below the header
    @param extra_gap    Additional spaces between columns
    @param hide_empty   Drop the columns in which every data cell is empty
    """
    def width(string):
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def column_widths(rows):
        return [max(width(str(cell)) for cell in column) for column in zip(*rows)]

    def keep_columns(row, keep):
        # Columns beyond the end of `keep` are always kept
        return [cell for take, cell in itertools.zip_longest(keep, row, fillvalue=True) if take]

    # A zero width is falsy, so it doubles as the "drop this column" flag
    keep = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, keep)
    data = [keep_columns(row, keep) for row in data]

    table = [header_row] + data
    max_lens = column_widths(table)
    extra_gap += 1
    if delim:
        delim_row = [delim * (ml + extra_gap) for ml in max_lens]
        delim_row[-1] = delim_row[-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
        table = [header_row, delim_row] + data

    gap = ' ' * extra_gap

    def pad(text, col_width):
        filler = ' ' * (col_width - width(text))
        if '\t' in text:
            return text.replace('\t', filler) + gap
        return text + filler + gap

    return '\n'.join(
        ''.join(pad(str(cell), max_lens[pos]) for pos, cell in enumerate(row)).rstrip()
        for row in table)
347de493
PH
3122
3123
8f18aca8 3124def _match_one(filter_part, dct, incomplete):
77b87f05 3125 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3126 STRING_OPERATORS = {
3127 '*=': operator.contains,
3128 '^=': lambda attr, value: attr.startswith(value),
3129 '$=': lambda attr, value: attr.endswith(value),
3130 '~=': lambda attr, value: re.search(value, attr),
3131 }
347de493 3132 COMPARISON_OPERATORS = {
a047eeb6 3133 **STRING_OPERATORS,
3134 '<=': operator.le, # "<=" must be defined above "<"
347de493 3135 '<': operator.lt,
347de493 3136 '>=': operator.ge,
a047eeb6 3137 '>': operator.gt,
347de493 3138 '=': operator.eq,
347de493 3139 }
a047eeb6 3140
6db9c4d5 3141 if isinstance(incomplete, bool):
3142 is_incomplete = lambda _: incomplete
3143 else:
3144 is_incomplete = lambda k: k in incomplete
3145
64fa820c 3146 operator_rex = re.compile(r'''(?x)
347de493 3147 (?P<key>[a-z_]+)
77b87f05 3148 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3149 (?:
a047eeb6 3150 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3151 (?P<strval>.+?)
347de493 3152 )
347de493 3153 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3154 m = operator_rex.fullmatch(filter_part.strip())
347de493 3155 if m:
18f96d12 3156 m = m.groupdict()
3157 unnegated_op = COMPARISON_OPERATORS[m['op']]
3158 if m['negation']:
77b87f05
MT
3159 op = lambda attr, value: not unnegated_op(attr, value)
3160 else:
3161 op = unnegated_op
18f96d12 3162 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3163 if m['quote']:
3164 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3165 actual_value = dct.get(m['key'])
3166 numeric_comparison = None
f9934b96 3167 if isinstance(actual_value, (int, float)):
e5a088dc
S
3168 # If the original field is a string and matching comparisonvalue is
3169 # a number we should respect the origin of the original field
3170 # and process comparison value as a string (see
18f96d12 3171 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3172 try:
18f96d12 3173 numeric_comparison = int(comparison_value)
347de493 3174 except ValueError:
18f96d12 3175 numeric_comparison = parse_filesize(comparison_value)
3176 if numeric_comparison is None:
3177 numeric_comparison = parse_filesize(f'{comparison_value}B')
3178 if numeric_comparison is None:
3179 numeric_comparison = parse_duration(comparison_value)
3180 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3181 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3182 if actual_value is None:
6db9c4d5 3183 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3184 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3185
3186 UNARY_OPERATORS = {
1cc47c66
S
3187 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3188 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3189 }
64fa820c 3190 operator_rex = re.compile(r'''(?x)
347de493 3191 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3192 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3193 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3194 if m:
3195 op = UNARY_OPERATORS[m.group('op')]
3196 actual_value = dct.get(m.group('key'))
6db9c4d5 3197 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3198 return True
347de493
PH
3199 return op(actual_value)
3200
3201 raise ValueError('Invalid filter part %r' % filter_part)
3202
3203
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    The conditions are separated by `&` (a literal `&` is written as `\\&`)
    and all of them have to pass.
    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        if not _match_one(filter_part.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
347de493
PH
3214
3215
def match_filter_func(filters, breaking_filters=None):
    """Build a match function from filter strings.

    Returns None if there are no filters at all. The returned function
    returns None if the entry passes any of the filters (NO_DEFAULT if the
    special filter `-` was given and the info is complete), and a message
    otherwise. It raises RejectedVideoReached if the entry is rejected by
    the breaking filters.
    """
    if not (filters or breaking_filters):
        return None
    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    interactive = '-' in filters
    filters.discard('-')

    def _match_func(info_dict, incomplete=False):
        rejection = breaking_filters(info_dict, incomplete)
        if rejection is not None:
            raise RejectedVideoReached(rejection)

        if filters and not any(match_str(f, info_dict, incomplete) for f in filters):
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
        return NO_DEFAULT if interactive and not incomplete else None
    return _match_func
91410c9b
PH
3238
3239
class download_range_func:
    """Callable yielding the sections of a video that should be downloaded.

    @param chapters   Regexes; every chapter whose title matches one is yielded
    @param ranges     (start, end) pairs; a negative time is counted from the
                      end of the video if its duration is known
    @param from_info  Whether to also yield the start_time/end_time of the info dict
    """

    def __init__(self, chapters, ranges, from_info=False):
        self.chapters, self.ranges, self.from_info = chapters, ranges, from_info

    def __call__(self, info_dict, ydl):

        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        for regex in self.chapters or []:
            for i, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': i}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            # Nothing was requested: a single empty section
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # Negative times are relative to the end and are clamped at 0
        return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time

    def __eq__(self, other):
        # from_info changes what __call__ yields, so it has to take part in equality
        return (isinstance(other, download_range_func)
                and self.chapters == other.chapters and self.ranges == other.ranges
                and self.from_info == other.from_info)

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges}, {self.from_info})'
71df9b7f 3280
5ec1b6b7 3281
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds.

    Understands a plain number with an optional `s` suffix and the clock
    format `H:MM:SS` with an optional fraction given after `.` or `:`.
    Returns None for empty or unrecognised input.
    """
    if not time_expr:
        return

    offset_mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset_mobj:
        return float(offset_mobj.group('time_offset'))

    clock_mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_mobj:
        hours, minutes, seconds = clock_mobj.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
bf6427d2
YCH
3293
3294
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as an SRT timecode (`HH:MM:SS,mmm`)"""
    hours, minutes, secs, msecs = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, msecs)
3297
3298
def ass_subtitles_timecode(seconds):
    """Format a time in seconds as an ASS timecode (`H:MM:SS.cc`)"""
    time = timetuple_from_msec(seconds * 1000)
    centiseconds = time.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*time[:-1], centiseconds)
bf6427d2
YCH
3302
3303
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT, keeping the supported styling as HTML-like tags

    @param dfxp_data    A bytes-like object containing DFXP data
    @returns            A unicode object containing converted SRT data
    @raises ValueError  If the document contains no paragraphs
    '''
    # Legacy namespace URIs, rewritten to the current ones before parsing
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    class TTMLPElementParser:
        def __init__(self):
            # All state is per instance: every paragraph is rendered by its own
            # parser and must not see what an earlier paragraph left behind
            self._out = ''
            self._unclosed_elements = []
            self._applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Already in effect through an enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                # start() pushes at most one entry per element; drop it here even
                # if the element opened no tags, so that it does not stay in
                # effect for the siblings that follow
                if self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Re-serialize the node and feed it through the tag-generating parser
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # A style may inherit from one that is defined later in the document, so
    # repeat the pass until everything is resolved. Stop as well when a pass
    # resolves nothing new: the remaining parents are missing (or define no
    # supported property) and another pass would never terminate
    style_elements = dfxp.findall(_x('.//ttml:style'))
    while True:
        unresolved = False
        resolved_before = len(styles)
        for style in style_elements:
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    unresolved = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if not unresolved or len(styles) == resolved_before:
            break

    # The styles referenced by the first body and div become the default for all elements
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Fall back to begin + duration; skip the cue if neither is known
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3470
3471
def cli_option(params, command_option, param, separator=None):
    """Build the command line arguments for the option `param` of `params`.

    Returns [] if the value is None, `[option, value]` without a separator
    and `['<option><separator><value>']` with one.
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
66e289ba
S
3477
3478
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Like cli_option, for a boolean option rendered as true_value/false_value"""
    flag = params.get(param)
    assert flag in (True, False, None)
    rendered_values = {True: true_value, False: false_value}
    return cli_option(rendered_values, command_option, flag, separator)
66e289ba
S
3483
3484
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return `[command_option]` if `param` has the expected value, else []"""
    if params.get(param) == expected_value:
        return [command_option]
    return []
66e289ba
S
3487
3488
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Look up the extra arguments configured under the first matching key.

    @param argdict     Dict of lowercase key -> list of arguments. A list/tuple
                       (old format) is returned as-is if use_compat is set
                       and treated like None otherwise
    @param keys        Keys in order of priority. An entry can hold several
                       keys, in which case all their arguments are combined
    @param default     Returned if argdict is None or no key matches
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        candidates = (argdict.get(key.lower()) for key in variadic(key_list))
        arg_list = [args for args in candidates if args is not None]
        if arg_list:
            return list(itertools.chain.from_iterable(arg_list))
    return default
66e289ba 3507
6251555f 3508
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Build the list of lookup keys for `main_key`/`exe` and resolve the arguments.

    The root key is `exe` if it equals `main_key`, else `<main_key>+<exe>`;
    every entry of `keys` is appended to it. If the bare root key is among
    the results, the pair (main_key, exe) and 'default' are added as
    fallbacks; otherwise the compatibility handling of a list-style argdict
    is disabled.
    """
    main_key, exe = main_key.lower(), exe.lower()
    same_name = main_key == exe
    root_key = exe if same_name else f'{main_key}+{exe}'
    keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in keys:
        use_compat = False
    else:
        if not same_name:
            keys.append((main_key, exe))
        keys.append('default')
    return cli_configuration_args(argdict, keys, default, use_compat)
3520
66e289ba 3521
86e5f3ed 3522class ISO639Utils:
39672624
YCH
3523 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3524 _lang_map = {
3525 'aa': 'aar',
3526 'ab': 'abk',
3527 'ae': 'ave',
3528 'af': 'afr',
3529 'ak': 'aka',
3530 'am': 'amh',
3531 'an': 'arg',
3532 'ar': 'ara',
3533 'as': 'asm',
3534 'av': 'ava',
3535 'ay': 'aym',
3536 'az': 'aze',
3537 'ba': 'bak',
3538 'be': 'bel',
3539 'bg': 'bul',
3540 'bh': 'bih',
3541 'bi': 'bis',
3542 'bm': 'bam',
3543 'bn': 'ben',
3544 'bo': 'bod',
3545 'br': 'bre',
3546 'bs': 'bos',
3547 'ca': 'cat',
3548 'ce': 'che',
3549 'ch': 'cha',
3550 'co': 'cos',
3551 'cr': 'cre',
3552 'cs': 'ces',
3553 'cu': 'chu',
3554 'cv': 'chv',
3555 'cy': 'cym',
3556 'da': 'dan',
3557 'de': 'deu',
3558 'dv': 'div',
3559 'dz': 'dzo',
3560 'ee': 'ewe',
3561 'el': 'ell',
3562 'en': 'eng',
3563 'eo': 'epo',
3564 'es': 'spa',
3565 'et': 'est',
3566 'eu': 'eus',
3567 'fa': 'fas',
3568 'ff': 'ful',
3569 'fi': 'fin',
3570 'fj': 'fij',
3571 'fo': 'fao',
3572 'fr': 'fra',
3573 'fy': 'fry',
3574 'ga': 'gle',
3575 'gd': 'gla',
3576 'gl': 'glg',
3577 'gn': 'grn',
3578 'gu': 'guj',
3579 'gv': 'glv',
3580 'ha': 'hau',
3581 'he': 'heb',
b7acc835 3582 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
3583 'hi': 'hin',
3584 'ho': 'hmo',
3585 'hr': 'hrv',
3586 'ht': 'hat',
3587 'hu': 'hun',
3588 'hy': 'hye',
3589 'hz': 'her',
3590 'ia': 'ina',
3591 'id': 'ind',
b7acc835 3592 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
3593 'ie': 'ile',
3594 'ig': 'ibo',
3595 'ii': 'iii',
3596 'ik': 'ipk',
3597 'io': 'ido',
3598 'is': 'isl',
3599 'it': 'ita',
3600 'iu': 'iku',
3601 'ja': 'jpn',
3602 'jv': 'jav',
3603 'ka': 'kat',
3604 'kg': 'kon',
3605 'ki': 'kik',
3606 'kj': 'kua',
3607 'kk': 'kaz',
3608 'kl': 'kal',
3609 'km': 'khm',
3610 'kn': 'kan',
3611 'ko': 'kor',
3612 'kr': 'kau',
3613 'ks': 'kas',
3614 'ku': 'kur',
3615 'kv': 'kom',
3616 'kw': 'cor',
3617 'ky': 'kir',
3618 'la': 'lat',
3619 'lb': 'ltz',
3620 'lg': 'lug',
3621 'li': 'lim',
3622 'ln': 'lin',
3623 'lo': 'lao',
3624 'lt': 'lit',
3625 'lu': 'lub',
3626 'lv': 'lav',
3627 'mg': 'mlg',
3628 'mh': 'mah',
3629 'mi': 'mri',
3630 'mk': 'mkd',
3631 'ml': 'mal',
3632 'mn': 'mon',
3633 'mr': 'mar',
3634 'ms': 'msa',
3635 'mt': 'mlt',
3636 'my': 'mya',
3637 'na': 'nau',
3638 'nb': 'nob',
3639 'nd': 'nde',
3640 'ne': 'nep',
3641 'ng': 'ndo',
3642 'nl': 'nld',
3643 'nn': 'nno',
3644 'no': 'nor',
3645 'nr': 'nbl',
3646 'nv': 'nav',
3647 'ny': 'nya',
3648 'oc': 'oci',
3649 'oj': 'oji',
3650 'om': 'orm',
3651 'or': 'ori',
3652 'os': 'oss',
3653 'pa': 'pan',
7bcd4813 3654 'pe': 'per',
39672624
YCH
3655 'pi': 'pli',
3656 'pl': 'pol',
3657 'ps': 'pus',
3658 'pt': 'por',
3659 'qu': 'que',
3660 'rm': 'roh',
3661 'rn': 'run',
3662 'ro': 'ron',
3663 'ru': 'rus',
3664 'rw': 'kin',
3665 'sa': 'san',
3666 'sc': 'srd',
3667 'sd': 'snd',
3668 'se': 'sme',
3669 'sg': 'sag',
3670 'si': 'sin',
3671 'sk': 'slk',
3672 'sl': 'slv',
3673 'sm': 'smo',
3674 'sn': 'sna',
3675 'so': 'som',
3676 'sq': 'sqi',
3677 'sr': 'srp',
3678 'ss': 'ssw',
3679 'st': 'sot',
3680 'su': 'sun',
3681 'sv': 'swe',
3682 'sw': 'swa',
3683 'ta': 'tam',
3684 'te': 'tel',
3685 'tg': 'tgk',
3686 'th': 'tha',
3687 'ti': 'tir',
3688 'tk': 'tuk',
3689 'tl': 'tgl',
3690 'tn': 'tsn',
3691 'to': 'ton',
3692 'tr': 'tur',
3693 'ts': 'tso',
3694 'tt': 'tat',
3695 'tw': 'twi',
3696 'ty': 'tah',
3697 'ug': 'uig',
3698 'uk': 'ukr',
3699 'ur': 'urd',
3700 'uz': 'uzb',
3701 've': 'ven',
3702 'vi': 'vie',
3703 'vo': 'vol',
3704 'wa': 'wln',
3705 'wo': 'wol',
3706 'xh': 'xho',
3707 'yi': 'yid',
e9a50fba 3708 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
3709 'yo': 'yor',
3710 'za': 'zha',
3711 'zh': 'zho',
3712 'zu': 'zul',
3713 }
3714
@classmethod
def short2long(cls, code):
    """Map an ISO 639-1 code to its ISO 639-2/T equivalent.

    Only the first two characters of `code` are looked up, so a longer tag
    such as 'en-US' resolves through its 'en' prefix. Returns None when that
    prefix is not a key of `_lang_map`.
    """
    two_letter = code[:2]
    return cls._lang_map.get(two_letter)
3719
@classmethod
def long2short(cls, code):
    """Map an ISO 639-2/T code back to an ISO 639-1 code.

    `_lang_map` is scanned in definition order and the first key whose value
    equals `code` wins. Returns None when no entry matches.
    """
    candidates = (short for short, long in cls._lang_map.items() if long == code)
    return next(candidates, None)
3726
3727
class ISO3166Utils:
    """Lookup helper that maps two-letter country codes to country names."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Return the full country name for a two-letter (ISO 3166-1 alpha-2) code.

        The lookup is case-insensitive because `code` is upper-cased first.
        Returns None when the code is not present in `_country_map`.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
3989
3990
class GeoUtils:
    """Helper for picking random IPv4 addresses from per-country CIDR blocks."""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random dotted-quad IPv4 address from a CIDR block.

        A two-character argument is treated as a country code (looked up
        case-insensitively in `_country_ip_map`); anything else is taken to
        be an 'address/prefixlen' block itself. Returns None when the country
        code has no entry.
        """
        block = code_or_block
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None

        network, prefix_length = block.split('/')
        lowest = struct.unpack('!L', socket.inet_aton(network))[0]
        # Setting every host bit yields the last address of the block
        highest = lowest | (0xffffffff >> int(prefix_length))
        chosen = random.randint(lowest, highest)
        return str(socket.inet_ntoa(struct.pack('!L', chosen)))
773f291d
S
4249
4250
0a5445dd
YCH
4251# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4252# released into Public Domain
4253# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4254
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    The result is the minimal big-endian encoding of n. Zero and negative
    values are encoded as a single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # int.to_bytes produces the minimal big-endian form in one step,
        # replacing the manual 32-bit chunking and leading-zero stripping
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    # add front padding up to the next multiple of blocksize
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4283
4284
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    The bytes are interpreted as an unsigned big-endian number; an empty
    byte string yields 0.

    This is (essentially) the inverse of long_to_bytes().
    """
    # int.from_bytes replaces the manual 4-byte-at-a-time accumulation
    return int.from_bytes(s, 'big')
4300
4301
5bc880b9
YCH
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The payload is the input read as a little-endian integer (equivalent to
    # reversing the bytes and parsing them as big-endian hex). Unlike the
    # hexlify round-trip, this also works for empty input, which maps to 0.
    payload = int.from_bytes(data, 'little')
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
81bdc8fd
YCH
4317
4318
f48409c7
YCH
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout [0, 2] + padding + [0] + data, where the
    padding consists of random nonzero octets. A zero octet inside the
    padding would be taken for the separator when the data is unpadded.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data leaves less than 11 octets for padding
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Padding octets must be nonzero, hence the 1..255 range
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4332
4333
7b2c3f47 4334def _base_n_table(n, table):
4335 if not table and not n:
4336 raise ValueError('Either table or n must be specified')
612f2be5 4337 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4338
44f14eb4 4339 if n and n != len(table):
612f2be5 4340 raise ValueError(f'base {n} exceeds table length {len(table)}')
4341 return table
59f898b7 4342
5eb6bdce 4343
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string

    @param num      non-negative integer to encode
    @param n        base; the digit table is cut down to its first n symbols
    @param table    string of digit symbols, resolved by _base_n_table
    @returns        string of digits, most significant first
    @raises ValueError  for a negative num, or a non-zero num with a table
                        of fewer than two symbols
    """
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    # Floor division never brings a negative number to zero (it settles at
    # -1), and dividing by a base below 2 never shrinks num, so either case
    # would make the loop below spin forever.
    if num < 0:
        raise ValueError('num must be non-negative')
    if base < 2:
        raise ValueError('base must be at least 2')

    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4355
4356
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int

    The digit table is resolved by _base_n_table(n, table). A character
    that is not in the table raises KeyError.
    """
    digit_values = {}
    for value, digit in enumerate(_base_n_table(n, table)):
        digit_values[digit] = value

    radix = len(digit_values)
    number = 0
    for digit in string:
        number = number * radix + digit_values[digit]
    return number
4364
4365
def decode_packed_codes(code):
    """Expand packed code located with PACKED_CODES_RE.

    The match supplies the obfuscated code, a base, a symbol count and a
    '|'-separated symbol list. Every word of the obfuscated code is replaced
    by the symbol whose index it encodes in the given base; an empty symbol
    means the word stands for itself.
    """
    match = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = match.groups()
    radix = int(base)
    remaining = int(count)
    words = symbols.split('|')

    replacements = {}
    while remaining:
        remaining -= 1
        token = encode_base_n(remaining, radix)
        replacements[token] = words[remaining] or token

    def restore(found):
        return replacements[found.group(0)]

    return re.sub(r'\b(\w+)\b', restore, obfuscated_code)
e154c651 4382
4383
1ced2221
S
def caesar(s, alphabet, shift):
    """Apply a Caesar shift of `shift` positions within `alphabet`.

    Characters that are not in `alphabet` are passed through unchanged, and
    the shift wraps around the end of the alphabet. A shift of 0 returns `s`
    as is.
    """
    if shift == 0:
        return s

    size = len(alphabet)
    shifted = []
    for char in s:
        if char in alphabet:
            char = alphabet[(alphabet.index(char) + shift) % size]
        shifted.append(char)
    return ''.join(shifted)
4391
4392
def rot47(s):
    """Shift `s` by 47 positions over the 94-character alphabet from '!' to '~'."""
    alphabet = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, alphabet, 47)
4395
4396
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated KEY=value attribute list into a dict.

    Values may be bare or double-quoted; a quoted value may contain commas
    and is stored without its surrounding quotes. When a key repeats, the
    last occurrence wins.
    """
    pattern = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    attributes = {}
    for match in re.finditer(pattern, attrib):
        name, value = match.group('key', 'val')
        attributes[name] = value[1:-1] if value.startswith('"') else value
    return attributes
1143535d
YCH
4404
4405
def urshift(val, n):
    """Shift `val` right by `n` bits, treating a negative `val` as unsigned 32-bit."""
    if val < 0:
        # Map the negative value onto its 32-bit two's complement equivalent
        val += 0x100000000
    return val >> n
d3f8e038
YCH
4408
4409
def write_xattr(path, key, value):
    """Store `value` (bytes) as extended attribute `key` of the file at `path`.

    Three strategies are tried in order: an NTFS Alternate Data Stream on
    Windows, the python xattr/pyxattr module, and finally the external
    "setfattr" or "xattr" executables.

    Raises XAttrMetadataError when writing fails, and XAttrUnavailableError
    when neither a usable module nor an executable is found.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as stream:
                stream.write(value)
        except OSError as err:
            raise XAttrMetadataError(err.errno, err.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        set_attribute = xattr.set if version_tuple(xattr.__version__) >= (0, 5, 0) else None
    elif xattr:
        set_attribute = xattr.setxattr
    else:
        set_attribute = None

    if set_attribute:
        try:
            set_attribute(path, key, value)
        except OSError as err:
            raise XAttrMetadataError(err.errno, err.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    if check_executable('setfattr', ['--version']):
        exe = 'setfattr'
    elif check_executable('xattr', ['-h']):
        exe = 'xattr'
    else:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The executables take the value as a command line argument, so it has to be text
    value = value.decode()
    if exe == 'xattr':
        command = [exe, '-w', key, value, path]
    else:
        command = [exe, '-n', key, '-v', value, path]

    try:
        _, stderr, returncode = Popen.run(
            command, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as err:
        raise XAttrMetadataError(err.errno, err.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
4459
4460
def random_birthday(year_field, month_field, day_field):
    """Pick a random date between 1950-01-01 and 1995-12-31 (both inclusive).

    Returns a dict that maps the three given field names to the year, month
    and day of that date, each as a decimal string without zero padding.
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    picked = earliest + datetime.timedelta(random.randint(0, span_days))
    return {
        year_field: str(picked.year),
        month_field: str(picked.month),
        day_field: str(picked.day),
    }
732044af 4471
c76eb41b 4472
8c53322c
L
def find_available_port(interface=''):
    """Return a port number that could be bound on `interface`, or None.

    A temporary socket is bound to port 0 so that the operating system picks
    the port; the socket is closed again before returning. Any OSError along
    the way results in None.
    """
    try:
        with socket.socket() as probe:
            probe.bind((interface, 0))
            bound_address = probe.getsockname()
            return bound_address[1]
    except OSError:
        return None
4480
4481
# Templates for internet shortcut files, which are plain text files.
# Each template is filled in with the `%` operator and a mapping.

# Windows ".url" shortcut; expects the key `url`
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# macOS ".webloc" property list; expects the key `url`
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# Freedesktop ".desktop" entry; expects the keys `filename` and `url`
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Lookup from link type to its template
LINK_TEMPLATES = dict(
    url=DOT_URL_LINK_TEMPLATE,
    desktop=DOT_DESKTOP_LINK_TEMPLATE,
    webloc=DOT_WEBLOC_LINK_TEMPLATE,
)
4513
732044af 4514
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    The hostname is converted to Punycode. An explicit port 80 is dropped only for the `http` and `ws` schemes, where it is the default; for any other scheme it is kept, since removing it would point the URI at a different port.

    Raises ValueError for IRIs whose network location contains an IPv6 literal.
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # `hostname` is None when the IRI has no network location (e.g. `file:///path`)
    net_location += (iri_parts.hostname or '').encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.

    port = iri_parts.port
    port_is_default = port == 80 and iri_parts.scheme in ('http', 'ws')
    if port is not None and not port_is_default:
        net_location += ':' + str(port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4557
4558
def to_high_limit_path(path):
    """Return `path` in a form that is not subject to the Windows MAX_PATH limit.

    On win32 and cygwin the absolute path is returned with a `\\\\?\\` prefix;
    on every other platform `path` is returned unchanged.
    """
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
76d321f6 4565
c76eb41b 4566
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Format a value looked up in `obj`, or return `default` if it is to be ignored.

    The value is fetched with traverse_obj along the path(s) given by
    `field`. If `ignore` is left at NO_DEFAULT, any falsy value is ignored;
    otherwise only the value(s) listed in `ignore` are. A value that is not
    ignored is passed through `func` and interpolated into `template`.
    """
    value = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skip = not value
    else:
        skip = value in variadic(ignore)
    return default if skip else template % func(value)
00dd0cd5 4572
4573
def clean_podcast_url(url):
    """Remove known podcast tracking/analytics prefixes from `url`.

    Every occurrence of a listed tracker host (with its fixed path prefix)
    is cut out, which leaves the URL of the wrapped media. If that exposes a
    second scheme directly after the first one, the outer scheme is dropped.
    """
    tracker_prefixes = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/'''
    stripped = re.sub(tracker_prefixes, '', url)
    return re.sub(r'^\w+://(\w+://)', r'\1', stripped)
ffcb8191
THD
4594
4595
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random version 4 UUID as a lowercase, hyphenated string.

    In the template, every `x` becomes a random hex digit. `y` carries the
    variant bits, which must be binary 10xx for an RFC 4122 UUID, so it is
    restricted to one of 8, 9, a or b.
    """
    def random_digit(mobj):
        if mobj.group(0) == 'y':
            return _HEX_TABLE[random.randint(8, 11)]
        return _HEX_TABLE[random.randint(0, 15)]

    return re.sub(r'[xy]', random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 4601
4602
def make_dir(path, to_screen=None):
    """Create the parent directory of `path`, including missing ancestors.

    @param path         path of a file whose directory should exist
    @param to_screen    optional callable that receives an error message
                        if the directory cannot be created
    @returns            True on success (or if `path` has no directory
                        part), False if creating the directory failed
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Only report when a reporter was actually passed; the default is None
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
f74980cb 4613
4614
def get_executable_path():
    """Return the absolute directory that contains the running executable.

    The executable location is the second item returned by
    `_get_variant_and_executable_path` of the update module.
    """
    from ..update import _get_variant_and_executable_path

    executable = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(executable))
f74980cb 4619
4620
def get_user_config_dirs(package_name):
    """Yield the candidate per-user configuration directories of `package_name`.

    The order is: the XDG config directory, the %APPDATA% directory (only if
    that environment variable is set) and a dot-directory in the user's home.
    """
    # .config (e.g. ~/.config/package_name)
    config_root = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
    yield os.path.join(config_root, package_name)

    # appdata (%APPDATA%/package_name)
    roaming_root = os.getenv('appdata')
    if roaming_root:
        yield os.path.join(roaming_root, package_name)

    # home (~/.package_name)
    home_root = compat_expanduser('~')
    yield os.path.join(home_root, f'.{package_name}')
8e40b9d1
M
4633
4634
def get_system_config_dirs(package_name):
    """Yield the system-wide configuration directories of `package_name`."""
    # /etc/package_name
    yield from [os.path.join('/etc', package_name)]
06167fbb 4638
4639
def time_seconds(**kwargs):
    """
    Returns the current time in seconds since the epoch (1970-01-01T00:00:00Z),
    shifted by the offset that datetime.timedelta(**kwargs) describes
    """
    now = time.time()
    offset = datetime.timedelta(**kwargs)
    return now + offset.total_seconds()
3e9b66d7
LNO
4645
4646
49fa4d9a
N
4647# create a JSON Web Signature (jws) with HS256 algorithm
4648# the resulting format is in JWS Compact Serialization
4649# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
4650# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Serialize `payload_data` as an HS256-signed token in compact form.

    @param payload_data JSON-serializable payload
    @param key          signing secret, str
    @param headers      optional mapping of extra header fields; these
                        override the default `alg` and `typ` entries
    @returns            bytes of the form b'<header>.<payload>.<signature>'

    NOTE(review): the three segments are encoded with standard, padded
    base64 rather than the unpadded base64url alphabet that RFC 7515
    prescribes. The encoding is deliberately left unchanged here because
    changing it alters the produced token; confirm with callers before
    switching.
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    # The signature covers the encoded header and payload joined by a dot
    signing_input = header_b64 + b'.' + payload_b64
    h = hmac.new(key.encode(), signing_input, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    return signing_input + b'.' + signature_b64
819e0531 4664
4665
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the JSON-decoded payload of a three-part, dot-separated token.

    Neither the header nor the signature is inspected.
    """
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # add trailing ='s that may have been stripped, superfluous ='s are ignored
    padded_payload = f'{payload_b64}==='
    return json.loads(base64.urlsafe_b64decode(padded_payload))
4672
4673
# False on Windows (os name 'nt'), None everywhere else
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


@functools.cache
def supports_terminal_sequences(stream):
    """Return whether *stream* should be sent terminal escape sequences.

    On 'nt' this requires WINDOWS_VT_MODE to be truthy; elsewhere it requires
    a non-empty TERM environment variable. In both cases the stream must also
    report itself as a tty. The result is cached per stream.
    """
    if compat_os_name == 'nt':
        unsupported = not WINDOWS_VT_MODE
    else:
        unsupported = not os.getenv('TERM')
    if unsupported:
        return False

    try:
        return stream.isatty()
    except BaseException:
        return False
4688
4689
def windows_enable_vt_mode():
    """Set the virtual-terminal-processing flag on the Windows console.

    Ref: https://bugs.python.org/issue30075

    Does nothing when get_windows_version() is below (10, 0, 10586).
    On success WINDOWS_VT_MODE becomes True and the cache of
    supports_terminal_sequences is cleared.
    Raises Exception when GetConsoleMode or SetConsoleMode reports failure.
    """
    global WINDOWS_VT_MODE

    if get_windows_version() < (10, 0, 10586):
        return

    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    kernel32 = ctypes.WinDLL('kernel32', use_last_error=False)
    console_fd = os.open('CONOUT$', os.O_RDWR)
    try:
        console_handle = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(console_fd))
        current_mode = ctypes.wintypes.DWORD()
        if not kernel32.GetConsoleMode(console_handle, ctypes.byref(current_mode)):
            raise Exception('GetConsoleMode failed')

        # Keep every existing mode bit and add the VT-processing one
        wanted_mode = ctypes.wintypes.DWORD(
            current_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING)
        if not kernel32.SetConsoleMode(console_handle, wanted_mode):
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(console_fd)

    WINDOWS_VT_MODE = True
    supports_terminal_sequences.cache_clear()
4720
53973b4d 4721
# ESC, "[", one or more characters other than "m", then "m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return *string* with every match of _terminal_sequences_re removed."""
    cleaned = _terminal_sequences_re.sub('', string)
    return cleaned
4727
4728
def number_of_digits(number):
    """Return the length of *number* formatted with ``%d`` (a minus sign is counted)."""
    rendered = '%d' % number
    return len(rendered)
34921b43 4731
4732
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the truthy *values*, each converted with str(), using *delim*.

    @param from_dict    When not None, every value is treated as a lookup into
                        this object (via traversal.traverse_obj) and the
                        looked-up results are joined instead
    """
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(path)) for path in values)
    return delim.join(str(item) for item in values if item)
06e57990 4737
4738
27231526
ZM
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand

    When no format has a non-zero width, *thumbnails* is returned unchanged.
    """
    dimension_keys = ('width', 'height')
    # (width, height) tuples compare by width first; missing values count as 0
    candidates = (
        tuple(fmt.get(key) or 0 for key in dimension_keys)
        for fmt in formats)
    largest = max(candidates, default=(0, 0))
    if not largest[0]:
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        new_url = re.sub(url_width_re, str(largest[0]), thumbnail['url'])
        scaled.append(merge_dicts(
            {'url': new_url},
            dict(zip(dimension_keys, largest)), thumbnail))
    return scaled
4759
4760
93c8410d
LNO
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple.

    Returns (start, end, total); all three are None when the value is empty
    or does not match.
    """
    no_match = (None, None, None)
    if not range:
        return no_match
    crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if not crg:
        return no_match
    start, end, total = crg.groups()
    return int(start), int_or_none(end), int_or_none(total)
4769
4770
def read_stdin(what):
    """Emit a prompt saying *what* is being read from STDIN and return sys.stdin.

    The prompt names Ctrl+Z as the EOF key when the os name is 'nt' and
    Ctrl+D otherwise; it is emitted through write_string.
    """
    if compat_os_name == 'nt':
        eof = 'Ctrl+Z'
    else:
        eof = 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
4775
4776
a904a7f8
L
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @param data     Leading bytes of the file
    @returns (encoding, bytes to skip)
             encoding is None when neither a BOM nor a declaration is found
    """

    # BOM marks are given priority over declarations
    for bom, enc in BOMS:
        if data.startswith(bom):
            return enc, len(bom)

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    stripped = data.replace(b'\0', b'')
    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    if mobj:
        return mobj.group(1).decode(), 0
    return None, 0
a904a7f8
L
4793
4794
class Config:
    """A set of command-line arguments plus the nested configs they reference.

    Each instance holds its own arguments (`own_args`) and a list of child
    `Config` objects created for every `config_locations` entry found when
    parsing those arguments with `parser`.
    """
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        """
        @param parser   Object providing parse_known_args(), parse_args() and error()
        @param label    Optional text used in the string representation
        """
        self.parser, self.label = parser, label
        # _loaded_paths is shared with child configs (see append_config)
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Store *args*/*filename* and load referenced configs. May only be called once.

        @returns    The result of load_configs()
        """
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse own_args and recursively load every config location they name.

        @returns    False if this config's file was already loaded, else True
        """
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # "-" means STDIN; it is read at most once
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Joined onto the directory of the current config file ('' if none)
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read *filename* and split its contents into a list of arguments.

        @param default  Returned as-is (not copied) when the file cannot be opened
        @returns        List of arguments, or *default*
        @raises ValueError  If the contents cannot be decoded or split
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        # The context manager closes the file on every exit path
        with optionf:
            try:
                enc, skip = determine_file_encoding(optionf.read(512))
                optionf.seek(skip, io.SEEK_SET)
            except OSError:
                enc = None  # silently skip read errors
            try:
                # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
                contents = optionf.read().decode(enc or preferredencoding())
                res = shlex.split(contents, comments=True)
            except Exception as err:
                raise ValueError(f'Unable to parse "{filename}": {err}') from err
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of *opts* with the values of private options replaced by 'PRIVATE'.

        Handles both the ``--opt=value`` form and the ``--opt value`` form.
        """
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child config from *args* (forwarded to init()) and keep it if it loaded."""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """Yield the arguments of all child configs (last child first), then this config's own."""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
da42679b
LNO
4902
4903
class WebSocketsWrapper:
    """Wraps websockets module to use in non-async scopes"""
    pool = None

    def __init__(self, url, headers=None, connect=True):
        """
        @param url      Passed to websockets.connect
        @param headers  Passed to websockets.connect as extra_headers
        @param connect  Whether to enter the connection immediately
        """
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        if self.loop.is_closed():
            # Already shut down, e.g. a `with` block exited before the atexit hook ran
            return None
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Leftover tasks must be cancelled while the loop can still run;
            # only afterwards may the loop be closed
            try:
                self._cancel_all_tasks(self.loop)
            finally:
                self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run the coroutine *main* on *loop* until it completes and return its result.

        @raises ValueError  If *main* is not a coroutine
        """
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel every unfinished task of *loop* and report their unhandled exceptions."""
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # gather() takes the loop from the tasks themselves; its "loop"
        # parameter was removed in python 3.10
        loop.run_until_complete(
            asyncio.gather(*to_cancel, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
4973
4974
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones

    Every key is normalized with str.title() in the returned dict.
    """
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            merged[name.title()] = value
    return merged
28787f16 4978
4979
def cached_method(f):
    """Cache a method

    Results are stored per instance, per method name and per bound argument
    values (defaults applied), in the instance attribute `_cached_method__cache`.
    """
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        call = signature.bind(self, *args, **kwargs)
        call.apply_defaults()
        # Drop the first bound value (self); the rest identifies the call
        key = tuple(call.arguments.values())[1:]

        per_instance = vars(self).setdefault('_cached_method__cache', {})
        cache = per_instance.setdefault(f.__name__, {})
        if key not in cache:
            cache[key] = f(self, *args, **kwargs)
        return cache[key]

    return wrapper
4995
4996
class classproperty:
    """property access for class methods with optional caching

    Usable bare (``@classproperty``) or with options (``@classproperty(cache=True)``).
    """

    def __new__(cls, func=None, *args, **kwargs):
        if func:
            return super().__new__(cls)
        # No function yet: return a factory that still remembers the options
        return functools.partial(cls, *args, **kwargs)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        # Maps class -> computed value; None disables caching
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        cache = self._cache
        if cache is None:
            return self.func(cls)
        if cls not in cache:
            cache[cls] = self.func(cls)
        return cache[cls]
19a03940 5015
5016
class function_with_repr:
    """Callable wrapper around *func* with a custom repr().

    repr() gives *repr_* when it is truthy, otherwise "<module>.<qualname>"
    of the wrapped function.
    """

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func = func
        self.__repr = repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        return self.__repr or f'{self.func.__module__}.{self.func.__qualname__}'
5029
5030
class Namespace(types.SimpleNamespace):
    """SimpleNamespace whose iteration yields the attribute values"""

    def __iter__(self):
        return iter(vars(self).values())

    @property
    def items_(self):
        """The (name, value) pairs of all attributes"""
        return vars(self).items()
9b8ee23b 5040
5041
# File extensions (without the leading dot) grouped by kind of media
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# After these two lines `video`/`audio` also contain the `common_*` entries
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions, in that order
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5056
5057
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    attempt = 0
    _error = None

    def __init__(self, _retries, _error_callback, **kwargs):
        """
        @param _retries         Number of retries allowed; a falsy value means 0
        @param _error_callback  Called as (error, attempt, retries, **kwargs)
                                after an attempt that set an error
        """
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # NO_DEFAULT marks "the last attempt finished without an error"
        if self._error is NO_DEFAULT:
            return False
        return self.attempt <= self.retries

    @property
    def error(self):
        return None if self._error is NO_DEFAULT else self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            failure = self.error
            if failure:
                self.error_callback(failure, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries"""
        if count > retries:
            # Out of retries: hand over to `error` if given, else re-raise
            if not error:
                raise e
            if count > 1:
                return error(f'{e}. Giving up after {count - 1} retries')
            return error(str(e))

        if not count:
            return warn(e)
        if isinstance(e, ExtractorError):
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        if callable(sleep_func):
            delay = float_or_none(sleep_func(n=count - 1))
        else:
            delay = sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5112
5113
def make_archive_id(ie, video_id):
    """Return "<lowercased extractor key> <video_id>".

    @param ie   Either the key as str, or an object whose ie_key() returns it
    """
    if isinstance(ie, str):
        ie_key = ie
    else:
        ie_key = ie.ie_key()
    return f'{ie_key.lower()} {video_id}'
5117
5118
def truncate_string(s, left, right=0):
    """Shorten *s* to at most ``left + right`` characters using "...".

    The result keeps the first ``left - 3`` characters, then "...", then the
    last *right* characters. None and strings that already fit are returned
    unchanged.
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    head = s[:left - 3]
    tail = s[-right:] if right else ''
    return f'{head}...{tail}'
a1c5bd82 5124
5125
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Build the list of requested items from *options*.

    @param options      Iterable of item names or alias names; a leading "-"
                        removes the item(s) instead of adding them
    @param alias_dict   Maps alias -> list of options; must contain "all",
                        which lists every valid item
    @param use_regex    Match non-alias options against the "all" items as a
                        case-insensitive full-match regex
    @param start        Initial requested items
    @raises ValueError  For an option that is neither an alias nor a valid
                        item (only when use_regex is false)
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for option in options:
        discard = option.startswith('-')
        name = option[1:] if discard else option

        if name in alias_dict:
            expansion = alias_dict[name]
            if discard:
                # Discarding an alias flips the sign of each of its members
                expansion = [
                    i[1:] if i.startswith('-') else f'-{i}' for i in expansion]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(expansion, alias_dict, start=requested)
            continue

        if use_regex:
            current = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            current = [name]
        else:
            raise ValueError(name)

        if not discard:
            requested.extend(current)
            continue
        for item in current:
            while item in requested:
                requested.remove(item)

    return orderedSet(requested)
5154
5155
eedda525 5156# TODO: Rewrite
class FormatSorter:
    """Computes, for a format dict, a tuple of per-field preferences.

    The ordered list of fields (`_order`) is assembled in evaluate_params()
    from forced/priority fields, the user's `format_sort` param, the
    extractor's preference and `default`. calculate_preference() returns one
    entry per field in that order.

    NOTE(review): `settings` is defined on the class and this class never
    creates a per-instance copy, so the writes done by _get_field_setting(),
    _resolve_field_value() and evaluate_params() appear to be shared by all
    instances — confirm whether that is intended.
    """
    # Syntax of one sort item: [+]field[(~|:)limit], surrounded by optional spaces
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    # Per-field configuration. Keys read by this class include: type, field,
    # convert, order, order_free, regex, default, max, in_list, not_in_list,
    # function, forced, priority, visible, same_limit, deprecated
    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ydl, field_preference):
        """
        @param ydl               Object providing `params`, `write_debug` and
                                 `deprecated_feature`
        @param field_preference  Sort items given by the extractor
        """
        self.ydl = ydl
        self._order = []
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """Return setting *key* of *field*, computing and storing a default if it is missing.

        Unknown fields get an empty settings entry (with a deprecation
        message), except when only 'forced'/'priority' is queried.
        """
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            type = propObj.get('type')
            if key == 'field':
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            # The computed default is written back into the settings entry
            propObj[key] = default
        return propObj[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """Convert a raw value (e.g. a limit text) according to the field's 'convert' setting.

        @param convertNone  Whether a None value is still run through the
                            conversion instead of being returned as None
        """
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            # Position used for values that are not listed: that of the '' entry
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        # Earlier entries in the order list give larger results
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            if value.isnumeric():
                return float(value)
            else:
                # A non-numeric value switches the field to string conversion from now on
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """Build `_order` and store reverse/closest/limit data for every sort item.

        @param params           Dict read for 'prefer_free_formats',
                                'format_sort' and 'format_sort_force'
        @param sort_extractor   Sort items given by the extractor
        @raises ExtractorError  For an item that does not match `regex`
        """
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            # Only the first occurrence of a field takes effect
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        # Earlier items win: forced fields, then priority fields (unless
        # format_sort_force is set), then user, extractor and default items
        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
                else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            # A 'combined' field expands to its member fields; its limit text
            # may carry one ":"-separated limit per member
            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Report the user/extractor sort items and the resulting visible sort order via *write_debug*."""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Turn the raw *value* of *field* into a comparable tuple.

        The tuple's first item groups the results: -10 for None, 1 for
        non-numeric values, 0 for numeric values and -1 for the remaining
        numeric case of the final branch.
        """
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        """Read the value(s) of *field* from *format* and return its preference tuple."""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            # The field's 'function' reduces the member values to a single one
            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """Return the tuple of per-field preferences of *format*, in `_order`.

        Side effect: fills in missing 'protocol', 'ext', 'video_ext',
        'audio_ext', 'preference', 'vbr', 'abr' and 'tbr' entries of *format*
        where the code below can derive them.
        """
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
1b392f90 5451
5452
5453# XXX: Temporary
5454class _YDLLogger:
5455 def __init__(self, ydl=None):
5456 self._ydl = ydl
5457
5458 def debug(self, message):
5459 if self._ydl:
5460 self._ydl.write_debug(message)
5461
5462 def info(self, message):
5463 if self._ydl:
5464 self._ydl.to_screen(message)
5465
5466 def warning(self, message, *, once=False):
5467 if self._ydl:
3d2623a8 5468 self._ydl.report_warning(message, once)
1b392f90 5469
5470 def error(self, message, *, is_error=True):
5471 if self._ydl:
5472 self._ydl.report_error(message, is_error=is_error)
5473
5474 def stdout(self, message):
5475 if self._ydl:
5476 self._ydl.to_stdout(message)
5477
5478 def stderr(self, message):
5479 if self._ydl:
5480 self._ydl.to_stderr(message)