import base64
import binascii
import calendar
import codecs
import collections
import collections.abc
import contextlib
import datetime as dt
import email.header
import email.utils
import errno
import hashlib
import hmac
import html.entities
import html.parser
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import netrc
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import unicodedata
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree

from . import traversal

from ..compat import functools  # isort: split
from ..compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from ..dependencies import xattr

__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


class NO_DEFAULT:
    pass


def IDENTITY(x):
    return x


ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)


# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    if not value:
        return

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )

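# Illustrative example of the element helpers above (the HTML snippet is an assumed input,
# not taken from the module or its tests):
#   >>> get_element_by_class('title', '<span class="title">Some video</span>')
#   'Some video'
#   >>> get_element_html_by_class('title', '<span class="title">Some video</span>')
#   '<span class="title">Some video</span>'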

class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')


class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        raise compat_HTMLParseError('done')


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

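# Illustrative example (assumed input): clean_html collapses whitespace, turns <br> and
# paragraph breaks into newlines, strips the remaining tags and unescapes entities:
#   >>> clean_html('<p>First line<br/>second &amp; last</p>')
#   'First line\nsecond & last'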
class LenientJSONDecoder(json.JSONDecoder):
    # TODO: Write tests
    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    s = self._close_object(e)
                    if s is not None:
                        continue
                raise type(e)(f'{e.msg} in {s[e.pos - 10:e.pos + 10]!r}', s, e.pos)
        assert False, 'Too many attempts to decode JSON'

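# Illustrative example (assumed input): with ignore_extra=True, trailing garbage after the
# first JSON value is ignored:
#   >>> json.loads('{"status": "ok"};window.foo = 1', cls=LenientJSONDecoder, ignore_extra=True)
#   {'status': 'ok'}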
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '' if unicodedata.category(char)[0] in 'CM' else '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

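# Illustrative examples (assumed inputs; the exact substitutions follow the rules above):
#   >>> sanitize_filename('abc|def', restricted=True)
#   'abc_def'
#   >>> sanitize_filename('Ä éxample', restricted=True)
#   'A_example'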

def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    # XXX: this handles drive relative paths (c:sth) incorrectly
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    # TODO: Fix behavioral differences <3.12
    # The workaround using `normpath` only superficially passes tests
    # Ref: https://github.com/python/cpython/pull/100351
    return os.path.normpath(os.path.join(*sanitized_path))


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url


def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'

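# Illustrative example (assumed URL): credentials are moved out of the netloc and returned
# as a Basic auth header value:
#   >>> extract_basic_auth('http://user:pass@example.com/x')
#   ('http://example.com/x', 'Basic dXNlcjpwYXNz')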

def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())

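# Illustrative example (assumed input): order is preserved while duplicates are dropped:
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]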

def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)

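# Illustrative examples (assumed inputs): both named and numeric entities are decoded:
#   >>> unescapeHTML('&quot;yt-dlp&quot;')
#   '"yt-dlp"'
#   >>> unescapeHTML('&#39;')
#   "'"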

def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


class netrc_from_content(netrc.netrc):
    def __init__(self, content):
        self.hosts, self.macros = {}, {}
        with io.StringIO(content) as stream:
            self._parse('-', stream, False)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')

        if shell and compat_os_name == 'nt' and kwargs.get('executable') is None:
            if not isinstance(args, str):
                args = ' '.join(compat_shlex_quote(a) for a in args)
            shell = False
            args = f'{self.__comspec()} /Q /S /D /V:OFF /C "{args}"'

        super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo)

    def __comspec(self):
        comspec = os.environ.get('ComSpec') or os.path.join(
            os.environ.get('SystemRoot', ''), 'System32', 'cmd.exe')
        if os.path.isabs(comspec):
            return comspec
        raise FileNotFoundError('shell not found: neither %ComSpec% nor %SystemRoot% is set')

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode

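# Illustrative usage (assumed command; requires ffprobe on PATH): Popen.run returns
# (stdout, stderr, returncode) and kills the child if communication is interrupted:
#   stdout, stderr, retcode = Popen.run(
#       ['ffprobe', '-version'], text=True,
#       stdout=subprocess.PIPE, stderr=subprocess.PIPE)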
def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret

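# Illustrative examples (assumed inputs):
#   >>> timetuple_from_msec(10921000)
#   Time(hours=3, minutes=2, seconds=1, milliseconds=0)
#   >>> formatSeconds(3661)
#   '1:01:01'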

def bug_reports_message(before=';'):
    from ..update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        from ..networking.exceptions import network_exceptions
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def is_path_like(f):
    return isinstance(f, (str, bytes, os.PathLike))


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                          # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                        # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|               # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                         # optional space
                (?P<sign>\+|-)                               # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})   # hh[:]mm
            $)
        ''', date_str)
    if not m:
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            date_str = date_str[:-len(m.group('tz'))]
        timezone = dt.timedelta(hours=timezone or 0)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = dt.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = dt.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
        dt_ = dt.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt_.timetuple())

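# Illustrative example (assumed input): the returned timestamp is in UTC:
#   >>> parse_iso8601('1970-01-02T00:00:00+00:00')
#   86400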

def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = dt.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = dt.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return str(upload_date)


def unified_timestamp(date_str, day_first=True):
    if not isinstance(date_str, str):
        return None

    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt_ = dt.datetime.strptime(date_str, expression) - timezone + dt.timedelta(hours=pm_delta)
            return calendar.timegm(dt_.timetuple())

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()

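# Illustrative examples (assumed inputs): day_first only matters for ambiguous dates:
#   >>> unified_strdate('11/12/2023', day_first=True)
#   '20231211'
#   >>> unified_strdate('11/12/2023', day_first=False)
#   '20231112'
#   >>> unified_timestamp('1970-01-01 00:01:00')
#   60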
def determine_ext(url, default_ext='unknown_video'):
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext


def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)


def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(dt.datetime.now(dt.timezone.utc), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - dt.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = dt.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(dt.datetime.strptime(date_str, format), precision)


def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
        raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()

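# Illustrative usage (assumed inputs; "today" refers to the current UTC date):
#   date_from_str('today-1week')   # the date 7 days ago
#   date_from_str('20240315')      # datetime.date(2024, 3, 15)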

def datetime_add_months(dt_, months):
    """Increment/Decrement a datetime object by months."""
    month = dt_.month + months - 1
    year = dt_.year + month // 12
    month = month % 12 + 1
    day = min(dt_.day, calendar.monthrange(year, month)[1])
    return dt_.replace(year, month, day)


def datetime_round(dt_, precision='day'):
    """
    Round a datetime object's time to a specific precision
    """
    if precision == 'microsecond':
        return dt_

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    roundto = lambda x, n: ((x + n / 2) // n) * n
    timestamp = roundto(calendar.timegm(dt_.timetuple()), unit_seconds[precision])
    return dt.datetime.fromtimestamp(timestamp, dt.timezone.utc)


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str


class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start, strict=True)
        else:
            self.start = dt.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end, strict=True)
        else:
            self.end = dt.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, dt.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __str__(self):
        return f'{self.start} to {self.end}'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)

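# Illustrative example (assumed dates): membership also accepts YYYYMMDD strings:
#   >>> '20240615' in DateRange('20240101', '20241231')
#   True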
b1f94422 1390@functools.cache
1391def system_identifier():
1392 python_implementation = platform.python_implementation()
1393 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1394 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
dab284f8 1395 libc_ver = []
1396 with contextlib.suppress(OSError): # We may not have access to the executable
1397 libc_ver = platform.libc_ver()
b1f94422 1398
17fc3dc4 1399 return 'Python %s (%s %s %s) - %s (%s%s)' % (
b1f94422 1400 platform.python_version(),
1401 python_implementation,
17fc3dc4 1402 platform.machine(),
b1f94422 1403 platform.architecture()[0],
1404 platform.platform(),
5b9f253f
M
1405 ssl.OPENSSL_VERSION,
1406 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
b1f94422 1407 )
c257baff
PH
1408
1409
0b9c08b4 1410@functools.cache
49fa4d9a 1411def get_windows_version():
8a82af35 1412 ''' Get Windows version. returns () if it's not running on Windows '''
49fa4d9a
N
1413 if compat_os_name == 'nt':
1414 return version_tuple(platform.win32_ver()[1])
1415 else:
8a82af35 1416 return ()
49fa4d9a
N
1417
1418
734f90bb 1419def write_string(s, out=None, encoding=None):
19a03940 1420 assert isinstance(s, str)
1421 out = out or sys.stderr
3b479100
SS
1422 # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
1423 if not out:
1424 return
7459e3a2 1425
fe1daad3 1426 if compat_os_name == 'nt' and supports_terminal_sequences(out):
3fe75fdc 1427 s = re.sub(r'([\r\n]+)', r' \1', s)
59f943cd 1428
8a82af35 1429 enc, buffer = None, out
93240fc1 1430 # `mode` might be `None` (Ref: https://github.com/yt-dlp/yt-dlp/issues/8816)
1431 if 'b' in (getattr(out, 'mode', None) or ''):
c487cf00 1432 enc = encoding or preferredencoding()
104aa738 1433 elif hasattr(out, 'buffer'):
8a82af35 1434 buffer = out.buffer
104aa738 1435 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
c487cf00 1436
8a82af35 1437 buffer.write(s.encode(enc, 'ignore') if enc else s)
7459e3a2
PH
1438 out.flush()
1439
1440
3d2623a8 1441# TODO: Use global logger
da4db748 1442def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
69bec673 1443 from .. import _IN_CLI
da4db748 1444 if _IN_CLI:
1445 if msg in deprecation_warning._cache:
1446 return
1447 deprecation_warning._cache.add(msg)
1448 if printer:
1449 return printer(f'{msg}{bug_reports_message()}', **kwargs)
1450 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
1451 else:
1452 import warnings
1453 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
1454
1455
1456deprecation_warning._cache = set()
1457
1458
48ea9cea
PH
1459def bytes_to_intlist(bs):
1460 if not bs:
1461 return []
1462 if isinstance(bs[0], int): # Python 3
1463 return list(bs)
1464 else:
1465 return [ord(c) for c in bs]
1466
c257baff 1467
cba892fa 1468def intlist_to_bytes(xs):
1469 if not xs:
1470 return b''
ac668111 1471 return struct.pack('%dB' % len(xs), *xs)
c38b1e77
PH
1472
1473
8a82af35 1474class LockingUnsupportedError(OSError):
1890fc63 1475 msg = 'File locking is not supported'
0edb3e33 1476
1477 def __init__(self):
1478 super().__init__(self.msg)
1479
1480
c1c9a79c
PH
1481# Cross-platform file locking
1482if sys.platform == 'win32':
fe0918bb 1483 import ctypes
c1c9a79c
PH
1484 import ctypes.wintypes
1485 import msvcrt
1486
1487 class OVERLAPPED(ctypes.Structure):
1488 _fields_ = [
1489 ('Internal', ctypes.wintypes.LPVOID),
1490 ('InternalHigh', ctypes.wintypes.LPVOID),
1491 ('Offset', ctypes.wintypes.DWORD),
1492 ('OffsetHigh', ctypes.wintypes.DWORD),
1493 ('hEvent', ctypes.wintypes.HANDLE),
1494 ]
1495
37e325b9 1496 kernel32 = ctypes.WinDLL('kernel32')
c1c9a79c
PH
1497 LockFileEx = kernel32.LockFileEx
1498 LockFileEx.argtypes = [
1499 ctypes.wintypes.HANDLE, # hFile
1500 ctypes.wintypes.DWORD, # dwFlags
1501 ctypes.wintypes.DWORD, # dwReserved
1502 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1503 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1504 ctypes.POINTER(OVERLAPPED) # Overlapped
1505 ]
1506 LockFileEx.restype = ctypes.wintypes.BOOL
1507 UnlockFileEx = kernel32.UnlockFileEx
1508 UnlockFileEx.argtypes = [
1509 ctypes.wintypes.HANDLE, # hFile
1510 ctypes.wintypes.DWORD, # dwReserved
1511 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1512 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1513 ctypes.POINTER(OVERLAPPED) # Overlapped
1514 ]
1515 UnlockFileEx.restype = ctypes.wintypes.BOOL
1516 whole_low = 0xffffffff
1517 whole_high = 0x7fffffff
1518
747c0bd1 1519 def _lock_file(f, exclusive, block):
c1c9a79c
PH
1520 overlapped = OVERLAPPED()
1521 overlapped.Offset = 0
1522 overlapped.OffsetHigh = 0
1523 overlapped.hEvent = 0
1524 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
747c0bd1 1525
1526 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
1527 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
1528 0, whole_low, whole_high, f._lock_file_overlapped_p):
2cb19820 1529 # NB: No argument form of "ctypes.FormatError" does not work on PyPy
1530 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
c1c9a79c
PH
1531
1532 def _unlock_file(f):
1533 assert f._lock_file_overlapped_p
1534 handle = msvcrt.get_osfhandle(f.fileno())
747c0bd1 1535 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
c1c9a79c
PH
1536 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1537
1538else:
399a76e6
YCH
1539 try:
1540 import fcntl
c1c9a79c 1541
a3125791 1542 def _lock_file(f, exclusive, block):
b63837bc 1543 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
1544 if not block:
1545 flags |= fcntl.LOCK_NB
acea8d7c 1546 try:
b63837bc 1547 fcntl.flock(f, flags)
acea8d7c
JK
1548 except BlockingIOError:
1549 raise
1550 except OSError: # AOSP does not have flock()
b63837bc 1551 fcntl.lockf(f, flags)
c1c9a79c 1552
399a76e6 1553 def _unlock_file(f):
45998b3e
E
1554 with contextlib.suppress(OSError):
1555 return fcntl.flock(f, fcntl.LOCK_UN)
1556 with contextlib.suppress(OSError):
1557 return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock()
1558 return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking
a3125791 1559
399a76e6 1560 except ImportError:
399a76e6 1561
a3125791 1562 def _lock_file(f, exclusive, block):
0edb3e33 1563 raise LockingUnsupportedError()
399a76e6
YCH
1564
1565 def _unlock_file(f):
0edb3e33 1566 raise LockingUnsupportedError()
c1c9a79c
PH
1567
1568
86e5f3ed 1569class locked_file:
0edb3e33 1570 locked = False
747c0bd1 1571
a3125791 1572 def __init__(self, filename, mode, block=True, encoding=None):
fcfa8853
JK
1573 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
1574 raise NotImplementedError(mode)
1575 self.mode, self.block = mode, block
1576
1577 writable = any(f in mode for f in 'wax+')
1578 readable = any(f in mode for f in 'r+')
1579 flags = functools.reduce(operator.ior, (
1580 getattr(os, 'O_CLOEXEC', 0), # UNIX only
1581 getattr(os, 'O_BINARY', 0), # Windows only
1582 getattr(os, 'O_NOINHERIT', 0), # Windows only
1583 os.O_CREAT if writable else 0, # O_TRUNC only after locking
1584 os.O_APPEND if 'a' in mode else 0,
1585 os.O_EXCL if 'x' in mode else 0,
1586 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
1587 ))
1588
98804d03 1589 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
c1c9a79c
PH
1590
1591 def __enter__(self):
a3125791 1592 exclusive = 'r' not in self.mode
c1c9a79c 1593 try:
a3125791 1594 _lock_file(self.f, exclusive, self.block)
0edb3e33 1595 self.locked = True
86e5f3ed 1596 except OSError:
c1c9a79c
PH
1597 self.f.close()
1598 raise
fcfa8853 1599 if 'w' in self.mode:
131e14dc
JK
1600 try:
1601 self.f.truncate()
1602 except OSError as e:
1890fc63 1603 if e.errno not in (
1604 errno.ESPIPE, # Illegal seek - expected for FIFO
1605 errno.EINVAL, # Invalid argument - expected for /dev/null
1606 ):
1607 raise
c1c9a79c
PH
1608 return self
1609
0edb3e33 1610 def unlock(self):
1611 if not self.locked:
1612 return
c1c9a79c 1613 try:
0edb3e33 1614 _unlock_file(self.f)
c1c9a79c 1615 finally:
0edb3e33 1616 self.locked = False
c1c9a79c 1617
0edb3e33 1618 def __exit__(self, *_):
1619 try:
1620 self.unlock()
1621 finally:
1622 self.f.close()
4eb7f1d1 1623
0edb3e33 1624 open = __enter__
1625 close = __exit__
a3125791 1626
0edb3e33 1627 def __getattr__(self, attr):
1628 return getattr(self.f, attr)
a3125791 1629
0edb3e33 1630 def __iter__(self):
1631 return iter(self.f)
a3125791 1632
4eb7f1d1 1633
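# A minimal usage sketch of locked_file: it opens the file and takes a shared
# lock for 'r'/'rb' modes, or an exclusive lock otherwise, releasing it on
# exit. The filename here is purely illustrative:
#   with locked_file('download-archive.txt', 'a', block=True, encoding='utf-8') as f:
#       f.write('youtube abc123\n')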
0b9c08b4 1634@functools.cache
4644ac55
S
1635def get_filesystem_encoding():
1636 encoding = sys.getfilesystemencoding()
1637 return encoding if encoding is not None else 'utf-8'
1638
1639
4eb7f1d1 1640def shell_quote(args):
a6a173c2 1641 quoted_args = []
4644ac55 1642 encoding = get_filesystem_encoding()
a6a173c2
JMF
1643 for a in args:
1644 if isinstance(a, bytes):
1645 # We may get a filename encoded with 'encodeFilename'
1646 a = a.decode(encoding)
aefce8e6 1647 quoted_args.append(compat_shlex_quote(a))
28e614de 1648 return ' '.join(quoted_args)
9d4660ca
PH
1649
1650
1651def smuggle_url(url, data):
1652 """ Pass additional data in a URL for internal use. """
1653
81953d1a
RA
1654 url, idata = unsmuggle_url(url, {})
1655 data.update(idata)
14f25df2 1656 sdata = urllib.parse.urlencode(
28e614de
PH
1657 {'__youtubedl_smuggle': json.dumps(data)})
1658 return url + '#' + sdata
9d4660ca
PH
1659
1660
79f82953 1661def unsmuggle_url(smug_url, default=None):
83e865a3 1662 if '#__youtubedl_smuggle' not in smug_url:
79f82953 1663 return smug_url, default
28e614de 1664 url, _, sdata = smug_url.rpartition('#')
14f25df2 1665 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
1666 data = json.loads(jsond)
1667 return url, data
02dbf93f
PH
1668
1669
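# A minimal round-trip sketch for the two helpers above; the URL is purely
# illustrative. The smuggled data travels in the URL fragment and is recovered
# unchanged:
#   >>> url = smuggle_url('https://example.com/video', {'referrer': 'https://example.com/'})
#   >>> unsmuggle_url(url)
#   ('https://example.com/video', {'referrer': 'https://example.com/'})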
e0fd9573 1670def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
1671 """ Formats numbers with decimal sufixes like K, M, etc """
1672 num, factor = float_or_none(num), float(factor)
4c3f8c3f 1673 if num is None or num < 0:
e0fd9573 1674 return None
eeb2a770 1675 POSSIBLE_SUFFIXES = 'kMGTPEZY'
1676 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
1677 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
abbeeebc 1678 if factor == 1024:
1679 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 1680 converted = num / (factor ** exponent)
abbeeebc 1681 return fmt % (converted, suffix)
e0fd9573 1682
1683
02dbf93f 1684def format_bytes(bytes):
f02d24d8 1685 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
f53c966a 1686
1c088fa8 1687
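# Expected values for the two formatters above (a sketch, assuming the
# implementation as written):
#   >>> format_decimal_suffix(1500)
#   '1k'
#   >>> format_decimal_suffix(1536, '%.1f%sB', factor=1024)
#   '1.5KiB'
#   >>> format_bytes(1536), format_bytes(None)
#   ('1.50KiB', 'N/A')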
64c464a1 1688def lookup_unit_table(unit_table, s, strict=False):
1689 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
fb47597b 1690 units_re = '|'.join(re.escape(u) for u in unit_table)
64c464a1 1691 m = (re.fullmatch if strict else re.match)(
1692 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
fb47597b
S
1693 if not m:
1694 return None
64c464a1 1695
1696 num = float(m.group('num').replace(',', '.'))
fb47597b 1697 mult = unit_table[m.group('unit')]
64c464a1 1698 return round(num * mult)
1699
1700
1701def parse_bytes(s):
1702 """Parse a string indicating a byte quantity into an integer"""
1703 return lookup_unit_table(
1704 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
1705 s.upper(), strict=True)
fb47597b
S
1706
1707
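# A sketch of parse_bytes() behaviour: suffixes are case-insensitive and
# always binary multiples (K = 1024, M = 1024**2, ...):
#   >>> parse_bytes('500K')
#   512000
#   >>> parse_bytes('1.5m')
#   1572864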
be64b5b0
PH
1708def parse_filesize(s):
1709 if s is None:
1710 return None
1711
dfb1b146 1712 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
1713 # but we support those too
1714 _UNIT_TABLE = {
1715 'B': 1,
1716 'b': 1,
70852b47 1717 'bytes': 1,
be64b5b0
PH
1718 'KiB': 1024,
1719 'KB': 1000,
1720 'kB': 1024,
1721 'Kb': 1000,
13585d76 1722 'kb': 1000,
70852b47
YCH
1723 'kilobytes': 1000,
1724 'kibibytes': 1024,
be64b5b0
PH
1725 'MiB': 1024 ** 2,
1726 'MB': 1000 ** 2,
1727 'mB': 1024 ** 2,
1728 'Mb': 1000 ** 2,
13585d76 1729 'mb': 1000 ** 2,
70852b47
YCH
1730 'megabytes': 1000 ** 2,
1731 'mebibytes': 1024 ** 2,
be64b5b0
PH
1732 'GiB': 1024 ** 3,
1733 'GB': 1000 ** 3,
1734 'gB': 1024 ** 3,
1735 'Gb': 1000 ** 3,
13585d76 1736 'gb': 1000 ** 3,
70852b47
YCH
1737 'gigabytes': 1000 ** 3,
1738 'gibibytes': 1024 ** 3,
be64b5b0
PH
1739 'TiB': 1024 ** 4,
1740 'TB': 1000 ** 4,
1741 'tB': 1024 ** 4,
1742 'Tb': 1000 ** 4,
13585d76 1743 'tb': 1000 ** 4,
70852b47
YCH
1744 'terabytes': 1000 ** 4,
1745 'tebibytes': 1024 ** 4,
be64b5b0
PH
1746 'PiB': 1024 ** 5,
1747 'PB': 1000 ** 5,
1748 'pB': 1024 ** 5,
1749 'Pb': 1000 ** 5,
13585d76 1750 'pb': 1000 ** 5,
70852b47
YCH
1751 'petabytes': 1000 ** 5,
1752 'pebibytes': 1024 ** 5,
be64b5b0
PH
1753 'EiB': 1024 ** 6,
1754 'EB': 1000 ** 6,
1755 'eB': 1024 ** 6,
1756 'Eb': 1000 ** 6,
13585d76 1757 'eb': 1000 ** 6,
70852b47
YCH
1758 'exabytes': 1000 ** 6,
1759 'exbibytes': 1024 ** 6,
be64b5b0
PH
1760 'ZiB': 1024 ** 7,
1761 'ZB': 1000 ** 7,
1762 'zB': 1024 ** 7,
1763 'Zb': 1000 ** 7,
13585d76 1764 'zb': 1000 ** 7,
70852b47
YCH
1765 'zettabytes': 1000 ** 7,
1766 'zebibytes': 1024 ** 7,
be64b5b0
PH
1767 'YiB': 1024 ** 8,
1768 'YB': 1000 ** 8,
1769 'yB': 1024 ** 8,
1770 'Yb': 1000 ** 8,
13585d76 1771 'yb': 1000 ** 8,
70852b47
YCH
1772 'yottabytes': 1000 ** 8,
1773 'yobibytes': 1024 ** 8,
be64b5b0
PH
1774 }
1775
fb47597b
S
1776 return lookup_unit_table(_UNIT_TABLE, s)
1777
1778
1779def parse_count(s):
1780 if s is None:
be64b5b0
PH
1781 return None
1782
352d5da8 1783 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
1784
1785 if re.match(r'^[\d,.]+$', s):
1786 return str_to_int(s)
1787
1788 _UNIT_TABLE = {
1789 'k': 1000,
1790 'K': 1000,
1791 'm': 1000 ** 2,
1792 'M': 1000 ** 2,
1793 'kk': 1000 ** 2,
1794 'KK': 1000 ** 2,
352d5da8 1795 'b': 1000 ** 3,
1796 'B': 1000 ** 3,
fb47597b 1797 }
be64b5b0 1798
352d5da8 1799 ret = lookup_unit_table(_UNIT_TABLE, s)
1800 if ret is not None:
1801 return ret
1802
1803 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
1804 if mobj:
1805 return str_to_int(mobj.group(1))
be64b5b0 1806
2f7ae819 1807
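# A sketch of the two parsers above: parse_filesize() honours the
# decimal/binary distinction of the unit, while parse_count() handles
# shorthand like "1.2M" and plain comma-separated counts:
#   >>> parse_filesize('1.5 MiB')
#   1572864
#   >>> parse_filesize('5 GB')
#   5000000000
#   >>> parse_count('1.2M')
#   1200000
#   >>> parse_count('1,234 views')
#   1234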
5d45484c 1808def parse_resolution(s, *, lenient=False):
b871d7e9
S
1809 if s is None:
1810 return {}
1811
5d45484c
LNO
1812 if lenient:
1813 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
1814 else:
1815 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
1816 if mobj:
1817 return {
1818 'width': int(mobj.group('w')),
1819 'height': int(mobj.group('h')),
1820 }
1821
17ec8bcf 1822 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
1823 if mobj:
1824 return {'height': int(mobj.group(1))}
1825
1826 mobj = re.search(r'\b([48])[kK]\b', s)
1827 if mobj:
1828 return {'height': int(mobj.group(1)) * 540}
1829
1830 return {}
1831
1832
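# A sketch of the three patterns parse_resolution() recognises:
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('720p')
#   {'height': 720}
#   >>> parse_resolution('4K')
#   {'height': 2160}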
0dc41787 1833def parse_bitrate(s):
14f25df2 1834 if not isinstance(s, str):
0dc41787
S
1835 return
1836 mobj = re.search(r'\b(\d+)\s*kbps', s)
1837 if mobj:
1838 return int(mobj.group(1))
1839
1840
a942d6cb 1841def month_by_name(name, lang='en'):
caefb1de
PH
1842 """ Return the number of a month by (locale-independently) English name """
1843
f6717dec 1844 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 1845
caefb1de 1846 try:
f6717dec 1847 return month_names.index(name) + 1
7105440c
YCH
1848 except ValueError:
1849 return None
1850
1851
1852def month_by_abbreviation(abbrev):
1853 """ Return the number of a month by (locale-independently) English
1854 abbreviations """
1855
1856 try:
1857 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
1858 except ValueError:
1859 return None
18258362
JMF
1860
1861
5aafe895 1862def fix_xml_ampersands(xml_str):
18258362 1863 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
1864 return re.sub(
1865 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 1866 '&amp;',
5aafe895 1867 xml_str)
e3946f98
PH
1868
1869
1870def setproctitle(title):
14f25df2 1871 assert isinstance(title, str)
c1c05c67 1872
fe0918bb 1873 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
1874 try:
1875 import ctypes
1876 except ImportError:
c1c05c67
YCH
1877 return
1878
e3946f98 1879 try:
611c1dd9 1880 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
1881 except OSError:
1882 return
2f49bcd6
RC
1883 except TypeError:
1884 # LoadLibrary in Windows Python 2.7.13 only expects
1885 # a bytestring, but since unicode_literals turns
1886 # every string into a unicode string, it fails.
1887 return
0f06bcd7 1888 title_bytes = title.encode()
6eefe533
PH
1889 buf = ctypes.create_string_buffer(len(title_bytes))
1890 buf.value = title_bytes
e3946f98 1891 try:
f9fb3ce8 1892 # PR_SET_NAME = 15 Ref: /usr/include/linux/prctl.h
6eefe533 1893 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
1894 except AttributeError:
1895 return # Strange libc, just skip this
d7dda168
PH
1896
1897
1898def remove_start(s, start):
46bc9b7d 1899 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
1900
1901
2b9faf55 1902def remove_end(s, end):
46bc9b7d 1903 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
1904
1905
31b2051e
S
1906def remove_quotes(s):
1907 if s is None or len(s) < 2:
1908 return s
1909 for quote in ('"', "'", ):
1910 if s[0] == quote and s[-1] == quote:
1911 return s[1:-1]
1912 return s
1913
1914
b6e0c7d2 1915def get_domain(url):
ebf99aaf 1916 """
1917 This implementation is inconsistent, but is kept for compatibility.
1918 Use this only for "webpage_url_domain"
1919 """
1920 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
b6e0c7d2
U
1921
1922
29eb5174 1923def url_basename(url):
14f25df2 1924 path = urllib.parse.urlparse(url).path
28e614de 1925 return path.strip('/').split('/')[-1]
aa94a6d3
PH
1926
1927
02dc0a36 1928def base_url(url):
7657ec7e 1929 return re.match(r'https?://[^?#]+/', url).group()
02dc0a36
S
1930
1931
e34c3361 1932def urljoin(base, path):
4b5de77b 1933 if isinstance(path, bytes):
0f06bcd7 1934 path = path.decode()
14f25df2 1935 if not isinstance(path, str) or not path:
e34c3361 1936 return None
fad4ceb5 1937 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 1938 return path
4b5de77b 1939 if isinstance(base, bytes):
0f06bcd7 1940 base = base.decode()
14f25df2 1941 if not isinstance(base, str) or not re.match(
4b5de77b 1942 r'^(?:https?:)?//', base):
e34c3361 1943 return None
14f25df2 1944 return urllib.parse.urljoin(base, path)
e34c3361
S
1945
1946
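# A sketch of urljoin() with purely illustrative URLs; protocol-relative paths
# are returned as-is and an unusable base yields None:
#   >>> urljoin('https://example.com/a/', 'b/c.mp4')
#   'https://example.com/a/b/c.mp4'
#   >>> urljoin('https://example.com/a/', '//cdn.example.com/c.mp4')
#   '//cdn.example.com/c.mp4'
#   >>> urljoin(None, 'b/c.mp4') is None
#   True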
9732d77e 1947def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
9e907ebd 1948 if get_attr and v is not None:
1949 v = getattr(v, get_attr, None)
1812afb7
S
1950 try:
1951 return int(v) * invscale // scale
31c49255 1952 except (ValueError, TypeError, OverflowError):
af98f8ff 1953 return default
9732d77e 1954
9572013d 1955
40a90862 1956def str_or_none(v, default=None):
14f25df2 1957 return default if v is None else str(v)
40a90862 1958
9732d77e
PH
1959
1960def str_to_int(int_str):
48d4681e 1961 """ A more relaxed version of int_or_none """
f9934b96 1962 if isinstance(int_str, int):
348c6bf1 1963 return int_str
14f25df2 1964 elif isinstance(int_str, str):
42db58ec
S
1965 int_str = re.sub(r'[,\.\+]', '', int_str)
1966 return int_or_none(int_str)
608d11f5
PH
1967
1968
9732d77e 1969def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
1970 if v is None:
1971 return default
1972 try:
1973 return float(v) * invscale / scale
5e1271c5 1974 except (ValueError, TypeError):
caf80631 1975 return default
43f775e4
PH
1976
1977
c7e327c4
S
1978def bool_or_none(v, default=None):
1979 return v if isinstance(v, bool) else default
1980
1981
53cd37ba 1982def strip_or_none(v, default=None):
14f25df2 1983 return v.strip() if isinstance(v, str) else default
b72b4431
S
1984
1985
af03000a 1986def url_or_none(url):
14f25df2 1987 if not url or not isinstance(url, str):
af03000a
S
1988 return None
1989 url = url.strip()
29f7c58a 1990 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
1991
1992
ad54c913 1993def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
e29663c6 1994 datetime_object = None
1995 try:
f9934b96 1996 if isinstance(timestamp, (int, float)): # unix timestamp
d509c1f5 1997 # Using naive datetime here can break timestamp() in Windows
1998 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
c305a25c 1999 # Also, dt.datetime.fromtimestamp breaks for negative timestamps
a35af430 2000 # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
c305a25c 2001 datetime_object = (dt.datetime.fromtimestamp(0, dt.timezone.utc)
2002 + dt.timedelta(seconds=timestamp))
14f25df2 2003 elif isinstance(timestamp, str): # assume YYYYMMDD
c305a25c 2004 datetime_object = dt.datetime.strptime(timestamp, '%Y%m%d')
9665f15a 2005 date_format = re.sub( # Support %s on windows
2006 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
e29663c6 2007 return datetime_object.strftime(date_format)
2008 except (ValueError, TypeError, AttributeError):
2009 return default
2010
2011
608d11f5 2012def parse_duration(s):
f9934b96 2013 if not isinstance(s, str):
608d11f5 2014 return None
ca7b3246 2015 s = s.strip()
38d79fd1 2016 if not s:
2017 return None
ca7b3246 2018
acaff495 2019 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2020 m = re.match(r'''(?x)
2021 (?P<before_secs>
2022 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2023 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2024 (?P<ms>[.:][0-9]+)?Z?$
2025 ''', s)
acaff495 2026 if m:
8bd1c00b 2027 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2028 else:
2029 m = re.match(
056653bb
S
2030 r'''(?ix)(?:P?
2031 (?:
1c1b2f96 2032 [0-9]+\s*y(?:ears?)?,?\s*
056653bb
S
2033 )?
2034 (?:
1c1b2f96 2035 [0-9]+\s*m(?:onths?)?,?\s*
056653bb
S
2036 )?
2037 (?:
1c1b2f96 2038 [0-9]+\s*w(?:eeks?)?,?\s*
056653bb 2039 )?
8f4b58d7 2040 (?:
1c1b2f96 2041 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
8f4b58d7 2042 )?
056653bb 2043 T)?
acaff495 2044 (?:
af868732 2045 (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
acaff495 2046 )?
2047 (?:
1c1b2f96 2048 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
acaff495 2049 )?
2050 (?:
2051 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2052 )?Z?$''', s)
acaff495 2053 if m:
2054 days, hours, mins, secs, ms = m.groups()
2055 else:
15846398 2056 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2057 if m:
2058 hours, mins = m.groups()
2059 else:
2060 return None
2061
acaff495 2062 if ms:
19a03940 2063 ms = ms.replace(':', '.')
2064 return sum(float(part or 0) * mult for part, mult in (
2065 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2066
2067
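# A sketch of the formats parse_duration() accepts (the result is always
# seconds as a float):
#   >>> parse_duration('1:23:45')
#   5025.0
#   >>> parse_duration('1h30m')
#   5400.0
#   >>> parse_duration('PT1M30S')
#   90.0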
e65e4c88 2068def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2069 name, real_ext = os.path.splitext(filename)
e65e4c88 2070 return (
86e5f3ed 2071 f'{name}.{ext}{real_ext}'
e65e4c88 2072 if not expected_real_ext or real_ext[1:] == expected_real_ext
86e5f3ed 2073 else f'{filename}.{ext}')
d70ad093
PH
2074
2075
b3ed15b7
S
2076def replace_extension(filename, ext, expected_real_ext=None):
2077 name, real_ext = os.path.splitext(filename)
86e5f3ed 2078 return '{}.{}'.format(
b3ed15b7
S
2079 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2080 ext)
2081
2082
d70ad093
PH
2083def check_executable(exe, args=[]):
2084 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2085 args can be a list of arguments for a short output (like -version) """
2086 try:
f0c9fb96 2087 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
d70ad093
PH
2088 except OSError:
2089 return False
2090 return exe
b7ab0590
PH
2091
2092
7aaf4cd2 2093def _get_exe_version_output(exe, args):
95807118 2094 try:
b64d04c1 2095 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2096 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2097 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
1cdda329 2098 stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
2099 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2100 if ret:
2101 return None
95807118
PH
2102 except OSError:
2103 return False
f0c9fb96 2104 return stdout
cae97f65
PH
2105
2106
2107def detect_exe_version(output, version_re=None, unrecognized='present'):
14f25df2 2108 assert isinstance(output, str)
cae97f65
PH
2109 if version_re is None:
2110 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2111 m = re.search(version_re, output)
95807118
PH
2112 if m:
2113 return m.group(1)
2114 else:
2115 return unrecognized
2116
2117
9af98e17 2118def get_exe_version(exe, args=['--version'],
1cdda329 2119 version_re=None, unrecognized=('present', 'broken')):
9af98e17 2120 """ Returns the version of the specified executable,
2121 or False if the executable is not present """
1cdda329 2122 unrecognized = variadic(unrecognized)
2123 assert len(unrecognized) in (1, 2)
9af98e17 2124 out = _get_exe_version_output(exe, args)
1cdda329 2125 if out is None:
2126 return unrecognized[-1]
2127 return out and detect_exe_version(out, version_re, unrecognized[0])
9af98e17 2128
2129
7e88d7d7 2130def frange(start=0, stop=None, step=1):
2131 """Float range"""
2132 if stop is None:
2133 start, stop = 0, start
2134 sign = [-1, 1][step > 0] if step else 0
2135 while sign * start < sign * stop:
2136 yield start
2137 start += step
2138
2139
cb89cfc1 2140class LazyList(collections.abc.Sequence):
0f06bcd7 2141 """Lazy immutable list from an iterable
2142 Note that slices of a LazyList are lists and not LazyList"""
483336e7 2143
8e5fecc8 2144 class IndexError(IndexError):
2145 pass
2146
282f5709 2147 def __init__(self, iterable, *, reverse=False, _cache=None):
0f06bcd7 2148 self._iterable = iter(iterable)
2149 self._cache = [] if _cache is None else _cache
2150 self._reversed = reverse
483336e7 2151
2152 def __iter__(self):
0f06bcd7 2153 if self._reversed:
28419ca2 2154 # We need to consume the entire iterable to iterate in reverse
981052c9 2155 yield from self.exhaust()
28419ca2 2156 return
0f06bcd7 2157 yield from self._cache
2158 for item in self._iterable:
2159 self._cache.append(item)
483336e7 2160 yield item
2161
0f06bcd7 2162 def _exhaust(self):
2163 self._cache.extend(self._iterable)
2164 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2165 return self._cache
28419ca2 2166
981052c9 2167 def exhaust(self):
0f06bcd7 2168 """Evaluate the entire iterable"""
2169 return self._exhaust()[::-1 if self._reversed else 1]
981052c9 2170
28419ca2 2171 @staticmethod
0f06bcd7 2172 def _reverse_index(x):
f2df4071 2173 return None if x is None else ~x
483336e7 2174
2175 def __getitem__(self, idx):
2176 if isinstance(idx, slice):
0f06bcd7 2177 if self._reversed:
2178 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
e0f2b4b4 2179 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2180 elif isinstance(idx, int):
0f06bcd7 2181 if self._reversed:
2182 idx = self._reverse_index(idx)
e0f2b4b4 2183 start, stop, step = idx, idx, 0
483336e7 2184 else:
2185 raise TypeError('indices must be integers or slices')
e0f2b4b4 2186 if ((start or 0) < 0 or (stop or 0) < 0
2187 or (start is None and step < 0)
2188 or (stop is None and step > 0)):
483336e7 2189 # We need to consume the entire iterable to be able to slice from the end
2190 # Obviously, never use this with infinite iterables
0f06bcd7 2191 self._exhaust()
8e5fecc8 2192 try:
0f06bcd7 2193 return self._cache[idx]
8e5fecc8 2194 except IndexError as e:
2195 raise self.IndexError(e) from e
0f06bcd7 2196 n = max(start or 0, stop or 0) - len(self._cache) + 1
28419ca2 2197 if n > 0:
0f06bcd7 2198 self._cache.extend(itertools.islice(self._iterable, n))
8e5fecc8 2199 try:
0f06bcd7 2200 return self._cache[idx]
8e5fecc8 2201 except IndexError as e:
2202 raise self.IndexError(e) from e
483336e7 2203
2204 def __bool__(self):
2205 try:
0f06bcd7 2206 self[-1] if self._reversed else self[0]
8e5fecc8 2207 except self.IndexError:
483336e7 2208 return False
2209 return True
2210
2211 def __len__(self):
0f06bcd7 2212 self._exhaust()
2213 return len(self._cache)
483336e7 2214
282f5709 2215 def __reversed__(self):
0f06bcd7 2216 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
282f5709 2217
2218 def __copy__(self):
0f06bcd7 2219 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
282f5709 2220
28419ca2 2221 def __repr__(self):
2222 # repr and str should mimic a list. So we exhaust the iterable
2223 return repr(self.exhaust())
2224
2225 def __str__(self):
2226 return repr(self.exhaust())
2227
483336e7 2228
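# A minimal usage sketch of LazyList: the underlying iterator is consumed only
# as far as needed, and slices come back as plain lists:
#   >>> lazy = LazyList(str(i) for i in itertools.count())
#   >>> lazy[3]
#   '3'
#   >>> lazy[:3]
#   ['0', '1', '2']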
7be9ccff 2229class PagedList:
c07a39ae 2230
2231 class IndexError(IndexError):
2232 pass
2233
dd26ced1
PH
2234 def __len__(self):
2235 # This is only useful for tests
2236 return len(self.getslice())
2237
7be9ccff 2238 def __init__(self, pagefunc, pagesize, use_cache=True):
2239 self._pagefunc = pagefunc
2240 self._pagesize = pagesize
f1d13090 2241 self._pagecount = float('inf')
7be9ccff 2242 self._use_cache = use_cache
2243 self._cache = {}
2244
2245 def getpage(self, pagenum):
d8cf8d97 2246 page_results = self._cache.get(pagenum)
2247 if page_results is None:
f1d13090 2248 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
7be9ccff 2249 if self._use_cache:
2250 self._cache[pagenum] = page_results
2251 return page_results
2252
2253 def getslice(self, start=0, end=None):
2254 return list(self._getslice(start, end))
2255
2256 def _getslice(self, start, end):
55575225 2257 raise NotImplementedError('This method must be implemented by subclasses')
2258
2259 def __getitem__(self, idx):
f1d13090 2260 assert self._use_cache, 'Indexing PagedList requires cache'
55575225 2261 if not isinstance(idx, int) or idx < 0:
2262 raise TypeError('indices must be non-negative integers')
2263 entries = self.getslice(idx, idx + 1)
d8cf8d97 2264 if not entries:
c07a39ae 2265 raise self.IndexError()
d8cf8d97 2266 return entries[0]
55575225 2267
f9fb3ce8
SS
2268 def __bool__(self):
2269 return bool(self.getslice(0, 1))
2270
9c44d242
PH
2271
2272class OnDemandPagedList(PagedList):
a44ca5a4 2273 """Download pages until a page with fewer than the maximum number of results"""
86e5f3ed 2274
7be9ccff 2275 def _getslice(self, start, end):
b7ab0590
PH
2276 for pagenum in itertools.count(start // self._pagesize):
2277 firstid = pagenum * self._pagesize
2278 nextfirstid = pagenum * self._pagesize + self._pagesize
2279 if start >= nextfirstid:
2280 continue
2281
b7ab0590
PH
2282 startv = (
2283 start % self._pagesize
2284 if firstid <= start < nextfirstid
2285 else 0)
b7ab0590
PH
2286 endv = (
2287 ((end - 1) % self._pagesize) + 1
2288 if (end is not None and firstid <= end <= nextfirstid)
2289 else None)
2290
f1d13090 2291 try:
2292 page_results = self.getpage(pagenum)
2293 except Exception:
2294 self._pagecount = pagenum - 1
2295 raise
b7ab0590
PH
2296 if startv != 0 or endv is not None:
2297 page_results = page_results[startv:endv]
7be9ccff 2298 yield from page_results
b7ab0590
PH
2299
2300 # A little optimization - if current page is not "full", i.e. does
2301 # not contain page_size videos then we can assume that this page
2302 # is the last one - there are no more ids on further pages -
2303 # i.e. no need to query again.
2304 if len(page_results) + startv < self._pagesize:
2305 break
2306
2307 # If we got the whole page, but the next page is not interesting,
2308 # break out early as well
2309 if end == nextfirstid:
2310 break
81c2f20b
PH
2311
2312
9c44d242 2313class InAdvancePagedList(PagedList):
a44ca5a4 2314 """PagedList with total number of pages known in advance"""
86e5f3ed 2315
9c44d242 2316 def __init__(self, pagefunc, pagecount, pagesize):
7be9ccff 2317 PagedList.__init__(self, pagefunc, pagesize, True)
f1d13090 2318 self._pagecount = pagecount
9c44d242 2319
7be9ccff 2320 def _getslice(self, start, end):
9c44d242 2321 start_page = start // self._pagesize
d37707bd 2322 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
9c44d242
PH
2323 skip_elems = start - start_page * self._pagesize
2324 only_more = None if end is None else end - start
2325 for pagenum in range(start_page, end_page):
7be9ccff 2326 page_results = self.getpage(pagenum)
9c44d242 2327 if skip_elems:
7be9ccff 2328 page_results = page_results[skip_elems:]
9c44d242
PH
2329 skip_elems = None
2330 if only_more is not None:
7be9ccff 2331 if len(page_results) < only_more:
2332 only_more -= len(page_results)
9c44d242 2333 else:
7be9ccff 2334 yield from page_results[:only_more]
9c44d242 2335 break
7be9ccff 2336 yield from page_results
9c44d242
PH
2337
2338
7e88d7d7 2339class PlaylistEntries:
2340 MissingEntry = object()
2341 is_exhausted = False
2342
2343 def __init__(self, ydl, info_dict):
7e9a6125 2344 self.ydl = ydl
2345
2346 # _entries must be assigned now since infodict can change during iteration
2347 entries = info_dict.get('entries')
2348 if entries is None:
2349 raise EntryNotInPlaylist('There are no entries')
2350 elif isinstance(entries, list):
2351 self.is_exhausted = True
2352
2353 requested_entries = info_dict.get('requested_entries')
bc5c2f8a 2354 self.is_incomplete = requested_entries is not None
7e9a6125 2355 if self.is_incomplete:
2356 assert self.is_exhausted
bc5c2f8a 2357 self._entries = [self.MissingEntry] * max(requested_entries or [0])
7e9a6125 2358 for i, entry in zip(requested_entries, entries):
2359 self._entries[i - 1] = entry
2360 elif isinstance(entries, (list, PagedList, LazyList)):
2361 self._entries = entries
2362 else:
2363 self._entries = LazyList(entries)
7e88d7d7 2364
2365 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2366 (?P<start>[+-]?\d+)?
2367 (?P<range>[:-]
2368 (?P<end>[+-]?\d+|inf(?:inite)?)?
2369 (?::(?P<step>[+-]?\d+))?
2370 )?''')
2371
2372 @classmethod
2373 def parse_playlist_items(cls, string):
2374 for segment in string.split(','):
2375 if not segment:
2376 raise ValueError('There are two or more consecutive commas')
2377 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2378 if not mobj:
2379 raise ValueError(f'{segment!r} is not a valid specification')
2380 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2381 if int_or_none(step) == 0:
2382 raise ValueError(f'Step in {segment!r} cannot be zero')
2383 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2384
2385 def get_requested_items(self):
2386 playlist_items = self.ydl.params.get('playlist_items')
2387 playlist_start = self.ydl.params.get('playliststart', 1)
2388 playlist_end = self.ydl.params.get('playlistend')
2389 # For backwards compatibility, interpret -1 as whole list
2390 if playlist_end in (-1, None):
2391 playlist_end = ''
2392 if not playlist_items:
2393 playlist_items = f'{playlist_start}:{playlist_end}'
2394 elif playlist_start != 1 or playlist_end:
2395 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2396
2397 for index in self.parse_playlist_items(playlist_items):
2398 for i, entry in self[index]:
2399 yield i, entry
1ac4fd80 2400 if not entry:
2401 continue
7e88d7d7 2402 try:
d21056f4 2403 # The item may have just been added to archive. Don't break due to it
2404 if not self.ydl.params.get('lazy_playlist'):
2405 # TODO: Add auto-generated fields
2406 self.ydl._match_entry(entry, incomplete=True, silent=True)
7e88d7d7 2407 except (ExistingVideoReached, RejectedVideoReached):
2408 return
2409
7e9a6125 2410 def get_full_count(self):
2411 if self.is_exhausted and not self.is_incomplete:
7e88d7d7 2412 return len(self)
2413 elif isinstance(self._entries, InAdvancePagedList):
2414 if self._entries._pagesize == 1:
2415 return self._entries._pagecount
2416
7e88d7d7 2417 @functools.cached_property
2418 def _getter(self):
2419 if isinstance(self._entries, list):
2420 def get_entry(i):
2421 try:
2422 entry = self._entries[i]
2423 except IndexError:
2424 entry = self.MissingEntry
2425 if not self.is_incomplete:
2426 raise self.IndexError()
2427 if entry is self.MissingEntry:
bc5c2f8a 2428 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
7e88d7d7 2429 return entry
2430 else:
2431 def get_entry(i):
2432 try:
2433 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2434 except (LazyList.IndexError, PagedList.IndexError):
2435 raise self.IndexError()
2436 return get_entry
2437
2438 def __getitem__(self, idx):
2439 if isinstance(idx, int):
2440 idx = slice(idx, idx)
2441
2442 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2443 step = 1 if idx.step is None else idx.step
2444 if idx.start is None:
2445 start = 0 if step > 0 else len(self) - 1
2446 else:
2447 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2448
2449 # NB: Do not call len(self) when idx == [:]
2450 if idx.stop is None:
2451 stop = 0 if step < 0 else float('inf')
2452 else:
2453 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2454 stop += [-1, 1][step > 0]
2455
2456 for i in frange(start, stop, step):
2457 if i < 0:
2458 continue
2459 try:
7e9a6125 2460 entry = self._getter(i)
2461 except self.IndexError:
2462 self.is_exhausted = True
2463 if step > 0:
7e88d7d7 2464 break
7e9a6125 2465 continue
7e88d7d7 2466 yield i + 1, entry
2467
2468 def __len__(self):
2469 return len(tuple(self[:]))
2470
2471 class IndexError(IndexError):
2472 pass
2473
2474
81c2f20b 2475def uppercase_escape(s):
676eb3f2 2476 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2477 return re.sub(
a612753d 2478 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2479 lambda m: unicode_escape(m.group(0))[0],
2480 s)
0fe2ff78
YCH
2481
2482
2483def lowercase_escape(s):
2484 unicode_escape = codecs.getdecoder('unicode_escape')
2485 return re.sub(
2486 r'\\u[0-9a-fA-F]{4}',
2487 lambda m: unicode_escape(m.group(0))[0],
2488 s)
b53466e1 2489
d05cfe06 2490
96b9e9cf 2491def parse_qs(url, **kwargs):
2492 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
4dfbf869 2493
2494
62e609ab
PH
2495def read_batch_urls(batch_fd):
2496 def fixup(url):
14f25df2 2497 if not isinstance(url, str):
62e609ab 2498 url = url.decode('utf-8', 'replace')
8c04f0be 2499 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2500 for bom in BOM_UTF8:
2501 if url.startswith(bom):
2502 url = url[len(bom):]
2503 url = url.lstrip()
2504 if not url or url.startswith(('#', ';', ']')):
62e609ab 2505 return False
8c04f0be 2506 # "#" cannot be stripped out since it is part of the URI
962ffcf8 2507 # However, it can be safely stripped out when it follows a whitespace
8c04f0be 2508 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
2509
2510 with contextlib.closing(batch_fd) as fd:
2511 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2512
2513
2514def urlencode_postdata(*args, **kargs):
14f25df2 2515 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2516
2517
45b2ee6f 2518def update_url(url, *, query_update=None, **kwargs):
2519 """Replace URL components specified by kwargs
2520 @param url str or parsed URL tuple
2521 @param query_update update query
2522 @returns str
2523 """
2524 if isinstance(url, str):
2525 if not kwargs and not query_update:
2526 return url
2527 else:
2528 url = urllib.parse.urlparse(url)
2529 if query_update:
2530 assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
2531 kwargs['query'] = urllib.parse.urlencode({
2532 **urllib.parse.parse_qs(url.query),
2533 **query_update
2534 }, True)
2535 return urllib.parse.urlunparse(url._replace(**kwargs))
2536
2537
38f9ef31 2538def update_url_query(url, query):
45b2ee6f 2539 return update_url(url, query_update=query)
16392824 2540
8e60dc75 2541
10c87c15 2542def _multipart_encode_impl(data, boundary):
0c265486
YCH
2543 content_type = 'multipart/form-data; boundary=%s' % boundary
2544
2545 out = b''
2546 for k, v in data.items():
2547 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 2548 if isinstance(k, str):
0f06bcd7 2549 k = k.encode()
14f25df2 2550 if isinstance(v, str):
0f06bcd7 2551 v = v.encode()
0c265486
YCH
2552 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2553 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 2554 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
2555 if boundary.encode('ascii') in content:
2556 raise ValueError('Boundary overlaps with data')
2557 out += content
2558
2559 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2560
2561 return out, content_type
2562
2563
2564def multipart_encode(data, boundary=None):
2565 '''
2566 Encode a dict to RFC 7578-compliant form-data
2567
2568 data:
2569 A dict where keys and values can be either Unicode or bytes-like
2570 objects.
2571 boundary:
2572 If specified as a Unicode object, it is used as the boundary. Otherwise
2573 a random boundary is generated.
2574
2575 Reference: https://tools.ietf.org/html/rfc7578
2576 '''
2577 has_specified_boundary = boundary is not None
2578
2579 while True:
2580 if boundary is None:
2581 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2582
2583 try:
10c87c15 2584 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
2585 break
2586 except ValueError:
2587 if has_specified_boundary:
2588 raise
2589 boundary = None
2590
2591 return out, content_type
2592
2593
b079c26f
SS
2594def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
2595 if blocked_types is NO_DEFAULT:
2596 blocked_types = (str, bytes, collections.abc.Mapping)
2597 return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
2598
2599
2600def variadic(x, allowed_types=NO_DEFAULT):
4823ec9f 2601 if not isinstance(allowed_types, (tuple, type)):
2602 deprecation_warning('allowed_types should be a tuple or a type')
2603 allowed_types = tuple(allowed_types)
6f2287cb 2604 return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
304ad45a 2605
2606
c4f60dd7 2607def try_call(*funcs, expected_type=None, args=[], kwargs={}):
2608 for f in funcs:
a32a9a7e 2609 try:
c4f60dd7 2610 val = f(*args, **kwargs)
ab029d7e 2611 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
a32a9a7e
S
2612 pass
2613 else:
c4f60dd7 2614 if expected_type is None or isinstance(val, expected_type):
2615 return val
2616
2617
2618def try_get(src, getter, expected_type=None):
2619 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
329ca3be
S
2620
2621
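# A sketch of try_get(): lookup errors are swallowed and the result is
# optionally type-checked:
#   >>> try_get({'a': {'b': 1}}, lambda x: x['a']['b'], int)
#   1
#   >>> try_get({'a': {}}, lambda x: x['a']['b'], int) is None
#   True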
90137ca4 2622def filter_dict(dct, cndn=lambda _, v: v is not None):
2623 return {k: v for k, v in dct.items() if cndn(k, v)}
2624
2625
6cc62232
S
2626def merge_dicts(*dicts):
2627 merged = {}
2628 for a_dict in dicts:
2629 for k, v in a_dict.items():
90137ca4 2630 if (v is not None and k not in merged
2631 or isinstance(v, str) and merged[k] == ''):
6cc62232
S
2632 merged[k] = v
2633 return merged
2634
2635
8e60dc75 2636def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
14f25df2 2637 return string if isinstance(string, str) else str(string, encoding, errors)
8e60dc75 2638
16392824 2639
a1a530b0
PH
2640US_RATINGS = {
2641 'G': 0,
2642 'PG': 10,
2643 'PG-13': 13,
2644 'R': 16,
2645 'NC': 18,
2646}
fac55558
PH
2647
2648
a8795327 2649TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
2650 'TV-Y': 0,
2651 'TV-Y7': 7,
2652 'TV-G': 0,
2653 'TV-PG': 0,
2654 'TV-14': 14,
2655 'TV-MA': 17,
a8795327
S
2656}
2657
2658
146c80e2 2659def parse_age_limit(s):
19a03940 2660 # isinstance(False, int) is True. So type() must be used instead
c487cf00 2661 if type(s) is int: # noqa: E721
a8795327 2662 return s if 0 <= s <= 21 else None
19a03940 2663 elif not isinstance(s, str):
d838b1bd 2664 return None
146c80e2 2665 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
2666 if m:
2667 return int(m.group('age'))
5c5fae6d 2668 s = s.upper()
a8795327
S
2669 if s in US_RATINGS:
2670 return US_RATINGS[s]
5a16c9d9 2671 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 2672 if m:
5a16c9d9 2673 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 2674 return None
146c80e2
S
2675
2676
fac55558 2677def strip_jsonp(code):
609a61e3 2678 return re.sub(
5552c9eb 2679 r'''(?sx)^
e9c671d5 2680 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
2681 (?:\s*&&\s*(?P=func_name))?
2682 \s*\(\s*(?P<callback_data>.*)\);?
2683 \s*?(?://[^\n]*)*$''',
2684 r'\g<callback_data>', code)
478c2c61
PH
2685
2686
8f53dc44 2687def js_to_json(code, vars={}, *, strict=False):
5c610515 2688 # vars is a dict of var, val pairs to substitute
0898c5c8 2689 STRING_QUOTES = '\'"`'
a71b812f 2690 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
c843e685 2691 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
86e5f3ed 2692 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
4195096e 2693 INTEGER_TABLE = (
86e5f3ed 2694 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
2695 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
4195096e
S
2696 )
2697
a71b812f
SS
2698 def process_escape(match):
2699 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
2700 escape = match.group(1) or match.group(2)
2701
2702 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
2703 else R'\u00' if escape == 'x'
2704 else '' if escape == '\n'
2705 else escape)
2706
0898c5c8
SS
2707 def template_substitute(match):
2708 evaluated = js_to_json(match.group(1), vars, strict=strict)
2709 if evaluated[0] == '"':
2710 return json.loads(evaluated)
2711 return evaluated
2712
e05f6939 2713 def fix_kv(m):
e7b6d122
PH
2714 v = m.group(0)
2715 if v in ('true', 'false', 'null'):
2716 return v
421ddcb8
C
2717 elif v in ('undefined', 'void 0'):
2718 return 'null'
8bdd16b4 2719 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
a71b812f
SS
2720 return ''
2721
2722 if v[0] in STRING_QUOTES:
0898c5c8
SS
2723 v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
2724 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
a71b812f
SS
2725 return f'"{escaped}"'
2726
2727 for regex, base in INTEGER_TABLE:
2728 im = re.match(regex, v)
2729 if im:
2730 i = int(im.group(1), base)
2731 return f'"{i}":' if v.endswith(':') else str(i)
2732
2733 if v in vars:
d5f043d1
C
2734 try:
2735 if not strict:
2736 json.loads(vars[v])
08e29b9f 2737 except json.JSONDecodeError:
d5f043d1
C
2738 return json.dumps(vars[v])
2739 else:
2740 return vars[v]
89ac4a19 2741
a71b812f
SS
2742 if not strict:
2743 return f'"{v}"'
5c610515 2744
a71b812f 2745 raise ValueError(f'Unknown value: {v}')
e05f6939 2746
8072ef2b 2747 def create_map(mobj):
2748 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
2749
52414d64 2750 code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code)
8072ef2b 2751 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
8f53dc44 2752 if not strict:
9d7ded64 2753 code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code)
f55523cf 2754 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
389896df 2755 code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
2756 code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
febff4c1 2757
a71b812f
SS
2758 return re.sub(rf'''(?sx)
2759 {STRING_RE}|
2760 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
421ddcb8 2761 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
a71b812f
SS
2762 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
2763 [0-9]+(?={SKIP_RE}:)|
8bdd16b4 2764 !+
a71b812f 2765 ''', fix_kv, code)
e05f6939
PH
2766
2767
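# A sketch of js_to_json(): unquoted keys, single/template quotes, `undefined`
# and trailing commas are normalised into strict JSON:
#   >>> js_to_json("{a: 1, 'b': `c`, d: undefined,}")
#   '{"a": 1, "b": "c", "d": null}'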
478c2c61
PH
2768def qualities(quality_ids):
2769 """ Get a numeric quality value out of a list of possible values """
2770 def q(qid):
2771 try:
2772 return quality_ids.index(qid)
2773 except ValueError:
2774 return -1
2775 return q
2776
acd69589 2777
119e40ef 2778POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
1e43a6f7 2779
2780
de6000d9 2781DEFAULT_OUTTMPL = {
2782 'default': '%(title)s [%(id)s].%(ext)s',
72755351 2783 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 2784}
2785OUTTMPL_TYPES = {
72755351 2786 'chapter': None,
de6000d9 2787 'subtitle': None,
2788 'thumbnail': None,
2789 'description': 'description',
2790 'annotation': 'annotations.xml',
2791 'infojson': 'info.json',
08438d2c 2792 'link': None,
3b603dbd 2793 'pl_video': None,
5112f26a 2794 'pl_thumbnail': None,
de6000d9 2795 'pl_description': 'description',
2796 'pl_infojson': 'info.json',
2797}
0a871f68 2798
143db31d 2799# As of [1] format syntax is:
2800# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
2801# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 2802STR_FORMAT_RE_TMPL = r'''(?x)
2803 (?<!%)(?P<prefix>(?:%%)*)
143db31d 2804 %
524e2e4f 2805 (?P<has_key>\((?P<key>{0})\))?
752cda38 2806 (?P<format>
524e2e4f 2807 (?P<conversion>[#0\-+ ]+)?
2808 (?P<min_width>\d+)?
2809 (?P<precision>\.\d+)?
2810 (?P<len_mod>[hlL])? # unused in python
901130bb 2811 {1} # conversion type
752cda38 2812 )
143db31d 2813'''
2814
7d1eb38a 2815
ebe1b4e3 2816STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
a020a0dc 2817
7d1eb38a 2818
a020a0dc
PH
2819def limit_length(s, length):
2820 """ Add ellipses to overly long strings """
2821 if s is None:
2822 return None
2823 ELLIPSES = '...'
2824 if len(s) > length:
2825 return s[:length - len(ELLIPSES)] + ELLIPSES
2826 return s
48844745
PH
2827
2828
2829def version_tuple(v):
5f9b8394 2830 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
2831
2832
2833def is_outdated_version(version, limit, assume_new=True):
2834 if not version:
2835 return not assume_new
2836 try:
2837 return version_tuple(version) < version_tuple(limit)
2838 except ValueError:
2839 return not assume_new
732ea2f0
PH
2840
2841
2842def ytdl_is_updateable():
7a5c1cfe 2843 """ Returns if yt-dlp can be updated with -U """
735d865e 2844
69bec673 2845 from ..update import is_non_updateable
732ea2f0 2846
5d535b4a 2847 return not is_non_updateable()
7d4111ed
PH
2848
2849
2850def args_to_str(args):
2851 # Get a short string representation for a subprocess command
702ccf2d 2852 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
2853
2854
a44ca5a4 2855def error_to_str(err):
2856 return f'{type(err).__name__}: {err}'
2857
2858
2647c933 2859def mimetype2ext(mt, default=NO_DEFAULT):
2860 if not isinstance(mt, str):
2861 if default is not NO_DEFAULT:
2862 return default
eb9ee194
S
2863 return None
2864
2647c933 2865 MAP = {
2866 # video
f6861ec9 2867 '3gpp': '3gp',
2647c933 2868 'mp2t': 'ts',
2869 'mp4': 'mp4',
2870 'mpeg': 'mpeg',
2871 'mpegurl': 'm3u8',
2872 'quicktime': 'mov',
2873 'webm': 'webm',
2874 'vp9': 'vp9',
f659e643 2875 'video/ogg': 'ogv',
f6861ec9 2876 'x-flv': 'flv',
2647c933 2877 'x-m4v': 'm4v',
2878 'x-matroska': 'mkv',
2879 'x-mng': 'mng',
a0d8d704 2880 'x-mp4-fragmented': 'mp4',
2647c933 2881 'x-ms-asf': 'asf',
a0d8d704 2882 'x-ms-wmv': 'wmv',
2647c933 2883 'x-msvideo': 'avi',
2884
2885 # application (streaming playlists)
b4173f15 2886 'dash+xml': 'mpd',
b4173f15 2887 'f4m+xml': 'f4m',
f164b971 2888 'hds+xml': 'f4m',
2647c933 2889 'vnd.apple.mpegurl': 'm3u8',
e910fe2f 2890 'vnd.ms-sstr+xml': 'ism',
2647c933 2891 'x-mpegurl': 'm3u8',
2892
2893 # audio
2894 'audio/mp4': 'm4a',
2895 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
2896 # Using .mp3 as it's the most popular one
2897 'audio/mpeg': 'mp3',
d80ca5de 2898 'audio/webm': 'webm',
2647c933 2899 'audio/x-matroska': 'mka',
2900 'audio/x-mpegurl': 'm3u',
2901 'midi': 'mid',
2902 'ogg': 'ogg',
2903 'wav': 'wav',
2904 'wave': 'wav',
2905 'x-aac': 'aac',
2906 'x-flac': 'flac',
2907 'x-m4a': 'm4a',
2908 'x-realaudio': 'ra',
39e7107d 2909 'x-wav': 'wav',
9359f3d4 2910
2647c933 2911 # image
2912 'avif': 'avif',
2913 'bmp': 'bmp',
2914 'gif': 'gif',
2915 'jpeg': 'jpg',
2916 'png': 'png',
2917 'svg+xml': 'svg',
2918 'tiff': 'tif',
2919 'vnd.wap.wbmp': 'wbmp',
2920 'webp': 'webp',
2921 'x-icon': 'ico',
2922 'x-jng': 'jng',
2923 'x-ms-bmp': 'bmp',
2924
2925 # caption
2926 'filmstrip+json': 'fs',
2927 'smptett+xml': 'tt',
2928 'ttaf+xml': 'dfxp',
2929 'ttml+xml': 'ttml',
2930 'x-ms-sami': 'sami',
9359f3d4 2931
2647c933 2932 # misc
2933 'gzip': 'gz',
9359f3d4
F
2934 'json': 'json',
2935 'xml': 'xml',
2936 'zip': 'zip',
9359f3d4
F
2937 }
2938
2647c933 2939 mimetype = mt.partition(';')[0].strip().lower()
2940 _, _, subtype = mimetype.rpartition('/')
9359f3d4 2941
69bec673 2942 ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
2647c933 2943 if ext:
2944 return ext
2945 elif default is not NO_DEFAULT:
2946 return default
9359f3d4 2947 return subtype.replace('+', '.')
c460bdd5
PH
2948
2949
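# A sketch of mimetype2ext(); parameters after ';' are ignored and unknown
# subtypes fall through unchanged:
#   >>> mimetype2ext('video/mp4; codecs="avc1.64001F"')
#   'mp4'
#   >>> mimetype2ext('application/x-mpegURL')
#   'm3u8'
#   >>> mimetype2ext('text/vtt')
#   'vtt'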
2814f12b
THD
2950def ext2mimetype(ext_or_url):
2951 if not ext_or_url:
2952 return None
2953 if '.' not in ext_or_url:
2954 ext_or_url = f'file.{ext_or_url}'
2955 return mimetypes.guess_type(ext_or_url)[0]
2956
2957
4f3c5e06 2958def parse_codecs(codecs_str):
2959 # http://tools.ietf.org/html/rfc6381
2960 if not codecs_str:
2961 return {}
a0566bbf 2962 split_codecs = list(filter(None, map(
dbf5416a 2963 str.strip, codecs_str.strip().strip(',').split(','))))
3fe75fdc 2964 vcodec, acodec, scodec, hdr = None, None, None, None
a0566bbf 2965 for full_codec in split_codecs:
d816f61f 2966 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
2967 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
2968 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
2969 if vcodec:
2970 continue
2971 vcodec = full_codec
2972 if parts[0] in ('dvh1', 'dvhe'):
2973 hdr = 'DV'
69bec673 2974 elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
d816f61f 2975 hdr = 'HDR10'
2976 elif parts[:2] == ['vp9', '2']:
2977 hdr = 'HDR10'
71082216 2978 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
d816f61f 2979 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
2980 acodec = acodec or full_codec
2981 elif parts[0] in ('stpp', 'wvtt'):
2982 scodec = scodec or full_codec
4f3c5e06 2983 else:
19a03940 2984 write_string(f'WARNING: Unknown codec {full_codec}\n')
3fe75fdc 2985 if vcodec or acodec or scodec:
4f3c5e06 2986 return {
2987 'vcodec': vcodec or 'none',
2988 'acodec': acodec or 'none',
176f1866 2989 'dynamic_range': hdr,
3fe75fdc 2990 **({'scodec': scodec} if scodec is not None else {}),
4f3c5e06 2991 }
b69fd25c 2992 elif len(split_codecs) == 2:
2993 return {
2994 'vcodec': split_codecs[0],
2995 'acodec': split_codecs[1],
2996 }
4f3c5e06 2997 return {}
2998
2999
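# A sketch of parse_codecs() on typical codec strings:
#   >>> parse_codecs('avc1.64001F, mp4a.40.2')
#   {'vcodec': 'avc1.64001F', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
#   >>> parse_codecs('dvh1.05.06')
#   {'vcodec': 'dvh1.05.06', 'acodec': 'none', 'dynamic_range': 'DV'}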
fc61aff4
LL
3000def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3001 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3002
3003 allow_mkv = not preferences or 'mkv' in preferences
3004
3005 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3006 return 'mkv' # TODO: any other format allows this?
3007
3008 # TODO: All codecs supported by parse_codecs isn't handled here
3009 COMPATIBLE_CODECS = {
3010 'mp4': {
71082216 3011 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
81b6102d 3012 'h264', 'aacl', 'ec-3', # Set in ISM
fc61aff4
LL
3013 },
3014 'webm': {
3015 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3016 'vp9x', 'vp8x', # in the webm spec
3017 },
3018 }
3019
812cdfa0 3020 sanitize_codec = functools.partial(
3021 try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
8f84770a 3022 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
fc61aff4
LL
3023
3024 for ext in preferences or COMPATIBLE_CODECS.keys():
3025 codec_set = COMPATIBLE_CODECS.get(ext, set())
3026 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3027 return ext
3028
3029 COMPATIBLE_EXTS = (
3030 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
fbb73833 3031 {'webm', 'weba'},
fc61aff4
LL
3032 )
3033 for ext in preferences or vexts:
3034 current_exts = {ext, *vexts, *aexts}
3035 if ext == 'mkv' or current_exts == {ext} or any(
3036 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3037 return ext
3038 return 'mkv' if allow_mkv else preferences[-1]
3039
3040
2647c933 3041def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
79298173 3042 getheader = url_handle.headers.get
2ccd1b10 3043
b55ee18f
PH
3044 cd = getheader('Content-Disposition')
3045 if cd:
3046 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3047 if m:
3048 e = determine_ext(m.group('filename'), default_ext=None)
3049 if e:
3050 return e
3051
2647c933 3052 meta_ext = getheader('x-amz-meta-name')
3053 if meta_ext:
3054 e = meta_ext.rpartition('.')[2]
3055 if e:
3056 return e
3057
3058 return mimetype2ext(getheader('Content-Type'), default=default)
05900629
PH
3059
3060
1e399778
YCH
3061def encode_data_uri(data, mime_type):
3062 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3063
3064
05900629 3065def age_restricted(content_limit, age_limit):
6ec6cb4e 3066 """ Returns True iff the content should be blocked """
05900629
PH
3067
3068 if age_limit is None: # No limit set
3069 return False
3070 if content_limit is None:
3071 return False # Content available for everyone
3072 return age_limit < content_limit
61ca9a80
PH
3073
3074
88f60feb 3075# List of known byte-order-marks (BOM)
a904a7f8
L
3076BOMS = [
3077 (b'\xef\xbb\xbf', 'utf-8'),
3078 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3079 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3080 (b'\xff\xfe', 'utf-16-le'),
3081 (b'\xfe\xff', 'utf-16-be'),
3082]
a904a7f8
L
3083
3084
61ca9a80
PH
3085def is_html(first_bytes):
3086 """ Detect whether a file contains HTML by examining its first bytes. """
3087
80e8493e 3088 encoding = 'utf-8'
61ca9a80 3089 for bom, enc in BOMS:
80e8493e 3090 while first_bytes.startswith(bom):
3091 encoding, first_bytes = enc, first_bytes[len(bom):]
61ca9a80 3092
80e8493e 3093 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
a055469f
PH
3094
3095
3096def determine_protocol(info_dict):
3097 protocol = info_dict.get('protocol')
3098 if protocol is not None:
3099 return protocol
3100
7de837a5 3101 url = sanitize_url(info_dict['url'])
a055469f
PH
3102 if url.startswith('rtmp'):
3103 return 'rtmp'
3104 elif url.startswith('mms'):
3105 return 'mms'
3106 elif url.startswith('rtsp'):
3107 return 'rtsp'
3108
3109 ext = determine_ext(url)
3110 if ext == 'm3u8':
deae7c17 3111 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
a055469f
PH
3112 elif ext == 'f4m':
3113 return 'f4m'
3114
14f25df2 3115 return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3116
3117
c5e3f849 3118def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3119 """ Render a list of rows, each as a list of values.
3120 Text after a \t will be right aligned """
ec11a9f4 3121 def width(string):
c5e3f849 3122 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3123
3124 def get_max_lens(table):
ec11a9f4 3125 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3126
3127 def filter_using_list(row, filterArray):
d16df59d 3128 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
76d321f6 3129
d16df59d 3130 max_lens = get_max_lens(data) if hide_empty else []
3131 header_row = filter_using_list(header_row, max_lens)
3132 data = [filter_using_list(row, max_lens) for row in data]
76d321f6 3133
cfb56d1a 3134 table = [header_row] + data
76d321f6 3135 max_lens = get_max_lens(table)
c5e3f849 3136 extra_gap += 1
76d321f6 3137 if delim:
c5e3f849 3138 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
1ed7953a 3139 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
ec11a9f4 3140 for row in table:
3141 for pos, text in enumerate(map(str, row)):
c5e3f849 3142 if '\t' in text:
3143 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3144 else:
3145 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3146 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3147 return ret
347de493
PH
3148
3149
8f18aca8 3150def _match_one(filter_part, dct, incomplete):
77b87f05 3151 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3152 STRING_OPERATORS = {
3153 '*=': operator.contains,
3154 '^=': lambda attr, value: attr.startswith(value),
3155 '$=': lambda attr, value: attr.endswith(value),
3156 '~=': lambda attr, value: re.search(value, attr),
3157 }
347de493 3158 COMPARISON_OPERATORS = {
a047eeb6 3159 **STRING_OPERATORS,
3160 '<=': operator.le, # "<=" must be defined above "<"
347de493 3161 '<': operator.lt,
347de493 3162 '>=': operator.ge,
a047eeb6 3163 '>': operator.gt,
347de493 3164 '=': operator.eq,
347de493 3165 }
a047eeb6 3166
6db9c4d5 3167 if isinstance(incomplete, bool):
3168 is_incomplete = lambda _: incomplete
3169 else:
3170 is_incomplete = lambda k: k in incomplete
3171
64fa820c 3172 operator_rex = re.compile(r'''(?x)
347de493 3173 (?P<key>[a-z_]+)
77b87f05 3174 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3175 (?:
a047eeb6 3176 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3177 (?P<strval>.+?)
347de493 3178 )
347de493 3179 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3180 m = operator_rex.fullmatch(filter_part.strip())
347de493 3181 if m:
18f96d12 3182 m = m.groupdict()
3183 unnegated_op = COMPARISON_OPERATORS[m['op']]
3184 if m['negation']:
77b87f05
MT
3185 op = lambda attr, value: not unnegated_op(attr, value)
3186 else:
3187 op = unnegated_op
18f96d12 3188 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3189 if m['quote']:
3190 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3191 actual_value = dct.get(m['key'])
3192 numeric_comparison = None
f9934b96 3193 if isinstance(actual_value, (int, float)):
e5a088dc
S
3194 # If the original field is a string and matching comparisonvalue is
3195 # a number we should respect the origin of the original field
3196 # and process comparison value as a string (see
18f96d12 3197 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3198 try:
18f96d12 3199 numeric_comparison = int(comparison_value)
347de493 3200 except ValueError:
18f96d12 3201 numeric_comparison = parse_filesize(comparison_value)
3202 if numeric_comparison is None:
3203 numeric_comparison = parse_filesize(f'{comparison_value}B')
3204 if numeric_comparison is None:
3205 numeric_comparison = parse_duration(comparison_value)
3206 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3207 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3208 if actual_value is None:
6db9c4d5 3209 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3210 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3211
3212 UNARY_OPERATORS = {
1cc47c66
S
3213 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3214 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3215 }
64fa820c 3216 operator_rex = re.compile(r'''(?x)
347de493 3217 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3218 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3219 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3220 if m:
3221 op = UNARY_OPERATORS[m.group('op')]
3222 actual_value = dct.get(m.group('key'))
6db9c4d5 3223 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3224 return True
347de493
PH
3225 return op(actual_value)
3226
3227 raise ValueError('Invalid filter part %r' % filter_part)
3228
3229
8f18aca8 3230def match_str(filter_str, dct, incomplete=False):
6db9c4d5 3231 """ Filter a dictionary with a simple string syntax.
3232 @returns Whether the filter passes
3233 @param incomplete Set of keys that are expected to be missing from dct.
3234 Can be True/False to indicate all/none of the keys may be missing.
3235 All conditions on incomplete keys pass if the key is missing
8f18aca8 3236 """
347de493 3237 return all(
8f18aca8 3238 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3239 for filter_part in re.split(r'(?<!\\)&', filter_str))
347de493
PH
3240
3241
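# Illustration of the filter syntax implemented by _match_one/match_str above
# (doctest-style sketch; the dicts are made-up info_dict excerpts):
#
#   >>> match_str('duration>60 & title~=cats', {'duration': 90, 'title': 'three cats'})
#   True
#   >>> match_str('!is_live & like_count>?100', {'is_live': False, 'like_count': None})
#   True
#   >>> match_str('duration>60', {'duration': 30})
#   False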
fe2ce85a 3242def match_filter_func(filters, breaking_filters=None):
3243 if not filters and not breaking_filters:
d1b5f70b 3244 return None
45491a2a 3245 repr_ = f'{match_filter_func.__module__}.{match_filter_func.__qualname__}({filters}, {breaking_filters})'
3246
fe2ce85a 3247 breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
3248 filters = set(variadic(filters or []))
d1b5f70b 3249
492272fe 3250 interactive = '-' in filters
3251 if interactive:
3252 filters.remove('-')
3253
45491a2a 3254 @function_with_repr.set_repr(repr_)
492272fe 3255 def _match_func(info_dict, incomplete=False):
fe2ce85a 3256 ret = breaking_filters(info_dict, incomplete)
3257 if ret is not None:
3258 raise RejectedVideoReached(ret)
3259
492272fe 3260 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3261 return NO_DEFAULT if interactive and not incomplete else None
347de493 3262 else:
3bec830a 3263 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
b1a7cd05 3264 filter_str = ') | ('.join(map(str.strip, filters))
3265 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
347de493 3266 return _match_func
91410c9b
PH
3267
3268
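# Sketch of the callable returned by match_filter_func (hypothetical info_dicts;
# the returned value is what yt-dlp uses as the --match-filters predicate):
#
#   >>> accept = match_filter_func('duration>60')
#   >>> accept({'id': '1', 'title': 'long clip', 'duration': 90})    # passes -> None
#   >>> accept({'id': '2', 'title': 'short clip', 'duration': 30})
#   'short clip does not pass filter (duration>60), skipping ..'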
f2df4071 3269class download_range_func:
b4e0d758 3270 def __init__(self, chapters, ranges, from_info=False):
3271 self.chapters, self.ranges, self.from_info = chapters, ranges, from_info
f2df4071 3272
3273 def __call__(self, info_dict, ydl):
0500ee3d 3274
5ec1b6b7 3275 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
56ba69e4 3276 else 'Cannot match chapters since chapter information is unavailable')
f2df4071 3277 for regex in self.chapters or []:
5ec1b6b7 3278 for i, chapter in enumerate(info_dict.get('chapters') or []):
3279 if re.search(regex, chapter['title']):
3280 warning = None
3281 yield {**chapter, 'index': i}
f2df4071 3282 if self.chapters and warning:
5ec1b6b7 3283 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3284
b4e0d758 3285 for start, end in self.ranges or []:
3286 yield {
3287 'start_time': self._handle_negative_timestamp(start, info_dict),
3288 'end_time': self._handle_negative_timestamp(end, info_dict),
3289 }
3290
3291 if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
3292 yield {
e59e2074 3293 'start_time': info_dict.get('start_time') or 0,
3294 'end_time': info_dict.get('end_time') or float('inf'),
b4e0d758 3295 }
e59e2074 3296 elif not self.ranges and not self.chapters:
3297 yield {}
b4e0d758 3298
3299 @staticmethod
3300 def _handle_negative_timestamp(time, info):
3301 return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time
5ec1b6b7 3302
f2df4071 3303 def __eq__(self, other):
3304 return (isinstance(other, download_range_func)
3305 and self.chapters == other.chapters and self.ranges == other.ranges)
5ec1b6b7 3306
71df9b7f 3307 def __repr__(self):
a5387729 3308 return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
71df9b7f 3309
5ec1b6b7 3310
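# Example of the section selector above (made-up values; negative timestamps
# are resolved against the duration by _handle_negative_timestamp):
#
#   >>> ranges = download_range_func(chapters=None, ranges=[(60, 120), (-30, float('inf'))])
#   >>> list(ranges({'id': 'x', 'duration': 300}, ydl=None))
#   [{'start_time': 60, 'end_time': 120}, {'start_time': 270, 'end_time': inf}]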
bf6427d2
YCH
3311def parse_dfxp_time_expr(time_expr):
3312 if not time_expr:
d631d5f9 3313 return
bf6427d2 3314
1d485a1a 3315 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
bf6427d2
YCH
3316 if mobj:
3317 return float(mobj.group('time_offset'))
3318
db2fe38b 3319 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3320 if mobj:
db2fe38b 3321 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
3322
3323
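# A few DFXP/TTML clock values and the offsets (in seconds) produced above
# (NUMBER_RE is defined earlier in this module and matches plain decimals):
#
#   >>> parse_dfxp_time_expr('5.2s')
#   5.2
#   >>> parse_dfxp_time_expr('00:01:02.500')
#   62.5
#   >>> parse_dfxp_time_expr('')     # empty/missing values fall through to None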
c1c924ab 3324def srt_subtitles_timecode(seconds):
aa7785f8 3325 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3326
3327
3328def ass_subtitles_timecode(seconds):
3329 time = timetuple_from_msec(seconds * 1000)
3330 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
bf6427d2
YCH
3331
3332
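# The same offset rendered by the two timecode helpers above (assuming
# timetuple_from_msec, defined earlier in this module, splits into h/m/s/ms):
#
#   >>> srt_subtitles_timecode(62.5)
#   '00:01:02,500'
#   >>> ass_subtitles_timecode(62.5)
#   '0:01:02.50'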
3333def dfxp2srt(dfxp_data):
3869028f
YCH
3334 '''
3335 @param dfxp_data A bytes-like object containing DFXP data
3336 @returns A unicode object containing converted SRT data
3337 '''
5b995f71 3338 LEGACY_NAMESPACES = (
3869028f
YCH
3339 (b'http://www.w3.org/ns/ttml', [
3340 b'http://www.w3.org/2004/11/ttaf1',
3341 b'http://www.w3.org/2006/04/ttaf1',
3342 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3343 ]),
3869028f
YCH
3344 (b'http://www.w3.org/ns/ttml#styling', [
3345 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
3346 ]),
3347 )
3348
3349 SUPPORTED_STYLING = [
3350 'color',
3351 'fontFamily',
3352 'fontSize',
3353 'fontStyle',
3354 'fontWeight',
3355 'textDecoration'
3356 ]
3357
4e335771 3358 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 3359 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 3360 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 3361 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 3362 })
bf6427d2 3363
5b995f71
RA
3364 styles = {}
3365 default_style = {}
3366
86e5f3ed 3367 class TTMLPElementParser:
5b995f71
RA
3368 _out = ''
3369 _unclosed_elements = []
3370 _applied_styles = []
bf6427d2 3371
2b14cb56 3372 def start(self, tag, attrib):
5b995f71
RA
3373 if tag in (_x('ttml:br'), 'br'):
3374 self._out += '\n'
3375 else:
3376 unclosed_elements = []
3377 style = {}
3378 element_style_id = attrib.get('style')
3379 if default_style:
3380 style.update(default_style)
3381 if element_style_id:
3382 style.update(styles.get(element_style_id, {}))
3383 for prop in SUPPORTED_STYLING:
3384 prop_val = attrib.get(_x('tts:' + prop))
3385 if prop_val:
3386 style[prop] = prop_val
3387 if style:
3388 font = ''
3389 for k, v in sorted(style.items()):
3390 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3391 continue
3392 if k == 'color':
3393 font += ' color="%s"' % v
3394 elif k == 'fontSize':
3395 font += ' size="%s"' % v
3396 elif k == 'fontFamily':
3397 font += ' face="%s"' % v
3398 elif k == 'fontWeight' and v == 'bold':
3399 self._out += '<b>'
3400 unclosed_elements.append('b')
3401 elif k == 'fontStyle' and v == 'italic':
3402 self._out += '<i>'
3403 unclosed_elements.append('i')
3404 elif k == 'textDecoration' and v == 'underline':
3405 self._out += '<u>'
3406 unclosed_elements.append('u')
3407 if font:
3408 self._out += '<font' + font + '>'
3409 unclosed_elements.append('font')
3410 applied_style = {}
3411 if self._applied_styles:
3412 applied_style.update(self._applied_styles[-1])
3413 applied_style.update(style)
3414 self._applied_styles.append(applied_style)
3415 self._unclosed_elements.append(unclosed_elements)
bf6427d2 3416
2b14cb56 3417 def end(self, tag):
5b995f71
RA
3418 if tag not in (_x('ttml:br'), 'br'):
3419 unclosed_elements = self._unclosed_elements.pop()
3420 for element in reversed(unclosed_elements):
3421 self._out += '</%s>' % element
3422 if unclosed_elements and self._applied_styles:
3423 self._applied_styles.pop()
bf6427d2 3424
2b14cb56 3425 def data(self, data):
5b995f71 3426 self._out += data
2b14cb56 3427
3428 def close(self):
5b995f71 3429 return self._out.strip()
2b14cb56 3430
6a765f13 3431 # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
3432 # This will not trigger false positives since only UTF-8 text is being replaced
3433 dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
3434
2b14cb56 3435 def parse_node(node):
3436 target = TTMLPElementParser()
3437 parser = xml.etree.ElementTree.XMLParser(target=target)
3438 parser.feed(xml.etree.ElementTree.tostring(node))
3439 return parser.close()
bf6427d2 3440
5b995f71
RA
3441 for k, v in LEGACY_NAMESPACES:
3442 for ns in v:
3443 dfxp_data = dfxp_data.replace(ns, k)
3444
3869028f 3445 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 3446 out = []
5b995f71 3447 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
3448
3449 if not paras:
3450 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 3451
5b995f71
RA
3452 repeat = False
3453 while True:
3454 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
3455 style_id = style.get('id') or style.get(_x('xml:id'))
3456 if not style_id:
3457 continue
5b995f71
RA
3458 parent_style_id = style.get('style')
3459 if parent_style_id:
3460 if parent_style_id not in styles:
3461 repeat = True
3462 continue
3463 styles[style_id] = styles[parent_style_id].copy()
3464 for prop in SUPPORTED_STYLING:
3465 prop_val = style.get(_x('tts:' + prop))
3466 if prop_val:
3467 styles.setdefault(style_id, {})[prop] = prop_val
3468 if repeat:
3469 repeat = False
3470 else:
3471 break
3472
3473 for p in ('body', 'div'):
3474 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3475 if ele is None:
3476 continue
3477 style = styles.get(ele.get('style'))
3478 if not style:
3479 continue
3480 default_style.update(style)
3481
bf6427d2 3482 for para, index in zip(paras, itertools.count(1)):
d631d5f9 3483 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 3484 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
3485 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3486 if begin_time is None:
3487 continue
7dff0363 3488 if not end_time:
d631d5f9
YCH
3489 if not dur:
3490 continue
3491 end_time = begin_time + dur
bf6427d2
YCH
3492 out.append('%d\n%s --> %s\n%s\n\n' % (
3493 index,
c1c924ab
YCH
3494 srt_subtitles_timecode(begin_time),
3495 srt_subtitles_timecode(end_time),
bf6427d2
YCH
3496 parse_node(para)))
3497
3498 return ''.join(out)
3499
3500
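# Minimal round trip through dfxp2srt with a tiny hand-written TTML document
# (not taken from any real site; it only shows the expected SRT framing):
#
#   >>> dfxp2srt(b'<tt xmlns="http://www.w3.org/ns/ttml"><body><div>'
#   ...          b'<p begin="0.0" end="1.5">Hello world</p></div></body></tt>')
#   '1\n00:00:00,000 --> 00:00:01,500\nHello world\n\n'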
c487cf00 3501def cli_option(params, command_option, param, separator=None):
66e289ba 3502 param = params.get(param)
c487cf00 3503 return ([] if param is None
3504 else [command_option, str(param)] if separator is None
3505 else [f'{command_option}{separator}{param}'])
66e289ba
S
3506
3507
3508def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3509 param = params.get(param)
c487cf00 3510 assert param in (True, False, None)
3511 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
66e289ba
S
3512
3513
3514def cli_valueless_option(params, command_option, param, expected_value=True):
c487cf00 3515 return [command_option] if params.get(param) == expected_value else []
66e289ba
S
3516
3517
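# How the cli_* helpers above translate a params dict into argv fragments
# (the option names and params keys here are only illustrative):
#
#   >>> cli_option({'proxy': 'http://127.0.0.1:3128'}, '--proxy', 'proxy')
#   ['--proxy', 'http://127.0.0.1:3128']
#   >>> cli_option({'proxy': 'http://127.0.0.1:3128'}, '--proxy', 'proxy', separator='=')
#   ['--proxy=http://127.0.0.1:3128']
#   >>> cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#   ['--no-check-certificate', 'true']
#   >>> cli_valueless_option({'quiet': True}, '--quiet', 'quiet')
#   ['--quiet']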
e92caff5 3518def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 3519 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 3520 if use_compat:
5b1ecbb3 3521 return argdict
3522 else:
3523 argdict = None
eab9b2bc 3524 if argdict is None:
5b1ecbb3 3525 return default
eab9b2bc 3526 assert isinstance(argdict, dict)
3527
e92caff5 3528 assert isinstance(keys, (list, tuple))
3529 for key_list in keys:
e92caff5 3530 arg_list = list(filter(
3531 lambda x: x is not None,
6606817a 3532 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 3533 if arg_list:
3534 return [arg for args in arg_list for arg in args]
3535 return default
66e289ba 3536
6251555f 3537
330690a2 3538def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3539 main_key, exe = main_key.lower(), exe.lower()
3540 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3541 keys = [f'{root_key}{k}' for k in (keys or [''])]
3542 if root_key in keys:
3543 if main_key != exe:
3544 keys.append((main_key, exe))
3545 keys.append('default')
3546 else:
3547 use_compat = False
3548 return cli_configuration_args(argdict, keys, default, use_compat)
3549
66e289ba 3550
86e5f3ed 3551class ISO639Utils:
39672624
YCH
3552 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3553 _lang_map = {
3554 'aa': 'aar',
3555 'ab': 'abk',
3556 'ae': 'ave',
3557 'af': 'afr',
3558 'ak': 'aka',
3559 'am': 'amh',
3560 'an': 'arg',
3561 'ar': 'ara',
3562 'as': 'asm',
3563 'av': 'ava',
3564 'ay': 'aym',
3565 'az': 'aze',
3566 'ba': 'bak',
3567 'be': 'bel',
3568 'bg': 'bul',
3569 'bh': 'bih',
3570 'bi': 'bis',
3571 'bm': 'bam',
3572 'bn': 'ben',
3573 'bo': 'bod',
3574 'br': 'bre',
3575 'bs': 'bos',
3576 'ca': 'cat',
3577 'ce': 'che',
3578 'ch': 'cha',
3579 'co': 'cos',
3580 'cr': 'cre',
3581 'cs': 'ces',
3582 'cu': 'chu',
3583 'cv': 'chv',
3584 'cy': 'cym',
3585 'da': 'dan',
3586 'de': 'deu',
3587 'dv': 'div',
3588 'dz': 'dzo',
3589 'ee': 'ewe',
3590 'el': 'ell',
3591 'en': 'eng',
3592 'eo': 'epo',
3593 'es': 'spa',
3594 'et': 'est',
3595 'eu': 'eus',
3596 'fa': 'fas',
3597 'ff': 'ful',
3598 'fi': 'fin',
3599 'fj': 'fij',
3600 'fo': 'fao',
3601 'fr': 'fra',
3602 'fy': 'fry',
3603 'ga': 'gle',
3604 'gd': 'gla',
3605 'gl': 'glg',
3606 'gn': 'grn',
3607 'gu': 'guj',
3608 'gv': 'glv',
3609 'ha': 'hau',
3610 'he': 'heb',
b7acc835 3611 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
3612 'hi': 'hin',
3613 'ho': 'hmo',
3614 'hr': 'hrv',
3615 'ht': 'hat',
3616 'hu': 'hun',
3617 'hy': 'hye',
3618 'hz': 'her',
3619 'ia': 'ina',
3620 'id': 'ind',
b7acc835 3621 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
3622 'ie': 'ile',
3623 'ig': 'ibo',
3624 'ii': 'iii',
3625 'ik': 'ipk',
3626 'io': 'ido',
3627 'is': 'isl',
3628 'it': 'ita',
3629 'iu': 'iku',
3630 'ja': 'jpn',
3631 'jv': 'jav',
3632 'ka': 'kat',
3633 'kg': 'kon',
3634 'ki': 'kik',
3635 'kj': 'kua',
3636 'kk': 'kaz',
3637 'kl': 'kal',
3638 'km': 'khm',
3639 'kn': 'kan',
3640 'ko': 'kor',
3641 'kr': 'kau',
3642 'ks': 'kas',
3643 'ku': 'kur',
3644 'kv': 'kom',
3645 'kw': 'cor',
3646 'ky': 'kir',
3647 'la': 'lat',
3648 'lb': 'ltz',
3649 'lg': 'lug',
3650 'li': 'lim',
3651 'ln': 'lin',
3652 'lo': 'lao',
3653 'lt': 'lit',
3654 'lu': 'lub',
3655 'lv': 'lav',
3656 'mg': 'mlg',
3657 'mh': 'mah',
3658 'mi': 'mri',
3659 'mk': 'mkd',
3660 'ml': 'mal',
3661 'mn': 'mon',
3662 'mr': 'mar',
3663 'ms': 'msa',
3664 'mt': 'mlt',
3665 'my': 'mya',
3666 'na': 'nau',
3667 'nb': 'nob',
3668 'nd': 'nde',
3669 'ne': 'nep',
3670 'ng': 'ndo',
3671 'nl': 'nld',
3672 'nn': 'nno',
3673 'no': 'nor',
3674 'nr': 'nbl',
3675 'nv': 'nav',
3676 'ny': 'nya',
3677 'oc': 'oci',
3678 'oj': 'oji',
3679 'om': 'orm',
3680 'or': 'ori',
3681 'os': 'oss',
3682 'pa': 'pan',
7bcd4813 3683 'pe': 'per',
39672624
YCH
3684 'pi': 'pli',
3685 'pl': 'pol',
3686 'ps': 'pus',
3687 'pt': 'por',
3688 'qu': 'que',
3689 'rm': 'roh',
3690 'rn': 'run',
3691 'ro': 'ron',
3692 'ru': 'rus',
3693 'rw': 'kin',
3694 'sa': 'san',
3695 'sc': 'srd',
3696 'sd': 'snd',
3697 'se': 'sme',
3698 'sg': 'sag',
3699 'si': 'sin',
3700 'sk': 'slk',
3701 'sl': 'slv',
3702 'sm': 'smo',
3703 'sn': 'sna',
3704 'so': 'som',
3705 'sq': 'sqi',
3706 'sr': 'srp',
3707 'ss': 'ssw',
3708 'st': 'sot',
3709 'su': 'sun',
3710 'sv': 'swe',
3711 'sw': 'swa',
3712 'ta': 'tam',
3713 'te': 'tel',
3714 'tg': 'tgk',
3715 'th': 'tha',
3716 'ti': 'tir',
3717 'tk': 'tuk',
3718 'tl': 'tgl',
3719 'tn': 'tsn',
3720 'to': 'ton',
3721 'tr': 'tur',
3722 'ts': 'tso',
3723 'tt': 'tat',
3724 'tw': 'twi',
3725 'ty': 'tah',
3726 'ug': 'uig',
3727 'uk': 'ukr',
3728 'ur': 'urd',
3729 'uz': 'uzb',
3730 've': 'ven',
3731 'vi': 'vie',
3732 'vo': 'vol',
3733 'wa': 'wln',
3734 'wo': 'wol',
3735 'xh': 'xho',
3736 'yi': 'yid',
e9a50fba 3737 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
3738 'yo': 'yor',
3739 'za': 'zha',
3740 'zh': 'zho',
3741 'zu': 'zul',
3742 }
3743
3744 @classmethod
3745 def short2long(cls, code):
3746 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3747 return cls._lang_map.get(code[:2])
3748
3749 @classmethod
3750 def long2short(cls, code):
3751 """Convert language code from ISO 639-2/T to ISO 639-1"""
3752 for short_name, long_name in cls._lang_map.items():
3753 if long_name == code:
3754 return short_name
3755
3756
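# Example conversions using the table above:
#
#   >>> ISO639Utils.short2long('fr')
#   'fra'
#   >>> ISO639Utils.short2long('fr-FR')    # only the first two characters are used
#   'fra'
#   >>> ISO639Utils.long2short('deu')
#   'de'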
86e5f3ed 3757class ISO3166Utils:
4eb10f66
YCH
3758 # From http://data.okfn.org/data/core/country-list
3759 _country_map = {
3760 'AF': 'Afghanistan',
3761 'AX': 'Åland Islands',
3762 'AL': 'Albania',
3763 'DZ': 'Algeria',
3764 'AS': 'American Samoa',
3765 'AD': 'Andorra',
3766 'AO': 'Angola',
3767 'AI': 'Anguilla',
3768 'AQ': 'Antarctica',
3769 'AG': 'Antigua and Barbuda',
3770 'AR': 'Argentina',
3771 'AM': 'Armenia',
3772 'AW': 'Aruba',
3773 'AU': 'Australia',
3774 'AT': 'Austria',
3775 'AZ': 'Azerbaijan',
3776 'BS': 'Bahamas',
3777 'BH': 'Bahrain',
3778 'BD': 'Bangladesh',
3779 'BB': 'Barbados',
3780 'BY': 'Belarus',
3781 'BE': 'Belgium',
3782 'BZ': 'Belize',
3783 'BJ': 'Benin',
3784 'BM': 'Bermuda',
3785 'BT': 'Bhutan',
3786 'BO': 'Bolivia, Plurinational State of',
3787 'BQ': 'Bonaire, Sint Eustatius and Saba',
3788 'BA': 'Bosnia and Herzegovina',
3789 'BW': 'Botswana',
3790 'BV': 'Bouvet Island',
3791 'BR': 'Brazil',
3792 'IO': 'British Indian Ocean Territory',
3793 'BN': 'Brunei Darussalam',
3794 'BG': 'Bulgaria',
3795 'BF': 'Burkina Faso',
3796 'BI': 'Burundi',
3797 'KH': 'Cambodia',
3798 'CM': 'Cameroon',
3799 'CA': 'Canada',
3800 'CV': 'Cape Verde',
3801 'KY': 'Cayman Islands',
3802 'CF': 'Central African Republic',
3803 'TD': 'Chad',
3804 'CL': 'Chile',
3805 'CN': 'China',
3806 'CX': 'Christmas Island',
3807 'CC': 'Cocos (Keeling) Islands',
3808 'CO': 'Colombia',
3809 'KM': 'Comoros',
3810 'CG': 'Congo',
3811 'CD': 'Congo, the Democratic Republic of the',
3812 'CK': 'Cook Islands',
3813 'CR': 'Costa Rica',
3814 'CI': 'Côte d\'Ivoire',
3815 'HR': 'Croatia',
3816 'CU': 'Cuba',
3817 'CW': 'Curaçao',
3818 'CY': 'Cyprus',
3819 'CZ': 'Czech Republic',
3820 'DK': 'Denmark',
3821 'DJ': 'Djibouti',
3822 'DM': 'Dominica',
3823 'DO': 'Dominican Republic',
3824 'EC': 'Ecuador',
3825 'EG': 'Egypt',
3826 'SV': 'El Salvador',
3827 'GQ': 'Equatorial Guinea',
3828 'ER': 'Eritrea',
3829 'EE': 'Estonia',
3830 'ET': 'Ethiopia',
3831 'FK': 'Falkland Islands (Malvinas)',
3832 'FO': 'Faroe Islands',
3833 'FJ': 'Fiji',
3834 'FI': 'Finland',
3835 'FR': 'France',
3836 'GF': 'French Guiana',
3837 'PF': 'French Polynesia',
3838 'TF': 'French Southern Territories',
3839 'GA': 'Gabon',
3840 'GM': 'Gambia',
3841 'GE': 'Georgia',
3842 'DE': 'Germany',
3843 'GH': 'Ghana',
3844 'GI': 'Gibraltar',
3845 'GR': 'Greece',
3846 'GL': 'Greenland',
3847 'GD': 'Grenada',
3848 'GP': 'Guadeloupe',
3849 'GU': 'Guam',
3850 'GT': 'Guatemala',
3851 'GG': 'Guernsey',
3852 'GN': 'Guinea',
3853 'GW': 'Guinea-Bissau',
3854 'GY': 'Guyana',
3855 'HT': 'Haiti',
3856 'HM': 'Heard Island and McDonald Islands',
3857 'VA': 'Holy See (Vatican City State)',
3858 'HN': 'Honduras',
3859 'HK': 'Hong Kong',
3860 'HU': 'Hungary',
3861 'IS': 'Iceland',
3862 'IN': 'India',
3863 'ID': 'Indonesia',
3864 'IR': 'Iran, Islamic Republic of',
3865 'IQ': 'Iraq',
3866 'IE': 'Ireland',
3867 'IM': 'Isle of Man',
3868 'IL': 'Israel',
3869 'IT': 'Italy',
3870 'JM': 'Jamaica',
3871 'JP': 'Japan',
3872 'JE': 'Jersey',
3873 'JO': 'Jordan',
3874 'KZ': 'Kazakhstan',
3875 'KE': 'Kenya',
3876 'KI': 'Kiribati',
3877 'KP': 'Korea, Democratic People\'s Republic of',
3878 'KR': 'Korea, Republic of',
3879 'KW': 'Kuwait',
3880 'KG': 'Kyrgyzstan',
3881 'LA': 'Lao People\'s Democratic Republic',
3882 'LV': 'Latvia',
3883 'LB': 'Lebanon',
3884 'LS': 'Lesotho',
3885 'LR': 'Liberia',
3886 'LY': 'Libya',
3887 'LI': 'Liechtenstein',
3888 'LT': 'Lithuania',
3889 'LU': 'Luxembourg',
3890 'MO': 'Macao',
3891 'MK': 'Macedonia, the Former Yugoslav Republic of',
3892 'MG': 'Madagascar',
3893 'MW': 'Malawi',
3894 'MY': 'Malaysia',
3895 'MV': 'Maldives',
3896 'ML': 'Mali',
3897 'MT': 'Malta',
3898 'MH': 'Marshall Islands',
3899 'MQ': 'Martinique',
3900 'MR': 'Mauritania',
3901 'MU': 'Mauritius',
3902 'YT': 'Mayotte',
3903 'MX': 'Mexico',
3904 'FM': 'Micronesia, Federated States of',
3905 'MD': 'Moldova, Republic of',
3906 'MC': 'Monaco',
3907 'MN': 'Mongolia',
3908 'ME': 'Montenegro',
3909 'MS': 'Montserrat',
3910 'MA': 'Morocco',
3911 'MZ': 'Mozambique',
3912 'MM': 'Myanmar',
3913 'NA': 'Namibia',
3914 'NR': 'Nauru',
3915 'NP': 'Nepal',
3916 'NL': 'Netherlands',
3917 'NC': 'New Caledonia',
3918 'NZ': 'New Zealand',
3919 'NI': 'Nicaragua',
3920 'NE': 'Niger',
3921 'NG': 'Nigeria',
3922 'NU': 'Niue',
3923 'NF': 'Norfolk Island',
3924 'MP': 'Northern Mariana Islands',
3925 'NO': 'Norway',
3926 'OM': 'Oman',
3927 'PK': 'Pakistan',
3928 'PW': 'Palau',
3929 'PS': 'Palestine, State of',
3930 'PA': 'Panama',
3931 'PG': 'Papua New Guinea',
3932 'PY': 'Paraguay',
3933 'PE': 'Peru',
3934 'PH': 'Philippines',
3935 'PN': 'Pitcairn',
3936 'PL': 'Poland',
3937 'PT': 'Portugal',
3938 'PR': 'Puerto Rico',
3939 'QA': 'Qatar',
3940 'RE': 'Réunion',
3941 'RO': 'Romania',
3942 'RU': 'Russian Federation',
3943 'RW': 'Rwanda',
3944 'BL': 'Saint Barthélemy',
3945 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3946 'KN': 'Saint Kitts and Nevis',
3947 'LC': 'Saint Lucia',
3948 'MF': 'Saint Martin (French part)',
3949 'PM': 'Saint Pierre and Miquelon',
3950 'VC': 'Saint Vincent and the Grenadines',
3951 'WS': 'Samoa',
3952 'SM': 'San Marino',
3953 'ST': 'Sao Tome and Principe',
3954 'SA': 'Saudi Arabia',
3955 'SN': 'Senegal',
3956 'RS': 'Serbia',
3957 'SC': 'Seychelles',
3958 'SL': 'Sierra Leone',
3959 'SG': 'Singapore',
3960 'SX': 'Sint Maarten (Dutch part)',
3961 'SK': 'Slovakia',
3962 'SI': 'Slovenia',
3963 'SB': 'Solomon Islands',
3964 'SO': 'Somalia',
3965 'ZA': 'South Africa',
3966 'GS': 'South Georgia and the South Sandwich Islands',
3967 'SS': 'South Sudan',
3968 'ES': 'Spain',
3969 'LK': 'Sri Lanka',
3970 'SD': 'Sudan',
3971 'SR': 'Suriname',
3972 'SJ': 'Svalbard and Jan Mayen',
3973 'SZ': 'Swaziland',
3974 'SE': 'Sweden',
3975 'CH': 'Switzerland',
3976 'SY': 'Syrian Arab Republic',
3977 'TW': 'Taiwan, Province of China',
3978 'TJ': 'Tajikistan',
3979 'TZ': 'Tanzania, United Republic of',
3980 'TH': 'Thailand',
3981 'TL': 'Timor-Leste',
3982 'TG': 'Togo',
3983 'TK': 'Tokelau',
3984 'TO': 'Tonga',
3985 'TT': 'Trinidad and Tobago',
3986 'TN': 'Tunisia',
3987 'TR': 'Turkey',
3988 'TM': 'Turkmenistan',
3989 'TC': 'Turks and Caicos Islands',
3990 'TV': 'Tuvalu',
3991 'UG': 'Uganda',
3992 'UA': 'Ukraine',
3993 'AE': 'United Arab Emirates',
3994 'GB': 'United Kingdom',
3995 'US': 'United States',
3996 'UM': 'United States Minor Outlying Islands',
3997 'UY': 'Uruguay',
3998 'UZ': 'Uzbekistan',
3999 'VU': 'Vanuatu',
4000 'VE': 'Venezuela, Bolivarian Republic of',
4001 'VN': 'Viet Nam',
4002 'VG': 'Virgin Islands, British',
4003 'VI': 'Virgin Islands, U.S.',
4004 'WF': 'Wallis and Futuna',
4005 'EH': 'Western Sahara',
4006 'YE': 'Yemen',
4007 'ZM': 'Zambia',
4008 'ZW': 'Zimbabwe',
2f97cc61 4009 # Not ISO 3166 codes, but used for IP blocks
4010 'AP': 'Asia/Pacific Region',
4011 'EU': 'Europe',
4eb10f66
YCH
4012 }
4013
4014 @classmethod
4015 def short2full(cls, code):
4016 """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name"""
4017 return cls._country_map.get(code.upper())
4018
4019
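# Example lookups (unknown codes map to None):
#
#   >>> ISO3166Utils.short2full('de')
#   'Germany'
#   >>> ISO3166Utils.short2full('XX')      # -> None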
86e5f3ed 4020class GeoUtils:
773f291d
S
4021 # Major IPv4 address blocks per country
4022 _country_ip_map = {
53896ca5 4023 'AD': '46.172.224.0/19',
773f291d
S
4024 'AE': '94.200.0.0/13',
4025 'AF': '149.54.0.0/17',
4026 'AG': '209.59.64.0/18',
4027 'AI': '204.14.248.0/21',
4028 'AL': '46.99.0.0/16',
4029 'AM': '46.70.0.0/15',
4030 'AO': '105.168.0.0/13',
53896ca5
S
4031 'AP': '182.50.184.0/21',
4032 'AQ': '23.154.160.0/24',
773f291d
S
4033 'AR': '181.0.0.0/12',
4034 'AS': '202.70.112.0/20',
53896ca5 4035 'AT': '77.116.0.0/14',
773f291d
S
4036 'AU': '1.128.0.0/11',
4037 'AW': '181.41.0.0/18',
53896ca5
S
4038 'AX': '185.217.4.0/22',
4039 'AZ': '5.197.0.0/16',
773f291d
S
4040 'BA': '31.176.128.0/17',
4041 'BB': '65.48.128.0/17',
4042 'BD': '114.130.0.0/16',
4043 'BE': '57.0.0.0/8',
53896ca5 4044 'BF': '102.178.0.0/15',
773f291d
S
4045 'BG': '95.42.0.0/15',
4046 'BH': '37.131.0.0/17',
4047 'BI': '154.117.192.0/18',
4048 'BJ': '137.255.0.0/16',
53896ca5 4049 'BL': '185.212.72.0/23',
773f291d
S
4050 'BM': '196.12.64.0/18',
4051 'BN': '156.31.0.0/16',
4052 'BO': '161.56.0.0/16',
4053 'BQ': '161.0.80.0/20',
53896ca5 4054 'BR': '191.128.0.0/12',
773f291d
S
4055 'BS': '24.51.64.0/18',
4056 'BT': '119.2.96.0/19',
4057 'BW': '168.167.0.0/16',
4058 'BY': '178.120.0.0/13',
4059 'BZ': '179.42.192.0/18',
4060 'CA': '99.224.0.0/11',
4061 'CD': '41.243.0.0/16',
53896ca5
S
4062 'CF': '197.242.176.0/21',
4063 'CG': '160.113.0.0/16',
773f291d 4064 'CH': '85.0.0.0/13',
53896ca5 4065 'CI': '102.136.0.0/14',
773f291d
S
4066 'CK': '202.65.32.0/19',
4067 'CL': '152.172.0.0/14',
53896ca5 4068 'CM': '102.244.0.0/14',
773f291d
S
4069 'CN': '36.128.0.0/10',
4070 'CO': '181.240.0.0/12',
4071 'CR': '201.192.0.0/12',
4072 'CU': '152.206.0.0/15',
4073 'CV': '165.90.96.0/19',
4074 'CW': '190.88.128.0/17',
53896ca5 4075 'CY': '31.153.0.0/16',
773f291d
S
4076 'CZ': '88.100.0.0/14',
4077 'DE': '53.0.0.0/8',
4078 'DJ': '197.241.0.0/17',
4079 'DK': '87.48.0.0/12',
4080 'DM': '192.243.48.0/20',
4081 'DO': '152.166.0.0/15',
4082 'DZ': '41.96.0.0/12',
4083 'EC': '186.68.0.0/15',
4084 'EE': '90.190.0.0/15',
4085 'EG': '156.160.0.0/11',
4086 'ER': '196.200.96.0/20',
4087 'ES': '88.0.0.0/11',
4088 'ET': '196.188.0.0/14',
4089 'EU': '2.16.0.0/13',
4090 'FI': '91.152.0.0/13',
4091 'FJ': '144.120.0.0/16',
53896ca5 4092 'FK': '80.73.208.0/21',
773f291d
S
4093 'FM': '119.252.112.0/20',
4094 'FO': '88.85.32.0/19',
4095 'FR': '90.0.0.0/9',
4096 'GA': '41.158.0.0/15',
4097 'GB': '25.0.0.0/8',
4098 'GD': '74.122.88.0/21',
4099 'GE': '31.146.0.0/16',
4100 'GF': '161.22.64.0/18',
4101 'GG': '62.68.160.0/19',
53896ca5
S
4102 'GH': '154.160.0.0/12',
4103 'GI': '95.164.0.0/16',
773f291d
S
4104 'GL': '88.83.0.0/19',
4105 'GM': '160.182.0.0/15',
4106 'GN': '197.149.192.0/18',
4107 'GP': '104.250.0.0/19',
4108 'GQ': '105.235.224.0/20',
4109 'GR': '94.64.0.0/13',
4110 'GT': '168.234.0.0/16',
4111 'GU': '168.123.0.0/16',
4112 'GW': '197.214.80.0/20',
4113 'GY': '181.41.64.0/18',
4114 'HK': '113.252.0.0/14',
4115 'HN': '181.210.0.0/16',
4116 'HR': '93.136.0.0/13',
4117 'HT': '148.102.128.0/17',
4118 'HU': '84.0.0.0/14',
4119 'ID': '39.192.0.0/10',
4120 'IE': '87.32.0.0/12',
4121 'IL': '79.176.0.0/13',
4122 'IM': '5.62.80.0/20',
4123 'IN': '117.192.0.0/10',
4124 'IO': '203.83.48.0/21',
4125 'IQ': '37.236.0.0/14',
4126 'IR': '2.176.0.0/12',
4127 'IS': '82.221.0.0/16',
4128 'IT': '79.0.0.0/10',
4129 'JE': '87.244.64.0/18',
4130 'JM': '72.27.0.0/17',
4131 'JO': '176.29.0.0/16',
53896ca5 4132 'JP': '133.0.0.0/8',
773f291d
S
4133 'KE': '105.48.0.0/12',
4134 'KG': '158.181.128.0/17',
4135 'KH': '36.37.128.0/17',
4136 'KI': '103.25.140.0/22',
4137 'KM': '197.255.224.0/20',
53896ca5 4138 'KN': '198.167.192.0/19',
773f291d
S
4139 'KP': '175.45.176.0/22',
4140 'KR': '175.192.0.0/10',
4141 'KW': '37.36.0.0/14',
4142 'KY': '64.96.0.0/15',
4143 'KZ': '2.72.0.0/13',
4144 'LA': '115.84.64.0/18',
4145 'LB': '178.135.0.0/16',
53896ca5 4146 'LC': '24.92.144.0/20',
773f291d
S
4147 'LI': '82.117.0.0/19',
4148 'LK': '112.134.0.0/15',
53896ca5 4149 'LR': '102.183.0.0/16',
773f291d
S
4150 'LS': '129.232.0.0/17',
4151 'LT': '78.56.0.0/13',
4152 'LU': '188.42.0.0/16',
4153 'LV': '46.109.0.0/16',
4154 'LY': '41.252.0.0/14',
4155 'MA': '105.128.0.0/11',
4156 'MC': '88.209.64.0/18',
4157 'MD': '37.246.0.0/16',
4158 'ME': '178.175.0.0/17',
4159 'MF': '74.112.232.0/21',
4160 'MG': '154.126.0.0/17',
4161 'MH': '117.103.88.0/21',
4162 'MK': '77.28.0.0/15',
4163 'ML': '154.118.128.0/18',
4164 'MM': '37.111.0.0/17',
4165 'MN': '49.0.128.0/17',
4166 'MO': '60.246.0.0/16',
4167 'MP': '202.88.64.0/20',
4168 'MQ': '109.203.224.0/19',
4169 'MR': '41.188.64.0/18',
4170 'MS': '208.90.112.0/22',
4171 'MT': '46.11.0.0/16',
4172 'MU': '105.16.0.0/12',
4173 'MV': '27.114.128.0/18',
53896ca5 4174 'MW': '102.70.0.0/15',
773f291d
S
4175 'MX': '187.192.0.0/11',
4176 'MY': '175.136.0.0/13',
4177 'MZ': '197.218.0.0/15',
4178 'NA': '41.182.0.0/16',
4179 'NC': '101.101.0.0/18',
4180 'NE': '197.214.0.0/18',
4181 'NF': '203.17.240.0/22',
4182 'NG': '105.112.0.0/12',
4183 'NI': '186.76.0.0/15',
4184 'NL': '145.96.0.0/11',
4185 'NO': '84.208.0.0/13',
4186 'NP': '36.252.0.0/15',
4187 'NR': '203.98.224.0/19',
4188 'NU': '49.156.48.0/22',
4189 'NZ': '49.224.0.0/14',
4190 'OM': '5.36.0.0/15',
4191 'PA': '186.72.0.0/15',
4192 'PE': '186.160.0.0/14',
4193 'PF': '123.50.64.0/18',
4194 'PG': '124.240.192.0/19',
4195 'PH': '49.144.0.0/13',
4196 'PK': '39.32.0.0/11',
4197 'PL': '83.0.0.0/11',
4198 'PM': '70.36.0.0/20',
4199 'PR': '66.50.0.0/16',
4200 'PS': '188.161.0.0/16',
4201 'PT': '85.240.0.0/13',
4202 'PW': '202.124.224.0/20',
4203 'PY': '181.120.0.0/14',
4204 'QA': '37.210.0.0/15',
53896ca5 4205 'RE': '102.35.0.0/16',
773f291d 4206 'RO': '79.112.0.0/13',
53896ca5 4207 'RS': '93.86.0.0/15',
773f291d 4208 'RU': '5.136.0.0/13',
53896ca5 4209 'RW': '41.186.0.0/16',
773f291d
S
4210 'SA': '188.48.0.0/13',
4211 'SB': '202.1.160.0/19',
4212 'SC': '154.192.0.0/11',
53896ca5 4213 'SD': '102.120.0.0/13',
773f291d 4214 'SE': '78.64.0.0/12',
53896ca5 4215 'SG': '8.128.0.0/10',
773f291d
S
4216 'SI': '188.196.0.0/14',
4217 'SK': '78.98.0.0/15',
53896ca5 4218 'SL': '102.143.0.0/17',
773f291d
S
4219 'SM': '89.186.32.0/19',
4220 'SN': '41.82.0.0/15',
53896ca5 4221 'SO': '154.115.192.0/18',
773f291d
S
4222 'SR': '186.179.128.0/17',
4223 'SS': '105.235.208.0/21',
4224 'ST': '197.159.160.0/19',
4225 'SV': '168.243.0.0/16',
4226 'SX': '190.102.0.0/20',
4227 'SY': '5.0.0.0/16',
4228 'SZ': '41.84.224.0/19',
4229 'TC': '65.255.48.0/20',
4230 'TD': '154.68.128.0/19',
4231 'TG': '196.168.0.0/14',
4232 'TH': '171.96.0.0/13',
4233 'TJ': '85.9.128.0/18',
4234 'TK': '27.96.24.0/21',
4235 'TL': '180.189.160.0/20',
4236 'TM': '95.85.96.0/19',
4237 'TN': '197.0.0.0/11',
4238 'TO': '175.176.144.0/21',
4239 'TR': '78.160.0.0/11',
4240 'TT': '186.44.0.0/15',
4241 'TV': '202.2.96.0/19',
4242 'TW': '120.96.0.0/11',
4243 'TZ': '156.156.0.0/14',
53896ca5
S
4244 'UA': '37.52.0.0/14',
4245 'UG': '102.80.0.0/13',
4246 'US': '6.0.0.0/8',
773f291d 4247 'UY': '167.56.0.0/13',
53896ca5 4248 'UZ': '84.54.64.0/18',
773f291d 4249 'VA': '212.77.0.0/19',
53896ca5 4250 'VC': '207.191.240.0/21',
773f291d 4251 'VE': '186.88.0.0/13',
53896ca5 4252 'VG': '66.81.192.0/20',
773f291d
S
4253 'VI': '146.226.0.0/16',
4254 'VN': '14.160.0.0/11',
4255 'VU': '202.80.32.0/20',
4256 'WF': '117.20.32.0/21',
4257 'WS': '202.4.32.0/19',
4258 'YE': '134.35.0.0/16',
4259 'YT': '41.242.116.0/22',
4260 'ZA': '41.0.0.0/11',
53896ca5
S
4261 'ZM': '102.144.0.0/13',
4262 'ZW': '102.177.192.0/18',
773f291d
S
4263 }
4264
4265 @classmethod
5f95927a
S
4266 def random_ipv4(cls, code_or_block):
4267 if len(code_or_block) == 2:
4268 block = cls._country_ip_map.get(code_or_block.upper())
4269 if not block:
4270 return None
4271 else:
4272 block = code_or_block
773f291d 4273 addr, preflen = block.split('/')
ac668111 4274 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4275 addr_max = addr_min | (0xffffffff >> int(preflen))
14f25df2 4276 return str(socket.inet_ntoa(
ac668111 4277 struct.pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4278
4279
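# random_ipv4 accepts either a two-letter country code or an explicit CIDR
# block; the address below is only an example since the result is randomized:
#
#   >>> GeoUtils.random_ipv4('US')                   # somewhere inside 6.0.0.0/8
#   '6.23.154.89'
#   >>> GeoUtils.random_ipv4('2.16.0.0/13') is not None
#   True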
0a5445dd
YCH
4280# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4281# released into Public Domain
4282# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4283
4284def long_to_bytes(n, blocksize=0):
4285 """long_to_bytes(n:long, blocksize:int) : string
4286 Convert a long integer to a byte string.
4287
4288 If optional blocksize is given and greater than zero, pad the front of the
4289 byte string with binary zeros so that the length is a multiple of
4290 blocksize.
4291 """
4292 # after much testing, this algorithm was deemed to be the fastest
4293 s = b''
4294 n = int(n)
4295 while n > 0:
ac668111 4296 s = struct.pack('>I', n & 0xffffffff) + s
0a5445dd
YCH
4297 n = n >> 32
4298 # strip off leading zeros
4299 for i in range(len(s)):
4300 if s[i] != b'\000'[0]:
4301 break
4302 else:
4303 # only happens when n == 0
4304 s = b'\000'
4305 i = 0
4306 s = s[i:]
4307 # add back some pad bytes. this could be done more efficiently w.r.t. the
4308 # de-padding being done above, but sigh...
4309 if blocksize > 0 and len(s) % blocksize:
4310 s = (blocksize - len(s) % blocksize) * b'\000' + s
4311 return s
4312
4313
4314def bytes_to_long(s):
4315 """bytes_to_long(string) : long
4316 Convert a byte string to a long integer.
4317
4318 This is (essentially) the inverse of long_to_bytes().
4319 """
4320 acc = 0
4321 length = len(s)
4322 if length % 4:
4323 extra = (4 - length % 4)
4324 s = b'\000' * extra + s
4325 length = length + extra
4326 for i in range(0, length, 4):
ac668111 4327 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
0a5445dd
YCH
4328 return acc
4329
4330
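# The two PyCrypto-derived helpers above are inverses of each other:
#
#   >>> long_to_bytes(65537)
#   b'\x01\x00\x01'
#   >>> long_to_bytes(65537, blocksize=4)
#   b'\x00\x01\x00\x01'
#   >>> bytes_to_long(b'\x01\x00\x01')
#   65537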
5bc880b9
YCH
4331def ohdave_rsa_encrypt(data, exponent, modulus):
4332 '''
4333 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4334
4335 Input:
4336 data: data to encrypt, bytes-like object
4337 exponent, modulus: parameter e and N of RSA algorithm, both integer
4338 Output: hex string of encrypted data
4339
4340 Limitation: supports one block encryption only
4341 '''
4342
4343 payload = int(binascii.hexlify(data[::-1]), 16)
4344 encrypted = pow(payload, exponent, modulus)
4345 return '%x' % encrypted
81bdc8fd
YCH
4346
4347
f48409c7
YCH
4348def pkcs1pad(data, length):
4349 """
4350 Padding input data with PKCS#1 scheme
4351
4352 @param {int[]} data input data
4353 @param {int} length target length
4354 @returns {int[]} padded data
4355 """
4356 if len(data) > length - 11:
4357 raise ValueError('Input data too long for PKCS#1 padding')
4358
4359 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4360 return [0, 2] + pseudo_random + [0] + data
4361
4362
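# The padding bytes are random, but the shape is fixed: [0, 2, <random>, 0, data]
# with exactly `length` items (the input values below are arbitrary):
#
#   >>> padded = pkcs1pad([0x41, 0x42], 16)
#   >>> len(padded), padded[:2], padded[-3:]
#   (16, [0, 2], [0, 65, 66])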
7b2c3f47 4363def _base_n_table(n, table):
4364 if not table and not n:
4365 raise ValueError('Either table or n must be specified')
612f2be5 4366 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4367
44f14eb4 4368 if n and n != len(table):
612f2be5 4369 raise ValueError(f'base {n} exceeds table length {len(table)}')
4370 return table
59f898b7 4371
5eb6bdce 4372
7b2c3f47 4373def encode_base_n(num, n=None, table=None):
4374 """Convert given int to a base-n string"""
612f2be5 4375 table = _base_n_table(n, table)
7b2c3f47 4376 if not num:
5eb6bdce
YCH
4377 return table[0]
4378
7b2c3f47 4379 result, base = '', len(table)
81bdc8fd 4380 while num:
7b2c3f47 4381 result = table[num % base] + result
612f2be5 4382 num = num // base
7b2c3f47 4383 return result
4384
4385
4386def decode_base_n(string, n=None, table=None):
4387 """Convert given base-n string to int"""
4388 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4389 result, base = 0, len(table)
4390 for char in string:
4391 result = result * base + table[char]
4392 return result
4393
4394
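# encode_base_n/decode_base_n round-trip with either a base or a custom table:
#
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> decode_base_n('ff', 16)
#   255
#   >>> encode_base_n(5, table='01')       # custom (here: binary) alphabet
#   '101'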
f52354a8 4395def decode_packed_codes(code):
06b3fe29 4396 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 4397 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
4398 base = int(base)
4399 count = int(count)
4400 symbols = symbols.split('|')
4401 symbol_table = {}
4402
4403 while count:
4404 count -= 1
5eb6bdce 4405 base_n_count = encode_base_n(count, base)
f52354a8
YCH
4406 symbol_table[base_n_count] = symbols[count] or base_n_count
4407
4408 return re.sub(
4409 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 4410 obfuscated_code)
e154c651 4411
4412
1ced2221
S
4413def caesar(s, alphabet, shift):
4414 if shift == 0:
4415 return s
4416 l = len(alphabet)
4417 return ''.join(
4418 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4419 for c in s)
4420
4421
4422def rot47(s):
4423 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4424
4425
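# caesar shifts only characters present in the given alphabet; rot47 is the
# usual self-inverse variant over printable ASCII:
#
#   >>> caesar('ab!', 'abc', 1)
#   'bc!'
#   >>> rot47('Hello')
#   'w6==@'
#   >>> rot47(rot47('Hello'))
#   'Hello'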
e154c651 4426def parse_m3u8_attributes(attrib):
4427 info = {}
4428 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4429 if val.startswith('"'):
4430 val = val[1:-1]
4431 info[key] = val
4432 return info
1143535d
YCH
4433
4434
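# Attribute lists as they appear on #EXT-X-STREAM-INF lines (sample input):
#
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401f",RESOLUTION=640x360')
#   {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401f', 'RESOLUTION': '640x360'}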
4435def urshift(val, n):
4436 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
4437
4438
efa97bdc 4439def write_xattr(path, key, value):
6f7563be 4440 # Windows: Write xattrs to NTFS Alternate Data Streams:
4441 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4442 if compat_os_name == 'nt':
4443 assert ':' not in key
4444 assert os.path.exists(path)
efa97bdc
YCH
4445
4446 try:
6f7563be 4447 with open(f'{path}:{key}', 'wb') as f:
4448 f.write(value)
86e5f3ed 4449 except OSError as e:
efa97bdc 4450 raise XAttrMetadataError(e.errno, e.strerror)
6f7563be 4451 return
efa97bdc 4452
84e26038 4453 # UNIX Method 1. Use os.setxattr/xattrs/pyxattrs modules
efa97bdc 4454
6f7563be 4455 setxattr = None
84e26038 4456 if callable(getattr(os, 'setxattr', None)):
4457 setxattr = os.setxattr
4458 elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
6f7563be 4459 # Unicode arguments are not supported in pyxattr until version 0.5.0
4460 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4461 if version_tuple(xattr.__version__) >= (0, 5, 0):
4462 setxattr = xattr.set
4463 elif xattr:
4464 setxattr = xattr.setxattr
efa97bdc 4465
6f7563be 4466 if setxattr:
4467 try:
4468 setxattr(path, key, value)
4469 except OSError as e:
4470 raise XAttrMetadataError(e.errno, e.strerror)
4471 return
efa97bdc 4472
6f7563be 4473 # UNIX Method 2. Use setfattr/xattr executables
4474 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4475 else 'xattr' if check_executable('xattr', ['-h']) else None)
4476 if not exe:
4477 raise XAttrUnavailableError(
47ab66db 4478 'Couldn\'t find a tool to set the xattrs. Install either the "xattr" or "pyxattr" Python modules or the '
6f7563be 4479 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
efa97bdc 4480
0f06bcd7 4481 value = value.decode()
6f7563be 4482 try:
f0c9fb96 4483 _, stderr, returncode = Popen.run(
6f7563be 4484 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
e121e3ce 4485 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
6f7563be 4486 except OSError as e:
4487 raise XAttrMetadataError(e.errno, e.strerror)
f0c9fb96 4488 if returncode:
4489 raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
4490
4491
4492def random_birthday(year_field, month_field, day_field):
c305a25c 4493 start_date = dt.date(1950, 1, 1)
4494 end_date = dt.date(1995, 12, 31)
aa374bc7 4495 offset = random.randint(0, (end_date - start_date).days)
c305a25c 4496 random_date = start_date + dt.timedelta(offset)
0c265486 4497 return {
aa374bc7
AS
4498 year_field: str(random_date.year),
4499 month_field: str(random_date.month),
4500 day_field: str(random_date.day),
0c265486 4501 }
732044af 4502
c76eb41b 4503
8c53322c
L
4504def find_available_port(interface=''):
4505 try:
4506 with socket.socket() as sock:
4507 sock.bind((interface, 0))
4508 return sock.getsockname()[1]
4509 except OSError:
4510 return None
4511
4512
732044af 4513# Templates for internet shortcut files, which are plain text files.
e5a998f3 4514DOT_URL_LINK_TEMPLATE = '''\
732044af 4515[InternetShortcut]
4516URL=%(url)s
e5a998f3 4517'''
732044af 4518
e5a998f3 4519DOT_WEBLOC_LINK_TEMPLATE = '''\
732044af 4520<?xml version="1.0" encoding="UTF-8"?>
4521<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4522<plist version="1.0">
4523<dict>
4524\t<key>URL</key>
4525\t<string>%(url)s</string>
4526</dict>
4527</plist>
e5a998f3 4528'''
732044af 4529
e5a998f3 4530DOT_DESKTOP_LINK_TEMPLATE = '''\
732044af 4531[Desktop Entry]
4532Encoding=UTF-8
4533Name=%(filename)s
4534Type=Link
4535URL=%(url)s
4536Icon=text-html
e5a998f3 4537'''
732044af 4538
08438d2c 4539LINK_TEMPLATES = {
4540 'url': DOT_URL_LINK_TEMPLATE,
4541 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4542 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4543}
4544
732044af 4545
4546def iri_to_uri(iri):
4547 """
4548 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4549
4550 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4551 """
4552
14f25df2 4553 iri_parts = urllib.parse.urlparse(iri)
732044af 4554
4555 if '[' in iri_parts.netloc:
4556 raise ValueError('IPv6 URIs are not, yet, supported.')
4557 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4558
4559 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4560
4561 net_location = ''
4562 if iri_parts.username:
f9934b96 4563 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
732044af 4564 if iri_parts.password is not None:
f9934b96 4565 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
732044af 4566 net_location += '@'
4567
0f06bcd7 4568 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
732044af 4569 # The 'idna' encoding produces ASCII text.
4570 if iri_parts.port is not None and iri_parts.port != 80:
4571 net_location += ':' + str(iri_parts.port)
4572
f9934b96 4573 return urllib.parse.urlunparse(
732044af 4574 (iri_parts.scheme,
4575 net_location,
4576
f9934b96 4577 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
732044af 4578
4579 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
f9934b96 4580 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
732044af 4581
4582 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
f9934b96 4583 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
732044af 4584
f9934b96 4585 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
732044af 4586
4587 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4588
4589
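# Example with an ASCII host and non-ASCII path/query; a non-ASCII hostname
# would additionally be punycoded via the 'idna' codec:
#
#   >>> iri_to_uri('https://example.com/path/ünïcödé?q=süß')
#   'https://example.com/path/%C3%BCn%C3%AFc%C3%B6d%C3%A9?q=s%C3%BC%C3%9F'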
4590def to_high_limit_path(path):
4591 if sys.platform in ['win32', 'cygwin']:
4592 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
e5a998f3 4593 return '\\\\?\\' + os.path.abspath(path)
732044af 4594
4595 return path
76d321f6 4596
c76eb41b 4597
7b2c3f47 4598def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
69bec673 4599 val = traversal.traverse_obj(obj, *variadic(field))
6f2287cb 4600 if not val if ignore is NO_DEFAULT else val in variadic(ignore):
e0ddbd02 4601 return default
7b2c3f47 4602 return template % func(val)
00dd0cd5 4603
4604
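# format_field combines a field lookup, a template and a fallback in one call:
#
#   >>> format_field({'height': 1080}, 'height', '%sp')
#   '1080p'
#   >>> format_field({'height': None}, 'height', '%sp', default='unknown')
#   'unknown'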
4605def clean_podcast_url(url):
91302ed3 4606 url = re.sub(r'''(?x)
00dd0cd5 4607 (?:
4608 (?:
4609 chtbl\.com/track|
4610 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
2af4eeb7
MAF
4611 play\.podtrac\.com|
4612 chrt\.fm/track|
4613 mgln\.ai/e
4614 )(?:/[^/.]+)?|
00dd0cd5 4615 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4616 flex\.acast\.com|
4617 pd(?:
4618 cn\.co| # https://podcorn.com/analytics-prefix/
4619 st\.fm # https://podsights.com/docs/
2af4eeb7
MAF
4620 )/e|
4621 [0-9]\.gum\.fm|
4622 pscrb\.fm/rss/p
00dd0cd5 4623 )/''', '', url)
91302ed3 4624 return re.sub(r'^\w+://(\w+://)', r'\1', url)
ffcb8191
THD
4625
4626
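# Hypothetical tracking-prefixed feed URLs and how they are unwrapped:
#
#   >>> clean_podcast_url('https://chtbl.com/track/ABC123/traffic.example.com/episode.mp3')
#   'https://traffic.example.com/episode.mp3'
#   >>> clean_podcast_url('https://pdst.fm/e/https://cdn.example.com/episode.mp3')
#   'https://cdn.example.com/episode.mp3'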
4627_HEX_TABLE = '0123456789abcdef'
4628
4629
4630def random_uuidv4():
4631 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 4632
4633
4634def make_dir(path, to_screen=None):
4635 try:
4636 dn = os.path.dirname(path)
b25d6cb9
AI
4637 if dn:
4638 os.makedirs(dn, exist_ok=True)
0202b52a 4639 return True
86e5f3ed 4640 except OSError as err:
0202b52a 4641 if callable(to_screen):
69bec673 4642 to_screen(f'unable to create directory {err}')
0202b52a 4643 return False
f74980cb 4644
4645
4646def get_executable_path():
69bec673 4647 from ..update import _get_variant_and_executable_path
c487cf00 4648
b5899f4f 4649 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
f74980cb 4650
4651
8e40b9d1 4652def get_user_config_dirs(package_name):
8e40b9d1
M
4653 # .config (e.g. ~/.config/package_name)
4654 xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
773c272d 4655 yield os.path.join(xdg_config_home, package_name)
8e40b9d1
M
4656
4657 # appdata (%APPDATA%/package_name)
4658 appdata_dir = os.getenv('appdata')
4659 if appdata_dir:
773c272d 4660 yield os.path.join(appdata_dir, package_name)
8e40b9d1
M
4661
4662 # home (~/.package_name)
773c272d 4663 yield os.path.join(compat_expanduser('~'), f'.{package_name}')
8e40b9d1
M
4664
4665
4666def get_system_config_dirs(package_name):
8e40b9d1 4667 # /etc/package_name
773c272d 4668 yield os.path.join('/etc', package_name)
06167fbb 4669
4670
3e9b66d7 4671def time_seconds(**kwargs):
83c4970e
L
4672 """
4673 Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
4674 """
c305a25c 4675 return time.time() + dt.timedelta(**kwargs).total_seconds()
3e9b66d7
LNO
4676
4677
49fa4d9a
N
4678# create a JSON Web Signature (jws) with HS256 algorithm
4679# the resulting format is in JWS Compact Serialization
4680# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
4681# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
4682def jwt_encode_hs256(payload_data, key, headers={}):
4683 header_data = {
4684 'alg': 'HS256',
4685 'typ': 'JWT',
4686 }
4687 if headers:
4688 header_data.update(headers)
0f06bcd7 4689 header_b64 = base64.b64encode(json.dumps(header_data).encode())
4690 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
4691 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
49fa4d9a
N
4692 signature_b64 = base64.b64encode(h.digest())
4693 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
4694 return token
819e0531 4695
4696
16b0d7e6 4697# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
4698def jwt_decode_hs256(jwt):
4699 header_b64, payload_b64, signature_b64 = jwt.split('.')
2c98d998 4700 # add trailing ='s that may have been stripped, superfluous ='s are ignored
4701 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
16b0d7e6 4702 return payload_data
4703
4704
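# Round trip through the two JWT helpers (throwaway key, made-up payload;
# jwt_encode_hs256 returns bytes while jwt_decode_hs256 expects a str token):
#
#   >>> token = jwt_encode_hs256({'id': 123}, 'not-a-real-secret')
#   >>> jwt_decode_hs256(token.decode())
#   {'id': 123}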
53973b4d 4705WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
4706
4707
7a32c70d 4708@functools.cache
819e0531 4709def supports_terminal_sequences(stream):
4710 if compat_os_name == 'nt':
8a82af35 4711 if not WINDOWS_VT_MODE:
819e0531 4712 return False
4713 elif not os.getenv('TERM'):
4714 return False
4715 try:
4716 return stream.isatty()
4717 except BaseException:
4718 return False
4719
4720
c53a18f0 4721def windows_enable_vt_mode():
4722 """Ref: https://bugs.python.org/issue30075 """
8a82af35 4723 if get_windows_version() < (10, 0, 10586):
53973b4d 4724 return
53973b4d 4725
c53a18f0 4726 import ctypes
4727 import ctypes.wintypes
4728 import msvcrt
4729
4730 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
4731
4732 dll = ctypes.WinDLL('kernel32', use_last_error=False)
4733 handle = os.open('CONOUT$', os.O_RDWR)
c53a18f0 4734 try:
4735 h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
4736 dw_original_mode = ctypes.wintypes.DWORD()
4737 success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
4738 if not success:
4739 raise Exception('GetConsoleMode failed')
4740
4741 success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
4742 dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
4743 if not success:
4744 raise Exception('SetConsoleMode failed')
c53a18f0 4745 finally:
4746 os.close(handle)
53973b4d 4747
f0795149 4748 global WINDOWS_VT_MODE
4749 WINDOWS_VT_MODE = True
4750 supports_terminal_sequences.cache_clear()
4751
53973b4d 4752
ec11a9f4 4753_terminal_sequences_re = re.compile('\033\\[[^m]+m')
4754
4755
4756def remove_terminal_sequences(string):
4757 return _terminal_sequences_re.sub('', string)
4758
4759
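# Example: ANSI SGR sequences are stripped, the visible text is kept:
#
#   >>> remove_terminal_sequences('\033[0;31mERROR:\033[0m oops')
#   'ERROR: oops'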
4760def number_of_digits(number):
4761 return len('%d' % number)
34921b43 4762
4763
4764def join_nonempty(*values, delim='-', from_dict=None):
4765 if from_dict is not None:
69bec673 4766 values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
34921b43 4767 return delim.join(map(str, filter(None, values)))
06e57990 4768
4769
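# join_nonempty drops falsy values before joining:
#
#   >>> join_nonempty('mp4', None, 720, '', 'dash')
#   'mp4-720-dash'
#   >>> join_nonempty('Total', 42, delim=': ')
#   'Total: 42'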
27231526
ZM
4770def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
4771 """
4772 Find the largest format dimensions in terms of video width and, for each thumbnail:
4773 * Modify the URL: Match the width with the provided regex and replace with the former width
4774 * Update dimensions
4775
4776 This function is useful with video services that scale the provided thumbnails on demand
4777 """
4778 _keys = ('width', 'height')
4779 max_dimensions = max(
86e5f3ed 4780 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
27231526
ZM
4781 default=(0, 0))
4782 if not max_dimensions[0]:
4783 return thumbnails
4784 return [
4785 merge_dicts(
4786 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
4787 dict(zip(_keys, max_dimensions)), thumbnail)
4788 for thumbnail in thumbnails
4789 ]
4790
4791
93c8410d
LNO
4792def parse_http_range(range):
4793 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
4794 if not range:
4795 return None, None, None
4796 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
4797 if not crg:
4798 return None, None, None
4799 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
4800
4801
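# Header values and the (start, end, document_size) tuples parsed above:
#
#   >>> parse_http_range('bytes=0-499')
#   (0, 499, None)
#   >>> parse_http_range('bytes 500-999/1234')
#   (500, 999, 1234)
#   >>> parse_http_range(None)
#   (None, None, None)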
6b9e832d 4802def read_stdin(what):
a174c453 4803 if what:
4804 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
4805 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
6b9e832d 4806 return sys.stdin
4807
4808
a904a7f8
L
4809def determine_file_encoding(data):
4810 """
88f60feb 4811 Detect the text encoding used
a904a7f8
L
4812 @returns (encoding, bytes to skip)
4813 """
4814
88f60feb 4815 # BOM marks are given priority over declarations
a904a7f8 4816 for bom, enc in BOMS:
a904a7f8
L
4817 if data.startswith(bom):
4818 return enc, len(bom)
4819
88f60feb 4820 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
4821 # We ignore the endianness to get a good enough match
a904a7f8 4822 data = data.replace(b'\0', b'')
88f60feb 4823 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
4824 return mobj.group(1).decode() if mobj else None, 0
a904a7f8
L
4825
4826
06e57990 4827class Config:
4828 own_args = None
9e491463 4829 parsed_args = None
06e57990 4830 filename = None
4831 __initialized = False
4832
4833 def __init__(self, parser, label=None):
9e491463 4834 self.parser, self.label = parser, label
06e57990 4835 self._loaded_paths, self.configs = set(), []
4836
4837 def init(self, args=None, filename=None):
4838 assert not self.__initialized
284a60c5 4839 self.own_args, self.filename = args, filename
4840 return self.load_configs()
4841
4842 def load_configs(self):
65662dff 4843 directory = ''
284a60c5 4844 if self.filename:
4845 location = os.path.realpath(self.filename)
65662dff 4846 directory = os.path.dirname(location)
06e57990 4847 if location in self._loaded_paths:
4848 return False
4849 self._loaded_paths.add(location)
4850
284a60c5 4851 self.__initialized = True
4852 opts, _ = self.parser.parse_known_args(self.own_args)
4853 self.parsed_args = self.own_args
9e491463 4854 for location in opts.config_locations or []:
6b9e832d 4855 if location == '-':
1060f82f 4856 if location in self._loaded_paths:
4857 continue
4858 self._loaded_paths.add(location)
6b9e832d 4859 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
4860 continue
65662dff 4861 location = os.path.join(directory, expand_path(location))
06e57990 4862 if os.path.isdir(location):
4863 location = os.path.join(location, 'yt-dlp.conf')
4864 if not os.path.exists(location):
9e491463 4865 self.parser.error(f'config location {location} does not exist')
06e57990 4866 self.append_config(self.read_file(location), location)
4867 return True
4868
4869 def __str__(self):
4870 label = join_nonempty(
4871 self.label, 'config', f'"{self.filename}"' if self.filename else '',
4872 delim=' ')
4873 return join_nonempty(
4874 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
4875 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
4876 delim='\n')
4877
7a32c70d 4878 @staticmethod
06e57990 4879 def read_file(filename, default=[]):
4880 try:
a904a7f8 4881 optionf = open(filename, 'rb')
86e5f3ed 4882 except OSError:
06e57990 4883 return default # silently skip if file is not present
a904a7f8
L
4884 try:
4885 enc, skip = determine_file_encoding(optionf.read(512))
4886 optionf.seek(skip, io.SEEK_SET)
4887 except OSError:
4888 enc = None # silently skip read errors
06e57990 4889 try:
4890 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
a904a7f8 4891 contents = optionf.read().decode(enc or preferredencoding())
f9934b96 4892 res = shlex.split(contents, comments=True)
44a6fcff 4893 except Exception as err:
4894 raise ValueError(f'Unable to parse "{filename}": {err}')
06e57990 4895 finally:
4896 optionf.close()
4897 return res
4898
7a32c70d 4899 @staticmethod
06e57990 4900 def hide_login_info(opts):
86e5f3ed 4901 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
06e57990 4902 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
4903
4904 def _scrub_eq(o):
4905 m = eqre.match(o)
4906 if m:
4907 return m.group('key') + '=PRIVATE'
4908 else:
4909 return o
4910
4911 opts = list(map(_scrub_eq, opts))
4912 for idx, opt in enumerate(opts):
4913 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
4914 opts[idx + 1] = 'PRIVATE'
4915 return opts
4916
4917 def append_config(self, *args, label=None):
9e491463 4918 config = type(self)(self.parser, label)
06e57990 4919 config._loaded_paths = self._loaded_paths
4920 if config.init(*args):
4921 self.configs.append(config)
4922
7a32c70d 4923 @property
06e57990 4924 def all_args(self):
4925 for config in reversed(self.configs):
4926 yield from config.all_args
9e491463 4927 yield from self.parsed_args or []
4928
4929 def parse_known_args(self, **kwargs):
4930 return self.parser.parse_known_args(self.all_args, **kwargs)
06e57990 4931
4932 def parse_args(self):
9e491463 4933 return self.parser.parse_args(self.all_args)
da42679b
LNO
4934
4935
8b7539d2 4936def merge_headers(*dicts):
08d30158 4937 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
76aa9913 4938 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
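# A minimal usage sketch: keys are normalized via str.title() and later dicts win
# (the header values below are made up):
#
#     >>> merge_headers({'user-agent': 'yt-dlp', 'accept': '*/*'}, {'USER-AGENT': 'custom/1.0'})
#     {'User-Agent': 'custom/1.0', 'Accept': '*/*'}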
28787f16 4939
4940
b1f94422 4941def cached_method(f):
4942 """Cache a method"""
4943 signature = inspect.signature(f)
4944
7a32c70d 4945 @functools.wraps(f)
b1f94422 4946 def wrapper(self, *args, **kwargs):
4947 bound_args = signature.bind(self, *args, **kwargs)
4948 bound_args.apply_defaults()
d5d1df8a 4949 key = tuple(bound_args.arguments.values())[1:]
b1f94422 4950
6368e2e6 4951 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
b1f94422 4952 if key not in cache:
4953 cache[key] = f(self, *args, **kwargs)
4954 return cache[key]
4955 return wrapper
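# A minimal usage sketch (the Fetcher class and expensive_download helper are
# hypothetical): results are cached per instance under
# vars(self)['_cached_method__cache'], keyed by the bound arguments after `self`.
#
#     class Fetcher:
#         @cached_method
#         def fetch(self, url):
#             return expensive_download(url)  # hypothetical; runs once per distinct url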
4956
4957
28787f16 4958class classproperty:
83cc7b8a 4959 """property access for class methods with optional caching"""
4960 def __new__(cls, func=None, *args, **kwargs):
4961 if not func:
4962 return functools.partial(cls, *args, **kwargs)
4963 return super().__new__(cls)
c487cf00 4964
83cc7b8a 4965 def __init__(self, func, *, cache=False):
c487cf00 4966 functools.update_wrapper(self, func)
4967 self.func = func
83cc7b8a 4968 self._cache = {} if cache else None
28787f16 4969
4970 def __get__(self, _, cls):
83cc7b8a 4971 if self._cache is None:
4972 return self.func(cls)
4973 elif cls not in self._cache:
4974 self._cache[cls] = self.func(cls)
4975 return self._cache[cls]
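# A minimal usage sketch (the Extractor class is hypothetical): the decorated
# function is evaluated on attribute access on the class itself, and the result
# is cached per class when cache=True.
#
#     class Extractor:
#         @classproperty(cache=True)
#         def description(cls):
#             return f'{cls.__name__} extractor'
#
#     Extractor.description  # -> 'Extractor extractor', computed only once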
19a03940 4976
4977
a5387729 4978class function_with_repr:
b2e0343b 4979 def __init__(self, func, repr_=None):
a5387729 4980 functools.update_wrapper(self, func)
b2e0343b 4981 self.func, self.__repr = func, repr_
a5387729 4982
4983 def __call__(self, *args, **kwargs):
4984 return self.func(*args, **kwargs)
4985
45491a2a 4986 @classmethod
4987 def set_repr(cls, repr_):
4988 return functools.partial(cls, repr_=repr_)
4989
a5387729 4990 def __repr__(self):
b2e0343b 4991 if self.__repr:
4992 return self.__repr
a5387729 4993 return f'{self.func.__module__}.{self.func.__qualname__}'
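# A minimal usage sketch (the lambda and repr string are made up): the wrapper
# behaves like the original callable but reports a readable repr().
#
#     >>> shout = function_with_repr(lambda s: s.upper(), repr_='shout(s)')
#     >>> shout('hi'), repr(shout)
#     ('HI', 'shout(s)')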
4994
4995
64fa820c 4996class Namespace(types.SimpleNamespace):
591bb9d3 4997 """Immutable namespace"""
591bb9d3 4998
7896214c 4999 def __iter__(self):
64fa820c 5000 return iter(self.__dict__.values())
7896214c 5001
7a32c70d 5002 @property
64fa820c 5003 def items_(self):
5004 return self.__dict__.items()
9b8ee23b 5005
5006
8dc59305 5007MEDIA_EXTENSIONS = Namespace(
5008 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5009 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5010 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
fbb73833 5011 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
8dc59305 5012 thumbnails=('jpg', 'png', 'webp'),
5013 storyboards=('mhtml', ),
5014 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5015 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5016)
5017MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5018MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5019
5020KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
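# A minimal usage sketch: iterating a Namespace yields its values, so the grouped
# extension tuples can be queried directly.
#
#     >>> 'mp4' in MEDIA_EXTENSIONS.video
#     True
#     >>> any('mp3' in group for group in MEDIA_EXTENSIONS)
#     True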
5021
5022
be5c1ae8 5023class RetryManager:
5024 """Usage:
5025 for retry in RetryManager(...):
5026 try:
5027 ...
5028 except SomeException as err:
5029 retry.error = err
5030 continue
5031 """
5032 attempt, _error = 0, None
5033
5034 def __init__(self, _retries, _error_callback, **kwargs):
5035 self.retries = _retries or 0
5036 self.error_callback = functools.partial(_error_callback, **kwargs)
5037
5038 def _should_retry(self):
5039 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5040
7a32c70d 5041 @property
be5c1ae8 5042 def error(self):
5043 if self._error is NO_DEFAULT:
5044 return None
5045 return self._error
5046
7a32c70d 5047 @error.setter
be5c1ae8 5048 def error(self, value):
5049 self._error = value
5050
5051 def __iter__(self):
5052 while self._should_retry():
5053 self.error = NO_DEFAULT
5054 self.attempt += 1
5055 yield self
5056 if self.error:
5057 self.error_callback(self.error, self.attempt, self.retries)
5058
7a32c70d 5059 @staticmethod
be5c1ae8 5060 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5061 """Utility function for reporting retries"""
5062 if count > retries:
5063 if error:
5064 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5065 raise e
5066
5067 if not count:
5068 return warn(e)
5069 elif isinstance(e, ExtractorError):
3ce29336 5070 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
be5c1ae8 5071 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5072
5073 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5074 if delay:
5075 info(f'Sleeping {delay:.2f} seconds ...')
5076 time.sleep(delay)
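# A minimal usage sketch building on the class docstring above (fragile_op is a
# hypothetical callable; report_retry is wired in as the error callback):
#
#     reporter = functools.partial(
#         RetryManager.report_retry, sleep_func=1, info=print, warn=print)
#     for retry in RetryManager(3, reporter):
#         try:
#             fragile_op()
#         except OSError as err:
#             retry.error = err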
5077
5078
0647d925 5079def make_archive_id(ie, video_id):
5080 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5081 return f'{ie_key.lower()} {video_id}'
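# e.g. (hypothetical extractor key and video id):
#
#     >>> make_archive_id('Youtube', 'abc123')
#     'youtube abc123'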
5082
5083
a1c5bd82 5084def truncate_string(s, left, right=0):
5085 assert left > 3 and right >= 0
5086 if s is None or len(s) <= left + right:
5087 return s
f9fb3ce8 5088 return f'{s[:left - 3]}...{s[-right:] if right else ""}'
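# e.g. at most `left + right` characters are kept, eliding the middle:
#
#     >>> truncate_string('abcdefghij', 5, 2)
#     'ab...ij'
#     >>> truncate_string('short', 10)
#     'short'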
a1c5bd82 5089
5090
5314b521 5091def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
5092 assert 'all' in alias_dict, '"all" alias is required'
5093 requested = list(start or [])
5094 for val in options:
5095 discard = val.startswith('-')
5096 if discard:
5097 val = val[1:]
5098
5099 if val in alias_dict:
5100 val = alias_dict[val] if not discard else [
5101 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
5102 # NB: Do not allow regex in aliases for performance
5103 requested = orderedSet_from_options(val, alias_dict, start=requested)
5104 continue
5105
5106 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
5107 else [val] if val in alias_dict['all'] else None)
5108 if current is None:
5109 raise ValueError(val)
5110
5111 if discard:
5112 for item in current:
5113 while item in requested:
5114 requested.remove(item)
5115 else:
5116 requested.extend(current)
5117
5118 return orderedSet(requested)
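# A minimal sketch with a made-up alias dict: 'all' expands to every known value
# and a leading '-' discards previously requested items.
#
#     >>> aliases = {'all': ['info', 'thumbnail', 'subs']}
#     >>> orderedSet_from_options(['all', '-subs'], aliases)
#     ['info', 'thumbnail']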
5119
5120
eedda525 5121# TODO: Rewrite
d0d74b71 5122class FormatSorter:
5123 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
5124
5125 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
5126 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
5127 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
5128 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
5129 'height', 'width', 'proto', 'vext', 'abr', 'aext',
5130 'fps', 'fs_approx', 'source', 'id')
5131
5132 settings = {
5133 'vcodec': {'type': 'ordered', 'regex': True,
5134 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
5135 'acodec': {'type': 'ordered', 'regex': True,
71082216 5136 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
d0d74b71 5137 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
5138 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
5139 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
5140 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
5141 'vext': {'type': 'ordered', 'field': 'video_ext',
29ca4082 5142 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
5143 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
fbb73833 5144 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
5145 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
5146 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
d0d74b71 5147 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
5148 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
5149 'field': ('vcodec', 'acodec'),
5150 'function': lambda it: int(any(v != 'none' for v in it))},
5151 'ie_pref': {'priority': True, 'type': 'extractor'},
5152 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
5153 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
5154 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
5155 'quality': {'convert': 'float', 'default': -1},
5156 'filesize': {'convert': 'bytes'},
5157 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
5158 'id': {'convert': 'string', 'field': 'format_id'},
5159 'height': {'convert': 'float_none'},
5160 'width': {'convert': 'float_none'},
5161 'fps': {'convert': 'float_none'},
5162 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
5163 'tbr': {'convert': 'float_none'},
5164 'vbr': {'convert': 'float_none'},
5165 'abr': {'convert': 'float_none'},
5166 'asr': {'convert': 'float_none'},
5167 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
5168
5169 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
812cdfa0 5170 'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
eedda525 5171 'function': lambda it: next(filter(None, it), None)},
812cdfa0 5172 'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
eedda525 5173 'function': lambda it: next(filter(None, it), None)},
d0d74b71 5174 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
5175 'res': {'type': 'multiple', 'field': ('height', 'width'),
5176 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
5177
5178 # Actual field names
5179 'format_id': {'type': 'alias', 'field': 'id'},
5180 'preference': {'type': 'alias', 'field': 'ie_pref'},
5181 'language_preference': {'type': 'alias', 'field': 'lang'},
5182 'source_preference': {'type': 'alias', 'field': 'source'},
5183 'protocol': {'type': 'alias', 'field': 'proto'},
5184 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
5185 'audio_channels': {'type': 'alias', 'field': 'channels'},
5186
5187 # Deprecated
5188 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
5189 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
5190 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
5191 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
5192 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
5193 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
5194 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
5195 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
5196 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
5197 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
5198 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
5199 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
5200 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
5201 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
5202 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5203 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5204 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5205 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5206 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5207 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5208 }
5209
5210 def __init__(self, ydl, field_preference):
5211 self.ydl = ydl
5212 self._order = []
5213 self.evaluate_params(self.ydl.params, field_preference)
5214 if ydl.params.get('verbose'):
5215 self.print_verbose_info(self.ydl.write_debug)
5216
5217 def _get_field_setting(self, field, key):
5218 if field not in self.settings:
5219 if key in ('forced', 'priority'):
5220 return False
5221 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
5222 'deprecated and may be removed in a future version')
5223 self.settings[field] = {}
5224 propObj = self.settings[field]
5225 if key not in propObj:
5226 type = propObj.get('type')
5227 if key == 'field':
5228 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
5229 elif key == 'convert':
5230 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
5231 else:
5232 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
5233 propObj[key] = default
5234 return propObj[key]
5235
5236 def _resolve_field_value(self, field, value, convertNone=False):
5237 if value is None:
5238 if not convertNone:
5239 return None
5240 else:
5241 value = value.lower()
5242 conversion = self._get_field_setting(field, 'convert')
5243 if conversion == 'ignore':
5244 return None
5245 if conversion == 'string':
5246 return value
5247 elif conversion == 'float_none':
5248 return float_or_none(value)
5249 elif conversion == 'bytes':
5250 return parse_bytes(value)
5251 elif conversion == 'order':
5252 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
5253 use_regex = self._get_field_setting(field, 'regex')
5254 list_length = len(order_list)
5255 empty_pos = order_list.index('') if '' in order_list else list_length + 1
5256 if use_regex and value is not None:
5257 for i, regex in enumerate(order_list):
5258 if regex and re.match(regex, value):
5259 return list_length - i
5260 return list_length - empty_pos # not in list
5261 else: # not regex or value = None
5262 return list_length - (order_list.index(value) if value in order_list else empty_pos)
5263 else:
5264 if value.isnumeric():
5265 return float(value)
5266 else:
5267 self.settings[field]['convert'] = 'string'
5268 return value
5269
5270 def evaluate_params(self, params, sort_extractor):
5271 self._use_free_order = params.get('prefer_free_formats', False)
5272 self._sort_user = params.get('format_sort', [])
5273 self._sort_extractor = sort_extractor
5274
5275 def add_item(field, reverse, closest, limit_text):
5276 field = field.lower()
5277 if field in self._order:
5278 return
5279 self._order.append(field)
5280 limit = self._resolve_field_value(field, limit_text)
5281 data = {
5282 'reverse': reverse,
5283 'closest': False if limit is None else closest,
5284 'limit_text': limit_text,
5285 'limit': limit}
5286 if field in self.settings:
5287 self.settings[field].update(data)
5288 else:
5289 self.settings[field] = data
5290
5291 sort_list = (
5292 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5293 + (tuple() if params.get('format_sort_force', False)
5294 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5295 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5296
5297 for item in sort_list:
5298 match = re.match(self.regex, item)
5299 if match is None:
5300 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
5301 field = match.group('field')
5302 if field is None:
5303 continue
5304 if self._get_field_setting(field, 'type') == 'alias':
5305 alias, field = field, self._get_field_setting(field, 'field')
5306 if self._get_field_setting(alias, 'deprecated'):
5307 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5308 f'be removed in a future version. Please use {field} instead')
5309 reverse = match.group('reverse') is not None
5310 closest = match.group('separator') == '~'
5311 limit_text = match.group('limit')
5312
5313 has_limit = limit_text is not None
5314 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5315 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5316
5317 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
5318 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
5319 limit_count = len(limits)
5320 for (i, f) in enumerate(fields):
5321 add_item(f, reverse, closest,
5322 limits[i] if i < limit_count
5323 else limits[0] if has_limit and not has_multiple_limits
5324 else None)
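    # A minimal sketch of how a single sort field such as 'res:1080' is parsed by
    # the class-level regex above before being handed to add_item (values made up):
    #
    #     >>> m = re.match(FormatSorter.regex, 'res:1080')
    #     >>> m.group('reverse'), m.group('field'), m.group('separator'), m.group('limit')
    #     (None, 'res', ':', '1080')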
5325
5326 def print_verbose_info(self, write_debug):
5327 if self._sort_user:
5328 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
5329 if self._sort_extractor:
5330 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
5331 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
5332 '+' if self._get_field_setting(field, 'reverse') else '', field,
5333 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
5334 self._get_field_setting(field, 'limit_text'),
5335 self._get_field_setting(field, 'limit'))
5336 if self._get_field_setting(field, 'limit_text') is not None else '')
5337 for field in self._order if self._get_field_setting(field, 'visible')]))
5338
5339 def _calculate_field_preference_from_value(self, format, field, type, value):
5340 reverse = self._get_field_setting(field, 'reverse')
5341 closest = self._get_field_setting(field, 'closest')
5342 limit = self._get_field_setting(field, 'limit')
5343
5344 if type == 'extractor':
5345 maximum = self._get_field_setting(field, 'max')
5346 if value is None or (maximum is not None and value >= maximum):
5347 value = -1
5348 elif type == 'boolean':
5349 in_list = self._get_field_setting(field, 'in_list')
5350 not_in_list = self._get_field_setting(field, 'not_in_list')
5351 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
5352 elif type == 'ordered':
5353 value = self._resolve_field_value(field, value, True)
5354
5355 # try to convert to number
5356 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
5357 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
5358 if is_num:
5359 value = val_num
5360
5361 return ((-10, 0) if value is None
5362 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
5363 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
5364 else (0, value, 0) if not reverse and (limit is None or value <= limit)
5365 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
5366 else (-1, value, 0))
5367
5368 def _calculate_field_preference(self, format, field):
5369 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
5370 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
5371 if type == 'multiple':
5372 type = 'field' # Only 'field' is allowed in multiple for now
5373 actual_fields = self._get_field_setting(field, 'field')
5374
5375 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
5376 else:
5377 value = get_value(field)
5378 return self._calculate_field_preference_from_value(format, field, type, value)
5379
5380 def calculate_preference(self, format):
5381 # Determine missing protocol
5382 if not format.get('protocol'):
5383 format['protocol'] = determine_protocol(format)
5384
5385 # Determine missing ext
5386 if not format.get('ext') and 'url' in format:
5387 format['ext'] = determine_ext(format['url'])
5388 if format.get('vcodec') == 'none':
5389 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
5390 format['video_ext'] = 'none'
5391 else:
5392 format['video_ext'] = format['ext']
5393 format['audio_ext'] = 'none'
5394 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
5395 # format['preference'] = -1000
5396
5424dbaf 5397        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
5398            # HEVC-over-FLV is out of spec per the original FLV specification
5399 # ref. https://trac.ffmpeg.org/ticket/6389
5400 # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
5401 format['preference'] = -100
5402
d0d74b71 5403 # Determine missing bitrates
eedda525 5404 if format.get('vcodec') == 'none':
5405 format['vbr'] = 0
5406 if format.get('acodec') == 'none':
5407 format['abr'] = 0
5408 if not format.get('vbr') and format.get('vcodec') != 'none':
5409 format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
5410 if not format.get('abr') and format.get('acodec') != 'none':
5411 format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
5412 if not format.get('tbr'):
5413 format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None
d0d74b71 5414
5415 return tuple(self._calculate_field_preference(format, field) for field in self._order)
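# A minimal usage sketch (the ydl object and formats list are hypothetical):
# calculate_preference is intended to be used as a sort key, with better formats
# producing larger tuples and therefore sorting later.
#
#     sorter = FormatSorter(ydl, field_preference=[])
#     formats.sort(key=sorter.calculate_preference)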
1b392f90 5416
5417
86e3b822 5418def filesize_from_tbr(tbr, duration):
5419 """
5420 @param tbr: Total bitrate in kbps (1000 bits/sec)
5421 @param duration: Duration in seconds
5422 @returns Filesize in bytes
5423 """
5424 if tbr is None or duration is None:
5425 return None
5426 return int(duration * tbr * (1000 / 8))
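# e.g. a 128 kbps stream lasting 60 seconds: 60 * 128 * 1000 / 8 = 960000 bytes
#
#     >>> filesize_from_tbr(128, 60)
#     960000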
5427
5428
1b392f90 5429# XXX: Temporary
5430class _YDLLogger:
5431 def __init__(self, ydl=None):
5432 self._ydl = ydl
5433
5434 def debug(self, message):
5435 if self._ydl:
5436 self._ydl.write_debug(message)
5437
5438 def info(self, message):
5439 if self._ydl:
5440 self._ydl.to_screen(message)
5441
5442 def warning(self, message, *, once=False):
5443 if self._ydl:
3d2623a8 5444 self._ydl.report_warning(message, once)
1b392f90 5445
5446 def error(self, message, *, is_error=True):
5447 if self._ydl:
5448 self._ydl.report_error(message, is_error=is_error)
5449
5450 def stdout(self, message):
5451 if self._ydl:
5452 self._ydl.to_stdout(message)
5453
5454 def stderr(self, message):
5455 if self._ydl:
5456 self._ydl.to_stderr(message)