import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import collections.abc
import contextlib
import datetime
import email.header
import email.utils
import errno
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import netrc
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import unicodedata
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree

from . import traversal

from ..compat import functools  # isort: split
from ..compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from ..dependencies import websockets, xattr

__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}

class NO_DEFAULT:
    pass


def IDENTITY(x):
    return x


ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]

def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]

def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    if not value:
        return

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )

class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()

# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')

class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        raise compat_HTMLParseError('done')


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1

def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items

def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

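# Illustrative usage of clean_html() (doctest-style sketch):
#   >>> clean_html('<p>First line<br>Second &amp; last</p>')
#   'First line\nSecond & last'
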
class LenientJSONDecoder(json.JSONDecoder):
    # TODO: Write tests
    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    s = self._close_object(e)
                    if s is not None:
                        continue
                raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
        assert False, 'Too many attempts to decode JSON'

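# Illustrative usage of LenientJSONDecoder (doctest-style sketch):
#   >>> LenientJSONDecoder(ignore_extra=True).decode('{"status": "ok"} trailing garbage')
#   {'status': 'ok'}
#   >>> LenientJSONDecoder(close_objects=2).decode('{"a": [1, 2')  # auto-closes ] and }
#   {'a': [1, 2]}
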
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise

def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp

def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url

def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())

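# Illustrative usage of orderedSet() (doctest-style sketch):
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]
#   >>> list(orderedSet('abca', lazy=True))
#   ['a', 'b', 'c']
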

def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )

class netrc_from_content(netrc.netrc):
    def __init__(self, content):
        self.hosts, self.macros = {}, {}
        with io.StringIO(content) as stream:
            self._parse('-', stream, False)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode

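# Illustrative usage of Popen.run() (a sketch; assumes ffmpeg is on PATH):
#   stdout, stderr, returncode = Popen.run(
#       ['ffmpeg', '-version'], text=True,
#       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
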
def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret

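# Illustrative usage of formatSeconds() (doctest-style sketch):
#   >>> formatSeconds(3661)
#   '1:01:01'
#   >>> formatSeconds(61.5, msec=True)
#   '1:01.500'
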
def make_HTTPS_handler(params, **kwargs):
    from ..networking._helper import make_ssl_context
    return YoutubeDLHTTPSHandler(params, context=make_ssl_context(
        verify=not params.get('nocheckcertificate'),
        client_certificate=params.get('client_certificate'),
        client_certificate_key=params.get('client_certificate_key'),
        client_certificate_password=params.get('client_certificate_password'),
        legacy_support=params.get('legacyserverconnect'),
        use_certifi='no-certifi' not in params.get('compat_opts', []),
    ), **kwargs)


def bug_reports_message(before=';'):
    from ..update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg

class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        from ..networking.exceptions import network_exceptions
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)

class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)

class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass

class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            from ..networking._urllib import make_socks_conn_class
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        from ..networking._urllib import _create_http_connection
        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


def is_path_like(f):
    return isinstance(f, (str, bytes, os.PathLike))


class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return urllib.request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response

def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            date_str = date_str[:-len(m.group('tz'))]
        timezone = datetime.timedelta(hours=timezone or 0)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())

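# Illustrative usage of parse_iso8601() (doctest-style sketch; timestamps are UTC):
#   >>> parse_iso8601('2023-01-01T00:00:00Z')
#   1672531200
#   >>> parse_iso8601('2023-01-01T01:00:00+01:00')
#   1672531200
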
def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return str(upload_date)

def unified_timestamp(date_str, day_first=True):
    if not isinstance(date_str, str):
        return None

    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()

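# Illustrative usage of unified_timestamp() (doctest-style sketch; result is a UTC Unix timestamp):
#   >>> unified_timestamp('December 21, 2010')
#   1292889600
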
def determine_ext(url, default_ext='unknown_video'):
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext

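# Illustrative usage of determine_ext() (doctest-style sketch):
#   >>> determine_ext('http://example.com/foo/bar.mp4/?download')
#   'mp4'
#   >>> determine_ext('http://example.com/stream', default_ext='m3u8')
#   'm3u8'
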
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)


def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)

def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
        raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()

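# Illustrative usage of date_from_str() (a sketch; relative forms depend on the current date):
#   >>> date_from_str('20230115')
#   datetime.date(2023, 1, 15)
#   date_from_str('now-1week')  -> the date one week before today
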
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months."""
    month = dt.month + months - 1
    year = dt.year + month // 12
    month = month % 12 + 1
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)

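# Illustrative usage of datetime_add_months() (doctest-style sketch): the day is
# clamped to the length of the target month.
#   >>> datetime_add_months(datetime.datetime(2020, 1, 31), 1)
#   datetime.datetime(2020, 2, 29, 0, 0)
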
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    roundto = lambda x, n: ((x + n / 2) // n) * n
    timestamp = calendar.timegm(dt.timetuple())
    return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str

class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start, strict=True)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end, strict=True)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)

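# Illustrative usage of DateRange (doctest-style sketch):
#   >>> '20230615' in DateRange('20230101', '20231231')
#   True
#   >>> '20240101' in DateRange.day('20230615')
#   False
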
@functools.cache
def system_identifier():
    python_implementation = platform.python_implementation()
    if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
    libc_ver = []
    with contextlib.suppress(OSError):  # We may not have access to the executable
        libc_ver = platform.libc_ver()

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        python_implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )


@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name == 'nt':
        return version_tuple(platform.win32_ver()[1])
    else:
        return ()

def write_string(s, out=None, encoding=None):
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    enc, buffer = None, out
    if 'b' in getattr(out, 'mode', ''):
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        buffer = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    buffer.write(s.encode(enc, 'ignore') if enc else s)
    out.flush()


def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    from .. import _IN_CLI
    if _IN_CLI:
        if msg in deprecation_warning._cache:
            return
        deprecation_warning._cache.add(msg)
        if printer:
            return printer(f'{msg}{bug_reports_message()}', **kwargs)
        return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
    else:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)


deprecation_warning._cache = set()


def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return struct.pack('%dB' % len(xs), *xs)

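# Illustrative usage of the byte/int-list helpers (doctest-style sketch):
#   >>> bytes_to_intlist(b'hi')
#   [104, 105]
#   >>> intlist_to_bytes([104, 105])
#   b'hi'
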
8a82af35 1523class LockingUnsupportedError(OSError):
1890fc63 1524 msg = 'File locking is not supported'
0edb3e33 1525
1526 def __init__(self):
1527 super().__init__(self.msg)
1528
1529
c1c9a79c
PH
1530# Cross-platform file locking
1531if sys.platform == 'win32':
fe0918bb 1532 import ctypes
c1c9a79c
PH
1533 import ctypes.wintypes
1534 import msvcrt
1535
1536 class OVERLAPPED(ctypes.Structure):
1537 _fields_ = [
1538 ('Internal', ctypes.wintypes.LPVOID),
1539 ('InternalHigh', ctypes.wintypes.LPVOID),
1540 ('Offset', ctypes.wintypes.DWORD),
1541 ('OffsetHigh', ctypes.wintypes.DWORD),
1542 ('hEvent', ctypes.wintypes.HANDLE),
1543 ]
1544
37e325b9 1545 kernel32 = ctypes.WinDLL('kernel32')
c1c9a79c
PH
1546 LockFileEx = kernel32.LockFileEx
1547 LockFileEx.argtypes = [
1548 ctypes.wintypes.HANDLE, # hFile
1549 ctypes.wintypes.DWORD, # dwFlags
1550 ctypes.wintypes.DWORD, # dwReserved
1551 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1552 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1553 ctypes.POINTER(OVERLAPPED) # Overlapped
1554 ]
1555 LockFileEx.restype = ctypes.wintypes.BOOL
1556 UnlockFileEx = kernel32.UnlockFileEx
1557 UnlockFileEx.argtypes = [
1558 ctypes.wintypes.HANDLE, # hFile
1559 ctypes.wintypes.DWORD, # dwReserved
1560 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1561 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1562 ctypes.POINTER(OVERLAPPED) # Overlapped
1563 ]
1564 UnlockFileEx.restype = ctypes.wintypes.BOOL
1565 whole_low = 0xffffffff
1566 whole_high = 0x7fffffff
1567
747c0bd1 1568 def _lock_file(f, exclusive, block):
c1c9a79c
PH
1569 overlapped = OVERLAPPED()
1570 overlapped.Offset = 0
1571 overlapped.OffsetHigh = 0
1572 overlapped.hEvent = 0
1573 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
747c0bd1 1574
1575 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
1576 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
1577 0, whole_low, whole_high, f._lock_file_overlapped_p):
2cb19820 1578 # NB: The no-argument form of "ctypes.FormatError" does not work on PyPy
1579 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
c1c9a79c
PH
1580
1581 def _unlock_file(f):
1582 assert f._lock_file_overlapped_p
1583 handle = msvcrt.get_osfhandle(f.fileno())
747c0bd1 1584 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
c1c9a79c
PH
1585 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1586
1587else:
399a76e6
YCH
1588 try:
1589 import fcntl
c1c9a79c 1590
a3125791 1591 def _lock_file(f, exclusive, block):
b63837bc 1592 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
1593 if not block:
1594 flags |= fcntl.LOCK_NB
acea8d7c 1595 try:
b63837bc 1596 fcntl.flock(f, flags)
acea8d7c
JK
1597 except BlockingIOError:
1598 raise
1599 except OSError: # AOSP does not have flock()
b63837bc 1600 fcntl.lockf(f, flags)
c1c9a79c 1601
399a76e6 1602 def _unlock_file(f):
45998b3e
E
1603 with contextlib.suppress(OSError):
1604 return fcntl.flock(f, fcntl.LOCK_UN)
1605 with contextlib.suppress(OSError):
1606 return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock()
1607 return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking
a3125791 1608
399a76e6 1609 except ImportError:
399a76e6 1610
a3125791 1611 def _lock_file(f, exclusive, block):
0edb3e33 1612 raise LockingUnsupportedError()
399a76e6
YCH
1613
1614 def _unlock_file(f):
0edb3e33 1615 raise LockingUnsupportedError()
c1c9a79c
PH
1616
1617
86e5f3ed 1618class locked_file:
0edb3e33 1619 locked = False
747c0bd1 1620
a3125791 1621 def __init__(self, filename, mode, block=True, encoding=None):
fcfa8853
JK
1622 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
1623 raise NotImplementedError(mode)
1624 self.mode, self.block = mode, block
1625
1626 writable = any(f in mode for f in 'wax+')
1627 readable = any(f in mode for f in 'r+')
1628 flags = functools.reduce(operator.ior, (
1629 getattr(os, 'O_CLOEXEC', 0), # UNIX only
1630 getattr(os, 'O_BINARY', 0), # Windows only
1631 getattr(os, 'O_NOINHERIT', 0), # Windows only
1632 os.O_CREAT if writable else 0, # O_TRUNC only after locking
1633 os.O_APPEND if 'a' in mode else 0,
1634 os.O_EXCL if 'x' in mode else 0,
1635 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
1636 ))
1637
98804d03 1638 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
c1c9a79c
PH
1639
1640 def __enter__(self):
a3125791 1641 exclusive = 'r' not in self.mode
c1c9a79c 1642 try:
a3125791 1643 _lock_file(self.f, exclusive, self.block)
0edb3e33 1644 self.locked = True
86e5f3ed 1645 except OSError:
c1c9a79c
PH
1646 self.f.close()
1647 raise
fcfa8853 1648 if 'w' in self.mode:
131e14dc
JK
1649 try:
1650 self.f.truncate()
1651 except OSError as e:
1890fc63 1652 if e.errno not in (
1653 errno.ESPIPE, # Illegal seek - expected for FIFO
1654 errno.EINVAL, # Invalid argument - expected for /dev/null
1655 ):
1656 raise
c1c9a79c
PH
1657 return self
1658
0edb3e33 1659 def unlock(self):
1660 if not self.locked:
1661 return
c1c9a79c 1662 try:
0edb3e33 1663 _unlock_file(self.f)
c1c9a79c 1664 finally:
0edb3e33 1665 self.locked = False
c1c9a79c 1666
0edb3e33 1667 def __exit__(self, *_):
1668 try:
1669 self.unlock()
1670 finally:
1671 self.f.close()
4eb7f1d1 1672
0edb3e33 1673 open = __enter__
1674 close = __exit__
a3125791 1675
0edb3e33 1676 def __getattr__(self, attr):
1677 return getattr(self.f, attr)
a3125791 1678
0edb3e33 1679 def __iter__(self):
1680 return iter(self.f)
a3125791 1681
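# Illustrative sketch (not part of the original file): typical use of locked_file
# as a context manager; the filename below is hypothetical.
#
#     with locked_file('progress.txt', 'a', block=True, encoding='utf-8') as f:
#         f.write('done\n')  # attribute access is delegated to the underlying file object
#
# The lock is released and the file closed when the "with" block exits.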
4eb7f1d1 1682
0b9c08b4 1683@functools.cache
4644ac55
S
1684def get_filesystem_encoding():
1685 encoding = sys.getfilesystemencoding()
1686 return encoding if encoding is not None else 'utf-8'
1687
1688
4eb7f1d1 1689def shell_quote(args):
a6a173c2 1690 quoted_args = []
4644ac55 1691 encoding = get_filesystem_encoding()
a6a173c2
JMF
1692 for a in args:
1693 if isinstance(a, bytes):
1694 # We may get a filename encoded with 'encodeFilename'
1695 a = a.decode(encoding)
aefce8e6 1696 quoted_args.append(compat_shlex_quote(a))
28e614de 1697 return ' '.join(quoted_args)
9d4660ca
PH
1698
1699
1700def smuggle_url(url, data):
1701 """ Pass additional data in a URL for internal use. """
1702
81953d1a
RA
1703 url, idata = unsmuggle_url(url, {})
1704 data.update(idata)
14f25df2 1705 sdata = urllib.parse.urlencode(
28e614de
PH
1706 {'__youtubedl_smuggle': json.dumps(data)})
1707 return url + '#' + sdata
9d4660ca
PH
1708
1709
79f82953 1710def unsmuggle_url(smug_url, default=None):
83e865a3 1711 if '#__youtubedl_smuggle' not in smug_url:
79f82953 1712 return smug_url, default
28e614de 1713 url, _, sdata = smug_url.rpartition('#')
14f25df2 1714 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
1715 data = json.loads(jsond)
1716 return url, data
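# Illustrative sketch (not part of the original file): round-tripping data through
# a URL fragment with smuggle_url/unsmuggle_url; the URL and dict are made up.
#
#     >>> url = smuggle_url('https://example.com/video', {'referer': 'https://example.com/'})
#     >>> unsmuggle_url(url)
#     ('https://example.com/video', {'referer': 'https://example.com/'})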
02dbf93f
PH
1717
1718
e0fd9573 1719def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
1720 """ Formats numbers with decimal sufixes like K, M, etc """
1721 num, factor = float_or_none(num), float(factor)
4c3f8c3f 1722 if num is None or num < 0:
e0fd9573 1723 return None
eeb2a770 1724 POSSIBLE_SUFFIXES = 'kMGTPEZY'
1725 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
1726 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
abbeeebc 1727 if factor == 1024:
1728 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 1729 converted = num / (factor ** exponent)
abbeeebc 1730 return fmt % (converted, suffix)
e0fd9573 1731
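# Illustrative sketch (not part of the original file): expected behaviour of
# format_decimal_suffix for a few representative inputs.
#
#     >>> format_decimal_suffix(1500)
#     '1k'
#     >>> format_decimal_suffix(3_000_000, '%.1f%s')
#     '3.0M'
#     >>> format_decimal_suffix(1536, '%.2f%sB', factor=1024)  # binary factor switches to Ki/Mi/...
#     '1.50KiB'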
1732
02dbf93f 1733def format_bytes(bytes):
f02d24d8 1734 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
f53c966a 1735
1c088fa8 1736
64c464a1 1737def lookup_unit_table(unit_table, s, strict=False):
1738 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
fb47597b 1739 units_re = '|'.join(re.escape(u) for u in unit_table)
64c464a1 1740 m = (re.fullmatch if strict else re.match)(
1741 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
fb47597b
S
1742 if not m:
1743 return None
64c464a1 1744
1745 num = float(m.group('num').replace(',', '.'))
fb47597b 1746 mult = unit_table[m.group('unit')]
64c464a1 1747 return round(num * mult)
1748
1749
1750def parse_bytes(s):
1751 """Parse a string indicating a byte quantity into an integer"""
1752 return lookup_unit_table(
1753 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
1754 s.upper(), strict=True)
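# Illustrative sketch (not part of the original file): parse_bytes treats the
# K/M/G/... suffixes as binary multiples (powers of 1024).
#
#     >>> parse_bytes('500K')
#     512000
#     >>> parse_bytes('2M')
#     2097152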
fb47597b
S
1755
1756
be64b5b0
PH
1757def parse_filesize(s):
1758 if s is None:
1759 return None
1760
dfb1b146 1761 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
1762 # but we support those too
1763 _UNIT_TABLE = {
1764 'B': 1,
1765 'b': 1,
70852b47 1766 'bytes': 1,
be64b5b0
PH
1767 'KiB': 1024,
1768 'KB': 1000,
1769 'kB': 1024,
1770 'Kb': 1000,
13585d76 1771 'kb': 1000,
70852b47
YCH
1772 'kilobytes': 1000,
1773 'kibibytes': 1024,
be64b5b0
PH
1774 'MiB': 1024 ** 2,
1775 'MB': 1000 ** 2,
1776 'mB': 1024 ** 2,
1777 'Mb': 1000 ** 2,
13585d76 1778 'mb': 1000 ** 2,
70852b47
YCH
1779 'megabytes': 1000 ** 2,
1780 'mebibytes': 1024 ** 2,
be64b5b0
PH
1781 'GiB': 1024 ** 3,
1782 'GB': 1000 ** 3,
1783 'gB': 1024 ** 3,
1784 'Gb': 1000 ** 3,
13585d76 1785 'gb': 1000 ** 3,
70852b47
YCH
1786 'gigabytes': 1000 ** 3,
1787 'gibibytes': 1024 ** 3,
be64b5b0
PH
1788 'TiB': 1024 ** 4,
1789 'TB': 1000 ** 4,
1790 'tB': 1024 ** 4,
1791 'Tb': 1000 ** 4,
13585d76 1792 'tb': 1000 ** 4,
70852b47
YCH
1793 'terabytes': 1000 ** 4,
1794 'tebibytes': 1024 ** 4,
be64b5b0
PH
1795 'PiB': 1024 ** 5,
1796 'PB': 1000 ** 5,
1797 'pB': 1024 ** 5,
1798 'Pb': 1000 ** 5,
13585d76 1799 'pb': 1000 ** 5,
70852b47
YCH
1800 'petabytes': 1000 ** 5,
1801 'pebibytes': 1024 ** 5,
be64b5b0
PH
1802 'EiB': 1024 ** 6,
1803 'EB': 1000 ** 6,
1804 'eB': 1024 ** 6,
1805 'Eb': 1000 ** 6,
13585d76 1806 'eb': 1000 ** 6,
70852b47
YCH
1807 'exabytes': 1000 ** 6,
1808 'exbibytes': 1024 ** 6,
be64b5b0
PH
1809 'ZiB': 1024 ** 7,
1810 'ZB': 1000 ** 7,
1811 'zB': 1024 ** 7,
1812 'Zb': 1000 ** 7,
13585d76 1813 'zb': 1000 ** 7,
70852b47
YCH
1814 'zettabytes': 1000 ** 7,
1815 'zebibytes': 1024 ** 7,
be64b5b0
PH
1816 'YiB': 1024 ** 8,
1817 'YB': 1000 ** 8,
1818 'yB': 1024 ** 8,
1819 'Yb': 1000 ** 8,
13585d76 1820 'yb': 1000 ** 8,
70852b47
YCH
1821 'yottabytes': 1000 ** 8,
1822 'yobibytes': 1024 ** 8,
be64b5b0
PH
1823 }
1824
fb47597b
S
1825 return lookup_unit_table(_UNIT_TABLE, s)
1826
1827
1828def parse_count(s):
1829 if s is None:
be64b5b0
PH
1830 return None
1831
352d5da8 1832 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
1833
1834 if re.match(r'^[\d,.]+$', s):
1835 return str_to_int(s)
1836
1837 _UNIT_TABLE = {
1838 'k': 1000,
1839 'K': 1000,
1840 'm': 1000 ** 2,
1841 'M': 1000 ** 2,
1842 'kk': 1000 ** 2,
1843 'KK': 1000 ** 2,
352d5da8 1844 'b': 1000 ** 3,
1845 'B': 1000 ** 3,
fb47597b 1846 }
be64b5b0 1847
352d5da8 1848 ret = lookup_unit_table(_UNIT_TABLE, s)
1849 if ret is not None:
1850 return ret
1851
1852 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
1853 if mobj:
1854 return str_to_int(mobj.group(1))
be64b5b0 1855
2f7ae819 1856
5d45484c 1857def parse_resolution(s, *, lenient=False):
b871d7e9
S
1858 if s is None:
1859 return {}
1860
5d45484c
LNO
1861 if lenient:
1862 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
1863 else:
1864 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
1865 if mobj:
1866 return {
1867 'width': int(mobj.group('w')),
1868 'height': int(mobj.group('h')),
1869 }
1870
17ec8bcf 1871 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
1872 if mobj:
1873 return {'height': int(mobj.group(1))}
1874
1875 mobj = re.search(r'\b([48])[kK]\b', s)
1876 if mobj:
1877 return {'height': int(mobj.group(1)) * 540}
1878
1879 return {}
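# Illustrative sketch (not part of the original file): the dicts parse_resolution
# returns for common resolution strings.
#
#     >>> parse_resolution('1920x1080')
#     {'width': 1920, 'height': 1080}
#     >>> parse_resolution('720p')
#     {'height': 720}
#     >>> parse_resolution('4k')
#     {'height': 2160}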
1880
1881
0dc41787 1882def parse_bitrate(s):
14f25df2 1883 if not isinstance(s, str):
0dc41787
S
1884 return
1885 mobj = re.search(r'\b(\d+)\s*kbps', s)
1886 if mobj:
1887 return int(mobj.group(1))
1888
1889
a942d6cb 1890def month_by_name(name, lang='en'):
caefb1de
PH
1891 """ Return the number of a month by (locale-independently) English name """
1892
f6717dec 1893 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 1894
caefb1de 1895 try:
f6717dec 1896 return month_names.index(name) + 1
7105440c
YCH
1897 except ValueError:
1898 return None
1899
1900
1901def month_by_abbreviation(abbrev):
1902 """ Return the number of a month by (locale-independently) English
1903 abbreviations """
1904
1905 try:
1906 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
1907 except ValueError:
1908 return None
18258362
JMF
1909
1910
5aafe895 1911def fix_xml_ampersands(xml_str):
18258362 1912 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
1913 return re.sub(
1914 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 1915 '&amp;',
5aafe895 1916 xml_str)
e3946f98
PH
1917
1918
1919def setproctitle(title):
14f25df2 1920 assert isinstance(title, str)
c1c05c67 1921
fe0918bb 1922 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
1923 try:
1924 import ctypes
1925 except ImportError:
c1c05c67
YCH
1926 return
1927
e3946f98 1928 try:
611c1dd9 1929 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
1930 except OSError:
1931 return
2f49bcd6
RC
1932 except TypeError:
1933 # LoadLibrary in Windows Python 2.7.13 only expects
1934 # a bytestring, but since unicode_literals turns
1935 # every string into a unicode string, it fails.
1936 return
0f06bcd7 1937 title_bytes = title.encode()
6eefe533
PH
1938 buf = ctypes.create_string_buffer(len(title_bytes))
1939 buf.value = title_bytes
e3946f98 1940 try:
6eefe533 1941 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
1942 except AttributeError:
1943 return # Strange libc, just skip this
d7dda168
PH
1944
1945
1946def remove_start(s, start):
46bc9b7d 1947 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
1948
1949
2b9faf55 1950def remove_end(s, end):
46bc9b7d 1951 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
1952
1953
31b2051e
S
1954def remove_quotes(s):
1955 if s is None or len(s) < 2:
1956 return s
1957 for quote in ('"', "'", ):
1958 if s[0] == quote and s[-1] == quote:
1959 return s[1:-1]
1960 return s
1961
1962
b6e0c7d2 1963def get_domain(url):
ebf99aaf 1964 """
1965 This implementation is inconsistent, but is kept for compatibility.
1966 Use this only for "webpage_url_domain"
1967 """
1968 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
b6e0c7d2
U
1969
1970
29eb5174 1971def url_basename(url):
14f25df2 1972 path = urllib.parse.urlparse(url).path
28e614de 1973 return path.strip('/').split('/')[-1]
aa94a6d3
PH
1974
1975
02dc0a36 1976def base_url(url):
7657ec7e 1977 return re.match(r'https?://[^?#]+/', url).group()
02dc0a36
S
1978
1979
e34c3361 1980def urljoin(base, path):
4b5de77b 1981 if isinstance(path, bytes):
0f06bcd7 1982 path = path.decode()
14f25df2 1983 if not isinstance(path, str) or not path:
e34c3361 1984 return None
fad4ceb5 1985 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 1986 return path
4b5de77b 1987 if isinstance(base, bytes):
0f06bcd7 1988 base = base.decode()
14f25df2 1989 if not isinstance(base, str) or not re.match(
4b5de77b 1990 r'^(?:https?:)?//', base):
e34c3361 1991 return None
14f25df2 1992 return urllib.parse.urljoin(base, path)
e34c3361
S
1993
1994
9732d77e 1995def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
9e907ebd 1996 if get_attr and v is not None:
1997 v = getattr(v, get_attr, None)
1812afb7
S
1998 try:
1999 return int(v) * invscale // scale
31c49255 2000 except (ValueError, TypeError, OverflowError):
af98f8ff 2001 return default
9732d77e 2002
9572013d 2003
40a90862 2004def str_or_none(v, default=None):
14f25df2 2005 return default if v is None else str(v)
40a90862 2006
9732d77e
PH
2007
2008def str_to_int(int_str):
48d4681e 2009 """ A more relaxed version of int_or_none """
f9934b96 2010 if isinstance(int_str, int):
348c6bf1 2011 return int_str
14f25df2 2012 elif isinstance(int_str, str):
42db58ec
S
2013 int_str = re.sub(r'[,\.\+]', '', int_str)
2014 return int_or_none(int_str)
608d11f5
PH
2015
2016
9732d77e 2017def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
2018 if v is None:
2019 return default
2020 try:
2021 return float(v) * invscale / scale
5e1271c5 2022 except (ValueError, TypeError):
caf80631 2023 return default
43f775e4
PH
2024
2025
c7e327c4
S
2026def bool_or_none(v, default=None):
2027 return v if isinstance(v, bool) else default
2028
2029
53cd37ba 2030def strip_or_none(v, default=None):
14f25df2 2031 return v.strip() if isinstance(v, str) else default
b72b4431
S
2032
2033
af03000a 2034def url_or_none(url):
14f25df2 2035 if not url or not isinstance(url, str):
af03000a
S
2036 return None
2037 url = url.strip()
29f7c58a 2038 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2039
2040
3e9b66d7 2041def request_to_url(req):
ac668111 2042 if isinstance(req, urllib.request.Request):
3e9b66d7
LNO
2043 return req.get_full_url()
2044 else:
2045 return req
2046
2047
ad54c913 2048def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
e29663c6 2049 datetime_object = None
2050 try:
f9934b96 2051 if isinstance(timestamp, (int, float)): # unix timestamp
d509c1f5 2052 # Using naive datetime here can break timestamp() in Windows
2053 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
a35af430 2054 # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
2055 # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
2056 datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
2057 + datetime.timedelta(seconds=timestamp))
14f25df2 2058 elif isinstance(timestamp, str): # assume YYYYMMDD
e29663c6 2059 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
9665f15a 2060 date_format = re.sub( # Support %s on windows
2061 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
e29663c6 2062 return datetime_object.strftime(date_format)
2063 except (ValueError, TypeError, AttributeError):
2064 return default
2065
2066
608d11f5 2067def parse_duration(s):
f9934b96 2068 if not isinstance(s, str):
608d11f5 2069 return None
ca7b3246 2070 s = s.strip()
38d79fd1 2071 if not s:
2072 return None
ca7b3246 2073
acaff495 2074 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2075 m = re.match(r'''(?x)
2076 (?P<before_secs>
2077 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2078 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2079 (?P<ms>[.:][0-9]+)?Z?$
2080 ''', s)
acaff495 2081 if m:
8bd1c00b 2082 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2083 else:
2084 m = re.match(
056653bb
S
2085 r'''(?ix)(?:P?
2086 (?:
1c1b2f96 2087 [0-9]+\s*y(?:ears?)?,?\s*
056653bb
S
2088 )?
2089 (?:
1c1b2f96 2090 [0-9]+\s*m(?:onths?)?,?\s*
056653bb
S
2091 )?
2092 (?:
1c1b2f96 2093 [0-9]+\s*w(?:eeks?)?,?\s*
056653bb 2094 )?
8f4b58d7 2095 (?:
1c1b2f96 2096 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
8f4b58d7 2097 )?
056653bb 2098 T)?
acaff495 2099 (?:
1c1b2f96 2100 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
acaff495 2101 )?
2102 (?:
1c1b2f96 2103 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
acaff495 2104 )?
2105 (?:
2106 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2107 )?Z?$''', s)
acaff495 2108 if m:
2109 days, hours, mins, secs, ms = m.groups()
2110 else:
15846398 2111 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2112 if m:
2113 hours, mins = m.groups()
2114 else:
2115 return None
2116
acaff495 2117 if ms:
19a03940 2118 ms = ms.replace(':', '.')
2119 return sum(float(part or 0) * mult for part, mult in (
2120 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
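# Illustrative sketch (not part of the original file): parse_duration accepts both
# clock-style and ISO 8601-style strings and returns seconds as a float.
#
#     >>> parse_duration('1:02:30')
#     3750.0
#     >>> parse_duration('PT1H30M')
#     5400.0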
91d7d0b3
JMF
2121
2122
e65e4c88 2123def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2124 name, real_ext = os.path.splitext(filename)
e65e4c88 2125 return (
86e5f3ed 2126 f'{name}.{ext}{real_ext}'
e65e4c88 2127 if not expected_real_ext or real_ext[1:] == expected_real_ext
86e5f3ed 2128 else f'{filename}.{ext}')
d70ad093
PH
2129
2130
b3ed15b7
S
2131def replace_extension(filename, ext, expected_real_ext=None):
2132 name, real_ext = os.path.splitext(filename)
86e5f3ed 2133 return '{}.{}'.format(
b3ed15b7
S
2134 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2135 ext)
2136
2137
d70ad093
PH
2138def check_executable(exe, args=[]):
2139 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2140 args can be a list of arguments for a short output (like -version) """
2141 try:
f0c9fb96 2142 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
d70ad093
PH
2143 except OSError:
2144 return False
2145 return exe
b7ab0590
PH
2146
2147
7aaf4cd2 2148def _get_exe_version_output(exe, args):
95807118 2149 try:
b64d04c1 2150 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2151 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2152 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
1cdda329 2153 stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
2154 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2155 if ret:
2156 return None
95807118
PH
2157 except OSError:
2158 return False
f0c9fb96 2159 return stdout
cae97f65
PH
2160
2161
2162def detect_exe_version(output, version_re=None, unrecognized='present'):
14f25df2 2163 assert isinstance(output, str)
cae97f65
PH
2164 if version_re is None:
2165 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2166 m = re.search(version_re, output)
95807118
PH
2167 if m:
2168 return m.group(1)
2169 else:
2170 return unrecognized
2171
2172
9af98e17 2173def get_exe_version(exe, args=['--version'],
1cdda329 2174 version_re=None, unrecognized=('present', 'broken')):
9af98e17 2175 """ Returns the version of the specified executable,
2176 or False if the executable is not present """
1cdda329 2177 unrecognized = variadic(unrecognized)
2178 assert len(unrecognized) in (1, 2)
9af98e17 2179 out = _get_exe_version_output(exe, args)
1cdda329 2180 if out is None:
2181 return unrecognized[-1]
2182 return out and detect_exe_version(out, version_re, unrecognized[0])
9af98e17 2183
2184
7e88d7d7 2185def frange(start=0, stop=None, step=1):
2186 """Float range"""
2187 if stop is None:
2188 start, stop = 0, start
2189 sign = [-1, 1][step > 0] if step else 0
2190 while sign * start < sign * stop:
2191 yield start
2192 start += step
2193
2194
cb89cfc1 2195class LazyList(collections.abc.Sequence):
0f06bcd7 2196 """Lazy immutable list from an iterable
2197 Note that slices of a LazyList are lists and not LazyList"""
483336e7 2198
8e5fecc8 2199 class IndexError(IndexError):
2200 pass
2201
282f5709 2202 def __init__(self, iterable, *, reverse=False, _cache=None):
0f06bcd7 2203 self._iterable = iter(iterable)
2204 self._cache = [] if _cache is None else _cache
2205 self._reversed = reverse
483336e7 2206
2207 def __iter__(self):
0f06bcd7 2208 if self._reversed:
28419ca2 2209 # We need to consume the entire iterable to iterate in reverse
981052c9 2210 yield from self.exhaust()
28419ca2 2211 return
0f06bcd7 2212 yield from self._cache
2213 for item in self._iterable:
2214 self._cache.append(item)
483336e7 2215 yield item
2216
0f06bcd7 2217 def _exhaust(self):
2218 self._cache.extend(self._iterable)
2219 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2220 return self._cache
28419ca2 2221
981052c9 2222 def exhaust(self):
0f06bcd7 2223 """Evaluate the entire iterable"""
2224 return self._exhaust()[::-1 if self._reversed else 1]
981052c9 2225
28419ca2 2226 @staticmethod
0f06bcd7 2227 def _reverse_index(x):
f2df4071 2228 return None if x is None else ~x
483336e7 2229
2230 def __getitem__(self, idx):
2231 if isinstance(idx, slice):
0f06bcd7 2232 if self._reversed:
2233 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
e0f2b4b4 2234 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2235 elif isinstance(idx, int):
0f06bcd7 2236 if self._reversed:
2237 idx = self._reverse_index(idx)
e0f2b4b4 2238 start, stop, step = idx, idx, 0
483336e7 2239 else:
2240 raise TypeError('indices must be integers or slices')
e0f2b4b4 2241 if ((start or 0) < 0 or (stop or 0) < 0
2242 or (start is None and step < 0)
2243 or (stop is None and step > 0)):
483336e7 2244 # We need to consume the entire iterable to be able to slice from the end
2245 # Obviously, never use this with infinite iterables
0f06bcd7 2246 self._exhaust()
8e5fecc8 2247 try:
0f06bcd7 2248 return self._cache[idx]
8e5fecc8 2249 except IndexError as e:
2250 raise self.IndexError(e) from e
0f06bcd7 2251 n = max(start or 0, stop or 0) - len(self._cache) + 1
28419ca2 2252 if n > 0:
0f06bcd7 2253 self._cache.extend(itertools.islice(self._iterable, n))
8e5fecc8 2254 try:
0f06bcd7 2255 return self._cache[idx]
8e5fecc8 2256 except IndexError as e:
2257 raise self.IndexError(e) from e
483336e7 2258
2259 def __bool__(self):
2260 try:
0f06bcd7 2261 self[-1] if self._reversed else self[0]
8e5fecc8 2262 except self.IndexError:
483336e7 2263 return False
2264 return True
2265
2266 def __len__(self):
0f06bcd7 2267 self._exhaust()
2268 return len(self._cache)
483336e7 2269
282f5709 2270 def __reversed__(self):
0f06bcd7 2271 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
282f5709 2272
2273 def __copy__(self):
0f06bcd7 2274 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
282f5709 2275
28419ca2 2276 def __repr__(self):
2277 # repr and str should mimic a list. So we exhaust the iterable
2278 return repr(self.exhaust())
2279
2280 def __str__(self):
2281 return repr(self.exhaust())
2282
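# Illustrative sketch (not part of the original file): LazyList only consumes as
# much of the underlying iterable as indexing requires, so it can wrap an
# infinite iterator; note that slices come back as plain lists.
#
#     >>> ll = LazyList(itertools.count())
#     >>> ll[3]
#     3
#     >>> ll[:5]
#     [0, 1, 2, 3, 4]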
483336e7 2283
7be9ccff 2284class PagedList:
c07a39ae 2285
2286 class IndexError(IndexError):
2287 pass
2288
dd26ced1
PH
2289 def __len__(self):
2290 # This is only useful for tests
2291 return len(self.getslice())
2292
7be9ccff 2293 def __init__(self, pagefunc, pagesize, use_cache=True):
2294 self._pagefunc = pagefunc
2295 self._pagesize = pagesize
f1d13090 2296 self._pagecount = float('inf')
7be9ccff 2297 self._use_cache = use_cache
2298 self._cache = {}
2299
2300 def getpage(self, pagenum):
d8cf8d97 2301 page_results = self._cache.get(pagenum)
2302 if page_results is None:
f1d13090 2303 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
7be9ccff 2304 if self._use_cache:
2305 self._cache[pagenum] = page_results
2306 return page_results
2307
2308 def getslice(self, start=0, end=None):
2309 return list(self._getslice(start, end))
2310
2311 def _getslice(self, start, end):
55575225 2312 raise NotImplementedError('This method must be implemented by subclasses')
2313
2314 def __getitem__(self, idx):
f1d13090 2315 assert self._use_cache, 'Indexing PagedList requires cache'
55575225 2316 if not isinstance(idx, int) or idx < 0:
2317 raise TypeError('indices must be non-negative integers')
2318 entries = self.getslice(idx, idx + 1)
d8cf8d97 2319 if not entries:
c07a39ae 2320 raise self.IndexError()
d8cf8d97 2321 return entries[0]
55575225 2322
9c44d242
PH
2323
2324class OnDemandPagedList(PagedList):
a44ca5a4 2325 """Download pages until a page with less than maximum results"""
86e5f3ed 2326
7be9ccff 2327 def _getslice(self, start, end):
b7ab0590
PH
2328 for pagenum in itertools.count(start // self._pagesize):
2329 firstid = pagenum * self._pagesize
2330 nextfirstid = pagenum * self._pagesize + self._pagesize
2331 if start >= nextfirstid:
2332 continue
2333
b7ab0590
PH
2334 startv = (
2335 start % self._pagesize
2336 if firstid <= start < nextfirstid
2337 else 0)
b7ab0590
PH
2338 endv = (
2339 ((end - 1) % self._pagesize) + 1
2340 if (end is not None and firstid <= end <= nextfirstid)
2341 else None)
2342
f1d13090 2343 try:
2344 page_results = self.getpage(pagenum)
2345 except Exception:
2346 self._pagecount = pagenum - 1
2347 raise
b7ab0590
PH
2348 if startv != 0 or endv is not None:
2349 page_results = page_results[startv:endv]
7be9ccff 2350 yield from page_results
b7ab0590
PH
2351
2352 # A little optimization - if the current page is not "full", i.e. does
2353 # not contain page_size videos, then we can assume that this page
2354 # is the last one - there are no more ids on further pages -
2355 # i.e. no need to query again.
2356 if len(page_results) + startv < self._pagesize:
2357 break
2358
2359 # If we got the whole page, but the next page is not interesting,
2360 # break out early as well
2361 if end == nextfirstid:
2362 break
81c2f20b
PH
2363
2364
9c44d242 2365class InAdvancePagedList(PagedList):
a44ca5a4 2366 """PagedList with total number of pages known in advance"""
86e5f3ed 2367
9c44d242 2368 def __init__(self, pagefunc, pagecount, pagesize):
7be9ccff 2369 PagedList.__init__(self, pagefunc, pagesize, True)
f1d13090 2370 self._pagecount = pagecount
9c44d242 2371
7be9ccff 2372 def _getslice(self, start, end):
9c44d242 2373 start_page = start // self._pagesize
d37707bd 2374 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
9c44d242
PH
2375 skip_elems = start - start_page * self._pagesize
2376 only_more = None if end is None else end - start
2377 for pagenum in range(start_page, end_page):
7be9ccff 2378 page_results = self.getpage(pagenum)
9c44d242 2379 if skip_elems:
7be9ccff 2380 page_results = page_results[skip_elems:]
9c44d242
PH
2381 skip_elems = None
2382 if only_more is not None:
7be9ccff 2383 if len(page_results) < only_more:
2384 only_more -= len(page_results)
9c44d242 2385 else:
7be9ccff 2386 yield from page_results[:only_more]
9c44d242 2387 break
7be9ccff 2388 yield from page_results
9c44d242
PH
2389
2390
7e88d7d7 2391class PlaylistEntries:
2392 MissingEntry = object()
2393 is_exhausted = False
2394
2395 def __init__(self, ydl, info_dict):
7e9a6125 2396 self.ydl = ydl
2397
2398 # _entries must be assigned now since infodict can change during iteration
2399 entries = info_dict.get('entries')
2400 if entries is None:
2401 raise EntryNotInPlaylist('There are no entries')
2402 elif isinstance(entries, list):
2403 self.is_exhausted = True
2404
2405 requested_entries = info_dict.get('requested_entries')
bc5c2f8a 2406 self.is_incomplete = requested_entries is not None
7e9a6125 2407 if self.is_incomplete:
2408 assert self.is_exhausted
bc5c2f8a 2409 self._entries = [self.MissingEntry] * max(requested_entries or [0])
7e9a6125 2410 for i, entry in zip(requested_entries, entries):
2411 self._entries[i - 1] = entry
2412 elif isinstance(entries, (list, PagedList, LazyList)):
2413 self._entries = entries
2414 else:
2415 self._entries = LazyList(entries)
7e88d7d7 2416
2417 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2418 (?P<start>[+-]?\d+)?
2419 (?P<range>[:-]
2420 (?P<end>[+-]?\d+|inf(?:inite)?)?
2421 (?::(?P<step>[+-]?\d+))?
2422 )?''')
2423
2424 @classmethod
2425 def parse_playlist_items(cls, string):
2426 for segment in string.split(','):
2427 if not segment:
2428 raise ValueError('There are two or more consecutive commas')
2429 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2430 if not mobj:
2431 raise ValueError(f'{segment!r} is not a valid specification')
2432 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2433 if int_or_none(step) == 0:
2434 raise ValueError(f'Step in {segment!r} cannot be zero')
2435 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2436
2437 def get_requested_items(self):
2438 playlist_items = self.ydl.params.get('playlist_items')
2439 playlist_start = self.ydl.params.get('playliststart', 1)
2440 playlist_end = self.ydl.params.get('playlistend')
2441 # For backwards compatibility, interpret -1 as whole list
2442 if playlist_end in (-1, None):
2443 playlist_end = ''
2444 if not playlist_items:
2445 playlist_items = f'{playlist_start}:{playlist_end}'
2446 elif playlist_start != 1 or playlist_end:
2447 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2448
2449 for index in self.parse_playlist_items(playlist_items):
2450 for i, entry in self[index]:
2451 yield i, entry
1ac4fd80 2452 if not entry:
2453 continue
7e88d7d7 2454 try:
d21056f4 2455 # The item may have just been added to archive. Don't break due to it
2456 if not self.ydl.params.get('lazy_playlist'):
2457 # TODO: Add auto-generated fields
2458 self.ydl._match_entry(entry, incomplete=True, silent=True)
7e88d7d7 2459 except (ExistingVideoReached, RejectedVideoReached):
2460 return
2461
7e9a6125 2462 def get_full_count(self):
2463 if self.is_exhausted and not self.is_incomplete:
7e88d7d7 2464 return len(self)
2465 elif isinstance(self._entries, InAdvancePagedList):
2466 if self._entries._pagesize == 1:
2467 return self._entries._pagecount
2468
7e88d7d7 2469 @functools.cached_property
2470 def _getter(self):
2471 if isinstance(self._entries, list):
2472 def get_entry(i):
2473 try:
2474 entry = self._entries[i]
2475 except IndexError:
2476 entry = self.MissingEntry
2477 if not self.is_incomplete:
2478 raise self.IndexError()
2479 if entry is self.MissingEntry:
bc5c2f8a 2480 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
7e88d7d7 2481 return entry
2482 else:
2483 def get_entry(i):
2484 try:
2485 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2486 except (LazyList.IndexError, PagedList.IndexError):
2487 raise self.IndexError()
2488 return get_entry
2489
2490 def __getitem__(self, idx):
2491 if isinstance(idx, int):
2492 idx = slice(idx, idx)
2493
2494 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2495 step = 1 if idx.step is None else idx.step
2496 if idx.start is None:
2497 start = 0 if step > 0 else len(self) - 1
2498 else:
2499 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2500
2501 # NB: Do not call len(self) when idx == [:]
2502 if idx.stop is None:
2503 stop = 0 if step < 0 else float('inf')
2504 else:
2505 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2506 stop += [-1, 1][step > 0]
2507
2508 for i in frange(start, stop, step):
2509 if i < 0:
2510 continue
2511 try:
7e9a6125 2512 entry = self._getter(i)
2513 except self.IndexError:
2514 self.is_exhausted = True
2515 if step > 0:
7e88d7d7 2516 break
7e9a6125 2517 continue
7e88d7d7 2518 yield i + 1, entry
2519
2520 def __len__(self):
2521 return len(tuple(self[:]))
2522
2523 class IndexError(IndexError):
2524 pass
2525
2526
81c2f20b 2527def uppercase_escape(s):
676eb3f2 2528 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2529 return re.sub(
a612753d 2530 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2531 lambda m: unicode_escape(m.group(0))[0],
2532 s)
0fe2ff78
YCH
2533
2534
2535def lowercase_escape(s):
2536 unicode_escape = codecs.getdecoder('unicode_escape')
2537 return re.sub(
2538 r'\\u[0-9a-fA-F]{4}',
2539 lambda m: unicode_escape(m.group(0))[0],
2540 s)
b53466e1 2541
d05cfe06
S
2542
2543def escape_rfc3986(s):
2544 """Escape non-ASCII characters as suggested by RFC 3986"""
f9934b96 2545 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2546
2547
2548def escape_url(url):
2549 """Escape URL as suggested by RFC 3986"""
14f25df2 2550 url_parsed = urllib.parse.urlparse(url)
d05cfe06 2551 return url_parsed._replace(
efbed08d 2552 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
2553 path=escape_rfc3986(url_parsed.path),
2554 params=escape_rfc3986(url_parsed.params),
2555 query=escape_rfc3986(url_parsed.query),
2556 fragment=escape_rfc3986(url_parsed.fragment)
2557 ).geturl()
2558
62e609ab 2559
96b9e9cf 2560def parse_qs(url, **kwargs):
2561 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
4dfbf869 2562
2563
62e609ab
PH
2564def read_batch_urls(batch_fd):
2565 def fixup(url):
14f25df2 2566 if not isinstance(url, str):
62e609ab 2567 url = url.decode('utf-8', 'replace')
8c04f0be 2568 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2569 for bom in BOM_UTF8:
2570 if url.startswith(bom):
2571 url = url[len(bom):]
2572 url = url.lstrip()
2573 if not url or url.startswith(('#', ';', ']')):
62e609ab 2574 return False
8c04f0be 2575 # "#" cannot be stripped out since it is part of the URI
962ffcf8 2576 # However, it can be safely stripped out if it follows whitespace
8c04f0be 2577 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
2578
2579 with contextlib.closing(batch_fd) as fd:
2580 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2581
2582
2583def urlencode_postdata(*args, **kargs):
14f25df2 2584 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2585
2586
45b2ee6f 2587def update_url(url, *, query_update=None, **kwargs):
2588 """Replace URL components specified by kwargs
2589 @param url str or parsed URL tuple
2590 @param query_update query parameters to add or update
2591 @returns str
2592 """
2593 if isinstance(url, str):
2594 if not kwargs and not query_update:
2595 return url
2596 else:
2597 url = urllib.parse.urlparse(url)
2598 if query_update:
2599 assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
2600 kwargs['query'] = urllib.parse.urlencode({
2601 **urllib.parse.parse_qs(url.query),
2602 **query_update
2603 }, True)
2604 return urllib.parse.urlunparse(url._replace(**kwargs))
2605
2606
38f9ef31 2607def update_url_query(url, query):
45b2ee6f 2608 return update_url(url, query_update=query)
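# Illustrative sketch (not part of the original file): update_url_query merges new
# parameters into an existing query string; the URL is made up.
#
#     >>> update_url_query('https://example.com/path?a=1', {'b': '2'})
#     'https://example.com/path?a=1&b=2'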
16392824 2609
8e60dc75 2610
10c87c15 2611def _multipart_encode_impl(data, boundary):
0c265486
YCH
2612 content_type = 'multipart/form-data; boundary=%s' % boundary
2613
2614 out = b''
2615 for k, v in data.items():
2616 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 2617 if isinstance(k, str):
0f06bcd7 2618 k = k.encode()
14f25df2 2619 if isinstance(v, str):
0f06bcd7 2620 v = v.encode()
0c265486
YCH
2621 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2622 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 2623 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
2624 if boundary.encode('ascii') in content:
2625 raise ValueError('Boundary overlaps with data')
2626 out += content
2627
2628 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2629
2630 return out, content_type
2631
2632
2633def multipart_encode(data, boundary=None):
2634 '''
2635 Encode a dict to RFC 7578-compliant form-data
2636
2637 data:
2638 A dict where keys and values can be either Unicode or bytes-like
2639 objects.
2640 boundary:
2641 If specified a Unicode object, it's used as the boundary. Otherwise
2642 a random boundary is generated.
2643
2644 Reference: https://tools.ietf.org/html/rfc7578
2645 '''
2646 has_specified_boundary = boundary is not None
2647
2648 while True:
2649 if boundary is None:
2650 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2651
2652 try:
10c87c15 2653 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
2654 break
2655 except ValueError:
2656 if has_specified_boundary:
2657 raise
2658 boundary = None
2659
2660 return out, content_type
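# Illustrative sketch (not part of the original file): encoding a simple dict with
# an explicit boundary; the returned content_type belongs in the Content-Type header
# and the bytes body is used as the POST data. The field name and boundary are made up.
#
#     >>> body, content_type = multipart_encode({'field': 'value'}, boundary='X')
#     >>> content_type
#     'multipart/form-data; boundary=X'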
2661
2662
b079c26f
SS
2663def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
2664 if blocked_types is NO_DEFAULT:
2665 blocked_types = (str, bytes, collections.abc.Mapping)
2666 return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
2667
2668
2669def variadic(x, allowed_types=NO_DEFAULT):
4823ec9f 2670 if not isinstance(allowed_types, (tuple, type)):
2671 deprecation_warning('allowed_types should be a tuple or a type')
2672 allowed_types = tuple(allowed_types)
6f2287cb 2673 return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
304ad45a 2674
2675
c4f60dd7 2676def try_call(*funcs, expected_type=None, args=[], kwargs={}):
2677 for f in funcs:
a32a9a7e 2678 try:
c4f60dd7 2679 val = f(*args, **kwargs)
ab029d7e 2680 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
a32a9a7e
S
2681 pass
2682 else:
c4f60dd7 2683 if expected_type is None or isinstance(val, expected_type):
2684 return val
2685
2686
2687def try_get(src, getter, expected_type=None):
2688 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
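# Illustrative sketch (not part of the original file): try_get swallows the usual
# lookup errors and optionally type-checks the result.
#
#     >>> try_get({'a': {'b': 42}}, lambda x: x['a']['b'], int)
#     42
#     >>> try_get({}, lambda x: x['a']['b']) is None  # KeyError is suppressed
#     True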
329ca3be
S
2689
2690
90137ca4 2691def filter_dict(dct, cndn=lambda _, v: v is not None):
2692 return {k: v for k, v in dct.items() if cndn(k, v)}
2693
2694
6cc62232
S
2695def merge_dicts(*dicts):
2696 merged = {}
2697 for a_dict in dicts:
2698 for k, v in a_dict.items():
90137ca4 2699 if (v is not None and k not in merged
2700 or isinstance(v, str) and merged[k] == ''):
6cc62232
S
2701 merged[k] = v
2702 return merged
2703
2704
8e60dc75 2705def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
14f25df2 2706 return string if isinstance(string, str) else str(string, encoding, errors)
8e60dc75 2707
16392824 2708
a1a530b0
PH
2709US_RATINGS = {
2710 'G': 0,
2711 'PG': 10,
2712 'PG-13': 13,
2713 'R': 16,
2714 'NC': 18,
2715}
fac55558
PH
2716
2717
a8795327 2718TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
2719 'TV-Y': 0,
2720 'TV-Y7': 7,
2721 'TV-G': 0,
2722 'TV-PG': 0,
2723 'TV-14': 14,
2724 'TV-MA': 17,
a8795327
S
2725}
2726
2727
146c80e2 2728def parse_age_limit(s):
19a03940 2729 # isinstance(False, int) is True. So type() must be used instead
c487cf00 2730 if type(s) is int: # noqa: E721
a8795327 2731 return s if 0 <= s <= 21 else None
19a03940 2732 elif not isinstance(s, str):
d838b1bd 2733 return None
146c80e2 2734 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
2735 if m:
2736 return int(m.group('age'))
5c5fae6d 2737 s = s.upper()
a8795327
S
2738 if s in US_RATINGS:
2739 return US_RATINGS[s]
5a16c9d9 2740 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 2741 if m:
5a16c9d9 2742 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 2743 return None
146c80e2
S
2744
2745
fac55558 2746def strip_jsonp(code):
609a61e3 2747 return re.sub(
5552c9eb 2748 r'''(?sx)^
e9c671d5 2749 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
2750 (?:\s*&&\s*(?P=func_name))?
2751 \s*\(\s*(?P<callback_data>.*)\);?
2752 \s*?(?://[^\n]*)*$''',
2753 r'\g<callback_data>', code)
478c2c61
PH
2754
2755
8f53dc44 2756def js_to_json(code, vars={}, *, strict=False):
5c610515 2757 # vars is a dict of var, val pairs to substitute
0898c5c8 2758 STRING_QUOTES = '\'"`'
a71b812f 2759 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
c843e685 2760 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
86e5f3ed 2761 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
4195096e 2762 INTEGER_TABLE = (
86e5f3ed 2763 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
2764 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
4195096e
S
2765 )
2766
a71b812f
SS
2767 def process_escape(match):
2768 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
2769 escape = match.group(1) or match.group(2)
2770
2771 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
2772 else R'\u00' if escape == 'x'
2773 else '' if escape == '\n'
2774 else escape)
2775
0898c5c8
SS
2776 def template_substitute(match):
2777 evaluated = js_to_json(match.group(1), vars, strict=strict)
2778 if evaluated[0] == '"':
2779 return json.loads(evaluated)
2780 return evaluated
2781
e05f6939 2782 def fix_kv(m):
e7b6d122
PH
2783 v = m.group(0)
2784 if v in ('true', 'false', 'null'):
2785 return v
421ddcb8
C
2786 elif v in ('undefined', 'void 0'):
2787 return 'null'
8bdd16b4 2788 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
a71b812f
SS
2789 return ''
2790
2791 if v[0] in STRING_QUOTES:
0898c5c8
SS
2792 v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
2793 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
a71b812f
SS
2794 return f'"{escaped}"'
2795
2796 for regex, base in INTEGER_TABLE:
2797 im = re.match(regex, v)
2798 if im:
2799 i = int(im.group(1), base)
2800 return f'"{i}":' if v.endswith(':') else str(i)
2801
2802 if v in vars:
d5f043d1
C
2803 try:
2804 if not strict:
2805 json.loads(vars[v])
08e29b9f 2806 except json.JSONDecodeError:
d5f043d1
C
2807 return json.dumps(vars[v])
2808 else:
2809 return vars[v]
89ac4a19 2810
a71b812f
SS
2811 if not strict:
2812 return f'"{v}"'
5c610515 2813
a71b812f 2814 raise ValueError(f'Unknown value: {v}')
e05f6939 2815
8072ef2b 2816 def create_map(mobj):
2817 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
2818
8072ef2b 2819 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
8f53dc44 2820 if not strict:
2821 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
f55523cf 2822 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
389896df 2823 code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
2824 code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
febff4c1 2825
a71b812f
SS
2826 return re.sub(rf'''(?sx)
2827 {STRING_RE}|
2828 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
421ddcb8 2829 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
a71b812f
SS
2830 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
2831 [0-9]+(?={SKIP_RE}:)|
8bdd16b4 2832 !+
a71b812f 2833 ''', fix_kv, code)
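# Illustrative sketch (not part of the original file): js_to_json turns loose
# JavaScript object literals into strict JSON that json.loads can parse.
#
#     >>> js_to_json("{a: 1, 'b': `c`,}")
#     '{"a": 1, "b": "c"}'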
e05f6939
PH
2834
2835
478c2c61
PH
2836def qualities(quality_ids):
2837 """ Get a numeric quality value out of a list of possible values """
2838 def q(qid):
2839 try:
2840 return quality_ids.index(qid)
2841 except ValueError:
2842 return -1
2843 return q
2844
acd69589 2845
119e40ef 2846POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
1e43a6f7 2847
2848
de6000d9 2849DEFAULT_OUTTMPL = {
2850 'default': '%(title)s [%(id)s].%(ext)s',
72755351 2851 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 2852}
2853OUTTMPL_TYPES = {
72755351 2854 'chapter': None,
de6000d9 2855 'subtitle': None,
2856 'thumbnail': None,
2857 'description': 'description',
2858 'annotation': 'annotations.xml',
2859 'infojson': 'info.json',
08438d2c 2860 'link': None,
3b603dbd 2861 'pl_video': None,
5112f26a 2862 'pl_thumbnail': None,
de6000d9 2863 'pl_description': 'description',
2864 'pl_infojson': 'info.json',
2865}
0a871f68 2866
143db31d 2867# As of [1] format syntax is:
2868# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
2869# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 2870STR_FORMAT_RE_TMPL = r'''(?x)
2871 (?<!%)(?P<prefix>(?:%%)*)
143db31d 2872 %
524e2e4f 2873 (?P<has_key>\((?P<key>{0})\))?
752cda38 2874 (?P<format>
524e2e4f 2875 (?P<conversion>[#0\-+ ]+)?
2876 (?P<min_width>\d+)?
2877 (?P<precision>\.\d+)?
2878 (?P<len_mod>[hlL])? # unused in python
901130bb 2879 {1} # conversion type
752cda38 2880 )
143db31d 2881'''
2882
7d1eb38a 2883
ebe1b4e3 2884STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
a020a0dc 2885
7d1eb38a 2886
a020a0dc
PH
2887def limit_length(s, length):
2888 """ Add ellipses to overly long strings """
2889 if s is None:
2890 return None
2891 ELLIPSES = '...'
2892 if len(s) > length:
2893 return s[:length - len(ELLIPSES)] + ELLIPSES
2894 return s
48844745
PH
2895
2896
2897def version_tuple(v):
5f9b8394 2898 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
2899
2900
2901def is_outdated_version(version, limit, assume_new=True):
2902 if not version:
2903 return not assume_new
2904 try:
2905 return version_tuple(version) < version_tuple(limit)
2906 except ValueError:
2907 return not assume_new
732ea2f0
PH
2908
2909
2910def ytdl_is_updateable():
7a5c1cfe 2911 """ Returns if yt-dlp can be updated with -U """
735d865e 2912
69bec673 2913 from ..update import is_non_updateable
732ea2f0 2914
5d535b4a 2915 return not is_non_updateable()
7d4111ed
PH
2916
2917
2918def args_to_str(args):
2919 # Get a short string representation for a subprocess command
702ccf2d 2920 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
2921
2922
a44ca5a4 2923def error_to_str(err):
2924 return f'{type(err).__name__}: {err}'
2925
2926
2647c933 2927def mimetype2ext(mt, default=NO_DEFAULT):
2928 if not isinstance(mt, str):
2929 if default is not NO_DEFAULT:
2930 return default
eb9ee194
S
2931 return None
2932
2647c933 2933 MAP = {
2934 # video
f6861ec9 2935 '3gpp': '3gp',
2647c933 2936 'mp2t': 'ts',
2937 'mp4': 'mp4',
2938 'mpeg': 'mpeg',
2939 'mpegurl': 'm3u8',
2940 'quicktime': 'mov',
2941 'webm': 'webm',
2942 'vp9': 'vp9',
f6861ec9 2943 'x-flv': 'flv',
2647c933 2944 'x-m4v': 'm4v',
2945 'x-matroska': 'mkv',
2946 'x-mng': 'mng',
a0d8d704 2947 'x-mp4-fragmented': 'mp4',
2647c933 2948 'x-ms-asf': 'asf',
a0d8d704 2949 'x-ms-wmv': 'wmv',
2647c933 2950 'x-msvideo': 'avi',
2951
2952 # application (streaming playlists)
b4173f15 2953 'dash+xml': 'mpd',
b4173f15 2954 'f4m+xml': 'f4m',
f164b971 2955 'hds+xml': 'f4m',
2647c933 2956 'vnd.apple.mpegurl': 'm3u8',
e910fe2f 2957 'vnd.ms-sstr+xml': 'ism',
2647c933 2958 'x-mpegurl': 'm3u8',
2959
2960 # audio
2961 'audio/mp4': 'm4a',
2962 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
2963 # Using .mp3 as it's the most popular one
2964 'audio/mpeg': 'mp3',
d80ca5de 2965 'audio/webm': 'webm',
2647c933 2966 'audio/x-matroska': 'mka',
2967 'audio/x-mpegurl': 'm3u',
2968 'midi': 'mid',
2969 'ogg': 'ogg',
2970 'wav': 'wav',
2971 'wave': 'wav',
2972 'x-aac': 'aac',
2973 'x-flac': 'flac',
2974 'x-m4a': 'm4a',
2975 'x-realaudio': 'ra',
39e7107d 2976 'x-wav': 'wav',
9359f3d4 2977
2647c933 2978 # image
2979 'avif': 'avif',
2980 'bmp': 'bmp',
2981 'gif': 'gif',
2982 'jpeg': 'jpg',
2983 'png': 'png',
2984 'svg+xml': 'svg',
2985 'tiff': 'tif',
2986 'vnd.wap.wbmp': 'wbmp',
2987 'webp': 'webp',
2988 'x-icon': 'ico',
2989 'x-jng': 'jng',
2990 'x-ms-bmp': 'bmp',
2991
2992 # caption
2993 'filmstrip+json': 'fs',
2994 'smptett+xml': 'tt',
2995 'ttaf+xml': 'dfxp',
2996 'ttml+xml': 'ttml',
2997 'x-ms-sami': 'sami',
9359f3d4 2998
2647c933 2999 # misc
3000 'gzip': 'gz',
9359f3d4
F
3001 'json': 'json',
3002 'xml': 'xml',
3003 'zip': 'zip',
9359f3d4
F
3004 }
3005
2647c933 3006 mimetype = mt.partition(';')[0].strip().lower()
3007 _, _, subtype = mimetype.rpartition('/')
9359f3d4 3008
69bec673 3009 ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
2647c933 3010 if ext:
3011 return ext
3012 elif default is not NO_DEFAULT:
3013 return default
9359f3d4 3014 return subtype.replace('+', '.')
c460bdd5
PH
3015
3016
2814f12b
THD
3017def ext2mimetype(ext_or_url):
3018 if not ext_or_url:
3019 return None
3020 if '.' not in ext_or_url:
3021 ext_or_url = f'file.{ext_or_url}'
3022 return mimetypes.guess_type(ext_or_url)[0]
3023
3024
4f3c5e06 3025def parse_codecs(codecs_str):
3026 # http://tools.ietf.org/html/rfc6381
3027 if not codecs_str:
3028 return {}
a0566bbf 3029 split_codecs = list(filter(None, map(
dbf5416a 3030 str.strip, codecs_str.strip().strip(',').split(','))))
3fe75fdc 3031 vcodec, acodec, scodec, hdr = None, None, None, None
a0566bbf 3032 for full_codec in split_codecs:
d816f61f 3033 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3034 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3035 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3036 if vcodec:
3037 continue
3038 vcodec = full_codec
3039 if parts[0] in ('dvh1', 'dvhe'):
3040 hdr = 'DV'
69bec673 3041 elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
d816f61f 3042 hdr = 'HDR10'
3043 elif parts[:2] == ['vp9', '2']:
3044 hdr = 'HDR10'
71082216 3045 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
d816f61f 3046 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3047 acodec = acodec or full_codec
3048 elif parts[0] in ('stpp', 'wvtt'):
3049 scodec = scodec or full_codec
4f3c5e06 3050 else:
19a03940 3051 write_string(f'WARNING: Unknown codec {full_codec}\n')
3fe75fdc 3052 if vcodec or acodec or scodec:
4f3c5e06 3053 return {
3054 'vcodec': vcodec or 'none',
3055 'acodec': acodec or 'none',
176f1866 3056 'dynamic_range': hdr,
3fe75fdc 3057 **({'scodec': scodec} if scodec is not None else {}),
4f3c5e06 3058 }
b69fd25c 3059 elif len(split_codecs) == 2:
3060 return {
3061 'vcodec': split_codecs[0],
3062 'acodec': split_codecs[1],
3063 }
4f3c5e06 3064 return {}
3065
3066
fc61aff4
LL
3067def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3068 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3069
3070 allow_mkv = not preferences or 'mkv' in preferences
3071
3072 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3073 return 'mkv' # TODO: any other format allows this?
3074
3075 # TODO: All codecs supported by parse_codecs isn't handled here
3076 COMPATIBLE_CODECS = {
3077 'mp4': {
71082216 3078 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
81b6102d 3079 'h264', 'aacl', 'ec-3', # Set in ISM
fc61aff4
LL
3080 },
3081 'webm': {
3082 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3083 'vp9x', 'vp8x', # in the webm spec
3084 },
3085 }
3086
812cdfa0 3087 sanitize_codec = functools.partial(
3088 try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
8f84770a 3089 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
fc61aff4
LL
3090
3091 for ext in preferences or COMPATIBLE_CODECS.keys():
3092 codec_set = COMPATIBLE_CODECS.get(ext, set())
3093 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3094 return ext
3095
3096 COMPATIBLE_EXTS = (
3097 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
fbb73833 3098 {'webm', 'weba'},
fc61aff4
LL
3099 )
3100 for ext in preferences or vexts:
3101 current_exts = {ext, *vexts, *aexts}
3102 if ext == 'mkv' or current_exts == {ext} or any(
3103 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3104 return ext
3105 return 'mkv' if allow_mkv else preferences[-1]
3106
3107
2647c933 3108def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
79298173 3109 getheader = url_handle.headers.get
2ccd1b10 3110
b55ee18f
PH
3111 cd = getheader('Content-Disposition')
3112 if cd:
3113 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3114 if m:
3115 e = determine_ext(m.group('filename'), default_ext=None)
3116 if e:
3117 return e
3118
2647c933 3119 meta_ext = getheader('x-amz-meta-name')
3120 if meta_ext:
3121 e = meta_ext.rpartition('.')[2]
3122 if e:
3123 return e
3124
3125 return mimetype2ext(getheader('Content-Type'), default=default)
05900629
PH
3126
3127
1e399778
YCH
3128def encode_data_uri(data, mime_type):
3129 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3130
3131
05900629 3132def age_restricted(content_limit, age_limit):
6ec6cb4e 3133 """ Returns True iff the content should be blocked """
05900629
PH
3134
3135 if age_limit is None: # No limit set
3136 return False
3137 if content_limit is None:
3138 return False # Content available for everyone
3139 return age_limit < content_limit
61ca9a80
PH
3140
3141
88f60feb 3142# List of known byte-order-marks (BOM)
a904a7f8
L
3143BOMS = [
3144 (b'\xef\xbb\xbf', 'utf-8'),
3145 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3146 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3147 (b'\xff\xfe', 'utf-16-le'),
3148 (b'\xfe\xff', 'utf-16-be'),
3149]
a904a7f8
L
3150
3151
61ca9a80
PH
3152def is_html(first_bytes):
3153 """ Detect whether a file contains HTML by examining its first bytes. """
3154
80e8493e 3155 encoding = 'utf-8'
61ca9a80 3156 for bom, enc in BOMS:
80e8493e 3157 while first_bytes.startswith(bom):
3158 encoding, first_bytes = enc, first_bytes[len(bom):]
61ca9a80 3159
80e8493e 3160 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
a055469f
PH
3161
3162
3163def determine_protocol(info_dict):
3164 protocol = info_dict.get('protocol')
3165 if protocol is not None:
3166 return protocol
3167
7de837a5 3168 url = sanitize_url(info_dict['url'])
a055469f
PH
3169 if url.startswith('rtmp'):
3170 return 'rtmp'
3171 elif url.startswith('mms'):
3172 return 'mms'
3173 elif url.startswith('rtsp'):
3174 return 'rtsp'
3175
3176 ext = determine_ext(url)
3177 if ext == 'm3u8':
deae7c17 3178 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
a055469f
PH
3179 elif ext == 'f4m':
3180 return 'f4m'
3181
14f25df2 3182 return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3183
3184
c5e3f849 3185def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3186 """ Render a list of rows, each as a list of values.
3187 Text after a \t will be right aligned """
ec11a9f4 3188 def width(string):
c5e3f849 3189 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3190
3191 def get_max_lens(table):
ec11a9f4 3192 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3193
3194 def filter_using_list(row, filterArray):
d16df59d 3195 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
76d321f6 3196
d16df59d 3197 max_lens = get_max_lens(data) if hide_empty else []
3198 header_row = filter_using_list(header_row, max_lens)
3199 data = [filter_using_list(row, max_lens) for row in data]
76d321f6 3200
cfb56d1a 3201 table = [header_row] + data
76d321f6 3202 max_lens = get_max_lens(table)
c5e3f849 3203 extra_gap += 1
76d321f6 3204 if delim:
c5e3f849 3205 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
1ed7953a 3206 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
ec11a9f4 3207 for row in table:
3208 for pos, text in enumerate(map(str, row)):
c5e3f849 3209 if '\t' in text:
3210 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3211 else:
3212 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3213 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3214 return ret
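# Usage sketch (illustrative comment; every column is padded with one extra space):
#   >>> print(render_table(['ID', 'EXT'], [['137', 'mp4'], ['251', 'webm']]))
#   ID  EXT
#   137 mp4
#   251 webm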
347de493
PH
3215
3216
8f18aca8 3217def _match_one(filter_part, dct, incomplete):
77b87f05 3218 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3219 STRING_OPERATORS = {
3220 '*=': operator.contains,
3221 '^=': lambda attr, value: attr.startswith(value),
3222 '$=': lambda attr, value: attr.endswith(value),
3223 '~=': lambda attr, value: re.search(value, attr),
3224 }
347de493 3225 COMPARISON_OPERATORS = {
a047eeb6 3226 **STRING_OPERATORS,
3227 '<=': operator.le, # "<=" must be defined above "<"
347de493 3228 '<': operator.lt,
347de493 3229 '>=': operator.ge,
a047eeb6 3230 '>': operator.gt,
347de493 3231 '=': operator.eq,
347de493 3232 }
a047eeb6 3233
6db9c4d5 3234 if isinstance(incomplete, bool):
3235 is_incomplete = lambda _: incomplete
3236 else:
3237 is_incomplete = lambda k: k in incomplete
3238
64fa820c 3239 operator_rex = re.compile(r'''(?x)
347de493 3240 (?P<key>[a-z_]+)
77b87f05 3241 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3242 (?:
a047eeb6 3243 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3244 (?P<strval>.+?)
347de493 3245 )
347de493 3246 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3247 m = operator_rex.fullmatch(filter_part.strip())
347de493 3248 if m:
18f96d12 3249 m = m.groupdict()
3250 unnegated_op = COMPARISON_OPERATORS[m['op']]
3251 if m['negation']:
3252 op = lambda attr, value: not unnegated_op(attr, value)
3253 else:
3254 op = unnegated_op
18f96d12 3255 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3256 if m['quote']:
3257 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3258 actual_value = dct.get(m['key'])
3259 numeric_comparison = None
f9934b96 3260 if isinstance(actual_value, (int, float)):
3261 # If the original field is a string and the matching comparison value is
3262 # a number, we should respect the origin of the original field
3263 # and process the comparison value as a string (see
18f96d12 3264 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3265 try:
18f96d12 3266 numeric_comparison = int(comparison_value)
347de493 3267 except ValueError:
18f96d12 3268 numeric_comparison = parse_filesize(comparison_value)
3269 if numeric_comparison is None:
3270 numeric_comparison = parse_filesize(f'{comparison_value}B')
3271 if numeric_comparison is None:
3272 numeric_comparison = parse_duration(comparison_value)
3273 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3274 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3275 if actual_value is None:
6db9c4d5 3276 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3277 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3278
3279 UNARY_OPERATORS = {
1cc47c66
S
3280 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3281 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3282 }
64fa820c 3283 operator_rex = re.compile(r'''(?x)
347de493 3284 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3285 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3286 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3287 if m:
3288 op = UNARY_OPERATORS[m.group('op')]
3289 actual_value = dct.get(m.group('key'))
6db9c4d5 3290 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3291 return True
347de493
PH
3292 return op(actual_value)
3293
3294 raise ValueError('Invalid filter part %r' % filter_part)
3295
3296
8f18aca8 3297def match_str(filter_str, dct, incomplete=False):
6db9c4d5 3298 """ Filter a dictionary with a simple string syntax.
3299 @returns Whether the filter passes
3300 @param incomplete Set of keys that are expected to be missing from dct.
3301 Can be True/False to indicate that all/none of the keys may be missing.
3302 All conditions on incomplete keys pass if the key is missing.
8f18aca8 3303 """
347de493 3304 return all(
8f18aca8 3305 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3306 for filter_part in re.split(r'(?<!\\)&', filter_str))
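# Usage sketch (illustrative comment):
#   >>> match_str('like_count > 100 & description ~= cats', {'like_count': 190, 'description': 'cute cats'})
#   True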
347de493
PH
3307
3308
fe2ce85a 3309def match_filter_func(filters, breaking_filters=None):
3310 if not filters and not breaking_filters:
d1b5f70b 3311 return None
fe2ce85a 3312 breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
3313 filters = set(variadic(filters or []))
d1b5f70b 3314
492272fe 3315 interactive = '-' in filters
3316 if interactive:
3317 filters.remove('-')
3318
3319 def _match_func(info_dict, incomplete=False):
fe2ce85a 3320 ret = breaking_filters(info_dict, incomplete)
3321 if ret is not None:
3322 raise RejectedVideoReached(ret)
3323
492272fe 3324 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3325 return NO_DEFAULT if interactive and not incomplete else None
347de493 3326 else:
3bec830a 3327 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
b1a7cd05 3328 filter_str = ') | ('.join(map(str.strip, filters))
3329 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
347de493 3330 return _match_func
91410c9b
PH
3331
3332
f2df4071 3333class download_range_func:
b4e0d758 3334 def __init__(self, chapters, ranges, from_info=False):
3335 self.chapters, self.ranges, self.from_info = chapters, ranges, from_info
f2df4071 3336
3337 def __call__(self, info_dict, ydl):
0500ee3d 3338
5ec1b6b7 3339 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
56ba69e4 3340 else 'Cannot match chapters since chapter information is unavailable')
f2df4071 3341 for regex in self.chapters or []:
5ec1b6b7 3342 for i, chapter in enumerate(info_dict.get('chapters') or []):
3343 if re.search(regex, chapter['title']):
3344 warning = None
3345 yield {**chapter, 'index': i}
f2df4071 3346 if self.chapters and warning:
5ec1b6b7 3347 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3348
b4e0d758 3349 for start, end in self.ranges or []:
3350 yield {
3351 'start_time': self._handle_negative_timestamp(start, info_dict),
3352 'end_time': self._handle_negative_timestamp(end, info_dict),
3353 }
3354
3355 if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
3356 yield {
e59e2074 3357 'start_time': info_dict.get('start_time') or 0,
3358 'end_time': info_dict.get('end_time') or float('inf'),
b4e0d758 3359 }
e59e2074 3360 elif not self.ranges and not self.chapters:
3361 yield {}
b4e0d758 3362
3363 @staticmethod
3364 def _handle_negative_timestamp(time, info):
3365 return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time
5ec1b6b7 3366
f2df4071 3367 def __eq__(self, other):
3368 return (isinstance(other, download_range_func)
3369 and self.chapters == other.chapters and self.ranges == other.ranges)
5ec1b6b7 3370
71df9b7f 3371 def __repr__(self):
a5387729 3372 return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
71df9b7f 3373
5ec1b6b7 3374
bf6427d2
YCH
3375def parse_dfxp_time_expr(time_expr):
3376 if not time_expr:
d631d5f9 3377 return
bf6427d2 3378
1d485a1a 3379 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
bf6427d2
YCH
3380 if mobj:
3381 return float(mobj.group('time_offset'))
3382
db2fe38b 3383 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3384 if mobj:
db2fe38b 3385 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
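# Usage sketch (illustrative comment):
#   >>> parse_dfxp_time_expr('00:01:30.5')
#   90.5
#   >>> parse_dfxp_time_expr('5.2s')
#   5.2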
bf6427d2
YCH
3386
3387
c1c924ab 3388def srt_subtitles_timecode(seconds):
aa7785f8 3389 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3390
3391
3392def ass_subtitles_timecode(seconds):
3393 time = timetuple_from_msec(seconds * 1000)
3394 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
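# Usage sketch (illustrative comment):
#   >>> srt_subtitles_timecode(3661.5)
#   '01:01:01,500'
#   >>> ass_subtitles_timecode(3661.5)
#   '1:01:01.50'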
bf6427d2
YCH
3395
3396
3397def dfxp2srt(dfxp_data):
3398 '''
3399 @param dfxp_data A bytes-like object containing DFXP data
3400 @returns A unicode object containing converted SRT data
3401 '''
5b995f71 3402 LEGACY_NAMESPACES = (
3869028f
YCH
3403 (b'http://www.w3.org/ns/ttml', [
3404 b'http://www.w3.org/2004/11/ttaf1',
3405 b'http://www.w3.org/2006/04/ttaf1',
3406 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3407 ]),
3869028f
YCH
3408 (b'http://www.w3.org/ns/ttml#styling', [
3409 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
3410 ]),
3411 )
3412
3413 SUPPORTED_STYLING = [
3414 'color',
3415 'fontFamily',
3416 'fontSize',
3417 'fontStyle',
3418 'fontWeight',
3419 'textDecoration'
3420 ]
3421
4e335771 3422 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 3423 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 3424 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 3425 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 3426 })
bf6427d2 3427
5b995f71
RA
3428 styles = {}
3429 default_style = {}
3430
86e5f3ed 3431 class TTMLPElementParser:
5b995f71
RA
3432 _out = ''
3433 _unclosed_elements = []
3434 _applied_styles = []
bf6427d2 3435
2b14cb56 3436 def start(self, tag, attrib):
3437 if tag in (_x('ttml:br'), 'br'):
3438 self._out += '\n'
3439 else:
3440 unclosed_elements = []
3441 style = {}
3442 element_style_id = attrib.get('style')
3443 if default_style:
3444 style.update(default_style)
3445 if element_style_id:
3446 style.update(styles.get(element_style_id, {}))
3447 for prop in SUPPORTED_STYLING:
3448 prop_val = attrib.get(_x('tts:' + prop))
3449 if prop_val:
3450 style[prop] = prop_val
3451 if style:
3452 font = ''
3453 for k, v in sorted(style.items()):
3454 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3455 continue
3456 if k == 'color':
3457 font += ' color="%s"' % v
3458 elif k == 'fontSize':
3459 font += ' size="%s"' % v
3460 elif k == 'fontFamily':
3461 font += ' face="%s"' % v
3462 elif k == 'fontWeight' and v == 'bold':
3463 self._out += '<b>'
3464 unclosed_elements.append('b')
3465 elif k == 'fontStyle' and v == 'italic':
3466 self._out += '<i>'
3467 unclosed_elements.append('i')
3468 elif k == 'textDecoration' and v == 'underline':
3469 self._out += '<u>'
3470 unclosed_elements.append('u')
3471 if font:
3472 self._out += '<font' + font + '>'
3473 unclosed_elements.append('font')
3474 applied_style = {}
3475 if self._applied_styles:
3476 applied_style.update(self._applied_styles[-1])
3477 applied_style.update(style)
3478 self._applied_styles.append(applied_style)
3479 self._unclosed_elements.append(unclosed_elements)
bf6427d2 3480
2b14cb56 3481 def end(self, tag):
5b995f71
RA
3482 if tag not in (_x('ttml:br'), 'br'):
3483 unclosed_elements = self._unclosed_elements.pop()
3484 for element in reversed(unclosed_elements):
3485 self._out += '</%s>' % element
3486 if unclosed_elements and self._applied_styles:
3487 self._applied_styles.pop()
bf6427d2 3488
2b14cb56 3489 def data(self, data):
5b995f71 3490 self._out += data
2b14cb56 3491
3492 def close(self):
5b995f71 3493 return self._out.strip()
2b14cb56 3494
6a765f13 3495 # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
3496 # This will not trigger false positives since only UTF-8 text is being replaced
3497 dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
3498
2b14cb56 3499 def parse_node(node):
3500 target = TTMLPElementParser()
3501 parser = xml.etree.ElementTree.XMLParser(target=target)
3502 parser.feed(xml.etree.ElementTree.tostring(node))
3503 return parser.close()
bf6427d2 3504
5b995f71
RA
3505 for k, v in LEGACY_NAMESPACES:
3506 for ns in v:
3507 dfxp_data = dfxp_data.replace(ns, k)
3508
3869028f 3509 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 3510 out = []
5b995f71 3511 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
3512
3513 if not paras:
3514 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 3515
5b995f71
RA
3516 repeat = False
3517 while True:
3518 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
3519 style_id = style.get('id') or style.get(_x('xml:id'))
3520 if not style_id:
3521 continue
5b995f71
RA
3522 parent_style_id = style.get('style')
3523 if parent_style_id:
3524 if parent_style_id not in styles:
3525 repeat = True
3526 continue
3527 styles[style_id] = styles[parent_style_id].copy()
3528 for prop in SUPPORTED_STYLING:
3529 prop_val = style.get(_x('tts:' + prop))
3530 if prop_val:
3531 styles.setdefault(style_id, {})[prop] = prop_val
3532 if repeat:
3533 repeat = False
3534 else:
3535 break
3536
3537 for p in ('body', 'div'):
3538 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3539 if ele is None:
3540 continue
3541 style = styles.get(ele.get('style'))
3542 if not style:
3543 continue
3544 default_style.update(style)
3545
bf6427d2 3546 for para, index in zip(paras, itertools.count(1)):
d631d5f9 3547 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 3548 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
3549 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3550 if begin_time is None:
3551 continue
7dff0363 3552 if not end_time:
d631d5f9
YCH
3553 if not dur:
3554 continue
3555 end_time = begin_time + dur
bf6427d2
YCH
3556 out.append('%d\n%s --> %s\n%s\n\n' % (
3557 index,
c1c924ab
YCH
3558 srt_subtitles_timecode(begin_time),
3559 srt_subtitles_timecode(end_time),
bf6427d2
YCH
3560 parse_node(para)))
3561
3562 return ''.join(out)
3563
3564
c487cf00 3565def cli_option(params, command_option, param, separator=None):
66e289ba 3566 param = params.get(param)
c487cf00 3567 return ([] if param is None
3568 else [command_option, str(param)] if separator is None
3569 else [f'{command_option}{separator}{param}'])
66e289ba
S
3570
3571
3572def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3573 param = params.get(param)
c487cf00 3574 assert param in (True, False, None)
3575 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
66e289ba
S
3576
3577
3578def cli_valueless_option(params, command_option, param, expected_value=True):
c487cf00 3579 return [command_option] if params.get(param) == expected_value else []
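# Usage sketch for the cli_* helpers (illustrative comment; option names are made up):
#   >>> cli_option({'proxy': 'socks5://127.0.0.1:1080'}, '--proxy', 'proxy')
#   ['--proxy', 'socks5://127.0.0.1:1080']
#   >>> cli_bool_option({'check': True}, '--check', 'check', separator='=')
#   ['--check=true']
#   >>> cli_valueless_option({'quiet': True}, '--quiet', 'quiet')
#   ['--quiet']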
66e289ba
S
3580
3581
e92caff5 3582def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 3583 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 3584 if use_compat:
5b1ecbb3 3585 return argdict
3586 else:
3587 argdict = None
eab9b2bc 3588 if argdict is None:
5b1ecbb3 3589 return default
eab9b2bc 3590 assert isinstance(argdict, dict)
3591
e92caff5 3592 assert isinstance(keys, (list, tuple))
3593 for key_list in keys:
e92caff5 3594 arg_list = list(filter(
3595 lambda x: x is not None,
6606817a 3596 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 3597 if arg_list:
3598 return [arg for args in arg_list for arg in args]
3599 return default
66e289ba 3600
6251555f 3601
330690a2 3602def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3603 main_key, exe = main_key.lower(), exe.lower()
3604 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3605 keys = [f'{root_key}{k}' for k in (keys or [''])]
3606 if root_key in keys:
3607 if main_key != exe:
3608 keys.append((main_key, exe))
3609 keys.append('default')
3610 else:
3611 use_compat = False
3612 return cli_configuration_args(argdict, keys, default, use_compat)
3613
66e289ba 3614
86e5f3ed 3615class ISO639Utils:
39672624
YCH
3616 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3617 _lang_map = {
3618 'aa': 'aar',
3619 'ab': 'abk',
3620 'ae': 'ave',
3621 'af': 'afr',
3622 'ak': 'aka',
3623 'am': 'amh',
3624 'an': 'arg',
3625 'ar': 'ara',
3626 'as': 'asm',
3627 'av': 'ava',
3628 'ay': 'aym',
3629 'az': 'aze',
3630 'ba': 'bak',
3631 'be': 'bel',
3632 'bg': 'bul',
3633 'bh': 'bih',
3634 'bi': 'bis',
3635 'bm': 'bam',
3636 'bn': 'ben',
3637 'bo': 'bod',
3638 'br': 'bre',
3639 'bs': 'bos',
3640 'ca': 'cat',
3641 'ce': 'che',
3642 'ch': 'cha',
3643 'co': 'cos',
3644 'cr': 'cre',
3645 'cs': 'ces',
3646 'cu': 'chu',
3647 'cv': 'chv',
3648 'cy': 'cym',
3649 'da': 'dan',
3650 'de': 'deu',
3651 'dv': 'div',
3652 'dz': 'dzo',
3653 'ee': 'ewe',
3654 'el': 'ell',
3655 'en': 'eng',
3656 'eo': 'epo',
3657 'es': 'spa',
3658 'et': 'est',
3659 'eu': 'eus',
3660 'fa': 'fas',
3661 'ff': 'ful',
3662 'fi': 'fin',
3663 'fj': 'fij',
3664 'fo': 'fao',
3665 'fr': 'fra',
3666 'fy': 'fry',
3667 'ga': 'gle',
3668 'gd': 'gla',
3669 'gl': 'glg',
3670 'gn': 'grn',
3671 'gu': 'guj',
3672 'gv': 'glv',
3673 'ha': 'hau',
3674 'he': 'heb',
b7acc835 3675 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
3676 'hi': 'hin',
3677 'ho': 'hmo',
3678 'hr': 'hrv',
3679 'ht': 'hat',
3680 'hu': 'hun',
3681 'hy': 'hye',
3682 'hz': 'her',
3683 'ia': 'ina',
3684 'id': 'ind',
b7acc835 3685 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
3686 'ie': 'ile',
3687 'ig': 'ibo',
3688 'ii': 'iii',
3689 'ik': 'ipk',
3690 'io': 'ido',
3691 'is': 'isl',
3692 'it': 'ita',
3693 'iu': 'iku',
3694 'ja': 'jpn',
3695 'jv': 'jav',
3696 'ka': 'kat',
3697 'kg': 'kon',
3698 'ki': 'kik',
3699 'kj': 'kua',
3700 'kk': 'kaz',
3701 'kl': 'kal',
3702 'km': 'khm',
3703 'kn': 'kan',
3704 'ko': 'kor',
3705 'kr': 'kau',
3706 'ks': 'kas',
3707 'ku': 'kur',
3708 'kv': 'kom',
3709 'kw': 'cor',
3710 'ky': 'kir',
3711 'la': 'lat',
3712 'lb': 'ltz',
3713 'lg': 'lug',
3714 'li': 'lim',
3715 'ln': 'lin',
3716 'lo': 'lao',
3717 'lt': 'lit',
3718 'lu': 'lub',
3719 'lv': 'lav',
3720 'mg': 'mlg',
3721 'mh': 'mah',
3722 'mi': 'mri',
3723 'mk': 'mkd',
3724 'ml': 'mal',
3725 'mn': 'mon',
3726 'mr': 'mar',
3727 'ms': 'msa',
3728 'mt': 'mlt',
3729 'my': 'mya',
3730 'na': 'nau',
3731 'nb': 'nob',
3732 'nd': 'nde',
3733 'ne': 'nep',
3734 'ng': 'ndo',
3735 'nl': 'nld',
3736 'nn': 'nno',
3737 'no': 'nor',
3738 'nr': 'nbl',
3739 'nv': 'nav',
3740 'ny': 'nya',
3741 'oc': 'oci',
3742 'oj': 'oji',
3743 'om': 'orm',
3744 'or': 'ori',
3745 'os': 'oss',
3746 'pa': 'pan',
7bcd4813 3747 'pe': 'per',
39672624
YCH
3748 'pi': 'pli',
3749 'pl': 'pol',
3750 'ps': 'pus',
3751 'pt': 'por',
3752 'qu': 'que',
3753 'rm': 'roh',
3754 'rn': 'run',
3755 'ro': 'ron',
3756 'ru': 'rus',
3757 'rw': 'kin',
3758 'sa': 'san',
3759 'sc': 'srd',
3760 'sd': 'snd',
3761 'se': 'sme',
3762 'sg': 'sag',
3763 'si': 'sin',
3764 'sk': 'slk',
3765 'sl': 'slv',
3766 'sm': 'smo',
3767 'sn': 'sna',
3768 'so': 'som',
3769 'sq': 'sqi',
3770 'sr': 'srp',
3771 'ss': 'ssw',
3772 'st': 'sot',
3773 'su': 'sun',
3774 'sv': 'swe',
3775 'sw': 'swa',
3776 'ta': 'tam',
3777 'te': 'tel',
3778 'tg': 'tgk',
3779 'th': 'tha',
3780 'ti': 'tir',
3781 'tk': 'tuk',
3782 'tl': 'tgl',
3783 'tn': 'tsn',
3784 'to': 'ton',
3785 'tr': 'tur',
3786 'ts': 'tso',
3787 'tt': 'tat',
3788 'tw': 'twi',
3789 'ty': 'tah',
3790 'ug': 'uig',
3791 'uk': 'ukr',
3792 'ur': 'urd',
3793 'uz': 'uzb',
3794 've': 'ven',
3795 'vi': 'vie',
3796 'vo': 'vol',
3797 'wa': 'wln',
3798 'wo': 'wol',
3799 'xh': 'xho',
3800 'yi': 'yid',
e9a50fba 3801 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
3802 'yo': 'yor',
3803 'za': 'zha',
3804 'zh': 'zho',
3805 'zu': 'zul',
3806 }
3807
3808 @classmethod
3809 def short2long(cls, code):
3810 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3811 return cls._lang_map.get(code[:2])
3812
3813 @classmethod
3814 def long2short(cls, code):
3815 """Convert language code from ISO 639-2/T to ISO 639-1"""
3816 for short_name, long_name in cls._lang_map.items():
3817 if long_name == code:
3818 return short_name
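    # Usage sketch (illustrative comment):
    #   >>> ISO639Utils.short2long('en')
    #   'eng'
    #   >>> ISO639Utils.long2short('deu')
    #   'de'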
3819
3820
86e5f3ed 3821class ISO3166Utils:
4eb10f66
YCH
3822 # From http://data.okfn.org/data/core/country-list
3823 _country_map = {
3824 'AF': 'Afghanistan',
3825 'AX': 'Åland Islands',
3826 'AL': 'Albania',
3827 'DZ': 'Algeria',
3828 'AS': 'American Samoa',
3829 'AD': 'Andorra',
3830 'AO': 'Angola',
3831 'AI': 'Anguilla',
3832 'AQ': 'Antarctica',
3833 'AG': 'Antigua and Barbuda',
3834 'AR': 'Argentina',
3835 'AM': 'Armenia',
3836 'AW': 'Aruba',
3837 'AU': 'Australia',
3838 'AT': 'Austria',
3839 'AZ': 'Azerbaijan',
3840 'BS': 'Bahamas',
3841 'BH': 'Bahrain',
3842 'BD': 'Bangladesh',
3843 'BB': 'Barbados',
3844 'BY': 'Belarus',
3845 'BE': 'Belgium',
3846 'BZ': 'Belize',
3847 'BJ': 'Benin',
3848 'BM': 'Bermuda',
3849 'BT': 'Bhutan',
3850 'BO': 'Bolivia, Plurinational State of',
3851 'BQ': 'Bonaire, Sint Eustatius and Saba',
3852 'BA': 'Bosnia and Herzegovina',
3853 'BW': 'Botswana',
3854 'BV': 'Bouvet Island',
3855 'BR': 'Brazil',
3856 'IO': 'British Indian Ocean Territory',
3857 'BN': 'Brunei Darussalam',
3858 'BG': 'Bulgaria',
3859 'BF': 'Burkina Faso',
3860 'BI': 'Burundi',
3861 'KH': 'Cambodia',
3862 'CM': 'Cameroon',
3863 'CA': 'Canada',
3864 'CV': 'Cape Verde',
3865 'KY': 'Cayman Islands',
3866 'CF': 'Central African Republic',
3867 'TD': 'Chad',
3868 'CL': 'Chile',
3869 'CN': 'China',
3870 'CX': 'Christmas Island',
3871 'CC': 'Cocos (Keeling) Islands',
3872 'CO': 'Colombia',
3873 'KM': 'Comoros',
3874 'CG': 'Congo',
3875 'CD': 'Congo, the Democratic Republic of the',
3876 'CK': 'Cook Islands',
3877 'CR': 'Costa Rica',
3878 'CI': 'Côte d\'Ivoire',
3879 'HR': 'Croatia',
3880 'CU': 'Cuba',
3881 'CW': 'Curaçao',
3882 'CY': 'Cyprus',
3883 'CZ': 'Czech Republic',
3884 'DK': 'Denmark',
3885 'DJ': 'Djibouti',
3886 'DM': 'Dominica',
3887 'DO': 'Dominican Republic',
3888 'EC': 'Ecuador',
3889 'EG': 'Egypt',
3890 'SV': 'El Salvador',
3891 'GQ': 'Equatorial Guinea',
3892 'ER': 'Eritrea',
3893 'EE': 'Estonia',
3894 'ET': 'Ethiopia',
3895 'FK': 'Falkland Islands (Malvinas)',
3896 'FO': 'Faroe Islands',
3897 'FJ': 'Fiji',
3898 'FI': 'Finland',
3899 'FR': 'France',
3900 'GF': 'French Guiana',
3901 'PF': 'French Polynesia',
3902 'TF': 'French Southern Territories',
3903 'GA': 'Gabon',
3904 'GM': 'Gambia',
3905 'GE': 'Georgia',
3906 'DE': 'Germany',
3907 'GH': 'Ghana',
3908 'GI': 'Gibraltar',
3909 'GR': 'Greece',
3910 'GL': 'Greenland',
3911 'GD': 'Grenada',
3912 'GP': 'Guadeloupe',
3913 'GU': 'Guam',
3914 'GT': 'Guatemala',
3915 'GG': 'Guernsey',
3916 'GN': 'Guinea',
3917 'GW': 'Guinea-Bissau',
3918 'GY': 'Guyana',
3919 'HT': 'Haiti',
3920 'HM': 'Heard Island and McDonald Islands',
3921 'VA': 'Holy See (Vatican City State)',
3922 'HN': 'Honduras',
3923 'HK': 'Hong Kong',
3924 'HU': 'Hungary',
3925 'IS': 'Iceland',
3926 'IN': 'India',
3927 'ID': 'Indonesia',
3928 'IR': 'Iran, Islamic Republic of',
3929 'IQ': 'Iraq',
3930 'IE': 'Ireland',
3931 'IM': 'Isle of Man',
3932 'IL': 'Israel',
3933 'IT': 'Italy',
3934 'JM': 'Jamaica',
3935 'JP': 'Japan',
3936 'JE': 'Jersey',
3937 'JO': 'Jordan',
3938 'KZ': 'Kazakhstan',
3939 'KE': 'Kenya',
3940 'KI': 'Kiribati',
3941 'KP': 'Korea, Democratic People\'s Republic of',
3942 'KR': 'Korea, Republic of',
3943 'KW': 'Kuwait',
3944 'KG': 'Kyrgyzstan',
3945 'LA': 'Lao People\'s Democratic Republic',
3946 'LV': 'Latvia',
3947 'LB': 'Lebanon',
3948 'LS': 'Lesotho',
3949 'LR': 'Liberia',
3950 'LY': 'Libya',
3951 'LI': 'Liechtenstein',
3952 'LT': 'Lithuania',
3953 'LU': 'Luxembourg',
3954 'MO': 'Macao',
3955 'MK': 'Macedonia, the Former Yugoslav Republic of',
3956 'MG': 'Madagascar',
3957 'MW': 'Malawi',
3958 'MY': 'Malaysia',
3959 'MV': 'Maldives',
3960 'ML': 'Mali',
3961 'MT': 'Malta',
3962 'MH': 'Marshall Islands',
3963 'MQ': 'Martinique',
3964 'MR': 'Mauritania',
3965 'MU': 'Mauritius',
3966 'YT': 'Mayotte',
3967 'MX': 'Mexico',
3968 'FM': 'Micronesia, Federated States of',
3969 'MD': 'Moldova, Republic of',
3970 'MC': 'Monaco',
3971 'MN': 'Mongolia',
3972 'ME': 'Montenegro',
3973 'MS': 'Montserrat',
3974 'MA': 'Morocco',
3975 'MZ': 'Mozambique',
3976 'MM': 'Myanmar',
3977 'NA': 'Namibia',
3978 'NR': 'Nauru',
3979 'NP': 'Nepal',
3980 'NL': 'Netherlands',
3981 'NC': 'New Caledonia',
3982 'NZ': 'New Zealand',
3983 'NI': 'Nicaragua',
3984 'NE': 'Niger',
3985 'NG': 'Nigeria',
3986 'NU': 'Niue',
3987 'NF': 'Norfolk Island',
3988 'MP': 'Northern Mariana Islands',
3989 'NO': 'Norway',
3990 'OM': 'Oman',
3991 'PK': 'Pakistan',
3992 'PW': 'Palau',
3993 'PS': 'Palestine, State of',
3994 'PA': 'Panama',
3995 'PG': 'Papua New Guinea',
3996 'PY': 'Paraguay',
3997 'PE': 'Peru',
3998 'PH': 'Philippines',
3999 'PN': 'Pitcairn',
4000 'PL': 'Poland',
4001 'PT': 'Portugal',
4002 'PR': 'Puerto Rico',
4003 'QA': 'Qatar',
4004 'RE': 'Réunion',
4005 'RO': 'Romania',
4006 'RU': 'Russian Federation',
4007 'RW': 'Rwanda',
4008 'BL': 'Saint Barthélemy',
4009 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4010 'KN': 'Saint Kitts and Nevis',
4011 'LC': 'Saint Lucia',
4012 'MF': 'Saint Martin (French part)',
4013 'PM': 'Saint Pierre and Miquelon',
4014 'VC': 'Saint Vincent and the Grenadines',
4015 'WS': 'Samoa',
4016 'SM': 'San Marino',
4017 'ST': 'Sao Tome and Principe',
4018 'SA': 'Saudi Arabia',
4019 'SN': 'Senegal',
4020 'RS': 'Serbia',
4021 'SC': 'Seychelles',
4022 'SL': 'Sierra Leone',
4023 'SG': 'Singapore',
4024 'SX': 'Sint Maarten (Dutch part)',
4025 'SK': 'Slovakia',
4026 'SI': 'Slovenia',
4027 'SB': 'Solomon Islands',
4028 'SO': 'Somalia',
4029 'ZA': 'South Africa',
4030 'GS': 'South Georgia and the South Sandwich Islands',
4031 'SS': 'South Sudan',
4032 'ES': 'Spain',
4033 'LK': 'Sri Lanka',
4034 'SD': 'Sudan',
4035 'SR': 'Suriname',
4036 'SJ': 'Svalbard and Jan Mayen',
4037 'SZ': 'Swaziland',
4038 'SE': 'Sweden',
4039 'CH': 'Switzerland',
4040 'SY': 'Syrian Arab Republic',
4041 'TW': 'Taiwan, Province of China',
4042 'TJ': 'Tajikistan',
4043 'TZ': 'Tanzania, United Republic of',
4044 'TH': 'Thailand',
4045 'TL': 'Timor-Leste',
4046 'TG': 'Togo',
4047 'TK': 'Tokelau',
4048 'TO': 'Tonga',
4049 'TT': 'Trinidad and Tobago',
4050 'TN': 'Tunisia',
4051 'TR': 'Turkey',
4052 'TM': 'Turkmenistan',
4053 'TC': 'Turks and Caicos Islands',
4054 'TV': 'Tuvalu',
4055 'UG': 'Uganda',
4056 'UA': 'Ukraine',
4057 'AE': 'United Arab Emirates',
4058 'GB': 'United Kingdom',
4059 'US': 'United States',
4060 'UM': 'United States Minor Outlying Islands',
4061 'UY': 'Uruguay',
4062 'UZ': 'Uzbekistan',
4063 'VU': 'Vanuatu',
4064 'VE': 'Venezuela, Bolivarian Republic of',
4065 'VN': 'Viet Nam',
4066 'VG': 'Virgin Islands, British',
4067 'VI': 'Virgin Islands, U.S.',
4068 'WF': 'Wallis and Futuna',
4069 'EH': 'Western Sahara',
4070 'YE': 'Yemen',
4071 'ZM': 'Zambia',
4072 'ZW': 'Zimbabwe',
2f97cc61 4073 # Not ISO 3166 codes, but used for IP blocks
4074 'AP': 'Asia/Pacific Region',
4075 'EU': 'Europe',
4eb10f66
YCH
4076 }
4077
4078 @classmethod
4079 def short2full(cls, code):
4080 """Convert an ISO 3166-2 country code to the corresponding full name"""
4081 return cls._country_map.get(code.upper())
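    # Usage sketch (illustrative comment):
    #   >>> ISO3166Utils.short2full('us')
    #   'United States'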
4082
4083
86e5f3ed 4084class GeoUtils:
773f291d
S
4085 # Major IPv4 address blocks per country
4086 _country_ip_map = {
53896ca5 4087 'AD': '46.172.224.0/19',
773f291d
S
4088 'AE': '94.200.0.0/13',
4089 'AF': '149.54.0.0/17',
4090 'AG': '209.59.64.0/18',
4091 'AI': '204.14.248.0/21',
4092 'AL': '46.99.0.0/16',
4093 'AM': '46.70.0.0/15',
4094 'AO': '105.168.0.0/13',
53896ca5
S
4095 'AP': '182.50.184.0/21',
4096 'AQ': '23.154.160.0/24',
773f291d
S
4097 'AR': '181.0.0.0/12',
4098 'AS': '202.70.112.0/20',
53896ca5 4099 'AT': '77.116.0.0/14',
773f291d
S
4100 'AU': '1.128.0.0/11',
4101 'AW': '181.41.0.0/18',
53896ca5
S
4102 'AX': '185.217.4.0/22',
4103 'AZ': '5.197.0.0/16',
773f291d
S
4104 'BA': '31.176.128.0/17',
4105 'BB': '65.48.128.0/17',
4106 'BD': '114.130.0.0/16',
4107 'BE': '57.0.0.0/8',
53896ca5 4108 'BF': '102.178.0.0/15',
773f291d
S
4109 'BG': '95.42.0.0/15',
4110 'BH': '37.131.0.0/17',
4111 'BI': '154.117.192.0/18',
4112 'BJ': '137.255.0.0/16',
53896ca5 4113 'BL': '185.212.72.0/23',
773f291d
S
4114 'BM': '196.12.64.0/18',
4115 'BN': '156.31.0.0/16',
4116 'BO': '161.56.0.0/16',
4117 'BQ': '161.0.80.0/20',
53896ca5 4118 'BR': '191.128.0.0/12',
773f291d
S
4119 'BS': '24.51.64.0/18',
4120 'BT': '119.2.96.0/19',
4121 'BW': '168.167.0.0/16',
4122 'BY': '178.120.0.0/13',
4123 'BZ': '179.42.192.0/18',
4124 'CA': '99.224.0.0/11',
4125 'CD': '41.243.0.0/16',
53896ca5
S
4126 'CF': '197.242.176.0/21',
4127 'CG': '160.113.0.0/16',
773f291d 4128 'CH': '85.0.0.0/13',
53896ca5 4129 'CI': '102.136.0.0/14',
773f291d
S
4130 'CK': '202.65.32.0/19',
4131 'CL': '152.172.0.0/14',
53896ca5 4132 'CM': '102.244.0.0/14',
773f291d
S
4133 'CN': '36.128.0.0/10',
4134 'CO': '181.240.0.0/12',
4135 'CR': '201.192.0.0/12',
4136 'CU': '152.206.0.0/15',
4137 'CV': '165.90.96.0/19',
4138 'CW': '190.88.128.0/17',
53896ca5 4139 'CY': '31.153.0.0/16',
773f291d
S
4140 'CZ': '88.100.0.0/14',
4141 'DE': '53.0.0.0/8',
4142 'DJ': '197.241.0.0/17',
4143 'DK': '87.48.0.0/12',
4144 'DM': '192.243.48.0/20',
4145 'DO': '152.166.0.0/15',
4146 'DZ': '41.96.0.0/12',
4147 'EC': '186.68.0.0/15',
4148 'EE': '90.190.0.0/15',
4149 'EG': '156.160.0.0/11',
4150 'ER': '196.200.96.0/20',
4151 'ES': '88.0.0.0/11',
4152 'ET': '196.188.0.0/14',
4153 'EU': '2.16.0.0/13',
4154 'FI': '91.152.0.0/13',
4155 'FJ': '144.120.0.0/16',
53896ca5 4156 'FK': '80.73.208.0/21',
773f291d
S
4157 'FM': '119.252.112.0/20',
4158 'FO': '88.85.32.0/19',
4159 'FR': '90.0.0.0/9',
4160 'GA': '41.158.0.0/15',
4161 'GB': '25.0.0.0/8',
4162 'GD': '74.122.88.0/21',
4163 'GE': '31.146.0.0/16',
4164 'GF': '161.22.64.0/18',
4165 'GG': '62.68.160.0/19',
53896ca5
S
4166 'GH': '154.160.0.0/12',
4167 'GI': '95.164.0.0/16',
773f291d
S
4168 'GL': '88.83.0.0/19',
4169 'GM': '160.182.0.0/15',
4170 'GN': '197.149.192.0/18',
4171 'GP': '104.250.0.0/19',
4172 'GQ': '105.235.224.0/20',
4173 'GR': '94.64.0.0/13',
4174 'GT': '168.234.0.0/16',
4175 'GU': '168.123.0.0/16',
4176 'GW': '197.214.80.0/20',
4177 'GY': '181.41.64.0/18',
4178 'HK': '113.252.0.0/14',
4179 'HN': '181.210.0.0/16',
4180 'HR': '93.136.0.0/13',
4181 'HT': '148.102.128.0/17',
4182 'HU': '84.0.0.0/14',
4183 'ID': '39.192.0.0/10',
4184 'IE': '87.32.0.0/12',
4185 'IL': '79.176.0.0/13',
4186 'IM': '5.62.80.0/20',
4187 'IN': '117.192.0.0/10',
4188 'IO': '203.83.48.0/21',
4189 'IQ': '37.236.0.0/14',
4190 'IR': '2.176.0.0/12',
4191 'IS': '82.221.0.0/16',
4192 'IT': '79.0.0.0/10',
4193 'JE': '87.244.64.0/18',
4194 'JM': '72.27.0.0/17',
4195 'JO': '176.29.0.0/16',
53896ca5 4196 'JP': '133.0.0.0/8',
773f291d
S
4197 'KE': '105.48.0.0/12',
4198 'KG': '158.181.128.0/17',
4199 'KH': '36.37.128.0/17',
4200 'KI': '103.25.140.0/22',
4201 'KM': '197.255.224.0/20',
53896ca5 4202 'KN': '198.167.192.0/19',
773f291d
S
4203 'KP': '175.45.176.0/22',
4204 'KR': '175.192.0.0/10',
4205 'KW': '37.36.0.0/14',
4206 'KY': '64.96.0.0/15',
4207 'KZ': '2.72.0.0/13',
4208 'LA': '115.84.64.0/18',
4209 'LB': '178.135.0.0/16',
53896ca5 4210 'LC': '24.92.144.0/20',
773f291d
S
4211 'LI': '82.117.0.0/19',
4212 'LK': '112.134.0.0/15',
53896ca5 4213 'LR': '102.183.0.0/16',
773f291d
S
4214 'LS': '129.232.0.0/17',
4215 'LT': '78.56.0.0/13',
4216 'LU': '188.42.0.0/16',
4217 'LV': '46.109.0.0/16',
4218 'LY': '41.252.0.0/14',
4219 'MA': '105.128.0.0/11',
4220 'MC': '88.209.64.0/18',
4221 'MD': '37.246.0.0/16',
4222 'ME': '178.175.0.0/17',
4223 'MF': '74.112.232.0/21',
4224 'MG': '154.126.0.0/17',
4225 'MH': '117.103.88.0/21',
4226 'MK': '77.28.0.0/15',
4227 'ML': '154.118.128.0/18',
4228 'MM': '37.111.0.0/17',
4229 'MN': '49.0.128.0/17',
4230 'MO': '60.246.0.0/16',
4231 'MP': '202.88.64.0/20',
4232 'MQ': '109.203.224.0/19',
4233 'MR': '41.188.64.0/18',
4234 'MS': '208.90.112.0/22',
4235 'MT': '46.11.0.0/16',
4236 'MU': '105.16.0.0/12',
4237 'MV': '27.114.128.0/18',
53896ca5 4238 'MW': '102.70.0.0/15',
773f291d
S
4239 'MX': '187.192.0.0/11',
4240 'MY': '175.136.0.0/13',
4241 'MZ': '197.218.0.0/15',
4242 'NA': '41.182.0.0/16',
4243 'NC': '101.101.0.0/18',
4244 'NE': '197.214.0.0/18',
4245 'NF': '203.17.240.0/22',
4246 'NG': '105.112.0.0/12',
4247 'NI': '186.76.0.0/15',
4248 'NL': '145.96.0.0/11',
4249 'NO': '84.208.0.0/13',
4250 'NP': '36.252.0.0/15',
4251 'NR': '203.98.224.0/19',
4252 'NU': '49.156.48.0/22',
4253 'NZ': '49.224.0.0/14',
4254 'OM': '5.36.0.0/15',
4255 'PA': '186.72.0.0/15',
4256 'PE': '186.160.0.0/14',
4257 'PF': '123.50.64.0/18',
4258 'PG': '124.240.192.0/19',
4259 'PH': '49.144.0.0/13',
4260 'PK': '39.32.0.0/11',
4261 'PL': '83.0.0.0/11',
4262 'PM': '70.36.0.0/20',
4263 'PR': '66.50.0.0/16',
4264 'PS': '188.161.0.0/16',
4265 'PT': '85.240.0.0/13',
4266 'PW': '202.124.224.0/20',
4267 'PY': '181.120.0.0/14',
4268 'QA': '37.210.0.0/15',
53896ca5 4269 'RE': '102.35.0.0/16',
773f291d 4270 'RO': '79.112.0.0/13',
53896ca5 4271 'RS': '93.86.0.0/15',
773f291d 4272 'RU': '5.136.0.0/13',
53896ca5 4273 'RW': '41.186.0.0/16',
773f291d
S
4274 'SA': '188.48.0.0/13',
4275 'SB': '202.1.160.0/19',
4276 'SC': '154.192.0.0/11',
53896ca5 4277 'SD': '102.120.0.0/13',
773f291d 4278 'SE': '78.64.0.0/12',
53896ca5 4279 'SG': '8.128.0.0/10',
773f291d
S
4280 'SI': '188.196.0.0/14',
4281 'SK': '78.98.0.0/15',
53896ca5 4282 'SL': '102.143.0.0/17',
773f291d
S
4283 'SM': '89.186.32.0/19',
4284 'SN': '41.82.0.0/15',
53896ca5 4285 'SO': '154.115.192.0/18',
773f291d
S
4286 'SR': '186.179.128.0/17',
4287 'SS': '105.235.208.0/21',
4288 'ST': '197.159.160.0/19',
4289 'SV': '168.243.0.0/16',
4290 'SX': '190.102.0.0/20',
4291 'SY': '5.0.0.0/16',
4292 'SZ': '41.84.224.0/19',
4293 'TC': '65.255.48.0/20',
4294 'TD': '154.68.128.0/19',
4295 'TG': '196.168.0.0/14',
4296 'TH': '171.96.0.0/13',
4297 'TJ': '85.9.128.0/18',
4298 'TK': '27.96.24.0/21',
4299 'TL': '180.189.160.0/20',
4300 'TM': '95.85.96.0/19',
4301 'TN': '197.0.0.0/11',
4302 'TO': '175.176.144.0/21',
4303 'TR': '78.160.0.0/11',
4304 'TT': '186.44.0.0/15',
4305 'TV': '202.2.96.0/19',
4306 'TW': '120.96.0.0/11',
4307 'TZ': '156.156.0.0/14',
53896ca5
S
4308 'UA': '37.52.0.0/14',
4309 'UG': '102.80.0.0/13',
4310 'US': '6.0.0.0/8',
773f291d 4311 'UY': '167.56.0.0/13',
53896ca5 4312 'UZ': '84.54.64.0/18',
773f291d 4313 'VA': '212.77.0.0/19',
53896ca5 4314 'VC': '207.191.240.0/21',
773f291d 4315 'VE': '186.88.0.0/13',
53896ca5 4316 'VG': '66.81.192.0/20',
773f291d
S
4317 'VI': '146.226.0.0/16',
4318 'VN': '14.160.0.0/11',
4319 'VU': '202.80.32.0/20',
4320 'WF': '117.20.32.0/21',
4321 'WS': '202.4.32.0/19',
4322 'YE': '134.35.0.0/16',
4323 'YT': '41.242.116.0/22',
4324 'ZA': '41.0.0.0/11',
53896ca5
S
4325 'ZM': '102.144.0.0/13',
4326 'ZW': '102.177.192.0/18',
773f291d
S
4327 }
4328
4329 @classmethod
5f95927a
S
4330 def random_ipv4(cls, code_or_block):
4331 if len(code_or_block) == 2:
4332 block = cls._country_ip_map.get(code_or_block.upper())
4333 if not block:
4334 return None
4335 else:
4336 block = code_or_block
773f291d 4337 addr, preflen = block.split('/')
ac668111 4338 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4339 addr_max = addr_min | (0xffffffff >> int(preflen))
14f25df2 4340 return str(socket.inet_ntoa(
ac668111 4341 struct.pack('!L', random.randint(addr_min, addr_max))))
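    # Usage sketch (illustrative comment; the result is random but stays inside the
    # country's address block, e.g. 53.0.0.0/8 for DE):
    #   >>> GeoUtils.random_ipv4('DE').startswith('53.')
    #   True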
773f291d
S
4342
4343
0a5445dd
YCH
4344# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4345# released into Public Domain
4346# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4347
4348def long_to_bytes(n, blocksize=0):
4349 """long_to_bytes(n:long, blocksize:int) : string
4350 Convert a long integer to a byte string.
4351
4352 If optional blocksize is given and greater than zero, pad the front of the
4353 byte string with binary zeros so that the length is a multiple of
4354 blocksize.
4355 """
4356 # after much testing, this algorithm was deemed to be the fastest
4357 s = b''
4358 n = int(n)
4359 while n > 0:
ac668111 4360 s = struct.pack('>I', n & 0xffffffff) + s
0a5445dd
YCH
4361 n = n >> 32
4362 # strip off leading zeros
4363 for i in range(len(s)):
4364 if s[i] != b'\000'[0]:
4365 break
4366 else:
4367 # only happens when n == 0
4368 s = b'\000'
4369 i = 0
4370 s = s[i:]
4371 # add back some pad bytes. this could be done more efficiently w.r.t. the
4372 # de-padding being done above, but sigh...
4373 if blocksize > 0 and len(s) % blocksize:
4374 s = (blocksize - len(s) % blocksize) * b'\000' + s
4375 return s
4376
4377
4378def bytes_to_long(s):
4379 """bytes_to_long(string) : long
4380 Convert a byte string to a long integer.
4381
4382 This is (essentially) the inverse of long_to_bytes().
4383 """
4384 acc = 0
4385 length = len(s)
4386 if length % 4:
4387 extra = (4 - length % 4)
4388 s = b'\000' * extra + s
4389 length = length + extra
4390 for i in range(0, length, 4):
ac668111 4391 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
0a5445dd
YCH
4392 return acc
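# Usage sketch (illustrative comment): the two helpers are inverses of each other.
#   >>> long_to_bytes(65537)
#   b'\x01\x00\x01'
#   >>> bytes_to_long(b'\x01\x00\x01')
#   65537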
4393
4394
5bc880b9
YCH
4395def ohdave_rsa_encrypt(data, exponent, modulus):
4396 '''
4397 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4398
4399 Input:
4400 data: data to encrypt, bytes-like object
4401 exponent, modulus: parameter e and N of RSA algorithm, both integer
4402 Output: hex string of encrypted data
4403
4404 Limitation: supports one block encryption only
4405 '''
4406
4407 payload = int(binascii.hexlify(data[::-1]), 16)
4408 encrypted = pow(payload, exponent, modulus)
4409 return '%x' % encrypted
81bdc8fd
YCH
4410
4411
f48409c7
YCH
4412def pkcs1pad(data, length):
4413 """
4414 Padding input data with PKCS#1 scheme
4415
4416 @param {int[]} data input data
4417 @param {int} length target length
4418 @returns {int[]} padded data
4419 """
4420 if len(data) > length - 11:
4421 raise ValueError('Input data too long for PKCS#1 padding')
4422
4423 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4424 return [0, 2] + pseudo_random + [0] + data
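# Usage sketch (illustrative comment): the result starts with the 0x00 0x02 header,
# keeps the zero separator before the payload, and has the requested length.
#   >>> padded = pkcs1pad([0x41, 0x42], 16)
#   >>> len(padded), padded[:2], padded[-3]
#   (16, [0, 2], 0)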
4425
4426
7b2c3f47 4427def _base_n_table(n, table):
4428 if not table and not n:
4429 raise ValueError('Either table or n must be specified')
612f2be5 4430 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4431
44f14eb4 4432 if n and n != len(table):
612f2be5 4433 raise ValueError(f'base {n} exceeds table length {len(table)}')
4434 return table
59f898b7 4435
5eb6bdce 4436
7b2c3f47 4437def encode_base_n(num, n=None, table=None):
4438 """Convert given int to a base-n string"""
612f2be5 4439 table = _base_n_table(n, table)
7b2c3f47 4440 if not num:
5eb6bdce
YCH
4441 return table[0]
4442
7b2c3f47 4443 result, base = '', len(table)
81bdc8fd 4444 while num:
7b2c3f47 4445 result = table[num % base] + result
612f2be5 4446 num = num // base
7b2c3f47 4447 return result
4448
4449
4450def decode_base_n(string, n=None, table=None):
4451 """Convert given base-n string to int"""
4452 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4453 result, base = 0, len(table)
4454 for char in string:
4455 result = result * base + table[char]
4456 return result
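# Usage sketch (illustrative comment):
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> decode_base_n('ff', 16)
#   255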
4457
4458
f52354a8 4459def decode_packed_codes(code):
06b3fe29 4460 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 4461 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
4462 base = int(base)
4463 count = int(count)
4464 symbols = symbols.split('|')
4465 symbol_table = {}
4466
4467 while count:
4468 count -= 1
5eb6bdce 4469 base_n_count = encode_base_n(count, base)
f52354a8
YCH
4470 symbol_table[base_n_count] = symbols[count] or base_n_count
4471
4472 return re.sub(
4473 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 4474 obfuscated_code)
e154c651 4475
4476
1ced2221
S
4477def caesar(s, alphabet, shift):
4478 if shift == 0:
4479 return s
4480 l = len(alphabet)
4481 return ''.join(
4482 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4483 for c in s)
4484
4485
4486def rot47(s):
4487 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
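# Usage sketch (illustrative comment): rot47 applied twice is the identity, since the
# alphabet has 94 characters and the shift is 47.
#   >>> caesar('abcz', 'abcdefghijklmnopqrstuvwxyz', 1)
#   'bcda'
#   >>> rot47(rot47('secret text')) == 'secret text'
#   True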
4488
4489
e154c651 4490def parse_m3u8_attributes(attrib):
4491 info = {}
4492 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4493 if val.startswith('"'):
4494 val = val[1:-1]
4495 info[key] = val
4496 return info
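# Usage sketch (illustrative comment): quoted values may contain commas.
#   >>> parse_m3u8_attributes('BANDWIDTH=800000,CODECS="avc1.4d401e,mp4a.40.2"')
#   {'BANDWIDTH': '800000', 'CODECS': 'avc1.4d401e,mp4a.40.2'}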
1143535d
YCH
4497
4498
4499def urshift(val, n):
4500 return val >> n if val >= 0 else (val + 0x100000000) >> n
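# Usage sketch (illustrative comment): behaves like JavaScript's unsigned right shift
# (>>>) on 32-bit values.
#   >>> urshift(-16, 2)
#   1073741820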
d3f8e038
YCH
4501
4502
efa97bdc 4503def write_xattr(path, key, value):
6f7563be 4504 # Windows: Write xattrs to NTFS Alternate Data Streams:
4505 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4506 if compat_os_name == 'nt':
4507 assert ':' not in key
4508 assert os.path.exists(path)
efa97bdc
YCH
4509
4510 try:
6f7563be 4511 with open(f'{path}:{key}', 'wb') as f:
4512 f.write(value)
86e5f3ed 4513 except OSError as e:
efa97bdc 4514 raise XAttrMetadataError(e.errno, e.strerror)
6f7563be 4515 return
efa97bdc 4516
6f7563be 4517 # UNIX Method 1. Use xattrs/pyxattrs modules
efa97bdc 4518
6f7563be 4519 setxattr = None
4520 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4521 # Unicode arguments are not supported in pyxattr until version 0.5.0
4522 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4523 if version_tuple(xattr.__version__) >= (0, 5, 0):
4524 setxattr = xattr.set
4525 elif xattr:
4526 setxattr = xattr.setxattr
efa97bdc 4527
6f7563be 4528 if setxattr:
4529 try:
4530 setxattr(path, key, value)
4531 except OSError as e:
4532 raise XAttrMetadataError(e.errno, e.strerror)
4533 return
efa97bdc 4534
6f7563be 4535 # UNIX Method 2. Use setfattr/xattr executables
4536 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4537 else 'xattr' if check_executable('xattr', ['-h']) else None)
4538 if not exe:
4539 raise XAttrUnavailableError(
4540 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4541 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
efa97bdc 4542
0f06bcd7 4543 value = value.decode()
6f7563be 4544 try:
f0c9fb96 4545 _, stderr, returncode = Popen.run(
6f7563be 4546 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
e121e3ce 4547 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
6f7563be 4548 except OSError as e:
4549 raise XAttrMetadataError(e.errno, e.strerror)
f0c9fb96 4550 if returncode:
4551 raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
4552
4553
4554def random_birthday(year_field, month_field, day_field):
aa374bc7
AS
4555 start_date = datetime.date(1950, 1, 1)
4556 end_date = datetime.date(1995, 12, 31)
4557 offset = random.randint(0, (end_date - start_date).days)
4558 random_date = start_date + datetime.timedelta(offset)
0c265486 4559 return {
aa374bc7
AS
4560 year_field: str(random_date.year),
4561 month_field: str(random_date.month),
4562 day_field: str(random_date.day),
0c265486 4563 }
732044af 4564
c76eb41b 4565
8c53322c
L
4566def find_available_port(interface=''):
4567 try:
4568 with socket.socket() as sock:
4569 sock.bind((interface, 0))
4570 return sock.getsockname()[1]
4571 except OSError:
4572 return None
4573
4574
732044af 4575# Templates for internet shortcut files, which are plain text files.
e5a998f3 4576DOT_URL_LINK_TEMPLATE = '''\
732044af 4577[InternetShortcut]
4578URL=%(url)s
e5a998f3 4579'''
732044af 4580
e5a998f3 4581DOT_WEBLOC_LINK_TEMPLATE = '''\
732044af 4582<?xml version="1.0" encoding="UTF-8"?>
4583<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4584<plist version="1.0">
4585<dict>
4586\t<key>URL</key>
4587\t<string>%(url)s</string>
4588</dict>
4589</plist>
e5a998f3 4590'''
732044af 4591
e5a998f3 4592DOT_DESKTOP_LINK_TEMPLATE = '''\
732044af 4593[Desktop Entry]
4594Encoding=UTF-8
4595Name=%(filename)s
4596Type=Link
4597URL=%(url)s
4598Icon=text-html
e5a998f3 4599'''
732044af 4600
08438d2c 4601LINK_TEMPLATES = {
4602 'url': DOT_URL_LINK_TEMPLATE,
4603 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4604 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4605}
4606
732044af 4607
4608def iri_to_uri(iri):
4609 """
4610 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4611
4612 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4613 """
4614
14f25df2 4615 iri_parts = urllib.parse.urlparse(iri)
732044af 4616
4617 if '[' in iri_parts.netloc:
4618 raise ValueError('IPv6 URIs are not yet supported.')
4619 # Querying `.netloc` when there's only one bracket also raises a ValueError.
4620
4621 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4622
4623 net_location = ''
4624 if iri_parts.username:
f9934b96 4625 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
732044af 4626 if iri_parts.password is not None:
f9934b96 4627 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
732044af 4628 net_location += '@'
4629
0f06bcd7 4630 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
732044af 4631 # The 'idna' encoding produces ASCII text.
4632 if iri_parts.port is not None and iri_parts.port != 80:
4633 net_location += ':' + str(iri_parts.port)
4634
f9934b96 4635 return urllib.parse.urlunparse(
732044af 4636 (iri_parts.scheme,
4637 net_location,
4638
f9934b96 4639 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
732044af 4640
4641 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
f9934b96 4642 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
732044af 4643
4644 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
f9934b96 4645 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
732044af 4646
f9934b96 4647 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
732044af 4648
4649 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
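# Usage sketch (illustrative comment; the exact percent-encoding assumes the underlying
# UTF-8 encoding described above):
#   >>> iri_to_uri('https://example.com/søk?q=østers')
#   'https://example.com/s%C3%B8k?q=%C3%B8sters'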
4650
4651
4652def to_high_limit_path(path):
4653 if sys.platform in ['win32', 'cygwin']:
4654 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
e5a998f3 4655 return '\\\\?\\' + os.path.abspath(path)
732044af 4656
4657 return path
76d321f6 4658
c76eb41b 4659
7b2c3f47 4660def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
69bec673 4661 val = traversal.traverse_obj(obj, *variadic(field))
6f2287cb 4662 if not val if ignore is NO_DEFAULT else val in variadic(ignore):
e0ddbd02 4663 return default
7b2c3f47 4664 return template % func(val)
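# Usage sketch (illustrative comment):
#   >>> format_field({'width': 1920}, 'width', '%dpx')
#   '1920px'
#   >>> format_field({}, 'width', '%dpx', default='unknown')
#   'unknown'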
00dd0cd5 4665
4666
4667def clean_podcast_url(url):
91302ed3 4668 url = re.sub(r'''(?x)
00dd0cd5 4669 (?:
4670 (?:
4671 chtbl\.com/track|
4672 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
2af4eeb7
MAF
4673 play\.podtrac\.com|
4674 chrt\.fm/track|
4675 mgln\.ai/e
4676 )(?:/[^/.]+)?|
00dd0cd5 4677 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4678 flex\.acast\.com|
4679 pd(?:
4680 cn\.co| # https://podcorn.com/analytics-prefix/
4681 st\.fm # https://podsights.com/docs/
2af4eeb7
MAF
4682 )/e|
4683 [0-9]\.gum\.fm|
4684 pscrb\.fm/rss/p
00dd0cd5 4685 )/''', '', url)
91302ed3 4686 return re.sub(r'^\w+://(\w+://)', r'\1', url)
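# Usage sketch (illustrative comment; the tracking prefix is stripped, leaving the
# direct media URL):
#   >>> clean_podcast_url('https://chtbl.com/track/12345/traffic.example.com/episode.mp3')
#   'https://traffic.example.com/episode.mp3'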
ffcb8191
THD
4687
4688
4689_HEX_TABLE = '0123456789abcdef'
4690
4691
4692def random_uuidv4():
4693 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 4694
4695
4696def make_dir(path, to_screen=None):
4697 try:
4698 dn = os.path.dirname(path)
b25d6cb9
AI
4699 if dn:
4700 os.makedirs(dn, exist_ok=True)
0202b52a 4701 return True
86e5f3ed 4702 except OSError as err:
0202b52a 4703 if callable(to_screen):
69bec673 4704 to_screen(f'unable to create directory: {err}')
0202b52a 4705 return False
f74980cb 4706
4707
4708def get_executable_path():
69bec673 4709 from ..update import _get_variant_and_executable_path
c487cf00 4710
b5899f4f 4711 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
f74980cb 4712
4713
8e40b9d1 4714def get_user_config_dirs(package_name):
8e40b9d1
M
4715 # .config (e.g. ~/.config/package_name)
4716 xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
773c272d 4717 yield os.path.join(xdg_config_home, package_name)
8e40b9d1
M
4718
4719 # appdata (%APPDATA%/package_name)
4720 appdata_dir = os.getenv('appdata')
4721 if appdata_dir:
773c272d 4722 yield os.path.join(appdata_dir, package_name)
8e40b9d1
M
4723
4724 # home (~/.package_name)
773c272d 4725 yield os.path.join(compat_expanduser('~'), f'.{package_name}')
8e40b9d1
M
4726
4727
4728def get_system_config_dirs(package_name):
8e40b9d1 4729 # /etc/package_name
773c272d 4730 yield os.path.join('/etc', package_name)
06167fbb 4731
4732
3e9b66d7 4733def time_seconds(**kwargs):
83c4970e
L
4734 """
4735 Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
4736 """
4737 return time.time() + datetime.timedelta(**kwargs).total_seconds()
3e9b66d7
LNO
4738
4739
49fa4d9a
N
4740# Create a JSON Web Signature (JWS) with the HS256 algorithm.
4741# The resulting format is JWS Compact Serialization,
4742# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
4743# and JWS https://www.rfc-editor.org/rfc/rfc7515.html
4744def jwt_encode_hs256(payload_data, key, headers={}):
4745 header_data = {
4746 'alg': 'HS256',
4747 'typ': 'JWT',
4748 }
4749 if headers:
4750 header_data.update(headers)
0f06bcd7 4751 header_b64 = base64.b64encode(json.dumps(header_data).encode())
4752 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
4753 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
49fa4d9a
N
4754 signature_b64 = base64.b64encode(h.digest())
4755 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
4756 return token
819e0531 4757
4758
16b0d7e6 4759# Can be extended in the future to verify the signature, parse the header, and return the algorithm used if it's not HS256
4760def jwt_decode_hs256(jwt):
4761 header_b64, payload_b64, signature_b64 = jwt.split('.')
2c98d998 4762 # add trailing ='s that may have been stripped, superfluous ='s are ignored
4763 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
16b0d7e6 4764 return payload_data
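# Usage sketch (illustrative comment): encoding and decoding round-trip the payload.
#   >>> token = jwt_encode_hs256({'uid': 123}, 'secret-key').decode()
#   >>> jwt_decode_hs256(token)
#   {'uid': 123}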
4765
4766
53973b4d 4767WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
4768
4769
7a32c70d 4770@functools.cache
819e0531 4771def supports_terminal_sequences(stream):
4772 if compat_os_name == 'nt':
8a82af35 4773 if not WINDOWS_VT_MODE:
819e0531 4774 return False
4775 elif not os.getenv('TERM'):
4776 return False
4777 try:
4778 return stream.isatty()
4779 except BaseException:
4780 return False
4781
4782
c53a18f0 4783def windows_enable_vt_mode():
4784 """Ref: https://bugs.python.org/issue30075 """
8a82af35 4785 if get_windows_version() < (10, 0, 10586):
53973b4d 4786 return
53973b4d 4787
c53a18f0 4788 import ctypes
4789 import ctypes.wintypes
4790 import msvcrt
4791
4792 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
4793
4794 dll = ctypes.WinDLL('kernel32', use_last_error=False)
4795 handle = os.open('CONOUT$', os.O_RDWR)
c53a18f0 4796 try:
4797 h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
4798 dw_original_mode = ctypes.wintypes.DWORD()
4799 success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
4800 if not success:
4801 raise Exception('GetConsoleMode failed')
4802
4803 success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
4804 dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
4805 if not success:
4806 raise Exception('SetConsoleMode failed')
c53a18f0 4807 finally:
4808 os.close(handle)
53973b4d 4809
f0795149 4810 global WINDOWS_VT_MODE
4811 WINDOWS_VT_MODE = True
4812 supports_terminal_sequences.cache_clear()
4813
53973b4d 4814
ec11a9f4 4815_terminal_sequences_re = re.compile('\033\\[[^m]+m')
4816
4817
4818def remove_terminal_sequences(string):
4819 return _terminal_sequences_re.sub('', string)
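# Usage sketch (illustrative comment):
#   >>> remove_terminal_sequences('\033[0;32mOK\033[0m')
#   'OK'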
4820
4821
4822def number_of_digits(number):
4823 return len('%d' % number)
34921b43 4824
4825
4826def join_nonempty(*values, delim='-', from_dict=None):
4827 if from_dict is not None:
69bec673 4828 values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
34921b43 4829 return delim.join(map(str, filter(None, values)))
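# Usage sketch (illustrative comment): None and other falsy values are dropped.
#   >>> join_nonempty('mp4', None, 1080, '', 'dash')
#   'mp4-1080-dash'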
06e57990 4830
4831
27231526
ZM
4832def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
4833 """
4834 Find the largest format dimensions in terms of video width and, for each thumbnail:
4835 * Modify the URL: Match the width with the provided regex and replace with the former width
4836 * Update dimensions
4837
4838 This function is useful with video services that scale the provided thumbnails on demand
4839 """
4840 _keys = ('width', 'height')
4841 max_dimensions = max(
86e5f3ed 4842 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
27231526
ZM
4843 default=(0, 0))
4844 if not max_dimensions[0]:
4845 return thumbnails
4846 return [
4847 merge_dicts(
4848 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
4849 dict(zip(_keys, max_dimensions)), thumbnail)
4850 for thumbnail in thumbnails
4851 ]
4852
4853
93c8410d
LNO
4854def parse_http_range(range):
4855 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
4856 if not range:
4857 return None, None, None
4858 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
4859 if not crg:
4860 return None, None, None
4861 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
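# Usage sketch (illustrative comment): works for both "Range" and "Content-Range" values.
#   >>> parse_http_range('bytes 0-499/1234')
#   (0, 499, 1234)
#   >>> parse_http_range('bytes=500-')
#   (500, None, None)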
4862
4863
6b9e832d 4864def read_stdin(what):
4865 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
4866 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
4867 return sys.stdin
4868
4869
a904a7f8
L
4870def determine_file_encoding(data):
4871 """
88f60feb 4872 Detect the text encoding used
a904a7f8
L
4873 @returns (encoding, bytes to skip)
4874 """
4875
88f60feb 4876 # BOM marks are given priority over declarations
a904a7f8 4877 for bom, enc in BOMS:
4878 if data.startswith(bom):
4879 return enc, len(bom)
4880
88f60feb 4881 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
4882 # We ignore the endianness to get a good enough match
a904a7f8 4883 data = data.replace(b'\0', b'')
88f60feb 4884 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
4885 return mobj.group(1).decode() if mobj else None, 0
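# Usage sketch (illustrative comment): a BOM wins over an in-file coding declaration.
#   >>> determine_file_encoding(b'\xef\xbb\xbf-v\n')
#   ('utf-8', 3)
#   >>> determine_file_encoding(b'# coding: latin-1\n--proxy http://host\n')
#   ('latin-1', 0)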
a904a7f8
L
4886
4887
06e57990 4888class Config:
4889 own_args = None
9e491463 4890 parsed_args = None
06e57990 4891 filename = None
4892 __initialized = False
4893
4894 def __init__(self, parser, label=None):
9e491463 4895 self.parser, self.label = parser, label
06e57990 4896 self._loaded_paths, self.configs = set(), []
4897
4898 def init(self, args=None, filename=None):
4899 assert not self.__initialized
284a60c5 4900 self.own_args, self.filename = args, filename
4901 return self.load_configs()
4902
4903 def load_configs(self):
65662dff 4904 directory = ''
284a60c5 4905 if self.filename:
4906 location = os.path.realpath(self.filename)
65662dff 4907 directory = os.path.dirname(location)
06e57990 4908 if location in self._loaded_paths:
4909 return False
4910 self._loaded_paths.add(location)
4911
284a60c5 4912 self.__initialized = True
4913 opts, _ = self.parser.parse_known_args(self.own_args)
4914 self.parsed_args = self.own_args
9e491463 4915 for location in opts.config_locations or []:
6b9e832d 4916 if location == '-':
1060f82f 4917 if location in self._loaded_paths:
4918 continue
4919 self._loaded_paths.add(location)
6b9e832d 4920 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
4921 continue
65662dff 4922 location = os.path.join(directory, expand_path(location))
06e57990 4923 if os.path.isdir(location):
4924 location = os.path.join(location, 'yt-dlp.conf')
4925 if not os.path.exists(location):
9e491463 4926 self.parser.error(f'config location {location} does not exist')
06e57990 4927 self.append_config(self.read_file(location), location)
4928 return True
4929
4930 def __str__(self):
4931 label = join_nonempty(
4932 self.label, 'config', f'"{self.filename}"' if self.filename else '',
4933 delim=' ')
4934 return join_nonempty(
4935 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
4936 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
4937 delim='\n')
4938
7a32c70d 4939 @staticmethod
06e57990 4940 def read_file(filename, default=[]):
4941 try:
a904a7f8 4942 optionf = open(filename, 'rb')
86e5f3ed 4943 except OSError:
06e57990 4944 return default # silently skip if file is not present
a904a7f8
L
4945 try:
4946 enc, skip = determine_file_encoding(optionf.read(512))
4947 optionf.seek(skip, io.SEEK_SET)
4948 except OSError:
4949 enc = None # silently skip read errors
06e57990 4950 try:
4951 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
a904a7f8 4952 contents = optionf.read().decode(enc or preferredencoding())
f9934b96 4953 res = shlex.split(contents, comments=True)
44a6fcff 4954 except Exception as err:
4955 raise ValueError(f'Unable to parse "{filename}": {err}')
06e57990 4956 finally:
4957 optionf.close()
4958 return res
4959
7a32c70d 4960 @staticmethod
06e57990 4961 def hide_login_info(opts):
86e5f3ed 4962 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
06e57990 4963 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
4964
4965 def _scrub_eq(o):
4966 m = eqre.match(o)
4967 if m:
4968 return m.group('key') + '=PRIVATE'
4969 else:
4970 return o
4971
4972 opts = list(map(_scrub_eq, opts))
4973 for idx, opt in enumerate(opts):
4974 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
4975 opts[idx + 1] = 'PRIVATE'
4976 return opts
4977
4978 def append_config(self, *args, label=None):
9e491463 4979 config = type(self)(self.parser, label)
06e57990 4980 config._loaded_paths = self._loaded_paths
4981 if config.init(*args):
4982 self.configs.append(config)
4983
7a32c70d 4984 @property
06e57990 4985 def all_args(self):
4986 for config in reversed(self.configs):
4987 yield from config.all_args
9e491463 4988 yield from self.parsed_args or []
4989
4990 def parse_known_args(self, **kwargs):
4991 return self.parser.parse_known_args(self.all_args, **kwargs)
06e57990 4992
4993 def parse_args(self):
9e491463 4994 return self.parser.parse_args(self.all_args)
da42679b
LNO
4995
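# Illustrative usage sketch (not a verbatim excerpt; `parser` is any optparse/argparse-style
# object exposing parse_known_args()/parse_args(), as yt-dlp's own option parser does):
#   config = Config(parser, label='main')
#   config.init(sys.argv[1:])                      # also loads any --config-locations files
#   config.append_config(['-f', 'bestaudio'], label='defaults')
#   opts, remaining = config.parse_known_args()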
4996
d5d1df8a 4997class WebSocketsWrapper:
da42679b 4998 """Wraps the websockets module for use in non-async scopes"""
abfecb7b 4999 pool = None
da42679b 5000
3cea3edd 5001 def __init__(self, url, headers=None, connect=True):
059bc4db 5002 self.loop = asyncio.new_event_loop()
9cd08050 5003 # XXX: "loop" is deprecated
5004 self.conn = websockets.connect(
5005 url, extra_headers=headers, ping_interval=None,
5006 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
3cea3edd
LNO
5007 if connect:
5008 self.__enter__()
15dfb392 5009 atexit.register(self.__exit__, None, None, None)
da42679b
LNO
5010
5011 def __enter__(self):
3cea3edd 5012 if not self.pool:
9cd08050 5013 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
da42679b
LNO
5014 return self
5015
5016 def send(self, *args):
5017 self.run_with_loop(self.pool.send(*args), self.loop)
5018
5019 def recv(self, *args):
5020 return self.run_with_loop(self.pool.recv(*args), self.loop)
5021
5022 def __exit__(self, type, value, traceback):
5023 try:
5024 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5025 finally:
5026 self.loop.close()
15dfb392 5027 self._cancel_all_tasks(self.loop)
da42679b
LNO
5028
5029 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5030 # for contributors: if any new library that uses asyncio needs to run in non-async code, move these functions out of this class
7a32c70d 5031 @staticmethod
da42679b 5032 def run_with_loop(main, loop):
059bc4db 5033 if not asyncio.iscoroutine(main):
da42679b
LNO
5034 raise ValueError(f'a coroutine was expected, got {main!r}')
5035
5036 try:
5037 return loop.run_until_complete(main)
5038 finally:
5039 loop.run_until_complete(loop.shutdown_asyncgens())
5040 if hasattr(loop, 'shutdown_default_executor'):
5041 loop.run_until_complete(loop.shutdown_default_executor())
5042
7a32c70d 5043 @staticmethod
da42679b 5044 def _cancel_all_tasks(loop):
059bc4db 5045 to_cancel = asyncio.all_tasks(loop)
da42679b
LNO
5046
5047 if not to_cancel:
5048 return
5049
5050 for task in to_cancel:
5051 task.cancel()
5052
9cd08050 5053 # XXX: "loop" is removed in python 3.10+
da42679b 5054 loop.run_until_complete(
059bc4db 5055 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
da42679b
LNO
5056
5057 for task in to_cancel:
5058 if task.cancelled():
5059 continue
5060 if task.exception() is not None:
5061 loop.call_exception_handler({
5062 'message': 'unhandled exception during asyncio.run() shutdown',
5063 'exception': task.exception(),
5064 'task': task,
5065 })
5066
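# Illustrative usage sketch (requires the optional `websockets` dependency; the URL is a placeholder):
#   ws = WebSocketsWrapper('wss://example.invalid/socket', headers={'Origin': 'https://example.invalid'})
#   ws.send('{"op": "ping"}')
#   reply = ws.recv()
#   ws.__exit__(None, None, None)  # closes the connection and the event loop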
5067
8b7539d2 5068def merge_headers(*dicts):
08d30158 5069 """Merge dicts of HTTP headers case-insensitively, prioritizing the latter ones"""
76aa9913 5070 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
28787f16 5071
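# Example (illustrative): later dicts win, and keys are normalized via str.title(),
# so 'user-agent' and 'User-Agent' refer to the same header:
#   merge_headers({'user-agent': 'old', 'Accept': '*/*'}, {'User-Agent': 'new'})
#   == {'User-Agent': 'new', 'Accept': '*/*'}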
5072
b1f94422 5073def cached_method(f):
5074 """Cache a method"""
5075 signature = inspect.signature(f)
5076
7a32c70d 5077 @functools.wraps(f)
b1f94422 5078 def wrapper(self, *args, **kwargs):
5079 bound_args = signature.bind(self, *args, **kwargs)
5080 bound_args.apply_defaults()
d5d1df8a 5081 key = tuple(bound_args.arguments.values())[1:]
b1f94422 5082
6368e2e6 5083 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
b1f94422 5084 if key not in cache:
5085 cache[key] = f(self, *args, **kwargs)
5086 return cache[key]
5087 return wrapper
5088
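# Illustrative usage sketch (Fetcher and expensive_request are hypothetical):
#   class Fetcher:
#       @cached_method
#       def fetch(self, url):
#           return expensive_request(url)  # runs once per distinct `url` per instance
#
#   f = Fetcher()
#   f.fetch('a'); f.fetch('a')  # second call is served from f._cached_method__cache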
5089
28787f16 5090class classproperty:
83cc7b8a 5091 """property access for class methods with optional caching"""
5092 def __new__(cls, func=None, *args, **kwargs):
5093 if not func:
5094 return functools.partial(cls, *args, **kwargs)
5095 return super().__new__(cls)
c487cf00 5096
83cc7b8a 5097 def __init__(self, func, *, cache=False):
c487cf00 5098 functools.update_wrapper(self, func)
5099 self.func = func
83cc7b8a 5100 self._cache = {} if cache else None
28787f16 5101
5102 def __get__(self, _, cls):
83cc7b8a 5103 if self._cache is None:
5104 return self.func(cls)
5105 elif cls not in self._cache:
5106 self._cache[cls] = self.func(cls)
5107 return self._cache[cls]
19a03940 5108
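# Illustrative usage sketch (Foo is a hypothetical class):
#   class Foo:
#       @classproperty
#       def name(cls):
#           return cls.__name__      # re-evaluated on every access
#
#       @classproperty(cache=True)
#       def name_cached(cls):
#           return cls.__name__      # computed once per class, then cached
#
#   Foo.name  == 'Foo'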
5109
a5387729 5110class function_with_repr:
b2e0343b 5111 def __init__(self, func, repr_=None):
a5387729 5112 functools.update_wrapper(self, func)
b2e0343b 5113 self.func, self.__repr = func, repr_
a5387729 5114
5115 def __call__(self, *args, **kwargs):
5116 return self.func(*args, **kwargs)
5117
5118 def __repr__(self):
b2e0343b 5119 if self.__repr:
5120 return self.__repr
a5387729 5121 return f'{self.func.__module__}.{self.func.__qualname__}'
5122
5123
64fa820c 5124class Namespace(types.SimpleNamespace):
591bb9d3 5125 """Immutable namespace"""
591bb9d3 5126
7896214c 5127 def __iter__(self):
64fa820c 5128 return iter(self.__dict__.values())
7896214c 5129
7a32c70d 5130 @property
64fa820c 5131 def items_(self):
5132 return self.__dict__.items()
9b8ee23b 5133
5134
8dc59305 5135MEDIA_EXTENSIONS = Namespace(
5136 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5137 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5138 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
fbb73833 5139 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
8dc59305 5140 thumbnails=('jpg', 'png', 'webp'),
5141 storyboards=('mhtml', ),
5142 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5143 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5144)
5145MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5146MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5147
5148KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5149
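# Examples (illustrative):
#   'mp4' in MEDIA_EXTENSIONS.video   # True -- common_video was merged in above
#   'flac' in KNOWN_EXTENSIONS        # True -- audio extensions are included
#   dict(MEDIA_EXTENSIONS.items_)     # plain mapping view of the Namespace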
5150
be5c1ae8 5151class RetryManager:
5152 """Usage:
5153 for retry in RetryManager(...):
5154 try:
5155 ...
5156 except SomeException as err:
5157 retry.error = err
5158 continue
5159 """
5160 attempt, _error = 0, None
5161
5162 def __init__(self, _retries, _error_callback, **kwargs):
5163 self.retries = _retries or 0
5164 self.error_callback = functools.partial(_error_callback, **kwargs)
5165
5166 def _should_retry(self):
5167 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5168
7a32c70d 5169 @property
be5c1ae8 5170 def error(self):
5171 if self._error is NO_DEFAULT:
5172 return None
5173 return self._error
5174
7a32c70d 5175 @error.setter
be5c1ae8 5176 def error(self, value):
5177 self._error = value
5178
5179 def __iter__(self):
5180 while self._should_retry():
5181 self.error = NO_DEFAULT
5182 self.attempt += 1
5183 yield self
5184 if self.error:
5185 self.error_callback(self.error, self.attempt, self.retries)
5186
7a32c70d 5187 @staticmethod
be5c1ae8 5188 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5189 """Utility function for reporting retries"""
5190 if count > retries:
5191 if error:
5192 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5193 raise e
5194
5195 if not count:
5196 return warn(e)
5197 elif isinstance(e, ExtractorError):
3ce29336 5198 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
be5c1ae8 5199 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5200
5201 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5202 if delay:
5203 info(f'Sleeping {delay:.2f} seconds ...')
5204 time.sleep(delay)
5205
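# Illustrative usage sketch (download_fragment and report_retry_cb are hypothetical):
#   for retry in RetryManager(3, report_retry_cb):
#       try:
#           download_fragment()
#       except OSError as err:
#           retry.error = err   # marks this attempt as failed; the loop retries up to 3 times
#           continue
#   # if the final attempt still failed, report_retry_cb(error, attempt, retries) is called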
5206
0647d925 5207def make_archive_id(ie, video_id):
5208 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5209 return f'{ie_key.lower()} {video_id}'
5210
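# Example (illustrative):
#   make_archive_id('Youtube', 'dQw4w9WgXcQ')  == 'youtube dQw4w9WgXcQ'
#   # an InfoExtractor class or instance may be passed instead of the string key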
5211
a1c5bd82 5212def truncate_string(s, left, right=0):
5213 assert left > 3 and right >= 0
5214 if s is None or len(s) <= left + right:
5215 return s
71df9b7f 5216 return f'{s[:left-3]}...{s[-right:] if right else ""}'
a1c5bd82 5217
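# Examples (illustrative): the '...' replaces the three characters it costs,
# keeping the result at most `left + right` characters long:
#   truncate_string('abcdefghij', 6)     == 'abc...'
#   truncate_string('abcdefghij', 6, 2)  == 'abc...ij'
#   truncate_string('short', 10)         == 'short'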
5218
5314b521 5219def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
5220 assert 'all' in alias_dict, '"all" alias is required'
5221 requested = list(start or [])
5222 for val in options:
5223 discard = val.startswith('-')
5224 if discard:
5225 val = val[1:]
5226
5227 if val in alias_dict:
5228 val = alias_dict[val] if not discard else [
5229 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
5230 # NB: Do not allow regex in aliases for performance
5231 requested = orderedSet_from_options(val, alias_dict, start=requested)
5232 continue
5233
5234 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
5235 else [val] if val in alias_dict['all'] else None)
5236 if current is None:
5237 raise ValueError(val)
5238
5239 if discard:
5240 for item in current:
5241 while item in requested:
5242 requested.remove(item)
5243 else:
5244 requested.extend(current)
5245
5246 return orderedSet(requested)
5247
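# Illustrative example (the alias table is hypothetical): aliases expand first,
# and a leading '-' discards previously requested items:
#   aliases = {'all': ['en', 'de', 'fr'], 'eu': ['de', 'fr']}
#   orderedSet_from_options(['all', '-eu'], aliases)  == ['en']
#   orderedSet_from_options(['de', 'fr'], aliases)    == ['de', 'fr']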
5248
eedda525 5249# TODO: Rewrite
d0d74b71 5250class FormatSorter:
5251 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
5252
5253 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
5254 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
5255 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
5256 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
5257 'height', 'width', 'proto', 'vext', 'abr', 'aext',
5258 'fps', 'fs_approx', 'source', 'id')
5259
5260 settings = {
5261 'vcodec': {'type': 'ordered', 'regex': True,
5262 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
5263 'acodec': {'type': 'ordered', 'regex': True,
71082216 5264 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
d0d74b71 5265 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
5266 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
5267 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
5268 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
5269 'vext': {'type': 'ordered', 'field': 'video_ext',
29ca4082 5270 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
5271 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
fbb73833 5272 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
5273 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
5274 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
d0d74b71 5275 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
5276 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
5277 'field': ('vcodec', 'acodec'),
5278 'function': lambda it: int(any(v != 'none' for v in it))},
5279 'ie_pref': {'priority': True, 'type': 'extractor'},
5280 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
5281 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
5282 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
5283 'quality': {'convert': 'float', 'default': -1},
5284 'filesize': {'convert': 'bytes'},
5285 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
5286 'id': {'convert': 'string', 'field': 'format_id'},
5287 'height': {'convert': 'float_none'},
5288 'width': {'convert': 'float_none'},
5289 'fps': {'convert': 'float_none'},
5290 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
5291 'tbr': {'convert': 'float_none'},
5292 'vbr': {'convert': 'float_none'},
5293 'abr': {'convert': 'float_none'},
5294 'asr': {'convert': 'float_none'},
5295 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
5296
5297 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
812cdfa0 5298 'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
eedda525 5299 'function': lambda it: next(filter(None, it), None)},
812cdfa0 5300 'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
eedda525 5301 'function': lambda it: next(filter(None, it), None)},
d0d74b71 5302 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
5303 'res': {'type': 'multiple', 'field': ('height', 'width'),
5304 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
5305
5306 # Actual field names
5307 'format_id': {'type': 'alias', 'field': 'id'},
5308 'preference': {'type': 'alias', 'field': 'ie_pref'},
5309 'language_preference': {'type': 'alias', 'field': 'lang'},
5310 'source_preference': {'type': 'alias', 'field': 'source'},
5311 'protocol': {'type': 'alias', 'field': 'proto'},
5312 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
5313 'audio_channels': {'type': 'alias', 'field': 'channels'},
5314
5315 # Deprecated
5316 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
5317 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
5318 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
5319 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
5320 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
5321 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
5322 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
5323 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
5324 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
5325 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
5326 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
5327 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
5328 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
5329 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
5330 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5331 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5332 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5333 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5334 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5335 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5336 }
5337
5338 def __init__(self, ydl, field_preference):
5339 self.ydl = ydl
5340 self._order = []
5341 self.evaluate_params(self.ydl.params, field_preference)
5342 if ydl.params.get('verbose'):
5343 self.print_verbose_info(self.ydl.write_debug)
5344
5345 def _get_field_setting(self, field, key):
5346 if field not in self.settings:
5347 if key in ('forced', 'priority'):
5348 return False
5349 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
5350 'deprecated and may be removed in a future version')
5351 self.settings[field] = {}
5352 propObj = self.settings[field]
5353 if key not in propObj:
5354 type = propObj.get('type')
5355 if key == 'field':
5356 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
5357 elif key == 'convert':
5358 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
5359 else:
5360 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
5361 propObj[key] = default
5362 return propObj[key]
5363
5364 def _resolve_field_value(self, field, value, convertNone=False):
5365 if value is None:
5366 if not convertNone:
5367 return None
5368 else:
5369 value = value.lower()
5370 conversion = self._get_field_setting(field, 'convert')
5371 if conversion == 'ignore':
5372 return None
5373 if conversion == 'string':
5374 return value
5375 elif conversion == 'float_none':
5376 return float_or_none(value)
5377 elif conversion == 'bytes':
5378 return parse_bytes(value)
5379 elif conversion == 'order':
5380 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
5381 use_regex = self._get_field_setting(field, 'regex')
5382 list_length = len(order_list)
5383 empty_pos = order_list.index('') if '' in order_list else list_length + 1
5384 if use_regex and value is not None:
5385 for i, regex in enumerate(order_list):
5386 if regex and re.match(regex, value):
5387 return list_length - i
5388 return list_length - empty_pos # not in list
5389 else: # not regex or value = None
5390 return list_length - (order_list.index(value) if value in order_list else empty_pos)
5391 else:
5392 if value.isnumeric():
5393 return float(value)
5394 else:
5395 self.settings[field]['convert'] = 'string'
5396 return value
5397
5398 def evaluate_params(self, params, sort_extractor):
5399 self._use_free_order = params.get('prefer_free_formats', False)
5400 self._sort_user = params.get('format_sort', [])
5401 self._sort_extractor = sort_extractor
5402
5403 def add_item(field, reverse, closest, limit_text):
5404 field = field.lower()
5405 if field in self._order:
5406 return
5407 self._order.append(field)
5408 limit = self._resolve_field_value(field, limit_text)
5409 data = {
5410 'reverse': reverse,
5411 'closest': False if limit is None else closest,
5412 'limit_text': limit_text,
5413 'limit': limit}
5414 if field in self.settings:
5415 self.settings[field].update(data)
5416 else:
5417 self.settings[field] = data
5418
5419 sort_list = (
5420 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5421 + (tuple() if params.get('format_sort_force', False)
5422 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5423 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5424
5425 for item in sort_list:
5426 match = re.match(self.regex, item)
5427 if match is None:
5428 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
5429 field = match.group('field')
5430 if field is None:
5431 continue
5432 if self._get_field_setting(field, 'type') == 'alias':
5433 alias, field = field, self._get_field_setting(field, 'field')
5434 if self._get_field_setting(alias, 'deprecated'):
5435 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5436 f'be removed in a future version. Please use {field} instead')
5437 reverse = match.group('reverse') is not None
5438 closest = match.group('separator') == '~'
5439 limit_text = match.group('limit')
5440
5441 has_limit = limit_text is not None
5442 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5443 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5444
5445 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
5446 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
5447 limit_count = len(limits)
5448 for (i, f) in enumerate(fields):
5449 add_item(f, reverse, closest,
5450 limits[i] if i < limit_count
5451 else limits[0] if has_limit and not has_multiple_limits
5452 else None)
5453
5454 def print_verbose_info(self, write_debug):
5455 if self._sort_user:
5456 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
5457 if self._sort_extractor:
5458 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
5459 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
5460 '+' if self._get_field_setting(field, 'reverse') else '', field,
5461 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
5462 self._get_field_setting(field, 'limit_text'),
5463 self._get_field_setting(field, 'limit'))
5464 if self._get_field_setting(field, 'limit_text') is not None else '')
5465 for field in self._order if self._get_field_setting(field, 'visible')]))
5466
5467 def _calculate_field_preference_from_value(self, format, field, type, value):
5468 reverse = self._get_field_setting(field, 'reverse')
5469 closest = self._get_field_setting(field, 'closest')
5470 limit = self._get_field_setting(field, 'limit')
5471
5472 if type == 'extractor':
5473 maximum = self._get_field_setting(field, 'max')
5474 if value is None or (maximum is not None and value >= maximum):
5475 value = -1
5476 elif type == 'boolean':
5477 in_list = self._get_field_setting(field, 'in_list')
5478 not_in_list = self._get_field_setting(field, 'not_in_list')
5479 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
5480 elif type == 'ordered':
5481 value = self._resolve_field_value(field, value, True)
5482
5483 # try to convert to number
5484 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
5485 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
5486 if is_num:
5487 value = val_num
5488
5489 return ((-10, 0) if value is None
5490 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
5491 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
5492 else (0, value, 0) if not reverse and (limit is None or value <= limit)
5493 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
5494 else (-1, value, 0))
5495
5496 def _calculate_field_preference(self, format, field):
5497 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
5498 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
5499 if type == 'multiple':
5500 type = 'field' # Only 'field' is allowed in multiple for now
5501 actual_fields = self._get_field_setting(field, 'field')
5502
5503 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
5504 else:
5505 value = get_value(field)
5506 return self._calculate_field_preference_from_value(format, field, type, value)
5507
5508 def calculate_preference(self, format):
5509 # Determine missing protocol
5510 if not format.get('protocol'):
5511 format['protocol'] = determine_protocol(format)
5512
5513 # Determine missing ext
5514 if not format.get('ext') and 'url' in format:
5515 format['ext'] = determine_ext(format['url'])
5516 if format.get('vcodec') == 'none':
5517 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
5518 format['video_ext'] = 'none'
5519 else:
5520 format['video_ext'] = format['ext']
5521 format['audio_ext'] = 'none'
5522 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
5523 # format['preference'] = -1000
5524
5424dbaf
L
5525 if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
5526 # HEVC-over-FLV is out of spec per FLV's original specification
5527 # ref. https://trac.ffmpeg.org/ticket/6389
5528 # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
5529 format['preference'] = -100
5530
d0d74b71 5531 # Determine missing bitrates
eedda525 5532 if format.get('vcodec') == 'none':
5533 format['vbr'] = 0
5534 if format.get('acodec') == 'none':
5535 format['abr'] = 0
5536 if not format.get('vbr') and format.get('vcodec') != 'none':
5537 format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
5538 if not format.get('abr') and format.get('acodec') != 'none':
5539 format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
5540 if not format.get('tbr'):
5541 format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None
d0d74b71 5542
5543 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1b392f90 5544
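# Illustrative usage sketch (ydl is a YoutubeDL instance; the field list is just an example):
#   sorter = FormatSorter(ydl, field_preference=['res', 'fps'])
#   formats.sort(key=sorter.calculate_preference)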
5545
5546# XXX: Temporary
5547class _YDLLogger:
5548 def __init__(self, ydl=None):
5549 self._ydl = ydl
5550
5551 def debug(self, message):
5552 if self._ydl:
5553 self._ydl.write_debug(message)
5554
5555 def info(self, message):
5556 if self._ydl:
5557 self._ydl.to_screen(message)
5558
5559 def warning(self, message, *, once=False):
5560 if self._ydl:
5561 self._ydl.report_warning(message, only_once=once)
5562
5563 def error(self, message, *, is_error=True):
5564 if self._ydl:
5565 self._ydl.report_error(message, is_error=is_error)
5566
5567 def stdout(self, message):
5568 if self._ydl:
5569 self._ydl.to_stdout(message)
5570
5571 def stderr(self, message):
5572 if self._ydl:
5573 self._ydl.to_stderr(message)