import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import collections.abc
import contextlib
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import importlib.util
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import unicodedata
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
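
# Illustrative doctest-style sketch (the output varies per call, since the
# Chrome version is chosen at random from the tuple above):
#   >>> random_user_agent()
#   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'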


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
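
# A minimal usage sketch (the file name here is hypothetical): the JSON is first
# written to a temp file in the target directory and then os.rename()d over the
# destination, so readers never observe a partially written file.
#   >>> write_json_file({'id': 'abc', 'title': 'Example'}, 'example.info.json')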


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
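
# Rough example of the namespace expansion performed above ('ns' and the URI
# are made-up values):
#   >>> xpath_with_ns('ns:video/ns:title', {'ns': 'http://example.com/ns'})
#   '{http://example.com/ns}video/{http://example.com/ns}title'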


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
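
# Sketch of the class matching above: the regex treats the class attribute as a
# whitespace-separated list, so 'title' also matches class="big title":
#   >>> get_elements_by_class('title', '<div class="big title">A</div><p class="title">B</p>')
#   ['A', 'B']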


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    if not value:
        return

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')


class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        raise compat_HTMLParseError('done')


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
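
# Doctest-style sketch (unquoted and bare attributes are handled by html.parser):
#   >>> extract_attributes('<a href="#" data-id=123 hidden>')
#   {'href': '#', 'data-id': '123', 'hidden': None}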


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
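
# Rough sketch of the cleanup: tags are stripped, <br> becomes a newline and
# entities are decoded:
#   >>> clean_html('<p>foo<br/>bar &amp; baz</p>')
#   'foo\nbar & baz'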


class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        try:
            if self.ignore_extra:
                return self.raw_decode(s.lstrip())[0]
            return super().decode(s)
        except json.JSONDecodeError as e:
            if e.pos is not None:
                raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
            raise
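
# Usage sketch: with ignore_extra=True, trailing garbage after the first JSON
# value is tolerated (json.loads forwards unknown kwargs to the cls constructor):
#   >>> json.loads('{"a": 1} trailing garbage', cls=LenientJSONDecoder, ignore_extra=True)
#   {'a': 1}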


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
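
# Doctest-style sketch (RFC 2822 date to Unix timestamp):
#   >>> timeconvert('Wed, 14 Jun 2017 07:00:00 GMT')
#   1497423600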


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
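
# Rough sketch of both modes (exact output follows the substitution rules above;
# the default mode maps unsafe characters to full-width look-alikes, while
# restricted mode falls back to ASCII substitutes):
#   >>> sanitize_filename('A/B: C')
#   'A⧸B： C'
#   >>> sanitize_filename('A/B: C', restricted=True)
#   'A_B_-_C'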


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with an `http:` scheme in order to reduce
    # the number of unwanted failures due to a missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
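
# Doctest-style sketch (scheme prepending and typo fixups):
#   >>> sanitize_url('//example.com/video')
#   'http://example.com/video'
#   >>> sanitize_url('httpss://example.com/video')
#   'https://example.com/video'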


def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
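
# Doctest-style sketch ('user:pass' base64-encodes to 'dXNlcjpwYXNz'):
#   >>> extract_basic_auth('http://user:pass@example.com/')
#   ('http://example.com/', 'Basic dXNlcjpwYXNz')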


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())
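
# Doctest-style sketch (order of first occurrence is preserved):
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]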


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
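
# Doctest-style sketch (named, decimal and hex entities are all handled):
#   >>> unescapeHTML('&amp; &#65; &#x41;')
#   '& A A'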


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
        Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
             https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        with cls(*args, **kwargs) as proc:
            default = '' if proc.text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
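
# Usage sketch (POSIX-flavoured; 'echo' is an assumed external command):
#   >>> stdout, stderr, returncode = Popen.run(
#   ...     ['echo', 'hi'], text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#   >>> stdout, returncode
#   ('hi\n', 0)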


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
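
# Doctest-style sketch (3723.5 s = 1 h 2 min 3.5 s):
#   >>> timetuple_from_msec(3723500)
#   Time(hours=1, minutes=2, seconds=3, milliseconds=500)
#   >>> formatSeconds(3723.5, msec=True)
#   '1:02:03.500'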


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)
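
# Usage sketch: since network_exceptions is a tuple of exception types, it can
# be used directly in an except clause for catch-all handling of transport errors:
#   try:
#       ...  # some urllib-based request
#   except network_exceptions as err:
#       ...  # e.g. retry, or surface as an expected ExtractorError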


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This works around socket's _create_connection(), which tries all
        # address data from getaddrinfo(), including IPv6; here, the result from
        # getaddrinfo() is filtered based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
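
# Doctest-style sketch: the internal marker header suppresses compression by
# dropping Accept-Encoding, and is itself removed before the real request:
#   >>> handle_youtubedl_headers({'Youtubedl-no-compression': '1', 'Accept-Encoding': 'gzip'})
#   {}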


class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however,
        # this is not always respected by websites: some tend to give out URLs with
        # non percent-encoded non-ASCII characters (see telemb.py, ard.py [#3412]).
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around the aforementioned issue we will replace the request's
        # original URL with a percent-encoded one.
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # The dict keys are capitalized by urllib because of Python bug 2275:
            # http://bugs.python.org/issue2275
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616, the default charset is iso-8859-1, which Python 3 respects
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


def is_path_like(f):
    return isinstance(f, (str, bytes, os.PathLike))


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if is_path_like(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if is_path_like(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by the `expires` field set to either
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. when a user does not check the 'Remember me' box while logging
        # in on a site, some important cookies are stored as session cookies,
        # so that not recognizing them will result in a failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1677
1678
1679 class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
1680 def __init__(self, cookiejar=None):
1681 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
1682
1683 def http_response(self, request, response):
1684 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
1685
1686 https_request = urllib.request.HTTPCookieProcessor.http_request
1687 https_response = http_response
1688
1689
1690 class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
1691 """YoutubeDL redirect handler
1692
1693 The code is based on HTTPRedirectHandler implementation from CPython [1].
1694
1695 This redirect handler solves two issues:
1696 - ensures the redirect URL is always unicode under Python 2
1697 - introduces support for the HTTP response status code
1698 308 Permanent Redirect [2] used by some sites [3]
1699
1700 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1701 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1702 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1703 """
1704
1705 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
1706
1707 def redirect_request(self, req, fp, code, msg, headers, newurl):
1708 """Return a Request or None in response to a redirect.
1709
1710 This is called by the http_error_30x methods when a
1711 redirection response is received. If a redirection should
1712 take place, return a new Request to allow http_error_30x to
1713 perform the redirect. Otherwise, raise HTTPError if no one
1714 else should try to handle this URL. Return None if you can't,
1715 but another handler might.
1716 """
1717 m = req.get_method()
1718 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1719 or code in (301, 302, 303) and m == "POST")):
1720 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
1721 # Strictly (according to RFC 2616), 301 or 302 in response to
1722 # a POST MUST NOT cause a redirection without confirmation
1723 # from the user (of urllib.request, in this case). In practice,
1724 # essentially all clients do redirect in this case, so we do
1725 # the same.
1726
1727 # Be lenient with URIs containing a space. This is mainly
1728 # redundant with the more complete encoding done in http_error_302(),
1729 # but it is kept for compatibility with other callers.
1730 newurl = newurl.replace(' ', '%20')
1731
1732 CONTENT_HEADERS = ("content-length", "content-type")
1733 # Strip Content-Length and Content-Type, since they describe the previous request's body
1734 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1735
1736 # A 303 must either use GET or HEAD for subsequent request
1737 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1738 if code == 303 and m != 'HEAD':
1739 m = 'GET'
1740 # 301 and 302 redirects are commonly turned into a GET from a POST
1741 # for subsequent requests by browsers, so we'll do the same.
1742 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1743 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1744 if code in (301, 302) and m == 'POST':
1745 m = 'GET'
1746
1747 return urllib.request.Request(
1748 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1749 unverifiable=True, method=m)
1750
1751
1752 def extract_timezone(date_str):
1753 m = re.search(
1754 r'''(?x)
1755 ^.{8,}? # >=8 char non-TZ prefix, if present
1756 (?P<tz>Z| # just the UTC Z, or
1757 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1758 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1759 [ ]? # optional space
1760 (?P<sign>\+|-) # +/-
1761 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1762 $)
1763 ''', date_str)
1764 if not m:
1765 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1766 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1767 if timezone is not None:
1768 date_str = date_str[:-len(m.group('tz'))]
1769 timezone = datetime.timedelta(hours=timezone or 0)
1770 else:
1771 date_str = date_str[:-len(m.group('tz'))]
1772 if not m.group('sign'):
1773 timezone = datetime.timedelta()
1774 else:
1775 sign = 1 if m.group('sign') == '+' else -1
1776 timezone = datetime.timedelta(
1777 hours=sign * int(m.group('hours')),
1778 minutes=sign * int(m.group('minutes')))
1779 return timezone, date_str
1780
1781
1782 def parse_iso8601(date_str, delimiter='T', timezone=None):
1783 """ Return a UNIX timestamp from the given date """
1784
1785 if date_str is None:
1786 return None
1787
1788 date_str = re.sub(r'\.[0-9]+', '', date_str)
1789
1790 if timezone is None:
1791 timezone, date_str = extract_timezone(date_str)
1792
1793 with contextlib.suppress(ValueError):
1794 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1795 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1796 return calendar.timegm(dt.timetuple())
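
# Illustrative example (annotation, not part of the original module): the
# '+01:00' offset is extracted and subtracted, yielding the UTC timestamp
#   >>> parse_iso8601('2014-01-01T12:00:00+01:00')
#   1388574000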
1797
1798
1799 def date_formats(day_first=True):
1800 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1801
1802
1803 def unified_strdate(date_str, day_first=True):
1804 """Return a string with the date in the format YYYYMMDD"""
1805
1806 if date_str is None:
1807 return None
1808 upload_date = None
1809 # Replace commas
1810 date_str = date_str.replace(',', ' ')
1811 # Remove AM/PM + timezone
1812 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1813 _, date_str = extract_timezone(date_str)
1814
1815 for expression in date_formats(day_first):
1816 with contextlib.suppress(ValueError):
1817 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1818 if upload_date is None:
1819 timetuple = email.utils.parsedate_tz(date_str)
1820 if timetuple:
1821 with contextlib.suppress(ValueError):
1822 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1823 if upload_date is not None:
1824 return str(upload_date)
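
# Illustrative examples (annotation, not part of the original module):
#   >>> unified_strdate('December 21, 2010')
#   '20101221'
#   >>> unified_strdate('28/01/2014 21:00:00 +0100')
#   '20140128'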
1825
1826
1827 def unified_timestamp(date_str, day_first=True):
1828 if date_str is None:
1829 return None
1830
1831 date_str = re.sub(r'\s+', ' ', re.sub(
1832 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
1833
1834 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1835 timezone, date_str = extract_timezone(date_str)
1836
1837 # Remove AM/PM + timezone
1838 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1839
1840 # Remove unrecognized timezones from ISO 8601 alike timestamps
1841 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1842 if m:
1843 date_str = date_str[:-len(m.group('tz'))]
1844
1845 # Python only supports microseconds, so remove nanoseconds
1846 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1847 if m:
1848 date_str = m.group(1)
1849
1850 for expression in date_formats(day_first):
1851 with contextlib.suppress(ValueError):
1852 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1853 return calendar.timegm(dt.timetuple())
1854
1855 timetuple = email.utils.parsedate_tz(date_str)
1856 if timetuple:
1857 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
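
# Illustrative example (annotation, not part of the original module):
#   >>> unified_timestamp('December 15, 2017 at 7:49 am')
#   1513324140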
1858
1859
1860 def determine_ext(url, default_ext='unknown_video'):
1861 if url is None or '.' not in url:
1862 return default_ext
1863 guess = url.partition('?')[0].rpartition('.')[2]
1864 if re.match(r'^[A-Za-z0-9]+$', guess):
1865 return guess
1866 # Try to extract the ext from URLs like http://example.com/foo/bar.mp4/?download
1867 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1868 return guess.rstrip('/')
1869 else:
1870 return default_ext
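
# Illustrative examples (annotation, not part of the original module): a query
# string and a trailing slash are tolerated; anything unrecognizable falls
# back to default_ext
#   >>> determine_ext('http://example.com/foo/bar.mp4/?download')
#   'mp4'
#   >>> determine_ext('http://example.com/foo/bar')
#   'unknown_video'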
1871
1872
1873 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1874 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1875
1876
1877 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1878 R"""
1879 Return a datetime object from a string.
1880 Supported format:
1881 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1882
1883 @param format strftime format of DATE
1884 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1885 auto: round to the unit provided in date_str (if applicable).
1886 """
1887 auto_precision = False
1888 if precision == 'auto':
1889 auto_precision = True
1890 precision = 'microsecond'
1891 today = datetime_round(datetime.datetime.utcnow(), precision)
1892 if date_str in ('now', 'today'):
1893 return today
1894 if date_str == 'yesterday':
1895 return today - datetime.timedelta(days=1)
1896 match = re.match(
1897 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1898 date_str)
1899 if match is not None:
1900 start_time = datetime_from_str(match.group('start'), precision, format)
1901 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1902 unit = match.group('unit')
1903 if unit == 'month' or unit == 'year':
1904 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1905 unit = 'day'
1906 else:
1907 if unit == 'week':
1908 unit = 'day'
1909 time *= 7
1910 delta = datetime.timedelta(**{unit + 's': time})
1911 new_date = start_time + delta
1912 if auto_precision:
1913 return datetime_round(new_date, unit)
1914 return new_date
1915
1916 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
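
# Illustrative example (annotation, not part of the original module): with the
# default 'auto' precision, the result is rounded to the unit of the offset
#   >>> datetime_from_str('20200101-3weeks')
#   datetime.datetime(2019, 12, 11, 0, 0)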
1917
1918
1919 def date_from_str(date_str, format='%Y%m%d', strict=False):
1920 R"""
1921 Return a date object from a string using datetime_from_str
1922
1923 @param strict Restrict allowed patterns to "YYYYMMDD" and
1924 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1925 """
1926 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1927 raise ValueError(f'Invalid date format "{date_str}"')
1928 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1929
1930
1931 def datetime_add_months(dt, months):
1932 """Increment/Decrement a datetime object by months."""
1933 month = dt.month + months - 1
1934 year = dt.year + month // 12
1935 month = month % 12 + 1
1936 day = min(dt.day, calendar.monthrange(year, month)[1])
1937 return dt.replace(year, month, day)
1938
1939
1940 def datetime_round(dt, precision='day'):
1941 """
1942 Round a datetime object's time to a specific precision
1943 """
1944 if precision == 'microsecond':
1945 return dt
1946
1947 unit_seconds = {
1948 'day': 86400,
1949 'hour': 3600,
1950 'minute': 60,
1951 'second': 1,
1952 }
1953 roundto = lambda x, n: ((x + n / 2) // n) * n
1954 timestamp = calendar.timegm(dt.timetuple())
1955 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1956
1957
1958 def hyphenate_date(date_str):
1959 """
1960 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1961 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1962 if match is not None:
1963 return '-'.join(match.groups())
1964 else:
1965 return date_str
1966
1967
1968 class DateRange:
1969 """Represents a time interval between two dates"""
1970
1971 def __init__(self, start=None, end=None):
1972 """start and end must be strings in the format accepted by date"""
1973 if start is not None:
1974 self.start = date_from_str(start, strict=True)
1975 else:
1976 self.start = datetime.datetime.min.date()
1977 if end is not None:
1978 self.end = date_from_str(end, strict=True)
1979 else:
1980 self.end = datetime.datetime.max.date()
1981 if self.start > self.end:
1982 raise ValueError('Date range: "%s", the start date must be before the end date' % self)
1983
1984 @classmethod
1985 def day(cls, day):
1986 """Returns a range that only contains the given day"""
1987 return cls(day, day)
1988
1989 def __contains__(self, date):
1990 """Check if the date is in the range"""
1991 if not isinstance(date, datetime.date):
1992 date = date_from_str(date)
1993 return self.start <= date <= self.end
1994
1995 def __str__(self):
1996 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1997
1998 def __eq__(self, other):
1999 return (isinstance(other, DateRange)
2000 and self.start == other.start and self.end == other.end)
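
# Illustrative example (annotation, not part of the original module): strings
# are converted with date_from_str before the containment check
#   >>> '20200115' in DateRange('20200101', '20200131')
#   True
#   >>> '20200201' in DateRange('20200101', '20200131')
#   False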
2001
2002
2003 def platform_name():
2004 """ Returns the platform name as a str """
2005 deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead')
2006 return platform.platform()
2007
2008
2009 @functools.cache
2010 def system_identifier():
2011 python_implementation = platform.python_implementation()
2012 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
2013 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
2014 libc_ver = []
2015 with contextlib.suppress(OSError): # We may not have access to the executable
2016 libc_ver = platform.libc_ver()
2017
2018 return 'Python %s (%s %s %s) - %s (%s%s)' % (
2019 platform.python_version(),
2020 python_implementation,
2021 platform.machine(),
2022 platform.architecture()[0],
2023 platform.platform(),
2024 ssl.OPENSSL_VERSION,
2025 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
2026 )
2027
2028
2029 @functools.cache
2030 def get_windows_version():
2031 ''' Get the Windows version. Returns () if not running on Windows '''
2032 if compat_os_name == 'nt':
2033 return version_tuple(platform.win32_ver()[1])
2034 else:
2035 return ()
2036
2037
2038 def write_string(s, out=None, encoding=None):
2039 assert isinstance(s, str)
2040 out = out or sys.stderr
2041
2042 if compat_os_name == 'nt' and supports_terminal_sequences(out):
2043 s = re.sub(r'([\r\n]+)', r' \1', s)
2044
2045 enc, buffer = None, out
2046 if 'b' in getattr(out, 'mode', ''):
2047 enc = encoding or preferredencoding()
2048 elif hasattr(out, 'buffer'):
2049 buffer = out.buffer
2050 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2051
2052 buffer.write(s.encode(enc, 'ignore') if enc else s)
2053 out.flush()
2054
2055
2056 def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
2057 from . import _IN_CLI
2058 if _IN_CLI:
2059 if msg in deprecation_warning._cache:
2060 return
2061 deprecation_warning._cache.add(msg)
2062 if printer:
2063 return printer(f'{msg}{bug_reports_message()}', **kwargs)
2064 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
2065 else:
2066 import warnings
2067 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
2068
2069
2070 deprecation_warning._cache = set()
2071
2072
2073 def bytes_to_intlist(bs):
2074 if not bs:
2075 return []
2076 if isinstance(bs[0], int): # Python 3
2077 return list(bs)
2078 else:
2079 return [ord(c) for c in bs]
2080
2081
2082 def intlist_to_bytes(xs):
2083 if not xs:
2084 return b''
2085 return struct.pack('%dB' % len(xs), *xs)
2086
2087
2088 class LockingUnsupportedError(OSError):
2089 msg = 'File locking is not supported'
2090
2091 def __init__(self):
2092 super().__init__(self.msg)
2093
2094
2095 # Cross-platform file locking
2096 if sys.platform == 'win32':
2097 import ctypes
2098 import ctypes.wintypes
2099 import msvcrt
2100
2101 class OVERLAPPED(ctypes.Structure):
2102 _fields_ = [
2103 ('Internal', ctypes.wintypes.LPVOID),
2104 ('InternalHigh', ctypes.wintypes.LPVOID),
2105 ('Offset', ctypes.wintypes.DWORD),
2106 ('OffsetHigh', ctypes.wintypes.DWORD),
2107 ('hEvent', ctypes.wintypes.HANDLE),
2108 ]
2109
2110 kernel32 = ctypes.windll.kernel32
2111 LockFileEx = kernel32.LockFileEx
2112 LockFileEx.argtypes = [
2113 ctypes.wintypes.HANDLE, # hFile
2114 ctypes.wintypes.DWORD, # dwFlags
2115 ctypes.wintypes.DWORD, # dwReserved
2116 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2117 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2118 ctypes.POINTER(OVERLAPPED) # Overlapped
2119 ]
2120 LockFileEx.restype = ctypes.wintypes.BOOL
2121 UnlockFileEx = kernel32.UnlockFileEx
2122 UnlockFileEx.argtypes = [
2123 ctypes.wintypes.HANDLE, # hFile
2124 ctypes.wintypes.DWORD, # dwReserved
2125 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2126 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2127 ctypes.POINTER(OVERLAPPED) # Overlapped
2128 ]
2129 UnlockFileEx.restype = ctypes.wintypes.BOOL
2130 whole_low = 0xffffffff
2131 whole_high = 0x7fffffff
2132
2133 def _lock_file(f, exclusive, block):
2134 overlapped = OVERLAPPED()
2135 overlapped.Offset = 0
2136 overlapped.OffsetHigh = 0
2137 overlapped.hEvent = 0
2138 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2139
2140 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2141 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2142 0, whole_low, whole_high, f._lock_file_overlapped_p):
2143 # NB: The no-argument form of "ctypes.FormatError" does not work on PyPy
2144 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2145
2146 def _unlock_file(f):
2147 assert f._lock_file_overlapped_p
2148 handle = msvcrt.get_osfhandle(f.fileno())
2149 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2150 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2151
2152 else:
2153 try:
2154 import fcntl
2155
2156 def _lock_file(f, exclusive, block):
2157 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2158 if not block:
2159 flags |= fcntl.LOCK_NB
2160 try:
2161 fcntl.flock(f, flags)
2162 except BlockingIOError:
2163 raise
2164 except OSError: # AOSP does not have flock()
2165 fcntl.lockf(f, flags)
2166
2167 def _unlock_file(f):
2168 try:
2169 fcntl.flock(f, fcntl.LOCK_UN)
2170 except OSError:
2171 fcntl.lockf(f, fcntl.LOCK_UN)
2172
2173 except ImportError:
2174
2175 def _lock_file(f, exclusive, block):
2176 raise LockingUnsupportedError()
2177
2178 def _unlock_file(f):
2179 raise LockingUnsupportedError()
2180
2181
2182 class locked_file:
2183 locked = False
2184
2185 def __init__(self, filename, mode, block=True, encoding=None):
2186 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2187 raise NotImplementedError(mode)
2188 self.mode, self.block = mode, block
2189
2190 writable = any(f in mode for f in 'wax+')
2191 readable = any(f in mode for f in 'r+')
2192 flags = functools.reduce(operator.ior, (
2193 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2194 getattr(os, 'O_BINARY', 0), # Windows only
2195 getattr(os, 'O_NOINHERIT', 0), # Windows only
2196 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2197 os.O_APPEND if 'a' in mode else 0,
2198 os.O_EXCL if 'x' in mode else 0,
2199 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2200 ))
2201
2202 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2203
2204 def __enter__(self):
2205 exclusive = 'r' not in self.mode
2206 try:
2207 _lock_file(self.f, exclusive, self.block)
2208 self.locked = True
2209 except OSError:
2210 self.f.close()
2211 raise
2212 if 'w' in self.mode:
2213 try:
2214 self.f.truncate()
2215 except OSError as e:
2216 if e.errno not in (
2217 errno.ESPIPE, # Illegal seek - expected for FIFO
2218 errno.EINVAL, # Invalid argument - expected for /dev/null
2219 ):
2220 raise
2221 return self
2222
2223 def unlock(self):
2224 if not self.locked:
2225 return
2226 try:
2227 _unlock_file(self.f)
2228 finally:
2229 self.locked = False
2230
2231 def __exit__(self, *_):
2232 try:
2233 self.unlock()
2234 finally:
2235 self.f.close()
2236
2237 open = __enter__
2238 close = __exit__
2239
2240 def __getattr__(self, attr):
2241 return getattr(self.f, attr)
2242
2243 def __iter__(self):
2244 return iter(self.f)
2245
2246
2247 @functools.cache
2248 def get_filesystem_encoding():
2249 encoding = sys.getfilesystemencoding()
2250 return encoding if encoding is not None else 'utf-8'
2251
2252
2253 def shell_quote(args):
2254 quoted_args = []
2255 encoding = get_filesystem_encoding()
2256 for a in args:
2257 if isinstance(a, bytes):
2258 # We may get a filename encoded with 'encodeFilename'
2259 a = a.decode(encoding)
2260 quoted_args.append(compat_shlex_quote(a))
2261 return ' '.join(quoted_args)
2262
2263
2264 def smuggle_url(url, data):
2265 """ Pass additional data in a URL for internal use. """
2266
2267 url, idata = unsmuggle_url(url, {})
2268 data.update(idata)
2269 sdata = urllib.parse.urlencode(
2270 {'__youtubedl_smuggle': json.dumps(data)})
2271 return url + '#' + sdata
2272
2273
2274 def unsmuggle_url(smug_url, default=None):
2275 if '#__youtubedl_smuggle' not in smug_url:
2276 return smug_url, default
2277 url, _, sdata = smug_url.rpartition('#')
2278 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2279 data = json.loads(jsond)
2280 return url, data
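
# Illustrative example (annotation, not part of the original module; the
# 'referer' key is arbitrary): data is round-tripped through the URL fragment
#   >>> url = smuggle_url('https://example.com/video', {'referer': 'https://example.com'})
#   >>> unsmuggle_url(url)
#   ('https://example.com/video', {'referer': 'https://example.com'})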
2281
2282
2283 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2284 """ Formats numbers with decimal sufixes like K, M, etc """
2285 num, factor = float_or_none(num), float(factor)
2286 if num is None or num < 0:
2287 return None
2288 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2289 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2290 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2291 if factor == 1024:
2292 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2293 converted = num / (factor ** exponent)
2294 return fmt % (converted, suffix)
2295
2296
2297 def format_bytes(bytes):
2298 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
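
# Illustrative examples (annotation, not part of the original module): with
# factor=1024 the IEC 'i' infix is inserted automatically
#   >>> format_decimal_suffix(12300, '%.1f%s')
#   '12.3k'
#   >>> format_bytes(3 * 1024 ** 2)
#   '3.00MiB'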
2299
2300
2301 def lookup_unit_table(unit_table, s, strict=False):
2302 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
2303 units_re = '|'.join(re.escape(u) for u in unit_table)
2304 m = (re.fullmatch if strict else re.match)(
2305 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
2306 if not m:
2307 return None
2308
2309 num = float(m.group('num').replace(',', '.'))
2310 mult = unit_table[m.group('unit')]
2311 return round(num * mult)
2312
2313
2314 def parse_bytes(s):
2315 """Parse a string indicating a byte quantity into an integer"""
2316 return lookup_unit_table(
2317 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
2318 s.upper(), strict=True)
2319
2320
2321 def parse_filesize(s):
2322 if s is None:
2323 return None
2324
2325 # The lower-case forms are of course incorrect and unofficial,
2326 # but we support those too
2327 _UNIT_TABLE = {
2328 'B': 1,
2329 'b': 1,
2330 'bytes': 1,
2331 'KiB': 1024,
2332 'KB': 1000,
2333 'kB': 1024,
2334 'Kb': 1000,
2335 'kb': 1000,
2336 'kilobytes': 1000,
2337 'kibibytes': 1024,
2338 'MiB': 1024 ** 2,
2339 'MB': 1000 ** 2,
2340 'mB': 1024 ** 2,
2341 'Mb': 1000 ** 2,
2342 'mb': 1000 ** 2,
2343 'megabytes': 1000 ** 2,
2344 'mebibytes': 1024 ** 2,
2345 'GiB': 1024 ** 3,
2346 'GB': 1000 ** 3,
2347 'gB': 1024 ** 3,
2348 'Gb': 1000 ** 3,
2349 'gb': 1000 ** 3,
2350 'gigabytes': 1000 ** 3,
2351 'gibibytes': 1024 ** 3,
2352 'TiB': 1024 ** 4,
2353 'TB': 1000 ** 4,
2354 'tB': 1024 ** 4,
2355 'Tb': 1000 ** 4,
2356 'tb': 1000 ** 4,
2357 'terabytes': 1000 ** 4,
2358 'tebibytes': 1024 ** 4,
2359 'PiB': 1024 ** 5,
2360 'PB': 1000 ** 5,
2361 'pB': 1024 ** 5,
2362 'Pb': 1000 ** 5,
2363 'pb': 1000 ** 5,
2364 'petabytes': 1000 ** 5,
2365 'pebibytes': 1024 ** 5,
2366 'EiB': 1024 ** 6,
2367 'EB': 1000 ** 6,
2368 'eB': 1024 ** 6,
2369 'Eb': 1000 ** 6,
2370 'eb': 1000 ** 6,
2371 'exabytes': 1000 ** 6,
2372 'exbibytes': 1024 ** 6,
2373 'ZiB': 1024 ** 7,
2374 'ZB': 1000 ** 7,
2375 'zB': 1024 ** 7,
2376 'Zb': 1000 ** 7,
2377 'zb': 1000 ** 7,
2378 'zettabytes': 1000 ** 7,
2379 'zebibytes': 1024 ** 7,
2380 'YiB': 1024 ** 8,
2381 'YB': 1000 ** 8,
2382 'yB': 1024 ** 8,
2383 'Yb': 1000 ** 8,
2384 'yb': 1000 ** 8,
2385 'yottabytes': 1000 ** 8,
2386 'yobibytes': 1024 ** 8,
2387 }
2388
2389 return lookup_unit_table(_UNIT_TABLE, s)
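
# Illustrative examples (annotation, not part of the original module): decimal
# and binary prefixes resolve to different byte counts
#   >>> parse_filesize('2 MiB')
#   2097152
#   >>> parse_filesize('5 GB')
#   5000000000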
2390
2391
2392 def parse_count(s):
2393 if s is None:
2394 return None
2395
2396 s = re.sub(r'^[^\d]+\s', '', s).strip()
2397
2398 if re.match(r'^[\d,.]+$', s):
2399 return str_to_int(s)
2400
2401 _UNIT_TABLE = {
2402 'k': 1000,
2403 'K': 1000,
2404 'm': 1000 ** 2,
2405 'M': 1000 ** 2,
2406 'kk': 1000 ** 2,
2407 'KK': 1000 ** 2,
2408 'b': 1000 ** 3,
2409 'B': 1000 ** 3,
2410 }
2411
2412 ret = lookup_unit_table(_UNIT_TABLE, s)
2413 if ret is not None:
2414 return ret
2415
2416 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2417 if mobj:
2418 return str_to_int(mobj.group(1))
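
# Illustrative examples (annotation, not part of the original module):
#   >>> parse_count('1.1M')
#   1100000
#   >>> parse_count('1,234 views')
#   1234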
2419
2420
2421 def parse_resolution(s, *, lenient=False):
2422 if s is None:
2423 return {}
2424
2425 if lenient:
2426 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2427 else:
2428 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2429 if mobj:
2430 return {
2431 'width': int(mobj.group('w')),
2432 'height': int(mobj.group('h')),
2433 }
2434
2435 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2436 if mobj:
2437 return {'height': int(mobj.group(1))}
2438
2439 mobj = re.search(r'\b([48])[kK]\b', s)
2440 if mobj:
2441 return {'height': int(mobj.group(1)) * 540}
2442
2443 return {}
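
# Illustrative examples (annotation, not part of the original module):
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('4K')
#   {'height': 2160}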
2444
2445
2446 def parse_bitrate(s):
2447 if not isinstance(s, str):
2448 return
2449 mobj = re.search(r'\b(\d+)\s*kbps', s)
2450 if mobj:
2451 return int(mobj.group(1))
2452
2453
2454 def month_by_name(name, lang='en'):
2455 """ Return the number of a month by (locale-independently) English name """
2456
2457 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2458
2459 try:
2460 return month_names.index(name) + 1
2461 except ValueError:
2462 return None
2463
2464
2465 def month_by_abbreviation(abbrev):
2466 """ Return the number of a month by (locale-independently) English
2467 abbreviations """
2468
2469 try:
2470 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2471 except ValueError:
2472 return None
2473
2474
2475 def fix_xml_ampersands(xml_str):
2476 """Replace all the '&' by '&amp;' in XML"""
2477 return re.sub(
2478 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2479 '&amp;',
2480 xml_str)
2481
2482
2483 def setproctitle(title):
2484 assert isinstance(title, str)
2485
2486 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2487 try:
2488 import ctypes
2489 except ImportError:
2490 return
2491
2492 try:
2493 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2494 except OSError:
2495 return
2496 except TypeError:
2497 # LoadLibrary in Windows Python 2.7.13 only expects
2498 # a bytestring, but since unicode_literals turns
2499 # every string into a unicode string, it fails.
2500 return
2501 title_bytes = title.encode()
2502 buf = ctypes.create_string_buffer(len(title_bytes))
2503 buf.value = title_bytes
2504 try:
2505 libc.prctl(15, buf, 0, 0, 0)
2506 except AttributeError:
2507 return # Strange libc, just skip this
2508
2509
2510 def remove_start(s, start):
2511 return s[len(start):] if s is not None and s.startswith(start) else s
2512
2513
2514 def remove_end(s, end):
2515 return s[:-len(end)] if s is not None and s.endswith(end) else s
2516
2517
2518 def remove_quotes(s):
2519 if s is None or len(s) < 2:
2520 return s
2521 for quote in ('"', "'", ):
2522 if s[0] == quote and s[-1] == quote:
2523 return s[1:-1]
2524 return s
2525
2526
2527 def get_domain(url):
2528 """
2529 This implementation is inconsistent, but is kept for compatibility.
2530 Use this only for "webpage_url_domain"
2531 """
2532 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
2533
2534
2535 def url_basename(url):
2536 path = urllib.parse.urlparse(url).path
2537 return path.strip('/').split('/')[-1]
2538
2539
2540 def base_url(url):
2541 return re.match(r'https?://[^?#]+/', url).group()
2542
2543
2544 def urljoin(base, path):
2545 if isinstance(path, bytes):
2546 path = path.decode()
2547 if not isinstance(path, str) or not path:
2548 return None
2549 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2550 return path
2551 if isinstance(base, bytes):
2552 base = base.decode()
2553 if not isinstance(base, str) or not re.match(
2554 r'^(?:https?:)?//', base):
2555 return None
2556 return urllib.parse.urljoin(base, path)
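
# Illustrative examples (annotation, not part of the original module): paths
# that already carry a scheme are returned unchanged
#   >>> urljoin('http://foo.de/', '/a')
#   'http://foo.de/a'
#   >>> urljoin('http://foo.de/', 'rtmp://foo.de')
#   'rtmp://foo.de'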
2557
2558
2559 class HEADRequest(urllib.request.Request):
2560 def get_method(self):
2561 return 'HEAD'
2562
2563
2564 class PUTRequest(urllib.request.Request):
2565 def get_method(self):
2566 return 'PUT'
2567
2568
2569 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2570 if get_attr and v is not None:
2571 v = getattr(v, get_attr, None)
2572 try:
2573 return int(v) * invscale // scale
2574 except (ValueError, TypeError, OverflowError):
2575 return default
2576
2577
2578 def str_or_none(v, default=None):
2579 return default if v is None else str(v)
2580
2581
2582 def str_to_int(int_str):
2583 """ A more relaxed version of int_or_none """
2584 if isinstance(int_str, int):
2585 return int_str
2586 elif isinstance(int_str, str):
2587 int_str = re.sub(r'[,\.\+]', '', int_str)
2588 return int_or_none(int_str)
2589
2590
2591 def float_or_none(v, scale=1, invscale=1, default=None):
2592 if v is None:
2593 return default
2594 try:
2595 return float(v) * invscale / scale
2596 except (ValueError, TypeError):
2597 return default
2598
2599
2600 def bool_or_none(v, default=None):
2601 return v if isinstance(v, bool) else default
2602
2603
2604 def strip_or_none(v, default=None):
2605 return v.strip() if isinstance(v, str) else default
2606
2607
2608 def url_or_none(url):
2609 if not url or not isinstance(url, str):
2610 return None
2611 url = url.strip()
2612 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2613
2614
2615 def request_to_url(req):
2616 if isinstance(req, urllib.request.Request):
2617 return req.get_full_url()
2618 else:
2619 return req
2620
2621
2622 def strftime_or_none(timestamp, date_format, default=None):
2623 datetime_object = None
2624 try:
2625 if isinstance(timestamp, (int, float)): # unix timestamp
2626 # Using naive datetime here can break timestamp() in Windows
2627 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2628 datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
2629 elif isinstance(timestamp, str): # assume YYYYMMDD
2630 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2631 date_format = re.sub( # Support %s on windows
2632 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
2633 return datetime_object.strftime(date_format)
2634 except (ValueError, TypeError, AttributeError):
2635 return default
2636
2637
2638 def parse_duration(s):
2639 if not isinstance(s, str):
2640 return None
2641 s = s.strip()
2642 if not s:
2643 return None
2644
2645 days, hours, mins, secs, ms = [None] * 5
2646 m = re.match(r'''(?x)
2647 (?P<before_secs>
2648 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2649 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2650 (?P<ms>[.:][0-9]+)?Z?$
2651 ''', s)
2652 if m:
2653 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2654 else:
2655 m = re.match(
2656 r'''(?ix)(?:P?
2657 (?:
2658 [0-9]+\s*y(?:ears?)?,?\s*
2659 )?
2660 (?:
2661 [0-9]+\s*m(?:onths?)?,?\s*
2662 )?
2663 (?:
2664 [0-9]+\s*w(?:eeks?)?,?\s*
2665 )?
2666 (?:
2667 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2668 )?
2669 T)?
2670 (?:
2671 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2672 )?
2673 (?:
2674 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2675 )?
2676 (?:
2677 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2678 )?Z?$''', s)
2679 if m:
2680 days, hours, mins, secs, ms = m.groups()
2681 else:
2682 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2683 if m:
2684 hours, mins = m.groups()
2685 else:
2686 return None
2687
2688 if ms:
2689 ms = ms.replace(':', '.')
2690 return sum(float(part or 0) * mult for part, mult in (
2691 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
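
# Illustrative examples (annotation, not part of the original module):
# clock-style, ISO 8601-style and free-form durations are all supported
#   >>> parse_duration('9:12:43')
#   33163
#   >>> parse_duration('PT1H30M')
#   5400
#   >>> parse_duration('3 min')
#   180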
2692
2693
2694 def prepend_extension(filename, ext, expected_real_ext=None):
2695 name, real_ext = os.path.splitext(filename)
2696 return (
2697 f'{name}.{ext}{real_ext}'
2698 if not expected_real_ext or real_ext[1:] == expected_real_ext
2699 else f'{filename}.{ext}')
2700
2701
2702 def replace_extension(filename, ext, expected_real_ext=None):
2703 name, real_ext = os.path.splitext(filename)
2704 return '{}.{}'.format(
2705 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2706 ext)
2707
2708
2709 def check_executable(exe, args=[]):
2710 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2711 args can be a list of arguments for a short output (like -version) """
2712 try:
2713 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2714 except OSError:
2715 return False
2716 return exe
2717
2718
2719 def _get_exe_version_output(exe, args):
2720 try:
2721 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2722 # SIGTTOU if yt-dlp is run in the background.
2723 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2724 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2725 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2726 except OSError:
2727 return False
2728 return stdout
2729
2730
2731 def detect_exe_version(output, version_re=None, unrecognized='present'):
2732 assert isinstance(output, str)
2733 if version_re is None:
2734 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2735 m = re.search(version_re, output)
2736 if m:
2737 return m.group(1)
2738 else:
2739 return unrecognized
2740
2741
2742 def get_exe_version(exe, args=['--version'],
2743 version_re=None, unrecognized='present'):
2744 """ Returns the version of the specified executable,
2745 or False if the executable is not present """
2746 out = _get_exe_version_output(exe, args)
2747 return detect_exe_version(out, version_re, unrecognized) if out else False
2748
2749
2750 def frange(start=0, stop=None, step=1):
2751 """Float range"""
2752 if stop is None:
2753 start, stop = 0, start
2754 sign = [-1, 1][step > 0] if step else 0
2755 while sign * start < sign * stop:
2756 yield start
2757 start += step
2758
2759
2760 class LazyList(collections.abc.Sequence):
2761 """Lazy immutable list from an iterable
2762 Note that slices of a LazyList are lists, not LazyLists
2763
2764 class IndexError(IndexError):
2765 pass
2766
2767 def __init__(self, iterable, *, reverse=False, _cache=None):
2768 self._iterable = iter(iterable)
2769 self._cache = [] if _cache is None else _cache
2770 self._reversed = reverse
2771
2772 def __iter__(self):
2773 if self._reversed:
2774 # We need to consume the entire iterable to iterate in reverse
2775 yield from self.exhaust()
2776 return
2777 yield from self._cache
2778 for item in self._iterable:
2779 self._cache.append(item)
2780 yield item
2781
2782 def _exhaust(self):
2783 self._cache.extend(self._iterable)
2784 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2785 return self._cache
2786
2787 def exhaust(self):
2788 """Evaluate the entire iterable"""
2789 return self._exhaust()[::-1 if self._reversed else 1]
2790
2791 @staticmethod
2792 def _reverse_index(x):
2793 return None if x is None else ~x
2794
2795 def __getitem__(self, idx):
2796 if isinstance(idx, slice):
2797 if self._reversed:
2798 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2799 start, stop, step = idx.start, idx.stop, idx.step or 1
2800 elif isinstance(idx, int):
2801 if self._reversed:
2802 idx = self._reverse_index(idx)
2803 start, stop, step = idx, idx, 0
2804 else:
2805 raise TypeError('indices must be integers or slices')
2806 if ((start or 0) < 0 or (stop or 0) < 0
2807 or (start is None and step < 0)
2808 or (stop is None and step > 0)):
2809 # We need to consume the entire iterable to be able to slice from the end
2810 # Obviously, never use this with infinite iterables
2811 self._exhaust()
2812 try:
2813 return self._cache[idx]
2814 except IndexError as e:
2815 raise self.IndexError(e) from e
2816 n = max(start or 0, stop or 0) - len(self._cache) + 1
2817 if n > 0:
2818 self._cache.extend(itertools.islice(self._iterable, n))
2819 try:
2820 return self._cache[idx]
2821 except IndexError as e:
2822 raise self.IndexError(e) from e
2823
2824 def __bool__(self):
2825 try:
2826 self[-1] if self._reversed else self[0]
2827 except self.IndexError:
2828 return False
2829 return True
2830
2831 def __len__(self):
2832 self._exhaust()
2833 return len(self._cache)
2834
2835 def __reversed__(self):
2836 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2837
2838 def __copy__(self):
2839 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2840
2841 def __repr__(self):
2842 # repr and str should mimic a list. So we exhaust the iterable
2843 return repr(self.exhaust())
2844
2845 def __str__(self):
2846 return repr(self.exhaust())
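
# Illustrative example (annotation, not part of the original module): items
# are pulled from the iterable only as far as indexing requires, so even an
# infinite iterator is safe as long as only forward indices are used
#   >>> LazyList(itertools.count())[5]
#   5
#   >>> LazyList(range(5), reverse=True)[0]
#   4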
2847
2848
2849 class PagedList:
2850
2851 class IndexError(IndexError):
2852 pass
2853
2854 def __len__(self):
2855 # This is only useful for tests
2856 return len(self.getslice())
2857
2858 def __init__(self, pagefunc, pagesize, use_cache=True):
2859 self._pagefunc = pagefunc
2860 self._pagesize = pagesize
2861 self._pagecount = float('inf')
2862 self._use_cache = use_cache
2863 self._cache = {}
2864
2865 def getpage(self, pagenum):
2866 page_results = self._cache.get(pagenum)
2867 if page_results is None:
2868 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2869 if self._use_cache:
2870 self._cache[pagenum] = page_results
2871 return page_results
2872
2873 def getslice(self, start=0, end=None):
2874 return list(self._getslice(start, end))
2875
2876 def _getslice(self, start, end):
2877 raise NotImplementedError('This method must be implemented by subclasses')
2878
2879 def __getitem__(self, idx):
2880 assert self._use_cache, 'Indexing PagedList requires cache'
2881 if not isinstance(idx, int) or idx < 0:
2882 raise TypeError('indices must be non-negative integers')
2883 entries = self.getslice(idx, idx + 1)
2884 if not entries:
2885 raise self.IndexError()
2886 return entries[0]
2887
2888
2889 class OnDemandPagedList(PagedList):
2890 """Download pages until a page with less than maximum results"""
2891
2892 def _getslice(self, start, end):
2893 for pagenum in itertools.count(start // self._pagesize):
2894 firstid = pagenum * self._pagesize
2895 nextfirstid = pagenum * self._pagesize + self._pagesize
2896 if start >= nextfirstid:
2897 continue
2898
2899 startv = (
2900 start % self._pagesize
2901 if firstid <= start < nextfirstid
2902 else 0)
2903 endv = (
2904 ((end - 1) % self._pagesize) + 1
2905 if (end is not None and firstid <= end <= nextfirstid)
2906 else None)
2907
2908 try:
2909 page_results = self.getpage(pagenum)
2910 except Exception:
2911 self._pagecount = pagenum - 1
2912 raise
2913 if startv != 0 or endv is not None:
2914 page_results = page_results[startv:endv]
2915 yield from page_results
2916
2917 # A little optimization - if the current page is not "full", i.e. does
2918 # not contain page_size videos, then we can assume that this page
2919 # is the last one - there are no more ids on further pages -
2920 # i.e. there is no need to query again.
2921 if len(page_results) + startv < self._pagesize:
2922 break
2923
2924 # If we got the whole page, but the next page is not interesting,
2925 # break out early as well
2926 if end == nextfirstid:
2927 break
2928
2929
2930 class InAdvancePagedList(PagedList):
2931 """PagedList with total number of pages known in advance"""
2932
2933 def __init__(self, pagefunc, pagecount, pagesize):
2934 PagedList.__init__(self, pagefunc, pagesize, True)
2935 self._pagecount = pagecount
2936
2937 def _getslice(self, start, end):
2938 start_page = start // self._pagesize
2939 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2940 skip_elems = start - start_page * self._pagesize
2941 only_more = None if end is None else end - start
2942 for pagenum in range(start_page, end_page):
2943 page_results = self.getpage(pagenum)
2944 if skip_elems:
2945 page_results = page_results[skip_elems:]
2946 skip_elems = None
2947 if only_more is not None:
2948 if len(page_results) < only_more:
2949 only_more -= len(page_results)
2950 else:
2951 yield from page_results[:only_more]
2952 break
2953 yield from page_results
2954
2955
2956 class PlaylistEntries:
2957 MissingEntry = object()
2958 is_exhausted = False
2959
2960 def __init__(self, ydl, info_dict):
2961 self.ydl = ydl
2962
2963 # _entries must be assigned now since info_dict can change during iteration
2964 entries = info_dict.get('entries')
2965 if entries is None:
2966 raise EntryNotInPlaylist('There are no entries')
2967 elif isinstance(entries, list):
2968 self.is_exhausted = True
2969
2970 requested_entries = info_dict.get('requested_entries')
2971 self.is_incomplete = requested_entries is not None
2972 if self.is_incomplete:
2973 assert self.is_exhausted
2974 self._entries = [self.MissingEntry] * max(requested_entries or [0])
2975 for i, entry in zip(requested_entries, entries):
2976 self._entries[i - 1] = entry
2977 elif isinstance(entries, (list, PagedList, LazyList)):
2978 self._entries = entries
2979 else:
2980 self._entries = LazyList(entries)
2981
2982 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2983 (?P<start>[+-]?\d+)?
2984 (?P<range>[:-]
2985 (?P<end>[+-]?\d+|inf(?:inite)?)?
2986 (?::(?P<step>[+-]?\d+))?
2987 )?''')
2988
2989 @classmethod
2990 def parse_playlist_items(cls, string):
2991 for segment in string.split(','):
2992 if not segment:
2993 raise ValueError('There are two or more consecutive commas')
2994 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2995 if not mobj:
2996 raise ValueError(f'{segment!r} is not a valid specification')
2997 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2998 if int_or_none(step) == 0:
2999 raise ValueError(f'Step in {segment!r} cannot be zero')
3000 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
3001
3002 def get_requested_items(self):
3003 playlist_items = self.ydl.params.get('playlist_items')
3004 playlist_start = self.ydl.params.get('playliststart', 1)
3005 playlist_end = self.ydl.params.get('playlistend')
3006 # For backwards compatibility, interpret -1 as whole list
3007 if playlist_end in (-1, None):
3008 playlist_end = ''
3009 if not playlist_items:
3010 playlist_items = f'{playlist_start}:{playlist_end}'
3011 elif playlist_start != 1 or playlist_end:
3012 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
3013
3014 for index in self.parse_playlist_items(playlist_items):
3015 for i, entry in self[index]:
3016 yield i, entry
3017 if not entry:
3018 continue
3019 try:
3020 # TODO: Add auto-generated fields
3021 self.ydl._match_entry(entry, incomplete=True, silent=True)
3022 except (ExistingVideoReached, RejectedVideoReached):
3023 return
3024
3025 def get_full_count(self):
3026 if self.is_exhausted and not self.is_incomplete:
3027 return len(self)
3028 elif isinstance(self._entries, InAdvancePagedList):
3029 if self._entries._pagesize == 1:
3030 return self._entries._pagecount
3031
3032 @functools.cached_property
3033 def _getter(self):
3034 if isinstance(self._entries, list):
3035 def get_entry(i):
3036 try:
3037 entry = self._entries[i]
3038 except IndexError:
3039 entry = self.MissingEntry
3040 if not self.is_incomplete:
3041 raise self.IndexError()
3042 if entry is self.MissingEntry:
3043 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
3044 return entry
3045 else:
3046 def get_entry(i):
3047 try:
3048 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
3049 except (LazyList.IndexError, PagedList.IndexError):
3050 raise self.IndexError()
3051 return get_entry
3052
3053 def __getitem__(self, idx):
3054 if isinstance(idx, int):
3055 idx = slice(idx, idx)
3056
3057 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
3058 step = 1 if idx.step is None else idx.step
3059 if idx.start is None:
3060 start = 0 if step > 0 else len(self) - 1
3061 else:
3062 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
3063
3064 # NB: Do not call len(self) when idx == [:]
3065 if idx.stop is None:
3066 stop = 0 if step < 0 else float('inf')
3067 else:
3068 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
3069 stop += [-1, 1][step > 0]
3070
3071 for i in frange(start, stop, step):
3072 if i < 0:
3073 continue
3074 try:
3075 entry = self._getter(i)
3076 except self.IndexError:
3077 self.is_exhausted = True
3078 if step > 0:
3079 break
3080 continue
3081 yield i + 1, entry
3082
3083 def __len__(self):
3084 return len(tuple(self[:]))
3085
3086 class IndexError(IndexError):
3087 pass
3088
3089
3090 def uppercase_escape(s):
3091 unicode_escape = codecs.getdecoder('unicode_escape')
3092 return re.sub(
3093 r'\\U[0-9a-fA-F]{8}',
3094 lambda m: unicode_escape(m.group(0))[0],
3095 s)
3096
3097
3098 def lowercase_escape(s):
3099 unicode_escape = codecs.getdecoder('unicode_escape')
3100 return re.sub(
3101 r'\\u[0-9a-fA-F]{4}',
3102 lambda m: unicode_escape(m.group(0))[0],
3103 s)
3104
3105
3106 def escape_rfc3986(s):
3107 """Escape non-ASCII characters as suggested by RFC 3986"""
3108 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
3109
3110
3111 def escape_url(url):
3112 """Escape URL as suggested by RFC 3986"""
3113 url_parsed = urllib.parse.urlparse(url)
3114 return url_parsed._replace(
3115 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
3116 path=escape_rfc3986(url_parsed.path),
3117 params=escape_rfc3986(url_parsed.params),
3118 query=escape_rfc3986(url_parsed.query),
3119 fragment=escape_rfc3986(url_parsed.fragment)
3120 ).geturl()
3121
3122
3123 def parse_qs(url, **kwargs):
3124 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
3125
3126
3127 def read_batch_urls(batch_fd):
3128 def fixup(url):
3129 if not isinstance(url, str):
3130 url = url.decode('utf-8', 'replace')
3131 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3132 for bom in BOM_UTF8:
3133 if url.startswith(bom):
3134 url = url[len(bom):]
3135 url = url.lstrip()
3136 if not url or url.startswith(('#', ';', ']')):
3137 return False
3138 # "#" cannot be stripped out since it is part of the URI
3139 # However, it can be safely stripped out if it follows whitespace
3140 return re.split(r'\s#', url, 1)[0].rstrip()
3141
3142 with contextlib.closing(batch_fd) as fd:
3143 return [url for url in map(fixup, fd) if url]
3144
3145
3146 def urlencode_postdata(*args, **kargs):
3147 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3148
3149
3150 def update_url_query(url, query):
3151 if not query:
3152 return url
3153 parsed_url = urllib.parse.urlparse(url)
3154 qs = urllib.parse.parse_qs(parsed_url.query)
3155 qs.update(query)
3156 return urllib.parse.urlunparse(parsed_url._replace(
3157 query=urllib.parse.urlencode(qs, True)))
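
# Illustrative example (annotation, not part of the original module): existing
# parameters are kept and merged with the new ones
#   >>> update_url_query('http://example.com/?a=1', {'b': '2'})
#   'http://example.com/?a=1&b=2'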
3158
3159
3160 def update_Request(req, url=None, data=None, headers=None, query=None):
3161 req_headers = req.headers.copy()
3162 req_headers.update(headers or {})
3163 req_data = data or req.data
3164 req_url = update_url_query(url or req.get_full_url(), query)
3165 req_get_method = req.get_method()
3166 if req_get_method == 'HEAD':
3167 req_type = HEADRequest
3168 elif req_get_method == 'PUT':
3169 req_type = PUTRequest
3170 else:
3171 req_type = urllib.request.Request
3172 new_req = req_type(
3173 req_url, data=req_data, headers=req_headers,
3174 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3175 if hasattr(req, 'timeout'):
3176 new_req.timeout = req.timeout
3177 return new_req
3178
3179
3180 def _multipart_encode_impl(data, boundary):
3181 content_type = 'multipart/form-data; boundary=%s' % boundary
3182
3183 out = b''
3184 for k, v in data.items():
3185 out += b'--' + boundary.encode('ascii') + b'\r\n'
3186 if isinstance(k, str):
3187 k = k.encode()
3188 if isinstance(v, str):
3189 v = v.encode()
3190 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3191 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3192 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3193 if boundary.encode('ascii') in content:
3194 raise ValueError('Boundary overlaps with data')
3195 out += content
3196
3197 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3198
3199 return out, content_type
3200
3201
3202 def multipart_encode(data, boundary=None):
3203 '''
3204 Encode a dict to RFC 7578-compliant form-data
3205
3206 data:
3207 A dict where keys and values can be either Unicode or bytes-like
3208 objects.
3209 boundary:
3210 If specified, a Unicode object to be used as the boundary. Otherwise,
3211 a random boundary is generated.
3212
3213 Reference: https://tools.ietf.org/html/rfc7578
3214 '''
3215 has_specified_boundary = boundary is not None
3216
3217 while True:
3218 if boundary is None:
3219 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3220
3221 try:
3222 out, content_type = _multipart_encode_impl(data, boundary)
3223 break
3224 except ValueError:
3225 if has_specified_boundary:
3226 raise
3227 boundary = None
3228
3229 return out, content_type
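
# Illustrative example (annotation, not part of the original module; a fixed
# boundary makes the output deterministic):
#   >>> multipart_encode({b'field': b'value'}, boundary='AAAAAA')[0]
#   b'--AAAAAA\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--AAAAAA--\r\n'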
3230
3231
3232 def variadic(x, allowed_types=(str, bytes, dict)):
3233 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
3234
3235
3236 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3237 for val in map(d.get, variadic(key_or_keys)):
3238 if val is not None and (val or not skip_false_values):
3239 return val
3240 return default
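
# Illustrative examples (annotation, not part of the original module): falsy
# values are skipped unless skip_false_values is disabled
#   >>> dict_get({'a': '', 'c': 42}, ('a', 'c'))
#   42
#   >>> dict_get({'a': '', 'c': 42}, ('a', 'c'), skip_false_values=False)
#   ''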
3241
3242
3243 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3244 for f in funcs:
3245 try:
3246 val = f(*args, **kwargs)
3247 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
3248 pass
3249 else:
3250 if expected_type is None or isinstance(val, expected_type):
3251 return val
3252
3253
3254 def try_get(src, getter, expected_type=None):
3255 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3256
3257
3258 def filter_dict(dct, cndn=lambda _, v: v is not None):
3259 return {k: v for k, v in dct.items() if cndn(k, v)}
3260
3261
3262 def merge_dicts(*dicts):
3263 merged = {}
3264 for a_dict in dicts:
3265 for k, v in a_dict.items():
3266 if (v is not None and k not in merged
3267 or isinstance(v, str) and merged[k] == ''):
3268 merged[k] = v
3269 return merged
3270
3271
3272 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3273 return string if isinstance(string, str) else str(string, encoding, errors)
3274
3275
3276 US_RATINGS = {
3277 'G': 0,
3278 'PG': 10,
3279 'PG-13': 13,
3280 'R': 16,
3281 'NC': 18,
3282 }
3283
3284
3285 TV_PARENTAL_GUIDELINES = {
3286 'TV-Y': 0,
3287 'TV-Y7': 7,
3288 'TV-G': 0,
3289 'TV-PG': 0,
3290 'TV-14': 14,
3291 'TV-MA': 17,
3292 }
3293
3294
3295 def parse_age_limit(s):
3296 # isinstance(False, int) is True. So type() must be used instead
3297 if type(s) is int: # noqa: E721
3298 return s if 0 <= s <= 21 else None
3299 elif not isinstance(s, str):
3300 return None
3301 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3302 if m:
3303 return int(m.group('age'))
3304 s = s.upper()
3305 if s in US_RATINGS:
3306 return US_RATINGS[s]
3307 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3308 if m:
3309 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3310 return None
3311
3312
3313 def strip_jsonp(code):
3314 return re.sub(
3315 r'''(?sx)^
3316 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3317 (?:\s*&&\s*(?P=func_name))?
3318 \s*\(\s*(?P<callback_data>.*)\);?
3319 \s*?(?://[^\n]*)*$''',
3320 r'\g<callback_data>', code)
3321
3322
3323 def js_to_json(code, vars={}, *, strict=False):
3324 # vars is a dict of var, val pairs to substitute
3325 STRING_QUOTES = '\'"'
3326 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
3327 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3328 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3329 INTEGER_TABLE = (
3330 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3331 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3332 )
3333
3334 def process_escape(match):
3335 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3336 escape = match.group(1) or match.group(2)
3337
3338 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3339 else R'\u00' if escape == 'x'
3340 else '' if escape == '\n'
3341 else escape)
3342
3343 def fix_kv(m):
3344 v = m.group(0)
3345 if v in ('true', 'false', 'null'):
3346 return v
3347 elif v in ('undefined', 'void 0'):
3348 return 'null'
3349 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3350 return ''
3351
3352 if v[0] in STRING_QUOTES:
3353 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v[1:-1])
3354 return f'"{escaped}"'
3355
3356 for regex, base in INTEGER_TABLE:
3357 im = re.match(regex, v)
3358 if im:
3359 i = int(im.group(1), base)
3360 return f'"{i}":' if v.endswith(':') else str(i)
3361
3362 if v in vars:
3363 try:
3364 if not strict:
3365 json.loads(vars[v])
3366 except json.decoder.JSONDecodeError:
3367 return json.dumps(vars[v])
3368 else:
3369 return vars[v]
3370
3371 if not strict:
3372 return f'"{v}"'
3373
3374 raise ValueError(f'Unknown value: {v}')
3375
3376 def create_map(mobj):
3377 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3378
3379 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3380 if not strict:
3381 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3382 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
3383
3384 return re.sub(rf'''(?sx)
3385 {STRING_RE}|
3386 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
3387 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3388 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3389 [0-9]+(?={SKIP_RE}:)|
3390 !+
3391 ''', fix_kv, code)
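
# Illustrative example (annotation, not part of the original module): unquoted
# keys, hex literals and `undefined` are converted to valid JSON
#   >>> js_to_json("{a: 0xff, 'b': true, c: undefined}")
#   '{"a": 255, "b": true, "c": null}'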
3392
3393
3394 def qualities(quality_ids):
3395 """ Get a numeric quality value out of a list of possible values """
3396 def q(qid):
3397 try:
3398 return quality_ids.index(qid)
3399 except ValueError:
3400 return -1
3401 return q
3402
3403
3404 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3405
3406
3407 DEFAULT_OUTTMPL = {
3408 'default': '%(title)s [%(id)s].%(ext)s',
3409 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3410 }
3411 OUTTMPL_TYPES = {
3412 'chapter': None,
3413 'subtitle': None,
3414 'thumbnail': None,
3415 'description': 'description',
3416 'annotation': 'annotations.xml',
3417 'infojson': 'info.json',
3418 'link': None,
3419 'pl_video': None,
3420 'pl_thumbnail': None,
3421 'pl_description': 'description',
3422 'pl_infojson': 'info.json',
3423 }
3424
3425 # As of [1], the format syntax is:
3426 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3427 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3428 STR_FORMAT_RE_TMPL = r'''(?x)
3429 (?<!%)(?P<prefix>(?:%%)*)
3430 %
3431 (?P<has_key>\((?P<key>{0})\))?
3432 (?P<format>
3433 (?P<conversion>[#0\-+ ]+)?
3434 (?P<min_width>\d+)?
3435 (?P<precision>\.\d+)?
3436 (?P<len_mod>[hlL])? # unused in python
3437 {1} # conversion type
3438 )
3439 '''
3440
3441
3442 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3443
3444
3445 def limit_length(s, length):
3446 """ Add ellipses to overly long strings """
3447 if s is None:
3448 return None
3449 ELLIPSES = '...'
3450 if len(s) > length:
3451 return s[:length - len(ELLIPSES)] + ELLIPSES
3452 return s
3453
3454
3455 def version_tuple(v):
3456 return tuple(int(e) for e in re.split(r'[-.]', v))
3457
3458
3459 def is_outdated_version(version, limit, assume_new=True):
3460 if not version:
3461 return not assume_new
3462 try:
3463 return version_tuple(version) < version_tuple(limit)
3464 except ValueError:
3465 return not assume_new
3466
3467
3468 def ytdl_is_updateable():
3469 """ Returns if yt-dlp can be updated with -U """
3470
3471 from .update import is_non_updateable
3472
3473 return not is_non_updateable()
3474
3475
3476 def args_to_str(args):
3477 # Get a short string representation for a subprocess command
3478 return ' '.join(compat_shlex_quote(a) for a in args)
3479
3480
3481 def error_to_compat_str(err):
3482 return str(err)
3483
3484
3485 def error_to_str(err):
3486 return f'{type(err).__name__}: {err}'
3487
3488
3489 def mimetype2ext(mt, default=NO_DEFAULT):
3490 if not isinstance(mt, str):
3491 if default is not NO_DEFAULT:
3492 return default
3493 return None
3494
3495 MAP = {
3496 # video
3497 '3gpp': '3gp',
3498 'mp2t': 'ts',
3499 'mp4': 'mp4',
3500 'mpeg': 'mpeg',
3501 'mpegurl': 'm3u8',
3502 'quicktime': 'mov',
3503 'webm': 'webm',
3504 'vp9': 'vp9',
3505 'x-flv': 'flv',
3506 'x-m4v': 'm4v',
3507 'x-matroska': 'mkv',
3508 'x-mng': 'mng',
3509 'x-mp4-fragmented': 'mp4',
3510 'x-ms-asf': 'asf',
3511 'x-ms-wmv': 'wmv',
3512 'x-msvideo': 'avi',
3513
3514 # application (streaming playlists)
3515 'dash+xml': 'mpd',
3516 'f4m+xml': 'f4m',
3517 'hds+xml': 'f4m',
3518 'vnd.apple.mpegurl': 'm3u8',
3519 'vnd.ms-sstr+xml': 'ism',
3520 'x-mpegurl': 'm3u8',
3521
3522 # audio
3523 'audio/mp4': 'm4a',
3524 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
3525 # Using .mp3 as it's the most popular one
3526 'audio/mpeg': 'mp3',
3527 'audio/webm': 'weba',
3528 'audio/x-matroska': 'mka',
3529 'audio/x-mpegurl': 'm3u',
3530 'midi': 'mid',
3531 'ogg': 'ogg',
3532 'wav': 'wav',
3533 'wave': 'wav',
3534 'x-aac': 'aac',
3535 'x-flac': 'flac',
3536 'x-m4a': 'm4a',
3537 'x-realaudio': 'ra',
3538 'x-wav': 'wav',
3539
3540 # image
3541 'avif': 'avif',
3542 'bmp': 'bmp',
3543 'gif': 'gif',
3544 'jpeg': 'jpg',
3545 'png': 'png',
3546 'svg+xml': 'svg',
3547 'tiff': 'tif',
3548 'vnd.wap.wbmp': 'wbmp',
3549 'webp': 'webp',
3550 'x-icon': 'ico',
3551 'x-jng': 'jng',
3552 'x-ms-bmp': 'bmp',
3553
3554 # caption
3555 'filmstrip+json': 'fs',
3556 'smptett+xml': 'tt',
3557 'ttaf+xml': 'dfxp',
3558 'ttml+xml': 'ttml',
3559 'x-ms-sami': 'sami',
3560
3561 # misc
3562 'gzip': 'gz',
3563 'json': 'json',
3564 'xml': 'xml',
3565 'zip': 'zip',
3566 }
3567
3568 mimetype = mt.partition(';')[0].strip().lower()
3569 _, _, subtype = mimetype.rpartition('/')
3570
3571 ext = traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
3572 if ext:
3573 return ext
3574 elif default is not NO_DEFAULT:
3575 return default
3576 return subtype.replace('+', '.')
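# e.g. mimetype2ext('video/mp4; codecs="avc1.64001f"') == 'mp4' and
# mimetype2ext('application/vnd.apple.mpegurl') == 'm3u8', while a fully
# unmapped 'application/x-foo+bar' falls through to 'x-foo.bar'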
3577
3578
3579 def ext2mimetype(ext_or_url):
3580 if not ext_or_url:
3581 return None
3582 if '.' not in ext_or_url:
3583 ext_or_url = f'file.{ext_or_url}'
3584 return mimetypes.guess_type(ext_or_url)[0]
3585
3586
3587 def parse_codecs(codecs_str):
3588 # http://tools.ietf.org/html/rfc6381
3589 if not codecs_str:
3590 return {}
3591 split_codecs = list(filter(None, map(
3592 str.strip, codecs_str.strip().strip(',').split(','))))
3593 vcodec, acodec, scodec, hdr = None, None, None, None
3594 for full_codec in split_codecs:
3595 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3596 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3597 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3598 if vcodec:
3599 continue
3600 vcodec = full_codec
3601 if parts[0] in ('dvh1', 'dvhe'):
3602 hdr = 'DV'
3603 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3604 hdr = 'HDR10'
3605 elif parts[:2] == ['vp9', '2']:
3606 hdr = 'HDR10'
3607 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
3608 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3609 acodec = acodec or full_codec
3610 elif parts[0] in ('stpp', 'wvtt'):
3611 scodec = scodec or full_codec
3612 else:
3613 write_string(f'WARNING: Unknown codec {full_codec}\n')
3614 if vcodec or acodec or scodec:
3615 return {
3616 'vcodec': vcodec or 'none',
3617 'acodec': acodec or 'none',
3618 'dynamic_range': hdr,
3619 **({'scodec': scodec} if scodec is not None else {}),
3620 }
3621 elif len(split_codecs) == 2:
3622 return {
3623 'vcodec': split_codecs[0],
3624 'acodec': split_codecs[1],
3625 }
3626 return {}
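# e.g. parse_codecs('avc1.64001f, mp4a.40.2') ==
#   {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
# ('scodec' is only included when a subtitle codec was seen)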
3627
3628
3629 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3630 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3631
3632 allow_mkv = not preferences or 'mkv' in preferences
3633
3634 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3635 return 'mkv' # TODO: any other format allows this?
3636
3637 # TODO: Not all codecs supported by parse_codecs are handled here
3638 COMPATIBLE_CODECS = {
3639 'mp4': {
3640 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
3641 'h264', 'aacl', 'ec-3', # Set in ISM
3642 },
3643 'webm': {
3644 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3645 'vp9x', 'vp8x', # in the webm spec
3646 },
3647 }
3648
3649 sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
3650 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
3651
3652 for ext in preferences or COMPATIBLE_CODECS.keys():
3653 codec_set = COMPATIBLE_CODECS.get(ext, set())
3654 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3655 return ext
3656
3657 COMPATIBLE_EXTS = (
3658 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3659 {'webm'},
3660 )
3661 for ext in preferences or vexts:
3662 current_exts = {ext, *vexts, *aexts}
3663 if ext == 'mkv' or current_exts == {ext} or any(
3664 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3665 return ext
3666 return 'mkv' if allow_mkv else preferences[-1]
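# e.g. get_compatible_ext(vcodecs=['avc1.64001f'], acodecs=['mp4a.40.2'],
#                         vexts=['mp4'], aexts=['m4a']) == 'mp4'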
3667
3668
3669 def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
3670 getheader = url_handle.headers.get
3671
3672 cd = getheader('Content-Disposition')
3673 if cd:
3674 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3675 if m:
3676 e = determine_ext(m.group('filename'), default_ext=None)
3677 if e:
3678 return e
3679
3680 meta_ext = getheader('x-amz-meta-name')
3681 if meta_ext:
3682 e = meta_ext.rpartition('.')[2]
3683 if e:
3684 return e
3685
3686 return mimetype2ext(getheader('Content-Type'), default=default)
3687
3688
3689 def encode_data_uri(data, mime_type):
3690 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3691
3692
3693 def age_restricted(content_limit, age_limit):
3694 """ Returns True iff the content should be blocked """
3695
3696 if age_limit is None: # No limit set
3697 return False
3698 if content_limit is None:
3699 return False # Content available for everyone
3700 return age_limit < content_limit
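# e.g. age_restricted(18, 17) is True (viewer is 17, content requires 18)
# while age_restricted(18, 18) is False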
3701
3702
3703 # List of known byte-order-marks (BOM)
3704 BOMS = [
3705 (b'\xef\xbb\xbf', 'utf-8'),
3706 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3707 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3708 (b'\xff\xfe', 'utf-16-le'),
3709 (b'\xfe\xff', 'utf-16-be'),
3710 ]
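# Note: the UTF-32 BOMs must stay listed before the UTF-16 ones, since
# b'\xff\xfe' (UTF-16-LE) is a prefix of b'\xff\xfe\x00\x00' (UTF-32-LE)
# and is_html() checks them in order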
3711
3712
3713 def is_html(first_bytes):
3714 """ Detect whether a file contains HTML by examining its first bytes. """
3715
3716 encoding = 'utf-8'
3717 for bom, enc in BOMS:
3718 while first_bytes.startswith(bom):
3719 encoding, first_bytes = enc, first_bytes[len(bom):]
3720
3721 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3722
3723
3724 def determine_protocol(info_dict):
3725 protocol = info_dict.get('protocol')
3726 if protocol is not None:
3727 return protocol
3728
3729 url = sanitize_url(info_dict['url'])
3730 if url.startswith('rtmp'):
3731 return 'rtmp'
3732 elif url.startswith('mms'):
3733 return 'mms'
3734 elif url.startswith('rtsp'):
3735 return 'rtsp'
3736
3737 ext = determine_ext(url)
3738 if ext == 'm3u8':
3739 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
3740 elif ext == 'f4m':
3741 return 'f4m'
3742
3743 return urllib.parse.urlparse(url).scheme
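# e.g. determine_protocol({'url': 'https://example.com/live.m3u8', 'is_live': True})
# == 'm3u8', while the same URL without is_live yields 'm3u8_native'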
3744
3745
3746 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3747 """ Render a list of rows, each as a list of values.
3748 Text after a \t will be right aligned """
3749 def width(string):
3750 return len(remove_terminal_sequences(string).replace('\t', ''))
3751
3752 def get_max_lens(table):
3753 return [max(width(str(v)) for v in col) for col in zip(*table)]
3754
3755 def filter_using_list(row, filterArray):
3756 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3757
3758 max_lens = get_max_lens(data) if hide_empty else []
3759 header_row = filter_using_list(header_row, max_lens)
3760 data = [filter_using_list(row, max_lens) for row in data]
3761
3762 table = [header_row] + data
3763 max_lens = get_max_lens(table)
3764 extra_gap += 1
3765 if delim:
3766 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3767 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3768 for row in table:
3769 for pos, text in enumerate(map(str, row)):
3770 if '\t' in text:
3771 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3772 else:
3773 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3774 ret = '\n'.join(''.join(row).rstrip() for row in table)
3775 return ret
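# e.g. render_table(['ID', 'NAME'], [['1', 'foo'], ['22', 'bar']]) produces
#   ID NAME
#   1  foo
#   22 bar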
3776
3777
3778 def _match_one(filter_part, dct, incomplete):
3779 # TODO: Generalize code with YoutubeDL._build_format_filter
3780 STRING_OPERATORS = {
3781 '*=': operator.contains,
3782 '^=': lambda attr, value: attr.startswith(value),
3783 '$=': lambda attr, value: attr.endswith(value),
3784 '~=': lambda attr, value: re.search(value, attr),
3785 }
3786 COMPARISON_OPERATORS = {
3787 **STRING_OPERATORS,
3788 '<=': operator.le, # "<=" must be defined above "<"
3789 '<': operator.lt,
3790 '>=': operator.ge,
3791 '>': operator.gt,
3792 '=': operator.eq,
3793 }
3794
3795 if isinstance(incomplete, bool):
3796 is_incomplete = lambda _: incomplete
3797 else:
3798 is_incomplete = lambda k: k in incomplete
3799
3800 operator_rex = re.compile(r'''(?x)
3801 (?P<key>[a-z_]+)
3802 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3803 (?:
3804 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3805 (?P<strval>.+?)
3806 )
3807 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3808 m = operator_rex.fullmatch(filter_part.strip())
3809 if m:
3810 m = m.groupdict()
3811 unnegated_op = COMPARISON_OPERATORS[m['op']]
3812 if m['negation']:
3813 op = lambda attr, value: not unnegated_op(attr, value)
3814 else:
3815 op = unnegated_op
3816 comparison_value = m['quotedstrval'] or m['strval']  # the regex above defines no 'intval' group
3817 if m['quote']:
3818 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3819 actual_value = dct.get(m['key'])
3820 numeric_comparison = None
3821 if isinstance(actual_value, (int, float)):
3822 # If the original field is a string and the matching comparison value is
3823 # a number we should respect the origin of the original field
3824 # and process comparison value as a string (see
3825 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3826 try:
3827 numeric_comparison = int(comparison_value)
3828 except ValueError:
3829 numeric_comparison = parse_filesize(comparison_value)
3830 if numeric_comparison is None:
3831 numeric_comparison = parse_filesize(f'{comparison_value}B')
3832 if numeric_comparison is None:
3833 numeric_comparison = parse_duration(comparison_value)
3834 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3835 raise ValueError('Operator %s only supports string values!' % m['op'])
3836 if actual_value is None:
3837 return is_incomplete(m['key']) or m['none_inclusive']
3838 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3839
3840 UNARY_OPERATORS = {
3841 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3842 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3843 }
3844 operator_rex = re.compile(r'''(?x)
3845 (?P<op>%s)\s*(?P<key>[a-z_]+)
3846 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3847 m = operator_rex.fullmatch(filter_part.strip())
3848 if m:
3849 op = UNARY_OPERATORS[m.group('op')]
3850 actual_value = dct.get(m.group('key'))
3851 if is_incomplete(m.group('key')) and actual_value is None:
3852 return True
3853 return op(actual_value)
3854
3855 raise ValueError('Invalid filter part %r' % filter_part)
3856
3857
3858 def match_str(filter_str, dct, incomplete=False):
3859 """ Filter a dictionary with a simple string syntax.
3860 @returns Whether the filter passes
3861 @param incomplete Set of keys that are expected to be missing from dct.
3862 Can be True/False to indicate all/none of the keys may be missing.
3863 All conditions on incomplete keys pass if the key is missing
3864 """
3865 return all(
3866 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3867 for filter_part in re.split(r'(?<!\\)&', filter_str))
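# e.g. match_str('like_count > 100 & dislike_count <? 50',
#                {'like_count': 190, 'dislike_count': 10}) is True
# (the '?' makes the second condition pass even when the field is missing)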
3868
3869
3870 def match_filter_func(filters):
3871 if not filters:
3872 return None
3873 filters = set(variadic(filters))
3874
3875 interactive = '-' in filters
3876 if interactive:
3877 filters.remove('-')
3878
3879 def _match_func(info_dict, incomplete=False):
3880 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3881 return NO_DEFAULT if interactive and not incomplete else None
3882 else:
3883 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3884 filter_str = ') | ('.join(map(str.strip, filters))
3885 return f'{video_title} does not pass filter ({filter_str}), skipping ...'
3886 return _match_func
3887
3888
3889 class download_range_func:
3890 def __init__(self, chapters, ranges):
3891 self.chapters, self.ranges = chapters, ranges
3892
3893 def __call__(self, info_dict, ydl):
3894 if not self.ranges and not self.chapters:
3895 yield {}
3896
3897 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3898 else 'Cannot match chapters since chapter information is unavailable')
3899 for regex in self.chapters or []:
3900 for i, chapter in enumerate(info_dict.get('chapters') or []):
3901 if re.search(regex, chapter['title']):
3902 warning = None
3903 yield {**chapter, 'index': i}
3904 if self.chapters and warning:
3905 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3906
3907 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3908
3909 def __eq__(self, other):
3910 return (isinstance(other, download_range_func)
3911 and self.chapters == other.chapters and self.ranges == other.ranges)
3912
3913 def __repr__(self):
3914 return f'{type(self).__name__}({self.chapters}, {self.ranges})'
3915
3916
3917 def parse_dfxp_time_expr(time_expr):
3918 if not time_expr:
3919 return
3920
3921 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3922 if mobj:
3923 return float(mobj.group('time_offset'))
3924
3925 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3926 if mobj:
3927 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
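# e.g. parse_dfxp_time_expr('00:01:30.5') == 90.5 and parse_dfxp_time_expr('4.2s') == 4.2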
3928
3929
3930 def srt_subtitles_timecode(seconds):
3931 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3932
3933
3934 def ass_subtitles_timecode(seconds):
3935 time = timetuple_from_msec(seconds * 1000)
3936 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
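# e.g. srt_subtitles_timecode(61.5) == '00:01:01,500'
# and ass_subtitles_timecode(61.5) == '0:01:01.50'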
3937
3938
3939 def dfxp2srt(dfxp_data):
3940 '''
3941 @param dfxp_data A bytes-like object containing DFXP data
3942 @returns A unicode object containing converted SRT data
3943 '''
3944 LEGACY_NAMESPACES = (
3945 (b'http://www.w3.org/ns/ttml', [
3946 b'http://www.w3.org/2004/11/ttaf1',
3947 b'http://www.w3.org/2006/04/ttaf1',
3948 b'http://www.w3.org/2006/10/ttaf1',
3949 ]),
3950 (b'http://www.w3.org/ns/ttml#styling', [
3951 b'http://www.w3.org/ns/ttml#style',
3952 ]),
3953 )
3954
3955 SUPPORTED_STYLING = [
3956 'color',
3957 'fontFamily',
3958 'fontSize',
3959 'fontStyle',
3960 'fontWeight',
3961 'textDecoration'
3962 ]
3963
3964 _x = functools.partial(xpath_with_ns, ns_map={
3965 'xml': 'http://www.w3.org/XML/1998/namespace',
3966 'ttml': 'http://www.w3.org/ns/ttml',
3967 'tts': 'http://www.w3.org/ns/ttml#styling',
3968 })
3969
3970 styles = {}
3971 default_style = {}
3972
3973 class TTMLPElementParser:
3974 _out = ''
3975 _unclosed_elements = []
3976 _applied_styles = []
3977
3978 def start(self, tag, attrib):
3979 if tag in (_x('ttml:br'), 'br'):
3980 self._out += '\n'
3981 else:
3982 unclosed_elements = []
3983 style = {}
3984 element_style_id = attrib.get('style')
3985 if default_style:
3986 style.update(default_style)
3987 if element_style_id:
3988 style.update(styles.get(element_style_id, {}))
3989 for prop in SUPPORTED_STYLING:
3990 prop_val = attrib.get(_x('tts:' + prop))
3991 if prop_val:
3992 style[prop] = prop_val
3993 if style:
3994 font = ''
3995 for k, v in sorted(style.items()):
3996 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3997 continue
3998 if k == 'color':
3999 font += ' color="%s"' % v
4000 elif k == 'fontSize':
4001 font += ' size="%s"' % v
4002 elif k == 'fontFamily':
4003 font += ' face="%s"' % v
4004 elif k == 'fontWeight' and v == 'bold':
4005 self._out += '<b>'
4006 unclosed_elements.append('b')
4007 elif k == 'fontStyle' and v == 'italic':
4008 self._out += '<i>'
4009 unclosed_elements.append('i')
4010 elif k == 'textDecoration' and v == 'underline':
4011 self._out += '<u>'
4012 unclosed_elements.append('u')
4013 if font:
4014 self._out += '<font' + font + '>'
4015 unclosed_elements.append('font')
4016 applied_style = {}
4017 if self._applied_styles:
4018 applied_style.update(self._applied_styles[-1])
4019 applied_style.update(style)
4020 self._applied_styles.append(applied_style)
4021 self._unclosed_elements.append(unclosed_elements)
4022
4023 def end(self, tag):
4024 if tag not in (_x('ttml:br'), 'br'):
4025 unclosed_elements = self._unclosed_elements.pop()
4026 for element in reversed(unclosed_elements):
4027 self._out += '</%s>' % element
4028 if unclosed_elements and self._applied_styles:
4029 self._applied_styles.pop()
4030
4031 def data(self, data):
4032 self._out += data
4033
4034 def close(self):
4035 return self._out.strip()
4036
4037 def parse_node(node):
4038 target = TTMLPElementParser()
4039 parser = xml.etree.ElementTree.XMLParser(target=target)
4040 parser.feed(xml.etree.ElementTree.tostring(node))
4041 return parser.close()
4042
4043 for k, v in LEGACY_NAMESPACES:
4044 for ns in v:
4045 dfxp_data = dfxp_data.replace(ns, k)
4046
4047 dfxp = compat_etree_fromstring(dfxp_data)
4048 out = []
4049 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
4050
4051 if not paras:
4052 raise ValueError('Invalid dfxp/TTML subtitle')
4053
4054 repeat = False
4055 while True:
4056 for style in dfxp.findall(_x('.//ttml:style')):
4057 style_id = style.get('id') or style.get(_x('xml:id'))
4058 if not style_id:
4059 continue
4060 parent_style_id = style.get('style')
4061 if parent_style_id:
4062 if parent_style_id not in styles:
4063 repeat = True
4064 continue
4065 styles[style_id] = styles[parent_style_id].copy()
4066 for prop in SUPPORTED_STYLING:
4067 prop_val = style.get(_x('tts:' + prop))
4068 if prop_val:
4069 styles.setdefault(style_id, {})[prop] = prop_val
4070 if repeat:
4071 repeat = False
4072 else:
4073 break
4074
4075 for p in ('body', 'div'):
4076 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4077 if ele is None:
4078 continue
4079 style = styles.get(ele.get('style'))
4080 if not style:
4081 continue
4082 default_style.update(style)
4083
4084 for para, index in zip(paras, itertools.count(1)):
4085 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
4086 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
4087 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4088 if begin_time is None:
4089 continue
4090 if not end_time:
4091 if not dur:
4092 continue
4093 end_time = begin_time + dur
4094 out.append('%d\n%s --> %s\n%s\n\n' % (
4095 index,
4096 srt_subtitles_timecode(begin_time),
4097 srt_subtitles_timecode(end_time),
4098 parse_node(para)))
4099
4100 return ''.join(out)
4101
4102
4103 def cli_option(params, command_option, param, separator=None):
4104 param = params.get(param)
4105 return ([] if param is None
4106 else [command_option, str(param)] if separator is None
4107 else [f'{command_option}{separator}{param}'])
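# e.g. cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy')
#   == ['--proxy', '127.0.0.1:3128']
# and with separator '=': ['--proxy=127.0.0.1:3128']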
4108
4109
4110 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4111 param = params.get(param)
4112 assert param in (True, False, None)
4113 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
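# Note how the {True: ..., False: ...} dict is passed as `params` so that
# cli_option() looks the boolean value up as if it were a key, e.g.
#   cli_bool_option({'check': True}, '--check', 'check') == ['--check', 'true']
# ('check' is just an illustrative parameter name)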
4114
4115
4116 def cli_valueless_option(params, command_option, param, expected_value=True):
4117 return [command_option] if params.get(param) == expected_value else []
4118
4119
4120 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
4121 if isinstance(argdict, (list, tuple)): # for backward compatibility
4122 if use_compat:
4123 return argdict
4124 else:
4125 argdict = None
4126 if argdict is None:
4127 return default
4128 assert isinstance(argdict, dict)
4129
4130 assert isinstance(keys, (list, tuple))
4131 for key_list in keys:
4132 arg_list = list(filter(
4133 lambda x: x is not None,
4134 [argdict.get(key.lower()) for key in variadic(key_list)]))
4135 if arg_list:
4136 return [arg for args in arg_list for arg in args]
4137 return default
4138
4139
4140 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4141 main_key, exe = main_key.lower(), exe.lower()
4142 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4143 keys = [f'{root_key}{k}' for k in (keys or [''])]
4144 if root_key in keys:
4145 if main_key != exe:
4146 keys.append((main_key, exe))
4147 keys.append('default')
4148 else:
4149 use_compat = False
4150 return cli_configuration_args(argdict, keys, default, use_compat)
4151
4152
4153 class ISO639Utils:
4154 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4155 _lang_map = {
4156 'aa': 'aar',
4157 'ab': 'abk',
4158 'ae': 'ave',
4159 'af': 'afr',
4160 'ak': 'aka',
4161 'am': 'amh',
4162 'an': 'arg',
4163 'ar': 'ara',
4164 'as': 'asm',
4165 'av': 'ava',
4166 'ay': 'aym',
4167 'az': 'aze',
4168 'ba': 'bak',
4169 'be': 'bel',
4170 'bg': 'bul',
4171 'bh': 'bih',
4172 'bi': 'bis',
4173 'bm': 'bam',
4174 'bn': 'ben',
4175 'bo': 'bod',
4176 'br': 'bre',
4177 'bs': 'bos',
4178 'ca': 'cat',
4179 'ce': 'che',
4180 'ch': 'cha',
4181 'co': 'cos',
4182 'cr': 'cre',
4183 'cs': 'ces',
4184 'cu': 'chu',
4185 'cv': 'chv',
4186 'cy': 'cym',
4187 'da': 'dan',
4188 'de': 'deu',
4189 'dv': 'div',
4190 'dz': 'dzo',
4191 'ee': 'ewe',
4192 'el': 'ell',
4193 'en': 'eng',
4194 'eo': 'epo',
4195 'es': 'spa',
4196 'et': 'est',
4197 'eu': 'eus',
4198 'fa': 'fas',
4199 'ff': 'ful',
4200 'fi': 'fin',
4201 'fj': 'fij',
4202 'fo': 'fao',
4203 'fr': 'fra',
4204 'fy': 'fry',
4205 'ga': 'gle',
4206 'gd': 'gla',
4207 'gl': 'glg',
4208 'gn': 'grn',
4209 'gu': 'guj',
4210 'gv': 'glv',
4211 'ha': 'hau',
4212 'he': 'heb',
4213 'iw': 'heb', # Replaced by he in 1989 revision
4214 'hi': 'hin',
4215 'ho': 'hmo',
4216 'hr': 'hrv',
4217 'ht': 'hat',
4218 'hu': 'hun',
4219 'hy': 'hye',
4220 'hz': 'her',
4221 'ia': 'ina',
4222 'id': 'ind',
4223 'in': 'ind', # Replaced by id in 1989 revision
4224 'ie': 'ile',
4225 'ig': 'ibo',
4226 'ii': 'iii',
4227 'ik': 'ipk',
4228 'io': 'ido',
4229 'is': 'isl',
4230 'it': 'ita',
4231 'iu': 'iku',
4232 'ja': 'jpn',
4233 'jv': 'jav',
4234 'ka': 'kat',
4235 'kg': 'kon',
4236 'ki': 'kik',
4237 'kj': 'kua',
4238 'kk': 'kaz',
4239 'kl': 'kal',
4240 'km': 'khm',
4241 'kn': 'kan',
4242 'ko': 'kor',
4243 'kr': 'kau',
4244 'ks': 'kas',
4245 'ku': 'kur',
4246 'kv': 'kom',
4247 'kw': 'cor',
4248 'ky': 'kir',
4249 'la': 'lat',
4250 'lb': 'ltz',
4251 'lg': 'lug',
4252 'li': 'lim',
4253 'ln': 'lin',
4254 'lo': 'lao',
4255 'lt': 'lit',
4256 'lu': 'lub',
4257 'lv': 'lav',
4258 'mg': 'mlg',
4259 'mh': 'mah',
4260 'mi': 'mri',
4261 'mk': 'mkd',
4262 'ml': 'mal',
4263 'mn': 'mon',
4264 'mr': 'mar',
4265 'ms': 'msa',
4266 'mt': 'mlt',
4267 'my': 'mya',
4268 'na': 'nau',
4269 'nb': 'nob',
4270 'nd': 'nde',
4271 'ne': 'nep',
4272 'ng': 'ndo',
4273 'nl': 'nld',
4274 'nn': 'nno',
4275 'no': 'nor',
4276 'nr': 'nbl',
4277 'nv': 'nav',
4278 'ny': 'nya',
4279 'oc': 'oci',
4280 'oj': 'oji',
4281 'om': 'orm',
4282 'or': 'ori',
4283 'os': 'oss',
4284 'pa': 'pan',
4285 'pi': 'pli',
4286 'pl': 'pol',
4287 'ps': 'pus',
4288 'pt': 'por',
4289 'qu': 'que',
4290 'rm': 'roh',
4291 'rn': 'run',
4292 'ro': 'ron',
4293 'ru': 'rus',
4294 'rw': 'kin',
4295 'sa': 'san',
4296 'sc': 'srd',
4297 'sd': 'snd',
4298 'se': 'sme',
4299 'sg': 'sag',
4300 'si': 'sin',
4301 'sk': 'slk',
4302 'sl': 'slv',
4303 'sm': 'smo',
4304 'sn': 'sna',
4305 'so': 'som',
4306 'sq': 'sqi',
4307 'sr': 'srp',
4308 'ss': 'ssw',
4309 'st': 'sot',
4310 'su': 'sun',
4311 'sv': 'swe',
4312 'sw': 'swa',
4313 'ta': 'tam',
4314 'te': 'tel',
4315 'tg': 'tgk',
4316 'th': 'tha',
4317 'ti': 'tir',
4318 'tk': 'tuk',
4319 'tl': 'tgl',
4320 'tn': 'tsn',
4321 'to': 'ton',
4322 'tr': 'tur',
4323 'ts': 'tso',
4324 'tt': 'tat',
4325 'tw': 'twi',
4326 'ty': 'tah',
4327 'ug': 'uig',
4328 'uk': 'ukr',
4329 'ur': 'urd',
4330 'uz': 'uzb',
4331 've': 'ven',
4332 'vi': 'vie',
4333 'vo': 'vol',
4334 'wa': 'wln',
4335 'wo': 'wol',
4336 'xh': 'xho',
4337 'yi': 'yid',
4338 'ji': 'yid', # Replaced by yi in 1989 revision
4339 'yo': 'yor',
4340 'za': 'zha',
4341 'zh': 'zho',
4342 'zu': 'zul',
4343 }
4344
4345 @classmethod
4346 def short2long(cls, code):
4347 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4348 return cls._lang_map.get(code[:2])
4349
4350 @classmethod
4351 def long2short(cls, code):
4352 """Convert language code from ISO 639-2/T to ISO 639-1"""
4353 for short_name, long_name in cls._lang_map.items():
4354 if long_name == code:
4355 return short_name
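# e.g. ISO639Utils.short2long('en') == 'eng' and ISO639Utils.long2short('fra') == 'fr'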
4356
4357
4358 class ISO3166Utils:
4359 # From http://data.okfn.org/data/core/country-list
4360 _country_map = {
4361 'AF': 'Afghanistan',
4362 'AX': 'Åland Islands',
4363 'AL': 'Albania',
4364 'DZ': 'Algeria',
4365 'AS': 'American Samoa',
4366 'AD': 'Andorra',
4367 'AO': 'Angola',
4368 'AI': 'Anguilla',
4369 'AQ': 'Antarctica',
4370 'AG': 'Antigua and Barbuda',
4371 'AR': 'Argentina',
4372 'AM': 'Armenia',
4373 'AW': 'Aruba',
4374 'AU': 'Australia',
4375 'AT': 'Austria',
4376 'AZ': 'Azerbaijan',
4377 'BS': 'Bahamas',
4378 'BH': 'Bahrain',
4379 'BD': 'Bangladesh',
4380 'BB': 'Barbados',
4381 'BY': 'Belarus',
4382 'BE': 'Belgium',
4383 'BZ': 'Belize',
4384 'BJ': 'Benin',
4385 'BM': 'Bermuda',
4386 'BT': 'Bhutan',
4387 'BO': 'Bolivia, Plurinational State of',
4388 'BQ': 'Bonaire, Sint Eustatius and Saba',
4389 'BA': 'Bosnia and Herzegovina',
4390 'BW': 'Botswana',
4391 'BV': 'Bouvet Island',
4392 'BR': 'Brazil',
4393 'IO': 'British Indian Ocean Territory',
4394 'BN': 'Brunei Darussalam',
4395 'BG': 'Bulgaria',
4396 'BF': 'Burkina Faso',
4397 'BI': 'Burundi',
4398 'KH': 'Cambodia',
4399 'CM': 'Cameroon',
4400 'CA': 'Canada',
4401 'CV': 'Cape Verde',
4402 'KY': 'Cayman Islands',
4403 'CF': 'Central African Republic',
4404 'TD': 'Chad',
4405 'CL': 'Chile',
4406 'CN': 'China',
4407 'CX': 'Christmas Island',
4408 'CC': 'Cocos (Keeling) Islands',
4409 'CO': 'Colombia',
4410 'KM': 'Comoros',
4411 'CG': 'Congo',
4412 'CD': 'Congo, the Democratic Republic of the',
4413 'CK': 'Cook Islands',
4414 'CR': 'Costa Rica',
4415 'CI': 'Côte d\'Ivoire',
4416 'HR': 'Croatia',
4417 'CU': 'Cuba',
4418 'CW': 'Curaçao',
4419 'CY': 'Cyprus',
4420 'CZ': 'Czech Republic',
4421 'DK': 'Denmark',
4422 'DJ': 'Djibouti',
4423 'DM': 'Dominica',
4424 'DO': 'Dominican Republic',
4425 'EC': 'Ecuador',
4426 'EG': 'Egypt',
4427 'SV': 'El Salvador',
4428 'GQ': 'Equatorial Guinea',
4429 'ER': 'Eritrea',
4430 'EE': 'Estonia',
4431 'ET': 'Ethiopia',
4432 'FK': 'Falkland Islands (Malvinas)',
4433 'FO': 'Faroe Islands',
4434 'FJ': 'Fiji',
4435 'FI': 'Finland',
4436 'FR': 'France',
4437 'GF': 'French Guiana',
4438 'PF': 'French Polynesia',
4439 'TF': 'French Southern Territories',
4440 'GA': 'Gabon',
4441 'GM': 'Gambia',
4442 'GE': 'Georgia',
4443 'DE': 'Germany',
4444 'GH': 'Ghana',
4445 'GI': 'Gibraltar',
4446 'GR': 'Greece',
4447 'GL': 'Greenland',
4448 'GD': 'Grenada',
4449 'GP': 'Guadeloupe',
4450 'GU': 'Guam',
4451 'GT': 'Guatemala',
4452 'GG': 'Guernsey',
4453 'GN': 'Guinea',
4454 'GW': 'Guinea-Bissau',
4455 'GY': 'Guyana',
4456 'HT': 'Haiti',
4457 'HM': 'Heard Island and McDonald Islands',
4458 'VA': 'Holy See (Vatican City State)',
4459 'HN': 'Honduras',
4460 'HK': 'Hong Kong',
4461 'HU': 'Hungary',
4462 'IS': 'Iceland',
4463 'IN': 'India',
4464 'ID': 'Indonesia',
4465 'IR': 'Iran, Islamic Republic of',
4466 'IQ': 'Iraq',
4467 'IE': 'Ireland',
4468 'IM': 'Isle of Man',
4469 'IL': 'Israel',
4470 'IT': 'Italy',
4471 'JM': 'Jamaica',
4472 'JP': 'Japan',
4473 'JE': 'Jersey',
4474 'JO': 'Jordan',
4475 'KZ': 'Kazakhstan',
4476 'KE': 'Kenya',
4477 'KI': 'Kiribati',
4478 'KP': 'Korea, Democratic People\'s Republic of',
4479 'KR': 'Korea, Republic of',
4480 'KW': 'Kuwait',
4481 'KG': 'Kyrgyzstan',
4482 'LA': 'Lao People\'s Democratic Republic',
4483 'LV': 'Latvia',
4484 'LB': 'Lebanon',
4485 'LS': 'Lesotho',
4486 'LR': 'Liberia',
4487 'LY': 'Libya',
4488 'LI': 'Liechtenstein',
4489 'LT': 'Lithuania',
4490 'LU': 'Luxembourg',
4491 'MO': 'Macao',
4492 'MK': 'Macedonia, the Former Yugoslav Republic of',
4493 'MG': 'Madagascar',
4494 'MW': 'Malawi',
4495 'MY': 'Malaysia',
4496 'MV': 'Maldives',
4497 'ML': 'Mali',
4498 'MT': 'Malta',
4499 'MH': 'Marshall Islands',
4500 'MQ': 'Martinique',
4501 'MR': 'Mauritania',
4502 'MU': 'Mauritius',
4503 'YT': 'Mayotte',
4504 'MX': 'Mexico',
4505 'FM': 'Micronesia, Federated States of',
4506 'MD': 'Moldova, Republic of',
4507 'MC': 'Monaco',
4508 'MN': 'Mongolia',
4509 'ME': 'Montenegro',
4510 'MS': 'Montserrat',
4511 'MA': 'Morocco',
4512 'MZ': 'Mozambique',
4513 'MM': 'Myanmar',
4514 'NA': 'Namibia',
4515 'NR': 'Nauru',
4516 'NP': 'Nepal',
4517 'NL': 'Netherlands',
4518 'NC': 'New Caledonia',
4519 'NZ': 'New Zealand',
4520 'NI': 'Nicaragua',
4521 'NE': 'Niger',
4522 'NG': 'Nigeria',
4523 'NU': 'Niue',
4524 'NF': 'Norfolk Island',
4525 'MP': 'Northern Mariana Islands',
4526 'NO': 'Norway',
4527 'OM': 'Oman',
4528 'PK': 'Pakistan',
4529 'PW': 'Palau',
4530 'PS': 'Palestine, State of',
4531 'PA': 'Panama',
4532 'PG': 'Papua New Guinea',
4533 'PY': 'Paraguay',
4534 'PE': 'Peru',
4535 'PH': 'Philippines',
4536 'PN': 'Pitcairn',
4537 'PL': 'Poland',
4538 'PT': 'Portugal',
4539 'PR': 'Puerto Rico',
4540 'QA': 'Qatar',
4541 'RE': 'Réunion',
4542 'RO': 'Romania',
4543 'RU': 'Russian Federation',
4544 'RW': 'Rwanda',
4545 'BL': 'Saint Barthélemy',
4546 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4547 'KN': 'Saint Kitts and Nevis',
4548 'LC': 'Saint Lucia',
4549 'MF': 'Saint Martin (French part)',
4550 'PM': 'Saint Pierre and Miquelon',
4551 'VC': 'Saint Vincent and the Grenadines',
4552 'WS': 'Samoa',
4553 'SM': 'San Marino',
4554 'ST': 'Sao Tome and Principe',
4555 'SA': 'Saudi Arabia',
4556 'SN': 'Senegal',
4557 'RS': 'Serbia',
4558 'SC': 'Seychelles',
4559 'SL': 'Sierra Leone',
4560 'SG': 'Singapore',
4561 'SX': 'Sint Maarten (Dutch part)',
4562 'SK': 'Slovakia',
4563 'SI': 'Slovenia',
4564 'SB': 'Solomon Islands',
4565 'SO': 'Somalia',
4566 'ZA': 'South Africa',
4567 'GS': 'South Georgia and the South Sandwich Islands',
4568 'SS': 'South Sudan',
4569 'ES': 'Spain',
4570 'LK': 'Sri Lanka',
4571 'SD': 'Sudan',
4572 'SR': 'Suriname',
4573 'SJ': 'Svalbard and Jan Mayen',
4574 'SZ': 'Swaziland',
4575 'SE': 'Sweden',
4576 'CH': 'Switzerland',
4577 'SY': 'Syrian Arab Republic',
4578 'TW': 'Taiwan, Province of China',
4579 'TJ': 'Tajikistan',
4580 'TZ': 'Tanzania, United Republic of',
4581 'TH': 'Thailand',
4582 'TL': 'Timor-Leste',
4583 'TG': 'Togo',
4584 'TK': 'Tokelau',
4585 'TO': 'Tonga',
4586 'TT': 'Trinidad and Tobago',
4587 'TN': 'Tunisia',
4588 'TR': 'Turkey',
4589 'TM': 'Turkmenistan',
4590 'TC': 'Turks and Caicos Islands',
4591 'TV': 'Tuvalu',
4592 'UG': 'Uganda',
4593 'UA': 'Ukraine',
4594 'AE': 'United Arab Emirates',
4595 'GB': 'United Kingdom',
4596 'US': 'United States',
4597 'UM': 'United States Minor Outlying Islands',
4598 'UY': 'Uruguay',
4599 'UZ': 'Uzbekistan',
4600 'VU': 'Vanuatu',
4601 'VE': 'Venezuela, Bolivarian Republic of',
4602 'VN': 'Viet Nam',
4603 'VG': 'Virgin Islands, British',
4604 'VI': 'Virgin Islands, U.S.',
4605 'WF': 'Wallis and Futuna',
4606 'EH': 'Western Sahara',
4607 'YE': 'Yemen',
4608 'ZM': 'Zambia',
4609 'ZW': 'Zimbabwe',
4610 # Not ISO 3166 codes, but used for IP blocks
4611 'AP': 'Asia/Pacific Region',
4612 'EU': 'Europe',
4613 }
4614
4615 @classmethod
4616 def short2full(cls, code):
4617 """Convert an ISO 3166-2 country code to the corresponding full name"""
4618 return cls._country_map.get(code.upper())
4619
4620
4621 class GeoUtils:
4622 # Major IPv4 address blocks per country
4623 _country_ip_map = {
4624 'AD': '46.172.224.0/19',
4625 'AE': '94.200.0.0/13',
4626 'AF': '149.54.0.0/17',
4627 'AG': '209.59.64.0/18',
4628 'AI': '204.14.248.0/21',
4629 'AL': '46.99.0.0/16',
4630 'AM': '46.70.0.0/15',
4631 'AO': '105.168.0.0/13',
4632 'AP': '182.50.184.0/21',
4633 'AQ': '23.154.160.0/24',
4634 'AR': '181.0.0.0/12',
4635 'AS': '202.70.112.0/20',
4636 'AT': '77.116.0.0/14',
4637 'AU': '1.128.0.0/11',
4638 'AW': '181.41.0.0/18',
4639 'AX': '185.217.4.0/22',
4640 'AZ': '5.197.0.0/16',
4641 'BA': '31.176.128.0/17',
4642 'BB': '65.48.128.0/17',
4643 'BD': '114.130.0.0/16',
4644 'BE': '57.0.0.0/8',
4645 'BF': '102.178.0.0/15',
4646 'BG': '95.42.0.0/15',
4647 'BH': '37.131.0.0/17',
4648 'BI': '154.117.192.0/18',
4649 'BJ': '137.255.0.0/16',
4650 'BL': '185.212.72.0/23',
4651 'BM': '196.12.64.0/18',
4652 'BN': '156.31.0.0/16',
4653 'BO': '161.56.0.0/16',
4654 'BQ': '161.0.80.0/20',
4655 'BR': '191.128.0.0/12',
4656 'BS': '24.51.64.0/18',
4657 'BT': '119.2.96.0/19',
4658 'BW': '168.167.0.0/16',
4659 'BY': '178.120.0.0/13',
4660 'BZ': '179.42.192.0/18',
4661 'CA': '99.224.0.0/11',
4662 'CD': '41.243.0.0/16',
4663 'CF': '197.242.176.0/21',
4664 'CG': '160.113.0.0/16',
4665 'CH': '85.0.0.0/13',
4666 'CI': '102.136.0.0/14',
4667 'CK': '202.65.32.0/19',
4668 'CL': '152.172.0.0/14',
4669 'CM': '102.244.0.0/14',
4670 'CN': '36.128.0.0/10',
4671 'CO': '181.240.0.0/12',
4672 'CR': '201.192.0.0/12',
4673 'CU': '152.206.0.0/15',
4674 'CV': '165.90.96.0/19',
4675 'CW': '190.88.128.0/17',
4676 'CY': '31.153.0.0/16',
4677 'CZ': '88.100.0.0/14',
4678 'DE': '53.0.0.0/8',
4679 'DJ': '197.241.0.0/17',
4680 'DK': '87.48.0.0/12',
4681 'DM': '192.243.48.0/20',
4682 'DO': '152.166.0.0/15',
4683 'DZ': '41.96.0.0/12',
4684 'EC': '186.68.0.0/15',
4685 'EE': '90.190.0.0/15',
4686 'EG': '156.160.0.0/11',
4687 'ER': '196.200.96.0/20',
4688 'ES': '88.0.0.0/11',
4689 'ET': '196.188.0.0/14',
4690 'EU': '2.16.0.0/13',
4691 'FI': '91.152.0.0/13',
4692 'FJ': '144.120.0.0/16',
4693 'FK': '80.73.208.0/21',
4694 'FM': '119.252.112.0/20',
4695 'FO': '88.85.32.0/19',
4696 'FR': '90.0.0.0/9',
4697 'GA': '41.158.0.0/15',
4698 'GB': '25.0.0.0/8',
4699 'GD': '74.122.88.0/21',
4700 'GE': '31.146.0.0/16',
4701 'GF': '161.22.64.0/18',
4702 'GG': '62.68.160.0/19',
4703 'GH': '154.160.0.0/12',
4704 'GI': '95.164.0.0/16',
4705 'GL': '88.83.0.0/19',
4706 'GM': '160.182.0.0/15',
4707 'GN': '197.149.192.0/18',
4708 'GP': '104.250.0.0/19',
4709 'GQ': '105.235.224.0/20',
4710 'GR': '94.64.0.0/13',
4711 'GT': '168.234.0.0/16',
4712 'GU': '168.123.0.0/16',
4713 'GW': '197.214.80.0/20',
4714 'GY': '181.41.64.0/18',
4715 'HK': '113.252.0.0/14',
4716 'HN': '181.210.0.0/16',
4717 'HR': '93.136.0.0/13',
4718 'HT': '148.102.128.0/17',
4719 'HU': '84.0.0.0/14',
4720 'ID': '39.192.0.0/10',
4721 'IE': '87.32.0.0/12',
4722 'IL': '79.176.0.0/13',
4723 'IM': '5.62.80.0/20',
4724 'IN': '117.192.0.0/10',
4725 'IO': '203.83.48.0/21',
4726 'IQ': '37.236.0.0/14',
4727 'IR': '2.176.0.0/12',
4728 'IS': '82.221.0.0/16',
4729 'IT': '79.0.0.0/10',
4730 'JE': '87.244.64.0/18',
4731 'JM': '72.27.0.0/17',
4732 'JO': '176.29.0.0/16',
4733 'JP': '133.0.0.0/8',
4734 'KE': '105.48.0.0/12',
4735 'KG': '158.181.128.0/17',
4736 'KH': '36.37.128.0/17',
4737 'KI': '103.25.140.0/22',
4738 'KM': '197.255.224.0/20',
4739 'KN': '198.167.192.0/19',
4740 'KP': '175.45.176.0/22',
4741 'KR': '175.192.0.0/10',
4742 'KW': '37.36.0.0/14',
4743 'KY': '64.96.0.0/15',
4744 'KZ': '2.72.0.0/13',
4745 'LA': '115.84.64.0/18',
4746 'LB': '178.135.0.0/16',
4747 'LC': '24.92.144.0/20',
4748 'LI': '82.117.0.0/19',
4749 'LK': '112.134.0.0/15',
4750 'LR': '102.183.0.0/16',
4751 'LS': '129.232.0.0/17',
4752 'LT': '78.56.0.0/13',
4753 'LU': '188.42.0.0/16',
4754 'LV': '46.109.0.0/16',
4755 'LY': '41.252.0.0/14',
4756 'MA': '105.128.0.0/11',
4757 'MC': '88.209.64.0/18',
4758 'MD': '37.246.0.0/16',
4759 'ME': '178.175.0.0/17',
4760 'MF': '74.112.232.0/21',
4761 'MG': '154.126.0.0/17',
4762 'MH': '117.103.88.0/21',
4763 'MK': '77.28.0.0/15',
4764 'ML': '154.118.128.0/18',
4765 'MM': '37.111.0.0/17',
4766 'MN': '49.0.128.0/17',
4767 'MO': '60.246.0.0/16',
4768 'MP': '202.88.64.0/20',
4769 'MQ': '109.203.224.0/19',
4770 'MR': '41.188.64.0/18',
4771 'MS': '208.90.112.0/22',
4772 'MT': '46.11.0.0/16',
4773 'MU': '105.16.0.0/12',
4774 'MV': '27.114.128.0/18',
4775 'MW': '102.70.0.0/15',
4776 'MX': '187.192.0.0/11',
4777 'MY': '175.136.0.0/13',
4778 'MZ': '197.218.0.0/15',
4779 'NA': '41.182.0.0/16',
4780 'NC': '101.101.0.0/18',
4781 'NE': '197.214.0.0/18',
4782 'NF': '203.17.240.0/22',
4783 'NG': '105.112.0.0/12',
4784 'NI': '186.76.0.0/15',
4785 'NL': '145.96.0.0/11',
4786 'NO': '84.208.0.0/13',
4787 'NP': '36.252.0.0/15',
4788 'NR': '203.98.224.0/19',
4789 'NU': '49.156.48.0/22',
4790 'NZ': '49.224.0.0/14',
4791 'OM': '5.36.0.0/15',
4792 'PA': '186.72.0.0/15',
4793 'PE': '186.160.0.0/14',
4794 'PF': '123.50.64.0/18',
4795 'PG': '124.240.192.0/19',
4796 'PH': '49.144.0.0/13',
4797 'PK': '39.32.0.0/11',
4798 'PL': '83.0.0.0/11',
4799 'PM': '70.36.0.0/20',
4800 'PR': '66.50.0.0/16',
4801 'PS': '188.161.0.0/16',
4802 'PT': '85.240.0.0/13',
4803 'PW': '202.124.224.0/20',
4804 'PY': '181.120.0.0/14',
4805 'QA': '37.210.0.0/15',
4806 'RE': '102.35.0.0/16',
4807 'RO': '79.112.0.0/13',
4808 'RS': '93.86.0.0/15',
4809 'RU': '5.136.0.0/13',
4810 'RW': '41.186.0.0/16',
4811 'SA': '188.48.0.0/13',
4812 'SB': '202.1.160.0/19',
4813 'SC': '154.192.0.0/11',
4814 'SD': '102.120.0.0/13',
4815 'SE': '78.64.0.0/12',
4816 'SG': '8.128.0.0/10',
4817 'SI': '188.196.0.0/14',
4818 'SK': '78.98.0.0/15',
4819 'SL': '102.143.0.0/17',
4820 'SM': '89.186.32.0/19',
4821 'SN': '41.82.0.0/15',
4822 'SO': '154.115.192.0/18',
4823 'SR': '186.179.128.0/17',
4824 'SS': '105.235.208.0/21',
4825 'ST': '197.159.160.0/19',
4826 'SV': '168.243.0.0/16',
4827 'SX': '190.102.0.0/20',
4828 'SY': '5.0.0.0/16',
4829 'SZ': '41.84.224.0/19',
4830 'TC': '65.255.48.0/20',
4831 'TD': '154.68.128.0/19',
4832 'TG': '196.168.0.0/14',
4833 'TH': '171.96.0.0/13',
4834 'TJ': '85.9.128.0/18',
4835 'TK': '27.96.24.0/21',
4836 'TL': '180.189.160.0/20',
4837 'TM': '95.85.96.0/19',
4838 'TN': '197.0.0.0/11',
4839 'TO': '175.176.144.0/21',
4840 'TR': '78.160.0.0/11',
4841 'TT': '186.44.0.0/15',
4842 'TV': '202.2.96.0/19',
4843 'TW': '120.96.0.0/11',
4844 'TZ': '156.156.0.0/14',
4845 'UA': '37.52.0.0/14',
4846 'UG': '102.80.0.0/13',
4847 'US': '6.0.0.0/8',
4848 'UY': '167.56.0.0/13',
4849 'UZ': '84.54.64.0/18',
4850 'VA': '212.77.0.0/19',
4851 'VC': '207.191.240.0/21',
4852 'VE': '186.88.0.0/13',
4853 'VG': '66.81.192.0/20',
4854 'VI': '146.226.0.0/16',
4855 'VN': '14.160.0.0/11',
4856 'VU': '202.80.32.0/20',
4857 'WF': '117.20.32.0/21',
4858 'WS': '202.4.32.0/19',
4859 'YE': '134.35.0.0/16',
4860 'YT': '41.242.116.0/22',
4861 'ZA': '41.0.0.0/11',
4862 'ZM': '102.144.0.0/13',
4863 'ZW': '102.177.192.0/18',
4864 }
4865
4866 @classmethod
4867 def random_ipv4(cls, code_or_block):
4868 if len(code_or_block) == 2:
4869 block = cls._country_ip_map.get(code_or_block.upper())
4870 if not block:
4871 return None
4872 else:
4873 block = code_or_block
4874 addr, preflen = block.split('/')
4875 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4876 addr_max = addr_min | (0xffffffff >> int(preflen))
4877 return str(socket.inet_ntoa(
4878 struct.pack('!L', random.randint(addr_min, addr_max))))
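# e.g. GeoUtils.random_ipv4('DE') yields an address inside 53.0.0.0/8 (the
# 'DE' block above); a CIDR block such as '192.168.0.0/16' can also be
# passed directly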
4879
4880
4881 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4882 def __init__(self, proxies=None):
4883 # Set default handlers
4884 for type in ('http', 'https'):
4885 setattr(self, '%s_open' % type,
4886 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4887 meth(r, proxy, type))
4888 urllib.request.ProxyHandler.__init__(self, proxies)
4889
4890 def proxy_open(self, req, proxy, type):
4891 req_proxy = req.headers.get('Ytdl-request-proxy')
4892 if req_proxy is not None:
4893 proxy = req_proxy
4894 del req.headers['Ytdl-request-proxy']
4895
4896 if proxy == '__noproxy__':
4897 return None # No Proxy
4898 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4899 req.add_header('Ytdl-socks-proxy', proxy)
4900 # yt-dlp's http/https handlers take care of wrapping the socket with SOCKS
4901 return None
4902 return urllib.request.ProxyHandler.proxy_open(
4903 self, req, proxy, type)
4904
4905
4906 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4907 # released into Public Domain
4908 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4909
4910 def long_to_bytes(n, blocksize=0):
4911 """long_to_bytes(n:long, blocksize:int) : string
4912 Convert a long integer to a byte string.
4913
4914 If optional blocksize is given and greater than zero, pad the front of the
4915 byte string with binary zeros so that the length is a multiple of
4916 blocksize.
4917 """
4918 # after much testing, this algorithm was deemed to be the fastest
4919 s = b''
4920 n = int(n)
4921 while n > 0:
4922 s = struct.pack('>I', n & 0xffffffff) + s
4923 n = n >> 32
4924 # strip off leading zeros
4925 for i in range(len(s)):
4926 if s[i] != b'\000'[0]:
4927 break
4928 else:
4929 # only happens when n == 0
4930 s = b'\000'
4931 i = 0
4932 s = s[i:]
4933 # add back some pad bytes. this could be done more efficiently w.r.t. the
4934 # de-padding being done above, but sigh...
4935 if blocksize > 0 and len(s) % blocksize:
4936 s = (blocksize - len(s) % blocksize) * b'\000' + s
4937 return s
4938
4939
4940 def bytes_to_long(s):
4941 """bytes_to_long(string) : long
4942 Convert a byte string to a long integer.
4943
4944 This is (essentially) the inverse of long_to_bytes().
4945 """
4946 acc = 0
4947 length = len(s)
4948 if length % 4:
4949 extra = (4 - length % 4)
4950 s = b'\000' * extra + s
4951 length = length + extra
4952 for i in range(0, length, 4):
4953 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4954 return acc
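# e.g. long_to_bytes(65537) == b'\x01\x00\x01' and
# bytes_to_long(b'\x01\x00\x01') == 65537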
4955
4956
4957 def ohdave_rsa_encrypt(data, exponent, modulus):
4958 '''
4959 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4960
4961 Input:
4962 data: data to encrypt, bytes-like object
4963 exponent, modulus: parameter e and N of RSA algorithm, both integer
4964 Output: hex string of encrypted data
4965
4966 Limitation: supports one block encryption only
4967 '''
4968
4969 payload = int(binascii.hexlify(data[::-1]), 16)
4970 encrypted = pow(payload, exponent, modulus)
4971 return '%x' % encrypted
4972
4973
4974 def pkcs1pad(data, length):
4975 """
4976 Padding input data with PKCS#1 scheme
4977
4978 @param {int[]} data input data
4979 @param {int} length target length
4980 @returns {int[]} padded data
4981 """
4982 if len(data) > length - 11:
4983 raise ValueError('Input data too long for PKCS#1 padding')
4984
4985 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # PKCS#1 requires nonzero padding octets
4986 return [0, 2] + pseudo_random + [0] + data
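# e.g. pkcs1pad([0x41, 0x42], 16) yields a 16-int list of the form
#   [0, 2, <11 random nonzero ints>, 0, 0x41, 0x42]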
4987
4988
4989 def _base_n_table(n, table):
4990 if not table and not n:
4991 raise ValueError('Either table or n must be specified')
4992 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4993
4994 if n and n != len(table):
4995 raise ValueError(f'base {n} exceeds table length {len(table)}')
4996 return table
4997
4998
4999 def encode_base_n(num, n=None, table=None):
5000 """Convert given int to a base-n string"""
5001 table = _base_n_table(n, table)
5002 if not num:
5003 return table[0]
5004
5005 result, base = '', len(table)
5006 while num:
5007 result = table[num % base] + result
5008 num = num // base
5009 return result
5010
5011
5012 def decode_base_n(string, n=None, table=None):
5013 """Convert given base-n string to int"""
5014 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
5015 result, base = 0, len(table)
5016 for char in string:
5017 result = result * base + table[char]
5018 return result
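# e.g. encode_base_n(255, 16) == 'ff' and decode_base_n('ff', 16) == 255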
5019
5020
5021 def decode_base(value, digits):
5022 deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed '
5023 f'in a future version. Use {__name__}.decode_base_n instead')
5024 return decode_base_n(value, table=digits)
5025
5026
5027 def decode_packed_codes(code):
5028 mobj = re.search(PACKED_CODES_RE, code)
5029 obfuscated_code, base, count, symbols = mobj.groups()
5030 base = int(base)
5031 count = int(count)
5032 symbols = symbols.split('|')
5033 symbol_table = {}
5034
5035 while count:
5036 count -= 1
5037 base_n_count = encode_base_n(count, base)
5038 symbol_table[base_n_count] = symbols[count] or base_n_count
5039
5040 return re.sub(
5041 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
5042 obfuscated_code)
5043
5044
5045 def caesar(s, alphabet, shift):
5046 if shift == 0:
5047 return s
5048 l = len(alphabet)
5049 return ''.join(
5050 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
5051 for c in s)
5052
5053
5054 def rot47(s):
5055 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
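# e.g. caesar('abc', 'abcdefghijklmnopqrstuvwxyz', 1) == 'bcd'; rot47 shifts
# by 47 over the 94 printable ASCII characters, so rot47(rot47(s)) == s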
5056
5057
5058 def parse_m3u8_attributes(attrib):
5059 info = {}
5060 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
5061 if val.startswith('"'):
5062 val = val[1:-1]
5063 info[key] = val
5064 return info
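# e.g. parse_m3u8_attributes('BANDWIDTH=2500000,CODECS="avc1.64001f,mp4a.40.2"')
#   == {'BANDWIDTH': '2500000', 'CODECS': 'avc1.64001f,mp4a.40.2'}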
5065
5066
5067 def urshift(val, n):
5068 return val >> n if val >= 0 else (val + 0x100000000) >> n
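# 32-bit logical right shift, e.g. urshift(-16, 4) == 0x0fffffff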
5069
5070
5071 # Based on png2str() written by @gdkchan and improved by @yokrysty
5072 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
5073 def decode_png(png_data):
5074 # Reference: https://www.w3.org/TR/PNG/
5075 header = png_data[8:]
5076
5077 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
5078 raise OSError('Not a valid PNG file.')
5079
5080 int_map = {1: '>B', 2: '>H', 4: '>I'}
5081 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
5082
5083 chunks = []
5084
5085 while header:
5086 length = unpack_integer(header[:4])
5087 header = header[4:]
5088
5089 chunk_type = header[:4]
5090 header = header[4:]
5091
5092 chunk_data = header[:length]
5093 header = header[length:]
5094
5095 header = header[4:] # Skip CRC
5096
5097 chunks.append({
5098 'type': chunk_type,
5099 'length': length,
5100 'data': chunk_data
5101 })
5102
5103 ihdr = chunks[0]['data']
5104
5105 width = unpack_integer(ihdr[:4])
5106 height = unpack_integer(ihdr[4:8])
5107
5108 idat = b''
5109
5110 for chunk in chunks:
5111 if chunk['type'] == b'IDAT':
5112 idat += chunk['data']
5113
5114 if not idat:
5115 raise OSError('Unable to read PNG data.')
5116
5117 decompressed_data = bytearray(zlib.decompress(idat))
5118
5119 stride = width * 3
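# Note: the decoder assumes 8-bit RGB (three bytes per pixel, no alpha);
# bit depth and colour type from the IHDR chunk are not consulted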
5120 pixels = []
5121
5122 def _get_pixel(idx):
5123 x = idx % stride
5124 y = idx // stride
5125 return pixels[y][x]
5126
5127 for y in range(height):
5128 basePos = y * (1 + stride)
5129 filter_type = decompressed_data[basePos]
5130
5131 current_row = []
5132
5133 pixels.append(current_row)
5134
5135 for x in range(stride):
5136 color = decompressed_data[1 + basePos + x]
5137 basex = y * stride + x
5138 left = 0
5139 up = 0
5140
5141 if x > 2:
5142 left = _get_pixel(basex - 3)
5143 if y > 0:
5144 up = _get_pixel(basex - stride)
5145
5146 if filter_type == 1: # Sub
5147 color = (color + left) & 0xff
5148 elif filter_type == 2: # Up
5149 color = (color + up) & 0xff
5150 elif filter_type == 3: # Average
5151 color = (color + ((left + up) >> 1)) & 0xff
5152 elif filter_type == 4: # Paeth
5153 a = left
5154 b = up
5155 c = 0
5156
5157 if x > 2 and y > 0:
5158 c = _get_pixel(basex - stride - 3)
5159
5160 p = a + b - c
5161
5162 pa = abs(p - a)
5163 pb = abs(p - b)
5164 pc = abs(p - c)
5165
5166 if pa <= pb and pa <= pc:
5167 color = (color + a) & 0xff
5168 elif pb <= pc:
5169 color = (color + b) & 0xff
5170 else:
5171 color = (color + c) & 0xff
5172
5173 current_row.append(color)
5174
5175 return width, height, pixels
5176
5177
5178 def write_xattr(path, key, value):
5179 # Windows: Write xattrs to NTFS Alternate Data Streams:
5180 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5181 if compat_os_name == 'nt':
5182 assert ':' not in key
5183 assert os.path.exists(path)
5184
5185 try:
5186 with open(f'{path}:{key}', 'wb') as f:
5187 f.write(value)
5188 except OSError as e:
5189 raise XAttrMetadataError(e.errno, e.strerror)
5190 return
5191
5192 # UNIX Method 1. Use xattrs/pyxattrs modules
5193
5194 setxattr = None
5195 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5196 # Unicode arguments are not supported in pyxattr until version 0.5.0
5197 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5198 if version_tuple(xattr.__version__) >= (0, 5, 0):
5199 setxattr = xattr.set
5200 elif xattr:
5201 setxattr = xattr.setxattr
5202
5203 if setxattr:
5204 try:
5205 setxattr(path, key, value)
5206 except OSError as e:
5207 raise XAttrMetadataError(e.errno, e.strerror)
5208 return
5209
5210 # UNIX Method 2. Use setfattr/xattr executables
5211 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5212 else 'xattr' if check_executable('xattr', ['-h']) else None)
5213 if not exe:
5214 raise XAttrUnavailableError(
5215 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5216 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
5217
5218 value = value.decode()
5219 try:
5220 _, stderr, returncode = Popen.run(
5221 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
5222 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
5223 except OSError as e:
5224 raise XAttrMetadataError(e.errno, e.strerror)
5225 if returncode:
5226 raise XAttrMetadataError(returncode, stderr)
5227
5228
5229 def random_birthday(year_field, month_field, day_field):
5230 start_date = datetime.date(1950, 1, 1)
5231 end_date = datetime.date(1995, 12, 31)
5232 offset = random.randint(0, (end_date - start_date).days)
5233 random_date = start_date + datetime.timedelta(offset)
5234 return {
5235 year_field: str(random_date.year),
5236 month_field: str(random_date.month),
5237 day_field: str(random_date.day),
5238 }
5239
5240
5241 # Templates for internet shortcut files, which are plain text files.
5242 DOT_URL_LINK_TEMPLATE = '''\
5243 [InternetShortcut]
5244 URL=%(url)s
5245 '''
5246
5247 DOT_WEBLOC_LINK_TEMPLATE = '''\
5248 <?xml version="1.0" encoding="UTF-8"?>
5249 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5250 <plist version="1.0">
5251 <dict>
5252 \t<key>URL</key>
5253 \t<string>%(url)s</string>
5254 </dict>
5255 </plist>
5256 '''
5257
5258 DOT_DESKTOP_LINK_TEMPLATE = '''\
5259 [Desktop Entry]
5260 Encoding=UTF-8
5261 Name=%(filename)s
5262 Type=Link
5263 URL=%(url)s
5264 Icon=text-html
5265 '''
5266
5267 LINK_TEMPLATES = {
5268 'url': DOT_URL_LINK_TEMPLATE,
5269 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5270 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5271 }
5272
5273
5274 def iri_to_uri(iri):
5275 """
5276 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5277
5278 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5279 """
5280
5281 iri_parts = urllib.parse.urlparse(iri)
5282
5283 if '[' in iri_parts.netloc:
5284 raise ValueError('IPv6 URIs are not yet supported.')
5285 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5286
5287 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5288
5289 net_location = ''
5290 if iri_parts.username:
5291 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5292 if iri_parts.password is not None:
5293 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5294 net_location += '@'
5295
5296 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5297 # The 'idna' encoding produces ASCII text.
5298 if iri_parts.port is not None and iri_parts.port != 80:
5299 net_location += ':' + str(iri_parts.port)
5300
5301 return urllib.parse.urlunparse(
5302 (iri_parts.scheme,
5303 net_location,
5304
5305 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5306
5307 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5308 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5309
5310 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5311 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5312
5313 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5314
5315 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
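# e.g. iri_to_uri('https://bücher.de/über') == 'https://xn--bcher-kva.de/%C3%BCber'
# (hostname via IDNA/Punycode, path percent-encoded as UTF-8)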
5316
5317
5318 def to_high_limit_path(path):
5319 if sys.platform in ['win32', 'cygwin']:
5320 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5321 return '\\\\?\\' + os.path.abspath(path)
5322
5323 return path
5324
5325
5326 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5327 val = traverse_obj(obj, *variadic(field))
5328 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5329 return default
5330 return template % func(val)
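# e.g. format_field({'width': 1920}, 'width', '%dpx') == '1920px' and
# format_field({}, 'width', '%dpx', default='unknown') == 'unknown'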
5331
5332
5333 def clean_podcast_url(url):
5334 return re.sub(r'''(?x)
5335 (?:
5336 (?:
5337 chtbl\.com/track|
5338 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5339 play\.podtrac\.com
5340 )/[^/]+|
5341 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5342 flex\.acast\.com|
5343 pd(?:
5344 cn\.co| # https://podcorn.com/analytics-prefix/
5345 st\.fm # https://podsights.com/docs/
5346 )/e
5347 )/''', '', url)
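# e.g. a hypothetical tracker-prefixed URL
#   clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/feeds.example.com/ep.mp3')
# reduces to 'https://feeds.example.com/ep.mp3'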
5348
5349
5350 _HEX_TABLE = '0123456789abcdef'
5351
5352
5353 def random_uuidv4():
5354 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5355
5356
5357 def make_dir(path, to_screen=None):
5358 try:
5359 dn = os.path.dirname(path)
5360 if dn and not os.path.exists(dn):
5361 os.makedirs(dn)
5362 return True
5363 except OSError as err:
5364 if callable(to_screen):
5365 to_screen('unable to create directory ' + error_to_compat_str(err))
5366 return False
5367
5368
5369 def get_executable_path():
5370 from .update import _get_variant_and_executable_path
5371
5372 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5373
5374
5375 def load_plugins(name, suffix, namespace):
5376 classes = {}
5377 with contextlib.suppress(FileNotFoundError):
5378 plugins_spec = importlib.util.spec_from_file_location(
5379 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5380 plugins = importlib.util.module_from_spec(plugins_spec)
5381 sys.modules[plugins_spec.name] = plugins
5382 plugins_spec.loader.exec_module(plugins)
5383 for name in dir(plugins):
5384 if name in namespace:
5385 continue
5386 if not name.endswith(suffix):
5387 continue
5388 klass = getattr(plugins, name)
5389 classes[name] = namespace[name] = klass
5390 return classes
5391
5392
5393 def traverse_obj(
5394 obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
5395 casesense=True, is_user_input=False, traverse_string=False):
5396 """
5397 Safely traverse nested `dict`s and `Sequence`s
5398
5399 >>> obj = [{}, {"key": "value"}]
5400 >>> traverse_obj(obj, (1, "key"))
5401 "value"
5402
5403 Each of the provided `paths` is tested and the first producing a valid result will be returned.
5404 The next path will also be tested if the path branched but no results could be found.
5405 Supported values for traversal are `Mapping`, `Sequence` and `re.Match`.
5406 A value of None is treated as the absence of a value.
5407
5408 The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.
5409
5410 The keys in the path can be one of:
5411 - `None`: Return the current object.
5412 - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`.
5413 - `slice`: Branch out and return all values in `obj[key]`.
5414 - `Ellipsis`: Branch out and return a list of all values.
5415 - `tuple`/`list`: Branch out and return a list of all matching values.
5416 Read as: `[traverse_obj(obj, branch) for branch in branches]`.
5417 - `function`: Branch out and return values filtered by the function.
5418 Read as: `[value for key, value in obj if function(key, value)]`.
5419 For `Sequence`s, `key` is the index of the value.
5420 - `dict`: Transform the current object and return a matching dict.
5421 Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.
5422
5423 `tuple`, `list`, and `dict` all support nested paths and branches.
5424
5425 @param paths Paths by which to traverse.
5426 @param default Value to return if the paths do not match.
5427 @param expected_type If a `type`, only accept final values of this type.
5428 If any other callable, try to call the function on each result.
5429 @param get_all If `False`, return the first matching result, otherwise all matching ones.
5430 @param casesense If `False`, consider string dictionary keys as case insensitive.
5431
5432 The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API
5433
5434 @param is_user_input Whether the keys are generated from user input.
5435 If `True` strings get converted to `int`/`slice` if needed.
5436 @param traverse_string Whether to traverse into objects as strings.
5437 If `True`, any non-compatible object will first be
5438 converted into a string and then traversed into.
5439
5440
5441 @returns The result of the object traversal.
5442 If successful, `get_all=True`, and the path branches at least once,
5443 then a list of results is returned instead.
5444 A list is always returned if the last path branches and no `default` is given.
5445 """
5446 is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes))
5447 casefold = lambda k: k.casefold() if isinstance(k, str) else k
5448
5449 if isinstance(expected_type, type):
5450 type_test = lambda val: val if isinstance(val, expected_type) else None
5451 else:
5452 type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,))
5453
5454 def apply_key(key, obj):
5455 if obj is None:
5456 return
5457
5458 elif key is None:
5459 yield obj
5460
5461 elif isinstance(key, (list, tuple)):
5462 for branch in key:
5463 _, result = apply_path(obj, branch)
5464 yield from result
5465
5466 elif key is ...:
5467 if isinstance(obj, collections.abc.Mapping):
5468 yield from obj.values()
5469 elif is_sequence(obj):
5470 yield from obj
5471 elif isinstance(obj, re.Match):
5472 yield from obj.groups()
5473 elif traverse_string:
5474 yield from str(obj)
5475
5476 elif callable(key):
5477 if is_sequence(obj):
5478 iter_obj = enumerate(obj)
5479 elif isinstance(obj, collections.abc.Mapping):
5480 iter_obj = obj.items()
5481 elif isinstance(obj, re.Match):
5482 iter_obj = enumerate((obj.group(), *obj.groups()))
5483 elif traverse_string:
5484 iter_obj = enumerate(str(obj))
5485 else:
5486 return
5487 yield from (v for k, v in iter_obj if try_call(key, args=(k, v)))
5488
5489 elif isinstance(key, dict):
5490 iter_obj = ((k, _traverse_obj(obj, v)) for k, v in key.items())
5491 yield {k: v if v is not None else default for k, v in iter_obj
5492 if v is not None or default is not NO_DEFAULT}
5493
5494 elif isinstance(obj, collections.abc.Mapping):
5495 yield (obj.get(key) if casesense or (key in obj)
5496 else next((v for k, v in obj.items() if casefold(k) == key), None))
5497
5498 elif isinstance(obj, re.Match):
5499 if isinstance(key, int) or casesense:
5500 with contextlib.suppress(IndexError):
5501 yield obj.group(key)
5502 return
5503
5504 if not isinstance(key, str):
5505 return
5506
5507 yield next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)
5508
5509 else:
5510 if is_user_input:
5511 key = (int_or_none(key) if ':' not in key
5512 else slice(*map(int_or_none, key.split(':'))))
5513
5514 if not isinstance(key, (int, slice)):
5515 return
5516
5517 if not is_sequence(obj):
5518 if not traverse_string:
5519 return
5520 obj = str(obj)
5521
5522 with contextlib.suppress(IndexError):
5523 yield obj[key]
5524
5525 def apply_path(start_obj, path):
5526 objs = (start_obj,)
5527 has_branched = False
5528
5529 for key in variadic(path):
5530 if is_user_input and key == ':':
5531 key = ...
5532
5533 if not casesense and isinstance(key, str):
5534 key = key.casefold()
5535
5536 if key is ... or isinstance(key, (list, tuple)) or callable(key):
5537 has_branched = True
5538
5539 key_func = functools.partial(apply_key, key)
5540 objs = itertools.chain.from_iterable(map(key_func, objs))
5541
5542 return has_branched, objs
5543
5544 def _traverse_obj(obj, path, use_list=True):
5545 has_branched, results = apply_path(obj, path)
5546 results = LazyList(x for x in map(type_test, results) if x is not None)
5547
5548 if get_all and has_branched:
5549 return results.exhaust() if results or use_list else None
5550
5551 return results[0] if results else None
5552
5553 for index, path in enumerate(paths, 1):
5554 use_list = default is NO_DEFAULT and index == len(paths)
5555 result = _traverse_obj(obj, path, use_list)
5556 if result is not None:
5557 return result
5558
5559 return None if default is NO_DEFAULT else default
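# A few more illustrative examples of traverse_obj paths:
#   traverse_obj({'a': {'b': [1, 2]}}, ('a', 'b', 0))    == 1
#   traverse_obj({'a': {'b': [1, 2]}}, ('a', 'b', ...))  == [1, 2]  # branch over the list
#   traverse_obj([{'x': 1}, {'x': 2}], (..., 'x'))       == [1, 2]  # branch over the sequence
#   traverse_obj({'a': 1}, 'missing', default=0)         == 0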
5560
5561
5562 def traverse_dict(dictn, keys, casesense=True):
5563 deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed '
5564 f'in a future version. Use "{__name__}.traverse_obj" instead')
5565 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5566
5567
5568 def get_first(obj, keys, **kwargs):
5569 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
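# e.g. get_first([{'id': None}, {'id': '42'}], 'id') == '42'
# (branches over the outer sequence and returns the first non-None value)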
5570
5571
5572 def time_seconds(**kwargs):
5573 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5574 return t.timestamp()
5575
5576
5577 # create a JSON Web Signature (jws) with HS256 algorithm
5578 # the resulting format is in JWS Compact Serialization
5579 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5580 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5581 def jwt_encode_hs256(payload_data, key, headers=None):
5582 header_data = {
5583 'alg': 'HS256',
5584 'typ': 'JWT',
5585 }
5586 if headers:
5587 header_data.update(headers)
5588 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5589 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5590 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5591 signature_b64 = base64.b64encode(h.digest())
5592 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5593 return token
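# Illustrative usage (hypothetical payload and key; only the output shape is asserted).
# NB: strict JWS Compact Serialization uses unpadded base64url, whereas this helper
# keeps standard padded base64:
#   token = jwt_encode_hs256({'iss': 'example', 'exp': 1893456000}, 'secret-key')
#   assert token.count(b'.') == 2  # header.payload.signature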
5594
5595
5596 # can be extended in the future to verify the signature, parse the header, and return the algorithm used if it's not HS256
5597 def jwt_decode_hs256(jwt):
5598 header_b64, payload_b64, signature_b64 = jwt.split('.')
5599 # restore the trailing '='s that may have been stripped; superfluous '='s are ignored
5600 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
5601 return payload_data
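# Illustrative round-trip with jwt_encode_hs256 above (the signature is NOT verified):
#   payload = jwt_decode_hs256(jwt_encode_hs256({'a': 1}, 'k').decode())
#   assert payload == {'a': 1}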
5602
5603
5604 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5605
5606
5607 @functools.cache
5608 def supports_terminal_sequences(stream):
5609 if compat_os_name == 'nt':
5610 if not WINDOWS_VT_MODE:
5611 return False
5612 elif not os.getenv('TERM'):
5613 return False
5614 try:
5615 return stream.isatty()
5616 except BaseException:
5617 return False
5618
5619
5620 def windows_enable_vt_mode():
5621 """Ref: https://bugs.python.org/issue30075 """
5622 if get_windows_version() < (10, 0, 10586):
5623 return
5624
5625 import ctypes
5626 import ctypes.wintypes
5627 import msvcrt
5628
5629 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
5630
5631 dll = ctypes.WinDLL('kernel32', use_last_error=False)
5632 handle = os.open('CONOUT$', os.O_RDWR)
5633
5634 try:
5635 h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
5636 dw_original_mode = ctypes.wintypes.DWORD()
5637 success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
5638 if not success:
5639 raise Exception('GetConsoleMode failed')
5640
5641 success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
5642 dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
5643 if not success:
5644 raise Exception('SetConsoleMode failed')
5645 except Exception as e:
5646 write_string(f'WARNING: Cannot enable VT mode - {e}')
5647 else:
5648 global WINDOWS_VT_MODE
5649 WINDOWS_VT_MODE = True
5650 supports_terminal_sequences.cache_clear()
5651 finally:
5652 os.close(handle)
5653
5654
5655 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5656
5657
5658 def remove_terminal_sequences(string):
5659 return _terminal_sequences_re.sub('', string)
5660
5661
5662 def number_of_digits(number):
5663 return len('%d' % number)
5664
5665
5666 def join_nonempty(*values, delim='-', from_dict=None):
5667 if from_dict is not None:
5668 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5669 return delim.join(map(str, filter(None, values)))
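# e.g. join_nonempty('a', None, '', 'b')  == 'a-b'
#      join_nonempty('en', 5, delim='.')  == 'en.5'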
5670
5671
5672 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5673 """
5674 Find the largest format dimensions in terms of video width and, for each thumbnail:
5675 * Modify the URL: Match the width with the provided regex and replace it with the largest format width
5676 * Update dimensions
5677
5678 This function is useful with video services that scale the provided thumbnails on demand
5679 """
5680 _keys = ('width', 'height')
5681 max_dimensions = max(
5682 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5683 default=(0, 0))
5684 if not max_dimensions[0]:
5685 return thumbnails
5686 return [
5687 merge_dicts(
5688 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5689 dict(zip(_keys, max_dimensions)), thumbnail)
5690 for thumbnail in thumbnails
5691 ]
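# Illustrative example (hypothetical URLs; the regex must match the width inside the URL):
#   scale_thumbnails_to_max_format_width(
#       [{'width': 1280, 'height': 720}],
#       [{'url': 'https://example.com/320/img.jpg', 'width': 320}],
#       r'(?<=/)\d+(?=/)')
#   == [{'url': 'https://example.com/1280/img.jpg', 'width': 1280, 'height': 720}]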
5692
5693
5694 def parse_http_range(range):
5695 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5696 if not range:
5697 return None, None, None
5698 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5699 if not crg:
5700 return None, None, None
5701 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
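# e.g. parse_http_range('bytes 5-9/100') == (5, 9, 100)
#      parse_http_range('bytes=5-')      == (5, None, None)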
5702
5703
5704 def read_stdin(what):
5705 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5706 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5707 return sys.stdin
5708
5709
5710 def determine_file_encoding(data):
5711 """
5712 Detect the text encoding used
5713 @returns (encoding, bytes to skip)
5714 """
5715
5716 # BOM marks are given priority over declarations
5717 for bom, enc in BOMS:
5718 if data.startswith(bom):
5719 return enc, len(bom)
5720
5721 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5722 # We ignore the endianness to get a good enough match
5723 data = data.replace(b'\0', b'')
5724 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5725 return mobj.group(1).decode() if mobj else None, 0
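# e.g. (BOMS is defined earlier in this file):
#   determine_file_encoding(b'\xef\xbb\xbf# comment')  == ('utf-8', 3)
#   determine_file_encoding(b'# coding: utf-8\n-v')    == ('utf-8', 0)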
5726
5727
5728 class Config:
5729 own_args = None
5730 parsed_args = None
5731 filename = None
5732 __initialized = False
5733
5734 def __init__(self, parser, label=None):
5735 self.parser, self.label = parser, label
5736 self._loaded_paths, self.configs = set(), []
5737
5738 def init(self, args=None, filename=None):
5739 assert not self.__initialized
5740 self.own_args, self.filename = args, filename
5741 return self.load_configs()
5742
5743 def load_configs(self):
5744 directory = ''
5745 if self.filename:
5746 location = os.path.realpath(self.filename)
5747 directory = os.path.dirname(location)
5748 if location in self._loaded_paths:
5749 return False
5750 self._loaded_paths.add(location)
5751
5752 self.__initialized = True
5753 opts, _ = self.parser.parse_known_args(self.own_args)
5754 self.parsed_args = self.own_args
5755 for location in opts.config_locations or []:
5756 if location == '-':
5757 if location in self._loaded_paths:
5758 continue
5759 self._loaded_paths.add(location)
5760 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5761 continue
5762 location = os.path.join(directory, expand_path(location))
5763 if os.path.isdir(location):
5764 location = os.path.join(location, 'yt-dlp.conf')
5765 if not os.path.exists(location):
5766 self.parser.error(f'config location {location} does not exist')
5767 self.append_config(self.read_file(location), location)
5768 return True
5769
5770 def __str__(self):
5771 label = join_nonempty(
5772 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5773 delim=' ')
5774 return join_nonempty(
5775 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5776 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5777 delim='\n')
5778
5779 @staticmethod
5780 def read_file(filename, default=[]):
5781 try:
5782 optionf = open(filename, 'rb')
5783 except OSError:
5784 return default # silently skip if file is not present
5785 try:
5786 enc, skip = determine_file_encoding(optionf.read(512))
5787 optionf.seek(skip, io.SEEK_SET)
5788 except OSError:
5789 enc = None # silently skip read errors
5790 try:
5791 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5792 contents = optionf.read().decode(enc or preferredencoding())
5793 res = shlex.split(contents, comments=True)
5794 except Exception as err:
5795 raise ValueError(f'Unable to parse "{filename}": {err}')
5796 finally:
5797 optionf.close()
5798 return res
5799
5800 @staticmethod
5801 def hide_login_info(opts):
5802 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5803 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5804
5805 def _scrub_eq(o):
5806 m = eqre.match(o)
5807 if m:
5808 return m.group('key') + '=PRIVATE'
5809 else:
5810 return o
5811
5812 opts = list(map(_scrub_eq, opts))
5813 for idx, opt in enumerate(opts):
5814 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5815 opts[idx + 1] = 'PRIVATE'
5816 return opts
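# e.g. Config.hide_login_info(['-u', 'name', '--password=secret'])
#      == ['-u', 'PRIVATE', '--password=PRIVATE']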
5817
5818 def append_config(self, *args, label=None):
5819 config = type(self)(self.parser, label)
5820 config._loaded_paths = self._loaded_paths
5821 if config.init(*args):
5822 self.configs.append(config)
5823
5824 @property
5825 def all_args(self):
5826 for config in reversed(self.configs):
5827 yield from config.all_args
5828 yield from self.parsed_args or []
5829
5830 def parse_known_args(self, **kwargs):
5831 return self.parser.parse_known_args(self.all_args, **kwargs)
5832
5833 def parse_args(self):
5834 return self.parser.parse_args(self.all_args)
5835
5836
5837 class WebSocketsWrapper:
5838 """Wraps websockets module to use in non-async scopes"""
5839 pool = None
5840
5841 def __init__(self, url, headers=None, connect=True):
5842 self.loop = asyncio.new_event_loop()
5843 # XXX: "loop" is deprecated
5844 self.conn = websockets.connect(
5845 url, extra_headers=headers, ping_interval=None,
5846 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5847 if connect:
5848 self.__enter__()
5849 atexit.register(self.__exit__, None, None, None)
5850
5851 def __enter__(self):
5852 if not self.pool:
5853 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5854 return self
5855
5856 def send(self, *args):
5857 self.run_with_loop(self.pool.send(*args), self.loop)
5858
5859 def recv(self, *args):
5860 return self.run_with_loop(self.pool.recv(*args), self.loop)
5861
5862 def __exit__(self, type, value, traceback):
5863 try:
5864 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5865 finally:
5866 self._cancel_all_tasks(self.loop)  # cancel pending tasks while the loop can still run them
5867 self.loop.close()
5868
5869 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5870 # for contributors: if any new library that uses asyncio needs to be run in non-async code, move these functions out of this class
5871 @staticmethod
5872 def run_with_loop(main, loop):
5873 if not asyncio.iscoroutine(main):
5874 raise ValueError(f'a coroutine was expected, got {main!r}')
5875
5876 try:
5877 return loop.run_until_complete(main)
5878 finally:
5879 loop.run_until_complete(loop.shutdown_asyncgens())
5880 if hasattr(loop, 'shutdown_default_executor'):
5881 loop.run_until_complete(loop.shutdown_default_executor())
5882
5883 @staticmethod
5884 def _cancel_all_tasks(loop):
5885 to_cancel = asyncio.all_tasks(loop)
5886
5887 if not to_cancel:
5888 return
5889
5890 for task in to_cancel:
5891 task.cancel()
5892
5893 # XXX: "loop" is removed in python 3.10+
5894 loop.run_until_complete(
5895 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5896
5897 for task in to_cancel:
5898 if task.cancelled():
5899 continue
5900 if task.exception() is not None:
5901 loop.call_exception_handler({
5902 'message': 'unhandled exception during asyncio.run() shutdown',
5903 'exception': task.exception(),
5904 'task': task,
5905 })
5906
5907
5908 def merge_headers(*dicts):
5909 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5910 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
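# e.g. merge_headers({'user-agent': 'A', 'X-One': '1'}, {'User-Agent': 'B'})
#      == {'User-Agent': 'B', 'X-One': '1'}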
5911
5912
5913 def cached_method(f):
5914 """Cache a method"""
5915 signature = inspect.signature(f)
5916
5917 @functools.wraps(f)
5918 def wrapper(self, *args, **kwargs):
5919 bound_args = signature.bind(self, *args, **kwargs)
5920 bound_args.apply_defaults()
5921 key = tuple(bound_args.arguments.values())[1:]
5922
5923 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
5924 if key not in cache:
5925 cache[key] = f(self, *args, **kwargs)
5926 return cache[key]
5927 return wrapper
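# Illustrative sketch (hypothetical class; the cache key is the bound arguments minus self):
#   class Client:
#       @cached_method
#       def fetch(self, url, timeout=10):
#           ...  # runs once per distinct (url, timeout); later calls hit the cache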
5928
5929
5930 class classproperty:
5931 """property access for class methods with optional caching"""
5932 def __new__(cls, func=None, *args, **kwargs):
5933 if not func:
5934 return functools.partial(cls, *args, **kwargs)
5935 return super().__new__(cls)
5936
5937 def __init__(self, func, *, cache=False):
5938 functools.update_wrapper(self, func)
5939 self.func = func
5940 self._cache = {} if cache else None
5941
5942 def __get__(self, _, cls):
5943 if self._cache is None:
5944 return self.func(cls)
5945 elif cls not in self._cache:
5946 self._cache[cls] = self.func(cls)
5947 return self._cache[cls]
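# Illustrative sketch (hypothetical class):
#   class MyIE:
#       @classproperty(cache=True)
#       def description(cls):
#           return f'Extractor {cls.__name__}'  # computed once per class, then cached
#   MyIE.description == 'Extractor MyIE'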
5948
5949
5950 class Namespace(types.SimpleNamespace):
5951 """Immutable namespace"""
5952
5953 def __iter__(self):
5954 return iter(self.__dict__.values())
5955
5956 @property
5957 def items_(self):
5958 return self.__dict__.items()
5959
5960
5961 MEDIA_EXTENSIONS = Namespace(
5962 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5963 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5964 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5965 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
5966 thumbnails=('jpg', 'png', 'webp'),
5967 storyboards=('mhtml', ),
5968 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5969 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5970 )
5971 MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5972 MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5973
5974 KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5975
5976
5977 class RetryManager:
5978 """Usage:
5979 for retry in RetryManager(...):
5980 try:
5981 ...
5982 except SomeException as err:
5983 retry.error = err
5984 continue
5985 """
5986 attempt, _error = 0, None
5987
5988 def __init__(self, _retries, _error_callback, **kwargs):
5989 self.retries = _retries or 0
5990 self.error_callback = functools.partial(_error_callback, **kwargs)
5991
5992 def _should_retry(self):
5993 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5994
5995 @property
5996 def error(self):
5997 if self._error is NO_DEFAULT:
5998 return None
5999 return self._error
6000
6001 @error.setter
6002 def error(self, value):
6003 self._error = value
6004
6005 def __iter__(self):
6006 while self._should_retry():
6007 self.error = NO_DEFAULT
6008 self.attempt += 1
6009 yield self
6010 if self.error:
6011 self.error_callback(self.error, self.attempt, self.retries)
6012
6013 @staticmethod
6014 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
6015 """Utility function for reporting retries"""
6016 if count > retries:
6017 if error:
6018 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
6019 raise e
6020
6021 if not count:
6022 return warn(e)
6023 elif isinstance(e, ExtractorError):
6024 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
6025 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
6026
6027 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
6028 if delay:
6029 info(f'Sleeping {delay:.2f} seconds ...')
6030 time.sleep(delay)
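# Fuller illustrative sketch (do_request is hypothetical; report_retry above is
# the error callback, with its reporting callables passed through as kwargs):
#   for retry in RetryManager(3, RetryManager.report_retry,
#                             sleep_func=1, info=print, warn=print):
#       try:
#           do_request()
#       except OSError as err:
#           retry.error = err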
6031
6032
6033 def make_archive_id(ie, video_id):
6034 ie_key = ie if isinstance(ie, str) else ie.ie_key()
6035 return f'{ie_key.lower()} {video_id}'
6036
6037
6038 def truncate_string(s, left, right=0):
6039 assert left > 3 and right >= 0
6040 if s is None or len(s) <= left + right:
6041 return s
6042 return f'{s[:left-3]}...{s[-right:] if right else ""}'
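# e.g. truncate_string('abcdefghij', 5)    == 'ab...'
#      truncate_string('abcdefghij', 5, 2) == 'ab...ij'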
6043
6044
6045 def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
6046 assert 'all' in alias_dict, '"all" alias is required'
6047 requested = list(start or [])
6048 for val in options:
6049 discard = val.startswith('-')
6050 if discard:
6051 val = val[1:]
6052
6053 if val in alias_dict:
6054 val = alias_dict[val] if not discard else [
6055 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
6056 # NB: Do not allow regex in aliases for performance
6057 requested = orderedSet_from_options(val, alias_dict, start=requested)
6058 continue
6059
6060 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
6061 else [val] if val in alias_dict['all'] else None)
6062 if current is None:
6063 raise ValueError(val)
6064
6065 if discard:
6066 for item in current:
6067 while item in requested:
6068 requested.remove(item)
6069 else:
6070 requested.extend(current)
6071
6072 return orderedSet(requested)
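# e.g. (illustrative; the 'all' alias must map to every known value):
#   orderedSet_from_options(['all', '-b'], {'all': ['a', 'b', 'c']}) == ['a', 'c']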
6073
6074
6075 class FormatSorter:
6076 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
6077
6078 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
6079 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
6080 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
6081 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
6082 'height', 'width', 'proto', 'vext', 'abr', 'aext',
6083 'fps', 'fs_approx', 'source', 'id')
6084
6085 settings = {
6086 'vcodec': {'type': 'ordered', 'regex': True,
6087 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
6088 'acodec': {'type': 'ordered', 'regex': True,
6089 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
6090 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
6091 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
6092 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
6093 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
6094 'vext': {'type': 'ordered', 'field': 'video_ext',
6095 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
6096 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
6097 'aext': {'type': 'ordered', 'field': 'audio_ext',
6098 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
6099 'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')},
6100 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
6101 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
6102 'field': ('vcodec', 'acodec'),
6103 'function': lambda it: int(any(v != 'none' for v in it))},
6104 'ie_pref': {'priority': True, 'type': 'extractor'},
6105 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
6106 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
6107 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
6108 'quality': {'convert': 'float', 'default': -1},
6109 'filesize': {'convert': 'bytes'},
6110 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
6111 'id': {'convert': 'string', 'field': 'format_id'},
6112 'height': {'convert': 'float_none'},
6113 'width': {'convert': 'float_none'},
6114 'fps': {'convert': 'float_none'},
6115 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
6116 'tbr': {'convert': 'float_none'},
6117 'vbr': {'convert': 'float_none'},
6118 'abr': {'convert': 'float_none'},
6119 'asr': {'convert': 'float_none'},
6120 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
6121
6122 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
6123 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
6124 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
6125 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
6126 'res': {'type': 'multiple', 'field': ('height', 'width'),
6127 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
6128
6129 # Actual field names
6130 'format_id': {'type': 'alias', 'field': 'id'},
6131 'preference': {'type': 'alias', 'field': 'ie_pref'},
6132 'language_preference': {'type': 'alias', 'field': 'lang'},
6133 'source_preference': {'type': 'alias', 'field': 'source'},
6134 'protocol': {'type': 'alias', 'field': 'proto'},
6135 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
6136 'audio_channels': {'type': 'alias', 'field': 'channels'},
6137
6138 # Deprecated
6139 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
6140 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
6141 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
6142 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
6143 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
6144 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
6145 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
6146 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
6147 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
6148 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
6149 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
6150 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
6151 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
6152 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
6153 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6154 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6155 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6156 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6157 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6158 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6159 }
6160
6161 def __init__(self, ydl, field_preference):
6162 self.ydl = ydl
6163 self._order = []
6164 self.evaluate_params(self.ydl.params, field_preference)
6165 if ydl.params.get('verbose'):
6166 self.print_verbose_info(self.ydl.write_debug)
6167
6168 def _get_field_setting(self, field, key):
6169 if field not in self.settings:
6170 if key in ('forced', 'priority'):
6171 return False
6172 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
6173 'deprecated and may be removed in a future version')
6174 self.settings[field] = {}
6175 propObj = self.settings[field]
6176 if key not in propObj:
6177 type = propObj.get('type')
6178 if key == 'field':
6179 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
6180 elif key == 'convert':
6181 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
6182 else:
6183 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
6184 propObj[key] = default
6185 return propObj[key]
6186
6187 def _resolve_field_value(self, field, value, convertNone=False):
6188 if value is None:
6189 if not convertNone:
6190 return None
6191 else:
6192 value = value.lower()
6193 conversion = self._get_field_setting(field, 'convert')
6194 if conversion == 'ignore':
6195 return None
6196 if conversion == 'string':
6197 return value
6198 elif conversion == 'float_none':
6199 return float_or_none(value)
6200 elif conversion == 'bytes':
6201 return parse_bytes(value)
6202 elif conversion == 'order':
6203 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
6204 use_regex = self._get_field_setting(field, 'regex')
6205 list_length = len(order_list)
6206 empty_pos = order_list.index('') if '' in order_list else list_length + 1
6207 if use_regex and value is not None:
6208 for i, regex in enumerate(order_list):
6209 if regex and re.match(regex, value):
6210 return list_length - i
6211 return list_length - empty_pos # not in list
6212 else: # not regex or value = None
6213 return list_length - (order_list.index(value) if value in order_list else empty_pos)
6214 else:
6215 if value.isnumeric():
6216 return float(value)
6217 else:
6218 self.settings[field]['convert'] = 'string'
6219 return value
6220
6221 def evaluate_params(self, params, sort_extractor):
6222 self._use_free_order = params.get('prefer_free_formats', False)
6223 self._sort_user = params.get('format_sort', [])
6224 self._sort_extractor = sort_extractor
6225
6226 def add_item(field, reverse, closest, limit_text):
6227 field = field.lower()
6228 if field in self._order:
6229 return
6230 self._order.append(field)
6231 limit = self._resolve_field_value(field, limit_text)
6232 data = {
6233 'reverse': reverse,
6234 'closest': False if limit is None else closest,
6235 'limit_text': limit_text,
6236 'limit': limit}
6237 if field in self.settings:
6238 self.settings[field].update(data)
6239 else:
6240 self.settings[field] = data
6241
6242 sort_list = (
6243 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
6244 + (tuple() if params.get('format_sort_force', False)
6245 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
6246 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
6247
6248 for item in sort_list:
6249 match = re.match(self.regex, item)
6250 if match is None:
6251 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
6252 field = match.group('field')
6253 if field is None:
6254 continue
6255 if self._get_field_setting(field, 'type') == 'alias':
6256 alias, field = field, self._get_field_setting(field, 'field')
6257 if self._get_field_setting(alias, 'deprecated'):
6258 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
6259 f'be removed in a future version. Please use {field} instead')
6260 reverse = match.group('reverse') is not None
6261 closest = match.group('separator') == '~'
6262 limit_text = match.group('limit')
6263
6264 has_limit = limit_text is not None
6265 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
6266 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
6267
6268 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
6269 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
6270 limit_count = len(limits)
6271 for (i, f) in enumerate(fields):
6272 add_item(f, reverse, closest,
6273 limits[i] if i < limit_count
6274 else limits[0] if has_limit and not has_multiple_limits
6275 else None)
6276
6277 def print_verbose_info(self, write_debug):
6278 if self._sort_user:
6279 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
6280 if self._sort_extractor:
6281 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
6282 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
6283 '+' if self._get_field_setting(field, 'reverse') else '', field,
6284 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
6285 self._get_field_setting(field, 'limit_text'),
6286 self._get_field_setting(field, 'limit'))
6287 if self._get_field_setting(field, 'limit_text') is not None else '')
6288 for field in self._order if self._get_field_setting(field, 'visible')]))
6289
6290 def _calculate_field_preference_from_value(self, format, field, type, value):
6291 reverse = self._get_field_setting(field, 'reverse')
6292 closest = self._get_field_setting(field, 'closest')
6293 limit = self._get_field_setting(field, 'limit')
6294
6295 if type == 'extractor':
6296 maximum = self._get_field_setting(field, 'max')
6297 if value is None or (maximum is not None and value >= maximum):
6298 value = -1
6299 elif type == 'boolean':
6300 in_list = self._get_field_setting(field, 'in_list')
6301 not_in_list = self._get_field_setting(field, 'not_in_list')
6302 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
6303 elif type == 'ordered':
6304 value = self._resolve_field_value(field, value, True)
6305
6306 # try to convert to number
6307 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
6308 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
6309 if is_num:
6310 value = val_num
6311
6312 return ((-10, 0) if value is None
6313 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
6314 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
6315 else (0, value, 0) if not reverse and (limit is None or value <= limit)
6316 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
6317 else (-1, value, 0))
6318
6319 def _calculate_field_preference(self, format, field):
6320 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
6321 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
6322 if type == 'multiple':
6323 type = 'field' # Only 'field' is allowed in multiple for now
6324 actual_fields = self._get_field_setting(field, 'field')
6325
6326 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
6327 else:
6328 value = get_value(field)
6329 return self._calculate_field_preference_from_value(format, field, type, value)
6330
6331 def calculate_preference(self, format):
6332 # Determine missing protocol
6333 if not format.get('protocol'):
6334 format['protocol'] = determine_protocol(format)
6335
6336 # Determine missing ext
6337 if not format.get('ext') and 'url' in format:
6338 format['ext'] = determine_ext(format['url'])
6339 if format.get('vcodec') == 'none':
6340 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
6341 format['video_ext'] = 'none'
6342 else:
6343 format['video_ext'] = format['ext']
6344 format['audio_ext'] = 'none'
6345 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
6346 # format['preference'] = -1000
6347
6348 if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
6349 # HEVC-over-FLV is not part of FLV's original specification
6350 # ref. https://trac.ffmpeg.org/ticket/6389
6351 # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
6352 format['preference'] = -100
6353
6354 # Determine missing bitrates
6355 if format.get('tbr') is None:
6356 if format.get('vbr') is not None and format.get('abr') is not None:
6357 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
6358 else:
6359 if format.get('vcodec') != 'none' and format.get('vbr') is None:
6360 format['vbr'] = format.get('tbr') - format.get('abr', 0)
6361 if format.get('acodec') != 'none' and format.get('abr') is None:
6362 format['abr'] = format.get('tbr') - format.get('vbr', 0)
6363
6364 return tuple(self._calculate_field_preference(format, field) for field in self._order)
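# Illustrative usage sketch (hypothetical `ydl` and `formats`; as in YoutubeDL,
# preference tuples sort ascending, so the most preferred format ends up last):
#   sorter = FormatSorter(ydl, field_preference=['res', 'fps'])
#   formats.sort(key=sorter.calculate_preference)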
6365
6366
6367 # Deprecated
6368 has_certifi = bool(certifi)
6369 has_websockets = bool(websockets)