1 import asyncio
2 import atexit
3 import base64
4 import binascii
5 import calendar
6 import codecs
7 import collections
8 import collections.abc
9 import contextlib
10 import datetime
11 import email.header
12 import email.utils
13 import errno
14 import gzip
15 import hashlib
16 import hmac
17 import html.entities
18 import html.parser
19 import http.client
20 import http.cookiejar
21 import importlib.util
22 import inspect
23 import io
24 import itertools
25 import json
26 import locale
27 import math
28 import mimetypes
29 import operator
30 import os
31 import platform
32 import random
33 import re
34 import shlex
35 import socket
36 import ssl
37 import struct
38 import subprocess
39 import sys
40 import tempfile
41 import time
42 import traceback
43 import types
44 import unicodedata
45 import urllib.error
46 import urllib.parse
47 import urllib.request
48 import xml.etree.ElementTree
49 import zlib
50
51 from .compat import functools # isort: split
52 from .compat import (
53 compat_etree_fromstring,
54 compat_expanduser,
55 compat_HTMLParseError,
56 compat_os_name,
57 compat_shlex_quote,
58 )
59 from .dependencies import brotli, certifi, websockets, xattr
60 from .socks import ProxyType, sockssocket
61
62
63 def register_socks_protocols():
64 # "Register" SOCKS protocols
65 # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
66 # URLs with protocols not in urlparse.uses_netloc are not handled correctly
67 for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
68 if scheme not in urllib.parse.uses_netloc:
69 urllib.parse.uses_netloc.append(scheme)
70
71
72 # This is not clearly defined otherwise
73 compiled_regex_type = type(re.compile(''))
74
75
76 def random_user_agent():
77 _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
78 _CHROME_VERSIONS = (
79 '90.0.4430.212',
80 '90.0.4430.24',
81 '90.0.4430.70',
82 '90.0.4430.72',
83 '90.0.4430.85',
84 '90.0.4430.93',
85 '91.0.4472.101',
86 '91.0.4472.106',
87 '91.0.4472.114',
88 '91.0.4472.124',
89 '91.0.4472.164',
90 '91.0.4472.19',
91 '91.0.4472.77',
92 '92.0.4515.107',
93 '92.0.4515.115',
94 '92.0.4515.131',
95 '92.0.4515.159',
96 '92.0.4515.43',
97 '93.0.4556.0',
98 '93.0.4577.15',
99 '93.0.4577.63',
100 '93.0.4577.82',
101 '94.0.4606.41',
102 '94.0.4606.54',
103 '94.0.4606.61',
104 '94.0.4606.71',
105 '94.0.4606.81',
106 '94.0.4606.85',
107 '95.0.4638.17',
108 '95.0.4638.50',
109 '95.0.4638.54',
110 '95.0.4638.69',
111 '95.0.4638.74',
112 '96.0.4664.18',
113 '96.0.4664.45',
114 '96.0.4664.55',
115 '96.0.4664.93',
116 '97.0.4692.20',
117 )
118 return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
119
120
121 SUPPORTED_ENCODINGS = [
122 'gzip', 'deflate'
123 ]
124 if brotli:
125 SUPPORTED_ENCODINGS.append('br')
126
127 std_headers = {
128 'User-Agent': random_user_agent(),
129 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
130 'Accept-Language': 'en-us,en;q=0.5',
131 'Sec-Fetch-Mode': 'navigate',
132 }
133
134
135 USER_AGENTS = {
136 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
137 }
138
139
140 NO_DEFAULT = object()
141 IDENTITY = lambda x: x
142
143 ENGLISH_MONTH_NAMES = [
144 'January', 'February', 'March', 'April', 'May', 'June',
145 'July', 'August', 'September', 'October', 'November', 'December']
146
147 MONTH_NAMES = {
148 'en': ENGLISH_MONTH_NAMES,
149 'fr': [
150 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
151 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
152 # these follow the genitive grammatical case (dopełniacz)
153 # some websites might be using nominative, which will require another month list
154 # https://en.wikibooks.org/wiki/Polish/Noun_cases
155 'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
156 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
157 }
158
159 # From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
160 TIMEZONE_NAMES = {
161 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
162 'AST': -4, 'ADT': -3, # Atlantic (used in Canada)
163 'EST': -5, 'EDT': -4, # Eastern
164 'CST': -6, 'CDT': -5, # Central
165 'MST': -7, 'MDT': -6, # Mountain
166 'PST': -8, 'PDT': -7 # Pacific
167 }
168
169 # needed for sanitizing filenames in restricted mode
170 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
171 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
172 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
173
174 DATE_FORMATS = (
175 '%d %B %Y',
176 '%d %b %Y',
177 '%B %d %Y',
178 '%B %dst %Y',
179 '%B %dnd %Y',
180 '%B %drd %Y',
181 '%B %dth %Y',
182 '%b %d %Y',
183 '%b %dst %Y',
184 '%b %dnd %Y',
185 '%b %drd %Y',
186 '%b %dth %Y',
187 '%b %dst %Y %I:%M',
188 '%b %dnd %Y %I:%M',
189 '%b %drd %Y %I:%M',
190 '%b %dth %Y %I:%M',
191 '%Y %m %d',
192 '%Y-%m-%d',
193 '%Y.%m.%d.',
194 '%Y/%m/%d',
195 '%Y/%m/%d %H:%M',
196 '%Y/%m/%d %H:%M:%S',
197 '%Y%m%d%H%M',
198 '%Y%m%d%H%M%S',
199 '%Y%m%d',
200 '%Y-%m-%d %H:%M',
201 '%Y-%m-%d %H:%M:%S',
202 '%Y-%m-%d %H:%M:%S.%f',
203 '%Y-%m-%d %H:%M:%S:%f',
204 '%d.%m.%Y %H:%M',
205 '%d.%m.%Y %H.%M',
206 '%Y-%m-%dT%H:%M:%SZ',
207 '%Y-%m-%dT%H:%M:%S.%fZ',
208 '%Y-%m-%dT%H:%M:%S.%f0Z',
209 '%Y-%m-%dT%H:%M:%S',
210 '%Y-%m-%dT%H:%M:%S.%f',
211 '%Y-%m-%dT%H:%M',
212 '%b %d %Y at %H:%M',
213 '%b %d %Y at %H:%M:%S',
214 '%B %d %Y at %H:%M',
215 '%B %d %Y at %H:%M:%S',
216 '%H:%M %d-%b-%Y',
217 )
218
219 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
220 DATE_FORMATS_DAY_FIRST.extend([
221 '%d-%m-%Y',
222 '%d.%m.%Y',
223 '%d.%m.%y',
224 '%d/%m/%Y',
225 '%d/%m/%y',
226 '%d/%m/%Y %H:%M:%S',
227 '%d-%m-%Y %H:%M',
228 ])
229
230 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
231 DATE_FORMATS_MONTH_FIRST.extend([
232 '%m-%d-%Y',
233 '%m.%d.%Y',
234 '%m/%d/%Y',
235 '%m/%d/%y',
236 '%m/%d/%Y %H:%M:%S',
237 ])
238
239 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
240 JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
241
242 NUMBER_RE = r'\d+(?:\.\d+)?'
243
244
245 @functools.cache
246 def preferredencoding():
247 """Get preferred encoding.
248
249 Returns the best encoding scheme for the system, based on
250 locale.getpreferredencoding() and some further tweaks.
251 """
252 try:
253 pref = locale.getpreferredencoding()
254 'TEST'.encode(pref)
255 except Exception:
256 pref = 'UTF-8'
257
258 return pref
259
260
261 def write_json_file(obj, fn):
262 """ Encode obj as JSON and write it to fn, atomically if possible """
263
264 tf = tempfile.NamedTemporaryFile(
265 prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
266 suffix='.tmp', delete=False, mode='w', encoding='utf-8')
267
268 try:
269 with tf:
270 json.dump(obj, tf, ensure_ascii=False)
271 if sys.platform == 'win32':
272 # Need to remove existing file on Windows, else os.rename raises
273 # WindowsError or FileExistsError.
274 with contextlib.suppress(OSError):
275 os.unlink(fn)
276 with contextlib.suppress(OSError):
277 mask = os.umask(0)
278 os.umask(mask)
279 os.chmod(tf.name, 0o666 & ~mask)
280 os.rename(tf.name, fn)
281 except Exception:
282 with contextlib.suppress(OSError):
283 os.remove(tf.name)
284 raise
285
286
287 def find_xpath_attr(node, xpath, key, val=None):
288 """ Find the xpath xpath[@key=val] """
289 assert re.match(r'^[a-zA-Z_-]+$', key)
290 expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
291 return node.find(expr)
292
293 # On python2.6 the xml.etree.ElementTree.Element methods don't support
294 # the namespace parameter
295
296
297 def xpath_with_ns(path, ns_map):
298 components = [c.split(':') for c in path.split('/')]
299 replaced = []
300 for c in components:
301 if len(c) == 1:
302 replaced.append(c[0])
303 else:
304 ns, tag = c
305 replaced.append('{%s}%s' % (ns_map[ns], tag))
306 return '/'.join(replaced)
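
# An illustrative sketch (the namespace URI is a hypothetical placeholder):
#
#   >>> xpath_with_ns('media:group/media:title', {'media': 'http://example.com/ns'})
#   '{http://example.com/ns}group/{http://example.com/ns}title'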
307
308
309 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
310 def _find_xpath(xpath):
311 return node.find(xpath)
312
313 if isinstance(xpath, str):
314 n = _find_xpath(xpath)
315 else:
316 for xp in xpath:
317 n = _find_xpath(xp)
318 if n is not None:
319 break
320
321 if n is None:
322 if default is not NO_DEFAULT:
323 return default
324 elif fatal:
325 name = xpath if name is None else name
326 raise ExtractorError('Could not find XML element %s' % name)
327 else:
328 return None
329 return n
330
331
332 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
333 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
334 if n is None or n == default:
335 return n
336 if n.text is None:
337 if default is not NO_DEFAULT:
338 return default
339 elif fatal:
340 name = xpath if name is None else name
341 raise ExtractorError('Could not find XML element\'s text %s' % name)
342 else:
343 return None
344 return n.text
345
346
347 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
348 n = find_xpath_attr(node, xpath, key)
349 if n is None:
350 if default is not NO_DEFAULT:
351 return default
352 elif fatal:
353 name = f'{xpath}[@{key}]' if name is None else name
354 raise ExtractorError('Could not find XML attribute %s' % name)
355 else:
356 return None
357 return n.attrib[key]
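
# A minimal sketch of the xpath helpers above (the document is hypothetical):
#
#   >>> doc = xml.etree.ElementTree.fromstring('<root><a x="1">t</a></root>')
#   >>> xpath_text(doc, './a')
#   't'
#   >>> xpath_attr(doc, './a', 'x')
#   '1'
#   >>> xpath_text(doc, './b', default=None) is None
#   True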
358
359
360 def get_element_by_id(id, html, **kwargs):
361 """Return the content of the tag with the specified ID in the passed HTML document"""
362 return get_element_by_attribute('id', id, html, **kwargs)
363
364
365 def get_element_html_by_id(id, html, **kwargs):
366 """Return the html of the tag with the specified ID in the passed HTML document"""
367 return get_element_html_by_attribute('id', id, html, **kwargs)
368
369
370 def get_element_by_class(class_name, html):
371 """Return the content of the first tag with the specified class in the passed HTML document"""
372 retval = get_elements_by_class(class_name, html)
373 return retval[0] if retval else None
374
375
376 def get_element_html_by_class(class_name, html):
377 """Return the html of the first tag with the specified class in the passed HTML document"""
378 retval = get_elements_html_by_class(class_name, html)
379 return retval[0] if retval else None
380
381
382 def get_element_by_attribute(attribute, value, html, **kwargs):
383 retval = get_elements_by_attribute(attribute, value, html, **kwargs)
384 return retval[0] if retval else None
385
386
387 def get_element_html_by_attribute(attribute, value, html, **kwargs):
388 retval = get_elements_html_by_attribute(attribute, value, html, **kwargs)
389 return retval[0] if retval else None
390
391
392 def get_elements_by_class(class_name, html, **kwargs):
393 """Return the content of all tags with the specified class in the passed HTML document as a list"""
394 return get_elements_by_attribute(
395 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
396 html, escape_value=False)
397
398
399 def get_elements_html_by_class(class_name, html):
400 """Return the html of all tags with the specified class in the passed HTML document as a list"""
401 return get_elements_html_by_attribute(
402 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
403 html, escape_value=False)
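
# An illustrative sketch of the class helpers (the HTML snippet is hypothetical):
#
#   >>> get_elements_by_class('video', '<div class="video big">A</div><span class="video">B</span>')
#   ['A', 'B']
#   >>> get_elements_html_by_class('video', '<div class="video">A</div>')
#   ['<div class="video">A</div>']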
404
405
406 def get_elements_by_attribute(*args, **kwargs):
407 """Return the content of the tag with the specified attribute in the passed HTML document"""
408 return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
409
410
411 def get_elements_html_by_attribute(*args, **kwargs):
412 """Return the html of the tag with the specified attribute in the passed HTML document"""
413 return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
414
415
416 def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
417 """
418 Yield the text (content) and the html (whole) of each tag with the specified
419 attribute in the passed HTML document
420 """
421 if not value:
422 return
423
424 quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
425
426 value = re.escape(value) if escape_value else value
427
428 partial_element_re = rf'''(?x)
429 <(?P<tag>{tag})
430 (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
431 \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
432 '''
433
434 for m in re.finditer(partial_element_re, html):
435 content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
436
437 yield (
438 unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
439 whole
440 )
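
# An illustrative sketch (the HTML snippet is hypothetical); note this is a
# generator, hence the list() call:
#
#   >>> list(get_elements_text_and_html_by_attribute('data-id', '1', '<p data-id="1">x</p>'))
#   [('x', '<p data-id="1">x</p>')]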
441
442
443 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
444 """
445 HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
446 closing tag for the first opening tag it has encountered, and can be used
447 as a context manager
448 """
449
450 class HTMLBreakOnClosingTagException(Exception):
451 pass
452
453 def __init__(self):
454 self.tagstack = collections.deque()
455 html.parser.HTMLParser.__init__(self)
456
457 def __enter__(self):
458 return self
459
460 def __exit__(self, *_):
461 self.close()
462
463 def close(self):
464 # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
465 # so data remains buffered; we no longer have any interest in it, thus
466 # override this method to discard it
467 pass
468
469 def handle_starttag(self, tag, _):
470 self.tagstack.append(tag)
471
472 def handle_endtag(self, tag):
473 if not self.tagstack:
474 raise compat_HTMLParseError('no tags in the stack')
475 while self.tagstack:
476 inner_tag = self.tagstack.pop()
477 if inner_tag == tag:
478 break
479 else:
480 raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
481 if not self.tagstack:
482 raise self.HTMLBreakOnClosingTagException()
483
484
485 # XXX: This should be far less strict
486 def get_element_text_and_html_by_tag(tag, html):
487 """
488 For the first element with the specified tag in the passed HTML document
489 return its content (text) and the whole element (html)
490 """
491 def find_or_raise(haystack, needle, exc):
492 try:
493 return haystack.index(needle)
494 except ValueError:
495 raise exc
496 closing_tag = f'</{tag}>'
497 whole_start = find_or_raise(
498 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
499 content_start = find_or_raise(
500 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
501 content_start += whole_start + 1
502 with HTMLBreakOnClosingTagParser() as parser:
503 parser.feed(html[whole_start:content_start])
504 if not parser.tagstack or parser.tagstack[0] != tag:
505 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
506 offset = content_start
507 while offset < len(html):
508 next_closing_tag_start = find_or_raise(
509 html[offset:], closing_tag,
510 compat_HTMLParseError(f'closing {tag} tag not found'))
511 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
512 try:
513 parser.feed(html[offset:offset + next_closing_tag_end])
514 offset += next_closing_tag_end
515 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
516 return html[content_start:offset + next_closing_tag_start], \
517 html[whole_start:offset + next_closing_tag_end]
518 raise compat_HTMLParseError('unexpected end of html')
519
520
521 class HTMLAttributeParser(html.parser.HTMLParser):
522 """Trivial HTML parser to gather the attributes for a single element"""
523
524 def __init__(self):
525 self.attrs = {}
526 html.parser.HTMLParser.__init__(self)
527
528 def handle_starttag(self, tag, attrs):
529 self.attrs = dict(attrs)
530 raise compat_HTMLParseError('done')
531
532
533 class HTMLListAttrsParser(html.parser.HTMLParser):
534 """HTML parser to gather the attributes for the elements of a list"""
535
536 def __init__(self):
537 html.parser.HTMLParser.__init__(self)
538 self.items = []
539 self._level = 0
540
541 def handle_starttag(self, tag, attrs):
542 if tag == 'li' and self._level == 0:
543 self.items.append(dict(attrs))
544 self._level += 1
545
546 def handle_endtag(self, tag):
547 self._level -= 1
548
549
550 def extract_attributes(html_element):
551 """Given a string for an HTML element such as
552 <el
553 a="foo" B="bar" c="&98;az" d=boz
554 empty= noval entity="&amp;"
555 sq='"' dq="'"
556 >
557 Decode and return a dictionary of attributes.
558 {
559 'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
560 'empty': '', 'noval': None, 'entity': '&',
561 'sq': '"', 'dq': '\''
562 }.
563 """
564 parser = HTMLAttributeParser()
565 with contextlib.suppress(compat_HTMLParseError):
566 parser.feed(html_element)
567 parser.close()
568 return parser.attrs
569
570
571 def parse_list(webpage):
572 """Given a string for an series of HTML <li> elements,
573 return a dictionary of their attributes"""
574 parser = HTMLListAttrsParser()
575 parser.feed(webpage)
576 parser.close()
577 return parser.items
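
# An illustrative sketch (the attributes are hypothetical):
#
#   >>> parse_list('<li data-x="1"></li><li data-x="2"></li>')
#   [{'data-x': '1'}, {'data-x': '2'}]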
578
579
580 def clean_html(html):
581 """Clean an HTML snippet into a readable string"""
582
583 if html is None: # Convenience for sanitizing descriptions etc.
584 return html
585
586 html = re.sub(r'\s+', ' ', html)
587 html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
588 html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
589 # Strip html tags
590 html = re.sub('<.*?>', '', html)
591 # Replace html entities
592 html = unescapeHTML(html)
593 return html.strip()
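
# An illustrative sketch:
#
#   >>> clean_html('<p>foo<br/>bar &amp; baz</p>')
#   'foo\nbar & baz'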
594
595
596 class LenientJSONDecoder(json.JSONDecoder):
597 def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
598 self.transform_source, self.ignore_extra = transform_source, ignore_extra
599 super().__init__(*args, **kwargs)
600
601 def decode(self, s):
602 if self.transform_source:
603 s = self.transform_source(s)
604 try:
605 if self.ignore_extra:
606 return self.raw_decode(s.lstrip())[0]
607 return super().decode(s)
608 except json.JSONDecodeError as e:
609 if e.pos is not None:
610 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
611 raise
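
# A minimal sketch of LenientJSONDecoder on input with trailing garbage
# (the JSON string is hypothetical):
#
#   >>> json.loads('{"a": 1}garbage', cls=LenientJSONDecoder, ignore_extra=True)
#   {'a': 1}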
612
613
614 def sanitize_open(filename, open_mode):
615 """Try to open the given filename, and slightly tweak it if this fails.
616
617 Attempts to open the given filename. If this fails, it tries to change
618 the filename slightly, step by step, until it's either able to open it
619 or it fails and raises a final exception, like the standard open()
620 function.
621
622 It returns the tuple (stream, definitive_file_name).
623 """
624 if filename == '-':
625 if sys.platform == 'win32':
626 import msvcrt
627
628 # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
629 with contextlib.suppress(io.UnsupportedOperation):
630 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
631 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
632
633 for attempt in range(2):
634 try:
635 try:
636 if sys.platform == 'win32':
637 # FIXME: An exclusive lock also locks the file from being read.
638 # Since Windows locks are mandatory, don't lock the file on Windows (for now).
639 # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
640 raise LockingUnsupportedError()
641 stream = locked_file(filename, open_mode, block=False).__enter__()
642 except OSError:
643 stream = open(filename, open_mode)
644 return stream, filename
645 except OSError as err:
646 if attempt or err.errno in (errno.EACCES,):
647 raise
648 old_filename, filename = filename, sanitize_path(filename)
649 if old_filename == filename:
650 raise
651
652
653 def timeconvert(timestr):
654 """Convert RFC 2822 defined time string into system timestamp"""
655 timestamp = None
656 timetuple = email.utils.parsedate_tz(timestr)
657 if timetuple is not None:
658 timestamp = email.utils.mktime_tz(timetuple)
659 return timestamp
660
661
662 def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
663 """Sanitizes a string so it could be used as part of a filename.
664 @param restricted Use a stricter subset of allowed characters
665 @param is_id Whether this is an ID that should be kept unchanged if possible.
666 If unset, yt-dlp's new sanitization rules are in effect
667 """
668 if s == '':
669 return ''
670
671 def replace_insane(char):
672 if restricted and char in ACCENT_CHARS:
673 return ACCENT_CHARS[char]
674 elif not restricted and char == '\n':
675 return '\0 '
676 elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
677 # Replace with their full-width unicode counterparts
678 return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
679 elif char == '?' or ord(char) < 32 or ord(char) == 127:
680 return ''
681 elif char == '"':
682 return '' if restricted else '\''
683 elif char == ':':
684 return '\0_\0-' if restricted else '\0 \0-'
685 elif char in '\\/|*<>':
686 return '\0_'
687 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
688 return '\0_'
689 return char
690
691 # Replace look-alike Unicode glyphs
692 if restricted and (is_id is NO_DEFAULT or not is_id):
693 s = unicodedata.normalize('NFKC', s)
694 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
695 result = ''.join(map(replace_insane, s))
696 if is_id is NO_DEFAULT:
697 result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result) # Remove repeated substitute chars
698 STRIP_RE = r'(?:\0.|[ _-])*'
699 result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
700 result = result.replace('\0', '') or '_'
701
702 if not is_id:
703 while '__' in result:
704 result = result.replace('__', '_')
705 result = result.strip('_')
706 # Common case of "Foreign band name - English song title"
707 if restricted and result.startswith('-_'):
708 result = result[2:]
709 if result.startswith('-'):
710 result = '_' + result[len('-'):]
711 result = result.lstrip('.')
712 if not result:
713 result = '_'
714 return result
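
# An illustrative sketch: in restricted mode, path separators and other unsafe
# characters are replaced with substitutes such as '_' (the input is hypothetical):
#
#   >>> sanitize_filename('a/b|c', restricted=True)
#   'a_b_c'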
715
716
717 def sanitize_path(s, force=False):
718 """Sanitizes and normalizes path on Windows"""
719 if sys.platform == 'win32':
720 force = False
721 drive_or_unc, _ = os.path.splitdrive(s)
722 elif force:
723 drive_or_unc = ''
724 else:
725 return s
726
727 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
728 if drive_or_unc:
729 norm_path.pop(0)
730 sanitized_path = [
731 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
732 for path_part in norm_path]
733 if drive_or_unc:
734 sanitized_path.insert(0, drive_or_unc + os.path.sep)
735 elif force and s and s[0] == os.path.sep:
736 sanitized_path.insert(0, os.path.sep)
737 return os.path.join(*sanitized_path)
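
# A minimal sketch with force=True, which applies the Windows rules even on
# other platforms (output shown for a POSIX system):
#
#   >>> sanitize_path('abc/def?.txt', force=True)
#   'abc/def#.txt'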
738
739
740 def sanitize_url(url, *, scheme='http'):
741 # Prepend protocol-less URLs with the `http:` scheme in order to reduce
742 # the number of unwanted failures due to a missing protocol
743 if url is None:
744 return
745 elif url.startswith('//'):
746 return f'{scheme}:{url}'
747 # Fix some common typos seen so far
748 COMMON_TYPOS = (
749 # https://github.com/ytdl-org/youtube-dl/issues/15649
750 (r'^httpss://', r'https://'),
751 # https://bx1.be/lives/direct-tv/
752 (r'^rmtp([es]?)://', r'rtmp\1://'),
753 )
754 for mistake, fixup in COMMON_TYPOS:
755 if re.match(mistake, url):
756 return re.sub(mistake, fixup, url)
757 return url
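
# An illustrative sketch:
#
#   >>> sanitize_url('//example.com/video')
#   'http://example.com/video'
#   >>> sanitize_url('httpss://example.com')
#   'https://example.com'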
758
759
760 def extract_basic_auth(url):
761 parts = urllib.parse.urlsplit(url)
762 if parts.username is None:
763 return url, None
764 url = urllib.parse.urlunsplit(parts._replace(netloc=(
765 parts.hostname if parts.port is None
766 else '%s:%d' % (parts.hostname, parts.port))))
767 auth_payload = base64.b64encode(
768 ('%s:%s' % (parts.username, parts.password or '')).encode())
769 return url, f'Basic {auth_payload.decode()}'
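
# An illustrative sketch (the credentials are hypothetical):
#
#   >>> extract_basic_auth('http://user:pass@example.com/')
#   ('http://example.com/', 'Basic dXNlcjpwYXNz')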
770
771
772 def sanitized_Request(url, *args, **kwargs):
773 url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
774 if auth_header is not None:
775 headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
776 headers['Authorization'] = auth_header
777 return urllib.request.Request(url, *args, **kwargs)
778
779
780 def expand_path(s):
781 """Expand shell variables and ~"""
782 return os.path.expandvars(compat_expanduser(s))
783
784
785 def orderedSet(iterable, *, lazy=False):
786 """Remove all duplicates from the input iterable"""
787 def _iter():
788 seen = [] # Do not use set since the items can be unhashable
789 for x in iterable:
790 if x not in seen:
791 seen.append(x)
792 yield x
793
794 return _iter() if lazy else list(_iter())
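
# An illustrative sketch: first-seen order is preserved, and unhashable items
# (e.g. dicts) are supported:
#
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]
#   >>> orderedSet([{'a': 1}, {'a': 1}])
#   [{'a': 1}]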
795
796
797 def _htmlentity_transform(entity_with_semicolon):
798 """Transforms an HTML entity to a character."""
799 entity = entity_with_semicolon[:-1]
800
801 # Known non-numeric HTML entity
802 if entity in html.entities.name2codepoint:
803 return chr(html.entities.name2codepoint[entity])
804
805 # TODO: HTML5 allows entities without a semicolon.
806 # E.g. '&Eacuteric' should be decoded as 'Éric'.
807 if entity_with_semicolon in html.entities.html5:
808 return html.entities.html5[entity_with_semicolon]
809
810 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
811 if mobj is not None:
812 numstr = mobj.group(1)
813 if numstr.startswith('x'):
814 base = 16
815 numstr = '0%s' % numstr
816 else:
817 base = 10
818 # See https://github.com/ytdl-org/youtube-dl/issues/7518
819 with contextlib.suppress(ValueError):
820 return chr(int(numstr, base))
821
822 # Unknown entity in name, return its literal representation
823 return '&%s;' % entity
824
825
826 def unescapeHTML(s):
827 if s is None:
828 return None
829 assert isinstance(s, str)
830
831 return re.sub(
832 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
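
# An illustrative sketch covering both named and numeric entities:
#
#   >>> unescapeHTML('Paul &amp; Mary &#x27;live&#x27;')
#   "Paul & Mary 'live'"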
833
834
835 def escapeHTML(text):
836 return (
837 text
838 .replace('&', '&amp;')
839 .replace('<', '&lt;')
840 .replace('>', '&gt;')
841 .replace('"', '&quot;')
842 .replace("'", '&#39;')
843 )
844
845
846 def process_communicate_or_kill(p, *args, **kwargs):
847 deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
848 f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
849 return Popen.communicate_or_kill(p, *args, **kwargs)
850
851
852 class Popen(subprocess.Popen):
853 if sys.platform == 'win32':
854 _startupinfo = subprocess.STARTUPINFO()
855 _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
856 else:
857 _startupinfo = None
858
859 @staticmethod
860 def _fix_pyinstaller_ld_path(env):
861 """Restore LD_LIBRARY_PATH when using PyInstaller
862 Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
863 https://github.com/yt-dlp/yt-dlp/issues/4573
864 """
865 if not hasattr(sys, '_MEIPASS'):
866 return
867
868 def _fix(key):
869 orig = env.get(f'{key}_ORIG')
870 if orig is None:
871 env.pop(key, None)
872 else:
873 env[key] = orig
874
875 _fix('LD_LIBRARY_PATH') # Linux
876 _fix('DYLD_LIBRARY_PATH') # macOS
877
878 def __init__(self, *args, env=None, text=False, **kwargs):
879 if env is None:
880 env = os.environ.copy()
881 self._fix_pyinstaller_ld_path(env)
882
883 if text is True:
884 kwargs['universal_newlines'] = True # For 3.6 compatibility
885 kwargs.setdefault('encoding', 'utf-8')
886 kwargs.setdefault('errors', 'replace')
887 super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
888
889 def communicate_or_kill(self, *args, **kwargs):
890 try:
891 return self.communicate(*args, **kwargs)
892 except BaseException: # Including KeyboardInterrupt
893 self.kill(timeout=None)
894 raise
895
896 def kill(self, *, timeout=0):
897 super().kill()
898 if timeout != 0:
899 self.wait(timeout=timeout)
900
901 @classmethod
902 def run(cls, *args, timeout=None, **kwargs):
903 with cls(*args, **kwargs) as proc:
904 default = '' if proc.text_mode else b''
905 stdout, stderr = proc.communicate_or_kill(timeout=timeout)
906 return stdout or default, stderr or default, proc.returncode
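
# A minimal usage sketch of Popen.run (the command is hypothetical):
#
#   stdout, stderr, returncode = Popen.run(
#       ['ffprobe', '-version'], text=True,
#       stdout=subprocess.PIPE, stderr=subprocess.PIPE)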
907
908
909 def get_subprocess_encoding():
910 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
911 # For subprocess calls, encode with locale encoding
912 # Refer to http://stackoverflow.com/a/9951851/35070
913 encoding = preferredencoding()
914 else:
915 encoding = sys.getfilesystemencoding()
916 if encoding is None:
917 encoding = 'utf-8'
918 return encoding
919
920
921 def encodeFilename(s, for_subprocess=False):
922 assert isinstance(s, str)
923 return s
924
925
926 def decodeFilename(b, for_subprocess=False):
927 return b
928
929
930 def encodeArgument(s):
931 # Legacy code that uses byte strings
932 # Uncomment the following line after fixing all post processors
933 # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
934 return s if isinstance(s, str) else s.decode('ascii')
935
936
937 def decodeArgument(b):
938 return b
939
940
941 def decodeOption(optval):
942 if optval is None:
943 return optval
944 if isinstance(optval, bytes):
945 optval = optval.decode(preferredencoding())
946
947 assert isinstance(optval, str)
948 return optval
949
950
951 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
952
953
954 def timetuple_from_msec(msec):
955 secs, msec = divmod(msec, 1000)
956 mins, secs = divmod(secs, 60)
957 hrs, mins = divmod(mins, 60)
958 return _timetuple(hrs, mins, secs, msec)
959
960
961 def formatSeconds(secs, delim=':', msec=False):
962 time = timetuple_from_msec(secs * 1000)
963 if time.hours:
964 ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
965 elif time.minutes:
966 ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
967 else:
968 ret = '%d' % time.seconds
969 return '%s.%03d' % (ret, time.milliseconds) if msec else ret
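
# An illustrative sketch of the two helpers above:
#
#   >>> timetuple_from_msec(3723500)
#   Time(hours=1, minutes=2, seconds=3, milliseconds=500)
#   >>> formatSeconds(3723, msec=True)
#   '1:02:03.000'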
970
971
972 def _ssl_load_windows_store_certs(ssl_context, storename):
973 # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
974 try:
975 certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
976 if encoding == 'x509_asn' and (
977 trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
978 except PermissionError:
979 return
980 for cert in certs:
981 with contextlib.suppress(ssl.SSLError):
982 ssl_context.load_verify_locations(cadata=cert)
983
984
985 def make_HTTPS_handler(params, **kwargs):
986 opts_check_certificate = not params.get('nocheckcertificate')
987 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
988 context.check_hostname = opts_check_certificate
989 if params.get('legacyserverconnect'):
990 context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT
991 # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
992 context.set_ciphers('DEFAULT')
993 elif (
994 sys.version_info < (3, 10)
995 and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
996 and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
997 ):
998 # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
999 # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
1000 # in some situations [2][3].
1001 # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
1002 # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
1003 # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
1004 # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
1005 # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
1006 # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
1007 # 4. https://peps.python.org/pep-0644/
1008 # 5. https://peps.python.org/pep-0644/#libressl-support
1009 # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
1010 context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
1011 context.minimum_version = ssl.TLSVersion.TLSv1_2
1012
1013 context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
1014 if opts_check_certificate:
1015 if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
1016 context.load_verify_locations(cafile=certifi.where())
1017 else:
1018 try:
1019 context.load_default_certs()
1020 # Work around the issue in load_default_certs when there are bad certificates. See:
1021 # https://github.com/yt-dlp/yt-dlp/issues/1060,
1022 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
1023 except ssl.SSLError:
1024 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
1025 if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
1026 for storename in ('CA', 'ROOT'):
1027 _ssl_load_windows_store_certs(context, storename)
1028 context.set_default_verify_paths()
1029
1030 client_certfile = params.get('client_certificate')
1031 if client_certfile:
1032 try:
1033 context.load_cert_chain(
1034 client_certfile, keyfile=params.get('client_certificate_key'),
1035 password=params.get('client_certificate_password'))
1036 except ssl.SSLError:
1037 raise YoutubeDLError('Unable to load client certificate')
1038
1039 # Some servers may reject requests if ALPN extension is not sent. See:
1040 # https://github.com/python/cpython/issues/85140
1041 # https://github.com/yt-dlp/yt-dlp/issues/3878
1042 with contextlib.suppress(NotImplementedError):
1043 context.set_alpn_protocols(['http/1.1'])
1044
1045 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1046
1047
1048 def bug_reports_message(before=';'):
1049 from .update import REPOSITORY
1050
1051 msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
1052 'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')
1053
1054 before = before.rstrip()
1055 if not before or before.endswith(('.', '!', '?')):
1056 msg = msg[0].title() + msg[1:]
1057
1058 return (before + ' ' if before else '') + msg
1059
1060
1061 class YoutubeDLError(Exception):
1062 """Base exception for YoutubeDL errors."""
1063 msg = None
1064
1065 def __init__(self, msg=None):
1066 if msg is not None:
1067 self.msg = msg
1068 elif self.msg is None:
1069 self.msg = type(self).__name__
1070 super().__init__(self.msg)
1071
1072
1073 network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
1074 if hasattr(ssl, 'CertificateError'):
1075 network_exceptions.append(ssl.CertificateError)
1076 network_exceptions = tuple(network_exceptions)
1077
1078
1079 class ExtractorError(YoutubeDLError):
1080 """Error during info extraction."""
1081
1082 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
1083 """ tb, if given, is the original traceback (so that it can be printed out).
1084 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
1085 """
1086 if sys.exc_info()[0] in network_exceptions:
1087 expected = True
1088
1089 self.orig_msg = str(msg)
1090 self.traceback = tb
1091 self.expected = expected
1092 self.cause = cause
1093 self.video_id = video_id
1094 self.ie = ie
1095 self.exc_info = sys.exc_info() # preserve original exception
1096 if isinstance(self.exc_info[1], ExtractorError):
1097 self.exc_info = self.exc_info[1].exc_info
1098 super().__init__(self.__msg)
1099
1100 @property
1101 def __msg(self):
1102 return ''.join((
1103 format_field(self.ie, None, '[%s] '),
1104 format_field(self.video_id, None, '%s: '),
1105 self.orig_msg,
1106 format_field(self.cause, None, ' (caused by %r)'),
1107 '' if self.expected else bug_reports_message()))
1108
1109 def format_traceback(self):
1110 return join_nonempty(
1111 self.traceback and ''.join(traceback.format_tb(self.traceback)),
1112 self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
1113 delim='\n') or None
1114
1115 def __setattr__(self, name, value):
1116 super().__setattr__(name, value)
1117 if getattr(self, 'msg', None) and name not in ('msg', 'args'):
1118 self.msg = self.__msg or type(self).__name__
1119 self.args = (self.msg, ) # Cannot be property
1120
1121
1122 class UnsupportedError(ExtractorError):
1123 def __init__(self, url):
1124 super().__init__(
1125 'Unsupported URL: %s' % url, expected=True)
1126 self.url = url
1127
1128
1129 class RegexNotFoundError(ExtractorError):
1130 """Error when a regex didn't match"""
1131 pass
1132
1133
1134 class GeoRestrictedError(ExtractorError):
1135 """Geographic restriction Error exception.
1136
1137 This exception may be thrown when a video is not available from your
1138 geographic location due to geographic restrictions imposed by a website.
1139 """
1140
1141 def __init__(self, msg, countries=None, **kwargs):
1142 kwargs['expected'] = True
1143 super().__init__(msg, **kwargs)
1144 self.countries = countries
1145
1146
1147 class UserNotLive(ExtractorError):
1148 """Error when a channel/user is not live"""
1149
1150 def __init__(self, msg=None, **kwargs):
1151 kwargs['expected'] = True
1152 super().__init__(msg or 'The channel is not currently live', **kwargs)
1153
1154
1155 class DownloadError(YoutubeDLError):
1156 """Download Error exception.
1157
1158 This exception may be thrown by FileDownloader objects if they are not
1159 configured to continue on errors. They will contain the appropriate
1160 error message.
1161 """
1162
1163 def __init__(self, msg, exc_info=None):
1164 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
1165 super().__init__(msg)
1166 self.exc_info = exc_info
1167
1168
1169 class EntryNotInPlaylist(YoutubeDLError):
1170 """Entry not in playlist exception.
1171
1172 This exception will be thrown by YoutubeDL when a requested entry
1173 is not found in the playlist info_dict
1174 """
1175 msg = 'Entry not found in info'
1176
1177
1178 class SameFileError(YoutubeDLError):
1179 """Same File exception.
1180
1181 This exception will be thrown by FileDownloader objects if they detect
1182 multiple files would have to be downloaded to the same file on disk.
1183 """
1184 msg = 'Fixed output name but more than one file to download'
1185
1186 def __init__(self, filename=None):
1187 if filename is not None:
1188 self.msg += f': {filename}'
1189 super().__init__(self.msg)
1190
1191
1192 class PostProcessingError(YoutubeDLError):
1193 """Post Processing exception.
1194
1195 This exception may be raised by PostProcessor's .run() method to
1196 indicate an error in the postprocessing task.
1197 """
1198
1199
1200 class DownloadCancelled(YoutubeDLError):
1201 """ Exception raised when the download queue should be interrupted """
1202 msg = 'The download was cancelled'
1203
1204
1205 class ExistingVideoReached(DownloadCancelled):
1206 """ --break-on-existing triggered """
1207 msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1208
1209
1210 class RejectedVideoReached(DownloadCancelled):
1211 """ --break-on-reject triggered """
1212 msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1213
1214
1215 class MaxDownloadsReached(DownloadCancelled):
1216 """ --max-downloads limit has been reached. """
1217 msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1218
1219
1220 class ReExtractInfo(YoutubeDLError):
1221 """ Video info needs to be re-extracted. """
1222
1223 def __init__(self, msg, expected=False):
1224 super().__init__(msg)
1225 self.expected = expected
1226
1227
1228 class ThrottledDownload(ReExtractInfo):
1229 """ Download speed below --throttled-rate. """
1230 msg = 'The download speed is below throttle limit'
1231
1232 def __init__(self):
1233 super().__init__(self.msg, expected=False)
1234
1235
1236 class UnavailableVideoError(YoutubeDLError):
1237 """Unavailable Format exception.
1238
1239 This exception will be thrown when a video is requested
1240 in a format that is not available for that video.
1241 """
1242 msg = 'Unable to download video'
1243
1244 def __init__(self, err=None):
1245 if err is not None:
1246 self.msg += f': {err}'
1247 super().__init__(self.msg)
1248
1249
1250 class ContentTooShortError(YoutubeDLError):
1251 """Content Too Short exception.
1252
1253 This exception may be raised by FileDownloader objects when a file they
1254 download is too small for what the server announced first, indicating
1255 the connection was probably interrupted.
1256 """
1257
1258 def __init__(self, downloaded, expected):
1259 super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1260 # Both in bytes
1261 self.downloaded = downloaded
1262 self.expected = expected
1263
1264
1265 class XAttrMetadataError(YoutubeDLError):
1266 def __init__(self, code=None, msg='Unknown error'):
1267 super().__init__(msg)
1268 self.code = code
1269 self.msg = msg
1270
1271 # Parsing code and msg
1272 if (self.code in (errno.ENOSPC, errno.EDQUOT)
1273 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
1274 self.reason = 'NO_SPACE'
1275 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1276 self.reason = 'VALUE_TOO_LONG'
1277 else:
1278 self.reason = 'NOT_SUPPORTED'
1279
1280
1281 class XAttrUnavailableError(YoutubeDLError):
1282 pass
1283
1284
1285 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
1286 hc = http_class(*args, **kwargs)
1287 source_address = ydl_handler._params.get('source_address')
1288
1289 if source_address is not None:
1290 # This works around _create_connection() from socket, which tries all
1291 # address data from getaddrinfo() including IPv6. This filters the results from
1292 # getaddrinfo() based on the source_address value.
1293 # This is based on the cpython socket.create_connection() function.
1294 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1295 def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1296 host, port = address
1297 err = None
1298 addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
1299 af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1300 ip_addrs = [addr for addr in addrs if addr[0] == af]
1301 if addrs and not ip_addrs:
1302 ip_version = 'v4' if af == socket.AF_INET else 'v6'
1303 raise OSError(
1304 "No remote IP%s addresses available for connect, can't use '%s' as source address"
1305 % (ip_version, source_address[0]))
1306 for res in ip_addrs:
1307 af, socktype, proto, canonname, sa = res
1308 sock = None
1309 try:
1310 sock = socket.socket(af, socktype, proto)
1311 if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1312 sock.settimeout(timeout)
1313 sock.bind(source_address)
1314 sock.connect(sa)
1315 err = None # Explicitly break reference cycle
1316 return sock
1317 except OSError as _:
1318 err = _
1319 if sock is not None:
1320 sock.close()
1321 if err is not None:
1322 raise err
1323 else:
1324 raise OSError('getaddrinfo returns an empty list')
1325 if hasattr(hc, '_create_connection'):
1326 hc._create_connection = _create_connection
1327 hc.source_address = (source_address, 0)
1328
1329 return hc
1330
1331
1332 def handle_youtubedl_headers(headers):
1333 filtered_headers = headers
1334
1335 if 'Youtubedl-no-compression' in filtered_headers:
1336 filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
1337 del filtered_headers['Youtubedl-no-compression']
1338
1339 return filtered_headers
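
# An illustrative sketch (the header values are hypothetical):
#
#   >>> handle_youtubedl_headers({'Youtubedl-no-compression': '1', 'Accept-Encoding': 'gzip', 'User-Agent': 'UA'})
#   {'User-Agent': 'UA'}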
1340
1341
1342 class YoutubeDLHandler(urllib.request.HTTPHandler):
1343 """Handler for HTTP requests and responses.
1344
1345 This class, when installed with an OpenerDirector, automatically adds
1346 the standard headers to every HTTP request and handles gzipped and
1347 deflated responses from web servers. If compression is to be avoided in
1348 a particular request, the original request in the program code only has
1349 to include the HTTP header "Youtubedl-no-compression", which will be
1350 removed before making the real request.
1351
1352 Part of this code was copied from:
1353
1354 http://techknack.net/python-urllib2-handlers/
1355
1356 Andrew Rowls, the author of that code, agreed to release it to the
1357 public domain.
1358 """
1359
1360 def __init__(self, params, *args, **kwargs):
1361 urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
1362 self._params = params
1363
1364 def http_open(self, req):
1365 conn_class = http.client.HTTPConnection
1366
1367 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1368 if socks_proxy:
1369 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1370 del req.headers['Ytdl-socks-proxy']
1371
1372 return self.do_open(functools.partial(
1373 _create_http_connection, self, conn_class, False),
1374 req)
1375
1376 @staticmethod
1377 def deflate(data):
1378 if not data:
1379 return data
1380 try:
1381 return zlib.decompress(data, -zlib.MAX_WBITS)
1382 except zlib.error:
1383 return zlib.decompress(data)
1384
1385 @staticmethod
1386 def brotli(data):
1387 if not data:
1388 return data
1389 return brotli.decompress(data)
1390
1391 def http_request(self, req):
1392 # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
1393 # always respected by websites - some tend to give out URLs with non-percent-encoded
1394 # non-ASCII characters (see telemb.py, ard.py [#3412])
1395 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1396 # To work around aforementioned issue we will replace request's original URL with
1397 # percent-encoded one
1398 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1399 # the code of this workaround has been moved here from YoutubeDL.urlopen()
1400 url = req.get_full_url()
1401 url_escaped = escape_url(url)
1402
1403 # Substitute URL if any change after escaping
1404 if url != url_escaped:
1405 req = update_Request(req, url=url_escaped)
1406
1407 for h, v in self._params.get('http_headers', std_headers).items():
1408 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1409 # The dict keys are capitalized by urllib because of this bug
1410 if h.capitalize() not in req.headers:
1411 req.add_header(h, v)
1412
1413 if 'Accept-encoding' not in req.headers:
1414 req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
1415
1416 req.headers = handle_youtubedl_headers(req.headers)
1417
1418 return super().do_request_(req)
1419
1420 def http_response(self, req, resp):
1421 old_resp = resp
1422 # gzip
1423 if resp.headers.get('Content-encoding', '') == 'gzip':
1424 content = resp.read()
1425 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1426 try:
1427 uncompressed = io.BytesIO(gz.read())
1428 except OSError as original_ioerror:
1429 # There may be junk at the end of the file
1430 # See http://stackoverflow.com/q/4928560/35070 for details
1431 for i in range(1, 1024):
1432 try:
1433 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1434 uncompressed = io.BytesIO(gz.read())
1435 except OSError:
1436 continue
1437 break
1438 else:
1439 raise original_ioerror
1440 resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
1441 resp.msg = old_resp.msg
1442 del resp.headers['Content-encoding']
1443 # deflate
1444 if resp.headers.get('Content-encoding', '') == 'deflate':
1445 gz = io.BytesIO(self.deflate(resp.read()))
1446 resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
1447 resp.msg = old_resp.msg
1448 del resp.headers['Content-encoding']
1449 # brotli
1450 if resp.headers.get('Content-encoding', '') == 'br':
1451 resp = urllib.request.addinfourl(
1452 io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
1453 resp.msg = old_resp.msg
1454 del resp.headers['Content-encoding']
1455 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1456 # https://github.com/ytdl-org/youtube-dl/issues/6457).
1457 if 300 <= resp.code < 400:
1458 location = resp.headers.get('Location')
1459 if location:
1460 # Per RFC 2616, the default charset is iso-8859-1, which Python 3 respects
1461 location = location.encode('iso-8859-1').decode()
1462 location_escaped = escape_url(location)
1463 if location != location_escaped:
1464 del resp.headers['Location']
1465 resp.headers['Location'] = location_escaped
1466 return resp
1467
1468 https_request = http_request
1469 https_response = http_response
1470
1471
1472 def make_socks_conn_class(base_class, socks_proxy):
1473 assert issubclass(base_class, (
1474 http.client.HTTPConnection, http.client.HTTPSConnection))
1475
1476 url_components = urllib.parse.urlparse(socks_proxy)
1477 if url_components.scheme.lower() == 'socks5':
1478 socks_type = ProxyType.SOCKS5
1479 elif url_components.scheme.lower() in ('socks', 'socks4'):
1480 socks_type = ProxyType.SOCKS4
1481 elif url_components.scheme.lower() == 'socks4a':
1482 socks_type = ProxyType.SOCKS4A
1483
1484 def unquote_if_non_empty(s):
1485 if not s:
1486 return s
1487 return urllib.parse.unquote_plus(s)
1488
1489 proxy_args = (
1490 socks_type,
1491 url_components.hostname, url_components.port or 1080,
1492 True, # Remote DNS
1493 unquote_if_non_empty(url_components.username),
1494 unquote_if_non_empty(url_components.password),
1495 )
1496
1497 class SocksConnection(base_class):
1498 def connect(self):
1499 self.sock = sockssocket()
1500 self.sock.setproxy(*proxy_args)
1501 if isinstance(self.timeout, (int, float)):
1502 self.sock.settimeout(self.timeout)
1503 self.sock.connect((self.host, self.port))
1504
1505 if isinstance(self, http.client.HTTPSConnection):
1506 if hasattr(self, '_context'): # Python > 2.6
1507 self.sock = self._context.wrap_socket(
1508 self.sock, server_hostname=self.host)
1509 else:
1510 self.sock = ssl.wrap_socket(self.sock)
1511
1512 return SocksConnection
1513
1514
1515 class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
1516 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1517 urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
1518 self._https_conn_class = https_conn_class or http.client.HTTPSConnection
1519 self._params = params
1520
1521 def https_open(self, req):
1522 kwargs = {}
1523 conn_class = self._https_conn_class
1524
1525 if hasattr(self, '_context'): # python > 2.6
1526 kwargs['context'] = self._context
1527 if hasattr(self, '_check_hostname'): # python 3.x
1528 kwargs['check_hostname'] = self._check_hostname
1529
1530 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1531 if socks_proxy:
1532 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1533 del req.headers['Ytdl-socks-proxy']
1534
1535 try:
1536 return self.do_open(
1537 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1538 except urllib.error.URLError as e:
1539 if (isinstance(e.reason, ssl.SSLError)
1540 and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1541 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1542 raise
1543
1544
1545 def is_path_like(f):
1546 return isinstance(f, (str, bytes, os.PathLike))
1547
1548
1549 class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
1550 """
1551 See [1] for cookie file format.
1552
1553 1. https://curl.haxx.se/docs/http-cookies.html
1554 """
1555 _HTTPONLY_PREFIX = '#HttpOnly_'
1556 _ENTRY_LEN = 7
1557 _HEADER = '''# Netscape HTTP Cookie File
1558 # This file is generated by yt-dlp. Do not edit.
1559
1560 '''
1561 _CookieFileEntry = collections.namedtuple(
1562 'CookieFileEntry',
1563 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
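
# Each data line in a cookies file consists of the seven tab-separated fields
# above, e.g. (an illustrative, hypothetical entry, with <TAB> for a literal tab):
# .example.com<TAB>TRUE<TAB>/<TAB>FALSE<TAB>0<TAB>session_id<TAB>abcdef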
1564
1565 def __init__(self, filename=None, *args, **kwargs):
1566 super().__init__(None, *args, **kwargs)
1567 if is_path_like(filename):
1568 filename = os.fspath(filename)
1569 self.filename = filename
1570
1571 @staticmethod
1572 def _true_or_false(cndn):
1573 return 'TRUE' if cndn else 'FALSE'
1574
1575 @contextlib.contextmanager
1576 def open(self, file, *, write=False):
1577 if is_path_like(file):
1578 with open(file, 'w' if write else 'r', encoding='utf-8') as f:
1579 yield f
1580 else:
1581 if write:
1582 file.truncate(0)
1583 yield file
1584
1585 def _really_save(self, f, ignore_discard=False, ignore_expires=False):
1586 now = time.time()
1587 for cookie in self:
1588 if (not ignore_discard and cookie.discard
1589 or not ignore_expires and cookie.is_expired(now)):
1590 continue
1591 name, value = cookie.name, cookie.value
1592 if value is None:
1593 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1594 # with no name, whereas http.cookiejar regards it as a
1595 # cookie with no value.
1596 name, value = '', name
1597 f.write('%s\n' % '\t'.join((
1598 cookie.domain,
1599 self._true_or_false(cookie.domain.startswith('.')),
1600 cookie.path,
1601 self._true_or_false(cookie.secure),
1602 str_or_none(cookie.expires, default=''),
1603 name, value
1604 )))
1605
1606 def save(self, filename=None, *args, **kwargs):
1607 """
1608 Save cookies to a file.
1609 Code is taken from CPython 3.6
1610 https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
1611
1612 if filename is None:
1613 if self.filename is not None:
1614 filename = self.filename
1615 else:
1616 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
1617
1618 # Store session cookies with `expires` set to 0 instead of an empty string
1619 for cookie in self:
1620 if cookie.expires is None:
1621 cookie.expires = 0
1622
1623 with self.open(filename, write=True) as f:
1624 f.write(self._HEADER)
1625 self._really_save(f, *args, **kwargs)
1626
1627 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
1628 """Load cookies from a file."""
1629 if filename is None:
1630 if self.filename is not None:
1631 filename = self.filename
1632 else:
1633 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
1634
1635 def prepare_line(line):
1636 if line.startswith(self._HTTPONLY_PREFIX):
1637 line = line[len(self._HTTPONLY_PREFIX):]
1638 # comments and empty lines are fine
1639 if line.startswith('#') or not line.strip():
1640 return line
1641 cookie_list = line.split('\t')
1642 if len(cookie_list) != self._ENTRY_LEN:
1643 raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
1644 cookie = self._CookieFileEntry(*cookie_list)
1645 if cookie.expires_at and not cookie.expires_at.isdigit():
1646 raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
1647 return line
1648
1649 cf = io.StringIO()
1650 with self.open(filename) as f:
1651 for line in f:
1652 try:
1653 cf.write(prepare_line(line))
1654 except http.cookiejar.LoadError as e:
1655 if f'{line.strip()} '[0] in '[{"':
1656 raise http.cookiejar.LoadError(
1657 'Cookies file must be Netscape formatted, not JSON. See '
1658 'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
1659 write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
1660 continue
1661 cf.seek(0)
1662 self._really_load(cf, filename, ignore_discard, ignore_expires)
1663 # Session cookies are denoted by the `expires` field set to either
1664 # an empty string or 0. MozillaCookieJar only recognizes the former
1665 # (see [1]), so we need to force the latter to be recognized as session
1666 # cookies on our own.
1667 # Session cookies may be important for cookie-based authentication;
1668 # e.g. when a user does not check the 'Remember me' box while logging
1669 # in on a site, some important cookies are stored as session cookies,
1670 # and failing to recognize them will result in a failed login.
1671 # 1. https://bugs.python.org/issue17164
1672 for cookie in self:
1673 # Treat `expires=0` cookies as session cookies
1674 if cookie.expires == 0:
1675 cookie.expires = None
1676 cookie.discard = True
1677
1678
1679 class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
1680 def __init__(self, cookiejar=None):
1681 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
1682
1683 def http_response(self, request, response):
1684 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
1685
1686 https_request = urllib.request.HTTPCookieProcessor.http_request
1687 https_response = http_response
1688
1689
1690 class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
1691 """YoutubeDL redirect handler
1692
1693 The code is based on HTTPRedirectHandler implementation from CPython [1].
1694
1695 This redirect handler solves two issues:
1696 - ensures redirect URL is always unicode under python 2
1697 - introduces support for the (now standardized) HTTP response status code
1698 308 Permanent Redirect [2] used by some sites [3]
1699
1700 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1701 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1702 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1703 """
1704
1705 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
1706
1707 def redirect_request(self, req, fp, code, msg, headers, newurl):
1708 """Return a Request or None in response to a redirect.
1709
1710 This is called by the http_error_30x methods when a
1711 redirection response is received. If a redirection should
1712 take place, return a new Request to allow http_error_30x to
1713 perform the redirect. Otherwise, raise HTTPError if no-one
1714 else should try to handle this url. Return None if you can't
1715 but another Handler might.
1716 """
1717 m = req.get_method()
1718 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1719 or code in (301, 302, 303) and m == "POST")):
1720 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
1721 # Strictly (according to RFC 2616), 301 or 302 in response to
1722 # a POST MUST NOT cause a redirection without confirmation
1723 # from the user (of urllib.request, in this case). In practice,
1724 # essentially all clients do redirect in this case, so we do
1725 # the same.
1726
1727 # Be conciliant with URIs containing a space. This is mainly
1728 # redundant with the more complete encoding done in http_error_302(),
1729 # but it is kept for compatibility with other callers.
1730 newurl = newurl.replace(' ', '%20')
1731
1732 CONTENT_HEADERS = ("content-length", "content-type")
1733 # Strip the content headers, since the redirected request may not carry the original body
1734 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1735
1736 # A 303 must either use GET or HEAD for subsequent request
1737 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1738 if code == 303 and m != 'HEAD':
1739 m = 'GET'
1740 # 301 and 302 redirects are commonly turned into a GET from a POST
1741 # for subsequent requests by browsers, so we'll do the same.
1742 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1743 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1744 if code in (301, 302) and m == 'POST':
1745 m = 'GET'
1746
1747 return urllib.request.Request(
1748 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1749 unverifiable=True, method=m)
1750
1751
1752 def extract_timezone(date_str):
1753 m = re.search(
1754 r'''(?x)
1755 ^.{8,}? # >=8 char non-TZ prefix, if present
1756 (?P<tz>Z| # just the UTC Z, or
1757 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1758 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1759 [ ]? # optional space
1760 (?P<sign>\+|-) # +/-
1761 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1762 $)
1763 ''', date_str)
1764 if not m:
1765 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1766 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1767 if timezone is not None:
1768 date_str = date_str[:-len(m.group('tz'))]
1769 timezone = datetime.timedelta(hours=timezone or 0)
1770 else:
1771 date_str = date_str[:-len(m.group('tz'))]
1772 if not m.group('sign'):
1773 timezone = datetime.timedelta()
1774 else:
1775 sign = 1 if m.group('sign') == '+' else -1
1776 timezone = datetime.timedelta(
1777 hours=sign * int(m.group('hours')),
1778 minutes=sign * int(m.group('minutes')))
1779 return timezone, date_str
1780
1781
1782 def parse_iso8601(date_str, delimiter='T', timezone=None):
1783 """ Return a UNIX timestamp from the given date """
1784
1785 if date_str is None:
1786 return None
1787
1788 date_str = re.sub(r'\.[0-9]+', '', date_str)
1789
1790 if timezone is None:
1791 timezone, date_str = extract_timezone(date_str)
1792
1793 with contextlib.suppress(ValueError):
1794 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1795 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1796 return calendar.timegm(dt.timetuple())
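
# Editor's illustration (not in the original source):
#   parse_iso8601('2023-01-15T10:00:00Z')  # -> 1673776800
#   parse_iso8601('2023-01-15T10:00:00+02:00')  # -> 1673769600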
1797
1798
1799 def date_formats(day_first=True):
1800 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1801
1802
1803 def unified_strdate(date_str, day_first=True):
1804 """Return a string with the date in the format YYYYMMDD"""
1805
1806 if date_str is None:
1807 return None
1808 upload_date = None
1809 # Replace commas
1810 date_str = date_str.replace(',', ' ')
1811 # Remove AM/PM + timezone
1812 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1813 _, date_str = extract_timezone(date_str)
1814
1815 for expression in date_formats(day_first):
1816 with contextlib.suppress(ValueError):
1817 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1818 if upload_date is None:
1819 timetuple = email.utils.parsedate_tz(date_str)
1820 if timetuple:
1821 with contextlib.suppress(ValueError):
1822 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1823 if upload_date is not None:
1824 return str(upload_date)
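
# Editor's illustration (not in the original source; the output assumes the
# DATE_FORMATS tables defined earlier in this module):
#   unified_strdate('Dec 14, 2012')  # -> '20121214'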
1825
1826
1827 def unified_timestamp(date_str, day_first=True):
1828 if date_str is None:
1829 return None
1830
1831 date_str = re.sub(r'\s+', ' ', re.sub(
1832 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
1833
1834 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1835 timezone, date_str = extract_timezone(date_str)
1836
1837 # Remove AM/PM + timezone
1838 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1839
1840 # Remove unrecognized timezones from ISO 8601-like timestamps
1841 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1842 if m:
1843 date_str = date_str[:-len(m.group('tz'))]
1844
1845 # Python only supports microseconds, so remove nanoseconds
1846 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1847 if m:
1848 date_str = m.group(1)
1849
1850 for expression in date_formats(day_first):
1851 with contextlib.suppress(ValueError):
1852 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1853 return calendar.timegm(dt.timetuple())
1854
1855 timetuple = email.utils.parsedate_tz(date_str)
1856 if timetuple:
1857 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
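
# Editor's illustration (not in the original source; assumes the
# TIMEZONE_NAMES and DATE_FORMATS tables defined earlier in this module):
#   unified_timestamp('2023-01-15 10:00:00 UTC')  # -> 1673776800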
1858
1859
1860 def determine_ext(url, default_ext='unknown_video'):
1861 if url is None or '.' not in url:
1862 return default_ext
1863 guess = url.partition('?')[0].rpartition('.')[2]
1864 if re.match(r'^[A-Za-z0-9]+$', guess):
1865 return guess
1866 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1867 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1868 return guess.rstrip('/')
1869 else:
1870 return default_ext
1871
1872
1873 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1874 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1875
1876
1877 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1878 R"""
1879 Return a datetime object from a string.
1880 Supported format:
1881 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1882
1883 @param format strftime format of DATE
1884 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1885 auto: round to the unit provided in date_str (if applicable).
1886 """
1887 auto_precision = False
1888 if precision == 'auto':
1889 auto_precision = True
1890 precision = 'microsecond'
1891 today = datetime_round(datetime.datetime.utcnow(), precision)
1892 if date_str in ('now', 'today'):
1893 return today
1894 if date_str == 'yesterday':
1895 return today - datetime.timedelta(days=1)
1896 match = re.match(
1897 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1898 date_str)
1899 if match is not None:
1900 start_time = datetime_from_str(match.group('start'), precision, format)
1901 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1902 unit = match.group('unit')
1903 if unit == 'month' or unit == 'year':
1904 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1905 unit = 'day'
1906 else:
1907 if unit == 'week':
1908 unit = 'day'
1909 time *= 7
1910 delta = datetime.timedelta(**{unit + 's': time})
1911 new_date = start_time + delta
1912 if auto_precision:
1913 return datetime_round(new_date, unit)
1914 return new_date
1915
1916 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1917
1918
1919 def date_from_str(date_str, format='%Y%m%d', strict=False):
1920 R"""
1921 Return a date object from a string using datetime_from_str
1922
1923 @param strict Restrict allowed patterns to "YYYYMMDD" and
1924 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1925 """
1926 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1927 raise ValueError(f'Invalid date format "{date_str}"')
1928 return datetime_from_str(date_str, precision='microsecond', format=format).date()
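
# Editor's illustration (not in the original source):
#   date_from_str('today-1week')  # -> the date exactly one week ago (UTC)
#   date_from_str('20230115')  # -> datetime.date(2023, 1, 15)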
1929
1930
1931 def datetime_add_months(dt, months):
1932 """Increment/Decrement a datetime object by months."""
1933 month = dt.month + months - 1
1934 year = dt.year + month // 12
1935 month = month % 12 + 1
1936 day = min(dt.day, calendar.monthrange(year, month)[1])
1937 return dt.replace(year, month, day)
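
# Editor's illustration (not in the original source) - the day is clamped
# to the length of the target month:
#   datetime_add_months(datetime.datetime(2020, 1, 31), 1)
#   # -> datetime.datetime(2020, 2, 29, 0, 0)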
1938
1939
1940 def datetime_round(dt, precision='day'):
1941 """
1942 Round a datetime object's time to a specific precision
1943 """
1944 if precision == 'microsecond':
1945 return dt
1946
1947 unit_seconds = {
1948 'day': 86400,
1949 'hour': 3600,
1950 'minute': 60,
1951 'second': 1,
1952 }
1953 roundto = lambda x, n: ((x + n / 2) // n) * n
1954 timestamp = calendar.timegm(dt.timetuple())
1955 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1956
1957
1958 def hyphenate_date(date_str):
1959 """
1960 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1961 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1962 if match is not None:
1963 return '-'.join(match.groups())
1964 else:
1965 return date_str
1966
1967
1968 class DateRange:
1969 """Represents a time interval between two dates"""
1970
1971 def __init__(self, start=None, end=None):
1972 """start and end must be strings in the format accepted by date"""
1973 if start is not None:
1974 self.start = date_from_str(start, strict=True)
1975 else:
1976 self.start = datetime.datetime.min.date()
1977 if end is not None:
1978 self.end = date_from_str(end, strict=True)
1979 else:
1980 self.end = datetime.datetime.max.date()
1981 if self.start > self.end:
1982 raise ValueError('Date range: "%s", the start date must be before the end date' % self)
1983
1984 @classmethod
1985 def day(cls, day):
1986 """Returns a range that only contains the given day"""
1987 return cls(day, day)
1988
1989 def __contains__(self, date):
1990 """Check if the date is in the range"""
1991 if not isinstance(date, datetime.date):
1992 date = date_from_str(date)
1993 return self.start <= date <= self.end
1994
1995 def __str__(self):
1996 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1997
1998 def __eq__(self, other):
1999 return (isinstance(other, DateRange)
2000 and self.start == other.start and self.end == other.end)
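
# Editor's illustration (not in the original source):
#   '20230115' in DateRange('20230101', '20230131')  # -> True
#   '20230201' in DateRange('20230101', '20230131')  # -> False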
2001
2002
2003 def platform_name():
2004 """ Returns the platform name as a str """
2005 deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead')
2006 return platform.platform()
2007
2008
2009 @functools.cache
2010 def system_identifier():
2011 python_implementation = platform.python_implementation()
2012 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
2013 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
2014 libc_ver = []
2015 with contextlib.suppress(OSError): # We may not have access to the executable
2016 libc_ver = platform.libc_ver()
2017
2018 return 'Python %s (%s %s %s) - %s (%s%s)' % (
2019 platform.python_version(),
2020 python_implementation,
2021 platform.machine(),
2022 platform.architecture()[0],
2023 platform.platform(),
2024 ssl.OPENSSL_VERSION,
2025 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
2026 )
2027
2028
2029 @functools.cache
2030 def get_windows_version():
2031 ''' Get the Windows version. Returns () if not running on Windows '''
2032 if compat_os_name == 'nt':
2033 return version_tuple(platform.win32_ver()[1])
2034 else:
2035 return ()
2036
2037
2038 def write_string(s, out=None, encoding=None):
2039 assert isinstance(s, str)
2040 out = out or sys.stderr
2041
2042 if compat_os_name == 'nt' and supports_terminal_sequences(out):
2043 s = re.sub(r'([\r\n]+)', r' \1', s)
2044
2045 enc, buffer = None, out
2046 if 'b' in getattr(out, 'mode', ''):
2047 enc = encoding or preferredencoding()
2048 elif hasattr(out, 'buffer'):
2049 buffer = out.buffer
2050 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2051
2052 buffer.write(s.encode(enc, 'ignore') if enc else s)
2053 out.flush()
2054
2055
2056 def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
2057 from . import _IN_CLI
2058 if _IN_CLI:
2059 if msg in deprecation_warning._cache:
2060 return
2061 deprecation_warning._cache.add(msg)
2062 if printer:
2063 return printer(f'{msg}{bug_reports_message()}', **kwargs)
2064 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
2065 else:
2066 import warnings
2067 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
2068
2069
2070 deprecation_warning._cache = set()
2071
2072
2073 def bytes_to_intlist(bs):
2074 if not bs:
2075 return []
2076 if isinstance(bs[0], int): # Python 3
2077 return list(bs)
2078 else:
2079 return [ord(c) for c in bs]
2080
2081
2082 def intlist_to_bytes(xs):
2083 if not xs:
2084 return b''
2085 return struct.pack('%dB' % len(xs), *xs)
2086
2087
2088 class LockingUnsupportedError(OSError):
2089 msg = 'File locking is not supported'
2090
2091 def __init__(self):
2092 super().__init__(self.msg)
2093
2094
2095 # Cross-platform file locking
2096 if sys.platform == 'win32':
2097 import ctypes
2098 import ctypes.wintypes
2099 import msvcrt
2100
2101 class OVERLAPPED(ctypes.Structure):
2102 _fields_ = [
2103 ('Internal', ctypes.wintypes.LPVOID),
2104 ('InternalHigh', ctypes.wintypes.LPVOID),
2105 ('Offset', ctypes.wintypes.DWORD),
2106 ('OffsetHigh', ctypes.wintypes.DWORD),
2107 ('hEvent', ctypes.wintypes.HANDLE),
2108 ]
2109
2110 kernel32 = ctypes.windll.kernel32
2111 LockFileEx = kernel32.LockFileEx
2112 LockFileEx.argtypes = [
2113 ctypes.wintypes.HANDLE, # hFile
2114 ctypes.wintypes.DWORD, # dwFlags
2115 ctypes.wintypes.DWORD, # dwReserved
2116 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2117 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2118 ctypes.POINTER(OVERLAPPED) # Overlapped
2119 ]
2120 LockFileEx.restype = ctypes.wintypes.BOOL
2121 UnlockFileEx = kernel32.UnlockFileEx
2122 UnlockFileEx.argtypes = [
2123 ctypes.wintypes.HANDLE, # hFile
2124 ctypes.wintypes.DWORD, # dwReserved
2125 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2126 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2127 ctypes.POINTER(OVERLAPPED) # Overlapped
2128 ]
2129 UnlockFileEx.restype = ctypes.wintypes.BOOL
2130 whole_low = 0xffffffff
2131 whole_high = 0x7fffffff
2132
2133 def _lock_file(f, exclusive, block):
2134 overlapped = OVERLAPPED()
2135 overlapped.Offset = 0
2136 overlapped.OffsetHigh = 0
2137 overlapped.hEvent = 0
2138 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2139
2140 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2141 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2142 0, whole_low, whole_high, f._lock_file_overlapped_p):
2143 # NB: the no-argument form of "ctypes.FormatError" does not work on PyPy
2144 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2145
2146 def _unlock_file(f):
2147 assert f._lock_file_overlapped_p
2148 handle = msvcrt.get_osfhandle(f.fileno())
2149 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2150 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2151
2152 else:
2153 try:
2154 import fcntl
2155
2156 def _lock_file(f, exclusive, block):
2157 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2158 if not block:
2159 flags |= fcntl.LOCK_NB
2160 try:
2161 fcntl.flock(f, flags)
2162 except BlockingIOError:
2163 raise
2164 except OSError: # AOSP does not have flock()
2165 fcntl.lockf(f, flags)
2166
2167 def _unlock_file(f):
2168 try:
2169 fcntl.flock(f, fcntl.LOCK_UN)
2170 except OSError:
2171 fcntl.lockf(f, fcntl.LOCK_UN)
2172
2173 except ImportError:
2174
2175 def _lock_file(f, exclusive, block):
2176 raise LockingUnsupportedError()
2177
2178 def _unlock_file(f):
2179 raise LockingUnsupportedError()
2180
2181
2182 class locked_file:
2183 locked = False
2184
2185 def __init__(self, filename, mode, block=True, encoding=None):
2186 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2187 raise NotImplementedError(mode)
2188 self.mode, self.block = mode, block
2189
2190 writable = any(f in mode for f in 'wax+')
2191 readable = any(f in mode for f in 'r+')
2192 flags = functools.reduce(operator.ior, (
2193 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2194 getattr(os, 'O_BINARY', 0), # Windows only
2195 getattr(os, 'O_NOINHERIT', 0), # Windows only
2196 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2197 os.O_APPEND if 'a' in mode else 0,
2198 os.O_EXCL if 'x' in mode else 0,
2199 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2200 ))
2201
2202 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2203
2204 def __enter__(self):
2205 exclusive = 'r' not in self.mode
2206 try:
2207 _lock_file(self.f, exclusive, self.block)
2208 self.locked = True
2209 except OSError:
2210 self.f.close()
2211 raise
2212 if 'w' in self.mode:
2213 try:
2214 self.f.truncate()
2215 except OSError as e:
2216 if e.errno not in (
2217 errno.ESPIPE, # Illegal seek - expected for FIFO
2218 errno.EINVAL, # Invalid argument - expected for /dev/null
2219 ):
2220 raise
2221 return self
2222
2223 def unlock(self):
2224 if not self.locked:
2225 return
2226 try:
2227 _unlock_file(self.f)
2228 finally:
2229 self.locked = False
2230
2231 def __exit__(self, *_):
2232 try:
2233 self.unlock()
2234 finally:
2235 self.f.close()
2236
2237 open = __enter__
2238 close = __exit__
2239
2240 def __getattr__(self, attr):
2241 return getattr(self.f, attr)
2242
2243 def __iter__(self):
2244 return iter(self.f)
2245
2246
2247 @functools.cache
2248 def get_filesystem_encoding():
2249 encoding = sys.getfilesystemencoding()
2250 return encoding if encoding is not None else 'utf-8'
2251
2252
2253 def shell_quote(args):
2254 quoted_args = []
2255 encoding = get_filesystem_encoding()
2256 for a in args:
2257 if isinstance(a, bytes):
2258 # We may get a filename encoded with 'encodeFilename'
2259 a = a.decode(encoding)
2260 quoted_args.append(compat_shlex_quote(a))
2261 return ' '.join(quoted_args)
2262
2263
2264 def smuggle_url(url, data):
2265 """ Pass additional data in a URL for internal use. """
2266
2267 url, idata = unsmuggle_url(url, {})
2268 data.update(idata)
2269 sdata = urllib.parse.urlencode(
2270 {'__youtubedl_smuggle': json.dumps(data)})
2271 return url + '#' + sdata
2272
2273
2274 def unsmuggle_url(smug_url, default=None):
2275 if '#__youtubedl_smuggle' not in smug_url:
2276 return smug_url, default
2277 url, _, sdata = smug_url.rpartition('#')
2278 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2279 data = json.loads(jsond)
2280 return url, data
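
# Editor's illustration (not in the original source) - the two functions
# round-trip via a JSON payload in the URL fragment:
#   url = smuggle_url('https://example.com/video', {'referer': 'x'})
#   unsmuggle_url(url)  # -> ('https://example.com/video', {'referer': 'x'})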
2281
2282
2283 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2284 """ Formats numbers with decimal sufixes like K, M, etc """
2285 num, factor = float_or_none(num), float(factor)
2286 if num is None or num < 0:
2287 return None
2288 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2289 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2290 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2291 if factor == 1024:
2292 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2293 converted = num / (factor ** exponent)
2294 return fmt % (converted, suffix)
2295
2296
2297 def format_bytes(bytes):
2298 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
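
# Editor's illustration (not in the original source):
#   format_decimal_suffix(12345, '%d%s')  # -> '12k'
#   format_bytes(1536)  # -> '1.50KiB'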
2299
2300
2301 def lookup_unit_table(unit_table, s, strict=False):
2302 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
2303 units_re = '|'.join(re.escape(u) for u in unit_table)
2304 m = (re.fullmatch if strict else re.match)(
2305 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
2306 if not m:
2307 return None
2308
2309 num = float(m.group('num').replace(',', '.'))
2310 mult = unit_table[m.group('unit')]
2311 return round(num * mult)
2312
2313
2314 def parse_bytes(s):
2315 """Parse a string indicating a byte quantity into an integer"""
2316 return lookup_unit_table(
2317 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
2318 s.upper(), strict=True)
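
# Editor's illustration (not in the original source; assumes the NUMBER_RE
# pattern defined earlier in this module):
#   parse_bytes('500K')  # -> 512000
#   parse_bytes('1.5M')  # -> 1572864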
2319
2320
2321 def parse_filesize(s):
2322 if s is None:
2323 return None
2324
2325 # The lower-case forms are of course incorrect and unofficial,
2326 # but we support those too
2327 _UNIT_TABLE = {
2328 'B': 1,
2329 'b': 1,
2330 'bytes': 1,
2331 'KiB': 1024,
2332 'KB': 1000,
2333 'kB': 1024,
2334 'Kb': 1000,
2335 'kb': 1000,
2336 'kilobytes': 1000,
2337 'kibibytes': 1024,
2338 'MiB': 1024 ** 2,
2339 'MB': 1000 ** 2,
2340 'mB': 1024 ** 2,
2341 'Mb': 1000 ** 2,
2342 'mb': 1000 ** 2,
2343 'megabytes': 1000 ** 2,
2344 'mebibytes': 1024 ** 2,
2345 'GiB': 1024 ** 3,
2346 'GB': 1000 ** 3,
2347 'gB': 1024 ** 3,
2348 'Gb': 1000 ** 3,
2349 'gb': 1000 ** 3,
2350 'gigabytes': 1000 ** 3,
2351 'gibibytes': 1024 ** 3,
2352 'TiB': 1024 ** 4,
2353 'TB': 1000 ** 4,
2354 'tB': 1024 ** 4,
2355 'Tb': 1000 ** 4,
2356 'tb': 1000 ** 4,
2357 'terabytes': 1000 ** 4,
2358 'tebibytes': 1024 ** 4,
2359 'PiB': 1024 ** 5,
2360 'PB': 1000 ** 5,
2361 'pB': 1024 ** 5,
2362 'Pb': 1000 ** 5,
2363 'pb': 1000 ** 5,
2364 'petabytes': 1000 ** 5,
2365 'pebibytes': 1024 ** 5,
2366 'EiB': 1024 ** 6,
2367 'EB': 1000 ** 6,
2368 'eB': 1024 ** 6,
2369 'Eb': 1000 ** 6,
2370 'eb': 1000 ** 6,
2371 'exabytes': 1000 ** 6,
2372 'exbibytes': 1024 ** 6,
2373 'ZiB': 1024 ** 7,
2374 'ZB': 1000 ** 7,
2375 'zB': 1024 ** 7,
2376 'Zb': 1000 ** 7,
2377 'zb': 1000 ** 7,
2378 'zettabytes': 1000 ** 7,
2379 'zebibytes': 1024 ** 7,
2380 'YiB': 1024 ** 8,
2381 'YB': 1000 ** 8,
2382 'yB': 1024 ** 8,
2383 'Yb': 1000 ** 8,
2384 'yb': 1000 ** 8,
2385 'yottabytes': 1000 ** 8,
2386 'yobibytes': 1024 ** 8,
2387 }
2388
2389 return lookup_unit_table(_UNIT_TABLE, s)
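
# Editor's illustration (not in the original source) - note the distinction
# between binary and decimal units:
#   parse_filesize('1.5 GiB')  # -> 1610612736
#   parse_filesize('500 KB')  # -> 500000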
2390
2391
2392 def parse_count(s):
2393 if s is None:
2394 return None
2395
2396 s = re.sub(r'^[^\d]+\s', '', s).strip()
2397
2398 if re.match(r'^[\d,.]+$', s):
2399 return str_to_int(s)
2400
2401 _UNIT_TABLE = {
2402 'k': 1000,
2403 'K': 1000,
2404 'm': 1000 ** 2,
2405 'M': 1000 ** 2,
2406 'kk': 1000 ** 2,
2407 'KK': 1000 ** 2,
2408 'b': 1000 ** 3,
2409 'B': 1000 ** 3,
2410 }
2411
2412 ret = lookup_unit_table(_UNIT_TABLE, s)
2413 if ret is not None:
2414 return ret
2415
2416 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2417 if mobj:
2418 return str_to_int(mobj.group(1))
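
# Editor's illustration (not in the original source):
#   parse_count('1.2M')  # -> 1200000
#   parse_count('10,500 views')  # -> 10500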
2419
2420
2421 def parse_resolution(s, *, lenient=False):
2422 if s is None:
2423 return {}
2424
2425 if lenient:
2426 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2427 else:
2428 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2429 if mobj:
2430 return {
2431 'width': int(mobj.group('w')),
2432 'height': int(mobj.group('h')),
2433 }
2434
2435 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2436 if mobj:
2437 return {'height': int(mobj.group(1))}
2438
2439 mobj = re.search(r'\b([48])[kK]\b', s)
2440 if mobj:
2441 return {'height': int(mobj.group(1)) * 540}
2442
2443 return {}
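
# Editor's illustration (not in the original source):
#   parse_resolution('1920x1080')  # -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')  # -> {'height': 720}
#   parse_resolution('4k')  # -> {'height': 2160}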
2444
2445
2446 def parse_bitrate(s):
2447 if not isinstance(s, str):
2448 return
2449 mobj = re.search(r'\b(\d+)\s*kbps', s)
2450 if mobj:
2451 return int(mobj.group(1))
2452
2453
2454 def month_by_name(name, lang='en'):
2455 """ Return the number of a month by (locale-independently) English name """
2456
2457 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2458
2459 try:
2460 return month_names.index(name) + 1
2461 except ValueError:
2462 return None
2463
2464
2465 def month_by_abbreviation(abbrev):
2466 """ Return the number of a month by (locale-independently) English
2467 abbreviations """
2468
2469 try:
2470 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2471 except ValueError:
2472 return None
2473
2474
2475 def fix_xml_ampersands(xml_str):
2476 """Replace all the '&' by '&amp;' in XML"""
2477 return re.sub(
2478 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2479 '&amp;',
2480 xml_str)
2481
2482
2483 def setproctitle(title):
2484 assert isinstance(title, str)
2485
2486 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2487 try:
2488 import ctypes
2489 except ImportError:
2490 return
2491
2492 try:
2493 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2494 except OSError:
2495 return
2496 except TypeError:
2497 # LoadLibrary in Windows Python 2.7.13 only expects
2498 # a bytestring, but since unicode_literals turns
2499 # every string into a unicode string, it fails.
2500 return
2501 title_bytes = title.encode()
2502 buf = ctypes.create_string_buffer(len(title_bytes))
2503 buf.value = title_bytes
2504 try:
2505 libc.prctl(15, buf, 0, 0, 0)
2506 except AttributeError:
2507 return # Strange libc, just skip this
2508
2509
2510 def remove_start(s, start):
2511 return s[len(start):] if s is not None and s.startswith(start) else s
2512
2513
2514 def remove_end(s, end):
2515 return s[:-len(end)] if s is not None and s.endswith(end) else s
2516
2517
2518 def remove_quotes(s):
2519 if s is None or len(s) < 2:
2520 return s
2521 for quote in ('"', "'", ):
2522 if s[0] == quote and s[-1] == quote:
2523 return s[1:-1]
2524 return s
2525
2526
2527 def get_domain(url):
2528 """
2529 This implementation is inconsistent, but is kept for compatibility.
2530 Use this only for "webpage_url_domain"
2531 """
2532 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
2533
2534
2535 def url_basename(url):
2536 path = urllib.parse.urlparse(url).path
2537 return path.strip('/').split('/')[-1]
2538
2539
2540 def base_url(url):
2541 return re.match(r'https?://[^?#]+/', url).group()
2542
2543
2544 def urljoin(base, path):
2545 if isinstance(path, bytes):
2546 path = path.decode()
2547 if not isinstance(path, str) or not path:
2548 return None
2549 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2550 return path
2551 if isinstance(base, bytes):
2552 base = base.decode()
2553 if not isinstance(base, str) or not re.match(
2554 r'^(?:https?:)?//', base):
2555 return None
2556 return urllib.parse.urljoin(base, path)
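
# Editor's illustration (not in the original source):
#   urljoin('https://example.com/a/', 'b.mp4')  # -> 'https://example.com/a/b.mp4'
#   urljoin('https://example.com/a/', '//cdn.example.com/b.mp4')  # -> '//cdn.example.com/b.mp4'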
2557
2558
2559 class HEADRequest(urllib.request.Request):
2560 def get_method(self):
2561 return 'HEAD'
2562
2563
2564 class PUTRequest(urllib.request.Request):
2565 def get_method(self):
2566 return 'PUT'
2567
2568
2569 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2570 if get_attr and v is not None:
2571 v = getattr(v, get_attr, None)
2572 try:
2573 return int(v) * invscale // scale
2574 except (ValueError, TypeError, OverflowError):
2575 return default
2576
2577
2578 def str_or_none(v, default=None):
2579 return default if v is None else str(v)
2580
2581
2582 def str_to_int(int_str):
2583 """ A more relaxed version of int_or_none """
2584 if isinstance(int_str, int):
2585 return int_str
2586 elif isinstance(int_str, str):
2587 int_str = re.sub(r'[,\.\+]', '', int_str)
2588 return int_or_none(int_str)
2589
2590
2591 def float_or_none(v, scale=1, invscale=1, default=None):
2592 if v is None:
2593 return default
2594 try:
2595 return float(v) * invscale / scale
2596 except (ValueError, TypeError):
2597 return default
2598
2599
2600 def bool_or_none(v, default=None):
2601 return v if isinstance(v, bool) else default
2602
2603
2604 def strip_or_none(v, default=None):
2605 return v.strip() if isinstance(v, str) else default
2606
2607
2608 def url_or_none(url):
2609 if not url or not isinstance(url, str):
2610 return None
2611 url = url.strip()
2612 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2613
2614
2615 def request_to_url(req):
2616 if isinstance(req, urllib.request.Request):
2617 return req.get_full_url()
2618 else:
2619 return req
2620
2621
2622 def strftime_or_none(timestamp, date_format, default=None):
2623 datetime_object = None
2624 try:
2625 if isinstance(timestamp, (int, float)): # unix timestamp
2626 # Using naive datetime here can break timestamp() in Windows
2627 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2628 datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
2629 elif isinstance(timestamp, str): # assume YYYYMMDD
2630 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2631 date_format = re.sub( # Support %s on windows
2632 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
2633 return datetime_object.strftime(date_format)
2634 except (ValueError, TypeError, AttributeError):
2635 return default
2636
2637
2638 def parse_duration(s):
2639 if not isinstance(s, str):
2640 return None
2641 s = s.strip()
2642 if not s:
2643 return None
2644
2645 days, hours, mins, secs, ms = [None] * 5
2646 m = re.match(r'''(?x)
2647 (?P<before_secs>
2648 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2649 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2650 (?P<ms>[.:][0-9]+)?Z?$
2651 ''', s)
2652 if m:
2653 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2654 else:
2655 m = re.match(
2656 r'''(?ix)(?:P?
2657 (?:
2658 [0-9]+\s*y(?:ears?)?,?\s*
2659 )?
2660 (?:
2661 [0-9]+\s*m(?:onths?)?,?\s*
2662 )?
2663 (?:
2664 [0-9]+\s*w(?:eeks?)?,?\s*
2665 )?
2666 (?:
2667 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2668 )?
2669 T)?
2670 (?:
2671 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2672 )?
2673 (?:
2674 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2675 )?
2676 (?:
2677 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2678 )?Z?$''', s)
2679 if m:
2680 days, hours, mins, secs, ms = m.groups()
2681 else:
2682 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2683 if m:
2684 hours, mins = m.groups()
2685 else:
2686 return None
2687
2688 if ms:
2689 ms = ms.replace(':', '.')
2690 return sum(float(part or 0) * mult for part, mult in (
2691 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
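
# Editor's illustration (not in the original source):
#   parse_duration('1:23:45')  # -> 5025.0
#   parse_duration('2h 30m')  # -> 9000.0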
2692
2693
2694 def prepend_extension(filename, ext, expected_real_ext=None):
2695 name, real_ext = os.path.splitext(filename)
2696 return (
2697 f'{name}.{ext}{real_ext}'
2698 if not expected_real_ext or real_ext[1:] == expected_real_ext
2699 else f'{filename}.{ext}')
2700
2701
2702 def replace_extension(filename, ext, expected_real_ext=None):
2703 name, real_ext = os.path.splitext(filename)
2704 return '{}.{}'.format(
2705 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2706 ext)
2707
2708
2709 def check_executable(exe, args=[]):
2710 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2711 args can be a list of arguments for a short output (like -version) """
2712 try:
2713 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2714 except OSError:
2715 return False
2716 return exe
2717
2718
2719 def _get_exe_version_output(exe, args):
2720 try:
2721 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2722 # SIGTTOU if yt-dlp is run in the background.
2723 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2724 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2725 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2726 except OSError:
2727 return False
2728 return stdout
2729
2730
2731 def detect_exe_version(output, version_re=None, unrecognized='present'):
2732 assert isinstance(output, str)
2733 if version_re is None:
2734 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2735 m = re.search(version_re, output)
2736 if m:
2737 return m.group(1)
2738 else:
2739 return unrecognized
2740
2741
2742 def get_exe_version(exe, args=['--version'],
2743 version_re=None, unrecognized='present'):
2744 """ Returns the version of the specified executable,
2745 or False if the executable is not present """
2746 out = _get_exe_version_output(exe, args)
2747 return detect_exe_version(out, version_re, unrecognized) if out else False
2748
2749
2750 def frange(start=0, stop=None, step=1):
2751 """Float range"""
2752 if stop is None:
2753 start, stop = 0, start
2754 sign = [-1, 1][step > 0] if step else 0
2755 while sign * start < sign * stop:
2756 yield start
2757 start += step
2758
2759
2760 class LazyList(collections.abc.Sequence):
2761 """Lazy immutable list from an iterable
2762 Note that slices of a LazyList are lists and not LazyList"""
2763
2764 class IndexError(IndexError):
2765 pass
2766
2767 def __init__(self, iterable, *, reverse=False, _cache=None):
2768 self._iterable = iter(iterable)
2769 self._cache = [] if _cache is None else _cache
2770 self._reversed = reverse
2771
2772 def __iter__(self):
2773 if self._reversed:
2774 # We need to consume the entire iterable to iterate in reverse
2775 yield from self.exhaust()
2776 return
2777 yield from self._cache
2778 for item in self._iterable:
2779 self._cache.append(item)
2780 yield item
2781
2782 def _exhaust(self):
2783 self._cache.extend(self._iterable)
2784 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2785 return self._cache
2786
2787 def exhaust(self):
2788 """Evaluate the entire iterable"""
2789 return self._exhaust()[::-1 if self._reversed else 1]
2790
2791 @staticmethod
2792 def _reverse_index(x):
2793 return None if x is None else ~x
2794
2795 def __getitem__(self, idx):
2796 if isinstance(idx, slice):
2797 if self._reversed:
2798 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2799 start, stop, step = idx.start, idx.stop, idx.step or 1
2800 elif isinstance(idx, int):
2801 if self._reversed:
2802 idx = self._reverse_index(idx)
2803 start, stop, step = idx, idx, 0
2804 else:
2805 raise TypeError('indices must be integers or slices')
2806 if ((start or 0) < 0 or (stop or 0) < 0
2807 or (start is None and step < 0)
2808 or (stop is None and step > 0)):
2809 # We need to consume the entire iterable to be able to slice from the end
2810 # Obviously, never use this with infinite iterables
2811 self._exhaust()
2812 try:
2813 return self._cache[idx]
2814 except IndexError as e:
2815 raise self.IndexError(e) from e
2816 n = max(start or 0, stop or 0) - len(self._cache) + 1
2817 if n > 0:
2818 self._cache.extend(itertools.islice(self._iterable, n))
2819 try:
2820 return self._cache[idx]
2821 except IndexError as e:
2822 raise self.IndexError(e) from e
2823
2824 def __bool__(self):
2825 try:
2826 self[-1] if self._reversed else self[0]
2827 except self.IndexError:
2828 return False
2829 return True
2830
2831 def __len__(self):
2832 self._exhaust()
2833 return len(self._cache)
2834
2835 def __reversed__(self):
2836 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2837
2838 def __copy__(self):
2839 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2840
2841 def __repr__(self):
2842 # repr and str should mimic a list. So we exhaust the iterable
2843 return repr(self.exhaust())
2844
2845 def __str__(self):
2846 return repr(self.exhaust())
2847
2848
2849 class PagedList:
2850
2851 class IndexError(IndexError):
2852 pass
2853
2854 def __len__(self):
2855 # This is only useful for tests
2856 return len(self.getslice())
2857
2858 def __init__(self, pagefunc, pagesize, use_cache=True):
2859 self._pagefunc = pagefunc
2860 self._pagesize = pagesize
2861 self._pagecount = float('inf')
2862 self._use_cache = use_cache
2863 self._cache = {}
2864
2865 def getpage(self, pagenum):
2866 page_results = self._cache.get(pagenum)
2867 if page_results is None:
2868 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2869 if self._use_cache:
2870 self._cache[pagenum] = page_results
2871 return page_results
2872
2873 def getslice(self, start=0, end=None):
2874 return list(self._getslice(start, end))
2875
2876 def _getslice(self, start, end):
2877 raise NotImplementedError('This method must be implemented by subclasses')
2878
2879 def __getitem__(self, idx):
2880 assert self._use_cache, 'Indexing PagedList requires cache'
2881 if not isinstance(idx, int) or idx < 0:
2882 raise TypeError('indices must be non-negative integers')
2883 entries = self.getslice(idx, idx + 1)
2884 if not entries:
2885 raise self.IndexError()
2886 return entries[0]
2887
2888
2889 class OnDemandPagedList(PagedList):
2890 """Download pages until a page with less than maximum results"""
2891
2892 def _getslice(self, start, end):
2893 for pagenum in itertools.count(start // self._pagesize):
2894 firstid = pagenum * self._pagesize
2895 nextfirstid = pagenum * self._pagesize + self._pagesize
2896 if start >= nextfirstid:
2897 continue
2898
2899 startv = (
2900 start % self._pagesize
2901 if firstid <= start < nextfirstid
2902 else 0)
2903 endv = (
2904 ((end - 1) % self._pagesize) + 1
2905 if (end is not None and firstid <= end <= nextfirstid)
2906 else None)
2907
2908 try:
2909 page_results = self.getpage(pagenum)
2910 except Exception:
2911 self._pagecount = pagenum - 1
2912 raise
2913 if startv != 0 or endv is not None:
2914 page_results = page_results[startv:endv]
2915 yield from page_results
2916
2917 # A little optimization - if the current page is not "full", i.e. does
2918 # not contain page_size videos, then we can assume that this page
2919 # is the last one - there are no more ids on further pages -
2920 # so there is no need to query again.
2921 if len(page_results) + startv < self._pagesize:
2922 break
2923
2924 # If we got the whole page, but the next page is not interesting,
2925 # break out early as well
2926 if end == nextfirstid:
2927 break
2928
2929
2930 class InAdvancePagedList(PagedList):
2931 """PagedList with total number of pages known in advance"""
2932
2933 def __init__(self, pagefunc, pagecount, pagesize):
2934 PagedList.__init__(self, pagefunc, pagesize, True)
2935 self._pagecount = pagecount
2936
2937 def _getslice(self, start, end):
2938 start_page = start // self._pagesize
2939 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2940 skip_elems = start - start_page * self._pagesize
2941 only_more = None if end is None else end - start
2942 for pagenum in range(start_page, end_page):
2943 page_results = self.getpage(pagenum)
2944 if skip_elems:
2945 page_results = page_results[skip_elems:]
2946 skip_elems = None
2947 if only_more is not None:
2948 if len(page_results) < only_more:
2949 only_more -= len(page_results)
2950 else:
2951 yield from page_results[:only_more]
2952 break
2953 yield from page_results
2954
2955
2956 class PlaylistEntries:
2957 MissingEntry = object()
2958 is_exhausted = False
2959
2960 def __init__(self, ydl, info_dict):
2961 self.ydl = ydl
2962
2963 # _entries must be assigned now since info_dict can change during iteration
2964 entries = info_dict.get('entries')
2965 if entries is None:
2966 raise EntryNotInPlaylist('There are no entries')
2967 elif isinstance(entries, list):
2968 self.is_exhausted = True
2969
2970 requested_entries = info_dict.get('requested_entries')
2971 self.is_incomplete = requested_entries is not None
2972 if self.is_incomplete:
2973 assert self.is_exhausted
2974 self._entries = [self.MissingEntry] * max(requested_entries or [0])
2975 for i, entry in zip(requested_entries, entries):
2976 self._entries[i - 1] = entry
2977 elif isinstance(entries, (list, PagedList, LazyList)):
2978 self._entries = entries
2979 else:
2980 self._entries = LazyList(entries)
2981
2982 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2983 (?P<start>[+-]?\d+)?
2984 (?P<range>[:-]
2985 (?P<end>[+-]?\d+|inf(?:inite)?)?
2986 (?::(?P<step>[+-]?\d+))?
2987 )?''')
2988
2989 @classmethod
2990 def parse_playlist_items(cls, string):
2991 for segment in string.split(','):
2992 if not segment:
2993 raise ValueError('There are two or more consecutive commas')
2994 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2995 if not mobj:
2996 raise ValueError(f'{segment!r} is not a valid specification')
2997 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2998 if int_or_none(step) == 0:
2999 raise ValueError(f'Step in {segment!r} cannot be zero')
3000 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
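
# Editor's illustration (not in the original source):
#   list(PlaylistEntries.parse_playlist_items('1,3:5'))
#   # -> [1, slice(3, 5.0, None)]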
3001
3002 def get_requested_items(self):
3003 playlist_items = self.ydl.params.get('playlist_items')
3004 playlist_start = self.ydl.params.get('playliststart', 1)
3005 playlist_end = self.ydl.params.get('playlistend')
3006 # For backwards compatibility, interpret -1 as whole list
3007 if playlist_end in (-1, None):
3008 playlist_end = ''
3009 if not playlist_items:
3010 playlist_items = f'{playlist_start}:{playlist_end}'
3011 elif playlist_start != 1 or playlist_end:
3012 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
3013
3014 for index in self.parse_playlist_items(playlist_items):
3015 for i, entry in self[index]:
3016 yield i, entry
3017 if not entry:
3018 continue
3019 try:
3020 # TODO: Add auto-generated fields
3021 self.ydl._match_entry(entry, incomplete=True, silent=True)
3022 except (ExistingVideoReached, RejectedVideoReached):
3023 return
3024
3025 def get_full_count(self):
3026 if self.is_exhausted and not self.is_incomplete:
3027 return len(self)
3028 elif isinstance(self._entries, InAdvancePagedList):
3029 if self._entries._pagesize == 1:
3030 return self._entries._pagecount
3031
3032 @functools.cached_property
3033 def _getter(self):
3034 if isinstance(self._entries, list):
3035 def get_entry(i):
3036 try:
3037 entry = self._entries[i]
3038 except IndexError:
3039 entry = self.MissingEntry
3040 if not self.is_incomplete:
3041 raise self.IndexError()
3042 if entry is self.MissingEntry:
3043 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
3044 return entry
3045 else:
3046 def get_entry(i):
3047 try:
3048 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
3049 except (LazyList.IndexError, PagedList.IndexError):
3050 raise self.IndexError()
3051 return get_entry
3052
3053 def __getitem__(self, idx):
3054 if isinstance(idx, int):
3055 idx = slice(idx, idx)
3056
3057 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
3058 step = 1 if idx.step is None else idx.step
3059 if idx.start is None:
3060 start = 0 if step > 0 else len(self) - 1
3061 else:
3062 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
3063
3064 # NB: Do not call len(self) when idx == [:]
3065 if idx.stop is None:
3066 stop = 0 if step < 0 else float('inf')
3067 else:
3068 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
3069 stop += [-1, 1][step > 0]
3070
3071 for i in frange(start, stop, step):
3072 if i < 0:
3073 continue
3074 try:
3075 entry = self._getter(i)
3076 except self.IndexError:
3077 self.is_exhausted = True
3078 if step > 0:
3079 break
3080 continue
3081 yield i + 1, entry
3082
3083 def __len__(self):
3084 return len(tuple(self[:]))
3085
3086 class IndexError(IndexError):
3087 pass
3088
3089
3090 def uppercase_escape(s):
3091 unicode_escape = codecs.getdecoder('unicode_escape')
3092 return re.sub(
3093 r'\\U[0-9a-fA-F]{8}',
3094 lambda m: unicode_escape(m.group(0))[0],
3095 s)
3096
3097
3098 def lowercase_escape(s):
3099 unicode_escape = codecs.getdecoder('unicode_escape')
3100 return re.sub(
3101 r'\\u[0-9a-fA-F]{4}',
3102 lambda m: unicode_escape(m.group(0))[0],
3103 s)
3104
3105
3106 def escape_rfc3986(s):
3107 """Escape non-ASCII characters as suggested by RFC 3986"""
3108 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
3109
3110
3111 def escape_url(url):
3112 """Escape URL as suggested by RFC 3986"""
3113 url_parsed = urllib.parse.urlparse(url)
3114 return url_parsed._replace(
3115 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
3116 path=escape_rfc3986(url_parsed.path),
3117 params=escape_rfc3986(url_parsed.params),
3118 query=escape_rfc3986(url_parsed.query),
3119 fragment=escape_rfc3986(url_parsed.fragment)
3120 ).geturl()
3121
3122
3123 def parse_qs(url, **kwargs):
3124 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
3125
3126
3127 def read_batch_urls(batch_fd):
3128 def fixup(url):
3129 if not isinstance(url, str):
3130 url = url.decode('utf-8', 'replace')
3131 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3132 for bom in BOM_UTF8:
3133 if url.startswith(bom):
3134 url = url[len(bom):]
3135 url = url.lstrip()
3136 if not url or url.startswith(('#', ';', ']')):
3137 return False
3138 # "#" cannot be stripped out since it is part of the URI
3139 # However, it can be safely stripped out if following a whitespace
3140 return re.split(r'\s#', url, 1)[0].rstrip()
3141
3142 with contextlib.closing(batch_fd) as fd:
3143 return [url for url in map(fixup, fd) if url]
3144
3145
3146 def urlencode_postdata(*args, **kargs):
3147 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3148
3149
3150 def update_url_query(url, query):
3151 if not query:
3152 return url
3153 parsed_url = urllib.parse.urlparse(url)
3154 qs = urllib.parse.parse_qs(parsed_url.query)
3155 qs.update(query)
3156 return urllib.parse.urlunparse(parsed_url._replace(
3157 query=urllib.parse.urlencode(qs, True)))
3158
3159
3160 def update_Request(req, url=None, data=None, headers=None, query=None):
3161 req_headers = req.headers.copy()
3162 req_headers.update(headers or {})
3163 req_data = data or req.data
3164 req_url = update_url_query(url or req.get_full_url(), query)
3165 req_get_method = req.get_method()
3166 if req_get_method == 'HEAD':
3167 req_type = HEADRequest
3168 elif req_get_method == 'PUT':
3169 req_type = PUTRequest
3170 else:
3171 req_type = urllib.request.Request
3172 new_req = req_type(
3173 req_url, data=req_data, headers=req_headers,
3174 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3175 if hasattr(req, 'timeout'):
3176 new_req.timeout = req.timeout
3177 return new_req
3178
3179
3180 def _multipart_encode_impl(data, boundary):
3181 content_type = 'multipart/form-data; boundary=%s' % boundary
3182
3183 out = b''
3184 for k, v in data.items():
3185 out += b'--' + boundary.encode('ascii') + b'\r\n'
3186 if isinstance(k, str):
3187 k = k.encode()
3188 if isinstance(v, str):
3189 v = v.encode()
3190 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3191 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3192 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3193 if boundary.encode('ascii') in content:
3194 raise ValueError('Boundary overlaps with data')
3195 out += content
3196
3197 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3198
3199 return out, content_type
3200
3201
3202 def multipart_encode(data, boundary=None):
3203 '''
3204 Encode a dict to RFC 7578-compliant form-data
3205
3206 data:
3207 A dict where keys and values can be either Unicode or bytes-like
3208 objects.
3209 boundary:
3210 If specified, it must be a Unicode object and is used as the boundary.
3211 Otherwise a random boundary is generated.
3212
3213 Reference: https://tools.ietf.org/html/rfc7578
3214 '''
3215 has_specified_boundary = boundary is not None
3216
3217 while True:
3218 if boundary is None:
3219 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3220
3221 try:
3222 out, content_type = _multipart_encode_impl(data, boundary)
3223 break
3224 except ValueError:
3225 if has_specified_boundary:
3226 raise
3227 boundary = None
3228
3229 return out, content_type
3230
3231
3232 def variadic(x, allowed_types=(str, bytes, dict)):
3233 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
3234
3235
3236 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3237 for val in map(d.get, variadic(key_or_keys)):
3238 if val is not None and (val or not skip_false_values):
3239 return val
3240 return default
3241
3242
3243 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3244 for f in funcs:
3245 try:
3246 val = f(*args, **kwargs)
3247 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
3248 pass
3249 else:
3250 if expected_type is None or isinstance(val, expected_type):
3251 return val
3252
3253
3254 def try_get(src, getter, expected_type=None):
3255 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
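
# Editor's illustration (not in the original source):
#   try_call(lambda: 1 // 0, lambda: 42)  # -> 42
#   try_get({'a': [1, 2]}, lambda x: x['a'][0], int)  # -> 1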
3256
3257
3258 def filter_dict(dct, cndn=lambda _, v: v is not None):
3259 return {k: v for k, v in dct.items() if cndn(k, v)}
3260
3261
3262 def merge_dicts(*dicts):
3263 merged = {}
3264 for a_dict in dicts:
3265 for k, v in a_dict.items():
3266 if (v is not None and k not in merged
3267 or isinstance(v, str) and merged[k] == ''):
3268 merged[k] = v
3269 return merged
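
# Editor's illustration (not in the original source) - earlier dicts take
# precedence, except that empty strings can be overwritten:
#   merge_dicts({'title': 'A', 'id': None}, {'id': '123', 'title': 'B'})
#   # -> {'title': 'A', 'id': '123'}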
3270
3271
3272 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3273 return string if isinstance(string, str) else str(string, encoding, errors)
3274
3275
3276 US_RATINGS = {
3277 'G': 0,
3278 'PG': 10,
3279 'PG-13': 13,
3280 'R': 16,
3281 'NC': 18,
3282 }
3283
3284
3285 TV_PARENTAL_GUIDELINES = {
3286 'TV-Y': 0,
3287 'TV-Y7': 7,
3288 'TV-G': 0,
3289 'TV-PG': 0,
3290 'TV-14': 14,
3291 'TV-MA': 17,
3292 }
3293
3294
3295 def parse_age_limit(s):
3296 # isinstance(False, int) is True. So type() must be used instead
3297 if type(s) is int: # noqa: E721
3298 return s if 0 <= s <= 21 else None
3299 elif not isinstance(s, str):
3300 return None
3301 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3302 if m:
3303 return int(m.group('age'))
3304 s = s.upper()
3305 if s in US_RATINGS:
3306 return US_RATINGS[s]
3307 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3308 if m:
3309 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3310 return None
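
# Editor's illustration (not in the original source):
#   parse_age_limit('PG-13')  # -> 13
#   parse_age_limit('TV-MA')  # -> 17
#   parse_age_limit('18+')  # -> 18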
3311
3312
3313 def strip_jsonp(code):
3314 return re.sub(
3315 r'''(?sx)^
3316 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3317 (?:\s*&&\s*(?P=func_name))?
3318 \s*\(\s*(?P<callback_data>.*)\);?
3319 \s*?(?://[^\n]*)*$''',
3320 r'\g<callback_data>', code)
3321
3322
3323 def js_to_json(code, vars={}, *, strict=False):
3324 # vars is a dict of var, val pairs to substitute
3325 STRING_QUOTES = '\'"'
3326 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
3327 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3328 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3329 INTEGER_TABLE = (
3330 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3331 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3332 )
3333
3334 def process_escape(match):
3335 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3336 escape = match.group(1) or match.group(2)
3337
3338 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3339 else R'\u00' if escape == 'x'
3340 else '' if escape == '\n'
3341 else escape)
3342
3343 def fix_kv(m):
3344 v = m.group(0)
3345 if v in ('true', 'false', 'null'):
3346 return v
3347 elif v in ('undefined', 'void 0'):
3348 return 'null'
3349 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3350 return ''
3351
3352 if v[0] in STRING_QUOTES:
3353 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v[1:-1])
3354 return f'"{escaped}"'
3355
3356 for regex, base in INTEGER_TABLE:
3357 im = re.match(regex, v)
3358 if im:
3359 i = int(im.group(1), base)
3360 return f'"{i}":' if v.endswith(':') else str(i)
3361
3362 if v in vars:
3363 return json.dumps(vars[v])
3364
3365 if not strict:
3366 return f'"{v}"'
3367
3368 raise ValueError(f'Unknown value: {v}')
3369
3370 def create_map(mobj):
3371 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3372
3373 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3374 if not strict:
3375 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3376 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
3377
3378 return re.sub(rf'''(?sx)
3379 {STRING_RE}|
3380 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
3381 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3382 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3383 [0-9]+(?={SKIP_RE}:)|
3384 !+
3385 ''', fix_kv, code)
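
# Editor's illustration (not in the original source):
#   js_to_json("{a: 'b', c: 0x1a}")  # -> '{"a": "b", "c": 26}'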
3386
3387
3388 def qualities(quality_ids):
3389 """ Get a numeric quality value out of a list of possible values """
3390 def q(qid):
3391 try:
3392 return quality_ids.index(qid)
3393 except ValueError:
3394 return -1
3395 return q
3396
3397
3398 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3399
3400
3401 DEFAULT_OUTTMPL = {
3402 'default': '%(title)s [%(id)s].%(ext)s',
3403 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3404 }
3405 OUTTMPL_TYPES = {
3406 'chapter': None,
3407 'subtitle': None,
3408 'thumbnail': None,
3409 'description': 'description',
3410 'annotation': 'annotations.xml',
3411 'infojson': 'info.json',
3412 'link': None,
3413 'pl_video': None,
3414 'pl_thumbnail': None,
3415 'pl_description': 'description',
3416 'pl_infojson': 'info.json',
3417 }
3418
3419 # As of [1], the format syntax is:
3420 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3421 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3422 STR_FORMAT_RE_TMPL = r'''(?x)
3423 (?<!%)(?P<prefix>(?:%%)*)
3424 %
3425 (?P<has_key>\((?P<key>{0})\))?
3426 (?P<format>
3427 (?P<conversion>[#0\-+ ]+)?
3428 (?P<min_width>\d+)?
3429 (?P<precision>\.\d+)?
3430 (?P<len_mod>[hlL])? # unused in python
3431 {1} # conversion type
3432 )
3433 '''
3434
3435
3436 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3437
3438
3439 def limit_length(s, length):
3440 """ Add ellipses to overly long strings """
3441 if s is None:
3442 return None
3443 ELLIPSES = '...'
3444 if len(s) > length:
3445 return s[:length - len(ELLIPSES)] + ELLIPSES
3446 return s
3447
3448
3449 def version_tuple(v):
3450 return tuple(int(e) for e in re.split(r'[-.]', v))
3451
3452
3453 def is_outdated_version(version, limit, assume_new=True):
3454 if not version:
3455 return not assume_new
3456 try:
3457 return version_tuple(version) < version_tuple(limit)
3458 except ValueError:
3459 return not assume_new
3460
3461
3462 def ytdl_is_updateable():
3463 """ Returns if yt-dlp can be updated with -U """
3464
3465 from .update import is_non_updateable
3466
3467 return not is_non_updateable()
3468
3469
3470 def args_to_str(args):
3471 # Get a short string representation for a subprocess command
3472 return ' '.join(compat_shlex_quote(a) for a in args)
3473
3474
3475 def error_to_compat_str(err):
3476 return str(err)
3477
3478
3479 def error_to_str(err):
3480 return f'{type(err).__name__}: {err}'
3481
3482
3483 def mimetype2ext(mt):
3484 if mt is None:
3485 return None
3486
3487 mt, _, params = mt.partition(';')
3488 mt = mt.strip()
3489
3490 FULL_MAP = {
3491 'audio/mp4': 'm4a',
3492 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 here
3493 # since it is the most popular one
3494 'audio/mpeg': 'mp3',
3495 'audio/x-wav': 'wav',
3496 'audio/wav': 'wav',
3497 'audio/wave': 'wav',
3498 }
3499
3500 ext = FULL_MAP.get(mt)
3501 if ext is not None:
3502 return ext
3503
3504 SUBTYPE_MAP = {
3505 '3gpp': '3gp',
3506 'smptett+xml': 'tt',
3507 'ttaf+xml': 'dfxp',
3508 'ttml+xml': 'ttml',
3509 'x-flv': 'flv',
3510 'x-mp4-fragmented': 'mp4',
3511 'x-ms-sami': 'sami',
3512 'x-ms-wmv': 'wmv',
3513 'mpegurl': 'm3u8',
3514 'x-mpegurl': 'm3u8',
3515 'vnd.apple.mpegurl': 'm3u8',
3516 'dash+xml': 'mpd',
3517 'f4m+xml': 'f4m',
3518 'hds+xml': 'f4m',
3519 'vnd.ms-sstr+xml': 'ism',
3520 'quicktime': 'mov',
3521 'mp2t': 'ts',
3522 'x-wav': 'wav',
3523 'filmstrip+json': 'fs',
3524 'svg+xml': 'svg',
3525 }
3526
3527 _, _, subtype = mt.rpartition('/')
3528 ext = SUBTYPE_MAP.get(subtype.lower())
3529 if ext is not None:
3530 return ext
3531
3532 SUFFIX_MAP = {
3533 'json': 'json',
3534 'xml': 'xml',
3535 'zip': 'zip',
3536 'gzip': 'gz',
3537 }
3538
3539 _, _, suffix = subtype.partition('+')
3540 ext = SUFFIX_MAP.get(suffix)
3541 if ext is not None:
3542 return ext
3543
3544 return subtype.replace('+', '.')
3545
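# Usage sketch (illustrative) of the three lookup tiers above, assuming these
# example MIME types:
#   mimetype2ext('audio/mp4')                            # FULL_MAP    -> 'm4a'
#   mimetype2ext('application/dash+xml; charset=utf-8')  # SUBTYPE_MAP -> 'mpd'
#   mimetype2ext('application/ld+json')                  # SUFFIX_MAP  -> 'json'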
3546
3547 def ext2mimetype(ext_or_url):
3548 if not ext_or_url:
3549 return None
3550 if '.' not in ext_or_url:
3551 ext_or_url = f'file.{ext_or_url}'
3552 return mimetypes.guess_type(ext_or_url)[0]
3553
3554
3555 def parse_codecs(codecs_str):
3556 # http://tools.ietf.org/html/rfc6381
3557 if not codecs_str:
3558 return {}
3559 split_codecs = list(filter(None, map(
3560 str.strip, codecs_str.strip().strip(',').split(','))))
3561 vcodec, acodec, scodec, hdr = None, None, None, None
3562 for full_codec in split_codecs:
3563 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3564 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3565 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3566 if vcodec:
3567 continue
3568 vcodec = full_codec
3569 if parts[0] in ('dvh1', 'dvhe'):
3570 hdr = 'DV'
3571 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3572 hdr = 'HDR10'
3573 elif parts[:2] == ['vp9', '2']:
3574 hdr = 'HDR10'
3575 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
3576 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3577 acodec = acodec or full_codec
3578 elif parts[0] in ('stpp', 'wvtt'):
3579 scodec = scodec or full_codec
3580 else:
3581 write_string(f'WARNING: Unknown codec {full_codec}\n')
3582 if vcodec or acodec or scodec:
3583 return {
3584 'vcodec': vcodec or 'none',
3585 'acodec': acodec or 'none',
3586 'dynamic_range': hdr,
3587 **({'scodec': scodec} if scodec is not None else {}),
3588 }
3589 elif len(split_codecs) == 2:
3590 return {
3591 'vcodec': split_codecs[0],
3592 'acodec': split_codecs[1],
3593 }
3594 return {}
3595
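# Usage sketch (illustrative), assuming typical RFC 6381 codec strings:
#   parse_codecs('avc1.640028, mp4a.40.2')
#       # -> {'vcodec': 'avc1.640028', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
#   parse_codecs('vp9.2')  # VP9 profile 2 is flagged as HDR10
#       # -> {'vcodec': 'vp9.2', 'acodec': 'none', 'dynamic_range': 'HDR10'}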
3596
3597 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3598 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3599
3600 allow_mkv = not preferences or 'mkv' in preferences
3601
3602 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3603 return 'mkv' # TODO: any other format allows this?
3604
3605 # TODO: Not all codecs supported by parse_codecs are handled here
3606 COMPATIBLE_CODECS = {
3607 'mp4': {
3608 'av1', 'hevc', 'avc1', 'mp4a', # fourcc (m3u8, mpd)
3609 'h264', 'aacl', 'ec-3', # Set in ISM
3610 },
3611 'webm': {
3612 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3613 'vp9x', 'vp8x', # in the webm spec
3614 },
3615 }
3616
3617 sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
3618 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
3619
3620 for ext in preferences or COMPATIBLE_CODECS.keys():
3621 codec_set = COMPATIBLE_CODECS.get(ext, set())
3622 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3623 return ext
3624
3625 COMPATIBLE_EXTS = (
3626 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3627 {'webm'},
3628 )
3629 for ext in preferences or vexts:
3630 current_exts = {ext, *vexts, *aexts}
3631 if ext == 'mkv' or current_exts == {ext} or any(
3632 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3633 return ext
3634 return 'mkv' if allow_mkv else preferences[-1]
3635
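# Usage sketch (illustrative): with one video and one audio stream whose
# sanitized codecs are both listed under 'mp4' above,
#   get_compatible_ext(vcodecs=['avc1.640028'], acodecs=['mp4a.40.2'],
#                      vexts=['mp4'], aexts=['m4a'])  # -> 'mp4'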
3636
3637 def urlhandle_detect_ext(url_handle):
3638 getheader = url_handle.headers.get
3639
3640 cd = getheader('Content-Disposition')
3641 if cd:
3642 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3643 if m:
3644 e = determine_ext(m.group('filename'), default_ext=None)
3645 if e:
3646 return e
3647
3648 return mimetype2ext(getheader('Content-Type'))
3649
3650
3651 def encode_data_uri(data, mime_type):
3652 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3653
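# Usage sketch (illustrative):
#   encode_data_uri(b'hi', 'text/plain')  # -> 'data:text/plain;base64,aGk='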
3654
3655 def age_restricted(content_limit, age_limit):
3656 """ Returns True iff the content should be blocked """
3657
3658 if age_limit is None: # No limit set
3659 return False
3660 if content_limit is None:
3661 return False # Content available for everyone
3662 return age_limit < content_limit
3663
3664
3665 # List of known byte-order-marks (BOM)
3666 BOMS = [
3667 (b'\xef\xbb\xbf', 'utf-8'),
3668 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3669 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3670 (b'\xff\xfe', 'utf-16-le'),
3671 (b'\xfe\xff', 'utf-16-be'),
3672 ]
3673
3674
3675 def is_html(first_bytes):
3676 """ Detect whether a file contains HTML by examining its first bytes. """
3677
3678 encoding = 'utf-8'
3679 for bom, enc in BOMS:
3680 while first_bytes.startswith(bom):
3681 encoding, first_bytes = enc, first_bytes[len(bom):]
3682
3683 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3684
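# Usage sketch (illustrative): the return value is a truthy re.Match (or
# None), not a strict bool:
#   bool(is_html(b'\xef\xbb\xbf  <!DOCTYPE html>'))  # -> True (BOM stripped)
#   bool(is_html(b'{"json": true}'))                 # -> False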
3685
3686 def determine_protocol(info_dict):
3687 protocol = info_dict.get('protocol')
3688 if protocol is not None:
3689 return protocol
3690
3691 url = sanitize_url(info_dict['url'])
3692 if url.startswith('rtmp'):
3693 return 'rtmp'
3694 elif url.startswith('mms'):
3695 return 'mms'
3696 elif url.startswith('rtsp'):
3697 return 'rtsp'
3698
3699 ext = determine_ext(url)
3700 if ext == 'm3u8':
3701 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
3702 elif ext == 'f4m':
3703 return 'f4m'
3704
3705 return urllib.parse.urlparse(url).scheme
3706
3707
3708 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3709 """ Render a list of rows, each as a list of values.
3710 Text after a \t will be right aligned """
3711 def width(string):
3712 return len(remove_terminal_sequences(string).replace('\t', ''))
3713
3714 def get_max_lens(table):
3715 return [max(width(str(v)) for v in col) for col in zip(*table)]
3716
3717 def filter_using_list(row, filterArray):
3718 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3719
3720 max_lens = get_max_lens(data) if hide_empty else []
3721 header_row = filter_using_list(header_row, max_lens)
3722 data = [filter_using_list(row, max_lens) for row in data]
3723
3724 table = [header_row] + data
3725 max_lens = get_max_lens(table)
3726 extra_gap += 1
3727 if delim:
3728 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3729 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3730 for row in table:
3731 for pos, text in enumerate(map(str, row)):
3732 if '\t' in text:
3733 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3734 else:
3735 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3736 ret = '\n'.join(''.join(row).rstrip() for row in table)
3737 return ret
3738
3739
3740 def _match_one(filter_part, dct, incomplete):
3741 # TODO: Generalize code with YoutubeDL._build_format_filter
3742 STRING_OPERATORS = {
3743 '*=': operator.contains,
3744 '^=': lambda attr, value: attr.startswith(value),
3745 '$=': lambda attr, value: attr.endswith(value),
3746 '~=': lambda attr, value: re.search(value, attr),
3747 }
3748 COMPARISON_OPERATORS = {
3749 **STRING_OPERATORS,
3750 '<=': operator.le, # "<=" must be defined above "<"
3751 '<': operator.lt,
3752 '>=': operator.ge,
3753 '>': operator.gt,
3754 '=': operator.eq,
3755 }
3756
3757 if isinstance(incomplete, bool):
3758 is_incomplete = lambda _: incomplete
3759 else:
3760 is_incomplete = lambda k: k in incomplete
3761
3762 operator_rex = re.compile(r'''(?x)
3763 (?P<key>[a-z_]+)
3764 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3765 (?:
3766 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3767 (?P<strval>.+?)
3768 )
3769 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3770 m = operator_rex.fullmatch(filter_part.strip())
3771 if m:
3772 m = m.groupdict()
3773 unnegated_op = COMPARISON_OPERATORS[m['op']]
3774 if m['negation']:
3775 op = lambda attr, value: not unnegated_op(attr, value)
3776 else:
3777 op = unnegated_op
3778 comparison_value = m['quotedstrval'] or m['strval']  # the regex above defines no 'intval' group
3779 if m['quote']:
3780 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3781 actual_value = dct.get(m['key'])
3782 numeric_comparison = None
3783 if isinstance(actual_value, (int, float)):
3784 # If the original field is a string and the matching comparison value is
3785 # a number, we should respect the origin of the original field
3786 # and process comparison value as a string (see
3787 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3788 try:
3789 numeric_comparison = int(comparison_value)
3790 except ValueError:
3791 numeric_comparison = parse_filesize(comparison_value)
3792 if numeric_comparison is None:
3793 numeric_comparison = parse_filesize(f'{comparison_value}B')
3794 if numeric_comparison is None:
3795 numeric_comparison = parse_duration(comparison_value)
3796 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3797 raise ValueError('Operator %s only supports string values!' % m['op'])
3798 if actual_value is None:
3799 return is_incomplete(m['key']) or m['none_inclusive']
3800 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3801
3802 UNARY_OPERATORS = {
3803 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3804 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3805 }
3806 operator_rex = re.compile(r'''(?x)
3807 (?P<op>%s)\s*(?P<key>[a-z_]+)
3808 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3809 m = operator_rex.fullmatch(filter_part.strip())
3810 if m:
3811 op = UNARY_OPERATORS[m.group('op')]
3812 actual_value = dct.get(m.group('key'))
3813 if is_incomplete(m.group('key')) and actual_value is None:
3814 return True
3815 return op(actual_value)
3816
3817 raise ValueError('Invalid filter part %r' % filter_part)
3818
3819
3820 def match_str(filter_str, dct, incomplete=False):
3821 """ Filter a dictionary with a simple string syntax.
3822 @returns Whether the filter passes
3823 @param incomplete Set of keys that are expected to be missing from dct.
3824 Can be True/False to indicate all/none of the keys may be missing.
3825 All conditions on incomplete keys pass if the key is missing
3826 """
3827 return all(
3828 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3829 for filter_part in re.split(r'(?<!\\)&', filter_str))
3830
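# Usage sketch (illustrative), assuming example info-dict fields:
#   match_str('duration > 30 & description', {'duration': 60, 'description': 'x'})
#       # -> True  (numeric comparison plus a unary "field is present" check)
#   match_str('!is_live', {'is_live': True})  # -> False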
3831
3832 def match_filter_func(filters):
3833 if not filters:
3834 return None
3835 filters = set(variadic(filters))
3836
3837 interactive = '-' in filters
3838 if interactive:
3839 filters.remove('-')
3840
3841 def _match_func(info_dict, incomplete=False):
3842 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3843 return NO_DEFAULT if interactive and not incomplete else None
3844 else:
3845 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3846 filter_str = ') | ('.join(map(str.strip, filters))
3847 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3848 return _match_func
3849
3850
3851 class download_range_func:
3852 def __init__(self, chapters, ranges):
3853 self.chapters, self.ranges = chapters, ranges
3854
3855 def __call__(self, info_dict, ydl):
3856 if not self.ranges and not self.chapters:
3857 yield {}
3858
3859 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3860 else 'Cannot match chapters since chapter information is unavailable')
3861 for regex in self.chapters or []:
3862 for i, chapter in enumerate(info_dict.get('chapters') or []):
3863 if re.search(regex, chapter['title']):
3864 warning = None
3865 yield {**chapter, 'index': i}
3866 if self.chapters and warning:
3867 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3868
3869 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3870
3871 def __eq__(self, other):
3872 return (isinstance(other, download_range_func)
3873 and self.chapters == other.chapters and self.ranges == other.ranges)
3874
3875 def __repr__(self):
3876 return f'{type(self).__name__}({self.chapters}, {self.ranges})'
3877
3878
3879 def parse_dfxp_time_expr(time_expr):
3880 if not time_expr:
3881 return
3882
3883 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3884 if mobj:
3885 return float(mobj.group('time_offset'))
3886
3887 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3888 if mobj:
3889 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3890
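# Usage sketch (illustrative): both DFXP clock and offset notations are
# accepted, returning seconds as a float (or None when unparsable):
#   parse_dfxp_time_expr('00:01:02.500')  # -> 62.5
#   parse_dfxp_time_expr('5.5s')          # -> 5.5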
3891
3892 def srt_subtitles_timecode(seconds):
3893 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3894
3895
3896 def ass_subtitles_timecode(seconds):
3897 time = timetuple_from_msec(seconds * 1000)
3898 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3899
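# Usage sketch (illustrative), relying on timetuple_from_msec() defined
# earlier in this module:
#   srt_subtitles_timecode(3661.5)  # -> '01:01:01,500'
#   ass_subtitles_timecode(3661.5)  # -> '1:01:01.50'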
3900
3901 def dfxp2srt(dfxp_data):
3902 '''
3903 @param dfxp_data A bytes-like object containing DFXP data
3904 @returns A unicode object containing converted SRT data
3905 '''
3906 LEGACY_NAMESPACES = (
3907 (b'http://www.w3.org/ns/ttml', [
3908 b'http://www.w3.org/2004/11/ttaf1',
3909 b'http://www.w3.org/2006/04/ttaf1',
3910 b'http://www.w3.org/2006/10/ttaf1',
3911 ]),
3912 (b'http://www.w3.org/ns/ttml#styling', [
3913 b'http://www.w3.org/ns/ttml#style',
3914 ]),
3915 )
3916
3917 SUPPORTED_STYLING = [
3918 'color',
3919 'fontFamily',
3920 'fontSize',
3921 'fontStyle',
3922 'fontWeight',
3923 'textDecoration'
3924 ]
3925
3926 _x = functools.partial(xpath_with_ns, ns_map={
3927 'xml': 'http://www.w3.org/XML/1998/namespace',
3928 'ttml': 'http://www.w3.org/ns/ttml',
3929 'tts': 'http://www.w3.org/ns/ttml#styling',
3930 })
3931
3932 styles = {}
3933 default_style = {}
3934
3935 class TTMLPElementParser:
3936 _out = ''
3937 _unclosed_elements = []
3938 _applied_styles = []
3939
3940 def start(self, tag, attrib):
3941 if tag in (_x('ttml:br'), 'br'):
3942 self._out += '\n'
3943 else:
3944 unclosed_elements = []
3945 style = {}
3946 element_style_id = attrib.get('style')
3947 if default_style:
3948 style.update(default_style)
3949 if element_style_id:
3950 style.update(styles.get(element_style_id, {}))
3951 for prop in SUPPORTED_STYLING:
3952 prop_val = attrib.get(_x('tts:' + prop))
3953 if prop_val:
3954 style[prop] = prop_val
3955 if style:
3956 font = ''
3957 for k, v in sorted(style.items()):
3958 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3959 continue
3960 if k == 'color':
3961 font += ' color="%s"' % v
3962 elif k == 'fontSize':
3963 font += ' size="%s"' % v
3964 elif k == 'fontFamily':
3965 font += ' face="%s"' % v
3966 elif k == 'fontWeight' and v == 'bold':
3967 self._out += '<b>'
3968 unclosed_elements.append('b')
3969 elif k == 'fontStyle' and v == 'italic':
3970 self._out += '<i>'
3971 unclosed_elements.append('i')
3972 elif k == 'textDecoration' and v == 'underline':
3973 self._out += '<u>'
3974 unclosed_elements.append('u')
3975 if font:
3976 self._out += '<font' + font + '>'
3977 unclosed_elements.append('font')
3978 applied_style = {}
3979 if self._applied_styles:
3980 applied_style.update(self._applied_styles[-1])
3981 applied_style.update(style)
3982 self._applied_styles.append(applied_style)
3983 self._unclosed_elements.append(unclosed_elements)
3984
3985 def end(self, tag):
3986 if tag not in (_x('ttml:br'), 'br'):
3987 unclosed_elements = self._unclosed_elements.pop()
3988 for element in reversed(unclosed_elements):
3989 self._out += '</%s>' % element
3990 if unclosed_elements and self._applied_styles:
3991 self._applied_styles.pop()
3992
3993 def data(self, data):
3994 self._out += data
3995
3996 def close(self):
3997 return self._out.strip()
3998
3999 def parse_node(node):
4000 target = TTMLPElementParser()
4001 parser = xml.etree.ElementTree.XMLParser(target=target)
4002 parser.feed(xml.etree.ElementTree.tostring(node))
4003 return parser.close()
4004
4005 for k, v in LEGACY_NAMESPACES:
4006 for ns in v:
4007 dfxp_data = dfxp_data.replace(ns, k)
4008
4009 dfxp = compat_etree_fromstring(dfxp_data)
4010 out = []
4011 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
4012
4013 if not paras:
4014 raise ValueError('Invalid dfxp/TTML subtitle')
4015
4016 repeat = False
4017 while True:
4018 for style in dfxp.findall(_x('.//ttml:style')):
4019 style_id = style.get('id') or style.get(_x('xml:id'))
4020 if not style_id:
4021 continue
4022 parent_style_id = style.get('style')
4023 if parent_style_id:
4024 if parent_style_id not in styles:
4025 repeat = True
4026 continue
4027 styles[style_id] = styles[parent_style_id].copy()
4028 for prop in SUPPORTED_STYLING:
4029 prop_val = style.get(_x('tts:' + prop))
4030 if prop_val:
4031 styles.setdefault(style_id, {})[prop] = prop_val
4032 if repeat:
4033 repeat = False
4034 else:
4035 break
4036
4037 for p in ('body', 'div'):
4038 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4039 if ele is None:
4040 continue
4041 style = styles.get(ele.get('style'))
4042 if not style:
4043 continue
4044 default_style.update(style)
4045
4046 for para, index in zip(paras, itertools.count(1)):
4047 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
4048 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
4049 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4050 if begin_time is None:
4051 continue
4052 if not end_time:
4053 if not dur:
4054 continue
4055 end_time = begin_time + dur
4056 out.append('%d\n%s --> %s\n%s\n\n' % (
4057 index,
4058 srt_subtitles_timecode(begin_time),
4059 srt_subtitles_timecode(end_time),
4060 parse_node(para)))
4061
4062 return ''.join(out)
4063
4064
4065 def cli_option(params, command_option, param, separator=None):
4066 param = params.get(param)
4067 return ([] if param is None
4068 else [command_option, str(param)] if separator is None
4069 else [f'{command_option}{separator}{param}'])
4070
4071
4072 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4073 param = params.get(param)
4074 assert param in (True, False, None)
4075 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
4076
4077
4078 def cli_valueless_option(params, command_option, param, expected_value=True):
4079 return [command_option] if params.get(param) == expected_value else []
4080
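# Usage sketch (illustrative, with hypothetical option names):
#   cli_option({'proxy': 'http://example.com'}, '--proxy', 'proxy')
#       # -> ['--proxy', 'http://example.com']
#   cli_option({'proxy': 'http://example.com'}, '--proxy', 'proxy', separator='=')
#       # -> ['--proxy=http://example.com']
#   cli_bool_option({'check': False}, '--check-cert', 'check')  # -> ['--check-cert', 'false']
#   cli_valueless_option({'quiet': True}, '--quiet', 'quiet')   # -> ['--quiet']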
4081
4082 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
4083 if isinstance(argdict, (list, tuple)): # for backward compatibility
4084 if use_compat:
4085 return argdict
4086 else:
4087 argdict = None
4088 if argdict is None:
4089 return default
4090 assert isinstance(argdict, dict)
4091
4092 assert isinstance(keys, (list, tuple))
4093 for key_list in keys:
4094 arg_list = list(filter(
4095 lambda x: x is not None,
4096 [argdict.get(key.lower()) for key in variadic(key_list)]))
4097 if arg_list:
4098 return [arg for args in arg_list for arg in args]
4099 return default
4100
4101
4102 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4103 main_key, exe = main_key.lower(), exe.lower()
4104 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4105 keys = [f'{root_key}{k}' for k in (keys or [''])]
4106 if root_key in keys:
4107 if main_key != exe:
4108 keys.append((main_key, exe))
4109 keys.append('default')
4110 else:
4111 use_compat = False
4112 return cli_configuration_args(argdict, keys, default, use_compat)
4113
4114
4115 class ISO639Utils:
4116 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4117 _lang_map = {
4118 'aa': 'aar',
4119 'ab': 'abk',
4120 'ae': 'ave',
4121 'af': 'afr',
4122 'ak': 'aka',
4123 'am': 'amh',
4124 'an': 'arg',
4125 'ar': 'ara',
4126 'as': 'asm',
4127 'av': 'ava',
4128 'ay': 'aym',
4129 'az': 'aze',
4130 'ba': 'bak',
4131 'be': 'bel',
4132 'bg': 'bul',
4133 'bh': 'bih',
4134 'bi': 'bis',
4135 'bm': 'bam',
4136 'bn': 'ben',
4137 'bo': 'bod',
4138 'br': 'bre',
4139 'bs': 'bos',
4140 'ca': 'cat',
4141 'ce': 'che',
4142 'ch': 'cha',
4143 'co': 'cos',
4144 'cr': 'cre',
4145 'cs': 'ces',
4146 'cu': 'chu',
4147 'cv': 'chv',
4148 'cy': 'cym',
4149 'da': 'dan',
4150 'de': 'deu',
4151 'dv': 'div',
4152 'dz': 'dzo',
4153 'ee': 'ewe',
4154 'el': 'ell',
4155 'en': 'eng',
4156 'eo': 'epo',
4157 'es': 'spa',
4158 'et': 'est',
4159 'eu': 'eus',
4160 'fa': 'fas',
4161 'ff': 'ful',
4162 'fi': 'fin',
4163 'fj': 'fij',
4164 'fo': 'fao',
4165 'fr': 'fra',
4166 'fy': 'fry',
4167 'ga': 'gle',
4168 'gd': 'gla',
4169 'gl': 'glg',
4170 'gn': 'grn',
4171 'gu': 'guj',
4172 'gv': 'glv',
4173 'ha': 'hau',
4174 'he': 'heb',
4175 'iw': 'heb', # Replaced by he in 1989 revision
4176 'hi': 'hin',
4177 'ho': 'hmo',
4178 'hr': 'hrv',
4179 'ht': 'hat',
4180 'hu': 'hun',
4181 'hy': 'hye',
4182 'hz': 'her',
4183 'ia': 'ina',
4184 'id': 'ind',
4185 'in': 'ind', # Replaced by id in 1989 revision
4186 'ie': 'ile',
4187 'ig': 'ibo',
4188 'ii': 'iii',
4189 'ik': 'ipk',
4190 'io': 'ido',
4191 'is': 'isl',
4192 'it': 'ita',
4193 'iu': 'iku',
4194 'ja': 'jpn',
4195 'jv': 'jav',
4196 'ka': 'kat',
4197 'kg': 'kon',
4198 'ki': 'kik',
4199 'kj': 'kua',
4200 'kk': 'kaz',
4201 'kl': 'kal',
4202 'km': 'khm',
4203 'kn': 'kan',
4204 'ko': 'kor',
4205 'kr': 'kau',
4206 'ks': 'kas',
4207 'ku': 'kur',
4208 'kv': 'kom',
4209 'kw': 'cor',
4210 'ky': 'kir',
4211 'la': 'lat',
4212 'lb': 'ltz',
4213 'lg': 'lug',
4214 'li': 'lim',
4215 'ln': 'lin',
4216 'lo': 'lao',
4217 'lt': 'lit',
4218 'lu': 'lub',
4219 'lv': 'lav',
4220 'mg': 'mlg',
4221 'mh': 'mah',
4222 'mi': 'mri',
4223 'mk': 'mkd',
4224 'ml': 'mal',
4225 'mn': 'mon',
4226 'mr': 'mar',
4227 'ms': 'msa',
4228 'mt': 'mlt',
4229 'my': 'mya',
4230 'na': 'nau',
4231 'nb': 'nob',
4232 'nd': 'nde',
4233 'ne': 'nep',
4234 'ng': 'ndo',
4235 'nl': 'nld',
4236 'nn': 'nno',
4237 'no': 'nor',
4238 'nr': 'nbl',
4239 'nv': 'nav',
4240 'ny': 'nya',
4241 'oc': 'oci',
4242 'oj': 'oji',
4243 'om': 'orm',
4244 'or': 'ori',
4245 'os': 'oss',
4246 'pa': 'pan',
4247 'pi': 'pli',
4248 'pl': 'pol',
4249 'ps': 'pus',
4250 'pt': 'por',
4251 'qu': 'que',
4252 'rm': 'roh',
4253 'rn': 'run',
4254 'ro': 'ron',
4255 'ru': 'rus',
4256 'rw': 'kin',
4257 'sa': 'san',
4258 'sc': 'srd',
4259 'sd': 'snd',
4260 'se': 'sme',
4261 'sg': 'sag',
4262 'si': 'sin',
4263 'sk': 'slk',
4264 'sl': 'slv',
4265 'sm': 'smo',
4266 'sn': 'sna',
4267 'so': 'som',
4268 'sq': 'sqi',
4269 'sr': 'srp',
4270 'ss': 'ssw',
4271 'st': 'sot',
4272 'su': 'sun',
4273 'sv': 'swe',
4274 'sw': 'swa',
4275 'ta': 'tam',
4276 'te': 'tel',
4277 'tg': 'tgk',
4278 'th': 'tha',
4279 'ti': 'tir',
4280 'tk': 'tuk',
4281 'tl': 'tgl',
4282 'tn': 'tsn',
4283 'to': 'ton',
4284 'tr': 'tur',
4285 'ts': 'tso',
4286 'tt': 'tat',
4287 'tw': 'twi',
4288 'ty': 'tah',
4289 'ug': 'uig',
4290 'uk': 'ukr',
4291 'ur': 'urd',
4292 'uz': 'uzb',
4293 've': 'ven',
4294 'vi': 'vie',
4295 'vo': 'vol',
4296 'wa': 'wln',
4297 'wo': 'wol',
4298 'xh': 'xho',
4299 'yi': 'yid',
4300 'ji': 'yid', # Replaced by yi in 1989 revision
4301 'yo': 'yor',
4302 'za': 'zha',
4303 'zh': 'zho',
4304 'zu': 'zul',
4305 }
4306
4307 @classmethod
4308 def short2long(cls, code):
4309 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4310 return cls._lang_map.get(code[:2])
4311
4312 @classmethod
4313 def long2short(cls, code):
4314 """Convert language code from ISO 639-2/T to ISO 639-1"""
4315 for short_name, long_name in cls._lang_map.items():
4316 if long_name == code:
4317 return short_name
4318
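# Usage sketch (illustrative):
#   ISO639Utils.short2long('fr')   # -> 'fra'
#   ISO639Utils.long2short('deu')  # -> 'de'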
4319
4320 class ISO3166Utils:
4321 # From http://data.okfn.org/data/core/country-list
4322 _country_map = {
4323 'AF': 'Afghanistan',
4324 'AX': 'Åland Islands',
4325 'AL': 'Albania',
4326 'DZ': 'Algeria',
4327 'AS': 'American Samoa',
4328 'AD': 'Andorra',
4329 'AO': 'Angola',
4330 'AI': 'Anguilla',
4331 'AQ': 'Antarctica',
4332 'AG': 'Antigua and Barbuda',
4333 'AR': 'Argentina',
4334 'AM': 'Armenia',
4335 'AW': 'Aruba',
4336 'AU': 'Australia',
4337 'AT': 'Austria',
4338 'AZ': 'Azerbaijan',
4339 'BS': 'Bahamas',
4340 'BH': 'Bahrain',
4341 'BD': 'Bangladesh',
4342 'BB': 'Barbados',
4343 'BY': 'Belarus',
4344 'BE': 'Belgium',
4345 'BZ': 'Belize',
4346 'BJ': 'Benin',
4347 'BM': 'Bermuda',
4348 'BT': 'Bhutan',
4349 'BO': 'Bolivia, Plurinational State of',
4350 'BQ': 'Bonaire, Sint Eustatius and Saba',
4351 'BA': 'Bosnia and Herzegovina',
4352 'BW': 'Botswana',
4353 'BV': 'Bouvet Island',
4354 'BR': 'Brazil',
4355 'IO': 'British Indian Ocean Territory',
4356 'BN': 'Brunei Darussalam',
4357 'BG': 'Bulgaria',
4358 'BF': 'Burkina Faso',
4359 'BI': 'Burundi',
4360 'KH': 'Cambodia',
4361 'CM': 'Cameroon',
4362 'CA': 'Canada',
4363 'CV': 'Cape Verde',
4364 'KY': 'Cayman Islands',
4365 'CF': 'Central African Republic',
4366 'TD': 'Chad',
4367 'CL': 'Chile',
4368 'CN': 'China',
4369 'CX': 'Christmas Island',
4370 'CC': 'Cocos (Keeling) Islands',
4371 'CO': 'Colombia',
4372 'KM': 'Comoros',
4373 'CG': 'Congo',
4374 'CD': 'Congo, the Democratic Republic of the',
4375 'CK': 'Cook Islands',
4376 'CR': 'Costa Rica',
4377 'CI': 'Côte d\'Ivoire',
4378 'HR': 'Croatia',
4379 'CU': 'Cuba',
4380 'CW': 'Curaçao',
4381 'CY': 'Cyprus',
4382 'CZ': 'Czech Republic',
4383 'DK': 'Denmark',
4384 'DJ': 'Djibouti',
4385 'DM': 'Dominica',
4386 'DO': 'Dominican Republic',
4387 'EC': 'Ecuador',
4388 'EG': 'Egypt',
4389 'SV': 'El Salvador',
4390 'GQ': 'Equatorial Guinea',
4391 'ER': 'Eritrea',
4392 'EE': 'Estonia',
4393 'ET': 'Ethiopia',
4394 'FK': 'Falkland Islands (Malvinas)',
4395 'FO': 'Faroe Islands',
4396 'FJ': 'Fiji',
4397 'FI': 'Finland',
4398 'FR': 'France',
4399 'GF': 'French Guiana',
4400 'PF': 'French Polynesia',
4401 'TF': 'French Southern Territories',
4402 'GA': 'Gabon',
4403 'GM': 'Gambia',
4404 'GE': 'Georgia',
4405 'DE': 'Germany',
4406 'GH': 'Ghana',
4407 'GI': 'Gibraltar',
4408 'GR': 'Greece',
4409 'GL': 'Greenland',
4410 'GD': 'Grenada',
4411 'GP': 'Guadeloupe',
4412 'GU': 'Guam',
4413 'GT': 'Guatemala',
4414 'GG': 'Guernsey',
4415 'GN': 'Guinea',
4416 'GW': 'Guinea-Bissau',
4417 'GY': 'Guyana',
4418 'HT': 'Haiti',
4419 'HM': 'Heard Island and McDonald Islands',
4420 'VA': 'Holy See (Vatican City State)',
4421 'HN': 'Honduras',
4422 'HK': 'Hong Kong',
4423 'HU': 'Hungary',
4424 'IS': 'Iceland',
4425 'IN': 'India',
4426 'ID': 'Indonesia',
4427 'IR': 'Iran, Islamic Republic of',
4428 'IQ': 'Iraq',
4429 'IE': 'Ireland',
4430 'IM': 'Isle of Man',
4431 'IL': 'Israel',
4432 'IT': 'Italy',
4433 'JM': 'Jamaica',
4434 'JP': 'Japan',
4435 'JE': 'Jersey',
4436 'JO': 'Jordan',
4437 'KZ': 'Kazakhstan',
4438 'KE': 'Kenya',
4439 'KI': 'Kiribati',
4440 'KP': 'Korea, Democratic People\'s Republic of',
4441 'KR': 'Korea, Republic of',
4442 'KW': 'Kuwait',
4443 'KG': 'Kyrgyzstan',
4444 'LA': 'Lao People\'s Democratic Republic',
4445 'LV': 'Latvia',
4446 'LB': 'Lebanon',
4447 'LS': 'Lesotho',
4448 'LR': 'Liberia',
4449 'LY': 'Libya',
4450 'LI': 'Liechtenstein',
4451 'LT': 'Lithuania',
4452 'LU': 'Luxembourg',
4453 'MO': 'Macao',
4454 'MK': 'Macedonia, the Former Yugoslav Republic of',
4455 'MG': 'Madagascar',
4456 'MW': 'Malawi',
4457 'MY': 'Malaysia',
4458 'MV': 'Maldives',
4459 'ML': 'Mali',
4460 'MT': 'Malta',
4461 'MH': 'Marshall Islands',
4462 'MQ': 'Martinique',
4463 'MR': 'Mauritania',
4464 'MU': 'Mauritius',
4465 'YT': 'Mayotte',
4466 'MX': 'Mexico',
4467 'FM': 'Micronesia, Federated States of',
4468 'MD': 'Moldova, Republic of',
4469 'MC': 'Monaco',
4470 'MN': 'Mongolia',
4471 'ME': 'Montenegro',
4472 'MS': 'Montserrat',
4473 'MA': 'Morocco',
4474 'MZ': 'Mozambique',
4475 'MM': 'Myanmar',
4476 'NA': 'Namibia',
4477 'NR': 'Nauru',
4478 'NP': 'Nepal',
4479 'NL': 'Netherlands',
4480 'NC': 'New Caledonia',
4481 'NZ': 'New Zealand',
4482 'NI': 'Nicaragua',
4483 'NE': 'Niger',
4484 'NG': 'Nigeria',
4485 'NU': 'Niue',
4486 'NF': 'Norfolk Island',
4487 'MP': 'Northern Mariana Islands',
4488 'NO': 'Norway',
4489 'OM': 'Oman',
4490 'PK': 'Pakistan',
4491 'PW': 'Palau',
4492 'PS': 'Palestine, State of',
4493 'PA': 'Panama',
4494 'PG': 'Papua New Guinea',
4495 'PY': 'Paraguay',
4496 'PE': 'Peru',
4497 'PH': 'Philippines',
4498 'PN': 'Pitcairn',
4499 'PL': 'Poland',
4500 'PT': 'Portugal',
4501 'PR': 'Puerto Rico',
4502 'QA': 'Qatar',
4503 'RE': 'Réunion',
4504 'RO': 'Romania',
4505 'RU': 'Russian Federation',
4506 'RW': 'Rwanda',
4507 'BL': 'Saint Barthélemy',
4508 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4509 'KN': 'Saint Kitts and Nevis',
4510 'LC': 'Saint Lucia',
4511 'MF': 'Saint Martin (French part)',
4512 'PM': 'Saint Pierre and Miquelon',
4513 'VC': 'Saint Vincent and the Grenadines',
4514 'WS': 'Samoa',
4515 'SM': 'San Marino',
4516 'ST': 'Sao Tome and Principe',
4517 'SA': 'Saudi Arabia',
4518 'SN': 'Senegal',
4519 'RS': 'Serbia',
4520 'SC': 'Seychelles',
4521 'SL': 'Sierra Leone',
4522 'SG': 'Singapore',
4523 'SX': 'Sint Maarten (Dutch part)',
4524 'SK': 'Slovakia',
4525 'SI': 'Slovenia',
4526 'SB': 'Solomon Islands',
4527 'SO': 'Somalia',
4528 'ZA': 'South Africa',
4529 'GS': 'South Georgia and the South Sandwich Islands',
4530 'SS': 'South Sudan',
4531 'ES': 'Spain',
4532 'LK': 'Sri Lanka',
4533 'SD': 'Sudan',
4534 'SR': 'Suriname',
4535 'SJ': 'Svalbard and Jan Mayen',
4536 'SZ': 'Swaziland',
4537 'SE': 'Sweden',
4538 'CH': 'Switzerland',
4539 'SY': 'Syrian Arab Republic',
4540 'TW': 'Taiwan, Province of China',
4541 'TJ': 'Tajikistan',
4542 'TZ': 'Tanzania, United Republic of',
4543 'TH': 'Thailand',
4544 'TL': 'Timor-Leste',
4545 'TG': 'Togo',
4546 'TK': 'Tokelau',
4547 'TO': 'Tonga',
4548 'TT': 'Trinidad and Tobago',
4549 'TN': 'Tunisia',
4550 'TR': 'Turkey',
4551 'TM': 'Turkmenistan',
4552 'TC': 'Turks and Caicos Islands',
4553 'TV': 'Tuvalu',
4554 'UG': 'Uganda',
4555 'UA': 'Ukraine',
4556 'AE': 'United Arab Emirates',
4557 'GB': 'United Kingdom',
4558 'US': 'United States',
4559 'UM': 'United States Minor Outlying Islands',
4560 'UY': 'Uruguay',
4561 'UZ': 'Uzbekistan',
4562 'VU': 'Vanuatu',
4563 'VE': 'Venezuela, Bolivarian Republic of',
4564 'VN': 'Viet Nam',
4565 'VG': 'Virgin Islands, British',
4566 'VI': 'Virgin Islands, U.S.',
4567 'WF': 'Wallis and Futuna',
4568 'EH': 'Western Sahara',
4569 'YE': 'Yemen',
4570 'ZM': 'Zambia',
4571 'ZW': 'Zimbabwe',
4572 # Not ISO 3166 codes, but used for IP blocks
4573 'AP': 'Asia/Pacific Region',
4574 'EU': 'Europe',
4575 }
4576
4577 @classmethod
4578 def short2full(cls, code):
4579 """Convert an ISO 3166-2 country code to the corresponding full name"""
4580 return cls._country_map.get(code.upper())
4581
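# Usage sketch (illustrative):
#   ISO3166Utils.short2full('de')  # -> 'Germany' (lookup is case-insensitive)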
4582
4583 class GeoUtils:
4584 # Major IPv4 address blocks per country
4585 _country_ip_map = {
4586 'AD': '46.172.224.0/19',
4587 'AE': '94.200.0.0/13',
4588 'AF': '149.54.0.0/17',
4589 'AG': '209.59.64.0/18',
4590 'AI': '204.14.248.0/21',
4591 'AL': '46.99.0.0/16',
4592 'AM': '46.70.0.0/15',
4593 'AO': '105.168.0.0/13',
4594 'AP': '182.50.184.0/21',
4595 'AQ': '23.154.160.0/24',
4596 'AR': '181.0.0.0/12',
4597 'AS': '202.70.112.0/20',
4598 'AT': '77.116.0.0/14',
4599 'AU': '1.128.0.0/11',
4600 'AW': '181.41.0.0/18',
4601 'AX': '185.217.4.0/22',
4602 'AZ': '5.197.0.0/16',
4603 'BA': '31.176.128.0/17',
4604 'BB': '65.48.128.0/17',
4605 'BD': '114.130.0.0/16',
4606 'BE': '57.0.0.0/8',
4607 'BF': '102.178.0.0/15',
4608 'BG': '95.42.0.0/15',
4609 'BH': '37.131.0.0/17',
4610 'BI': '154.117.192.0/18',
4611 'BJ': '137.255.0.0/16',
4612 'BL': '185.212.72.0/23',
4613 'BM': '196.12.64.0/18',
4614 'BN': '156.31.0.0/16',
4615 'BO': '161.56.0.0/16',
4616 'BQ': '161.0.80.0/20',
4617 'BR': '191.128.0.0/12',
4618 'BS': '24.51.64.0/18',
4619 'BT': '119.2.96.0/19',
4620 'BW': '168.167.0.0/16',
4621 'BY': '178.120.0.0/13',
4622 'BZ': '179.42.192.0/18',
4623 'CA': '99.224.0.0/11',
4624 'CD': '41.243.0.0/16',
4625 'CF': '197.242.176.0/21',
4626 'CG': '160.113.0.0/16',
4627 'CH': '85.0.0.0/13',
4628 'CI': '102.136.0.0/14',
4629 'CK': '202.65.32.0/19',
4630 'CL': '152.172.0.0/14',
4631 'CM': '102.244.0.0/14',
4632 'CN': '36.128.0.0/10',
4633 'CO': '181.240.0.0/12',
4634 'CR': '201.192.0.0/12',
4635 'CU': '152.206.0.0/15',
4636 'CV': '165.90.96.0/19',
4637 'CW': '190.88.128.0/17',
4638 'CY': '31.153.0.0/16',
4639 'CZ': '88.100.0.0/14',
4640 'DE': '53.0.0.0/8',
4641 'DJ': '197.241.0.0/17',
4642 'DK': '87.48.0.0/12',
4643 'DM': '192.243.48.0/20',
4644 'DO': '152.166.0.0/15',
4645 'DZ': '41.96.0.0/12',
4646 'EC': '186.68.0.0/15',
4647 'EE': '90.190.0.0/15',
4648 'EG': '156.160.0.0/11',
4649 'ER': '196.200.96.0/20',
4650 'ES': '88.0.0.0/11',
4651 'ET': '196.188.0.0/14',
4652 'EU': '2.16.0.0/13',
4653 'FI': '91.152.0.0/13',
4654 'FJ': '144.120.0.0/16',
4655 'FK': '80.73.208.0/21',
4656 'FM': '119.252.112.0/20',
4657 'FO': '88.85.32.0/19',
4658 'FR': '90.0.0.0/9',
4659 'GA': '41.158.0.0/15',
4660 'GB': '25.0.0.0/8',
4661 'GD': '74.122.88.0/21',
4662 'GE': '31.146.0.0/16',
4663 'GF': '161.22.64.0/18',
4664 'GG': '62.68.160.0/19',
4665 'GH': '154.160.0.0/12',
4666 'GI': '95.164.0.0/16',
4667 'GL': '88.83.0.0/19',
4668 'GM': '160.182.0.0/15',
4669 'GN': '197.149.192.0/18',
4670 'GP': '104.250.0.0/19',
4671 'GQ': '105.235.224.0/20',
4672 'GR': '94.64.0.0/13',
4673 'GT': '168.234.0.0/16',
4674 'GU': '168.123.0.0/16',
4675 'GW': '197.214.80.0/20',
4676 'GY': '181.41.64.0/18',
4677 'HK': '113.252.0.0/14',
4678 'HN': '181.210.0.0/16',
4679 'HR': '93.136.0.0/13',
4680 'HT': '148.102.128.0/17',
4681 'HU': '84.0.0.0/14',
4682 'ID': '39.192.0.0/10',
4683 'IE': '87.32.0.0/12',
4684 'IL': '79.176.0.0/13',
4685 'IM': '5.62.80.0/20',
4686 'IN': '117.192.0.0/10',
4687 'IO': '203.83.48.0/21',
4688 'IQ': '37.236.0.0/14',
4689 'IR': '2.176.0.0/12',
4690 'IS': '82.221.0.0/16',
4691 'IT': '79.0.0.0/10',
4692 'JE': '87.244.64.0/18',
4693 'JM': '72.27.0.0/17',
4694 'JO': '176.29.0.0/16',
4695 'JP': '133.0.0.0/8',
4696 'KE': '105.48.0.0/12',
4697 'KG': '158.181.128.0/17',
4698 'KH': '36.37.128.0/17',
4699 'KI': '103.25.140.0/22',
4700 'KM': '197.255.224.0/20',
4701 'KN': '198.167.192.0/19',
4702 'KP': '175.45.176.0/22',
4703 'KR': '175.192.0.0/10',
4704 'KW': '37.36.0.0/14',
4705 'KY': '64.96.0.0/15',
4706 'KZ': '2.72.0.0/13',
4707 'LA': '115.84.64.0/18',
4708 'LB': '178.135.0.0/16',
4709 'LC': '24.92.144.0/20',
4710 'LI': '82.117.0.0/19',
4711 'LK': '112.134.0.0/15',
4712 'LR': '102.183.0.0/16',
4713 'LS': '129.232.0.0/17',
4714 'LT': '78.56.0.0/13',
4715 'LU': '188.42.0.0/16',
4716 'LV': '46.109.0.0/16',
4717 'LY': '41.252.0.0/14',
4718 'MA': '105.128.0.0/11',
4719 'MC': '88.209.64.0/18',
4720 'MD': '37.246.0.0/16',
4721 'ME': '178.175.0.0/17',
4722 'MF': '74.112.232.0/21',
4723 'MG': '154.126.0.0/17',
4724 'MH': '117.103.88.0/21',
4725 'MK': '77.28.0.0/15',
4726 'ML': '154.118.128.0/18',
4727 'MM': '37.111.0.0/17',
4728 'MN': '49.0.128.0/17',
4729 'MO': '60.246.0.0/16',
4730 'MP': '202.88.64.0/20',
4731 'MQ': '109.203.224.0/19',
4732 'MR': '41.188.64.0/18',
4733 'MS': '208.90.112.0/22',
4734 'MT': '46.11.0.0/16',
4735 'MU': '105.16.0.0/12',
4736 'MV': '27.114.128.0/18',
4737 'MW': '102.70.0.0/15',
4738 'MX': '187.192.0.0/11',
4739 'MY': '175.136.0.0/13',
4740 'MZ': '197.218.0.0/15',
4741 'NA': '41.182.0.0/16',
4742 'NC': '101.101.0.0/18',
4743 'NE': '197.214.0.0/18',
4744 'NF': '203.17.240.0/22',
4745 'NG': '105.112.0.0/12',
4746 'NI': '186.76.0.0/15',
4747 'NL': '145.96.0.0/11',
4748 'NO': '84.208.0.0/13',
4749 'NP': '36.252.0.0/15',
4750 'NR': '203.98.224.0/19',
4751 'NU': '49.156.48.0/22',
4752 'NZ': '49.224.0.0/14',
4753 'OM': '5.36.0.0/15',
4754 'PA': '186.72.0.0/15',
4755 'PE': '186.160.0.0/14',
4756 'PF': '123.50.64.0/18',
4757 'PG': '124.240.192.0/19',
4758 'PH': '49.144.0.0/13',
4759 'PK': '39.32.0.0/11',
4760 'PL': '83.0.0.0/11',
4761 'PM': '70.36.0.0/20',
4762 'PR': '66.50.0.0/16',
4763 'PS': '188.161.0.0/16',
4764 'PT': '85.240.0.0/13',
4765 'PW': '202.124.224.0/20',
4766 'PY': '181.120.0.0/14',
4767 'QA': '37.210.0.0/15',
4768 'RE': '102.35.0.0/16',
4769 'RO': '79.112.0.0/13',
4770 'RS': '93.86.0.0/15',
4771 'RU': '5.136.0.0/13',
4772 'RW': '41.186.0.0/16',
4773 'SA': '188.48.0.0/13',
4774 'SB': '202.1.160.0/19',
4775 'SC': '154.192.0.0/11',
4776 'SD': '102.120.0.0/13',
4777 'SE': '78.64.0.0/12',
4778 'SG': '8.128.0.0/10',
4779 'SI': '188.196.0.0/14',
4780 'SK': '78.98.0.0/15',
4781 'SL': '102.143.0.0/17',
4782 'SM': '89.186.32.0/19',
4783 'SN': '41.82.0.0/15',
4784 'SO': '154.115.192.0/18',
4785 'SR': '186.179.128.0/17',
4786 'SS': '105.235.208.0/21',
4787 'ST': '197.159.160.0/19',
4788 'SV': '168.243.0.0/16',
4789 'SX': '190.102.0.0/20',
4790 'SY': '5.0.0.0/16',
4791 'SZ': '41.84.224.0/19',
4792 'TC': '65.255.48.0/20',
4793 'TD': '154.68.128.0/19',
4794 'TG': '196.168.0.0/14',
4795 'TH': '171.96.0.0/13',
4796 'TJ': '85.9.128.0/18',
4797 'TK': '27.96.24.0/21',
4798 'TL': '180.189.160.0/20',
4799 'TM': '95.85.96.0/19',
4800 'TN': '197.0.0.0/11',
4801 'TO': '175.176.144.0/21',
4802 'TR': '78.160.0.0/11',
4803 'TT': '186.44.0.0/15',
4804 'TV': '202.2.96.0/19',
4805 'TW': '120.96.0.0/11',
4806 'TZ': '156.156.0.0/14',
4807 'UA': '37.52.0.0/14',
4808 'UG': '102.80.0.0/13',
4809 'US': '6.0.0.0/8',
4810 'UY': '167.56.0.0/13',
4811 'UZ': '84.54.64.0/18',
4812 'VA': '212.77.0.0/19',
4813 'VC': '207.191.240.0/21',
4814 'VE': '186.88.0.0/13',
4815 'VG': '66.81.192.0/20',
4816 'VI': '146.226.0.0/16',
4817 'VN': '14.160.0.0/11',
4818 'VU': '202.80.32.0/20',
4819 'WF': '117.20.32.0/21',
4820 'WS': '202.4.32.0/19',
4821 'YE': '134.35.0.0/16',
4822 'YT': '41.242.116.0/22',
4823 'ZA': '41.0.0.0/11',
4824 'ZM': '102.144.0.0/13',
4825 'ZW': '102.177.192.0/18',
4826 }
4827
4828 @classmethod
4829 def random_ipv4(cls, code_or_block):
4830 if len(code_or_block) == 2:
4831 block = cls._country_ip_map.get(code_or_block.upper())
4832 if not block:
4833 return None
4834 else:
4835 block = code_or_block
4836 addr, preflen = block.split('/')
4837 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4838 addr_max = addr_min | (0xffffffff >> int(preflen))
4839 return str(socket.inet_ntoa(
4840 struct.pack('!L', random.randint(addr_min, addr_max))))
4841
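# Usage sketch (illustrative): a two-letter argument is treated as a country
# code, anything else as an explicit CIDR block:
#   GeoUtils.random_ipv4('DE')            # -> a random address in 53.0.0.0/8
#   GeoUtils.random_ipv4('192.0.2.0/24')  # -> e.g. '192.0.2.57' (random)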
4842
4843 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4844 def __init__(self, proxies=None):
4845 # Set default handlers
4846 for type in ('http', 'https'):
4847 setattr(self, '%s_open' % type,
4848 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4849 meth(r, proxy, type))
4850 urllib.request.ProxyHandler.__init__(self, proxies)
4851
4852 def proxy_open(self, req, proxy, type):
4853 req_proxy = req.headers.get('Ytdl-request-proxy')
4854 if req_proxy is not None:
4855 proxy = req_proxy
4856 del req.headers['Ytdl-request-proxy']
4857
4858 if proxy == '__noproxy__':
4859 return None # No Proxy
4860 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4861 req.add_header('Ytdl-socks-proxy', proxy)
4862 # yt-dlp's http/https handlers take care of wrapping the socket with SOCKS
4863 return None
4864 return urllib.request.ProxyHandler.proxy_open(
4865 self, req, proxy, type)
4866
4867
4868 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4869 # released into Public Domain
4870 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4871
4872 def long_to_bytes(n, blocksize=0):
4873 """long_to_bytes(n:long, blocksize:int) : string
4874 Convert a long integer to a byte string.
4875
4876 If optional blocksize is given and greater than zero, pad the front of the
4877 byte string with binary zeros so that the length is a multiple of
4878 blocksize.
4879 """
4880 # after much testing, this algorithm was deemed to be the fastest
4881 s = b''
4882 n = int(n)
4883 while n > 0:
4884 s = struct.pack('>I', n & 0xffffffff) + s
4885 n = n >> 32
4886 # strip off leading zeros
4887 for i in range(len(s)):
4888 if s[i] != b'\000'[0]:
4889 break
4890 else:
4891 # only happens when n == 0
4892 s = b'\000'
4893 i = 0
4894 s = s[i:]
4895 # add back some pad bytes. this could be done more efficiently w.r.t. the
4896 # de-padding being done above, but sigh...
4897 if blocksize > 0 and len(s) % blocksize:
4898 s = (blocksize - len(s) % blocksize) * b'\000' + s
4899 return s
4900
4901
4902 def bytes_to_long(s):
4903 """bytes_to_long(string) : long
4904 Convert a byte string to a long integer.
4905
4906 This is (essentially) the inverse of long_to_bytes().
4907 """
4908 acc = 0
4909 length = len(s)
4910 if length % 4:
4911 extra = (4 - length % 4)
4912 s = b'\000' * extra + s
4913 length = length + extra
4914 for i in range(0, length, 4):
4915 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4916 return acc
4917
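# Usage sketch (illustrative): the two functions are inverses of each other:
#   long_to_bytes(256)             # -> b'\x01\x00'
#   long_to_bytes(1, blocksize=4)  # -> b'\x00\x00\x00\x01'
#   bytes_to_long(b'\x01\x00')     # -> 256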
4918
4919 def ohdave_rsa_encrypt(data, exponent, modulus):
4920 '''
4921 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4922
4923 Input:
4924 data: data to encrypt, bytes-like object
4925 exponent, modulus: parameter e and N of RSA algorithm, both integer
4926 Output: hex string of encrypted data
4927
4928 Limitation: supports one block encryption only
4929 '''
4930
4931 payload = int(binascii.hexlify(data[::-1]), 16)
4932 encrypted = pow(payload, exponent, modulus)
4933 return '%x' % encrypted
4934
4935
4936 def pkcs1pad(data, length):
4937 """
4938 Padding input data with PKCS#1 scheme
4939
4940 @param {int[]} data input data
4941 @param {int} length target length
4942 @returns {int[]} padded data
4943 """
4944 if len(data) > length - 11:
4945 raise ValueError('Input data too long for PKCS#1 padding')
4946
4947 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4948 return [0, 2] + pseudo_random + [0] + data
4949
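# Usage sketch (illustrative): the result always has `length` items, framed
# as [0, 2, <random padding>, 0, <data>]:
#   len(pkcs1pad([0x2a], 16))  # -> 16
#   pkcs1pad([0x2a], 16)[:2]   # -> [0, 2]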
4950
4951 def _base_n_table(n, table):
4952 if not table and not n:
4953 raise ValueError('Either table or n must be specified')
4954 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4955
4956 if n and n != len(table):
4957 raise ValueError(f'base {n} exceeds table length {len(table)}')
4958 return table
4959
4960
4961 def encode_base_n(num, n=None, table=None):
4962 """Convert given int to a base-n string"""
4963 table = _base_n_table(n, table)
4964 if not num:
4965 return table[0]
4966
4967 result, base = '', len(table)
4968 while num:
4969 result = table[num % base] + result
4970 num = num // base
4971 return result
4972
4973
4974 def decode_base_n(string, n=None, table=None):
4975 """Convert given base-n string to int"""
4976 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4977 result, base = 0, len(table)
4978 for char in string:
4979 result = result * base + table[char]
4980 return result
4981
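# Usage sketch (illustrative): round-tripping through the default table:
#   encode_base_n(255, 16)   # -> 'ff'
#   decode_base_n('ff', 16)  # -> 255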
4982
4983 def decode_base(value, digits):
4984 deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed '
4985 f'in a future version. Use {__name__}.decode_base_n instead')
4986 return decode_base_n(value, table=digits)
4987
4988
4989 def decode_packed_codes(code):
4990 mobj = re.search(PACKED_CODES_RE, code)
4991 obfuscated_code, base, count, symbols = mobj.groups()
4992 base = int(base)
4993 count = int(count)
4994 symbols = symbols.split('|')
4995 symbol_table = {}
4996
4997 while count:
4998 count -= 1
4999 base_n_count = encode_base_n(count, base)
5000 symbol_table[base_n_count] = symbols[count] or base_n_count
5001
5002 return re.sub(
5003 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
5004 obfuscated_code)
5005
5006
5007 def caesar(s, alphabet, shift):
5008 if shift == 0:
5009 return s
5010 l = len(alphabet)
5011 return ''.join(
5012 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
5013 for c in s)
5014
5015
5016 def rot47(s):
5017 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
5018
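# Usage sketch (illustrative): characters outside the alphabet pass through
# unchanged, and rot47 (a shift of 47 over 94 characters) is its own inverse:
#   caesar('ab c', 'abc', 1)  # -> 'bc a'  (space is left as-is)
#   rot47(rot47('yt-dlp'))    # -> 'yt-dlp'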
5019
5020 def parse_m3u8_attributes(attrib):
5021 info = {}
5022 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
5023 if val.startswith('"'):
5024 val = val[1:-1]
5025 info[key] = val
5026 return info
5027
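# Usage sketch (illustrative), assuming an example EXT-X-STREAM-INF attribute list:
#   parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.64001f"')
#       # -> {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.64001f'}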
5028
5029 def urshift(val, n):
5030 return val >> n if val >= 0 else (val + 0x100000000) >> n
5031
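# Usage sketch (illustrative): emulates an unsigned 32-bit right shift (like
# JavaScript's `>>>`) for values that may be negative:
#   urshift(-1, 28)  # -> 15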
5032
5033 # Based on png2str() written by @gdkchan and improved by @yokrysty
5034 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
5035 def decode_png(png_data):
5036 # Reference: https://www.w3.org/TR/PNG/
5037 header = png_data[8:]
5038
5039 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
5040 raise OSError('Not a valid PNG file.')
5041
5042 int_map = {1: '>B', 2: '>H', 4: '>I'}
5043 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
5044
5045 chunks = []
5046
5047 while header:
5048 length = unpack_integer(header[:4])
5049 header = header[4:]
5050
5051 chunk_type = header[:4]
5052 header = header[4:]
5053
5054 chunk_data = header[:length]
5055 header = header[length:]
5056
5057 header = header[4:] # Skip CRC
5058
5059 chunks.append({
5060 'type': chunk_type,
5061 'length': length,
5062 'data': chunk_data
5063 })
5064
5065 ihdr = chunks[0]['data']
5066
5067 width = unpack_integer(ihdr[:4])
5068 height = unpack_integer(ihdr[4:8])
5069
5070 idat = b''
5071
5072 for chunk in chunks:
5073 if chunk['type'] == b'IDAT':
5074 idat += chunk['data']
5075
5076 if not idat:
5077 raise OSError('Unable to read PNG data.')
5078
5079 decompressed_data = bytearray(zlib.decompress(idat))
5080
5081 stride = width * 3
5082 pixels = []
5083
5084 def _get_pixel(idx):
5085 x = idx % stride
5086 y = idx // stride
5087 return pixels[y][x]
5088
5089 for y in range(height):
5090 basePos = y * (1 + stride)
5091 filter_type = decompressed_data[basePos]
5092
5093 current_row = []
5094
5095 pixels.append(current_row)
5096
5097 for x in range(stride):
5098 color = decompressed_data[1 + basePos + x]
5099 basex = y * stride + x
5100 left = 0
5101 up = 0
5102
5103 if x > 2:
5104 left = _get_pixel(basex - 3)
5105 if y > 0:
5106 up = _get_pixel(basex - stride)
5107
5108 if filter_type == 1: # Sub
5109 color = (color + left) & 0xff
5110 elif filter_type == 2: # Up
5111 color = (color + up) & 0xff
5112 elif filter_type == 3: # Average
5113 color = (color + ((left + up) >> 1)) & 0xff
5114 elif filter_type == 4: # Paeth
5115 a = left
5116 b = up
5117 c = 0
5118
5119 if x > 2 and y > 0:
5120 c = _get_pixel(basex - stride - 3)
5121
5122 p = a + b - c
5123
5124 pa = abs(p - a)
5125 pb = abs(p - b)
5126 pc = abs(p - c)
5127
5128 if pa <= pb and pa <= pc:
5129 color = (color + a) & 0xff
5130 elif pb <= pc:
5131 color = (color + b) & 0xff
5132 else:
5133 color = (color + c) & 0xff
5134
5135 current_row.append(color)
5136
5137 return width, height, pixels
5138
5139
5140 def write_xattr(path, key, value):
5141 # Windows: Write xattrs to NTFS Alternate Data Streams:
5142 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5143 if compat_os_name == 'nt':
5144 assert ':' not in key
5145 assert os.path.exists(path)
5146
5147 try:
5148 with open(f'{path}:{key}', 'wb') as f:
5149 f.write(value)
5150 except OSError as e:
5151 raise XAttrMetadataError(e.errno, e.strerror)
5152 return
5153
5154 # UNIX Method 1. Use xattrs/pyxattrs modules
5155
5156 setxattr = None
5157 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5158 # Unicode arguments are not supported in pyxattr until version 0.5.0
5159 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5160 if version_tuple(xattr.__version__) >= (0, 5, 0):
5161 setxattr = xattr.set
5162 elif xattr:
5163 setxattr = xattr.setxattr
5164
5165 if setxattr:
5166 try:
5167 setxattr(path, key, value)
5168 except OSError as e:
5169 raise XAttrMetadataError(e.errno, e.strerror)
5170 return
5171
5172 # UNIX Method 2. Use setfattr/xattr executables
5173 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5174 else 'xattr' if check_executable('xattr', ['-h']) else None)
5175 if not exe:
5176 raise XAttrUnavailableError(
5177 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5178 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
5179
5180 value = value.decode()
5181 try:
5182 _, stderr, returncode = Popen.run(
5183 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
5184 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
5185 except OSError as e:
5186 raise XAttrMetadataError(e.errno, e.strerror)
5187 if returncode:
5188 raise XAttrMetadataError(returncode, stderr)
5189
5190
5191 def random_birthday(year_field, month_field, day_field):
5192 start_date = datetime.date(1950, 1, 1)
5193 end_date = datetime.date(1995, 12, 31)
5194 offset = random.randint(0, (end_date - start_date).days)
5195 random_date = start_date + datetime.timedelta(offset)
5196 return {
5197 year_field: str(random_date.year),
5198 month_field: str(random_date.month),
5199 day_field: str(random_date.day),
5200 }
5201
5202
5203 # Templates for internet shortcut files, which are plain text files.
5204 DOT_URL_LINK_TEMPLATE = '''\
5205 [InternetShortcut]
5206 URL=%(url)s
5207 '''
5208
5209 DOT_WEBLOC_LINK_TEMPLATE = '''\
5210 <?xml version="1.0" encoding="UTF-8"?>
5211 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5212 <plist version="1.0">
5213 <dict>
5214 \t<key>URL</key>
5215 \t<string>%(url)s</string>
5216 </dict>
5217 </plist>
5218 '''
5219
5220 DOT_DESKTOP_LINK_TEMPLATE = '''\
5221 [Desktop Entry]
5222 Encoding=UTF-8
5223 Name=%(filename)s
5224 Type=Link
5225 URL=%(url)s
5226 Icon=text-html
5227 '''
5228
5229 LINK_TEMPLATES = {
5230 'url': DOT_URL_LINK_TEMPLATE,
5231 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5232 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5233 }
5234
5235
5236 def iri_to_uri(iri):
5237 """
5238 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5239
5240 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes any not-yet-escaped characters using an underlying UTF-8 encoding, leaving existing escapes intact.
5241 """
5242
5243 iri_parts = urllib.parse.urlparse(iri)
5244
5245 if '[' in iri_parts.netloc:
5246 raise ValueError('IPv6 URIs are not yet supported.')
5247 # Querying `.netloc` also raises a ValueError when there is only one bracket.
5248
5249 # The `safe` argument values that the following code uses contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5250
5251 net_location = ''
5252 if iri_parts.username:
5253 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5254 if iri_parts.password is not None:
5255 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5256 net_location += '@'
5257
5258 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5259 # The 'idna' encoding produces ASCII text.
5260 if iri_parts.port is not None and iri_parts.port != 80:
5261 net_location += ':' + str(iri_parts.port)
5262
5263 return urllib.parse.urlunparse(
5264 (iri_parts.scheme,
5265 net_location,
5266
5267 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5268
5269 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5270 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5271
5272 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5273 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5274
5275 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5276
5277 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5278
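# Usage sketch (illustrative, with a hypothetical path and query):
#   iri_to_uri('https://example.com/fö?bär=1')
#       # -> 'https://example.com/f%C3%B6?b%C3%A4r=1'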
5279
5280 def to_high_limit_path(path):
5281 if sys.platform in ['win32', 'cygwin']:
5282 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5283 return '\\\\?\\' + os.path.abspath(path)
5284
5285 return path
5286
5287
5288 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5289 val = traverse_obj(obj, *variadic(field))
5290 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5291 return default
5292 return template % func(val)
5293
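# Usage sketch (illustrative):
#   format_field({'width': 1280}, 'width', '%spx')              # -> '1280px'
#   format_field({}, 'width', '%spx', default='unknown width')  # -> 'unknown width'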
5294
5295 def clean_podcast_url(url):
5296 return re.sub(r'''(?x)
5297 (?:
5298 (?:
5299 chtbl\.com/track|
5300 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5301 play\.podtrac\.com
5302 )/[^/]+|
5303 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5304 flex\.acast\.com|
5305 pd(?:
5306 cn\.co| # https://podcorn.com/analytics-prefix/
5307 st\.fm # https://podsights.com/docs/
5308 )/e
5309 )/''', '', url)
5310
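# Usage sketch (illustrative, with a hypothetical tracking prefix and host):
#   clean_podcast_url('https://chtbl.com/track/12345/traffic.example.com/ep1.mp3')
#       # -> 'https://traffic.example.com/ep1.mp3'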
5311
5312 _HEX_TABLE = '0123456789abcdef'
5313
5314
5315 def random_uuidv4():
5316 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5317
5318
5319 def make_dir(path, to_screen=None):
5320 try:
5321 dn = os.path.dirname(path)
5322 if dn and not os.path.exists(dn):
5323 os.makedirs(dn)
5324 return True
5325 except OSError as err:
5326 if callable(to_screen):  # `callable()` returns a bool, which is never None
5327 to_screen('unable to create directory ' + error_to_compat_str(err))
5328 return False
5329
5330
5331 def get_executable_path():
5332 from .update import _get_variant_and_executable_path
5333
5334 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5335
5336
5337 def load_plugins(name, suffix, namespace):
5338 classes = {}
5339 with contextlib.suppress(FileNotFoundError):
5340 plugins_spec = importlib.util.spec_from_file_location(
5341 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5342 plugins = importlib.util.module_from_spec(plugins_spec)
5343 sys.modules[plugins_spec.name] = plugins
5344 plugins_spec.loader.exec_module(plugins)
5345 for name in dir(plugins):
5346 if name in namespace:
5347 continue
5348 if not name.endswith(suffix):
5349 continue
5350 klass = getattr(plugins, name)
5351 classes[name] = namespace[name] = klass
5352 return classes
5353
5354
5355 def traverse_obj(
5356 obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
5357 casesense=True, is_user_input=False, traverse_string=False):
5358 """
5359 Safely traverse nested `dict`s and `Sequence`s
5360
5361 >>> obj = [{}, {"key": "value"}]
5362 >>> traverse_obj(obj, (1, "key"))
5363 "value"
5364
5365 Each of the provided `paths` is tested and the first producing a valid result will be returned.
5366 The next path will also be tested if the path branched but no results could be found.
5367 Supported values for traversal are `Mapping`, `Sequence` and `re.Match`.
5368 A value of None is treated as the absence of a value.
5369
5370 The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.
5371
5372 The keys in the path can be one of:
5373 - `None`: Return the current object.
5374 - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`.
5375 - `slice`: Branch out and return all values in `obj[key]`.
5376 - `Ellipsis`: Branch out and return a list of all values.
5377 - `tuple`/`list`: Branch out and return a list of all matching values.
5378 Read as: `[traverse_obj(obj, branch) for branch in branches]`.
5379 - `function`: Branch out and return values filtered by the function.
5380 Read as: `[value for key, value in obj if function(key, value)]`.
5381 For `Sequence`s, `key` is the index of the value.
5382 - `dict`: Transform the current object and return a matching dict.
5383 Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.
5384
5385 `tuple`, `list`, and `dict` all support nested paths and branches.
5386
5387 @params paths Paths which to traverse by.
5388 @param default Value to return if the paths do not match.
5389 @param expected_type If a `type`, only accept final values of this type.
5390 If any other callable, try to call the function on each result.
5391 @param get_all If `False`, return the first matching result, otherwise all matching ones.
5392 @param casesense If `False`, consider string dictionary keys as case insensitive.
5393
5394 The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API
5395
5396 @param is_user_input Whether the keys are generated from user input.
5397 If `True` strings get converted to `int`/`slice` if needed.
5398 @param traverse_string Whether to traverse into objects as strings.
5399 If `True`, any non-compatible object will first be
5400 converted into a string and then traversed into.
5401
5402
5403 @returns The result of the object traversal.
5404 If successful, `get_all=True`, and the path branches at least once,
5405 then a list of results is returned instead.
5406 A list is always returned if the last path branches and no `default` is given.
5407 """
5408 is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes))
5409 casefold = lambda k: k.casefold() if isinstance(k, str) else k
5410
5411 if isinstance(expected_type, type):
5412 type_test = lambda val: val if isinstance(val, expected_type) else None
5413 else:
5414 type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,))
5415
5416 def apply_key(key, obj):
5417 if obj is None:
5418 return
5419
5420 elif key is None:
5421 yield obj
5422
5423 elif isinstance(key, (list, tuple)):
5424 for branch in key:
5425 _, result = apply_path(obj, branch)
5426 yield from result
5427
5428 elif key is ...:
5429 if isinstance(obj, collections.abc.Mapping):
5430 yield from obj.values()
5431 elif is_sequence(obj):
5432 yield from obj
5433 elif isinstance(obj, re.Match):
5434 yield from obj.groups()
5435 elif traverse_string:
5436 yield from str(obj)
5437
5438 elif callable(key):
5439 if is_sequence(obj):
5440 iter_obj = enumerate(obj)
5441 elif isinstance(obj, collections.abc.Mapping):
5442 iter_obj = obj.items()
5443 elif isinstance(obj, re.Match):
5444 iter_obj = enumerate((obj.group(), *obj.groups()))
5445 elif traverse_string:
5446 iter_obj = enumerate(str(obj))
5447 else:
5448 return
5449 yield from (v for k, v in iter_obj if try_call(key, args=(k, v)))
5450
5451 elif isinstance(key, dict):
5452 iter_obj = ((k, _traverse_obj(obj, v)) for k, v in key.items())
5453 yield {k: v if v is not None else default for k, v in iter_obj
5454 if v is not None or default is not NO_DEFAULT}
5455
5456 elif isinstance(obj, collections.abc.Mapping):
5457 yield (obj.get(key) if casesense or (key in obj)
5458 else next((v for k, v in obj.items() if casefold(k) == key), None))
5459
5460 elif isinstance(obj, re.Match):
5461 if isinstance(key, int) or casesense:
5462 with contextlib.suppress(IndexError):
5463 yield obj.group(key)
5464 return
5465
5466 if not isinstance(key, str):
5467 return
5468
5469 yield next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)
5470
5471 else:
5472 if is_user_input:
5473 key = (int_or_none(key) if ':' not in key
5474 else slice(*map(int_or_none, key.split(':'))))
5475
5476 if not isinstance(key, (int, slice)):
5477 return
5478
5479 if not is_sequence(obj):
5480 if not traverse_string:
5481 return
5482 obj = str(obj)
5483
5484 with contextlib.suppress(IndexError):
5485 yield obj[key]
5486
5487 def apply_path(start_obj, path):
5488 objs = (start_obj,)
5489 has_branched = False
5490
5491 for key in variadic(path):
5492 if is_user_input and key == ':':
5493 key = ...
5494
5495 if not casesense and isinstance(key, str):
5496 key = key.casefold()
5497
5498 if key is ... or isinstance(key, (list, tuple)) or callable(key):
5499 has_branched = True
5500
5501 key_func = functools.partial(apply_key, key)
5502 objs = itertools.chain.from_iterable(map(key_func, objs))
5503
5504 return has_branched, objs
5505
5506 def _traverse_obj(obj, path, use_list=True):
5507 has_branched, results = apply_path(obj, path)
5508 results = LazyList(x for x in map(type_test, results) if x is not None)
5509
5510 if get_all and has_branched:
5511 return results.exhaust() if results or use_list else None
5512
5513 return results[0] if results else None
5514
5515 for index, path in enumerate(paths, 1):
5516 use_list = default is NO_DEFAULT and index == len(paths)
5517 result = _traverse_obj(obj, path, use_list)
5518 if result is not None:
5519 return result
5520
5521 return None if default is NO_DEFAULT else default
5522
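# A few doctest-style illustrations of the path syntax documented above:
#   >>> obj = {'a': [{'b': 1}, {'b': 2}], 'C': 3}
#   >>> traverse_obj(obj, ('a', 0, 'b'))
#   1
#   >>> traverse_obj(obj, ('a', ..., 'b'))  # `...` branches over all items
#   [1, 2]
#   >>> traverse_obj(obj, 'c', casesense=False)
#   3
#   >>> traverse_obj(obj, {'first': ('a', 0, 'b'), 'all': ('a', ..., 'b')})
#   {'first': 1, 'all': [1, 2]}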
5523
5524 def traverse_dict(dictn, keys, casesense=True):
5525 deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed '
5526 f'in a future version. Use "{__name__}.traverse_obj" instead')
5527 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5528
5529
5530 def get_first(obj, keys, **kwargs):
5531 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5532
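# e.g. branch over a list of API responses and keep the first non-None value:
#   >>> get_first([{'id': None}, {'id': 42}], 'id')
#   42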
5533
5534 def time_seconds(**kwargs):
5535 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5536 return t.timestamp()
5537
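# Note: the timezone built from **kwargs only selects the clock to read; the
# returned Unix timestamp is timezone-independent, so e.g.
#   >>> abs(time_seconds(hours=9) - time.time()) < 1  # doctest: +SKIP
#   True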
5538
5539 # Create a JSON Web Signature (JWS) with HS256 algorithm;
5540 # the resulting format is JWS Compact Serialization.
5541 # Implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5542 # and JWS https://www.rfc-editor.org/rfc/rfc7515.html
5543 def jwt_encode_hs256(payload_data, key, headers={}):
5544 header_data = {
5545 'alg': 'HS256',
5546 'typ': 'JWT',
5547 }
5548 if headers:
5549 header_data.update(headers)
5550 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5551 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5552 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5553 signature_b64 = base64.b64encode(h.digest())
5554 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5555 return token
5556
5557
5558 # Can be extended in the future to verify the signature, parse the header and return the algorithm used if it's not HS256
5559 def jwt_decode_hs256(jwt):
5560 header_b64, payload_b64, signature_b64 = jwt.split('.')
5561 # add trailing ='s that may have been stripped, superfluous ='s are ignored
5562 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
5563 return payload_data
5564
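# Round-trip sketch (doctest-style; as noted above, the signature is not verified):
#   >>> token = jwt_encode_hs256({'user': 'test'}, 'secret')
#   >>> jwt_decode_hs256(token.decode())
#   {'user': 'test'}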
5565
5566 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5567
5568
5569 @functools.cache
5570 def supports_terminal_sequences(stream):
5571 if compat_os_name == 'nt':
5572 if not WINDOWS_VT_MODE:
5573 return False
5574 elif not os.getenv('TERM'):
5575 return False
5576 try:
5577 return stream.isatty()
5578 except BaseException:
5579 return False
5580
5581
5582 def windows_enable_vt_mode():
5583 """Ref: https://bugs.python.org/issue30075 """
5584 if get_windows_version() < (10, 0, 10586):
5585 return
5586
5587 import ctypes
5588 import ctypes.wintypes
5589 import msvcrt
5590
5591 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
5592
5593 dll = ctypes.WinDLL('kernel32', use_last_error=False)
5594 handle = os.open('CONOUT$', os.O_RDWR)
5595
5596 try:
5597 h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
5598 dw_original_mode = ctypes.wintypes.DWORD()
5599 success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
5600 if not success:
5601 raise Exception('GetConsoleMode failed')
5602
5603 success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
5604 dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
5605 if not success:
5606 raise Exception('SetConsoleMode failed')
5607 except Exception as e:
5608 write_string(f'WARNING: Cannot enable VT mode - {e}')
5609 else:
5610 global WINDOWS_VT_MODE
5611 WINDOWS_VT_MODE = True
5612 supports_terminal_sequences.cache_clear()
5613 finally:
5614 os.close(handle)
5615
5616
5617 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5618
5619
5620 def remove_terminal_sequences(string):
5621 return _terminal_sequences_re.sub('', string)
5622
5623
5624 def number_of_digits(number):
5625 return len('%d' % number)
5626
5627
5628 def join_nonempty(*values, delim='-', from_dict=None):
5629 if from_dict is not None:
5630 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5631 return delim.join(map(str, filter(None, values)))
5632
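# e.g. falsy values are skipped and the rest stringified:
#   >>> join_nonempty('mp4', None, 1080, '', delim='-')
#   'mp4-1080'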
5633
5634 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5635 """
5636 Find the largest format dimensions in terms of video width and, for each thumbnail:
5637 * Modify the URL: Match the width with the provided regex and replace it with the largest format width
5638 * Update dimensions
5639
5640 This function is useful with video services that scale the provided thumbnails on demand
5641 """
5642 _keys = ('width', 'height')
5643 max_dimensions = max(
5644 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5645 default=(0, 0))
5646 if not max_dimensions[0]:
5647 return thumbnails
5648 return [
5649 merge_dicts(
5650 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5651 dict(zip(_keys, max_dimensions)), thumbnail)
5652 for thumbnail in thumbnails
5653 ]
5654
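# e.g. with a hypothetical service whose thumbnail width follows an underscore:
#   >>> scale_thumbnails_to_max_format_width(
#   ...     [{'width': 1280, 'height': 720}],
#   ...     [{'url': 'https://example.com/thumb_320.jpg'}], r'(?<=_)\d+')
#   [{'url': 'https://example.com/thumb_1280.jpg', 'width': 1280, 'height': 720}]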
5655
5656 def parse_http_range(range):
5657 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5658 if not range:
5659 return None, None, None
5660 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5661 if not crg:
5662 return None, None, None
5663 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5664
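# e.g.:
#   >>> parse_http_range('bytes 200-1023/2048')
#   (200, 1023, 2048)
#   >>> parse_http_range('bytes=500-')
#   (500, None, None)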
5665
5666 def read_stdin(what):
5667 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5668 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5669 return sys.stdin
5670
5671
5672 def determine_file_encoding(data):
5673 """
5674 Detect the text encoding used
5675 @returns (encoding, bytes to skip)
5676 """
5677
5678 # BOM marks are given priority over declarations
5679 for bom, enc in BOMS:
5680 if data.startswith(bom):
5681 return enc, len(bom)
5682
5683 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5684 # We ignore the endianness to get a good enough match
5685 data = data.replace(b'\0', b'')
5686 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5687 return mobj.group(1).decode() if mobj else None, 0
5688
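# e.g. a BOM takes priority over an in-file coding declaration (assuming the
# module-level BOMS table, which maps b'\xef\xbb\xbf' to 'utf-8'):
#   >>> determine_file_encoding(b'\xef\xbb\xbf# coding: latin-1\n')  # doctest: +SKIP
#   ('utf-8', 3)
#   >>> determine_file_encoding(b'# coding: utf-8\n')  # doctest: +SKIP
#   ('utf-8', 0)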
5689
5690 class Config:
5691 own_args = None
5692 parsed_args = None
5693 filename = None
5694 __initialized = False
5695
5696 def __init__(self, parser, label=None):
5697 self.parser, self.label = parser, label
5698 self._loaded_paths, self.configs = set(), []
5699
5700 def init(self, args=None, filename=None):
5701 assert not self.__initialized
5702 self.own_args, self.filename = args, filename
5703 return self.load_configs()
5704
5705 def load_configs(self):
5706 directory = ''
5707 if self.filename:
5708 location = os.path.realpath(self.filename)
5709 directory = os.path.dirname(location)
5710 if location in self._loaded_paths:
5711 return False
5712 self._loaded_paths.add(location)
5713
5714 self.__initialized = True
5715 opts, _ = self.parser.parse_known_args(self.own_args)
5716 self.parsed_args = self.own_args
5717 for location in opts.config_locations or []:
5718 if location == '-':
5719 if location in self._loaded_paths:
5720 continue
5721 self._loaded_paths.add(location)
5722 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5723 continue
5724 location = os.path.join(directory, expand_path(location))
5725 if os.path.isdir(location):
5726 location = os.path.join(location, 'yt-dlp.conf')
5727 if not os.path.exists(location):
5728 self.parser.error(f'config location {location} does not exist')
5729 self.append_config(self.read_file(location), location)
5730 return True
5731
5732 def __str__(self):
5733 label = join_nonempty(
5734 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5735 delim=' ')
5736 return join_nonempty(
5737 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5738 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5739 delim='\n')
5740
5741 @staticmethod
5742 def read_file(filename, default=[]):
5743 try:
5744 optionf = open(filename, 'rb')
5745 except OSError:
5746 return default # silently skip if file is not present
5747 try:
5748 enc, skip = determine_file_encoding(optionf.read(512))
5749 optionf.seek(skip, io.SEEK_SET)
5750 except OSError:
5751 enc = None # silently skip read errors
5752 try:
5753 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5754 contents = optionf.read().decode(enc or preferredencoding())
5755 res = shlex.split(contents, comments=True)
5756 except Exception as err:
5757 raise ValueError(f'Unable to parse "{filename}": {err}')
5758 finally:
5759 optionf.close()
5760 return res
5761
5762 @staticmethod
5763 def hide_login_info(opts):
5764 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5765 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5766
5767 def _scrub_eq(o):
5768 m = eqre.match(o)
5769 if m:
5770 return m.group('key') + '=PRIVATE'
5771 else:
5772 return o
5773
5774 opts = list(map(_scrub_eq, opts))
5775 for idx, opt in enumerate(opts):
5776 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5777 opts[idx + 1] = 'PRIVATE'
5778 return opts
5779
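# e.g. both the `--opt value` and `--opt=value` spellings are scrubbed:
#   >>> Config.hide_login_info(['-u', 'foo', '--password=bar', '-v'])
#   ['-u', 'PRIVATE', '--password=PRIVATE', '-v']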
5780 def append_config(self, *args, label=None):
5781 config = type(self)(self.parser, label)
5782 config._loaded_paths = self._loaded_paths
5783 if config.init(*args):
5784 self.configs.append(config)
5785
5786 @property
5787 def all_args(self):
5788 for config in reversed(self.configs):
5789 yield from config.all_args
5790 yield from self.parsed_args or []
5791
5792 def parse_known_args(self, **kwargs):
5793 return self.parser.parse_known_args(self.all_args, **kwargs)
5794
5795 def parse_args(self):
5796 return self.parser.parse_args(self.all_args)
5797
5798
5799 class WebSocketsWrapper:
5800 """Wraps websockets module to use in non-async scopes"""
5801 pool = None
5802
5803 def __init__(self, url, headers=None, connect=True):
5804 self.loop = asyncio.new_event_loop()
5805 # XXX: "loop" is deprecated
5806 self.conn = websockets.connect(
5807 url, extra_headers=headers, ping_interval=None,
5808 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5809 if connect:
5810 self.__enter__()
5811 atexit.register(self.__exit__, None, None, None)
5812
5813 def __enter__(self):
5814 if not self.pool:
5815 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5816 return self
5817
5818 def send(self, *args):
5819 self.run_with_loop(self.pool.send(*args), self.loop)
5820
5821 def recv(self, *args):
5822 return self.run_with_loop(self.pool.recv(*args), self.loop)
5823
5824 def __exit__(self, type, value, traceback):
5825 try:
5826 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5827 finally:
5828 self._cancel_all_tasks(self.loop)  # must run before the loop is closed
5829 self.loop.close()
5830
5831 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5832 # for contributors: If any new library using asyncio needs to be run in non-async code, move these functions out of this class
5833 @staticmethod
5834 def run_with_loop(main, loop):
5835 if not asyncio.iscoroutine(main):
5836 raise ValueError(f'a coroutine was expected, got {main!r}')
5837
5838 try:
5839 return loop.run_until_complete(main)
5840 finally:
5841 loop.run_until_complete(loop.shutdown_asyncgens())
5842 if hasattr(loop, 'shutdown_default_executor'):
5843 loop.run_until_complete(loop.shutdown_default_executor())
5844
5845 @staticmethod
5846 def _cancel_all_tasks(loop):
5847 to_cancel = asyncio.all_tasks(loop)
5848
5849 if not to_cancel:
5850 return
5851
5852 for task in to_cancel:
5853 task.cancel()
5854
5855 # XXX: "loop" is removed in python 3.10+
5856 loop.run_until_complete(
5857 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5858
5859 for task in to_cancel:
5860 if task.cancelled():
5861 continue
5862 if task.exception() is not None:
5863 loop.call_exception_handler({
5864 'message': 'unhandled exception during asyncio.run() shutdown',
5865 'exception': task.exception(),
5866 'task': task,
5867 })
5868
5869
5870 def merge_headers(*dicts):
5871 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5872 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
5873
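# e.g.:
#   >>> merge_headers({'accept': '*/*', 'x-forwarded-for': 'a'}, {'Accept': 'text/html'})
#   {'Accept': 'text/html', 'X-Forwarded-For': 'a'}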
5874
5875 def cached_method(f):
5876 """Cache a method"""
5877 signature = inspect.signature(f)
5878
5879 @functools.wraps(f)
5880 def wrapper(self, *args, **kwargs):
5881 bound_args = signature.bind(self, *args, **kwargs)
5882 bound_args.apply_defaults()
5883 key = tuple(bound_args.arguments.values())[1:]
5884
5885 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
5886 if key not in cache:
5887 cache[key] = f(self, *args, **kwargs)
5888 return cache[key]
5889 return wrapper
5890
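# Minimal illustration with a hypothetical class:
#   >>> class Fetcher:
#   ...     @cached_method
#   ...     def fetch(self, url):
#   ...         print('fetching', url)
#   ...         return url.upper()
#   >>> f = Fetcher()
#   >>> f.fetch('a')
#   fetching a
#   'A'
#   >>> f.fetch('a')  # served from the per-instance cache, so no print
#   'A'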
5891
5892 class classproperty:
5893 """property access for class methods with optional caching"""
5894 def __new__(cls, func=None, *args, **kwargs):
5895 if not func:
5896 return functools.partial(cls, *args, **kwargs)
5897 return super().__new__(cls)
5898
5899 def __init__(self, func, *, cache=False):
5900 functools.update_wrapper(self, func)
5901 self.func = func
5902 self._cache = {} if cache else None
5903
5904 def __get__(self, _, cls):
5905 if self._cache is None:
5906 return self.func(cls)
5907 elif cls not in self._cache:
5908 self._cache[cls] = self.func(cls)
5909 return self._cache[cls]
5910
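# e.g. (hypothetical class):
#   >>> class MyIE:
#   ...     @classproperty(cache=True)
#   ...     def ie_key(cls):
#   ...         return cls.__name__.lower()
#   >>> MyIE.ie_key  # computed once per class, then served from the cache
#   'myie'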
5911
5912 class Namespace(types.SimpleNamespace):
5913 """Immutable namespace"""
5914
5915 def __iter__(self):
5916 return iter(self.__dict__.values())
5917
5918 @property
5919 def items_(self):
5920 return self.__dict__.items()
5921
5922
5923 MEDIA_EXTENSIONS = Namespace(
5924 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5925 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5926 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5927 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
5928 thumbnails=('jpg', 'png', 'webp'),
5929 storyboards=('mhtml', ),
5930 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5931 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5932 )
5933 MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5934 MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5935
5936 KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5937
5938
5939 class RetryManager:
5940 """Usage:
5941 for retry in RetryManager(...):
5942 try:
5943 ...
5944 except SomeException as err:
5945 retry.error = err
5946 continue
5947 """
5948 attempt, _error = 0, None
5949
5950 def __init__(self, _retries, _error_callback, **kwargs):
5951 self.retries = _retries or 0
5952 self.error_callback = functools.partial(_error_callback, **kwargs)
5953
5954 def _should_retry(self):
5955 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5956
5957 @property
5958 def error(self):
5959 if self._error is NO_DEFAULT:
5960 return None
5961 return self._error
5962
5963 @error.setter
5964 def error(self, value):
5965 self._error = value
5966
5967 def __iter__(self):
5968 while self._should_retry():
5969 self.error = NO_DEFAULT
5970 self.attempt += 1
5971 yield self
5972 if self.error:
5973 self.error_callback(self.error, self.attempt, self.retries)
5974
5975 @staticmethod
5976 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5977 """Utility function for reporting retries"""
5978 if count > retries:
5979 if error:
5980 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5981 raise e
5982
5983 if not count:
5984 return warn(e)
5985 elif isinstance(e, ExtractorError):
5986 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
5987 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5988
5989 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5990 if delay:
5991 info(f'Sleeping {delay:.2f} seconds ...')
5992 time.sleep(delay)
5993
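# Sketch of the full pattern with a hypothetical error callback:
#   >>> def _report(err, count, retries):
#   ...     print(f'attempt {count}/{retries}: {err}')
#   >>> for retry in RetryManager(2, _report):
#   ...     try:
#   ...         raise OSError('boom')
#   ...     except OSError as err:
#   ...         retry.error = err
#   attempt 1/2: boom
#   attempt 2/2: boom
#   attempt 3/2: boom
# i.e. `retries` counts the *re*-tries, so three attempts are made in total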
5994
5995 def make_archive_id(ie, video_id):
5996 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5997 return f'{ie_key.lower()} {video_id}'
5998
5999
6000 def truncate_string(s, left, right=0):
6001 assert left > 3 and right >= 0
6002 if s is None or len(s) <= left + right:
6003 return s
6004 return f'{s[:left-3]}...{s[-right:] if right else ""}'
6005
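# e.g.:
#   >>> truncate_string('abcdefghijklmnop', 10, 3)
#   'abcdefg...nop'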
6006
6007 def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
6008 assert 'all' in alias_dict, '"all" alias is required'
6009 requested = list(start or [])
6010 for val in options:
6011 discard = val.startswith('-')
6012 if discard:
6013 val = val[1:]
6014
6015 if val in alias_dict:
6016 val = alias_dict[val] if not discard else [
6017 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
6018 # NB: Do not allow regex in aliases for performance
6019 requested = orderedSet_from_options(val, alias_dict, start=requested)
6020 continue
6021
6022 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
6023 else [val] if val in alias_dict['all'] else None)
6024 if current is None:
6025 raise ValueError(val)
6026
6027 if discard:
6028 for item in current:
6029 while item in requested:
6030 requested.remove(item)
6031 else:
6032 requested.extend(current)
6033
6034 return orderedSet(requested)
6035
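# e.g. with a hypothetical alias table; '-'-prefixed values are discarded:
#   >>> aliases = {'all': ['a', 'b', 'c'], 'most': ['a', 'b']}
#   >>> orderedSet_from_options(['most', '-a', 'c'], aliases)
#   ['b', 'c']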
6036
6037 class FormatSorter:
6038 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
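# The sort-string syntax captured by the regex above, illustratively:
#   'res'        plain field
#   '+size'      reverse: prefer the smallest value
#   'res:1080'   limit: prefer values up to (and including) the limit
#   'br~1000'    closest: prefer the value nearest to the limit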
6039
6040 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
6041 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
6042 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
6043 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
6044 'height', 'width', 'proto', 'vext', 'abr', 'aext',
6045 'fps', 'fs_approx', 'source', 'id')
6046
6047 settings = {
6048 'vcodec': {'type': 'ordered', 'regex': True,
6049 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
6050 'acodec': {'type': 'ordered', 'regex': True,
6051 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
6052 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
6053 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
6054 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
6055 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
6056 'vext': {'type': 'ordered', 'field': 'video_ext',
6057 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
6058 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
6059 'aext': {'type': 'ordered', 'field': 'audio_ext',
6060 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
6061 'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')},
6062 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
6063 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
6064 'field': ('vcodec', 'acodec'),
6065 'function': lambda it: int(any(v != 'none' for v in it))},
6066 'ie_pref': {'priority': True, 'type': 'extractor'},
6067 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
6068 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
6069 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
6070 'quality': {'convert': 'float', 'default': -1},
6071 'filesize': {'convert': 'bytes'},
6072 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
6073 'id': {'convert': 'string', 'field': 'format_id'},
6074 'height': {'convert': 'float_none'},
6075 'width': {'convert': 'float_none'},
6076 'fps': {'convert': 'float_none'},
6077 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
6078 'tbr': {'convert': 'float_none'},
6079 'vbr': {'convert': 'float_none'},
6080 'abr': {'convert': 'float_none'},
6081 'asr': {'convert': 'float_none'},
6082 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
6083
6084 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
6085 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
6086 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
6087 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
6088 'res': {'type': 'multiple', 'field': ('height', 'width'),
6089 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
6090
6091 # Actual field names
6092 'format_id': {'type': 'alias', 'field': 'id'},
6093 'preference': {'type': 'alias', 'field': 'ie_pref'},
6094 'language_preference': {'type': 'alias', 'field': 'lang'},
6095 'source_preference': {'type': 'alias', 'field': 'source'},
6096 'protocol': {'type': 'alias', 'field': 'proto'},
6097 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
6098 'audio_channels': {'type': 'alias', 'field': 'channels'},
6099
6100 # Deprecated
6101 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
6102 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
6103 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
6104 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
6105 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
6106 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
6107 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
6108 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
6109 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
6110 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
6111 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
6112 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
6113 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
6114 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
6115 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6116 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6117 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6118 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6119 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6120 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6121 }
6122
6123 def __init__(self, ydl, field_preference):
6124 self.ydl = ydl
6125 self._order = []
6126 self.evaluate_params(self.ydl.params, field_preference)
6127 if ydl.params.get('verbose'):
6128 self.print_verbose_info(self.ydl.write_debug)
6129
6130 def _get_field_setting(self, field, key):
6131 if field not in self.settings:
6132 if key in ('forced', 'priority'):
6133 return False
6134 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
6135 'deprecated and may be removed in a future version')
6136 self.settings[field] = {}
6137 propObj = self.settings[field]
6138 if key not in propObj:
6139 type = propObj.get('type')
6140 if key == 'field':
6141 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
6142 elif key == 'convert':
6143 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
6144 else:
6145 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
6146 propObj[key] = default
6147 return propObj[key]
6148
6149 def _resolve_field_value(self, field, value, convertNone=False):
6150 if value is None:
6151 if not convertNone:
6152 return None
6153 else:
6154 value = value.lower()
6155 conversion = self._get_field_setting(field, 'convert')
6156 if conversion == 'ignore':
6157 return None
6158 if conversion == 'string':
6159 return value
6160 elif conversion == 'float_none':
6161 return float_or_none(value)
6162 elif conversion == 'bytes':
6163 return parse_bytes(value)
6164 elif conversion == 'order':
6165 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
6166 use_regex = self._get_field_setting(field, 'regex')
6167 list_length = len(order_list)
6168 empty_pos = order_list.index('') if '' in order_list else list_length + 1
6169 if use_regex and value is not None:
6170 for i, regex in enumerate(order_list):
6171 if regex and re.match(regex, value):
6172 return list_length - i
6173 return list_length - empty_pos # not in list
6174 else: # not regex or value = None
6175 return list_length - (order_list.index(value) if value in order_list else empty_pos)
6176 else:
6177 if value.isnumeric():
6178 return float(value)
6179 else:
6180 self.settings[field]['convert'] = 'string'
6181 return value
6182
6183 def evaluate_params(self, params, sort_extractor):
6184 self._use_free_order = params.get('prefer_free_formats', False)
6185 self._sort_user = params.get('format_sort', [])
6186 self._sort_extractor = sort_extractor
6187
6188 def add_item(field, reverse, closest, limit_text):
6189 field = field.lower()
6190 if field in self._order:
6191 return
6192 self._order.append(field)
6193 limit = self._resolve_field_value(field, limit_text)
6194 data = {
6195 'reverse': reverse,
6196 'closest': False if limit is None else closest,
6197 'limit_text': limit_text,
6198 'limit': limit}
6199 if field in self.settings:
6200 self.settings[field].update(data)
6201 else:
6202 self.settings[field] = data
6203
6204 sort_list = (
6205 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
6206 + (tuple() if params.get('format_sort_force', False)
6207 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
6208 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
6209
6210 for item in sort_list:
6211 match = re.match(self.regex, item)
6212 if match is None:
6213 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
6214 field = match.group('field')
6215 if field is None:
6216 continue
6217 if self._get_field_setting(field, 'type') == 'alias':
6218 alias, field = field, self._get_field_setting(field, 'field')
6219 if self._get_field_setting(alias, 'deprecated'):
6220 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
6221 f'be removed in a future version. Please use {field} instead')
6222 reverse = match.group('reverse') is not None
6223 closest = match.group('separator') == '~'
6224 limit_text = match.group('limit')
6225
6226 has_limit = limit_text is not None
6227 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
6228 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
6229
6230 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
6231 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
6232 limit_count = len(limits)
6233 for (i, f) in enumerate(fields):
6234 add_item(f, reverse, closest,
6235 limits[i] if i < limit_count
6236 else limits[0] if has_limit and not has_multiple_limits
6237 else None)
6238
6239 def print_verbose_info(self, write_debug):
6240 if self._sort_user:
6241 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
6242 if self._sort_extractor:
6243 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
6244 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
6245 '+' if self._get_field_setting(field, 'reverse') else '', field,
6246 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
6247 self._get_field_setting(field, 'limit_text'),
6248 self._get_field_setting(field, 'limit'))
6249 if self._get_field_setting(field, 'limit_text') is not None else '')
6250 for field in self._order if self._get_field_setting(field, 'visible')]))
6251
6252 def _calculate_field_preference_from_value(self, format, field, type, value):
6253 reverse = self._get_field_setting(field, 'reverse')
6254 closest = self._get_field_setting(field, 'closest')
6255 limit = self._get_field_setting(field, 'limit')
6256
6257 if type == 'extractor':
6258 maximum = self._get_field_setting(field, 'max')
6259 if value is None or (maximum is not None and value >= maximum):
6260 value = -1
6261 elif type == 'boolean':
6262 in_list = self._get_field_setting(field, 'in_list')
6263 not_in_list = self._get_field_setting(field, 'not_in_list')
6264 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
6265 elif type == 'ordered':
6266 value = self._resolve_field_value(field, value, True)
6267
6268 # try to convert to number
6269 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
6270 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
6271 if is_num:
6272 value = val_num
6273
6274 return ((-10, 0) if value is None
6275 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
6276 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
6277 else (0, value, 0) if not reverse and (limit is None or value <= limit)
6278 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
6279 else (-1, value, 0))
6280
6281 def _calculate_field_preference(self, format, field):
6282 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
6283 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
6284 if type == 'multiple':
6285 type = 'field' # Only 'field' is allowed in multiple for now
6286 actual_fields = self._get_field_setting(field, 'field')
6287
6288 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
6289 else:
6290 value = get_value(field)
6291 return self._calculate_field_preference_from_value(format, field, type, value)
6292
6293 def calculate_preference(self, format):
6294 # Determine missing protocol
6295 if not format.get('protocol'):
6296 format['protocol'] = determine_protocol(format)
6297
6298 # Determine missing ext
6299 if not format.get('ext') and 'url' in format:
6300 format['ext'] = determine_ext(format['url'])
6301 if format.get('vcodec') == 'none':
6302 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
6303 format['video_ext'] = 'none'
6304 else:
6305 format['video_ext'] = format['ext']
6306 format['audio_ext'] = 'none'
6307 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
6308 # format['preference'] = -1000
6309
6310 # Determine missing bitrates
6311 if format.get('tbr') is None:
6312 if format.get('vbr') is not None and format.get('abr') is not None:
6313 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
6314 else:
6315 if format.get('vcodec') != 'none' and format.get('vbr') is None:
6316 format['vbr'] = format.get('tbr') - format.get('abr', 0)
6317 if format.get('acodec') != 'none' and format.get('abr') is None:
6318 format['abr'] = format.get('tbr') - format.get('vbr', 0)
6319
6320 return tuple(self._calculate_field_preference(format, field) for field in self._order)
6321
6322
6323 # Deprecated
6324 has_certifi = bool(certifi)
6325 has_websockets = bool(websockets)