import xml.etree.ElementTree

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_HTMLParseError,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket
def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate',
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}

USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
NO_DEFAULT = object()
IDENTITY = lambda x: x
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7,  # Pacific
}
# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
DATE_FORMATS = (
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M:%S',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
NUMBER_RE = r'\d+(?:\.\d+)?'
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    pref = locale.getpreferredencoding()
    return pref
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
    return n
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
    return n.text
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        return None
    return n.attrib[key]
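# Illustrative sketch (not part of the original module): the xpath_* helpers
# above wrap plain ElementTree lookups with NO_DEFAULT/fatal handling.
# Assuming an element parsed with compat_etree_fromstring, the expected
# behaviour is roughly:
#
#   doc = compat_etree_fromstring('<root><a attr="x">hello</a></root>')
#   xpath_text(doc, './a')                  # 'hello'
#   xpath_attr(doc, './a', 'attr')          # 'x'
#   xpath_text(doc, './b', default=None)    # None, missing node is tolerated
#   xpath_text(doc, './b', fatal=True)      # raises ExtractorError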
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
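# Illustrative sketch (not part of the original module): the get_element(s)_*
# helpers extract tag content or whole tags by id/class/attribute. Assuming a
# simple snippet, the intended behaviour is roughly:
#
#   page = '<div class="title">Foo</div><div class="title">Bar</div>'
#   get_element_by_class('title', page)        # 'Foo'
#   get_elements_by_class('title', page)       # ['Foo', 'Bar']
#   get_element_html_by_class('title', page)   # '<div class="title">Foo</div>'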
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole,
        )
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()
# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        raise compat_HTMLParseError('done')
class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()

    return parser.attrs
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a dictionary of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
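# Illustrative sketch (not part of the original module): clean_html() collapses
# whitespace, converts <br> and </p><p> to newlines, strips the remaining tags
# and unescapes entities, e.g.:
#
#   clean_html('<p>a<br/>b &amp; c</p>')   # 'a\nb & c'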
class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    s = self._close_object(e)
                    if s is not None:
                        continue
                raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
        assert False, 'Too many attempts to decode JSON'
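# Illustrative sketch (not part of the original module): LenientJSONDecoder is
# meant for JSON embedded in web pages. With ignore_extra=True it decodes the
# leading object and ignores trailing garbage; close_objects lets it try to
# repair truncated objects/arrays. Assumed behaviour:
#
#   json.loads('{"a": 1} trailing junk', cls=LenientJSONDecoder, ignore_extra=True)   # {'a': 1}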
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
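# Illustrative sketch (not part of the original module): in the default mode
# problematic characters are swapped for look-alike Unicode glyphs, while
# restricted mode maps them to ASCII-safe substitutes, e.g. (assumed behaviour):
#
#   sanitize_filename('A/B: C?')                    # slash/colon/question mark become look-alikes
#   sanitize_filename('A/B: C?', restricted=True)   # 'A_B_-_C'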
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
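# Illustrative sketch (not part of the original module): sanitize_url() adds a
# scheme to protocol-relative URLs and fixes a couple of common typos, e.g.:
#
#   sanitize_url('//example.com/x')          # 'http://example.com/x'
#   sanitize_url('httpss://example.com/x')   # 'https://example.com/x'
#   sanitize_url('rmtp://example.com/live')  # 'rtmp://example.com/live'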
def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
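# Illustrative sketch (not part of the original module): extract_basic_auth()
# strips credentials embedded in the URL and returns them as a header value:
#
#   extract_basic_auth('http://user:pass@example.com/x')
#   # ('http://example.com/x', 'Basic dXNlcjpwYXNz')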
def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)
802 """Expand shell variables and ~"""
803 return os
.path
.expandvars(compat_expanduser(s
))
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())
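# Illustrative sketch (not part of the original module): orderedSet() keeps the
# first occurrence of each item and preserves order; with lazy=True it returns
# a generator instead of a list:
#
#   orderedSet([1, 2, 1, 3, 2])          # [1, 2, 3]
#   list(orderedSet('abca', lazy=True))  # ['a', 'b', 'c']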
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )
def process_communicate_or_kill(p, *args, **kwargs):
    deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)
class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
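# Illustrative sketch (not part of the original module): Popen.run() is a
# convenience wrapper that runs a command and returns (stdout, stderr,
# returncode), killing the process if communicate() is interrupted. Assumed
# usage:
#
#   stdout, stderr, retcode = Popen.run(
#       ['ffmpeg', '-version'], text=True,
#       stdout=subprocess.PIPE, stderr=subprocess.PIPE)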
def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return encoding
def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')
def decodeArgument(b):
    return b


def decodeOption(optval):
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
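# Illustrative sketch (not part of the original module): timetuple_from_msec()
# and formatSeconds() convert durations for display:
#
#   timetuple_from_msec(90061001)    # Time(hours=25, minutes=1, seconds=1, milliseconds=1)
#   formatSeconds(3661)              # '1:01:01'
#   formatSeconds(61.5, msec=True)   # '1:01.500'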
def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)
def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)
network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        self.downloaded = downloaded
        self.expected = expected
class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)
    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)
    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
, socks_proxy
):
1492 assert issubclass(base_class
, (
1493 http
.client
.HTTPConnection
, http
.client
.HTTPSConnection
))
1495 url_components
= urllib
.parse
.urlparse(socks_proxy
)
1496 if url_components
.scheme
.lower() == 'socks5':
1497 socks_type
= ProxyType
.SOCKS5
1498 elif url_components
.scheme
.lower() in ('socks', 'socks4'):
1499 socks_type
= ProxyType
.SOCKS4
1500 elif url_components
.scheme
.lower() == 'socks4a':
1501 socks_type
= ProxyType
.SOCKS4A
1503 def unquote_if_non_empty(s
):
1506 return urllib
.parse
.unquote_plus(s
)
1510 url_components
.hostname
, url_components
.port
or 1080,
1512 unquote_if_non_empty(url_components
.username
),
1513 unquote_if_non_empty(url_components
.password
),
1516 class SocksConnection(base_class
):
1518 self
.sock
= sockssocket()
1519 self
.sock
.setproxy(*proxy_args
)
1520 if isinstance(self
.timeout
, (int, float)):
1521 self
.sock
.settimeout(self
.timeout
)
1522 self
.sock
.connect((self
.host
, self
.port
))
1524 if isinstance(self
, http
.client
.HTTPSConnection
):
1525 if hasattr(self
, '_context'): # Python > 2.6
1526 self
.sock
= self
._context
.wrap_socket(
1527 self
.sock
, server_hostname
=self
.host
)
1529 self
.sock
= ssl
.wrap_socket(self
.sock
)
1531 return SocksConnection
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise
def is_path_like(f):
    return isinstance(f, (str, bytes, os.PathLike))
class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if is_path_like(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if is_path_like(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value)))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return urllib.request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return urllib.request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)
def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            date_str = date_str[:-len(m.group('tz'))]
        timezone = datetime.timedelta(hours=timezone or 0)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
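# Illustrative sketch (not part of the original module): parse_iso8601() turns
# an ISO 8601 timestamp into a UNIX timestamp, honouring an explicit offset:
#
#   parse_iso8601('2023-01-01T00:00:00Z')        # 1672531200
#   parse_iso8601('2023-01-01T01:00:00+01:00')   # 1672531200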
def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return str(upload_date)
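# Illustrative sketch (not part of the original module): unified_strdate()
# normalizes many date spellings to YYYYMMDD, e.g.:
#
#   unified_strdate('December 21, 2010')            # '20101221'
#   unified_strdate('1968/12/10')                   # '19681210'
#   unified_strdate('28/01/2014 21:00:00 +0100')    # '20140128'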
def unified_timestamp(date_str, day_first=True):
    if date_str is None:
        return None

    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
def determine_ext(url, default_ext='unknown_video'):
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
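# Illustrative sketch (not part of the original module): determine_ext() guesses
# a file extension from the URL path, ignoring the query string:
#
#   determine_ext('http://example.com/foo/bar.mp4?download=1')   # 'mp4'
#   determine_ext('http://example.com/foo/bar')                  # 'unknown_video'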
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
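# Illustrative usage sketch (not from the original source): relative expressions
# compose with the rounding helpers above.
#   datetime_from_str('now-1day')  -> yesterday, rounded to day granularity (precision='auto')
#   datetime_from_str('20250101+1month', precision='day') -> datetime.datetime(2025, 2, 1, 0, 0)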
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
        raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months."""
    month = dt.month + months - 1
    year = dt.year + month // 12
    month = month % 12 + 1
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)
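# Illustrative sketch (not from the original source): the day is clamped to the
# length of the target month, so month arithmetic never overflows.
#   datetime_add_months(datetime.datetime(2020, 1, 31), 1)  -> datetime.datetime(2020, 2, 29, 0, 0)
#   datetime_add_months(datetime.datetime(2020, 3, 15), -3) -> datetime.datetime(2019, 12, 15, 0, 0)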
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    roundto = lambda x, n: ((x + n / 2) // n) * n
    timestamp = calendar.timegm(dt.timetuple())
    return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str
1988 """Represents a time interval between two dates"""
1990 def __init__(self
, start
=None, end
=None):
1991 """start and end must be strings in the format accepted by date"""
1992 if start
is not None:
1993 self
.start
= date_from_str(start
, strict
=True)
1995 self
.start
= datetime
.datetime
.min.date()
1997 self
.end
= date_from_str(end
, strict
=True)
1999 self
.end
= datetime
.datetime
.max.date()
2000 if self
.start
> self
.end
:
2001 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
2005 """Returns a range that only contains the given day"""
2006 return cls(day
, day
)
2008 def __contains__(self
, date
):
2009 """Check if the date is in the range"""
2010 if not isinstance(date
, datetime
.date
):
2011 date
= date_from_str(date
)
2012 return self
.start
<= date
<= self
.end
2015 return f
'{self.start.isoformat()} - {self.end.isoformat()}'
2017 def __eq__(self
, other
):
2018 return (isinstance(other
, DateRange
)
2019 and self
.start
== other
.start
and self
.end
== other
.end
)
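# Illustrative usage sketch (not from the original source):
#   '20240615' in DateRange('20240101', '20241231')          -> True
#   DateRange.day('20240101') == DateRange('20240101', '20240101') -> True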
def platform_name():
    """ Returns the platform name as a str """
    deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead')
    return platform.platform()


def system_identifier():
    python_implementation = platform.python_implementation()
    if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
    libc_ver = []
    with contextlib.suppress(OSError):  # We may not have access to the executable
        libc_ver = platform.libc_ver()

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        python_implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )


def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name == 'nt':
        return version_tuple(platform.win32_ver()[1])
    else:
        return ()
def write_string(s, out=None, encoding=None):
    assert isinstance(s, str)
    out = out or sys.stderr

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    enc, buffer = None, out
    if 'b' in getattr(out, 'mode', ''):
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        buffer = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    buffer.write(s.encode(enc, 'ignore') if enc else s)
    buffer.flush()
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    from . import _IN_CLI
    if _IN_CLI:
        if msg in deprecation_warning._cache:
            return
        deprecation_warning._cache.add(msg)
        if printer:
            return printer(f'{msg}{bug_reports_message()}', **kwargs)
        return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
    else:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)


deprecation_warning._cache = set()
def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return struct.pack('%dB' % len(xs), *xs)
class LockingUnsupportedError(OSError):
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
class locked_file:
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    def __getattr__(self, attr):
        return getattr(self.f, attr)
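# Illustrative usage sketch (not from the original source; 'some.lock' is a made-up path):
#   with locked_file('some.lock', 'w', block=True) as f:
#       f.write('held exclusively while the with-block is active')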
def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'


def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
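# Illustrative round-trip sketch (not from the original source):
#   url = smuggle_url('https://example.com/video', {'referrer': 'https://example.com/'})
#   unsmuggle_url(url) -> ('https://example.com/video', {'referrer': 'https://example.com/'})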
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)


def format_bytes(bytes):
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
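# Illustrative sketch (not from the original source):
#   format_decimal_suffix(1500, '%.1f%s') -> '1.5k'
#   format_bytes(1536)                    -> '1.50KiB'
#   format_bytes(None)                    -> 'N/A'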
def lookup_unit_table(unit_table, s, strict=False):
    num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = (re.fullmatch if strict else re.match)(
        rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not m:
        return None

    num = float(m.group('num').replace(',', '.'))
    mult = unit_table[m.group('unit')]
    return round(num * mult)


def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    return lookup_unit_table(
        {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
        s.upper(), strict=True)
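# Illustrative sketch (not from the original source):
#   parse_bytes('10K')  -> 10240
#   parse_bytes('1.5M') -> 1572864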
def parse_filesize(s):
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)


def parse_count(s):
    if s is None:
        return None

    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
        't': 1000 ** 4,
        'T': 1000 ** 4,
    }

    ret = lookup_unit_table(_UNIT_TABLE, s)
    if ret is not None:
        return ret

    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    if mobj:
        return str_to_int(mobj.group(1))
def parse_resolution(s, *, lenient=False):
    if s is None:
        return {}

    if lenient:
        mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
    else:
        mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if mobj:
        return {
            'width': int(mobj.group('w')),
            'height': int(mobj.group('h')),
        }

    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        return {'height': int(mobj.group(1)) * 540}

    return {}
def parse_bitrate(s):
    if not isinstance(s, str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    if mobj:
        return int(mobj.group(1))
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        return month_names.index(name) + 1
    except ValueError:
        return None


def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode()
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    return s[len(start):] if s is not None and s.startswith(start) else s


def remove_end(s, end):
    return s[:-len(end)] if s is not None and s.endswith(end) else s


def remove_quotes(s):
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None


def url_basename(url):
    path = urllib.parse.urlparse(url).path
    return path.strip('/').split('/')[-1]


def base_url(url):
    return re.match(r'https?://[^?#]+/', url).group()
def urljoin(base, path):
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, str) or not path:
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
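# Illustrative sketch (not from the original source):
#   urljoin('https://example.com/a/b/', 'c.mp4')                -> 'https://example.com/a/b/c.mp4'
#   urljoin('https://example.com/a/', '/c.mp4')                 -> 'https://example.com/c.mp4'
#   urljoin('https://example.com/', '//cdn.example.com/c.mp4')  -> '//cdn.example.com/c.mp4'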
class HEADRequest(urllib.request.Request):
    def get_method(self):
        return 'HEAD'


class PUTRequest(urllib.request.Request):
    def get_method(self):
        return 'PUT'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default


def str_or_none(v, default=None):
    return default if v is None else str(v)


def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    elif isinstance(int_str, str):
        int_str = re.sub(r'[,\.\+]', '', int_str)
        return int_or_none(int_str)


def float_or_none(v, scale=1, invscale=1, default=None):
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default


def bool_or_none(v, default=None):
    return v if isinstance(v, bool) else default


def strip_or_none(v, default=None):
    return v.strip() if isinstance(v, str) else default


def url_or_none(url):
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None


def request_to_url(req):
    if isinstance(req, urllib.request.Request):
        return req.get_full_url()
    else:
        return req
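# Illustrative sketch (not from the original source):
#   int_or_none('1000', scale=1000)   -> 1
#   float_or_none('2.5', invscale=2)  -> 5.0
#   str_to_int('123,456')             -> 123456
#   url_or_none('example.com/x.mp4')  -> None  (no scheme and not protocol-relative)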
def strftime_or_none(timestamp, date_format, default=None):
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        date_format = re.sub(  # Support %s on windows
            r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
def parse_duration(s):
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    if ms:
        ms = ms.replace(':', '.')
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
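# Illustrative sketch (not from the original source):
#   parse_duration('1:30')        -> 90.0
#   parse_duration('01:02:03.05') -> 3723.05
#   parse_duration('3 min')       -> 180.0
#   parse_duration('PT1H30M')     -> 5400.0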
def prepend_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return (
        f'{name}.{ext}{real_ext}'
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else f'{filename}.{ext}')


def replace_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return '{}.{}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    return exe


def _get_exe_version_output(exe, args):
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
                                   stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        if ret:
            return None
    except OSError:
        return False
    return stdout
def detect_exe_version(output, version_re=None, unrecognized='present'):
    assert isinstance(output, str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized


def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    out = _get_exe_version_output(exe, args)
    if out is None:
        return unrecognized[-1]
    return out and detect_exe_version(out, version_re, unrecognized[0])
def frange(start=0, stop=None, step=1):
    """Float range"""
    if stop is None:
        start, stop = 0, start
    sign = [-1, 1][step > 0] if step else 0
    while sign * start < sign * stop:
        yield start
        start += step
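# Illustrative sketch (not from the original source):
#   list(frange(5))           -> [0, 1, 2, 3, 4]
#   list(frange(0, 1, 0.25))  -> [0, 0.25, 0.5, 0.75]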
2785 class LazyList(collections
.abc
.Sequence
):
2786 """Lazy immutable list from an iterable
2787 Note that slices of a LazyList are lists and not LazyList"""
2789 class IndexError(IndexError):
2792 def __init__(self
, iterable
, *, reverse
=False, _cache
=None):
2793 self
._iterable
= iter(iterable
)
2794 self
._cache
= [] if _cache
is None else _cache
2795 self
._reversed
= reverse
2799 # We need to consume the entire iterable to iterate in reverse
2800 yield from self
.exhaust()
2802 yield from self
._cache
2803 for item
in self
._iterable
:
2804 self
._cache
.append(item
)
2808 self
._cache
.extend(self
._iterable
)
2809 self
._iterable
= [] # Discard the emptied iterable to make it pickle-able
2813 """Evaluate the entire iterable"""
2814 return self
._exhaust
()[::-1 if self
._reversed
else 1]
2817 def _reverse_index(x
):
2818 return None if x
is None else ~x
2820 def __getitem__(self
, idx
):
2821 if isinstance(idx
, slice):
2823 idx
= slice(self
._reverse
_index
(idx
.start
), self
._reverse
_index
(idx
.stop
), -(idx
.step
or 1))
2824 start
, stop
, step
= idx
.start
, idx
.stop
, idx
.step
or 1
2825 elif isinstance(idx
, int):
2827 idx
= self
._reverse
_index
(idx
)
2828 start
, stop
, step
= idx
, idx
, 0
2830 raise TypeError('indices must be integers or slices')
2831 if ((start
or 0) < 0 or (stop
or 0) < 0
2832 or (start
is None and step
< 0)
2833 or (stop
is None and step
> 0)):
2834 # We need to consume the entire iterable to be able to slice from the end
2835 # Obviously, never use this with infinite iterables
2838 return self
._cache
[idx
]
2839 except IndexError as e
:
2840 raise self
.IndexError(e
) from e
2841 n
= max(start
or 0, stop
or 0) - len(self
._cache
) + 1
2843 self
._cache
.extend(itertools
.islice(self
._iterable
, n
))
2845 return self
._cache
[idx
]
2846 except IndexError as e
:
2847 raise self
.IndexError(e
) from e
2851 self
[-1] if self
._reversed
else self
[0]
2852 except self
.IndexError:
2858 return len(self
._cache
)
2860 def __reversed__(self
):
2861 return type(self
)(self
._iterable
, reverse
=not self
._reversed
, _cache
=self
._cache
)
2864 return type(self
)(self
._iterable
, reverse
=self
._reversed
, _cache
=self
._cache
)
2867 # repr and str should mimic a list. So we exhaust the iterable
2868 return repr(self
.exhaust())
2871 return repr(self
.exhaust())
2876 class IndexError(IndexError):
2880 # This is only useful for tests
2881 return len(self
.getslice())
2883 def __init__(self
, pagefunc
, pagesize
, use_cache
=True):
2884 self
._pagefunc
= pagefunc
2885 self
._pagesize
= pagesize
2886 self
._pagecount
= float('inf')
2887 self
._use
_cache
= use_cache
2890 def getpage(self
, pagenum
):
2891 page_results
= self
._cache
.get(pagenum
)
2892 if page_results
is None:
2893 page_results
= [] if pagenum
> self
._pagecount
else list(self
._pagefunc
(pagenum
))
2895 self
._cache
[pagenum
] = page_results
2898 def getslice(self
, start
=0, end
=None):
2899 return list(self
._getslice
(start
, end
))
2901 def _getslice(self
, start
, end
):
2902 raise NotImplementedError('This method must be implemented by subclasses')
2904 def __getitem__(self
, idx
):
2905 assert self
._use
_cache
, 'Indexing PagedList requires cache'
2906 if not isinstance(idx
, int) or idx
< 0:
2907 raise TypeError('indices must be non-negative integers')
2908 entries
= self
.getslice(idx
, idx
+ 1)
2910 raise self
.IndexError()
2914 class OnDemandPagedList(PagedList
):
2915 """Download pages until a page with less than maximum results"""
2917 def _getslice(self
, start
, end
):
2918 for pagenum
in itertools
.count(start
// self
._pagesize
):
2919 firstid
= pagenum
* self
._pagesize
2920 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
2921 if start
>= nextfirstid
:
2925 start
% self
._pagesize
2926 if firstid
<= start
< nextfirstid
2929 ((end
- 1) % self
._pagesize
) + 1
2930 if (end
is not None and firstid
<= end
<= nextfirstid
)
2934 page_results
= self
.getpage(pagenum
)
2936 self
._pagecount
= pagenum
- 1
2938 if startv
!= 0 or endv
is not None:
2939 page_results
= page_results
[startv
:endv
]
2940 yield from page_results
2942 # A little optimization - if current page is not "full", ie. does
2943 # not contain page_size videos then we can assume that this page
2944 # is the last one - there are no more ids on further pages -
2945 # i.e. no need to query again.
2946 if len(page_results
) + startv
< self
._pagesize
:
2949 # If we got the whole page, but the next page is not interesting,
2950 # break out early as well
2951 if end
== nextfirstid
:
2955 class InAdvancePagedList(PagedList
):
2956 """PagedList with total number of pages known in advance"""
2958 def __init__(self
, pagefunc
, pagecount
, pagesize
):
2959 PagedList
.__init
__(self
, pagefunc
, pagesize
, True)
2960 self
._pagecount
= pagecount
2962 def _getslice(self
, start
, end
):
2963 start_page
= start
// self
._pagesize
2964 end_page
= self
._pagecount
if end
is None else min(self
._pagecount
, end
// self
._pagesize
+ 1)
2965 skip_elems
= start
- start_page
* self
._pagesize
2966 only_more
= None if end
is None else end
- start
2967 for pagenum
in range(start_page
, end_page
):
2968 page_results
= self
.getpage(pagenum
)
2970 page_results
= page_results
[skip_elems
:]
2972 if only_more
is not None:
2973 if len(page_results
) < only_more
:
2974 only_more
-= len(page_results
)
2976 yield from page_results
[:only_more
]
2978 yield from page_results
2981 class PlaylistEntries
:
2982 MissingEntry
= object()
2983 is_exhausted
= False
2985 def __init__(self
, ydl
, info_dict
):
2988 # _entries must be assigned now since infodict can change during iteration
2989 entries
= info_dict
.get('entries')
2991 raise EntryNotInPlaylist('There are no entries')
2992 elif isinstance(entries
, list):
2993 self
.is_exhausted
= True
2995 requested_entries
= info_dict
.get('requested_entries')
2996 self
.is_incomplete
= requested_entries
is not None
2997 if self
.is_incomplete
:
2998 assert self
.is_exhausted
2999 self
._entries
= [self
.MissingEntry
] * max(requested_entries
or [0])
3000 for i
, entry
in zip(requested_entries
, entries
):
3001 self
._entries
[i
- 1] = entry
3002 elif isinstance(entries
, (list, PagedList
, LazyList
)):
3003 self
._entries
= entries
3005 self
._entries
= LazyList(entries
)
3007 PLAYLIST_ITEMS_RE
= re
.compile(r
'''(?x)
3008 (?P<start>[+-]?\d+)?
3010 (?P<end>[+-]?\d+|inf(?:inite)?)?
3011 (?::(?P<step>[+-]?\d+))?
3015 def parse_playlist_items(cls
, string
):
3016 for segment
in string
.split(','):
3018 raise ValueError('There is two or more consecutive commas')
3019 mobj
= cls
.PLAYLIST_ITEMS_RE
.fullmatch(segment
)
3021 raise ValueError(f
'{segment!r} is not a valid specification')
3022 start
, end
, step
, has_range
= mobj
.group('start', 'end', 'step', 'range')
3023 if int_or_none(step
) == 0:
3024 raise ValueError(f
'Step in {segment!r} cannot be zero')
3025 yield slice(int_or_none(start
), float_or_none(end
), int_or_none(step
)) if has_range
else int(start
)
3027 def get_requested_items(self
):
3028 playlist_items
= self
.ydl
.params
.get('playlist_items')
3029 playlist_start
= self
.ydl
.params
.get('playliststart', 1)
3030 playlist_end
= self
.ydl
.params
.get('playlistend')
3031 # For backwards compatibility, interpret -1 as whole list
3032 if playlist_end
in (-1, None):
3034 if not playlist_items
:
3035 playlist_items
= f
'{playlist_start}:{playlist_end}'
3036 elif playlist_start
!= 1 or playlist_end
:
3037 self
.ydl
.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once
=True)
3039 for index
in self
.parse_playlist_items(playlist_items
):
3040 for i
, entry
in self
[index
]:
3045 # The item may have just been added to archive. Don't break due to it
3046 if not self
.ydl
.params
.get('lazy_playlist'):
3047 # TODO: Add auto-generated fields
3048 self
.ydl
._match
_entry
(entry
, incomplete
=True, silent
=True)
3049 except (ExistingVideoReached
, RejectedVideoReached
):
3052 def get_full_count(self
):
3053 if self
.is_exhausted
and not self
.is_incomplete
:
3055 elif isinstance(self
._entries
, InAdvancePagedList
):
3056 if self
._entries
._pagesize
== 1:
3057 return self
._entries
._pagecount
3059 @functools.cached_property
3061 if isinstance(self
._entries
, list):
3064 entry
= self
._entries
[i
]
3066 entry
= self
.MissingEntry
3067 if not self
.is_incomplete
:
3068 raise self
.IndexError()
3069 if entry
is self
.MissingEntry
:
3070 raise EntryNotInPlaylist(f
'Entry {i + 1} cannot be found')
3075 return type(self
.ydl
)._handle
_extraction
_exceptions
(lambda _
, i
: self
._entries
[i
])(self
.ydl
, i
)
3076 except (LazyList
.IndexError, PagedList
.IndexError):
3077 raise self
.IndexError()
3080 def __getitem__(self
, idx
):
3081 if isinstance(idx
, int):
3082 idx
= slice(idx
, idx
)
3084 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
3085 step
= 1 if idx
.step
is None else idx
.step
3086 if idx
.start
is None:
3087 start
= 0 if step
> 0 else len(self
) - 1
3089 start
= idx
.start
- 1 if idx
.start
>= 0 else len(self
) + idx
.start
3091 # NB: Do not call len(self) when idx == [:]
3092 if idx
.stop
is None:
3093 stop
= 0 if step
< 0 else float('inf')
3095 stop
= idx
.stop
- 1 if idx
.stop
>= 0 else len(self
) + idx
.stop
3096 stop
+= [-1, 1][step
> 0]
3098 for i
in frange(start
, stop
, step
):
3102 entry
= self
._getter
(i
)
3103 except self
.IndexError:
3104 self
.is_exhausted
= True
3111 return len(tuple(self
[:]))
3113 class IndexError(IndexError):
def uppercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)


def lowercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)


def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = urllib.parse.urlparse(url)
    return url_parsed._replace(
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()


def parse_qs(url, **kwargs):
    return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
def read_batch_urls(batch_fd):
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', url, 1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]


def urlencode_postdata(*args, **kargs):
    return urllib.parse.urlencode(*args, **kargs).encode('ascii')
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
       @param url           str or parse url tuple
       @param query_update  update query
       @returns             modified URL
    """
    if isinstance(url, str):
        if not kwargs and not query_update:
            return url
        else:
            url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        kwargs['query'] = urllib.parse.urlencode({
            **urllib.parse.parse_qs(url.query),
            **query_update
        }, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))


def update_url_query(url, query):
    return update_url(url, query_update=query)
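# Illustrative sketch (not from the original source):
#   update_url('https://example.com/a?x=1', query_update={'y': ['2']})
#       -> 'https://example.com/a?x=1&y=2'
#   update_url_query('https://example.com/a', {'t': '10'}) -> 'https://example.com/a?t=10'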
3201 def update_Request(req
, url
=None, data
=None, headers
=None, query
=None):
3202 req_headers
= req
.headers
.copy()
3203 req_headers
.update(headers
or {})
3204 req_data
= data
or req
.data
3205 req_url
= update_url_query(url
or req
.get_full_url(), query
)
3206 req_get_method
= req
.get_method()
3207 if req_get_method
== 'HEAD':
3208 req_type
= HEADRequest
3209 elif req_get_method
== 'PUT':
3210 req_type
= PUTRequest
3212 req_type
= urllib
.request
.Request
3214 req_url
, data
=req_data
, headers
=req_headers
,
3215 origin_req_host
=req
.origin_req_host
, unverifiable
=req
.unverifiable
)
3216 if hasattr(req
, 'timeout'):
3217 new_req
.timeout
= req
.timeout
3221 def _multipart_encode_impl(data
, boundary
):
3222 content_type
= 'multipart/form-data; boundary=%s' % boundary
3225 for k
, v
in data
.items():
3226 out
+= b
'--' + boundary
.encode('ascii') + b
'\r\n'
3227 if isinstance(k
, str):
3229 if isinstance(v
, str):
3231 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3232 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3233 content
= b
'Content-Disposition: form-data; name="' + k
+ b
'"\r\n\r\n' + v
+ b
'\r\n'
3234 if boundary
.encode('ascii') in content
:
3235 raise ValueError('Boundary overlaps with data')
3238 out
+= b
'--' + boundary
.encode('ascii') + b
'--\r\n'
3240 return out
, content_type
3243 def multipart_encode(data
, boundary
=None):
3245 Encode a dict to RFC 7578-compliant form-data
3248 A dict where keys and values can be either Unicode or bytes-like
3251 If specified a Unicode object, it's used as the boundary. Otherwise
3252 a random boundary is generated.
3254 Reference: https://tools.ietf.org/html/rfc7578
3256 has_specified_boundary
= boundary
is not None
3259 if boundary
is None:
3260 boundary
= '---------------' + str(random
.randrange(0x0fffffff, 0xffffffff))
3263 out
, content_type
= _multipart_encode_impl(data
, boundary
)
3266 if has_specified_boundary
:
3270 return out
, content_type
def variadic(x, allowed_types=(str, bytes, dict)):
    return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)


def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    for val in map(d.get, variadic(key_or_keys)):
        if val is not None and (val or not skip_false_values):
            return val
    return default


def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    for f in funcs:
        try:
            val = f(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            pass
        else:
            if expected_type is None or isinstance(val, expected_type):
                return val


def try_get(src, getter, expected_type=None):
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)


def filter_dict(dct, cndn=lambda _, v: v is not None):
    return {k: v for k, v in dct.items() if cndn(k, v)}
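# Illustrative sketch (not from the original source):
#   try_get({'a': {'b': 3}}, lambda x: x['a']['b'], int) -> 3
#   try_get({}, lambda x: x['missing'])                  -> None
#   filter_dict({'id': '42', 'title': None})             -> {'id': '42'}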
def merge_dicts(*dicts):
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if (v is not None and k not in merged
                    or isinstance(v, str) and merged[k] == ''):
                merged[k] = v
    return merged


def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    return string if isinstance(string, str) else str(string, encoding, errors)
3326 TV_PARENTAL_GUIDELINES
= {
3336 def parse_age_limit(s
):
3337 # isinstance(False, int) is True. So type() must be used instead
3338 if type(s
) is int: # noqa: E721
3339 return s
if 0 <= s
<= 21 else None
3340 elif not isinstance(s
, str):
3342 m
= re
.match(r
'^(?P<age>\d{1,2})\+?$', s
)
3344 return int(m
.group('age'))
3347 return US_RATINGS
[s
]
3348 m
= re
.match(r
'^TV[_-]?(%s)$' % '|'.join(k
[3:] for k
in TV_PARENTAL_GUIDELINES
), s
)
3350 return TV_PARENTAL_GUIDELINES
['TV-' + m
.group(1)]
def strip_jsonp(code):
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
3364 def js_to_json(code
, vars={}, *, strict
=False):
3365 # vars is a dict of var, val pairs to substitute
3366 STRING_QUOTES
= '\'"'
3367 STRING_RE
= '|'.join(rf
'{q}(?:\\.|[^\\{q}])*{q}' for q
in STRING_QUOTES
)
3368 COMMENT_RE
= r
'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3369 SKIP_RE
= fr
'\s*(?:{COMMENT_RE})?\s*'
3371 (fr
'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3372 (fr
'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3375 def process_escape(match
):
3376 JSON_PASSTHROUGH_ESCAPES
= R
'"\bfnrtu'
3377 escape
= match
.group(1) or match
.group(2)
3379 return (Rf
'\{escape}' if escape
in JSON_PASSTHROUGH_ESCAPES
3380 else R
'\u00' if escape
== 'x'
3381 else '' if escape
== '\n'
3386 if v
in ('true', 'false', 'null'):
3388 elif v
in ('undefined', 'void 0'):
3390 elif v
.startswith('/*') or v
.startswith('//') or v
.startswith('!') or v
== ',':
3393 if v
[0] in STRING_QUOTES
:
3394 escaped
= re
.sub(r
'(?s)(")|\\(.)', process_escape
, v
[1:-1])
3395 return f
'"{escaped}"'
3397 for regex
, base
in INTEGER_TABLE
:
3398 im
= re
.match(regex
, v
)
3400 i
= int(im
.group(1), base
)
3401 return f
'"{i}":' if v
.endswith(':') else str(i
)
3407 except json
.JSONDecodeError
:
3408 return json
.dumps(vars[v
])
3415 raise ValueError(f
'Unknown value: {v}')
3417 def create_map(mobj
):
3418 return json
.dumps(dict(json
.loads(js_to_json(mobj
.group(1) or '[]', vars=vars))))
3420 code
= re
.sub(r
'new Map\((\[.*?\])?\)', create_map
, code
)
3422 code
= re
.sub(r
'new Date\((".+")\)', r
'\g<1>', code
)
3423 code
= re
.sub(r
'new \w+\((.*?)\)', lambda m
: json
.dumps(m
.group(0)), code
)
3424 code
= re
.sub(r
'parseInt\([^\d]+(\d+)[^\d]+\)', r
'\1', code
)
3425 code
= re
.sub(r
'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^
)]*["\'])\s*\)', r'\1', code)
3427 return re.sub(rf'''(?sx)
3429 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
3430 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3431 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3432 [0-9]+(?={SKIP_RE}:)|
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
3447 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3451 'default': '%(title)s [%(id)s].%(ext)s',
3452 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3458 'description': 'description',
3459 'annotation': 'annotations.xml',
3460 'infojson': 'info.json',
3463 'pl_thumbnail': None,
3464 'pl_description': 'description',
3465 'pl_infojson': 'info.json',
3468 # As of [1] format syntax is:
3469 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3470 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3471 STR_FORMAT_RE_TMPL = r'''(?x)
3472 (?<!%)(?P<prefix>(?:%%)*)
3474 (?P<has_key>\((?P<key>{0})\))?
3476 (?P<conversion>[#0\-+ ]+)?
3478 (?P<precision>\.\d+)?
3479 (?P<len_mod>[hlL])? # unused in python
3480 {1} # conversion type
3485 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s


def version_tuple(v):
    return tuple(int(e) for e in re.split(r'[-.]', v))


def is_outdated_version(version, limit, assume_new=True):
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new


def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    from .update import is_non_updateable

    return not is_non_updateable()


def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(compat_shlex_quote(a) for a in args)


def error_to_compat_str(err):
    return str(err)


def error_to_str(err):
    return f'{type(err).__name__}: {err}'
3532 def mimetype2ext(mt, default=NO_DEFAULT):
3533 if not isinstance(mt, str):
3534 if default is not NO_DEFAULT:
3550 'x-matroska': 'mkv',
3552 'x-mp4-fragmented': 'mp4',
3557 # application (streaming playlists)
3561 'vnd.apple.mpegurl': 'm3u8',
3562 'vnd.ms-sstr+xml': 'ism',
3563 'x-mpegurl': 'm3u8',
3567 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
3568 # Using .mp3 as it's the most popular one
3569 'audio/mpeg': 'mp3',
3570 'audio/webm': 'webm',
3571 'audio/x-matroska': 'mka',
3572 'audio/x-mpegurl': 'm3u',
3580 'x-realaudio': 'ra',
3591 'vnd.wap.wbmp': 'wbmp',
3598 'filmstrip+json': 'fs',
3599 'smptett+xml': 'tt',
3602 'x-ms-sami': 'sami',
3611 mimetype = mt.partition(';')[0].strip().lower()
3612 _, _, subtype = mimetype.rpartition('/')
3614 ext = traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
3617 elif default is not NO_DEFAULT:
3619 return subtype.replace('+', '.')
3622 def ext2mimetype(ext_or_url):
3625 if '.' not in ext_or_url:
3626 ext_or_url = f'file.{ext_or_url}'
3627 return mimetypes.guess_type(ext_or_url)[0]
3630 def parse_codecs(codecs_str):
3631 # http://tools.ietf.org/html/rfc6381
3634 split_codecs = list(filter(None, map(
3635 str.strip, codecs_str.strip().strip(',').split(','))))
3636 vcodec, acodec, scodec, hdr = None, None, None, None
3637 for full_codec in split_codecs:
3638 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3639 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3640 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3644 if parts[0] in ('dvh1', 'dvhe'):
3646 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3648 elif parts[:2] == ['vp9', '2']:
3650 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
3651 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3652 acodec = acodec or full_codec
3653 elif parts[0] in ('stpp', 'wvtt'):
3654 scodec = scodec or full_codec
3656 write_string(f'WARNING: Unknown codec {full_codec}\n')
3657 if vcodec or acodec or scodec:
3659 'vcodec': vcodec or 'none',
3660 'acodec': acodec or 'none',
3661 'dynamic_range': hdr,
3662 **({'scodec': scodec} if scodec is not None else {}),
3664 elif len(split_codecs) == 2:
3666 'vcodec': split_codecs[0],
3667 'acodec': split_codecs[1],
3672 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3673 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3675 allow_mkv = not preferences or 'mkv' in preferences
3677 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3678 return 'mkv' # TODO: any other format allows this?
3680 # TODO: All codecs supported by parse_codecs isn't handled here
3681 COMPATIBLE_CODECS = {
3683 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
3684 'h264', 'aacl', 'ec-3', # Set in ISM
3687 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3688 'vp9x', 'vp8x', # in the webm spec
3692 sanitize_codec = functools.partial(
3693 try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
3694 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
3696 for ext in preferences or COMPATIBLE_CODECS.keys():
3697 codec_set = COMPATIBLE_CODECS.get(ext, set())
3698 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3702 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3705 for ext in preferences or vexts:
3706 current_exts = {ext, *vexts, *aexts}
3707 if ext == 'mkv' or current_exts == {ext} or any(
3708 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3710 return 'mkv' if allow_mkv else preferences[-1]
3713 def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
3714 getheader = url_handle.headers.get
3716 cd = getheader('Content-Disposition')
3718 m = re.match(r'attachment;\s*filename="(?P
<filename
>[^
"]+)"', cd)
3720 e = determine_ext(m.group('filename
'), default_ext=None)
3724 meta_ext = getheader('x
-amz
-meta
-name
')
3726 e = meta_ext.rpartition('.')[2]
3730 return mimetype2ext(getheader('Content
-Type
'), default=default)
def encode_data_uri(data, mime_type):
    return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))


def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit


# List of known byte-order-marks (BOM)
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]


def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    encoding = 'utf-8'
    for bom, enc in BOMS:
        while first_bytes.startswith(bom):
            encoding, first_bytes = enc, first_bytes[len(bom):]

    return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
def determine_protocol(info_dict):
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    elif ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3790 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3791 """ Render a list of rows, each as a list of values.
3792 Text after a \t will be right aligned """
3794 return len(remove_terminal_sequences(string).replace('\t', ''))
3796 def get_max_lens(table):
3797 return [max(width(str(v)) for v in col) for col in zip(*table)]
3799 def filter_using_list(row, filterArray):
3800 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3802 max_lens = get_max_lens(data) if hide_empty else []
3803 header_row = filter_using_list(header_row, max_lens)
3804 data = [filter_using_list(row, max_lens) for row in data]
3806 table = [header_row] + data
3807 max_lens = get_max_lens(table)
3810 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3811 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3813 for pos, text in enumerate(map(str, row)):
3815 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3817 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3818 ret = '\n'.join(''.join(row).rstrip() for row in table)
3822 def _match_one(filter_part, dct, incomplete):
3823 # TODO: Generalize code with YoutubeDL._build_format_filter
3824 STRING_OPERATORS = {
3825 '*=': operator.contains,
3826 '^
=': lambda attr, value: attr.startswith(value),
3827 '$
=': lambda attr, value: attr.endswith(value),
3828 '~
=': lambda attr, value: re.search(value, attr),
3830 COMPARISON_OPERATORS = {
3832 '<=': operator.le, # "<=" must be defined above "<"
3839 if isinstance(incomplete, bool):
3840 is_incomplete = lambda _: incomplete
3842 is_incomplete = lambda k: k in incomplete
3844 operator_rex = re.compile(r'''(?x)
3846 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3848 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3851 ''' % '|
'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3852 m = operator_rex.fullmatch(filter_part.strip())
3855 unnegated_op = COMPARISON_OPERATORS[m['op
']]
3857 op = lambda attr, value: not unnegated_op(attr, value)
3860 comparison_value = m['quotedstrval
'] or m['strval
'] or m['intval
']
3862 comparison_value = comparison_value.replace(r'\
%s' % m['quote
'], m['quote
'])
3863 actual_value = dct.get(m['key
'])
3864 numeric_comparison = None
3865 if isinstance(actual_value, (int, float)):
3866 # If the original field is a string and matching comparisonvalue is
3867 # a number we should respect the origin of the original field
3868 # and process comparison value as a string (see
3869 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3871 numeric_comparison = int(comparison_value)
3873 numeric_comparison = parse_filesize(comparison_value)
3874 if numeric_comparison is None:
3875 numeric_comparison = parse_filesize(f'{comparison_value}B
')
3876 if numeric_comparison is None:
3877 numeric_comparison = parse_duration(comparison_value)
3878 if numeric_comparison is not None and m['op
'] in STRING_OPERATORS:
3879 raise ValueError('Operator
%s only supports string values
!' % m['op
'])
3880 if actual_value is None:
3881 return is_incomplete(m['key
']) or m['none_inclusive
']
3882 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3885 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3886 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3888 operator_rex = re.compile(r'''(?x)
3889 (?P<op>%s)\s*(?P<key>[a-z_]+)
3890 ''' % '|
'.join(map(re.escape, UNARY_OPERATORS.keys())))
3891 m = operator_rex.fullmatch(filter_part.strip())
3893 op = UNARY_OPERATORS[m.group('op
')]
3894 actual_value = dct.get(m.group('key
'))
3895 if is_incomplete(m.group('key
')) and actual_value is None:
3897 return op(actual_value)
3899 raise ValueError('Invalid
filter part
%r' % filter_part)
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    return all(
        _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
        for filter_part in re.split(r'(?<!\\)&', filter_str))
3914 def match_filter_func(filters, breaking_filters=None):
3915 if not filters and not breaking_filters:
3917 breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
3918 filters = set(variadic(filters or []))
3920 interactive = '-' in filters
3924 def _match_func(info_dict, incomplete=False):
3925 ret = breaking_filters(info_dict, incomplete)
3927 raise RejectedVideoReached(ret)
3929 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3930 return NO_DEFAULT if interactive and not incomplete else None
3932 video_title = info_dict.get('title
') or info_dict.get('id') or 'entry
'
3933 filter_str = ') |
('.join(map(str.strip, filters))
3934 return f'{video_title} does
not pass filter ({filter_str}
), skipping
..'
3938 class download_range_func:
3939 def __init__(self, chapters, ranges):
3940 self.chapters, self.ranges = chapters, ranges
3942 def __call__(self, info_dict, ydl):
3943 if not self.ranges and not self.chapters:
3946 warning = ('There are no chapters matching the regex
' if info_dict.get('chapters
')
3947 else 'Cannot match chapters since chapter information
is unavailable
')
3948 for regex in self.chapters or []:
3949 for i, chapter in enumerate(info_dict.get('chapters
') or []):
3950 if re.search(regex, chapter['title
']):
3952 yield {**chapter, 'index': i}
3953 if self.chapters and warning:
3954 ydl.to_screen(f'[info
] {info_dict["id"]}
: {warning}
')
3956 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3958 def __eq__(self, other):
3959 return (isinstance(other, download_range_func)
3960 and self.chapters == other.chapters and self.ranges == other.ranges)
3963 return f'{__name__}
.{type(self).__name__}
({self.chapters}
, {self.ranges}
)'
def parse_dfxp_time_expr(time_expr):
    if not time_expr:
        return

    mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))


def srt_subtitles_timecode(seconds):
    return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)


def ass_subtitles_timecode(seconds):
    time = timetuple_from_msec(seconds * 1000)
    return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3988 def dfxp2srt(dfxp_data):
3990 @param dfxp_data A bytes-like object containing DFXP data
3991 @returns A unicode object containing converted SRT data
3993 LEGACY_NAMESPACES = (
3994 (b'http
://www
.w3
.org
/ns
/ttml
', [
3995 b'http
://www
.w3
.org
/2004/11/ttaf1
',
3996 b'http
://www
.w3
.org
/2006/04/ttaf1
',
3997 b'http
://www
.w3
.org
/2006/10/ttaf1
',
3999 (b'http
://www
.w3
.org
/ns
/ttml
#styling', [
4000 b
'http://www.w3.org/ns/ttml#style',
4004 SUPPORTED_STYLING
= [
4013 _x
= functools
.partial(xpath_with_ns
, ns_map
={
4014 'xml': 'http://www.w3.org/XML/1998/namespace',
4015 'ttml': 'http://www.w3.org/ns/ttml',
4016 'tts': 'http://www.w3.org/ns/ttml#styling',
4022 class TTMLPElementParser
:
4024 _unclosed_elements
= []
4025 _applied_styles
= []
4027 def start(self
, tag
, attrib
):
4028 if tag
in (_x('ttml:br'), 'br'):
4031 unclosed_elements
= []
4033 element_style_id
= attrib
.get('style')
4035 style
.update(default_style
)
4036 if element_style_id
:
4037 style
.update(styles
.get(element_style_id
, {}))
4038 for prop
in SUPPORTED_STYLING
:
4039 prop_val
= attrib
.get(_x('tts:' + prop
))
4041 style
[prop
] = prop_val
4044 for k
, v
in sorted(style
.items()):
4045 if self
._applied
_styles
and self
._applied
_styles
[-1].get(k
) == v
:
4048 font
+= ' color="%s"' % v
4049 elif k
== 'fontSize':
4050 font
+= ' size="%s"' % v
4051 elif k
== 'fontFamily':
4052 font
+= ' face="%s"' % v
4053 elif k
== 'fontWeight' and v
== 'bold':
4055 unclosed_elements
.append('b')
4056 elif k
== 'fontStyle' and v
== 'italic':
4058 unclosed_elements
.append('i')
4059 elif k
== 'textDecoration' and v
== 'underline':
4061 unclosed_elements
.append('u')
4063 self
._out
+= '<font' + font
+ '>'
4064 unclosed_elements
.append('font')
4066 if self
._applied
_styles
:
4067 applied_style
.update(self
._applied
_styles
[-1])
4068 applied_style
.update(style
)
4069 self
._applied
_styles
.append(applied_style
)
4070 self
._unclosed
_elements
.append(unclosed_elements
)
4073 if tag
not in (_x('ttml:br'), 'br'):
4074 unclosed_elements
= self
._unclosed
_elements
.pop()
4075 for element
in reversed(unclosed_elements
):
4076 self
._out
+= '</%s>' % element
4077 if unclosed_elements
and self
._applied
_styles
:
4078 self
._applied
_styles
.pop()
4080 def data(self
, data
):
4084 return self
._out
.strip()
4086 def parse_node(node
):
4087 target
= TTMLPElementParser()
4088 parser
= xml
.etree
.ElementTree
.XMLParser(target
=target
)
4089 parser
.feed(xml
.etree
.ElementTree
.tostring(node
))
4090 return parser
.close()
4092 for k
, v
in LEGACY_NAMESPACES
:
4094 dfxp_data
= dfxp_data
.replace(ns
, k
)
4096 dfxp
= compat_etree_fromstring(dfxp_data
)
4098 paras
= dfxp
.findall(_x('.//ttml:p')) or dfxp
.findall('.//p')
4101 raise ValueError('Invalid dfxp/TTML subtitle')
4105 for style
in dfxp
.findall(_x('.//ttml:style')):
4106 style_id
= style
.get('id') or style
.get(_x('xml:id'))
4109 parent_style_id
= style
.get('style')
4111 if parent_style_id
not in styles
:
4114 styles
[style_id
] = styles
[parent_style_id
].copy()
4115 for prop
in SUPPORTED_STYLING
:
4116 prop_val
= style
.get(_x('tts:' + prop
))
4118 styles
.setdefault(style_id
, {})[prop
] = prop_val
4124 for p
in ('body', 'div'):
4125 ele
= xpath_element(dfxp
, [_x('.//ttml:' + p
), './/' + p
])
4128 style
= styles
.get(ele
.get('style'))
4131 default_style
.update(style
)
4133 for para
, index
in zip(paras
, itertools
.count(1)):
4134 begin_time
= parse_dfxp_time_expr(para
.attrib
.get('begin'))
4135 end_time
= parse_dfxp_time_expr(para
.attrib
.get('end'))
4136 dur
= parse_dfxp_time_expr(para
.attrib
.get('dur'))
4137 if begin_time
is None:
4142 end_time
= begin_time
+ dur
4143 out
.append('%d\n%s --> %s\n%s\n\n' % (
4145 srt_subtitles_timecode(begin_time
),
4146 srt_subtitles_timecode(end_time
),
def cli_option(params, command_option, param, separator=None):
    param = params.get(param)
    return ([] if param is None
            else [command_option, str(param)] if separator is None
            else [f'{command_option}{separator}{param}'])


def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    param = params.get(param)
    assert param in (True, False, None)
    return cli_option({True: true_value, False: false_value}, command_option, param, separator)


def cli_valueless_option(params, command_option, param, expected_value=True):
    return [command_option] if params.get(param) == expected_value else []
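
# Illustrative usage of the CLI-option helpers above (sketch with hypothetical
# option names/values; not taken from any specific caller): they turn a params
# dict into argument lists for an external program.
#
#   >>> cli_option({'proxy': 'http://127.0.0.1:3128'}, '--proxy', 'proxy')
#   ['--proxy', 'http://127.0.0.1:3128']
#   >>> cli_option({'proxy': 'http://127.0.0.1:3128'}, '--proxy', 'proxy', separator='=')
#   ['--proxy=http://127.0.0.1:3128']
#   >>> cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#   ['--no-check-certificate', 'true']
#   >>> cli_valueless_option({'downloader': 'native'}, '--native', 'downloader', 'native')
#   ['--native']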
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        else:
            return default

    if argdict is None:
        return default

    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        arg_list = list(filter(
            lambda x: x is not None,
            [argdict.get(key.lower()) for key in variadic(key_list)]))
        if arg_list:
            return [arg for args in arg_list for arg in args]
    return default


def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    main_key, exe = main_key.lower(), exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    keys = [f'{root_key}{k}' for k in (keys or [''])]
    if root_key in keys:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    else:
        use_compat = False
    return cli_configuration_args(argdict, keys, default, use_compat)
class ISO639Utils:
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        # (most ISO 639-1 -> ISO 639-2/T entries elided)
        'iw': 'heb',  # Replaced by he in 1989 revision
        'in': 'ind',  # Replaced by id in 1989 revision
        'ji': 'yid',  # Replaced by yi in 1989 revision
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
class ISO3166Utils:
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
4410 'AF': 'Afghanistan',
4411 'AX': 'Åland Islands',
4414 'AS': 'American Samoa',
4419 'AG': 'Antigua and Barbuda',
4436 'BO': 'Bolivia, Plurinational State of',
4437 'BQ': 'Bonaire, Sint Eustatius and Saba',
4438 'BA': 'Bosnia and Herzegovina',
4440 'BV': 'Bouvet Island',
4442 'IO': 'British Indian Ocean Territory',
4443 'BN': 'Brunei Darussalam',
4445 'BF': 'Burkina Faso',
4451 'KY': 'Cayman Islands',
4452 'CF': 'Central African Republic',
4456 'CX': 'Christmas Island',
4457 'CC': 'Cocos (Keeling) Islands',
4461 'CD': 'Congo, the Democratic Republic of the',
4462 'CK': 'Cook Islands',
4464 'CI': 'Côte d\'Ivoire',
4469 'CZ': 'Czech Republic',
4473 'DO': 'Dominican Republic',
4476 'SV': 'El Salvador',
4477 'GQ': 'Equatorial Guinea',
4481 'FK': 'Falkland Islands (Malvinas)',
4482 'FO': 'Faroe Islands',
4486 'GF': 'French Guiana',
4487 'PF': 'French Polynesia',
4488 'TF': 'French Southern Territories',
4503 'GW': 'Guinea-Bissau',
4506 'HM': 'Heard Island and McDonald Islands',
4507 'VA': 'Holy See (Vatican City State)',
4514 'IR': 'Iran, Islamic Republic of',
4517 'IM': 'Isle of Man',
4527 'KP': 'Korea, Democratic People\'s Republic of',
4528 'KR': 'Korea, Republic of',
4531 'LA': 'Lao People\'s Democratic Republic',
4537 'LI': 'Liechtenstein',
4541 'MK': 'Macedonia, the Former Yugoslav Republic of',
4548 'MH': 'Marshall Islands',
4554 'FM': 'Micronesia, Federated States of',
4555 'MD': 'Moldova, Republic of',
4566 'NL': 'Netherlands',
4567 'NC': 'New Caledonia',
4568 'NZ': 'New Zealand',
4573 'NF': 'Norfolk Island',
4574 'MP': 'Northern Mariana Islands',
4579 'PS': 'Palestine, State of',
4581 'PG': 'Papua New Guinea',
4584 'PH': 'Philippines',
4588 'PR': 'Puerto Rico',
4592 'RU': 'Russian Federation',
4594 'BL': 'Saint Barthélemy',
4595 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4596 'KN': 'Saint Kitts and Nevis',
4597 'LC': 'Saint Lucia',
4598 'MF': 'Saint Martin (French part)',
4599 'PM': 'Saint Pierre and Miquelon',
4600 'VC': 'Saint Vincent and the Grenadines',
4603 'ST': 'Sao Tome and Principe',
4604 'SA': 'Saudi Arabia',
4608 'SL': 'Sierra Leone',
4610 'SX': 'Sint Maarten (Dutch part)',
4613 'SB': 'Solomon Islands',
4615 'ZA': 'South Africa',
4616 'GS': 'South Georgia and the South Sandwich Islands',
4617 'SS': 'South Sudan',
4622 'SJ': 'Svalbard and Jan Mayen',
4625 'CH': 'Switzerland',
4626 'SY': 'Syrian Arab Republic',
4627 'TW': 'Taiwan, Province of China',
4629 'TZ': 'Tanzania, United Republic of',
4631 'TL': 'Timor-Leste',
4635 'TT': 'Trinidad and Tobago',
4638 'TM': 'Turkmenistan',
4639 'TC': 'Turks and Caicos Islands',
4643 'AE': 'United Arab Emirates',
4644 'GB': 'United Kingdom',
4645 'US': 'United States',
4646 'UM': 'United States Minor Outlying Islands',
4650 'VE': 'Venezuela, Bolivarian Republic of',
4652 'VG': 'Virgin Islands, British',
4653 'VI': 'Virgin Islands, U.S.',
4654 'WF': 'Wallis and Futuna',
4655 'EH': 'Western Sahara',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
class GeoUtils:
    # Major IPv4 address blocks per country
    _country_ip_map = {
4673 'AD': '46.172.224.0/19',
4674 'AE': '94.200.0.0/13',
4675 'AF': '149.54.0.0/17',
4676 'AG': '209.59.64.0/18',
4677 'AI': '204.14.248.0/21',
4678 'AL': '46.99.0.0/16',
4679 'AM': '46.70.0.0/15',
4680 'AO': '105.168.0.0/13',
4681 'AP': '182.50.184.0/21',
4682 'AQ': '23.154.160.0/24',
4683 'AR': '181.0.0.0/12',
4684 'AS': '202.70.112.0/20',
4685 'AT': '77.116.0.0/14',
4686 'AU': '1.128.0.0/11',
4687 'AW': '181.41.0.0/18',
4688 'AX': '185.217.4.0/22',
4689 'AZ': '5.197.0.0/16',
4690 'BA': '31.176.128.0/17',
4691 'BB': '65.48.128.0/17',
4692 'BD': '114.130.0.0/16',
4694 'BF': '102.178.0.0/15',
4695 'BG': '95.42.0.0/15',
4696 'BH': '37.131.0.0/17',
4697 'BI': '154.117.192.0/18',
4698 'BJ': '137.255.0.0/16',
4699 'BL': '185.212.72.0/23',
4700 'BM': '196.12.64.0/18',
4701 'BN': '156.31.0.0/16',
4702 'BO': '161.56.0.0/16',
4703 'BQ': '161.0.80.0/20',
4704 'BR': '191.128.0.0/12',
4705 'BS': '24.51.64.0/18',
4706 'BT': '119.2.96.0/19',
4707 'BW': '168.167.0.0/16',
4708 'BY': '178.120.0.0/13',
4709 'BZ': '179.42.192.0/18',
4710 'CA': '99.224.0.0/11',
4711 'CD': '41.243.0.0/16',
4712 'CF': '197.242.176.0/21',
4713 'CG': '160.113.0.0/16',
4714 'CH': '85.0.0.0/13',
4715 'CI': '102.136.0.0/14',
4716 'CK': '202.65.32.0/19',
4717 'CL': '152.172.0.0/14',
4718 'CM': '102.244.0.0/14',
4719 'CN': '36.128.0.0/10',
4720 'CO': '181.240.0.0/12',
4721 'CR': '201.192.0.0/12',
4722 'CU': '152.206.0.0/15',
4723 'CV': '165.90.96.0/19',
4724 'CW': '190.88.128.0/17',
4725 'CY': '31.153.0.0/16',
4726 'CZ': '88.100.0.0/14',
4728 'DJ': '197.241.0.0/17',
4729 'DK': '87.48.0.0/12',
4730 'DM': '192.243.48.0/20',
4731 'DO': '152.166.0.0/15',
4732 'DZ': '41.96.0.0/12',
4733 'EC': '186.68.0.0/15',
4734 'EE': '90.190.0.0/15',
4735 'EG': '156.160.0.0/11',
4736 'ER': '196.200.96.0/20',
4737 'ES': '88.0.0.0/11',
4738 'ET': '196.188.0.0/14',
4739 'EU': '2.16.0.0/13',
4740 'FI': '91.152.0.0/13',
4741 'FJ': '144.120.0.0/16',
4742 'FK': '80.73.208.0/21',
4743 'FM': '119.252.112.0/20',
4744 'FO': '88.85.32.0/19',
4746 'GA': '41.158.0.0/15',
4748 'GD': '74.122.88.0/21',
4749 'GE': '31.146.0.0/16',
4750 'GF': '161.22.64.0/18',
4751 'GG': '62.68.160.0/19',
4752 'GH': '154.160.0.0/12',
4753 'GI': '95.164.0.0/16',
4754 'GL': '88.83.0.0/19',
4755 'GM': '160.182.0.0/15',
4756 'GN': '197.149.192.0/18',
4757 'GP': '104.250.0.0/19',
4758 'GQ': '105.235.224.0/20',
4759 'GR': '94.64.0.0/13',
4760 'GT': '168.234.0.0/16',
4761 'GU': '168.123.0.0/16',
4762 'GW': '197.214.80.0/20',
4763 'GY': '181.41.64.0/18',
4764 'HK': '113.252.0.0/14',
4765 'HN': '181.210.0.0/16',
4766 'HR': '93.136.0.0/13',
4767 'HT': '148.102.128.0/17',
4768 'HU': '84.0.0.0/14',
4769 'ID': '39.192.0.0/10',
4770 'IE': '87.32.0.0/12',
4771 'IL': '79.176.0.0/13',
4772 'IM': '5.62.80.0/20',
4773 'IN': '117.192.0.0/10',
4774 'IO': '203.83.48.0/21',
4775 'IQ': '37.236.0.0/14',
4776 'IR': '2.176.0.0/12',
4777 'IS': '82.221.0.0/16',
4778 'IT': '79.0.0.0/10',
4779 'JE': '87.244.64.0/18',
4780 'JM': '72.27.0.0/17',
4781 'JO': '176.29.0.0/16',
4782 'JP': '133.0.0.0/8',
4783 'KE': '105.48.0.0/12',
4784 'KG': '158.181.128.0/17',
4785 'KH': '36.37.128.0/17',
4786 'KI': '103.25.140.0/22',
4787 'KM': '197.255.224.0/20',
4788 'KN': '198.167.192.0/19',
4789 'KP': '175.45.176.0/22',
4790 'KR': '175.192.0.0/10',
4791 'KW': '37.36.0.0/14',
4792 'KY': '64.96.0.0/15',
4793 'KZ': '2.72.0.0/13',
4794 'LA': '115.84.64.0/18',
4795 'LB': '178.135.0.0/16',
4796 'LC': '24.92.144.0/20',
4797 'LI': '82.117.0.0/19',
4798 'LK': '112.134.0.0/15',
4799 'LR': '102.183.0.0/16',
4800 'LS': '129.232.0.0/17',
4801 'LT': '78.56.0.0/13',
4802 'LU': '188.42.0.0/16',
4803 'LV': '46.109.0.0/16',
4804 'LY': '41.252.0.0/14',
4805 'MA': '105.128.0.0/11',
4806 'MC': '88.209.64.0/18',
4807 'MD': '37.246.0.0/16',
4808 'ME': '178.175.0.0/17',
4809 'MF': '74.112.232.0/21',
4810 'MG': '154.126.0.0/17',
4811 'MH': '117.103.88.0/21',
4812 'MK': '77.28.0.0/15',
4813 'ML': '154.118.128.0/18',
4814 'MM': '37.111.0.0/17',
4815 'MN': '49.0.128.0/17',
4816 'MO': '60.246.0.0/16',
4817 'MP': '202.88.64.0/20',
4818 'MQ': '109.203.224.0/19',
4819 'MR': '41.188.64.0/18',
4820 'MS': '208.90.112.0/22',
4821 'MT': '46.11.0.0/16',
4822 'MU': '105.16.0.0/12',
4823 'MV': '27.114.128.0/18',
4824 'MW': '102.70.0.0/15',
4825 'MX': '187.192.0.0/11',
4826 'MY': '175.136.0.0/13',
4827 'MZ': '197.218.0.0/15',
4828 'NA': '41.182.0.0/16',
4829 'NC': '101.101.0.0/18',
4830 'NE': '197.214.0.0/18',
4831 'NF': '203.17.240.0/22',
4832 'NG': '105.112.0.0/12',
4833 'NI': '186.76.0.0/15',
4834 'NL': '145.96.0.0/11',
4835 'NO': '84.208.0.0/13',
4836 'NP': '36.252.0.0/15',
4837 'NR': '203.98.224.0/19',
4838 'NU': '49.156.48.0/22',
4839 'NZ': '49.224.0.0/14',
4840 'OM': '5.36.0.0/15',
4841 'PA': '186.72.0.0/15',
4842 'PE': '186.160.0.0/14',
4843 'PF': '123.50.64.0/18',
4844 'PG': '124.240.192.0/19',
4845 'PH': '49.144.0.0/13',
4846 'PK': '39.32.0.0/11',
4847 'PL': '83.0.0.0/11',
4848 'PM': '70.36.0.0/20',
4849 'PR': '66.50.0.0/16',
4850 'PS': '188.161.0.0/16',
4851 'PT': '85.240.0.0/13',
4852 'PW': '202.124.224.0/20',
4853 'PY': '181.120.0.0/14',
4854 'QA': '37.210.0.0/15',
4855 'RE': '102.35.0.0/16',
4856 'RO': '79.112.0.0/13',
4857 'RS': '93.86.0.0/15',
4858 'RU': '5.136.0.0/13',
4859 'RW': '41.186.0.0/16',
4860 'SA': '188.48.0.0/13',
4861 'SB': '202.1.160.0/19',
4862 'SC': '154.192.0.0/11',
4863 'SD': '102.120.0.0/13',
4864 'SE': '78.64.0.0/12',
4865 'SG': '8.128.0.0/10',
4866 'SI': '188.196.0.0/14',
4867 'SK': '78.98.0.0/15',
4868 'SL': '102.143.0.0/17',
4869 'SM': '89.186.32.0/19',
4870 'SN': '41.82.0.0/15',
4871 'SO': '154.115.192.0/18',
4872 'SR': '186.179.128.0/17',
4873 'SS': '105.235.208.0/21',
4874 'ST': '197.159.160.0/19',
4875 'SV': '168.243.0.0/16',
4876 'SX': '190.102.0.0/20',
4878 'SZ': '41.84.224.0/19',
4879 'TC': '65.255.48.0/20',
4880 'TD': '154.68.128.0/19',
4881 'TG': '196.168.0.0/14',
4882 'TH': '171.96.0.0/13',
4883 'TJ': '85.9.128.0/18',
4884 'TK': '27.96.24.0/21',
4885 'TL': '180.189.160.0/20',
4886 'TM': '95.85.96.0/19',
4887 'TN': '197.0.0.0/11',
4888 'TO': '175.176.144.0/21',
4889 'TR': '78.160.0.0/11',
4890 'TT': '186.44.0.0/15',
4891 'TV': '202.2.96.0/19',
4892 'TW': '120.96.0.0/11',
4893 'TZ': '156.156.0.0/14',
4894 'UA': '37.52.0.0/14',
4895 'UG': '102.80.0.0/13',
4897 'UY': '167.56.0.0/13',
4898 'UZ': '84.54.64.0/18',
4899 'VA': '212.77.0.0/19',
4900 'VC': '207.191.240.0/21',
4901 'VE': '186.88.0.0/13',
4902 'VG': '66.81.192.0/20',
4903 'VI': '146.226.0.0/16',
4904 'VN': '14.160.0.0/11',
4905 'VU': '202.80.32.0/20',
4906 'WF': '117.20.32.0/21',
4907 'WS': '202.4.32.0/19',
4908 'YE': '134.35.0.0/16',
4909 'YT': '41.242.116.0/22',
4910 'ZA': '41.0.0.0/11',
4911 'ZM': '102.144.0.0/13',
4912 'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        addr, preflen = block.split('/')
        addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return str(socket.inet_ntoa(
            struct.pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(urllib.request.ProxyHandler):
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        urllib.request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do the wrapping of the socket with SOCKS
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, type)
# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
# released into Public Domain
# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387


def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # after much testing, this algorithm was deemed to be the fastest
    s = b''
    n = int(n)
    while n > 0:
        s = struct.pack('>I', n & 0xffffffff) + s
        n = n >> 32
    # strip off leading zeros
    for i in range(len(s)):
        if s[i] != b'\000'[0]:
            break
    else:
        # only happens when n == 0
        s = b'\000'
        i = 0
    s = s[i:]
    # add back some pad bytes. this could be done more efficiently w.r.t. the
    # de-padding being done above, but sigh...
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s


def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    acc = 0
    length = len(s)
    if length % 4:
        extra = (4 - length % 4)
        s = b'\000' * extra + s
        length = length + extra
    for i in range(0, length, 4):
        acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
    return acc
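
# Illustrative round-trip (sketch, hypothetical values): long_to_bytes() and
# bytes_to_long() are inverses; blocksize only left-pads with NUL bytes.
#
#   >>> long_to_bytes(65537)
#   b'\x01\x00\x01'
#   >>> long_to_bytes(65537, blocksize=4)
#   b'\x00\x01\x00\x01'
#   >>> bytes_to_long(b'\x00\x01\x00\x01')
#   65537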
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted


def pkcs1pad(data, length):
    """
    Pad input data with the PKCS#1 scheme

    @param {int[]} data        input data
    @param {int} length        target length
    @returns {int[]}           padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
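
# Illustrative sketch of the padding layout (hypothetical values): for a
# target length of 16 and a 3-byte message, the result is
# [0, 2, <10 random bytes>, 0, m0, m1, m2] - always exactly `length` integers.
#
#   >>> padded = pkcs1pad([1, 2, 3], 16)
#   >>> len(padded), padded[:2], padded[-4:]
#   (16, [0, 2], [0, 1, 2, 3])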
def _base_n_table(n, table):
    if not table and not n:
        raise ValueError('Either table or n must be specified')
    table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]

    if n and n != len(table):
        raise ValueError(f'base {n} exceeds table length {len(table)}')
    return table


def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string"""
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    result, base = '', len(table)
    while num:
        result = table[num % base] + result
        num = num // base
    return result


def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    table = {char: index for index, char in enumerate(_base_n_table(n, table))}
    result, base = 0, len(table)
    for char in string:
        result = result * base + table[char]
    return result


def decode_base(value, digits):
    deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed '
                        f'in a future version. Use {__name__}.decode_base_n instead')
    return decode_base_n(value, table=digits)
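
# Illustrative round-trip (sketch, hypothetical values): with n=16 the default
# table behaves like lowercase hexadecimal.
#
#   >>> encode_base_n(1000, 16)
#   '3e8'
#   >>> decode_base_n('3e8', 16)
#   1000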
def decode_packed_codes(code):
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)


def caesar(s, alphabet, shift):
    if shift == 0:
        return s
    l = len(alphabet)
    return ''.join(
        alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
        for c in s)


def rot47(s):
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
def parse_m3u8_attributes(attrib):
    info = {}
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]
        info[key] = val
    return info


def urshift(val, n):
    return val >> n if val >= 0 else (val + 0x100000000) >> n
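
# Illustrative sketch (hypothetical values): quoted attribute values may
# contain commas, and urshift() behaves like JavaScript's unsigned >>> on
# 32-bit values.
#
#   >>> parse_m3u8_attributes('BANDWIDTH=630000,CODECS="mp4a.40.2,avc1.4D401E"')
#   {'BANDWIDTH': '630000', 'CODECS': 'mp4a.40.2,avc1.4D401E'}
#   >>> urshift(-1, 1)
#   2147483647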
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise OSError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]

    chunks = []
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data,
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise OSError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0
                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c
                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules
    if xattr:
        if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
            # Unicode arguments are not supported in pyxattr until version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            if version_tuple(xattr.__version__) >= (0, 5, 0):
                setxattr = xattr.set
            else:
                setxattr = xattr.setxattr
        else:
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
def random_birthday(year_field, month_field, day_field):
    start_date = datetime.date(1950, 1, 1)
    end_date = datetime.date(1995, 12, 31)
    offset = random.randint(0, (end_date - start_date).days)
    random_date = start_date + datetime.timedelta(offset)
    return {
        year_field: str(random_date.year),
        month_field: str(random_date.month),
        day_field: str(random_date.day),
    }


def find_available_port(interface=''):
    try:
        with socket.socket() as sock:
            sock.bind((interface, 0))
            return sock.getsockname()[1]
    except OSError:
        return None
# Templates for internet shortcut files, which are plain text files.
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values used below contain the characters that should not be percent-encoded.
    # Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding.
    # Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
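
# Illustrative sketch (hypothetical IRI): the hostname is IDNA-encoded and
# non-ASCII path/query characters are percent-encoded as UTF-8, while
# already-ASCII parts are left alone.
#
#   >>> iri_to_uri('http://example.com/p\u00e4th?q=v\u00e4lue')
#   'http://example.com/p%C3%A4th?q=v%C3%A4lue'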
def to_high_limit_path(path):
    if sys.platform in ['win32', 'cygwin']:
        # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
        return '\\\\?\\' + os.path.abspath(path)

    return path
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    val = traverse_obj(obj, *variadic(field))
    if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
        return default
    return template % func(val)
def clean_podcast_url(url):
    return re.sub(r'''(?x)
            media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            cn\.co| # https://podcorn.com/analytics-prefix/
            st\.fm # https://podsights.com/docs/
        )/''', '', url)


_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
def make_dir(path, to_screen=None):
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False


def get_executable_path():
    from .update import _get_variant_and_executable_path

    return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))


def get_user_config_dirs(package_name):
    # .config (e.g. ~/.config/package_name)
    xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
    yield os.path.join(xdg_config_home, package_name)

    # appdata (%APPDATA%/package_name)
    appdata_dir = os.getenv('appdata')
    if appdata_dir:
        yield os.path.join(appdata_dir, package_name)

    # home (~/.package_name)
    yield os.path.join(compat_expanduser('~'), f'.{package_name}')


def get_system_config_dirs(package_name):
    # /etc/package_name
    yield os.path.join('/etc', package_name)
def traverse_obj(
        obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    """
    Safely traverse nested `dict`s and `Sequence`s

    >>> obj = [{}, {"key": "value"}]
    >>> traverse_obj(obj, (1, "key"))
    "value"

    Each of the provided `paths` is tested and the first producing a valid result will be returned.
    The next path will also be tested if the path branched but no results could be found.
    Supported values for traversal are `Mapping`, `Sequence` and `re.Match`.
    Unhelpful values (`{}`, `None`) are treated as the absence of a value and discarded.

    The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.

    The keys in the path can be one of:
        - `None`:           Return the current object.
        - `set`:            Requires the only item in the set to be a type or function,
                            like `{type}`/`{func}`. If a `type`, returns only values
                            of this type. If a function, returns `func(obj)`.
        - `str`/`int`:      Return `obj[key]`. For `re.Match`, return `obj.group(key)`.
        - `slice`:          Branch out and return all values in `obj[key]`.
        - `Ellipsis`:       Branch out and return a list of all values.
        - `tuple`/`list`:   Branch out and return a list of all matching values.
                            Read as: `[traverse_obj(obj, branch) for branch in branches]`.
        - `function`:       Branch out and return values filtered by the function.
                            Read as: `[value for key, value in obj if function(key, value)]`.
                            For `Sequence`s, `key` is the index of the value.
                            For `re.Match`es, `key` is the group number (0 = full match)
                            as well as additionally any group names, if given.
        - `dict`            Transform the current object and return a matching dict.
                            Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.

        `tuple`, `list`, and `dict` all support nested paths and branches.

    @params paths           Paths which to traverse by.
    @param default          Value to return if the paths do not match.
                            If the last key in the path is a `dict`, it will apply to each value inside
                            the dict instead, depth first. Try to avoid if using nested `dict` keys.
    @param expected_type    If a `type`, only accept final values of this type.
                            If any other callable, try to call the function on each result.
                            If the last key in the path is a `dict`, it will apply to each value inside
                            the dict instead, recursively. This does respect branching paths.
    @param get_all          If `False`, return the first matching result, otherwise all matching ones.
    @param casesense        If `False`, consider string dictionary keys as case insensitive.

    The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API

    @param is_user_input    Whether the keys are generated from user input.
                            If `True` strings get converted to `int`/`slice` if needed.
    @param traverse_string  Whether to traverse into objects as strings.
                            If `True`, any non-compatible object will first be
                            converted into a string and then traversed into.
                            The return value of that path will be a string instead,
                            not respecting any further branching.

    @returns                The result of the object traversal.
                            If successful, `get_all=True`, and the path branches at least once,
                            then a list of results is returned instead.
                            If no `default` is given and the last path branches, a `list` of results
                            is always returned. If a path ends on a `dict` that result will always be a `dict`.
    """
    is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes))
    casefold = lambda k: k.casefold() if isinstance(k, str) else k

    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    else:
        type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,))

    def apply_key(key, obj, is_last):
        branching = False
        result = None

        if obj is None and traverse_string:
            pass

        elif key is None:
            result = obj

        elif isinstance(key, set):
            assert len(key) == 1, 'Set should only be used to wrap a single item'
            item = next(iter(key))
            if isinstance(item, type):
                if isinstance(obj, item):
                    result = obj
            else:
                result = try_call(item, args=(obj,))

        elif isinstance(key, (list, tuple)):
            branching = True
            result = itertools.chain.from_iterable(
                apply_path(obj, branch, is_last)[0] for branch in key)

        elif key is ...:
            branching = True
            if isinstance(obj, collections.abc.Mapping):
                result = obj.values()
            elif is_sequence(obj):
                result = obj
            elif isinstance(obj, re.Match):
                result = obj.groups()
            elif traverse_string:
                branching = False
                result = str(obj)
            else:
                result = ()

        elif callable(key):
            branching = True
            if isinstance(obj, collections.abc.Mapping):
                iter_obj = obj.items()
            elif is_sequence(obj):
                iter_obj = enumerate(obj)
            elif isinstance(obj, re.Match):
                iter_obj = itertools.chain(
                    enumerate((obj.group(), *obj.groups())),
                    obj.groupdict().items())
            elif traverse_string:
                branching = False
                iter_obj = enumerate(str(obj))
            else:
                iter_obj = ()

            result = (v for k, v in iter_obj if try_call(key, args=(k, v)))
            if not branching:  # string traversal
                result = ''.join(result)

        elif isinstance(key, dict):
            iter_obj = ((k, _traverse_obj(obj, v, False, is_last)) for k, v in key.items())
            result = {
                k: v if v is not None else default for k, v in iter_obj
                if v is not None or default is not NO_DEFAULT
            } or None

        elif isinstance(obj, collections.abc.Mapping):
            result = (obj.get(key) if casesense or (key in obj) else
                      next((v for k, v in obj.items() if casefold(k) == key), None))

        elif isinstance(obj, re.Match):
            if isinstance(key, int) or casesense:
                with contextlib.suppress(IndexError):
                    result = obj.group(key)

            elif isinstance(key, str):
                result = next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)

        elif isinstance(key, (int, slice)):
            if is_sequence(obj):
                branching = isinstance(key, slice)
                with contextlib.suppress(IndexError):
                    result = obj[key]
            elif traverse_string:
                with contextlib.suppress(IndexError):
                    result = str(obj)[key]

        return branching, result if branching else (result,)

    def lazy_last(iterable):
        iterator = iter(iterable)
        prev = next(iterator, NO_DEFAULT)
        if prev is NO_DEFAULT:
            return

        for item in iterator:
            yield False, prev
            prev = item

        yield True, prev

    def apply_path(start_obj, path, test_type):
        objs = (start_obj,)
        has_branched = False

        key = None
        for last, key in lazy_last(variadic(path, (str, bytes, dict, set))):
            if is_user_input and isinstance(key, str):
                if key == ':':
                    key = ...
                elif ':' in key:
                    key = slice(*map(int_or_none, key.split(':')))
                elif int_or_none(key) is not None:
                    key = int(key)

            if not casesense and isinstance(key, str):
                key = key.casefold()

            if __debug__ and callable(key):
                # Verify function signature
                inspect.signature(key).bind(None, None)

            new_objs = []
            for obj in objs:
                branching, results = apply_key(key, obj, last)
                has_branched |= branching
                new_objs.append(results)

            objs = itertools.chain.from_iterable(new_objs)

        if test_type and not isinstance(key, (dict, list, tuple)):
            objs = map(type_test, objs)

        return objs, has_branched, isinstance(key, dict)

    def _traverse_obj(obj, path, allow_empty, test_type):
        results, has_branched, is_dict = apply_path(obj, path, test_type)
        results = LazyList(item for item in results if item not in (None, {}))
        if get_all and has_branched:
            if results:
                return results.exhaust()
            if allow_empty:
                return [] if default is NO_DEFAULT else default
            return None

        return results[0] if results else {} if allow_empty and is_dict else None

    for index, path in enumerate(paths, 1):
        result = _traverse_obj(obj, path, index == len(paths), True)
        if result is not None:
            return result

    return None if default is NO_DEFAULT else default
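
# Illustrative sketch of common traversals (hypothetical data):
#
#   >>> info = {'formats': [{'url': 'https://a'}, {'height': 720}, {'url': 'https://b'}]}
#   >>> traverse_obj(info, ('formats', 0, 'url'))
#   'https://a'
#   >>> traverse_obj(info, ('formats', ..., 'url'))  # branch over all formats
#   ['https://a', 'https://b']
#   >>> traverse_obj(info, ('formats', 3, 'url'), default='none')
#   'none'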
def traverse_dict(dictn, keys, casesense=True):
    deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.traverse_obj" instead')
    return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)


def get_first(obj, keys, **kwargs):
    return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)


def time_seconds(**kwargs):
    """
    Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
    """
    return time.time() + datetime.timedelta(**kwargs).total_seconds()
# create a JSON Web Signature (JWS) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
    return token


# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    header_b64, payload_b64, signature_b64 = jwt.split('.')
    # add trailing ='s that may have been stripped, superfluous ='s are ignored
    payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
    return payload_data
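
# Illustrative round-trip (sketch, hypothetical payload and key):
# jwt_encode_hs256() returns the compact `header.payload.signature` token as
# bytes; jwt_decode_hs256() only parses the payload and does not verify the
# signature.
#
#   >>> token = jwt_encode_hs256({'user': 'test'}, 'secret-key')
#   >>> jwt_decode_hs256(token.decode())
#   {'user': 'test'}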
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


@functools.cache
def supports_terminal_sequences(stream):
    if compat_os_name == 'nt':
        if not WINDOWS_VT_MODE:
            return False
    elif not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False


def windows_enable_vt_mode():
    """Ref: https://bugs.python.org/issue30075 """
    if get_windows_version() < (10, 0, 10586):
        return

    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    supports_terminal_sequences.cache_clear()


_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    return _terminal_sequences_re.sub('', string)


def number_of_digits(number):
    return len('%d' % number)
def join_nonempty(*values, delim='-', from_dict=None):
    if from_dict is not None:
        values = (traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(map(str, filter(None, values)))
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    max_dimensions = max(
        (tuple(format.get(k) or 0 for k in _keys) for format in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        return thumbnails
    return [
        merge_dicts(
            {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
            dict(zip(_keys, max_dimensions)), thumbnail)
        for thumbnail in thumbnails
    ]


def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    if not range:
        return None, None, None
    crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if not crg:
        return None, None, None
    return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
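
# Illustrative sketch (hypothetical header values):
#
#   >>> parse_http_range('bytes 500-999/1234')
#   (500, 999, 1234)
#   >>> parse_http_range('bytes=500-')
#   (500, None, None)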
def read_stdin(what):
    eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin


def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)
    """

    # BOM marks are given priority over declarations
    for bom, enc in BOMS:
        if data.startswith(bom):
            return enc, len(bom)

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    data = data.replace(b'\0', b'')
    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
    return mobj.group(1).decode() if mobj else None, 0
class Config:
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
class WebSocketsWrapper:
    """Wraps websockets module to use in non-async scopes"""
    pool = None

    def __init__(self, url, headers=None, connect=True):
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            self.loop.close()
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: if any new library that uses asyncio needs to be run in non-async code, move these functions out of this class
    @staticmethod
    def run_with_loop(main, loop):
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
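
# Illustrative sketch (hypothetical headers): keys are title-cased and later
# dicts win on conflicts.
#
#   >>> merge_headers({'user-agent': 'UA-1', 'x-custom': '1'}, {'User-Agent': 'UA-2'})
#   {'User-Agent': 'UA-2', 'X-Custom': '1'}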
def cached_method(f):
    """Cache a method"""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        bound_args = signature.bind(self, *args, **kwargs)
        bound_args.apply_defaults()
        key = tuple(bound_args.arguments.values())[1:]

        cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
        if key not in cache:
            cache[key] = f(self, *args, **kwargs)
        return cache[key]
    return wrapper
class classproperty:
    """property access for class methods with optional caching"""
    def __new__(cls, func=None, *args, **kwargs):
        if not func:
            return functools.partial(cls, *args, **kwargs)
        return super().__new__(cls)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        if self._cache is None:
            return self.func(cls)
        elif cls not in self._cache:
            self._cache[cls] = self.func(cls)
        return self._cache[cls]


class function_with_repr:
    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func, self.__repr = func, repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        if self.__repr:
            return self.__repr
        return f'{self.func.__module__}.{self.func.__qualname__}'
class Namespace(types.SimpleNamespace):
    """Immutable namespace"""

    def __iter__(self):
        return iter(self.__dict__.values())

    @property
    def items_(self):
        return self.__dict__.items()


MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            if self._error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries"""
        if count > retries:
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
def make_archive_id(ie, video_id):
    ie_key = ie if isinstance(ie, str) else ie.ie_key()
    return f'{ie_key.lower()} {video_id}'


def truncate_string(s, left, right=0):
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    return f'{s[:left-3]}...{s[-right:] if right else ""}'
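
# Illustrative sketch (hypothetical values): the ellipsis replaces the middle
# of the string, keeping `left - 3` leading and `right` trailing characters.
#
#   >>> truncate_string('0123456789', 6)
#   '012...'
#   >>> truncate_string('0123456789', 5, 2)
#   '01...89'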
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for val in options:
        discard = val.startswith('-')
        if discard:
            val = val[1:]

        if val in alias_dict:
            val = alias_dict[val] if not discard else [
                i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(val, alias_dict, start=requested)
            continue

        current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
                   else [val] if val in alias_dict['all'] else None)
        if current is None:
            raise ValueError(val)

        if discard:
            for item in current:
                while item in requested:
                    requested.remove(item)
        else:
            requested.extend(current)

    return orderedSet(requested)
6206 regex
= r
' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
6208 default
= ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
6209 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
6210 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
6211 ytdl_default
= ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
6212 'height', 'width', 'proto', 'vext', 'abr', 'aext',
6213 'fps', 'fs_approx', 'source', 'id')
6216 'vcodec': {'type': 'ordered', 'regex': True,
6217 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
6218 'acodec': {'type': 'ordered', 'regex': True,
6219 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
6220 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
6221 'order': ['dv', '(hdr)?12', r
'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
6222 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
6223 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
6224 'vext': {'type': 'ordered', 'field': 'video_ext',
6225 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
6226 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
6227 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
6228 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
6229 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
6230 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000}
,
6231 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
6232 'field': ('vcodec', 'acodec'),
6233 'function': lambda it
: int(any(v
!= 'none' for v
in it
))},
6234 'ie_pref': {'priority': True, 'type': 'extractor'}
,
6235 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)}
,
6236 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)}
,
6237 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1}
,
6238 'quality': {'convert': 'float', 'default': -1}
,
6239 'filesize': {'convert': 'bytes'}
,
6240 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'}
,
6241 'id': {'convert': 'string', 'field': 'format_id'}
,
6242 'height': {'convert': 'float_none'}
,
6243 'width': {'convert': 'float_none'}
,
6244 'fps': {'convert': 'float_none'}
,
6245 'channels': {'convert': 'float_none', 'field': 'audio_channels'}
,
6246 'tbr': {'convert': 'float_none'}
,
6247 'vbr': {'convert': 'float_none'}
,
6248 'abr': {'convert': 'float_none'}
,
6249 'asr': {'convert': 'float_none'}
,
6250 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1}
,
6252 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')}
,
6253 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True}
,
6254 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')}
,
6255 'ext': {'type': 'combined', 'field': ('vext', 'aext')}
,
6256 'res': {'type': 'multiple', 'field': ('height', 'width'),
6257 'function': lambda it
: (lambda l
: min(l
) if l
else 0)(tuple(filter(None, it
)))},
6259 # Actual field names
6260 'format_id': {'type': 'alias', 'field': 'id'}
,
6261 'preference': {'type': 'alias', 'field': 'ie_pref'}
,
6262 'language_preference': {'type': 'alias', 'field': 'lang'}
,
6263 'source_preference': {'type': 'alias', 'field': 'source'}
,
6264 'protocol': {'type': 'alias', 'field': 'proto'}
,
6265 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'}
,
6266 'audio_channels': {'type': 'alias', 'field': 'channels'}
,
6269 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True}
,
6270 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True}
,
6271 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True}
,
6272 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True}
,
6273 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True}
,
6274 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True}
,
6275 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True}
,
6276 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True}
,
6277 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True}
,
6278 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True}
,
6279 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True}
,
6280 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True}
,
6281 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True}
,
6282 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True}
,
6283 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True}
,
6284 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True}
,
6285 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True}
,
6286 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True}
,
6287 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True}
,
6288 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True}
,
6291 def __init__(self
, ydl
, field_preference
):
6294 self
.evaluate_params(self
.ydl
.params
, field_preference
)
6295 if ydl
.params
.get('verbose'):
6296 self
.print_verbose_info(self
.ydl
.write_debug
)
6298 def _get_field_setting(self
, field
, key
):
6299 if field
not in self
.settings
:
6300 if key
in ('forced', 'priority'):
6302 self
.ydl
.deprecated_feature(f
'Using arbitrary fields ({field}) for format sorting is '
6303 'deprecated and may be removed in a future version')
6304 self
.settings
[field
] = {}
6305 propObj
= self
.settings
[field
]
6306 if key
not in propObj
:
6307 type = propObj
.get('type')
6309 default
= 'preference' if type == 'extractor' else (field
,) if type in ('combined', 'multiple') else field
6310 elif key
== 'convert':
6311 default
= 'order' if type == 'ordered' else 'float_string' if field
else 'ignore'
6313 default
= {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}
.get(key
, None)
6314 propObj
[key
] = default
6317 def _resolve_field_value(self
, field
, value
, convertNone
=False):
6322 value
= value
.lower()
6323 conversion
= self
._get
_field
_setting
(field
, 'convert')
6324 if conversion
== 'ignore':
6326 if conversion
== 'string':
6328 elif conversion
== 'float_none':
6329 return float_or_none(value
)
6330 elif conversion
== 'bytes':
6331 return parse_bytes(value
)
6332 elif conversion
== 'order':
6333 order_list
= (self
._use
_free
_order
and self
._get
_field
_setting
(field
, 'order_free')) or self
._get
_field
_setting
(field
, 'order')
6334 use_regex
= self
._get
_field
_setting
(field
, 'regex')
6335 list_length
= len(order_list
)
6336 empty_pos
= order_list
.index('') if '' in order_list
else list_length
+ 1
6337 if use_regex
and value
is not None:
6338 for i
, regex
in enumerate(order_list
):
6339 if regex
and re
.match(regex
, value
):
6340 return list_length
- i
6341 return list_length
- empty_pos
# not in list
6342 else: # not regex or value = None
6343 return list_length
- (order_list
.index(value
) if value
in order_list
else empty_pos
)
6345 if value
.isnumeric():
6348 self
.settings
[field
]['convert'] = 'string'

    def evaluate_params(self, params, sort_extractor):
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
               else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)
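    # Editorial sketch of the strings this loop parses (example values only, assuming the
    # -S/--format-sort syntax documented by yt-dlp): a bare key like 'res' selects a field,
    # a '+' prefix (e.g. '+size') sets reverse, 'field:limit' (e.g. 'res:1080') attaches a
    # limit, and 'field~limit' marks the limit as "closest" rather than a cap. The exact
    # accepted grammar is defined by self.regex.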

    def print_verbose_info(self, write_debug):
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))
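    # Editorial note on the tuple returned above: each field contributes a small tuple that
    # Python compares lexicographically, so the first element acts as a bucket (-10 for
    # missing values, 1 for non-numeric strings, 0 or -1 for numbers relative to any limit)
    # and the remaining elements order values within that bucket.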

    def _calculate_field_preference(self, format, field):
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')
            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)

        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #     format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('tbr') is None:
            if format.get('vbr') is not None and format.get('abr') is not None:
                format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
        else:
            if format.get('vcodec') != 'none' and format.get('vbr') is None:
                format['vbr'] = format.get('tbr') - format.get('abr', 0)
            if format.get('acodec') != 'none' and format.get('abr') is None:
                format['abr'] = format.get('tbr') - format.get('vbr', 0)

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
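    # Illustrative usage sketch (caller-side, not part of this module; the variable names
    # are assumptions): calculate_preference() is intended as a sort key over a list of
    # format dicts, e.g.
    #     sorter = FormatSorter(ydl, field_preference)
    #     formats.sort(key=sorter.calculate_preference)
    # so that formats compare by the per-field preference tuples built above.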


has_certifi = bool(certifi)
has_websockets = bool(websockets)


def load_plugins(name, suffix, namespace):
    from .plugins import load_plugins
    ret = load_plugins(name, suffix)
    namespace.update(ret