47 import xml
.etree
.ElementTree
49 from . import traversal
51 from ..compat
import functools
# isort: split
52 from ..compat
import (
53 compat_etree_fromstring
,
55 compat_HTMLParseError
,
59 from ..dependencies
import websockets
, xattr
# NOTE(review): rebinding __name__ makes this submodule report itself as its
# parent package; anything that reads __name__ at runtime sees the parent name.
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module
# This is not clearly defined otherwise
# ``re.Pattern`` is exactly the object returned by ``type(re.compile(''))``
compiled_regex_type = re.Pattern
68 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# Full English month names, index 0 = January .. index 11 = December
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
85 'en': ENGLISH_MONTH_NAMES
,
87 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
88 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
89 # these follow the genitive grammatical case (dopełniacz)
90 # some websites might be using nominative, which will require another month list
91 # https://en.wikibooks.org/wiki/Polish/Noun_cases
92 'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
93 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
96 # From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
98 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
99 'AST': -4, 'ADT': -3, # Atlantic (used in Canada)
100 'EST': -5, 'EDT': -4, # Eastern
101 'CST': -6, 'CDT': -5, # Central
102 'MST': -7, 'MDT': -6, # Mountain
103 'PST': -8, 'PDT': -7 # Pacific
# needed for sanitizing filenames in restricted mode
# Maps each accented character to an ASCII replacement; most map to a single
# letter, while Æ/Œ/Þ/ß etc. map to two-letter sequences ('AE', 'OE', 'TH', 'ss')
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
139 '%Y-%m-%d %H:%M:%S.%f',
140 '%Y-%m-%d %H:%M:%S:%f',
143 '%Y-%m-%dT%H:%M:%SZ',
144 '%Y-%m-%dT%H:%M:%S.%fZ',
145 '%Y-%m-%dT%H:%M:%S.%f0Z',
147 '%Y-%m-%dT%H:%M:%S.%f',
150 '%b %d %Y at %H:%M:%S',
152 '%B %d %Y at %H:%M:%S',
156 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
157 DATE_FORMATS_DAY_FIRST
.extend([
168 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
169 DATE_FORMATS_MONTH_FIRST
.extend([
# Matches the argument list of "packed" (P.A.C.K.E.R.-style) JavaScript payloads,
# capturing the payload, radix, count and the '|'-separated symbol table
# -- presumably the output of eval(function(p,a,c,k,e,d)...); TODO confirm against callers
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Captures the body of a <script type="application/ld+json"> block (object or
# array) into the named group 'json_ld'; case-insensitive, dot matches newlines
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
# A decimal number with an optional fractional part
NUMBER_RE = r'\d+(?:\.\d+)?'
184 def preferredencoding():
185 """Get preferred encoding.
187 Returns the best encoding scheme for the system, based on
188 locale.getpreferredencoding() and some further tweaks.
191 pref = locale.getpreferredencoding()
199 def write_json_file(obj, fn):
200 """ Encode obj as JSON and write it to fn, atomically if possible """
202 tf = tempfile.NamedTemporaryFile(
203 prefix=f'{os.path.basename(fn)}
.', dir=os.path.dirname(fn),
204 suffix='.tmp
', delete=False, mode='w
', encoding='utf
-8')
208 json.dump(obj, tf, ensure_ascii=False)
209 if sys.platform == 'win32
':
210 # Need to remove existing file on Windows, else os.rename raises
211 # WindowsError or FileExistsError.
212 with contextlib.suppress(OSError):
214 with contextlib.suppress(OSError):
217 os.chmod(tf.name, 0o666 & ~mask)
218 os.rename(tf.name, fn)
220 with contextlib.suppress(OSError):
def find_xpath_attr(node, xpath, key, val=None):
    """Return the first node matching ``xpath`` that carries attribute ``key``
    (and, if ``val`` is given, whose value equals ``val``)."""
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = '[@%s]' % key
    else:
        predicate = f"[@{key}='{val}']"
    return node.find(xpath + predicate)
231 # On python2.6 the xml.etree.ElementTree.Element methods don't support
232 # the namespace parameter
235 def xpath_with_ns(path
, ns_map
):
236 components
= [c
.split(':') for c
in path
.split('/')]
240 replaced
.append(c
[0])
243 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
244 return '/'.join(replaced
)
247 def xpath_element(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
248 def _find_xpath(xpath
):
249 return node
.find(xpath
)
251 if isinstance(xpath
, str):
252 n
= _find_xpath(xpath
)
260 if default
is not NO_DEFAULT
:
263 name
= xpath
if name
is None else name
264 raise ExtractorError('Could not find XML element %s' % name
)
270 def xpath_text(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
271 n
= xpath_element(node
, xpath
, name
, fatal
=fatal
, default
=default
)
272 if n
is None or n
== default
:
275 if default
is not NO_DEFAULT
:
278 name
= xpath
if name
is None else name
279 raise ExtractorError('Could not find XML element\'s text %s' % name
)
285 def xpath_attr(node
, xpath
, key
, name
=None, fatal
=False, default
=NO_DEFAULT
):
286 n
= find_xpath_attr(node
, xpath
, key
)
288 if default
is not NO_DEFAULT
:
291 name
= f
'{xpath}[@{key}]' if name
is None else name
292 raise ExtractorError('Could not find XML attribute %s' % name
)
def get_element_by_id(id, html, **kwargs):
    """Return the content of the element whose ``id`` attribute matches in the given HTML document."""
    # Delegate to the generic attribute-based lookup
    return get_element_by_attribute('id', id, html, **kwargs)
def get_element_html_by_id(id, html, **kwargs):
    """Return the whole HTML of the element whose ``id`` attribute matches in the given document."""
    # Delegate to the generic attribute-based lookup
    return get_element_html_by_attribute('id', id, html, **kwargs)
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document."""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document."""
    matches = get_elements_html_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag carrying the given attribute/value in the passed HTML document."""
    matches = get_elements_by_attribute(attribute, value, html, **kwargs)
    if not matches:
        return None
    return matches[0]
def get_element_html_by_attribute(attribute, value, html, **kargs):
    """Return the html of the first tag carrying the given attribute/value in the passed HTML document."""
    matches = get_elements_html_by_attribute(attribute, value, html, **kargs)
    if not matches:
        return None
    return matches[0]
def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list."""
    # Match the class name as a whole word inside a quoted/space-separated class list
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list."""
    # Match the class name as a whole word inside a quoted/space-separated class list
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_html_by_attribute('class', class_value_re, html, escape_value=False)
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags carrying the specified attribute in the passed HTML document."""
    pairs = get_elements_text_and_html_by_attribute(*args, **kwargs)
    return [text for text, _html in pairs]
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags carrying the specified attribute in the passed HTML document."""
    pairs = get_elements_text_and_html_by_attribute(*args, **kwargs)
    return [whole for _text, whole in pairs]
354 def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w
:.-]+', escape_value=True):
356 Return the text (content) and the html (whole) of the tag with the specified
357 attribute in the passed HTML document
362 quote = '' if re.match(r'''[\s"'`
=<>]''', value) else '?'
364 value = re.escape(value) if escape_value else value
366 partial_element_re = rf'''(?x
)
368 (?
:\
s(?
:[^
>"']|"[^
"]*"|
'[^']*')*)?
369 \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
372 for m in re.finditer(partial_element_re, html):
373 content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
376 unescapeHTML(re.sub(r'^(?P<q>["\'])(?P
<content
>.*)(?P
=q
)$
', r'\g
<content
>', content, flags=re.DOTALL)),
381 class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
383 HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
384 closing tag for the first opening tag it has encountered, and can be used
388 class HTMLBreakOnClosingTagException(Exception):
392 self.tagstack = collections.deque()
393 html.parser.HTMLParser.__init__(self)
398 def __exit__(self, *_):
402 # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
403 # so data remains buffered; we no longer have any interest in it, thus
404 # override this method to discard it
407 def handle_starttag(self, tag, _):
408 self.tagstack.append(tag)
410 def handle_endtag(self, tag):
411 if not self.tagstack:
412 raise compat_HTMLParseError('no tags
in the stack
')
414 inner_tag = self.tagstack.pop()
418 raise compat_HTMLParseError(f'matching opening tag
for closing {tag} tag
not found
')
419 if not self.tagstack:
420 raise self.HTMLBreakOnClosingTagException()
423 # XXX: This should be far less strict
424 def get_element_text_and_html_by_tag(tag, html):
426 For the first element with the specified tag in the passed HTML document
427 return its' content (text
) and the whole
element (html
)
429 def find_or_raise(haystack, needle, exc):
431 return haystack.index(needle)
434 closing_tag = f'</{tag}>'
435 whole_start = find_or_raise(
436 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
437 content_start = find_or_raise(
438 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
439 content_start += whole_start + 1
440 with HTMLBreakOnClosingTagParser() as parser:
441 parser.feed(html[whole_start:content_start])
442 if not parser.tagstack or parser.tagstack[0] != tag:
443 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
444 offset = content_start
445 while offset < len(html):
446 next_closing_tag_start = find_or_raise(
447 html[offset:], closing_tag,
448 compat_HTMLParseError(f'closing {tag} tag not found'))
449 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
451 parser.feed(html[offset:offset + next_closing_tag_end])
452 offset += next_closing_tag_end
453 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
454 return html[content_start:offset + next_closing_tag_start], \
455 html[whole_start:offset + next_closing_tag_end]
456 raise compat_HTMLParseError('unexpected end of html')
459 class HTMLAttributeParser(html.parser.HTMLParser):
460 """Trivial HTML parser to gather the attributes
for a single element
"""
464 html.parser.HTMLParser.__init__(self)
466 def handle_starttag(self, tag, attrs):
467 self.attrs = dict(attrs)
468 raise compat_HTMLParseError('done')
471 class HTMLListAttrsParser(html.parser.HTMLParser):
472 """HTML parser to gather the attributes
for the elements of a
list"""
475 html.parser.HTMLParser.__init__(self)
479 def handle_starttag(self, tag, attrs):
480 if tag == 'li' and self._level == 0:
481 self.items.append(dict(attrs))
484 def handle_endtag(self, tag):
488 def extract_attributes(html_element):
489 """Given a string
for an HTML element such
as
491 a
="foo" B
="bar" c
="&98;az" d
=boz
492 empty
= noval entity
="&"
495 Decode
and return a dictionary of attributes
.
497 'a': 'foo', 'b': 'bar', c
: 'baz', d
: 'boz',
498 'empty': '', 'noval': None, 'entity': '&',
499 'sq': '"', 'dq': '\''
502 parser = HTMLAttributeParser()
503 with contextlib.suppress(compat_HTMLParseError):
504 parser.feed(html_element)
509 def parse_list(webpage):
510 """Given a string
for an series of HTML
<li
> elements
,
511 return a dictionary of their attributes
"""
512 parser = HTMLListAttrsParser()
518 def clean_html(html):
519 """Clean an HTML snippet into a readable string
"""
521 if html is None: # Convenience for sanitizing descriptions etc.
524 html = re.sub(r'\s+', ' ', html)
525 html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
526 html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
528 html = re.sub('<.*?>', '', html)
529 # Replace html entities
530 html = unescapeHTML(html)
534 class LenientJSONDecoder(json.JSONDecoder):
536 def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
537 self.transform_source, self.ignore_extra = transform_source, ignore_extra
538 self._close_attempts = 2 * close_objects
539 super().__init__(*args, **kwargs)
542 def _close_object(err):
543 doc = err.doc[:err.pos]
544 # We need to add comma first to get the correct error message
545 if err.msg.startswith('Expecting \',\''):
547 elif not doc.endswith(','):
550 if err.msg.startswith('Expecting property name'):
551 return doc[:-1] + '}'
552 elif err.msg.startswith('Expecting value'):
553 return doc[:-1] + ']'
556 if self.transform_source:
557 s = self.transform_source(s)
558 for attempt in range(self._close_attempts + 1):
560 if self.ignore_extra:
561 return self.raw_decode(s.lstrip())[0]
562 return super().decode(s)
563 except json.JSONDecodeError as e:
566 elif attempt < self._close_attempts:
567 s = self._close_object(e)
570 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
571 assert False, 'Too many attempts to decode JSON'
574 def sanitize_open(filename, open_mode):
575 """Try to
open the given filename
, and slightly tweak it
if this fails
.
577 Attempts to
open the given filename
. If this fails
, it tries to change
578 the filename slightly
, step by step
, until it
's either able to open it
579 or it fails and raises a final exception, like the standard open()
582 It returns the tuple (stream, definitive_file_name).
585 if sys.platform == 'win32
':
588 # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
589 with contextlib.suppress(io.UnsupportedOperation):
590 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
591 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
593 for attempt in range(2):
596 if sys.platform == 'win32
':
597 # FIXME: An exclusive lock also locks the file from being read.
598 # Since windows locks are mandatory, don't lock the
file on
windows (for now
).
599 # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
600 raise LockingUnsupportedError()
601 stream
= locked_file(filename
, open_mode
, block
=False).__enter
__()
603 stream
= open(filename
, open_mode
)
604 return stream
, filename
605 except OSError as err
:
606 if attempt
or err
.errno
in (errno
.EACCES
,):
608 old_filename
, filename
= filename
, sanitize_path(filename
)
609 if old_filename
== filename
:
613 def timeconvert(timestr
):
614 """Convert RFC 2822 defined time string into system timestamp"""
616 timetuple
= email
.utils
.parsedate_tz(timestr
)
617 if timetuple
is not None:
618 timestamp
= email
.utils
.mktime_tz(timetuple
)
622 def sanitize_filename(s
, restricted
=False, is_id
=NO_DEFAULT
):
623 """Sanitizes a string so it could be used as part of a filename.
624 @param restricted Use a stricter subset of allowed characters
625 @param is_id Whether this is an ID that should be kept unchanged if possible.
626 If unset, yt-dlp's new sanitization rules are in effect
631 def replace_insane(char
):
632 if restricted
and char
in ACCENT_CHARS
:
633 return ACCENT_CHARS
[char
]
634 elif not restricted
and char
== '\n':
636 elif is_id
is NO_DEFAULT
and not restricted
and char
in '"*:<>?|/\\':
637 # Replace with their full-width unicode counterparts
638 return {'/': '\u29F8', '\\': '\u29f9'}
.get(char
, chr(ord(char
) + 0xfee0))
639 elif char
== '?' or ord(char
) < 32 or ord(char
) == 127:
642 return '' if restricted
else '\''
644 return '\0_\0-' if restricted
else '\0 \0-'
645 elif char
in '\\/|*<>':
647 if restricted
and (char
in '!&\'()[]{}$;`^,#' or char
.isspace() or ord(char
) > 127):
651 # Replace look-alike Unicode glyphs
652 if restricted
and (is_id
is NO_DEFAULT
or not is_id
):
653 s
= unicodedata
.normalize('NFKC', s
)
654 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
) # Handle timestamps
655 result
= ''.join(map(replace_insane
, s
))
656 if is_id
is NO_DEFAULT
:
657 result
= re
.sub(r
'(\0.)(?:(?=\1)..)+', r
'\1', result
) # Remove repeated substitute chars
658 STRIP_RE
= r
'(?:\0.|[ _-])*'
659 result
= re
.sub(f
'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result
) # Remove substitute chars from start/end
660 result
= result
.replace('\0', '') or '_'
663 while '__' in result
:
664 result
= result
.replace('__', '_')
665 result
= result
.strip('_')
666 # Common case of "Foreign band name - English song title"
667 if restricted
and result
.startswith('-_'):
669 if result
.startswith('-'):
670 result
= '_' + result
[len('-'):]
671 result
= result
.lstrip('.')
677 def sanitize_path(s
, force
=False):
678 """Sanitizes and normalizes path on Windows"""
679 if sys
.platform
== 'win32':
681 drive_or_unc
, _
= os
.path
.splitdrive(s
)
687 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
691 path_part
if path_part
in ['.', '..'] else re
.sub(r
'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part
)
692 for path_part
in norm_path
]
694 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
695 elif force
and s
and s
[0] == os
.path
.sep
:
696 sanitized_path
.insert(0, os
.path
.sep
)
697 return os
.path
.join(*sanitized_path
)
700 def sanitize_url(url
, *, scheme
='http'):
701 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
702 # the number of unwanted failures due to missing protocol
705 elif url
.startswith('//'):
706 return f
'{scheme}:{url}'
707 # Fix some common typos seen so far
709 # https://github.com/ytdl-org/youtube-dl/issues/15649
710 (r
'^httpss://', r
'https://'),
711 # https://bx1.be/lives/direct-tv/
712 (r
'^rmtp([es]?)://', r
'rtmp\1://'),
714 for mistake
, fixup
in COMMON_TYPOS
:
715 if re
.match(mistake
, url
):
716 return re
.sub(mistake
, fixup
, url
)
720 def extract_basic_auth(url
):
721 parts
= urllib
.parse
.urlsplit(url
)
722 if parts
.username
is None:
724 url
= urllib
.parse
.urlunsplit(parts
._replace
(netloc
=(
725 parts
.hostname
if parts
.port
is None
726 else '%s:%d' % (parts
.hostname
, parts
.port
))))
727 auth_payload
= base64
.b64encode(
728 ('%s:%s' % (parts
.username
, parts
.password
or '')).encode())
729 return url
, f
'Basic {auth_payload.decode()}'
def sanitized_Request(url, *args, **kwargs):
    """Build a ``urllib.request.Request`` after sanitizing/escaping ``url``,
    moving any inline basic-auth credentials into an Authorization header."""
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        # ``headers`` is the second positional argument of Request after url
        if len(args) >= 2:
            headers = args[1]
        else:
            headers = kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)
741 """Expand shell variables and ~"""
742 return os
.path
.expandvars(compat_expanduser(s
))
745 def orderedSet(iterable
, *, lazy
=False):
746 """Remove all duplicates from the input iterable"""
748 seen
= [] # Do not use set since the items can be unhashable
754 return _iter() if lazy
else list(_iter())
757 def _htmlentity_transform(entity_with_semicolon
):
758 """Transforms an HTML entity to a character."""
759 entity
= entity_with_semicolon
[:-1]
761 # Known non-numeric HTML entity
762 if entity
in html
.entities
.name2codepoint
:
763 return chr(html
.entities
.name2codepoint
[entity
])
765 # TODO: HTML5 allows entities without a semicolon.
766 # E.g. 'Éric' should be decoded as 'Éric'.
767 if entity_with_semicolon
in html
.entities
.html5
:
768 return html
.entities
.html5
[entity_with_semicolon
]
770 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
772 numstr
= mobj
.group(1)
773 if numstr
.startswith('x'):
775 numstr
= '0%s' % numstr
778 # See https://github.com/ytdl-org/youtube-dl/issues/7518
779 with contextlib
.suppress(ValueError):
780 return chr(int(numstr
, base
))
782 # Unknown entity in name, return its literal representation
783 return '&%s;' % entity
789 assert isinstance(s
, str)
792 r
'&([^&;]+;)', lambda m
: _htmlentity_transform(m
.group(1)), s
)
795 def escapeHTML(text
):
798 .replace('&', '&')
799 .replace('<', '<')
800 .replace('>', '>')
801 .replace('"', '"')
802 .replace("'", ''')
class netrc_from_content(netrc.netrc):
    """A ``netrc.netrc`` that parses credentials from an in-memory string
    instead of a file on disk."""

    def __init__(self, content):
        # Deliberately skip the base __init__ (which opens a file path) and
        # feed the parser from a StringIO instead
        self.hosts = {}
        self.macros = {}
        with io.StringIO(content) as stream:
            self._parse('-', stream, False)
813 class Popen(subprocess
.Popen
):
814 if sys
.platform
== 'win32':
815 _startupinfo
= subprocess
.STARTUPINFO()
816 _startupinfo
.dwFlags |
= subprocess
.STARTF_USESHOWWINDOW
821 def _fix_pyinstaller_ld_path(env
):
822 """Restore LD_LIBRARY_PATH when using PyInstaller
823 Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
824 https://github.com/yt-dlp/yt-dlp/issues/4573
826 if not hasattr(sys
, '_MEIPASS'):
830 orig
= env
.get(f
'{key}_ORIG')
836 _fix('LD_LIBRARY_PATH') # Linux
837 _fix('DYLD_LIBRARY_PATH') # macOS
839 def __init__(self
, *args
, env
=None, text
=False, **kwargs
):
841 env
= os
.environ
.copy()
842 self
._fix
_pyinstaller
_ld
_path
(env
)
844 self
.__text
_mode
= kwargs
.get('encoding') or kwargs
.get('errors') or text
or kwargs
.get('universal_newlines')
846 kwargs
['universal_newlines'] = True # For 3.6 compatibility
847 kwargs
.setdefault('encoding', 'utf-8')
848 kwargs
.setdefault('errors', 'replace')
849 super().__init
__(*args
, env
=env
, **kwargs
, startupinfo
=self
._startupinfo
)
851 def communicate_or_kill(self
, *args
, **kwargs
):
853 return self
.communicate(*args
, **kwargs
)
854 except BaseException
: # Including KeyboardInterrupt
855 self
.kill(timeout
=None)
858 def kill(self
, *, timeout
=0):
861 self
.wait(timeout
=timeout
)
864 def run(cls
, *args
, timeout
=None, **kwargs
):
865 with cls(*args
, **kwargs
) as proc
:
866 default
= '' if proc
.__text
_mode
else b
''
867 stdout
, stderr
= proc
.communicate_or_kill(timeout
=timeout
)
868 return stdout
or default
, stderr
or default
, proc
.returncode
def encodeArgument(s):
    """Return ``s`` as ``str``, ASCII-decoding legacy byte strings."""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
# Named time components as produced by timetuple_from_msec
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a millisecond count into an (hours, minutes, seconds, milliseconds) tuple."""
    whole_seconds, milliseconds = divmod(msec, 1000)
    whole_minutes, seconds = divmod(whole_seconds, 60)
    hours, minutes = divmod(whole_minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
888 def formatSeconds(secs
, delim
=':', msec
=False):
889 time
= timetuple_from_msec(secs
* 1000)
891 ret
= '%d%s%02d%s%02d' % (time
.hours
, delim
, time
.minutes
, delim
, time
.seconds
)
893 ret
= '%d%s%02d' % (time
.minutes
, delim
, time
.seconds
)
895 ret
= '%d' % time
.seconds
896 return '%s.%03d' % (ret
, time
.milliseconds
) if msec
else ret
899 def make_HTTPS_handler(params
, **kwargs
):
900 from ..networking
._helper
import make_ssl_context
901 return YoutubeDLHTTPSHandler(params
, context
=make_ssl_context(
902 verify
=not params
.get('nocheckcertificate'),
903 client_certificate
=params
.get('client_certificate'),
904 client_certificate_key
=params
.get('client_certificate_key'),
905 client_certificate_password
=params
.get('client_certificate_password'),
906 legacy_support
=params
.get('legacyserverconnect'),
907 use_certifi
='no-certifi' not in params
.get('compat_opts', []),
911 def bug_reports_message(before
=';'):
912 from ..update
import REPOSITORY
914 msg
= (f
'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
915 'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')
917 before
= before
.rstrip()
918 if not before
or before
.endswith(('.', '!', '?')):
919 msg
= msg
[0].title() + msg
[1:]
921 return (before
+ ' ' if before
else '') + msg
924 class YoutubeDLError(Exception):
925 """Base exception for YoutubeDL errors."""
928 def __init__(self
, msg
=None):
931 elif self
.msg
is None:
932 self
.msg
= type(self
).__name
__
933 super().__init
__(self
.msg
)
936 class ExtractorError(YoutubeDLError
):
937 """Error during info extraction."""
939 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None, ie
=None):
940 """ tb, if given, is the original traceback (so that it can be printed out).
941 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
943 from ..networking
.exceptions
import network_exceptions
944 if sys
.exc_info()[0] in network_exceptions
:
947 self
.orig_msg
= str(msg
)
949 self
.expected
= expected
951 self
.video_id
= video_id
953 self
.exc_info
= sys
.exc_info() # preserve original exception
954 if isinstance(self
.exc_info
[1], ExtractorError
):
955 self
.exc_info
= self
.exc_info
[1].exc_info
956 super().__init
__(self
.__msg
)
961 format_field(self
.ie
, None, '[%s] '),
962 format_field(self
.video_id
, None, '%s: '),
964 format_field(self
.cause
, None, ' (caused by %r)'),
965 '' if self
.expected
else bug_reports_message()))
967 def format_traceback(self
):
968 return join_nonempty(
969 self
.traceback
and ''.join(traceback
.format_tb(self
.traceback
)),
970 self
.cause
and ''.join(traceback
.format_exception(None, self
.cause
, self
.cause
.__traceback
__)[1:]),
973 def __setattr__(self
, name
, value
):
974 super().__setattr
__(name
, value
)
975 if getattr(self
, 'msg', None) and name
not in ('msg', 'args'):
976 self
.msg
= self
.__msg
or type(self
).__name
__
977 self
.args
= (self
.msg
, ) # Cannot be property
980 class UnsupportedError(ExtractorError
):
981 def __init__(self
, url
):
983 'Unsupported URL: %s' % url
, expected
=True)
class RegexNotFoundError(ExtractorError):
    """Raised when an expected regular expression fails to match."""
class GeoRestrictedError(ExtractorError):
    """Geographic restriction error.

    Raised when a video is not available from the user's geographic location
    due to restrictions imposed by a website. Always an "expected" error.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Geo restrictions are never a bug in yt-dlp
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries
class UserNotLive(ExtractorError):
    """Raised when a requested channel/user is not currently live."""

    def __init__(self, msg=None, **kwargs):
        # Not being live is an expected condition, not a bug
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)
class DownloadError(YoutubeDLError):
    """Download Error exception.

    Raised by FileDownloader objects when they are not configured to continue
    on errors; carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """``exc_info``, if given, is the original exception that caused the
        trouble (as returned by sys.exc_info())."""
        super().__init__(msg)
        self.exc_info = exc_info
class EntryNotInPlaylist(YoutubeDLError):
    """Raised by YoutubeDL when a requested entry is not found in the
    playlist info_dict."""

    msg = 'Entry not found in info'
class SameFileError(YoutubeDLError):
    """Same File exception.

    Raised by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """

    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            # Fix: the f-string previously contained no placeholder, so the
            # offending filename was accepted but never shown in the message
            self.msg += f': {filename}'
        super().__init__(self.msg)
class PostProcessingError(YoutubeDLError):
    """Raised by a PostProcessor's .run() method to indicate an error in the
    postprocessing task."""
class DownloadCancelled(YoutubeDLError):
    """Raised to signal that the download queue should be interrupted."""
    msg = 'The download was cancelled'
class ExistingVideoReached(DownloadCancelled):
    """Raised when --break-on-existing is triggered."""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
class RejectedVideoReached(DownloadCancelled):
    """Raised when --break-match-filter is triggered."""
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
class MaxDownloadsReached(DownloadCancelled):
    """Raised when the --max-downloads limit has been reached."""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
class ReExtractInfo(YoutubeDLError):
    """Raised when the video info needs to be re-extracted."""

    def __init__(self, msg, expected=False):
        # ``expected`` mirrors ExtractorError semantics: True means this is a
        # normal condition rather than a bug
        super().__init__(msg)
        self.expected = expected
1086 class ThrottledDownload(ReExtractInfo
):
1087 """ Download speed below --throttled-rate. """
1088 msg
= 'The download speed is below throttle limit'
1091 super().__init
__(self
.msg
, expected
=False)
1094 class UnavailableVideoError(YoutubeDLError
):
1095 """Unavailable Format exception.
1097 This exception will be thrown when a video is requested
1098 in a format that is not available for that video.
1100 msg
= 'Unable to download video'
1102 def __init__(self
, err
=None):
1104 self
.msg
+= f
': {err}'
1105 super().__init
__(self
.msg
)
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller than
    what the server announced, indicating the connection was probably
    interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both sizes are in bytes
        self.downloaded = downloaded
        self.expected = expected
1123 class XAttrMetadataError(YoutubeDLError
):
1124 def __init__(self
, code
=None, msg
='Unknown error'):
1125 super().__init
__(msg
)
1129 # Parsing code and msg
1130 if (self
.code
in (errno
.ENOSPC
, errno
.EDQUOT
)
1131 or 'No space left' in self
.msg
or 'Disk quota exceeded' in self
.msg
):
1132 self
.reason
= 'NO_SPACE'
1133 elif self
.code
== errno
.E2BIG
or 'Argument list too long' in self
.msg
:
1134 self
.reason
= 'VALUE_TOO_LONG'
1136 self
.reason
= 'NOT_SUPPORTED'
1139 class XAttrUnavailableError(YoutubeDLError
):
1143 class YoutubeDLHTTPSHandler(urllib
.request
.HTTPSHandler
):
1144 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
1145 urllib
.request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
1146 self
._https
_conn
_class
= https_conn_class
or http
.client
.HTTPSConnection
1147 self
._params
= params
1149 def https_open(self
, req
):
1151 conn_class
= self
._https
_conn
_class
1153 if hasattr(self
, '_context'): # python > 2.6
1154 kwargs
['context'] = self
._context
1155 if hasattr(self
, '_check_hostname'): # python 3.x
1156 kwargs
['check_hostname'] = self
._check
_hostname
1158 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
1160 from ..networking
._urllib
import make_socks_conn_class
1161 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
1162 del req
.headers
['Ytdl-socks-proxy']
1164 from ..networking
._urllib
import _create_http_connection
1166 return self
.do_open(
1167 functools
.partial(_create_http_connection
, self
, conn_class
, True), req
, **kwargs
)
1168 except urllib
.error
.URLError
as e
:
1169 if (isinstance(e
.reason
, ssl
.SSLError
)
1170 and getattr(e
.reason
, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1171 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
def is_path_like(f):
    """Return True if *f* can be treated as a filesystem path (str, bytes or os.PathLike)."""
    path_types = (str, bytes, os.PathLike)
    return isinstance(f, path_types)
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    # Extends the stdlib cookie processor so that the same cookie handling
    # is applied to HTTPS requests/responses as to HTTP ones.

    def __init__(self, cookiejar=None):
        urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Delegate straight to the stdlib implementation
        return urllib.request.HTTPCookieProcessor.http_response(self, request, response)

    # Reuse the HTTP hooks for HTTPS traffic as well
    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1190 def extract_timezone(date_str
):
1193 ^.{8,}? # >=8 char non-TZ prefix, if present
1194 (?P<tz>Z| # just the UTC Z, or
1195 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1196 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1197 [ ]? # optional space
1198 (?P<sign>\+|-) # +/-
1199 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1203 m
= re
.search(r
'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str
)
1204 timezone
= TIMEZONE_NAMES
.get(m
and m
.group('tz').strip())
1205 if timezone
is not None:
1206 date_str
= date_str
[:-len(m
.group('tz'))]
1207 timezone
= datetime
.timedelta(hours
=timezone
or 0)
1209 date_str
= date_str
[:-len(m
.group('tz'))]
1210 if not m
.group('sign'):
1211 timezone
= datetime
.timedelta()
1213 sign
= 1 if m
.group('sign') == '+' else -1
1214 timezone
= datetime
.timedelta(
1215 hours
=sign
* int(m
.group('hours')),
1216 minutes
=sign
* int(m
.group('minutes')))
1217 return timezone
, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    if date_str is None:
        return None

    # Strip fractional seconds; %S cannot consume them in this format
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        parsed = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S')
        return calendar.timegm((parsed - timezone).timetuple())
def date_formats(day_first=True):
    """Return the strptime format list matching the requested day/month ordering."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None

    # Commas, AM/PM markers (with optional trailing timezone) and explicit
    # timezones only get in the way of the strptime formats below
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str.replace(',', ' '))
    _, date_str = extract_timezone(date_str)

    upload_date = None
    for fmt in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
    if upload_date is None:
        # Fall back to RFC 2822-style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return str(upload_date) if upload_date is not None else None
1265 def unified_timestamp(date_str
, day_first
=True):
1266 if not isinstance(date_str
, str):
1269 date_str
= re
.sub(r
'\s+', ' ', re
.sub(
1270 r
'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str
))
1272 pm_delta
= 12 if re
.search(r
'(?i)PM', date_str
) else 0
1273 timezone
, date_str
= extract_timezone(date_str
)
1275 # Remove AM/PM + timezone
1276 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
1278 # Remove unrecognized timezones from ISO 8601 alike timestamps
1279 m
= re
.search(r
'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str
)
1281 date_str
= date_str
[:-len(m
.group('tz'))]
1283 # Python only supports microseconds, so remove nanoseconds
1284 m
= re
.search(r
'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str
)
1286 date_str
= m
.group(1)
1288 for expression
in date_formats(day_first
):
1289 with contextlib
.suppress(ValueError):
1290 dt
= datetime
.datetime
.strptime(date_str
, expression
) - timezone
+ datetime
.timedelta(hours
=pm_delta
)
1291 return calendar
.timegm(dt
.timetuple())
1293 timetuple
= email
.utils
.parsedate_tz(date_str
)
1295 return calendar
.timegm(timetuple
) + pm_delta
* 3600 - timezone
.total_seconds()
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL, falling back to *default_ext*."""
    if url is None or '.' not in url:
        return default_ext
    # Everything after the final '.' of the path component (query stripped)
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = guess.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build the output filename for a subtitle track: <name>.<lang>.<format>."""
    new_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, new_ext, expected_real_ext)
1315 def datetime_from_str(date_str
, precision
='auto', format
='%Y%m%d'):
1317 Return a datetime object from a string.
1319 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1321 @param format strftime format of DATE
1322 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1323 auto: round to the unit provided in date_str (if applicable).
1325 auto_precision
= False
1326 if precision
== 'auto':
1327 auto_precision
= True
1328 precision
= 'microsecond'
1329 today
= datetime_round(datetime
.datetime
.utcnow(), precision
)
1330 if date_str
in ('now', 'today'):
1332 if date_str
== 'yesterday':
1333 return today
- datetime
.timedelta(days
=1)
1335 r
'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1337 if match
is not None:
1338 start_time
= datetime_from_str(match
.group('start'), precision
, format
)
1339 time
= int(match
.group('time')) * (-1 if match
.group('sign') == '-' else 1)
1340 unit
= match
.group('unit')
1341 if unit
== 'month' or unit
== 'year':
1342 new_date
= datetime_add_months(start_time
, time
* 12 if unit
== 'year' else time
)
1348 delta
= datetime
.timedelta(**{unit + 's': time}
)
1349 new_date
= start_time
+ delta
1351 return datetime_round(new_date
, unit
)
1354 return datetime_round(datetime
.datetime
.strptime(date_str
, format
), precision
)
def date_from_str(date_str, format='%Y%m%d', strict=False):
    r"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
        raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months."""
    # Zero-based month index makes the year carry a simple floor division
    month_index = dt.month + months - 1
    new_year = dt.year + month_index // 12
    new_month = month_index % 12 + 1
    # Clamp the day so e.g. Jan 31 + 1 month lands on the last day of Feb
    last_day = calendar.monthrange(new_year, new_month)[1]
    return dt.replace(new_year, new_month, min(dt.day, last_day))
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }

    def roundto(x, n):
        # Round-half-up in units of n seconds
        return ((x + n / 2) // n) * n

    timestamp = calendar.timegm(dt.timetuple())
    return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        # Not a plain YYYYMMDD string - leave it untouched
        return date_str
    return '-'.join(match.groups())
1407 """Represents a time interval between two dates"""
1409 def __init__(self
, start
=None, end
=None):
1410 """start and end must be strings in the format accepted by date"""
1411 if start
is not None:
1412 self
.start
= date_from_str(start
, strict
=True)
1414 self
.start
= datetime
.datetime
.min.date()
1416 self
.end
= date_from_str(end
, strict
=True)
1418 self
.end
= datetime
.datetime
.max.date()
1419 if self
.start
> self
.end
:
1420 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
1424 """Returns a range that only contains the given day"""
1425 return cls(day
, day
)
1427 def __contains__(self
, date
):
1428 """Check if the date is in the range"""
1429 if not isinstance(date
, datetime
.date
):
1430 date
= date_from_str(date
)
1431 return self
.start
<= date
<= self
.end
1434 return f
'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'
1436 def __eq__(self
, other
):
1437 return (isinstance(other
, DateRange
)
1438 and self
.start
== other
.start
and self
.end
== other
.end
)
1442 def system_identifier():
1443 python_implementation
= platform
.python_implementation()
1444 if python_implementation
== 'PyPy' and hasattr(sys
, 'pypy_version_info'):
1445 python_implementation
+= ' version %d.%d.%d' % sys
.pypy_version_info
[:3]
1447 with contextlib
.suppress(OSError): # We may not have access to the executable
1448 libc_ver
= platform
.libc_ver()
1450 return 'Python %s (%s %s %s) - %s (%s%s)' % (
1451 platform
.python_version(),
1452 python_implementation
,
1454 platform
.architecture()[0],
1455 platform
.platform(),
1456 ssl
.OPENSSL_VERSION
,
1457 format_field(join_nonempty(*libc_ver
, delim
=' '), None, ', %s'),
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1470 def write_string(s
, out
=None, encoding
=None):
1471 assert isinstance(s
, str)
1472 out
= out
or sys
.stderr
1473 # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
1477 if compat_os_name
== 'nt' and supports_terminal_sequences(out
):
1478 s
= re
.sub(r
'([\r\n]+)', r
' \1', s
)
1480 enc
, buffer = None, out
1481 if 'b' in getattr(out
, 'mode', ''):
1482 enc
= encoding
or preferredencoding()
1483 elif hasattr(out
, 'buffer'):
1485 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
1487 buffer.write(s
.encode(enc
, 'ignore') if enc
else s
)
1491 def deprecation_warning(msg
, *, printer
=None, stacklevel
=0, **kwargs
):
1492 from .. import _IN_CLI
1494 if msg
in deprecation_warning
._cache
:
1496 deprecation_warning
._cache
.add(msg
)
1498 return printer(f
'{msg}{bug_reports_message()}', **kwargs
)
1499 return write_string(f
'ERROR: {msg}{bug_reports_message()}\n', **kwargs
)
1502 warnings
.warn(DeprecationWarning(msg
), stacklevel
=stacklevel
+ 3)
1505 deprecation_warning
._cache
= set()
def bytes_to_intlist(bs):
    """Convert a bytes-like (or str) sequence into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    # str input: map each character to its code point
    return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a list of integer byte values back into a bytes object."""
    if not xs:
        return b''
    return struct.pack('%dB' % len(xs), *xs)
class LockingUnsupportedError(OSError):
    """Raised on platforms that provide no usable file-locking primitive."""
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1530 # Cross-platform file locking
1531 if sys
.platform
== 'win32':
1533 import ctypes
.wintypes
1536 class OVERLAPPED(ctypes
.Structure
):
1538 ('Internal', ctypes
.wintypes
.LPVOID
),
1539 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
1540 ('Offset', ctypes
.wintypes
.DWORD
),
1541 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
1542 ('hEvent', ctypes
.wintypes
.HANDLE
),
1545 kernel32
= ctypes
.WinDLL('kernel32')
1546 LockFileEx
= kernel32
.LockFileEx
1547 LockFileEx
.argtypes
= [
1548 ctypes
.wintypes
.HANDLE
, # hFile
1549 ctypes
.wintypes
.DWORD
, # dwFlags
1550 ctypes
.wintypes
.DWORD
, # dwReserved
1551 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
1552 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
1553 ctypes
.POINTER(OVERLAPPED
) # Overlapped
1555 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
1556 UnlockFileEx
= kernel32
.UnlockFileEx
1557 UnlockFileEx
.argtypes
= [
1558 ctypes
.wintypes
.HANDLE
, # hFile
1559 ctypes
.wintypes
.DWORD
, # dwReserved
1560 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
1561 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
1562 ctypes
.POINTER(OVERLAPPED
) # Overlapped
1564 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
1565 whole_low
= 0xffffffff
1566 whole_high
= 0x7fffffff
1568 def _lock_file(f
, exclusive
, block
):
1569 overlapped
= OVERLAPPED()
1570 overlapped
.Offset
= 0
1571 overlapped
.OffsetHigh
= 0
1572 overlapped
.hEvent
= 0
1573 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
1575 if not LockFileEx(msvcrt
.get_osfhandle(f
.fileno()),
1576 (0x2 if exclusive
else 0x0) |
(0x0 if block
else 0x1),
1577 0, whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
1578 # NB: No argument form of "ctypes.FormatError" does not work on PyPy
1579 raise BlockingIOError(f
'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
1581 def _unlock_file(f
):
1582 assert f
._lock
_file
_overlapped
_p
1583 handle
= msvcrt
.get_osfhandle(f
.fileno())
1584 if not UnlockFileEx(handle
, 0, whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
1585 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
1591 def _lock_file(f
, exclusive
, block
):
1592 flags
= fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
1594 flags |
= fcntl
.LOCK_NB
1596 fcntl
.flock(f
, flags
)
1597 except BlockingIOError
:
1599 except OSError: # AOSP does not have flock()
1600 fcntl
.lockf(f
, flags
)
1602 def _unlock_file(f
):
1603 with contextlib
.suppress(OSError):
1604 return fcntl
.flock(f
, fcntl
.LOCK_UN
)
1605 with contextlib
.suppress(OSError):
1606 return fcntl
.lockf(f
, fcntl
.LOCK_UN
) # AOSP does not have flock()
1607 return fcntl
.flock(f
, fcntl
.LOCK_UN | fcntl
.LOCK_NB
) # virtiofs needs LOCK_NB on unlocking
1611 def _lock_file(f
, exclusive
, block
):
1612 raise LockingUnsupportedError()
1614 def _unlock_file(f
):
1615 raise LockingUnsupportedError()
1621 def __init__(self
, filename
, mode
, block
=True, encoding
=None):
1622 if mode
not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}
:
1623 raise NotImplementedError(mode
)
1624 self
.mode
, self
.block
= mode
, block
1626 writable
= any(f
in mode
for f
in 'wax+')
1627 readable
= any(f
in mode
for f
in 'r+')
1628 flags
= functools
.reduce(operator
.ior
, (
1629 getattr(os
, 'O_CLOEXEC', 0), # UNIX only
1630 getattr(os
, 'O_BINARY', 0), # Windows only
1631 getattr(os
, 'O_NOINHERIT', 0), # Windows only
1632 os
.O_CREAT
if writable
else 0, # O_TRUNC only after locking
1633 os
.O_APPEND
if 'a' in mode
else 0,
1634 os
.O_EXCL
if 'x' in mode
else 0,
1635 os
.O_RDONLY
if not writable
else os
.O_RDWR
if readable
else os
.O_WRONLY
,
1638 self
.f
= os
.fdopen(os
.open(filename
, flags
, 0o666), mode
, encoding
=encoding
)
1640 def __enter__(self
):
1641 exclusive
= 'r' not in self
.mode
1643 _lock_file(self
.f
, exclusive
, self
.block
)
1648 if 'w' in self
.mode
:
1651 except OSError as e
:
1653 errno
.ESPIPE
, # Illegal seek - expected for FIFO
1654 errno
.EINVAL
, # Invalid argument - expected for /dev/null
1663 _unlock_file(self
.f
)
1667 def __exit__(self
, *_
):
    def __getattr__(self, attr):
        # Delegate any attribute not defined on this wrapper (read, write,
        # name, ...) to the underlying file object self.f.
        return getattr(self.f, attr)
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to 'utf-8' when unknown."""
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
def shell_quote(args):
    """Quote each argument for safe use on a shell command line."""
    encoding = get_filesystem_encoding()

    def as_text(a):
        # We may get a filename encoded with 'encodeFilename'
        return a.decode(encoding) if isinstance(a, bytes) else a

    return ' '.join(compat_shlex_quote(as_text(a)) for a in args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with any payload already smuggled into the URL
    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    payload = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + payload
def unsmuggle_url(smug_url, default=None):
    """Reverse of smuggle_url(): return (url, smuggled_data_or_default)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    payload = urllib.parse.parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal sufixes like K, M, etc """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        # Binary prefixes: k -> Ki, M -> Mi, ...
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    return fmt % (num / (factor ** exponent), suffix)
def format_bytes(bytes):
    """Human-readable byte count with binary (KiB/MiB/...) suffixes; 'N/A' on failure."""
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
def lookup_unit_table(unit_table, s, strict=False):
    """Parse '<number> <unit>' using the multipliers in *unit_table*; None if no match."""
    num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
    units_re = '|'.join(re.escape(u) for u in unit_table)
    matcher = re.fullmatch if strict else re.match
    m = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if m is None:
        return None
    # ',' is accepted as a decimal separator in non-strict mode
    num = float(m.group('num').replace(',', '.'))
    return round(num * unit_table[m.group('unit')])
1751 """Parse a string indicating a byte quantity into an integer"""
1752 return lookup_unit_table(
1753 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])}
,
1754 s
.upper(), strict
=True)
1757 def parse_filesize(s
):
1761 # The lower-case forms are of course incorrect and unofficial,
1762 # but we support those too
1779 'megabytes': 1000 ** 2,
1780 'mebibytes': 1024 ** 2,
1786 'gigabytes': 1000 ** 3,
1787 'gibibytes': 1024 ** 3,
1793 'terabytes': 1000 ** 4,
1794 'tebibytes': 1024 ** 4,
1800 'petabytes': 1000 ** 5,
1801 'pebibytes': 1024 ** 5,
1807 'exabytes': 1000 ** 6,
1808 'exbibytes': 1024 ** 6,
1814 'zettabytes': 1000 ** 7,
1815 'zebibytes': 1024 ** 7,
1821 'yottabytes': 1000 ** 8,
1822 'yobibytes': 1024 ** 8,
1825 return lookup_unit_table(_UNIT_TABLE
, s
)
1832 s
= re
.sub(r
'^[^\d]+\s', '', s
).strip()
1834 if re
.match(r
'^[\d,.]+$', s
):
1835 return str_to_int(s
)
1848 ret
= lookup_unit_table(_UNIT_TABLE
, s
)
1852 mobj
= re
.match(r
'([\d,.]+)(?:$|\s)', s
)
1854 return str_to_int(mobj
.group(1))
def parse_resolution(s, *, lenient=False):
    """Extract width/height from a string like '1920x1080', '720p' or '4k'."""
    if s is None:
        return {}

    if lenient:
        mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
    else:
        mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if mobj:
        return {
            'width': int(mobj.group('w')),
            'height': int(mobj.group('h')),
        }

    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(mobj.group(1)) * 540}

    return {}
def parse_bitrate(s):
    """Extract an integer kbps value from a string, or None."""
    if not isinstance(s, str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    return int(mobj.group(1)) if mobj else None
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    try:
        return names.index(name) + 1
    except ValueError:
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """
    abbreviations = [s[:3] for s in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
def fix_xml_ampersands(xml_str):
    """Escape bare '&' as '&amp;' in XML, leaving existing entities untouched."""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;', xml_str)
1919 def setproctitle(title
):
1920 assert isinstance(title
, str)
1922 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
1929 libc
= ctypes
.cdll
.LoadLibrary('libc.so.6')
1933 # LoadLibrary in Windows Python 2.7.13 only expects
1934 # a bytestring, but since unicode_literals turns
1935 # every string into a unicode string, it fails.
1937 title_bytes
= title
.encode()
1938 buf
= ctypes
.create_string_buffer(len(title_bytes
))
1939 buf
.value
= title_bytes
1941 libc
.prctl(15, buf
, 0, 0, 0)
1942 except AttributeError:
1943 return # Strange libc, just skip this
def remove_start(s, start):
    """Strip *start* from the beginning of *s* if present; None passes through."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Strip *end* from the end of *s* if present; None passes through."""
    if s is not None and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Strip one matching pair of surrounding quotes (single or double)."""
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    domain = remove_start(urllib.parse.urlparse(url).netloc, 'www.')
    return domain or None
def url_basename(url):
    """Return the last path component of *url* ('.../a/b.mp4?x' -> 'b.mp4')."""
    path = urllib.parse.urlparse(url).path
    segments = path.strip('/').split('/')
    return segments[-1]
1977 return re
.match(r
'https?://[^?#]+/', url
).group()
def urljoin(base, path):
    """Join *base* and *path* like urllib, tolerating bytes and invalid inputs (-> None)."""
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, str) or not path:
        return None
    # Already absolute: explicit scheme or scheme-relative
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert *v* to int (optionally via attribute *get_attr*, scaled); *default* on failure."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
def str_or_none(v, default=None):
    """Return str(v), or *default* when v is None."""
    if v is None:
        return default
    return str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        # Drop thousands separators and stray '+'/'.' characters
        int_str = re.sub(r'[,\.\+]', '', int_str)
        return int_or_none(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float (scaled); *default* when None or unparsable."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
def bool_or_none(v, default=None):
    """Return *v* only if it is a real bool; otherwise *default*."""
    if isinstance(v, bool):
        return v
    return default
def strip_or_none(v, default=None):
    """Return v.strip() for strings; *default* for anything else."""
    if isinstance(v, str):
        return v.strip()
    return default
def url_or_none(url):
    """Return the stripped URL if it has a recognized scheme (or is scheme-relative), else None."""
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
def request_to_url(req):
    """Return the URL of a urllib Request, or *req* unchanged if already a string."""
    if isinstance(req, urllib.request.Request):
        return req.get_full_url()
    return req
, date_format
='%Y%m%d', default
=None):
2049 datetime_object
= None
2051 if isinstance(timestamp
, (int, float)): # unix timestamp
2052 # Using naive datetime here can break timestamp() in Windows
2053 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2054 # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
2055 # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
2056 datetime_object
= (datetime
.datetime
.fromtimestamp(0, datetime
.timezone
.utc
)
2057 + datetime
.timedelta(seconds
=timestamp
))
2058 elif isinstance(timestamp
, str): # assume YYYYMMDD
2059 datetime_object
= datetime
.datetime
.strptime(timestamp
, '%Y%m%d')
2060 date_format
= re
.sub( # Support %s on windows
2061 r
'(?<!%)(%%)*%s', rf
'\g<1>{int(datetime_object.timestamp())}', date_format
)
2062 return datetime_object
.strftime(date_format
)
2063 except (ValueError, TypeError, AttributeError):
2067 def parse_duration(s
):
2068 if not isinstance(s
, str):
2074 days
, hours
, mins
, secs
, ms
= [None] * 5
2075 m
= re
.match(r
'''(?x)
2077 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2078 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2079 (?P<ms>[.:][0-9]+)?Z?$
2082 days
, hours
, mins
, secs
, ms
= m
.group('days', 'hours', 'mins', 'secs', 'ms')
2087 [0-9]+\s*y(?:ears?)?,?\s*
2090 [0-9]+\s*m(?:onths?)?,?\s*
2093 [0-9]+\s*w(?:eeks?)?,?\s*
2096 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2100 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2103 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2106 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2109 days
, hours
, mins
, secs
, ms
= m
.groups()
2111 m
= re
.match(r
'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s
)
2113 hours
, mins
= m
.groups()
2118 ms
= ms
.replace(':', '.')
2119 return sum(float(part
or 0) * mult
for part
, mult
in (
2120 (days
, 86400), (hours
, 3600), (mins
, 60), (secs
, 1), (ms
, 1)))
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the file's real extension.

    'video.mp4', 'temp' -> 'video.temp.mp4'. When *expected_real_ext* is
    given and the actual extension differs, *ext* is appended to the whole
    filename instead: 'video.mkv', 'temp', 'mp4' -> 'video.mkv.temp'.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        f'{name}.{ext}{real_ext}'
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        # Fix: previously produced the literal '(unknown).<ext>', discarding
        # the original filename entirely when the extension did not match
        else f'{filename}.{ext}')
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the file's extension with *ext*; if *expected_real_ext* is given
    and does not match the actual one, append *ext* to the full filename instead."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return f'{name}.{ext}'
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    return exe
2148 def _get_exe_version_output(exe
, args
):
2150 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2151 # SIGTTOU if yt-dlp is run in the background.
2152 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2153 stdout
, _
, ret
= Popen
.run([encodeArgument(exe
)] + args
, text
=True,
2154 stdin
=subprocess
.PIPE
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from program *output*; *unrecognized* when none matches."""
    assert isinstance(output, str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        return unrecognized[-1]
    return output and detect_exe_version(output, version_re, unrecognized[0])
def frange(start=0, stop=None, step=1):
    """Float-friendly range generator."""
    if stop is None:
        # Single-argument form: frange(stop)
        start, stop = 0, start
    # Direction of iteration; 0 when step is 0 (yields nothing)
    sign = [-1, 1][step > 0] if step else 0
    while sign * start < sign * stop:
        yield start
        start += step
2195 class LazyList(collections
.abc
.Sequence
):
2196 """Lazy immutable list from an iterable
2197 Note that slices of a LazyList are lists and not LazyList"""
2199 class IndexError(IndexError):
2202 def __init__(self
, iterable
, *, reverse
=False, _cache
=None):
2203 self
._iterable
= iter(iterable
)
2204 self
._cache
= [] if _cache
is None else _cache
2205 self
._reversed
= reverse
2209 # We need to consume the entire iterable to iterate in reverse
2210 yield from self
.exhaust()
2212 yield from self
._cache
2213 for item
in self
._iterable
:
2214 self
._cache
.append(item
)
2218 self
._cache
.extend(self
._iterable
)
2219 self
._iterable
= [] # Discard the emptied iterable to make it pickle-able
2223 """Evaluate the entire iterable"""
2224 return self
._exhaust
()[::-1 if self
._reversed
else 1]
2227 def _reverse_index(x
):
2228 return None if x
is None else ~x
2230 def __getitem__(self
, idx
):
2231 if isinstance(idx
, slice):
2233 idx
= slice(self
._reverse
_index
(idx
.start
), self
._reverse
_index
(idx
.stop
), -(idx
.step
or 1))
2234 start
, stop
, step
= idx
.start
, idx
.stop
, idx
.step
or 1
2235 elif isinstance(idx
, int):
2237 idx
= self
._reverse
_index
(idx
)
2238 start
, stop
, step
= idx
, idx
, 0
2240 raise TypeError('indices must be integers or slices')
2241 if ((start
or 0) < 0 or (stop
or 0) < 0
2242 or (start
is None and step
< 0)
2243 or (stop
is None and step
> 0)):
2244 # We need to consume the entire iterable to be able to slice from the end
2245 # Obviously, never use this with infinite iterables
2248 return self
._cache
[idx
]
2249 except IndexError as e
:
2250 raise self
.IndexError(e
) from e
2251 n
= max(start
or 0, stop
or 0) - len(self
._cache
) + 1
2253 self
._cache
.extend(itertools
.islice(self
._iterable
, n
))
2255 return self
._cache
[idx
]
2256 except IndexError as e
:
2257 raise self
.IndexError(e
) from e
2261 self
[-1] if self
._reversed
else self
[0]
2262 except self
.IndexError:
2268 return len(self
._cache
)
2270 def __reversed__(self
):
2271 return type(self
)(self
._iterable
, reverse
=not self
._reversed
, _cache
=self
._cache
)
2274 return type(self
)(self
._iterable
, reverse
=self
._reversed
, _cache
=self
._cache
)
2277 # repr and str should mimic a list. So we exhaust the iterable
2278 return repr(self
.exhaust())
2281 return repr(self
.exhaust())
2286 class IndexError(IndexError):
2290 # This is only useful for tests
2291 return len(self
.getslice())
2293 def __init__(self
, pagefunc
, pagesize
, use_cache
=True):
2294 self
._pagefunc
= pagefunc
2295 self
._pagesize
= pagesize
2296 self
._pagecount
= float('inf')
2297 self
._use
_cache
= use_cache
2300 def getpage(self
, pagenum
):
2301 page_results
= self
._cache
.get(pagenum
)
2302 if page_results
is None:
2303 page_results
= [] if pagenum
> self
._pagecount
else list(self
._pagefunc
(pagenum
))
2305 self
._cache
[pagenum
] = page_results
2308 def getslice(self
, start
=0, end
=None):
2309 return list(self
._getslice
(start
, end
))
2311 def _getslice(self
, start
, end
):
2312 raise NotImplementedError('This method must be implemented by subclasses')
2314 def __getitem__(self
, idx
):
2315 assert self
._use
_cache
, 'Indexing PagedList requires cache'
2316 if not isinstance(idx
, int) or idx
< 0:
2317 raise TypeError('indices must be non-negative integers')
2318 entries
= self
.getslice(idx
, idx
+ 1)
2320 raise self
.IndexError()
2324 class OnDemandPagedList(PagedList
):
2325 """Download pages until a page with less than maximum results"""
2327 def _getslice(self
, start
, end
):
2328 for pagenum
in itertools
.count(start
// self
._pagesize
):
2329 firstid
= pagenum
* self
._pagesize
2330 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
2331 if start
>= nextfirstid
:
2335 start
% self
._pagesize
2336 if firstid
<= start
< nextfirstid
2339 ((end
- 1) % self
._pagesize
) + 1
2340 if (end
is not None and firstid
<= end
<= nextfirstid
)
2344 page_results
= self
.getpage(pagenum
)
2346 self
._pagecount
= pagenum
- 1
2348 if startv
!= 0 or endv
is not None:
2349 page_results
= page_results
[startv
:endv
]
2350 yield from page_results
2352 # A little optimization - if current page is not "full", ie. does
2353 # not contain page_size videos then we can assume that this page
2354 # is the last one - there are no more ids on further pages -
2355 # i.e. no need to query again.
2356 if len(page_results
) + startv
< self
._pagesize
:
2359 # If we got the whole page, but the next page is not interesting,
2360 # break out early as well
2361 if end
== nextfirstid
:
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        # Map the requested [start, end) entry range onto whole pages
        start_page = start // self._pagesize
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                # Drop the leading entries of the first page preceding `start`
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    # Last (possibly partial) page of the requested range
                    yield from page_results[:only_more]
                    break
            yield from page_results
2391 class PlaylistEntries
:
2392 MissingEntry
= object()
2393 is_exhausted
= False
2395 def __init__(self
, ydl
, info_dict
):
2398 # _entries must be assigned now since infodict can change during iteration
2399 entries
= info_dict
.get('entries')
2401 raise EntryNotInPlaylist('There are no entries')
2402 elif isinstance(entries
, list):
2403 self
.is_exhausted
= True
2405 requested_entries
= info_dict
.get('requested_entries')
2406 self
.is_incomplete
= requested_entries
is not None
2407 if self
.is_incomplete
:
2408 assert self
.is_exhausted
2409 self
._entries
= [self
.MissingEntry
] * max(requested_entries
or [0])
2410 for i
, entry
in zip(requested_entries
, entries
):
2411 self
._entries
[i
- 1] = entry
2412 elif isinstance(entries
, (list, PagedList
, LazyList
)):
2413 self
._entries
= entries
2415 self
._entries
= LazyList(entries
)
2417 PLAYLIST_ITEMS_RE
= re
.compile(r
'''(?x)
2418 (?P<start>[+-]?\d+)?
2420 (?P<end>[+-]?\d+|inf(?:inite)?)?
2421 (?::(?P<step>[+-]?\d+))?
2425 def parse_playlist_items(cls
, string
):
2426 for segment
in string
.split(','):
2428 raise ValueError('There is two or more consecutive commas')
2429 mobj
= cls
.PLAYLIST_ITEMS_RE
.fullmatch(segment
)
2431 raise ValueError(f
'{segment!r} is not a valid specification')
2432 start
, end
, step
, has_range
= mobj
.group('start', 'end', 'step', 'range')
2433 if int_or_none(step
) == 0:
2434 raise ValueError(f
'Step in {segment!r} cannot be zero')
2435 yield slice(int_or_none(start
), float_or_none(end
), int_or_none(step
)) if has_range
else int(start
)
2437 def get_requested_items(self
):
2438 playlist_items
= self
.ydl
.params
.get('playlist_items')
2439 playlist_start
= self
.ydl
.params
.get('playliststart', 1)
2440 playlist_end
= self
.ydl
.params
.get('playlistend')
2441 # For backwards compatibility, interpret -1 as whole list
2442 if playlist_end
in (-1, None):
2444 if not playlist_items
:
2445 playlist_items
= f
'{playlist_start}:{playlist_end}'
2446 elif playlist_start
!= 1 or playlist_end
:
2447 self
.ydl
.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once
=True)
2449 for index
in self
.parse_playlist_items(playlist_items
):
2450 for i
, entry
in self
[index
]:
2455 # The item may have just been added to archive. Don't break due to it
2456 if not self
.ydl
.params
.get('lazy_playlist'):
2457 # TODO: Add auto-generated fields
2458 self
.ydl
._match
_entry
(entry
, incomplete
=True, silent
=True)
2459 except (ExistingVideoReached
, RejectedVideoReached
):
2462 def get_full_count(self
):
2463 if self
.is_exhausted
and not self
.is_incomplete
:
2465 elif isinstance(self
._entries
, InAdvancePagedList
):
2466 if self
._entries
._pagesize
== 1:
2467 return self
._entries
._pagecount
2469 @functools.cached_property
2471 if isinstance(self
._entries
, list):
2474 entry
= self
._entries
[i
]
2476 entry
= self
.MissingEntry
2477 if not self
.is_incomplete
:
2478 raise self
.IndexError()
2479 if entry
is self
.MissingEntry
:
2480 raise EntryNotInPlaylist(f
'Entry {i + 1} cannot be found')
2485 return type(self
.ydl
)._handle
_extraction
_exceptions
(lambda _
, i
: self
._entries
[i
])(self
.ydl
, i
)
2486 except (LazyList
.IndexError, PagedList
.IndexError):
2487 raise self
.IndexError()
2490 def __getitem__(self
, idx
):
2491 if isinstance(idx
, int):
2492 idx
= slice(idx
, idx
)
2494 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2495 step
= 1 if idx
.step
is None else idx
.step
2496 if idx
.start
is None:
2497 start
= 0 if step
> 0 else len(self
) - 1
2499 start
= idx
.start
- 1 if idx
.start
>= 0 else len(self
) + idx
.start
2501 # NB: Do not call len(self) when idx == [:]
2502 if idx
.stop
is None:
2503 stop
= 0 if step
< 0 else float('inf')
2505 stop
= idx
.stop
- 1 if idx
.stop
>= 0 else len(self
) + idx
.stop
2506 stop
+= [-1, 1][step
> 0]
2508 for i
in frange(start
, stop
, step
):
2512 entry
= self
._getter
(i
)
2513 except self
.IndexError:
2514 self
.is_exhausted
= True
2521 return len(tuple(self
[:]))
2523 class IndexError(IndexError):
def uppercase_escape(s):
    """Replace literal '\\UXXXXXXXX' escape sequences in *s* with the characters they denote."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Replace literal '\\uXXXX' escape sequences in *s* with the characters they denote."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Percent-encode non-ASCII (and otherwise unsafe) characters, as suggested by RFC 3986.

    All RFC 3986 reserved characters (and '%') are left untouched.
    """
    return urllib.parse.quote(s, safe=b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986.

    The netloc is IDNA-encoded; path, params, query and fragment are
    percent-escaped via escape_rfc3986. Returns the reassembled URL string.
    """
    url_parsed = urllib.parse.urlparse(url)
    return url_parsed._replace(
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment),
    ).geturl()
def parse_qs(url, **kwargs):
    """Parse the query string of *url* into a dict mapping keys to lists of values.

    Any extra keyword arguments are forwarded to urllib.parse.parse_qs.
    """
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
def read_batch_urls(batch_fd):
    """Read a batch file (an iterable of lines) and return the list of URLs in it.

    Blank lines and comment lines (starting with '#', ';' or ']') are skipped,
    BOMs are stripped, and trailing whitespace-separated '#...' comments are
    removed from each URL. The file object is closed afterwards.
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit passed by keyword — positional form is deprecated (3.13+)
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given form data and return it as ASCII bytes, ready to be sent as a POST body."""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
    @param url           str or parse url tuple
    @param query_update  update query
    @returns             modified URL
    """
    if isinstance(url, str):
        if not kwargs and not query_update:
            return url
        else:
            url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        # Merge the existing query parameters with the requested updates
        kwargs['query'] = urllib.parse.urlencode({
            **urllib.parse.parse_qs(url.query),
            **query_update,
        }, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
def update_url_query(url, query):
    """Add/replace the query-string parameters of *url*; thin wrapper over update_url."""
    return update_url(url, query_update=query)
2611 def _multipart_encode_impl(data
, boundary
):
2612 content_type
= 'multipart/form-data; boundary=%s' % boundary
2615 for k
, v
in data
.items():
2616 out
+= b
'--' + boundary
.encode('ascii') + b
'\r\n'
2617 if isinstance(k
, str):
2619 if isinstance(v
, str):
2621 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2622 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2623 content
= b
'Content-Disposition: form-data; name="' + k
+ b
'"\r\n\r\n' + v
+ b
'\r\n'
2624 if boundary
.encode('ascii') in content
:
2625 raise ValueError('Boundary overlaps with data')
2628 out
+= b
'--' + boundary
.encode('ascii') + b
'--\r\n'
2630 return out
, content_type
def multipart_encode(data, boundary=None):
    """
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    """
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

        try:
            out, content_type = _multipart_encode_impl(data, boundary)
            break
        except ValueError:
            # Random boundary collided with the data — retry with a new one;
            # a user-supplied boundary is not silently replaced
            if has_specified_boundary:
                raise
            boundary = None

    return out, content_type
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Whether *x* is an instance of *allowed_types* but not of *blocked_types*.

    By default, str, bytes and mappings are blocked even though they are iterable.
    """
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
def variadic(x, allowed_types=NO_DEFAULT):
    """Return *x* unchanged if it is iterable-like (see is_iterable_like); otherwise wrap it in a 1-tuple."""
    # NOTE(review): relies on the NO_DEFAULT sentinel being accepted by this
    # isinstance check (or handled downstream) — confirm against its definition
    if not isinstance(allowed_types, (tuple, type)):
        # Deprecated: allowed_types given as an arbitrary iterable of types
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each of *funcs* with args/kwargs in turn and return the first result
    that does not raise a common lookup/value error and (when *expected_type*
    is given) is an instance of it. Returns None if every call fails.

    NOTE: the mutable defaults are intentional and are never mutated here.
    """
    for f in funcs:
        try:
            val = f(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            pass
        else:
            if expected_type is None or isinstance(val, expected_type):
                return val
def try_get(src, getter, expected_type=None):
    """Apply *getter* (a callable, or an iterable of callables) to *src* via try_call,
    returning the first successful result, optionally filtered by *expected_type*."""
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a copy of *dct* keeping only the items for which cndn(key, value)
    is truthy (by default: items whose value is not None)."""
    return {key: value for key, value in dct.items() if cndn(key, value)}
def merge_dicts(*dicts):
    """Merge *dicts* left-to-right: earlier dicts win, except that an
    empty-string value may be overridden by a later non-empty string."""
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if (v is not None and k not in merged
                    or isinstance(v, str) and merged[k] == ''):
                merged[k] = v
    return merged
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* unchanged if it is already str; otherwise decode the
    bytes-like value using *encoding*/*errors*.

    NOTE(review): the default encoding is evaluated once, at function
    definition time, via preferredencoding().
    """
    return string if isinstance(string, str) else str(string, encoding, errors)
2718 TV_PARENTAL_GUIDELINES
= {
2728 def parse_age_limit(s
):
2729 # isinstance(False, int) is True. So type() must be used instead
2730 if type(s
) is int: # noqa: E721
2731 return s
if 0 <= s
<= 21 else None
2732 elif not isinstance(s
, str):
2734 m
= re
.match(r
'^(?P<age>\d{1,2})\+?$', s
)
2736 return int(m
.group('age'))
2739 return US_RATINGS
[s
]
2740 m
= re
.match(r
'^TV[_-]?(%s)$' % '|'.join(k
[3:] for k
in TV_PARENTAL_GUIDELINES
), s
)
2742 return TV_PARENTAL_GUIDELINES
['TV-' + m
.group(1)]
def strip_jsonp(code):
    """Strip a JSONP wrapper ('callback({...});') and return the bare callback payload."""
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
2756 def js_to_json(code
, vars={}, *, strict
=False):
2757 # vars is a dict of var, val pairs to substitute
2758 STRING_QUOTES
= '\'"`'
2759 STRING_RE
= '|'.join(rf
'{q}(?:\\.|[^\\{q}])*{q}' for q
in STRING_QUOTES
)
2760 COMMENT_RE
= r
'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
2761 SKIP_RE
= fr
'\s*(?:{COMMENT_RE})?\s*'
2763 (fr
'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
2764 (fr
'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
2767 def process_escape(match
):
2768 JSON_PASSTHROUGH_ESCAPES
= R
'"\bfnrtu'
2769 escape
= match
.group(1) or match
.group(2)
2771 return (Rf
'\{escape}' if escape
in JSON_PASSTHROUGH_ESCAPES
2772 else R
'\u00' if escape
== 'x'
2773 else '' if escape
== '\n'
2776 def template_substitute(match
):
2777 evaluated
= js_to_json(match
.group(1), vars, strict
=strict
)
2778 if evaluated
[0] == '"':
2779 return json
.loads(evaluated
)
2784 if v
in ('true', 'false', 'null'):
2786 elif v
in ('undefined', 'void 0'):
2788 elif v
.startswith('/*') or v
.startswith('//') or v
.startswith('!') or v
== ',':
2791 if v
[0] in STRING_QUOTES
:
2792 v
= re
.sub(r
'(?s)\${([^}]+)}', template_substitute
, v
[1:-1]) if v
[0] == '`' else v
[1:-1]
2793 escaped
= re
.sub(r
'(?s)(")|\\(.)', process_escape
, v
)
2794 return f
'"{escaped}"'
2796 for regex
, base
in INTEGER_TABLE
:
2797 im
= re
.match(regex
, v
)
2799 i
= int(im
.group(1), base
)
2800 return f
'"{i}":' if v
.endswith(':') else str(i
)
2806 except json
.JSONDecodeError
:
2807 return json
.dumps(vars[v
])
2814 raise ValueError(f
'Unknown value: {v}')
2816 def create_map(mobj
):
2817 return json
.dumps(dict(json
.loads(js_to_json(mobj
.group(1) or '[]', vars=vars))))
2819 code
= re
.sub(r
'new Map\((\[.*?\])?\)', create_map
, code
)
2821 code
= re
.sub(r
'new Date\((".+")\)', r
'\g<1>', code
)
2822 code
= re
.sub(r
'new \w+\((.*?)\)', lambda m
: json
.dumps(m
.group(0)), code
)
2823 code
= re
.sub(r
'parseInt\([^\d]+(\d+)[^\d]+\)', r
'\1', code
)
2824 code
= re
.sub(r
'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^
)]*["\'])\s*\)', r'\1', code)
2826 return re.sub(rf'''(?sx)
2828 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
2829 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
2830 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
2831 [0-9]+(?={SKIP_RE}:)|
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            # Position in the list determines the quality rank
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
2846 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
2850 'default': '%(title)s [%(id)s].%(ext)s',
2851 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
2857 'description': 'description',
2858 'annotation': 'annotations.xml',
2859 'infojson': 'info.json',
2862 'pl_thumbnail': None,
2863 'pl_description': 'description',
2864 'pl_infojson': 'info.json',
2867 # As of [1] format syntax is:
2868 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
2869 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
2870 STR_FORMAT_RE_TMPL = r'''(?x)
2871 (?<!%)(?P<prefix>(?:%%)*)
2873 (?P<has_key>\((?P<key>{0})\))?
2875 (?P<conversion>[#0\-+ ]+)?
2877 (?P<precision>\.\d+)?
2878 (?P<len_mod>[hlL])? # unused in python
2879 {1} # conversion type
2884 STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Truncate so the result (including the ellipses) fits in `length`
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string (e.g. '2023.11.16') into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
def is_outdated_version(version, limit, assume_new=True):
    """Whether *version* is strictly older than *limit*.

    When the comparison is impossible (empty or unparsable version),
    *assume_new* decides the answer: assume-new means "not outdated".
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """
    # Imported locally — presumably to avoid a circular import at module load; confirm
    from ..update import is_non_updateable
    return not is_non_updateable()
def args_to_str(args):
    """Build a short, shell-quoted string representation of a subprocess command."""
    return ' '.join(map(compat_shlex_quote, args))
def error_to_str(err):
    """Return a short 'ExceptionType: message' description of *err*."""
    return '%s: %s' % (type(err).__name__, err)
2927 def mimetype2ext(mt, default=NO_DEFAULT):
2928 if not isinstance(mt, str):
2929 if default is not NO_DEFAULT:
2945 'x-matroska': 'mkv',
2947 'x-mp4-fragmented': 'mp4',
2952 # application (streaming playlists)
2956 'vnd.apple.mpegurl': 'm3u8',
2957 'vnd.ms-sstr+xml': 'ism',
2958 'x-mpegurl': 'm3u8',
2962 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
2963 # Using .mp3 as it's the most popular one
2964 'audio/mpeg': 'mp3',
2965 'audio/webm': 'webm',
2966 'audio/x-matroska': 'mka',
2967 'audio/x-mpegurl': 'm3u',
2975 'x-realaudio': 'ra',
2986 'vnd.wap.wbmp': 'wbmp',
2993 'filmstrip+json': 'fs',
2994 'smptett+xml': 'tt',
2997 'x-ms-sami': 'sami',
3006 mimetype = mt.partition(';')[0].strip().lower()
3007 _, _, subtype = mimetype.rpartition('/')
3009 ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
3012 elif default is not NO_DEFAULT:
3014 return subtype.replace('+', '.')
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a file extension or URL; None when unknown/empty."""
    if not ext_or_url:
        return None
    if '.' not in ext_or_url:
        # A bare extension — turn it into a dummy filename for guess_type
        ext_or_url = f'file.{ext_or_url}'
    return mimetypes.guess_type(ext_or_url)[0]
3025 def parse_codecs(codecs_str):
3026 # http://tools.ietf.org/html/rfc6381
3029 split_codecs = list(filter(None, map(
3030 str.strip, codecs_str.strip().strip(',').split(','))))
3031 vcodec, acodec, scodec, hdr = None, None, None, None
3032 for full_codec in split_codecs:
3033 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3034 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3035 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3039 if parts[0] in ('dvh1', 'dvhe'):
3041 elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
3043 elif parts[:2] == ['vp9', '2']:
3045 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
3046 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3047 acodec = acodec or full_codec
3048 elif parts[0] in ('stpp', 'wvtt'):
3049 scodec = scodec or full_codec
3051 write_string(f'WARNING: Unknown codec {full_codec}\n')
3052 if vcodec or acodec or scodec:
3054 'vcodec': vcodec or 'none',
3055 'acodec': acodec or 'none',
3056 'dynamic_range': hdr,
3057 **({'scodec': scodec} if scodec is not None else {}),
3059 elif len(split_codecs) == 2:
3061 'vcodec': split_codecs[0],
3062 'acodec': split_codecs[1],
3067 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3068 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3070 allow_mkv = not preferences or 'mkv' in preferences
3072 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3073 return 'mkv' # TODO: any other format allows this?
3075 # TODO: All codecs supported by parse_codecs isn't handled here
3076 COMPATIBLE_CODECS = {
3078 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
3079 'h264', 'aacl', 'ec-3', # Set in ISM
3082 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3083 'vp9x', 'vp8x', # in the webm spec
3087 sanitize_codec = functools.partial(
3088 try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
3089 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
3091 for ext in preferences or COMPATIBLE_CODECS.keys():
3092 codec_set = COMPATIBLE_CODECS.get(ext, set())
3093 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3097 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3100 for ext in preferences or vexts:
3101 current_exts = {ext, *vexts, *aexts}
3102 if ext == 'mkv' or current_exts == {ext} or any(
3103 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3105 return 'mkv' if allow_mkv else preferences[-1]
3108 def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
3109 getheader = url_handle.headers.get
3111 cd = getheader('Content-Disposition')
3113 m = re.match(r'attachment;\s*filename="(?P
<filename
>[^
"]+)"', cd)
3115 e = determine_ext(m.group('filename
'), default_ext=None)
3119 meta_ext = getheader('x
-amz
-meta
-name
')
3121 e = meta_ext.rpartition('.')[2]
3125 return mimetype2ext(getheader('Content
-Type
'), default=default)
def encode_data_uri(data, mime_type):
    """Build a 'data:' URI carrying *data* (bytes) base64-encoded, tagged with *mime_type*."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
3142 # List of known byte-order-marks (BOM)
3144 (b'\xef\xbb\xbf', 'utf
-8'),
3145 (b'\x00\x00\xfe\xff', 'utf
-32-be
'),
3146 (b'\xff\xfe\x00\x00', 'utf
-32-le
'),
3147 (b'\xff\xfe', 'utf
-16-le
'),
3148 (b'\xfe\xff', 'utf
-16-be
'),
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    encoding = 'utf-8'
    for bom, enc in BOMS:
        # Repeatedly strip the BOM in case it occurs more than once
        while first_bytes.startswith(bom):
            encoding, first_bytes = enc, first_bytes[len(bom):]

    return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3163 def determine_protocol(info_dict):
3164 protocol = info_dict.get('protocol
')
3165 if protocol is not None:
3168 url = sanitize_url(info_dict['url
'])
3169 if url.startswith('rtmp
'):
3171 elif url.startswith('mms
'):
3173 elif url.startswith('rtsp
'):
3176 ext = determine_ext(url)
3178 return 'm3u8
' if info_dict.get('is_live
') else 'm3u8_native
'
3182 return urllib.parse.urlparse(url).scheme
3185 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3186 """ Render a list of rows, each as a list of values.
3187 Text after a \t will be right aligned """
3189 return len(remove_terminal_sequences(string).replace('\t', ''))
3191 def get_max_lens(table):
3192 return [max(width(str(v)) for v in col) for col in zip(*table)]
3194 def filter_using_list(row, filterArray):
3195 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3197 max_lens = get_max_lens(data) if hide_empty else []
3198 header_row = filter_using_list(header_row, max_lens)
3199 data = [filter_using_list(row, max_lens) for row in data]
3201 table = [header_row] + data
3202 max_lens = get_max_lens(table)
3205 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3206 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3208 for pos, text in enumerate(map(str, row)):
3210 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3212 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3213 ret = '\n'.join(''.join(row).rstrip() for row in table)
3217 def _match_one(filter_part, dct, incomplete):
3218 # TODO: Generalize code with YoutubeDL._build_format_filter
3219 STRING_OPERATORS = {
3220 '*=': operator.contains,
3221 '^
=': lambda attr, value: attr.startswith(value),
3222 '$
=': lambda attr, value: attr.endswith(value),
3223 '~
=': lambda attr, value: re.search(value, attr),
3225 COMPARISON_OPERATORS = {
3227 '<=': operator.le, # "<=" must be defined above "<"
3234 if isinstance(incomplete, bool):
3235 is_incomplete = lambda _: incomplete
3237 is_incomplete = lambda k: k in incomplete
3239 operator_rex = re.compile(r'''(?x)
3241 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3243 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3246 ''' % '|
'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3247 m = operator_rex.fullmatch(filter_part.strip())
3250 unnegated_op = COMPARISON_OPERATORS[m['op
']]
3252 op = lambda attr, value: not unnegated_op(attr, value)
3255 comparison_value = m['quotedstrval
'] or m['strval
'] or m['intval
']
3257 comparison_value = comparison_value.replace(r'\
%s' % m['quote
'], m['quote
'])
3258 actual_value = dct.get(m['key
'])
3259 numeric_comparison = None
3260 if isinstance(actual_value, (int, float)):
3261 # If the original field is a string and matching comparisonvalue is
3262 # a number we should respect the origin of the original field
3263 # and process comparison value as a string (see
3264 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3266 numeric_comparison = int(comparison_value)
3268 numeric_comparison = parse_filesize(comparison_value)
3269 if numeric_comparison is None:
3270 numeric_comparison = parse_filesize(f'{comparison_value}B
')
3271 if numeric_comparison is None:
3272 numeric_comparison = parse_duration(comparison_value)
3273 if numeric_comparison is not None and m['op
'] in STRING_OPERATORS:
3274 raise ValueError('Operator
%s only supports string values
!' % m['op
'])
3275 if actual_value is None:
3276 return is_incomplete(m['key
']) or m['none_inclusive
']
3277 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3280 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3281 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3283 operator_rex = re.compile(r'''(?x)
3284 (?P<op>%s)\s*(?P<key>[a-z_]+)
3285 ''' % '|
'.join(map(re.escape, UNARY_OPERATORS.keys())))
3286 m = operator_rex.fullmatch(filter_part.strip())
3288 op = UNARY_OPERATORS[m.group('op
')]
3289 actual_value = dct.get(m.group('key
'))
3290 if is_incomplete(m.group('key
')) and actual_value is None:
3292 return op(actual_value)
3294 raise ValueError('Invalid
filter part
%r' % filter_part)
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    # '&' separates clauses; a backslash-escaped '\&' is a literal ampersand
    return all(
        _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
        for filter_part in re.split(r'(?<!\\)&', filter_str))
3309 def match_filter_func(filters, breaking_filters=None):
3310 if not filters and not breaking_filters:
3312 breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
3313 filters = set(variadic(filters or []))
3315 interactive = '-' in filters
3319 def _match_func(info_dict, incomplete=False):
3320 ret = breaking_filters(info_dict, incomplete)
3322 raise RejectedVideoReached(ret)
3324 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3325 return NO_DEFAULT if interactive and not incomplete else None
3327 video_title = info_dict.get('title
') or info_dict.get('id') or 'entry
'
3328 filter_str = ') |
('.join(map(str.strip, filters))
3329 return f'{video_title} does
not pass filter ({filter_str}
), skipping
..'
3333 class download_range_func:
3334 def __init__(self, chapters, ranges, from_info=False):
3335 self.chapters, self.ranges, self.from_info = chapters, ranges, from_info
3337 def __call__(self, info_dict, ydl):
3339 warning = ('There are no chapters matching the regex
' if info_dict.get('chapters
')
3340 else 'Cannot match chapters since chapter information
is unavailable
')
3341 for regex in self.chapters or []:
3342 for i, chapter in enumerate(info_dict.get('chapters
') or []):
3343 if re.search(regex, chapter['title
']):
3345 yield {**chapter, 'index': i}
3346 if self.chapters and warning:
3347 ydl.to_screen(f'[info
] {info_dict["id"]}
: {warning}
')
3349 for start, end in self.ranges or []:
3351 'start_time
': self._handle_negative_timestamp(start, info_dict),
3352 'end_time
': self._handle_negative_timestamp(end, info_dict),
3355 if self.from_info and (info_dict.get('start_time
') or info_dict.get('end_time
')):
3357 'start_time
': info_dict.get('start_time
') or 0,
3358 'end_time
': info_dict.get('end_time
') or float('inf
'),
3360 elif not self.ranges and not self.chapters:
3364 def _handle_negative_timestamp(time, info):
3365 return max(info['duration
'] + time, 0) if info.get('duration
') and time < 0 else time
3367 def __eq__(self, other):
3368 return (isinstance(other, download_range_func)
3369 and self.chapters == other.chapters and self.ranges == other.ranges)
3372 return f'{__name__}
.{type(self).__name__}
({self.chapters}
, {self.ranges}
)'
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float); None when unparsable/empty."""
    if not time_expr:
        return

    # Plain offset, optionally suffixed with 's' (e.g. '12.5s')
    mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    # Clock format 'H:MM:SS(.mmm|:fff)'
    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format *seconds* as an SRT timecode ('HH:MM:SS,mmm')."""
    # timetuple_from_msec is defined elsewhere in this module; presumably it
    # yields (hours, minutes, seconds, milliseconds) — confirm
    return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
def ass_subtitles_timecode(seconds):
    """Format *seconds* as an ASS/SSA timecode ('H:MM:SS.cc', with centiseconds)."""
    time = timetuple_from_msec(seconds * 1000)
    # Last field is milliseconds; ASS uses centiseconds, hence the /10
    return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    # Old TTAF1 namespaces are rewritten to the current TTML ones below, so a
    # single set of XPath expressions handles all document generations.
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # TTML styling properties that can be expressed with SRT font/b/i/u tags
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration',
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}          # style id -> dict of supported styling properties
    default_style = {}   # styling inherited from the body/div element

    class TTMLPElementParser:
        # NOTE(review): class-level mutable attributes shared by all instances;
        # start/end pushes and pops are balanced, so the lists are empty again
        # after each completed parse.
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # skip properties already active on the enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # serialize the node and re-parse it through TTMLPElementParser,
        # which flattens it into SRT-flavoured markup
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    repeat = False
    while True:
        # resolve style inheritance; repeat while a parent style is not yet known
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # styling set on the body/div element applies to every paragraph
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if end_time is None:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
def cli_option(params, command_option, param, separator=None):
    """Build a CLI argument from *params*: [] when unset, a pair, or one joined token."""
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Like cli_option, but maps a boolean param onto its true/false string value."""
    param = params.get(param)
    assert param in (True, False, None)
    value_map = {True: true_value, False: false_value}
    return cli_option(value_map, command_option, param, separator)
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] (no value) when the param equals *expected_value*."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Resolve per-key argument lists from *argdict*, trying *keys* in order.

    @param argdict    dict mapping lowercase keys to argument lists; a plain
                      list/tuple is accepted for backward compatibility
    @param keys       sequence of keys (or tuples of equivalent keys) to try
    @param default    returned when argdict is missing or no key matches
    @param use_compat when argdict is a legacy list, return it only if set
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        else:
            argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        arg_list = list(filter(
            lambda x: x is not None,
            [argdict.get(key.lower()) for key in variadic(key_list)]))
        if arg_list:
            # flatten the matching argument lists into a single list
            return [arg for args in arg_list for arg in args]
    return default
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Build the key search order (e.g. 'sponskrub+ffmpeg') and delegate to
    cli_configuration_args."""
    main_key, exe = main_key.lower(), exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    keys = [f'{root_key}{k}' for k in (keys or [''])]
    if root_key in keys:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    else:
        # a more specific key was requested; legacy list-style argdicts do not apply
        use_compat = False
    return cli_configuration_args(argdict, keys, default, use_compat)
3616 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3675 'iw': 'heb', # Replaced by he in 1989 revision
3685 'in': 'ind', # Replaced by id in 1989 revision
3801 'ji': 'yid', # Replaced by yi in 1989 revision
3809 def short2long(cls
, code
):
3810 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3811 return cls
._lang
_map
.get(code
[:2])
3814 def long2short(cls
, code
):
3815 """Convert language code from ISO 639-2/T to ISO 639-1"""
3816 for short_name
, long_name
in cls
._lang
_map
.items():
3817 if long_name
== code
:
3822 # From http://data.okfn.org/data/core/country-list
3824 'AF': 'Afghanistan',
3825 'AX': 'Åland Islands',
3828 'AS': 'American Samoa',
3833 'AG': 'Antigua and Barbuda',
3850 'BO': 'Bolivia, Plurinational State of',
3851 'BQ': 'Bonaire, Sint Eustatius and Saba',
3852 'BA': 'Bosnia and Herzegovina',
3854 'BV': 'Bouvet Island',
3856 'IO': 'British Indian Ocean Territory',
3857 'BN': 'Brunei Darussalam',
3859 'BF': 'Burkina Faso',
3865 'KY': 'Cayman Islands',
3866 'CF': 'Central African Republic',
3870 'CX': 'Christmas Island',
3871 'CC': 'Cocos (Keeling) Islands',
3875 'CD': 'Congo, the Democratic Republic of the',
3876 'CK': 'Cook Islands',
3878 'CI': 'Côte d\'Ivoire',
3883 'CZ': 'Czech Republic',
3887 'DO': 'Dominican Republic',
3890 'SV': 'El Salvador',
3891 'GQ': 'Equatorial Guinea',
3895 'FK': 'Falkland Islands (Malvinas)',
3896 'FO': 'Faroe Islands',
3900 'GF': 'French Guiana',
3901 'PF': 'French Polynesia',
3902 'TF': 'French Southern Territories',
3917 'GW': 'Guinea-Bissau',
3920 'HM': 'Heard Island and McDonald Islands',
3921 'VA': 'Holy See (Vatican City State)',
3928 'IR': 'Iran, Islamic Republic of',
3931 'IM': 'Isle of Man',
3941 'KP': 'Korea, Democratic People\'s Republic of',
3942 'KR': 'Korea, Republic of',
3945 'LA': 'Lao People\'s Democratic Republic',
3951 'LI': 'Liechtenstein',
3955 'MK': 'Macedonia, the Former Yugoslav Republic of',
3962 'MH': 'Marshall Islands',
3968 'FM': 'Micronesia, Federated States of',
3969 'MD': 'Moldova, Republic of',
3980 'NL': 'Netherlands',
3981 'NC': 'New Caledonia',
3982 'NZ': 'New Zealand',
3987 'NF': 'Norfolk Island',
3988 'MP': 'Northern Mariana Islands',
3993 'PS': 'Palestine, State of',
3995 'PG': 'Papua New Guinea',
3998 'PH': 'Philippines',
4002 'PR': 'Puerto Rico',
4006 'RU': 'Russian Federation',
4008 'BL': 'Saint Barthélemy',
4009 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4010 'KN': 'Saint Kitts and Nevis',
4011 'LC': 'Saint Lucia',
4012 'MF': 'Saint Martin (French part)',
4013 'PM': 'Saint Pierre and Miquelon',
4014 'VC': 'Saint Vincent and the Grenadines',
4017 'ST': 'Sao Tome and Principe',
4018 'SA': 'Saudi Arabia',
4022 'SL': 'Sierra Leone',
4024 'SX': 'Sint Maarten (Dutch part)',
4027 'SB': 'Solomon Islands',
4029 'ZA': 'South Africa',
4030 'GS': 'South Georgia and the South Sandwich Islands',
4031 'SS': 'South Sudan',
4036 'SJ': 'Svalbard and Jan Mayen',
4039 'CH': 'Switzerland',
4040 'SY': 'Syrian Arab Republic',
4041 'TW': 'Taiwan, Province of China',
4043 'TZ': 'Tanzania, United Republic of',
4045 'TL': 'Timor-Leste',
4049 'TT': 'Trinidad and Tobago',
4052 'TM': 'Turkmenistan',
4053 'TC': 'Turks and Caicos Islands',
4057 'AE': 'United Arab Emirates',
4058 'GB': 'United Kingdom',
4059 'US': 'United States',
4060 'UM': 'United States Minor Outlying Islands',
4064 'VE': 'Venezuela, Bolivarian Republic of',
4066 'VG': 'Virgin Islands, British',
4067 'VI': 'Virgin Islands, U.S.',
4068 'WF': 'Wallis and Futuna',
4069 'EH': 'Western Sahara',
4073 # Not ISO 3166 codes, but used for IP blocks
4074 'AP': 'Asia/Pacific Region',
4079 def short2full(cls
, code
):
4080 """Convert an ISO 3166-2 country code to the corresponding full name"""
4081 return cls
._country
_map
.get(code
.upper())
4085 # Major IPv4 address blocks per country
4087 'AD': '46.172.224.0/19',
4088 'AE': '94.200.0.0/13',
4089 'AF': '149.54.0.0/17',
4090 'AG': '209.59.64.0/18',
4091 'AI': '204.14.248.0/21',
4092 'AL': '46.99.0.0/16',
4093 'AM': '46.70.0.0/15',
4094 'AO': '105.168.0.0/13',
4095 'AP': '182.50.184.0/21',
4096 'AQ': '23.154.160.0/24',
4097 'AR': '181.0.0.0/12',
4098 'AS': '202.70.112.0/20',
4099 'AT': '77.116.0.0/14',
4100 'AU': '1.128.0.0/11',
4101 'AW': '181.41.0.0/18',
4102 'AX': '185.217.4.0/22',
4103 'AZ': '5.197.0.0/16',
4104 'BA': '31.176.128.0/17',
4105 'BB': '65.48.128.0/17',
4106 'BD': '114.130.0.0/16',
4108 'BF': '102.178.0.0/15',
4109 'BG': '95.42.0.0/15',
4110 'BH': '37.131.0.0/17',
4111 'BI': '154.117.192.0/18',
4112 'BJ': '137.255.0.0/16',
4113 'BL': '185.212.72.0/23',
4114 'BM': '196.12.64.0/18',
4115 'BN': '156.31.0.0/16',
4116 'BO': '161.56.0.0/16',
4117 'BQ': '161.0.80.0/20',
4118 'BR': '191.128.0.0/12',
4119 'BS': '24.51.64.0/18',
4120 'BT': '119.2.96.0/19',
4121 'BW': '168.167.0.0/16',
4122 'BY': '178.120.0.0/13',
4123 'BZ': '179.42.192.0/18',
4124 'CA': '99.224.0.0/11',
4125 'CD': '41.243.0.0/16',
4126 'CF': '197.242.176.0/21',
4127 'CG': '160.113.0.0/16',
4128 'CH': '85.0.0.0/13',
4129 'CI': '102.136.0.0/14',
4130 'CK': '202.65.32.0/19',
4131 'CL': '152.172.0.0/14',
4132 'CM': '102.244.0.0/14',
4133 'CN': '36.128.0.0/10',
4134 'CO': '181.240.0.0/12',
4135 'CR': '201.192.0.0/12',
4136 'CU': '152.206.0.0/15',
4137 'CV': '165.90.96.0/19',
4138 'CW': '190.88.128.0/17',
4139 'CY': '31.153.0.0/16',
4140 'CZ': '88.100.0.0/14',
4142 'DJ': '197.241.0.0/17',
4143 'DK': '87.48.0.0/12',
4144 'DM': '192.243.48.0/20',
4145 'DO': '152.166.0.0/15',
4146 'DZ': '41.96.0.0/12',
4147 'EC': '186.68.0.0/15',
4148 'EE': '90.190.0.0/15',
4149 'EG': '156.160.0.0/11',
4150 'ER': '196.200.96.0/20',
4151 'ES': '88.0.0.0/11',
4152 'ET': '196.188.0.0/14',
4153 'EU': '2.16.0.0/13',
4154 'FI': '91.152.0.0/13',
4155 'FJ': '144.120.0.0/16',
4156 'FK': '80.73.208.0/21',
4157 'FM': '119.252.112.0/20',
4158 'FO': '88.85.32.0/19',
4160 'GA': '41.158.0.0/15',
4162 'GD': '74.122.88.0/21',
4163 'GE': '31.146.0.0/16',
4164 'GF': '161.22.64.0/18',
4165 'GG': '62.68.160.0/19',
4166 'GH': '154.160.0.0/12',
4167 'GI': '95.164.0.0/16',
4168 'GL': '88.83.0.0/19',
4169 'GM': '160.182.0.0/15',
4170 'GN': '197.149.192.0/18',
4171 'GP': '104.250.0.0/19',
4172 'GQ': '105.235.224.0/20',
4173 'GR': '94.64.0.0/13',
4174 'GT': '168.234.0.0/16',
4175 'GU': '168.123.0.0/16',
4176 'GW': '197.214.80.0/20',
4177 'GY': '181.41.64.0/18',
4178 'HK': '113.252.0.0/14',
4179 'HN': '181.210.0.0/16',
4180 'HR': '93.136.0.0/13',
4181 'HT': '148.102.128.0/17',
4182 'HU': '84.0.0.0/14',
4183 'ID': '39.192.0.0/10',
4184 'IE': '87.32.0.0/12',
4185 'IL': '79.176.0.0/13',
4186 'IM': '5.62.80.0/20',
4187 'IN': '117.192.0.0/10',
4188 'IO': '203.83.48.0/21',
4189 'IQ': '37.236.0.0/14',
4190 'IR': '2.176.0.0/12',
4191 'IS': '82.221.0.0/16',
4192 'IT': '79.0.0.0/10',
4193 'JE': '87.244.64.0/18',
4194 'JM': '72.27.0.0/17',
4195 'JO': '176.29.0.0/16',
4196 'JP': '133.0.0.0/8',
4197 'KE': '105.48.0.0/12',
4198 'KG': '158.181.128.0/17',
4199 'KH': '36.37.128.0/17',
4200 'KI': '103.25.140.0/22',
4201 'KM': '197.255.224.0/20',
4202 'KN': '198.167.192.0/19',
4203 'KP': '175.45.176.0/22',
4204 'KR': '175.192.0.0/10',
4205 'KW': '37.36.0.0/14',
4206 'KY': '64.96.0.0/15',
4207 'KZ': '2.72.0.0/13',
4208 'LA': '115.84.64.0/18',
4209 'LB': '178.135.0.0/16',
4210 'LC': '24.92.144.0/20',
4211 'LI': '82.117.0.0/19',
4212 'LK': '112.134.0.0/15',
4213 'LR': '102.183.0.0/16',
4214 'LS': '129.232.0.0/17',
4215 'LT': '78.56.0.0/13',
4216 'LU': '188.42.0.0/16',
4217 'LV': '46.109.0.0/16',
4218 'LY': '41.252.0.0/14',
4219 'MA': '105.128.0.0/11',
4220 'MC': '88.209.64.0/18',
4221 'MD': '37.246.0.0/16',
4222 'ME': '178.175.0.0/17',
4223 'MF': '74.112.232.0/21',
4224 'MG': '154.126.0.0/17',
4225 'MH': '117.103.88.0/21',
4226 'MK': '77.28.0.0/15',
4227 'ML': '154.118.128.0/18',
4228 'MM': '37.111.0.0/17',
4229 'MN': '49.0.128.0/17',
4230 'MO': '60.246.0.0/16',
4231 'MP': '202.88.64.0/20',
4232 'MQ': '109.203.224.0/19',
4233 'MR': '41.188.64.0/18',
4234 'MS': '208.90.112.0/22',
4235 'MT': '46.11.0.0/16',
4236 'MU': '105.16.0.0/12',
4237 'MV': '27.114.128.0/18',
4238 'MW': '102.70.0.0/15',
4239 'MX': '187.192.0.0/11',
4240 'MY': '175.136.0.0/13',
4241 'MZ': '197.218.0.0/15',
4242 'NA': '41.182.0.0/16',
4243 'NC': '101.101.0.0/18',
4244 'NE': '197.214.0.0/18',
4245 'NF': '203.17.240.0/22',
4246 'NG': '105.112.0.0/12',
4247 'NI': '186.76.0.0/15',
4248 'NL': '145.96.0.0/11',
4249 'NO': '84.208.0.0/13',
4250 'NP': '36.252.0.0/15',
4251 'NR': '203.98.224.0/19',
4252 'NU': '49.156.48.0/22',
4253 'NZ': '49.224.0.0/14',
4254 'OM': '5.36.0.0/15',
4255 'PA': '186.72.0.0/15',
4256 'PE': '186.160.0.0/14',
4257 'PF': '123.50.64.0/18',
4258 'PG': '124.240.192.0/19',
4259 'PH': '49.144.0.0/13',
4260 'PK': '39.32.0.0/11',
4261 'PL': '83.0.0.0/11',
4262 'PM': '70.36.0.0/20',
4263 'PR': '66.50.0.0/16',
4264 'PS': '188.161.0.0/16',
4265 'PT': '85.240.0.0/13',
4266 'PW': '202.124.224.0/20',
4267 'PY': '181.120.0.0/14',
4268 'QA': '37.210.0.0/15',
4269 'RE': '102.35.0.0/16',
4270 'RO': '79.112.0.0/13',
4271 'RS': '93.86.0.0/15',
4272 'RU': '5.136.0.0/13',
4273 'RW': '41.186.0.0/16',
4274 'SA': '188.48.0.0/13',
4275 'SB': '202.1.160.0/19',
4276 'SC': '154.192.0.0/11',
4277 'SD': '102.120.0.0/13',
4278 'SE': '78.64.0.0/12',
4279 'SG': '8.128.0.0/10',
4280 'SI': '188.196.0.0/14',
4281 'SK': '78.98.0.0/15',
4282 'SL': '102.143.0.0/17',
4283 'SM': '89.186.32.0/19',
4284 'SN': '41.82.0.0/15',
4285 'SO': '154.115.192.0/18',
4286 'SR': '186.179.128.0/17',
4287 'SS': '105.235.208.0/21',
4288 'ST': '197.159.160.0/19',
4289 'SV': '168.243.0.0/16',
4290 'SX': '190.102.0.0/20',
4292 'SZ': '41.84.224.0/19',
4293 'TC': '65.255.48.0/20',
4294 'TD': '154.68.128.0/19',
4295 'TG': '196.168.0.0/14',
4296 'TH': '171.96.0.0/13',
4297 'TJ': '85.9.128.0/18',
4298 'TK': '27.96.24.0/21',
4299 'TL': '180.189.160.0/20',
4300 'TM': '95.85.96.0/19',
4301 'TN': '197.0.0.0/11',
4302 'TO': '175.176.144.0/21',
4303 'TR': '78.160.0.0/11',
4304 'TT': '186.44.0.0/15',
4305 'TV': '202.2.96.0/19',
4306 'TW': '120.96.0.0/11',
4307 'TZ': '156.156.0.0/14',
4308 'UA': '37.52.0.0/14',
4309 'UG': '102.80.0.0/13',
4311 'UY': '167.56.0.0/13',
4312 'UZ': '84.54.64.0/18',
4313 'VA': '212.77.0.0/19',
4314 'VC': '207.191.240.0/21',
4315 'VE': '186.88.0.0/13',
4316 'VG': '66.81.192.0/20',
4317 'VI': '146.226.0.0/16',
4318 'VN': '14.160.0.0/11',
4319 'VU': '202.80.32.0/20',
4320 'WF': '117.20.32.0/21',
4321 'WS': '202.4.32.0/19',
4322 'YE': '134.35.0.0/16',
4323 'YT': '41.242.116.0/22',
4324 'ZA': '41.0.0.0/11',
4325 'ZM': '102.144.0.0/13',
4326 'ZW': '102.177.192.0/18',
    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address (str) inside a country's block or a CIDR.

        @param code_or_block  two-letter country code (looked up in
                              _country_ip_map) or an explicit 'a.b.c.d/preflen'
        @returns  dotted-quad string, or None for an unknown country code
        """
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        addr, preflen = block.split('/')
        # network base as a 32-bit big-endian integer
        addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
        # highest address: set every host bit of the prefix
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return str(socket.inet_ntoa(
            struct.pack('!L', random.randint(addr_min, addr_max))))
4344 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4345 # released into Public Domain
4346 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # the original struct-based pack/strip loop yielded b'\000' for n <= 0
    n = max(int(n), 0)
    # int.to_bytes replaces the hand-rolled 32-bit-chunk loop; at least one
    # byte so that n == 0 still produces b'\000'
    s = n.to_bytes(max((n.bit_length() + 7) // 8, 1), 'big')
    # pad the front with binary zeros up to a multiple of blocksize
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # int.from_bytes replaces the original hand-rolled 32-bit-chunk
    # accumulation loop; empty input yields 0, as before
    return int.from_bytes(s, 'big')
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # interpret the byte-reversed input as a big-endian hex integer
    payload = int(binascii.hexlify(data[::-1]), 16)
    return format(pow(payload, exponent, modulus), 'x')
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Bug fix: PKCS#1 v1.5 (RFC 8017, EME-PKCS1-v1_5) requires the padding
    # string PS to consist of NONZERO octets — the original randint(0, 254)
    # could emit a zero byte, prematurely terminating the padding on decrypt.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4427 def _base_n_table(n
, table
):
4428 if not table
and not n
:
4429 raise ValueError('Either table or n must be specified')
4430 table
= (table
or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n
]
4432 if n
and n
!= len(table
):
4433 raise ValueError(f
'base {n} exceeds table length {len(table)}')
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string"""
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    digit_values = {char: index for index, char in enumerate(_base_n_table(n, table))}
    base = len(digit_values)
    result = 0
    for char in string:
        result = result * base + digit_values[char]
    return result
def decode_packed_codes(code):
    """Decode JavaScript obfuscated with Dean Edwards' p.a.c.k.e.r."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    # build the substitution table: base-n token -> original symbol
    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        # an empty symbol means the token stands for itself
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
def caesar(s, alphabet, shift):
    """Shift each char of *s* found in *alphabet* by *shift* positions (wrapping);
    other characters pass through unchanged."""
    if shift == 0:
        return s
    size = len(alphabet)
    rotated = [
        alphabet[(alphabet.index(ch) + shift) % size] if ch in alphabet else ch
        for ch in s]
    return ''.join(rotated)
4487 return caesar(s
, r
'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list into a dict, stripping surrounding quotes."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
def urshift(val, n):
    """Unsigned 32-bit right shift (JavaScript's >>> operator)."""
    if val >= 0:
        return val >> n
    return (val + 0x100000000) >> n
def write_xattr(path, key, value):
    """Write extended attribute *key* = *value* (bytes) on *path*.

    Tries, in order: NTFS ADS (Windows), the xattr/pyxattr Python modules,
    then the setfattr/xattr executables. Raises XAttrMetadataError on failure
    and XAttrUnavailableError when no mechanism is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    if xattr:
        if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
            # Unicode arguments are not supported in pyxattr until version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            if version_tuple(xattr.__version__) >= (0, 5, 0):
                setxattr = xattr.set
            else:
                setxattr = xattr.setxattr
        else:
            setxattr = xattr.setxattr
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # the external tools take the value as text, not bytes
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
def random_birthday(year_field, month_field, day_field):
    """Map the given field names to a random date between 1950-01-01 and 1995-12-31
    (each component stringified)."""
    first = datetime.date(1950, 1, 1)
    last = datetime.date(1995, 12, 31)
    span_days = (last - first).days
    chosen = first + datetime.timedelta(random.randint(0, span_days))
    return {
        year_field: str(chosen.year),
        month_field: str(chosen.month),
        day_field: str(chosen.day),
    }
def find_available_port(interface=''):
    """Return a free TCP port on *interface* (kernel-chosen), or None on failure."""
    try:
        with socket.socket() as sock:
            sock.bind((interface, 0))  # port 0 -> OS picks a free port
            return sock.getsockname()[1]
    except OSError:
        return None
4575 # Templates for internet shortcut files, which are plain text files.
4576 DOT_URL_LINK_TEMPLATE
= '''\
4581 DOT_WEBLOC_LINK_TEMPLATE
= '''\
4582 <?xml version="1.0" encoding="UTF-8"?>
4583 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4584 <plist version="1.0">
4587 \t<string>%(url)s</string>
4592 DOT_DESKTOP_LINK_TEMPLATE
= '''\
4602 'url': DOT_URL_LINK_TEMPLATE
,
4603 'desktop': DOT_DESKTOP_LINK_TEMPLATE
,
4604 'webloc': DOT_WEBLOC_LINK_TEMPLATE
,
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """
    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4649 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
def to_high_limit_path(path):
    """On Windows, prefix an absolute path with '\\\\?\\' to bypass MAX_PATH;
    elsewhere return the path unchanged."""
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Traverse *field* in *obj*, apply *func* and format with *template*;
    return *default* for ignored values.

    When *ignore* is NO_DEFAULT any falsy value is ignored; otherwise values
    equal to (any of) *ignore* are.
    """
    val = traversal.traverse_obj(obj, *variadic(field))
    if (not val) if ignore is NO_DEFAULT else (val in variadic(ignore)):
        return default
    return template % func(val)
def clean_podcast_url(url):
    """Strip known podcast tracking/analytics redirect prefixes from *url*."""
    url = re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/''', '', url)
    # collapse a doubled scheme left over after removing a redirect prefix,
    # e.g. 'podtracker://https://example.com/x' -> 'https://example.com/x'
    return re.sub(r'^\w+://(\w+://)', r'\1', url)
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random RFC 4122 version-4 UUID string."""
    # Bug fix: the variant nibble (the 'y' position) must be 8, 9, a or b per
    # RFC 4122 §4.4; the original drew it uniformly from all 16 hex digits.
    return re.sub(
        r'[xy]',
        lambda m: _HEX_TABLE[random.randint(0, 15)] if m.group(0) == 'x' else '89ab'[random.randint(0, 3)],
        'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
def make_dir(path, to_screen=None):
    """Ensure the parent directory of *path* exists.

    @param path       file path whose directory should exist afterwards
    @param to_screen  optional callable used to report a failure message
    @returns          True on success (or nothing to create), False on OSError
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Bug fix: the original tested `callable(to_screen) is not None`, which
        # is always True and crashed with TypeError when to_screen was None
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
def get_executable_path():
    """Return the directory containing the running yt-dlp executable/script."""
    from ..update import _get_variant_and_executable_path

    return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
def get_user_config_dirs(package_name):
    """Yield the candidate per-user configuration directories for *package_name*."""
    # .config (e.g. ~/.config/package_name)
    xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
    yield os.path.join(xdg_config_home, package_name)

    # appdata (%APPDATA%/package_name) — only yielded when the env var is set
    appdata_dir = os.getenv('appdata')
    if appdata_dir:
        yield os.path.join(appdata_dir, package_name)

    # home (~/.package_name)
    yield os.path.join(compat_expanduser('~'), f'.{package_name}')
def get_system_config_dirs(package_name):
    """Yield system-wide configuration directories (currently only /etc/<package_name>)."""
    etc_dir = os.path.join('/etc', package_name)
    yield etc_dir
def time_seconds(**kwargs):
    """
    Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)

    Keyword arguments are passed to datetime.timedelta and added as an offset.
    """
    offset = datetime.timedelta(**kwargs).total_seconds()
    return time.time() + offset
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create a JWS Compact Serialization token signed with HMAC-SHA256.

    @param payload_data  JSON-serializable claims object
    @param key           shared secret (str)
    @param headers       optional extra/override JOSE header fields
    @returns             bytes: b'<header>.<payload>.<signature>'

    Fix: the default for *headers* was a mutable `{}` (shared across calls);
    `None` is equivalent for callers since an empty dict was never used.
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
    return token
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded payload of a JWS Compact Serialization token.

    The signature is NOT verified.
    """
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # add trailing ='s that may have been stripped, superfluous ='s are ignored
    return json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
# False on Windows until windows_enable_vt_mode() succeeds (it sets this True);
# None elsewhere, meaning "not applicable"
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
def supports_terminal_sequences(stream):
    """Best-effort check whether *stream* can render ANSI escape sequences."""
    if compat_os_name == 'nt':
        # On Windows, VT processing must have been enabled first
        if not WINDOWS_VT_MODE:
            return False
    elif not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:  # isatty may be missing or raise on exotic streams
        return False
def windows_enable_vt_mode():
    """Ref: https://bugs.python.org/issue30075 """
    # VT sequences are only supported since Windows 10 build 10586
    if get_windows_version() < (10, 0, 10586):
        return

    # NOTE(review): surrounding import lines are elided in the reviewed copy;
    # ctypes/msvcrt are needed below — confirm against upstream
    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device directly so redirected stdout does not matter
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    # Record success and invalidate the cached capability check
    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    supports_terminal_sequences.cache_clear()
# Matches ANSI SGR escape sequences such as '\x1b[0;31m'
_terminal_sequences_re = re.compile(r'\x1b\[[^m]+m')


def remove_terminal_sequences(string):
    """Return *string* with ANSI terminal color/style escape sequences stripped."""
    return _terminal_sequences_re.sub('', string)
def number_of_digits(number):
    """Return the length of *number* rendered as a decimal integer (sign included)."""
    return len(str(int(number)))
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the truthy *values* (stringified) with *delim*.

    When *from_dict* is given, each value is first resolved as a traversal
    path into that dict.
    """
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(v) for v in values if v)
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    # Largest (width, height) across all formats; missing values count as 0
    max_dimensions = max(
        (tuple(format.get(k) or 0 for k in _keys) for format in formats),
        default=(0, 0))
    # No usable width → nothing to scale; return thumbnails unchanged
    if not max_dimensions[0]:
        return thumbnails
    return [
        merge_dicts(
            {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
            dict(zip(_keys, max_dimensions)), thumbnail)
        for thumbnail in thumbnails
    ]
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    # NB: the parameter shadows the builtin `range`, but renaming it would
    # break callers passing it by keyword
    if not range:
        return None, None, None
    match = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if match is None:
        return None, None, None
    start, end, total = match.groups()
    return int(start), int_or_none(end), int_or_none(total)
def read_stdin(what):
    """Prompt on stderr, then read *what* (a description for the prompt) from stdin until EOF."""
    eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin.read()
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)
    """

    # BOM marks are given priority over declarations
    for bom, enc in BOMS:
        if data.startswith(bom):
            return enc, len(bom)

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    stripped = data.replace(b'\0', b'')
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    if declaration:
        return declaration.group(1).decode(), 0
    return None, 0
# Set to True once load_configs() has run; init() asserts it is still False
__initialized = False
def __init__(self, parser, label=None):
    """Store the option parser and an optional display label for this config."""
    self.parser = parser
    self.label = label
    self._loaded_paths = set()
    self.configs = []
def init(self, args=None, filename=None):
    """Record this config's own args/filename and load it (and nested configs).

    May only be called once per instance; returns load_configs()'s result.
    """
    assert not self.__initialized
    self.own_args, self.filename = args, filename
    return self.load_configs()
def load_configs(self):
    """Parse own args and recursively load any --config-locations they name.

    Returns False if this config's file was already loaded (cycle guard),
    True otherwise.
    NOTE(review): several guard lines are elided in the reviewed copy; the
    control flow below is reconstructed — verify against upstream.
    """
    directory = ''
    if self.filename:
        location = os.path.realpath(self.filename)
        directory = os.path.dirname(location)
        if location in self._loaded_paths:
            return False  # already loaded — avoid config cycles
        self._loaded_paths.add(location)

    self.__initialized = True
    opts, _ = self.parser.parse_known_args(self.own_args)
    self.parsed_args = self.own_args
    for location in opts.config_locations or []:
        if location == '-':
            # '-' means read extra options from stdin (once)
            if location in self._loaded_paths:
                continue
            self._loaded_paths.add(location)
            self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
            continue
        # Relative locations are resolved against this config's directory
        location = os.path.join(directory, expand_path(location))
        if os.path.isdir(location):
            location = os.path.join(location, 'yt-dlp.conf')
        if not os.path.exists(location):
            self.parser.error(f'config location {location} does not exist')
        self.append_config(self.read_file(location), location)
    return True
# Compose a human-readable dump: a heading built from label/filename,
# followed by each nested config's dump indented with '| '
label = join_nonempty(
    self.label, 'config', f'"{self.filename}"' if self.filename else '',
    delim=' ')
return join_nonempty(
    self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
    *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
    delim='\n')
def read_file(filename, default=[]):
    """Read a config file and split it into an argument list (shlex rules,
    '#' comments allowed). Returns *default* if the file cannot be opened.

    Raises ValueError if the contents cannot be parsed.
    NB: the mutable default is never mutated here, so sharing it is safe.
    Fix: the parse-error message contained a literal placeholder instead of
    the offending filename; it now interpolates *filename* again.
    """
    try:
        optionf = open(filename, 'rb')
    except OSError:
        return default  # silently skip if file is not present
    try:
        enc, skip = determine_file_encoding(optionf.read(512))
        optionf.seek(skip, io.SEEK_SET)
    except OSError:
        enc = None  # silently skip read errors
    try:
        # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
        contents = optionf.read().decode(enc or preferredencoding())
        res = shlex.split(contents, comments=True)
    except Exception as err:
        raise ValueError(f'Unable to parse "{filename}": {err}')
    finally:
        optionf.close()
    return res
def hide_login_info(opts):
    """Return a copy of *opts* with credential values replaced by 'PRIVATE'.

    Handles both '--opt=value' and '--opt value' forms.
    """
    PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
    eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

    def _scrub_eq(o):
        m = eqre.match(o)
        return m.group('key') + '=PRIVATE' if m else o

    scrubbed = [_scrub_eq(o) for o in opts]
    for idx, opt in enumerate(scrubbed):
        if opt in PRIVATE_OPTS and idx + 1 < len(scrubbed):
            scrubbed[idx + 1] = 'PRIVATE'
    return scrubbed
def append_config(self, *args, label=None):
    """Create a child config sharing our loaded-path set; keep it if it loads."""
    child = type(self)(self.parser, label)
    child._loaded_paths = self._loaded_paths
    if child.init(*args):
        self.configs.append(child)
# Yield args from nested configs first (deepest last), then our own parsed args
for config in reversed(self.configs):
    yield from config.all_args
yield from self.parsed_args or []
def parse_known_args(self, **kwargs):
    """Parse the merged args (own + nested configs); returns (namespace, unrecognized)."""
    return self.parser.parse_known_args(self.all_args, **kwargs)
def parse_args(self):
    """Parse the merged args (own + nested configs), erroring on unknown options."""
    return self.parser.parse_args(self.all_args)
class WebSocketsWrapper:
    """Wraps websockets module to use in non-async scopes"""
    # The active connection protocol; None until __enter__ has run
    pool = None

    def __init__(self, url, headers=None, connect=True):
        # Each wrapper owns a private event loop driven synchronously below
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        # Ensure the connection/loop are torn down at interpreter exit
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            self.loop.close()
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run coroutine *main* to completion on *loop*, draining asyncgens/executors after."""
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel all pending tasks on *loop* and surface their unhandled exceptions."""
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for key, value in headers.items():
            merged[key.title()] = value
    return merged
def cached_method(f):
    """Cache a method"""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        # Normalize positional/keyword/default args so equivalent calls share a key;
        # drop the leading `self` from the key tuple
        bound_args = signature.bind(self, *args, **kwargs)
        bound_args.apply_defaults()
        key = tuple(bound_args.arguments.values())[1:]

        # Per-instance, per-method cache stored on the instance itself
        cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
        if key not in cache:
            cache[key] = f(self, *args, **kwargs)
        return cache[key]
    return wrapper
class classproperty:
    """property access for class methods with optional caching"""
    def __new__(cls, func=None, *args, **kwargs):
        # Called without a function (e.g. @classproperty(cache=True)):
        # return a partial that acts as the real decorator
        if not func:
            return functools.partial(cls, *args, **kwargs)
        return super().__new__(cls)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        # None disables caching; otherwise results are memoized per class
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        if self._cache is None:
            return self.func(cls)
        elif cls not in self._cache:
            self._cache[cls] = self.func(cls)
        return self._cache[cls]
class function_with_repr:
    """Callable wrapper that reports a meaningful repr (or a custom one)."""
    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func, self.__repr = func, repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        # Prefer the explicit repr passed at construction time
        if self.__repr:
            return self.__repr
        return f'{self.func.__module__}.{self.func.__qualname__}'
class Namespace(types.SimpleNamespace):
    """Immutable namespace"""

    def __iter__(self):
        # Iterating a Namespace yields its attribute VALUES
        return iter(self.__dict__.values())

    @property
    def items_(self):
        # Trailing underscore avoids clashing with a potential 'items' attribute
        return self.__dict__.items()
# Known media-related file extensions, grouped by kind; the common_* groups are
# folded into video/audio below
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All extensions yt-dlp recognizes as downloadable media/manifests
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5153 for retry in RetryManager(...):
5156 except SomeException as err:
# attempt: number of tries made so far; _error: last raised error (None until set)
attempt, _error = 0, None
def __init__(self, _retries, _error_callback, **kwargs):
    """Store the retry budget and an error callback (extra kwargs are pre-bound)."""
    callback = functools.partial(_error_callback, **kwargs)
    self.retries = _retries or 0
    self.error_callback = callback
def _should_retry(self):
    """True while an error is pending and the attempt budget is not exhausted."""
    within_budget = self.attempt <= self.retries
    return self._error is not NO_DEFAULT and within_budget
# Property getter: the pending NO_DEFAULT sentinel reads as "no error yet"
if self._error is NO_DEFAULT:
    return None
return self._error
def error(self, value):
    # Property setter: record the error raised during the current attempt
    self._error = value
# Iteration loop: reset the error sentinel, count the attempt, hand control to
# the caller's loop body, then report via the callback if an error was set
while self._should_retry():
    self.error = NO_DEFAULT
    self.attempt += 1
    yield self
    if self._error:
        self.error_callback(self.error, self.attempt, self.retries)
def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
    """Utility function for reporting retries"""
    # NOTE(review): the budget-exhausted branch is elided in the reviewed copy;
    # reconstructed below — verify against upstream
    if count > retries:
        if error:
            return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
        raise e

    if not count:
        return warn(e)
    elif isinstance(e, ExtractorError):
        # Prefer the underlying cause; strip a trailing period for clean joining
        e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
    warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

    # sleep_func may be a callable(n=attempt) or a plain number of seconds
    delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
    if delay:
        info(f'Sleeping {delay:.2f} seconds ...')
        time.sleep(delay)
def make_archive_id(ie, video_id):
    """Build the download-archive entry for *video_id* of extractor *ie* (object or key string)."""
    if isinstance(ie, str):
        ie_key = ie
    else:
        ie_key = ie.ie_key()
    return f'{ie_key.lower()} {video_id}'
def truncate_string(s, left, right=0):
    """Shorten *s* to at most left+right characters, marking the elision with '...'."""
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    tail = s[-right:] if right else ''
    return f'{s[:left - 3]}...{tail}'
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Resolve a list of option names (with aliases, '-' negation and optional
    regex matching against alias_dict['all']) into a de-duplicated ordered list."""
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for val in options:
        # Leading '-' discards instead of adds
        discard = val.startswith('-')
        if discard:
            val = val[1:]

        if val in alias_dict:
            # Discarding an alias flips the sign of each of its members
            val = alias_dict[val] if not discard else [
                i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(val, alias_dict, start=requested)
            continue

        current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
                   else [val] if val in alias_dict['all'] else None)
        if current is None:
            raise ValueError(val)

        if discard:
            for item in current:
                while item in requested:
                    requested.remove(item)
        else:
            requested.extend(current)

    return orderedSet(requested)
# Grammar of a single sort token: optional '+' (reverse), a field name, and an
# optional ':limit' (hard limit) or '~limit' (prefer closest) suffix
regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
           'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
           'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases

ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                'height', 'width', 'proto', 'vext', 'abr', 'aext',
                'fps', 'fs_approx', 'source', 'id')

# Per-field sorting configuration; see _get_field_setting for defaulting rules
settings = {
    'vcodec': {'type': 'ordered', 'regex': True,
               'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
    'acodec': {'type': 'ordered', 'regex': True,
               'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
    'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
            'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
    'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
              'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
    'vext': {'type': 'ordered', 'field': 'video_ext',
             'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
             'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
    'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
             'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
             'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
    'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
    'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                   'field': ('vcodec', 'acodec'),
                   'function': lambda it: int(any(v != 'none' for v in it))},
    'ie_pref': {'priority': True, 'type': 'extractor'},
    'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
    'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
    'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
    'quality': {'convert': 'float', 'default': -1},
    'filesize': {'convert': 'bytes'},
    'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
    'id': {'convert': 'string', 'field': 'format_id'},
    'height': {'convert': 'float_none'},
    'width': {'convert': 'float_none'},
    'fps': {'convert': 'float_none'},
    'channels': {'convert': 'float_none', 'field': 'audio_channels'},
    'tbr': {'convert': 'float_none'},
    'vbr': {'convert': 'float_none'},
    'abr': {'convert': 'float_none'},
    'asr': {'convert': 'float_none'},
    'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

    # Combined/multiple fields derived from the simple ones above
    'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
    'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
           'function': lambda it: next(filter(None, it), None)},
    'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
             'function': lambda it: next(filter(None, it), None)},
    'ext': {'type': 'combined', 'field': ('vext', 'aext')},
    'res': {'type': 'multiple', 'field': ('height', 'width'),
            'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

    # Actual field names
    'format_id': {'type': 'alias', 'field': 'id'},
    'preference': {'type': 'alias', 'field': 'ie_pref'},
    'language_preference': {'type': 'alias', 'field': 'lang'},
    'source_preference': {'type': 'alias', 'field': 'source'},
    'protocol': {'type': 'alias', 'field': 'proto'},
    'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
    'audio_channels': {'type': 'alias', 'field': 'channels'},

    # Deprecated aliases — kept for backwards compatibility, warn on use
    'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
    'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
    'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
    'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
    'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
    'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
    'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
    'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
    'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
    'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
    'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
    'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
    'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
    'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
    'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
    'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
    'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
    'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
    'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
}
def __init__(self, ydl, field_preference):
    """Build the sort order from user params plus the extractor's *field_preference*."""
    self.ydl = ydl
    self._order = []
    self.evaluate_params(self.ydl.params, field_preference)
    if ydl.params.get('verbose'):
        self.print_verbose_info(self.ydl.write_debug)
def _get_field_setting(self, field, key):
    """Look up *key* for *field* in self.settings, lazily filling in defaults.

    Unknown fields are accepted (with a deprecation warning) so arbitrary
    format keys can still be sorted on; 'forced'/'priority' default to False.
    """
    if field not in self.settings:
        if key in ('forced', 'priority'):
            return False
        self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                    'deprecated and may be removed in a future version')
        self.settings[field] = {}
    propObj = self.settings[field]
    if key not in propObj:
        # Derive a sensible default from the field's 'type' and memoize it
        type = propObj.get('type')
        if key == 'field':
            default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
        elif key == 'convert':
            default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
        else:
            default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
        propObj[key] = default
    return propObj[key]
def _resolve_field_value(self, field, value, convertNone=False):
    """Convert a raw field value according to the field's 'convert' setting.

    For 'order' fields the result is a rank (higher = earlier in the order
    list); otherwise a float/str, or None when ignored/unresolvable.
    """
    if value is None:
        if not convertNone:
            return None
    else:
        value = value.lower()
    conversion = self._get_field_setting(field, 'convert')
    if conversion == 'ignore':
        return None
    if conversion == 'string':
        return value
    elif conversion == 'float_none':
        return float_or_none(value)
    elif conversion == 'bytes':
        return parse_bytes(value)
    elif conversion == 'order':
        # prefer_free_formats may swap in an alternative ordering
        order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
        use_regex = self._get_field_setting(field, 'regex')
        list_length = len(order_list)
        empty_pos = order_list.index('') if '' in order_list else list_length + 1
        if use_regex and value is not None:
            for i, regex in enumerate(order_list):
                if regex and re.match(regex, value):
                    return list_length - i
            return list_length - empty_pos  # not in list
        else:  # not regex or value = None
            return list_length - (order_list.index(value) if value in order_list else empty_pos)
    else:
        # 'float_string': numbers sort numerically, anything else downgrades
        # the field to string comparison from now on
        if value.isnumeric():
            return float(value)
        else:
            self.settings[field]['convert'] = 'string'
            return value
def evaluate_params(self, params, sort_extractor):
    """Parse user/extractor sort strings into self._order and per-field settings."""
    self._use_free_order = params.get('prefer_free_formats', False)
    self._sort_user = params.get('format_sort', [])
    self._sort_extractor = sort_extractor

    def add_item(field, reverse, closest, limit_text):
        # Register one concrete field in the sort order (first occurrence wins)
        field = field.lower()
        if field in self._order:
            return
        self._order.append(field)
        limit = self._resolve_field_value(field, limit_text)
        data = {
            'reverse': reverse,
            'closest': False if limit is None else closest,
            'limit_text': limit_text,
            'limit': limit}
        if field in self.settings:
            self.settings[field].update(data)
        else:
            self.settings[field] = data

    # forced fields always lead; priority fields follow unless the user forced
    # their own complete order; then user prefs, extractor prefs and defaults
    sort_list = (
        tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
        + (tuple() if params.get('format_sort_force', False)
           else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
        + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

    for item in sort_list:
        match = re.match(self.regex, item)
        if match is None:
            raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
        field = match.group('field')
        if field is None:
            continue
        # Resolve aliases to their canonical field, warning on deprecated ones
        if self._get_field_setting(field, 'type') == 'alias':
            alias, field = field, self._get_field_setting(field, 'field')
            if self._get_field_setting(alias, 'deprecated'):
                self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                            f'be removed in a future version. Please use {field} instead')
        reverse = match.group('reverse') is not None
        closest = match.group('separator') == '~'
        limit_text = match.group('limit')

        has_limit = limit_text is not None
        has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
        has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

        # Combined fields expand into their constituents, each possibly with
        # its own ':'-separated limit
        fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
        limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
        limit_count = len(limits)
        for (i, f) in enumerate(fields):
            add_item(f, reverse, closest,
                     limits[i] if i < limit_count
                     else limits[0] if has_limit and not has_multiple_limits
                     else None)
, write_debug
):
5456 write_debug('Sort order given by user: %s' % ', '.join(self
._sort
_user
))
5457 if self
._sort
_extractor
:
5458 write_debug('Sort order given by extractor: %s' % ', '.join(self
._sort
_extractor
))
5459 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
5460 '+' if self
._get
_field
_setting
(field
, 'reverse') else '', field
,
5461 '%s%s(%s)' % ('~' if self
._get
_field
_setting
(field
, 'closest') else ':',
5462 self
._get
_field
_setting
(field
, 'limit_text'),
5463 self
._get
_field
_setting
(field
, 'limit'))
5464 if self
._get
_field
_setting
(field
, 'limit_text') is not None else '')
5465 for field
in self
._order
if self
._get
_field
_setting
(field
, 'visible')]))
def _calculate_field_preference_from_value(self, format, field, type, value):
    """Map one field's value to a sortable tuple (class, primary, tiebreak)."""
    reverse = self._get_field_setting(field, 'reverse')
    closest = self._get_field_setting(field, 'closest')
    limit = self._get_field_setting(field, 'limit')

    if type == 'extractor':
        # Extractor-provided preference, clamped below its configured maximum
        maximum = self._get_field_setting(field, 'max')
        if value is None or (maximum is not None and value >= maximum):
            value = -1
    elif type == 'boolean':
        in_list = self._get_field_setting(field, 'in_list')
        not_in_list = self._get_field_setting(field, 'not_in_list')
        value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
    elif type == 'ordered':
        value = self._resolve_field_value(field, value, True)

    # try to convert to number
    val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
    is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
    if is_num:
        value = val_num

    return ((-10, 0) if value is None
            else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
            else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
            else (0, value, 0) if not reverse and (limit is None or value <= limit)
            else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
            else (-1, value, 0))
def _calculate_field_preference(self, format, field):
    """Fetch *field*'s value from *format* (combining sub-fields when needed)
    and convert it to a sortable tuple."""
    type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
    get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
    if type == 'multiple':
        type = 'field'  # Only 'field' is allowed in multiple for now
        actual_fields = self._get_field_setting(field, 'field')

        # The field's 'function' reduces the constituent values to one
        value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
    else:
        value = get_value(field)
    return self._calculate_field_preference_from_value(format, field, type, value)
def calculate_preference(self, format):
    """Fill in derivable format fields in-place, then return the sort key tuple."""
    # Determine missing protocol
    if not format.get('protocol'):
        format['protocol'] = determine_protocol(format)

    # Determine missing ext
    if not format.get('ext') and 'url' in format:
        format['ext'] = determine_ext(format['url'])
    if format.get('vcodec') == 'none':
        format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
        format['video_ext'] = 'none'
    else:
        format['video_ext'] = format['ext']
        format['audio_ext'] = 'none'
    # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
    #    format['preference'] = -1000

    if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
        # HEVC-over-FLV is out-of-spec by FLV's original spec
        # ref. https://trac.ffmpeg.org/ticket/6389
        # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
        format['preference'] = -100

    # Determine missing bitrates
    if format.get('vcodec') == 'none':
        format['vbr'] = 0
    if format.get('acodec') == 'none':
        format['abr'] = 0
    if not format.get('vbr') and format.get('vcodec') != 'none':
        format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
    if not format.get('abr') and format.get('acodec') != 'none':
        format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
    if not format.get('tbr'):
        format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

    return tuple(self._calculate_field_preference(format, field) for field in self._order)
def __init__(self, ydl=None):
    # A None ydl makes every logging method a silent no-op
    self._ydl = ydl

def debug(self, message):
    if self._ydl:
        self._ydl.write_debug(message)

def info(self, message):
    if self._ydl:
        self._ydl.to_screen(message)

def warning(self, message, *, once=False):
    # once=True suppresses repeats of the same warning
    if self._ydl:
        self._ydl.report_warning(message, only_once=once)

def error(self, message, *, is_error=True):
    if self._ydl:
        self._ydl.report_error(message, is_error=is_error)

def stdout(self, message):
    if self._ydl:
        self._ydl.to_stdout(message)

def stderr(self, message):
    if self._ydl:
        self._ydl.to_stderr(message)