import base64
import calendar
import collections
import contextlib
import datetime
import email.utils
import errno
import gzip
import html.entities
import html.parser
import http.client
import http.cookiejar
import io
import itertools
import json
import locale
import os
import platform
import random
import re
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import unicodedata
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket

has_certifi = bool(certifi)

def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)

# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)

SUPPORTED_ENCODINGS = [
    'gzip', 'deflate',
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}

USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}

NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M:%S',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'

def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref

def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise

def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

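# Example:
#   find_xpath_attr(compat_etree_fromstring('<a><b id="x"/></a>'), './/b', 'id', 'x')
#   returns the matching <b> element; without a match, None is returned
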
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n

def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text

def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]

def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)

def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None

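# Example:
#   get_element_by_class('foo', '<div class="foo bar">text</div>')       -> 'text'
#   get_element_html_by_class('foo', '<div class="foo bar">text</div>')  -> '<div class="foo bar">text</div>'
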
def get_elements_by_class(class_name, html, **kwargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)

def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]

def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )

class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()

def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc

    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')

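# Example:
#   get_element_text_and_html_by_tag('b', '<p><b>bold</b></p>')
#   -> ('bold', '<b>bold</b>')
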
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1

def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs

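# Example:
#   extract_attributes('<a href="#" data-id=1>')  -> {'href': '#', 'data-id': '1'}
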
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a dictionary of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items

def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            return self.raw_decode(s.lstrip())[0]
        return super().decode(s)

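# Example:
#   json.loads('{"a": 1} trailing junk', cls=LenientJSONDecoder, ignore_extra=True)  -> {'a': 1}
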
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise

def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp

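# Example:
#   timeconvert('Fri, 01 Jan 2021 00:00:00 +0000')  -> 1609459200
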
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    if restricted and is_id is NO_DEFAULT:
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

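# Examples:
#   sanitize_filename('AT&T', restricted=True)  -> 'AT_T'
#   sanitize_filename('12:34')                  -> '12_34'  (timestamps keep their digits)
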
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)

def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url

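# Examples:
#   sanitize_url('//example.com/x')    -> 'http://example.com/x'
#   sanitize_url('rmtp://host/stream') -> 'rtmp://host/stream'
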
def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'

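# Example:
#   extract_basic_auth('http://user:pass@example.com/x')
#   -> ('http://example.com/x', 'Basic dXNlcjpwYXNz')
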
def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)

765 """Expand shell variables and ~"""
766 return os
.path
.expandvars(compat_expanduser(s
))
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())

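# Example:
#   orderedSet([1, 2, 1, 3, 2])  -> [1, 2, 3]
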
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return s
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)

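# Examples:
#   unescapeHTML('&amp;')  -> '&'
#   unescapeHTML('&#x27;') -> "'"
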
def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )

def process_communicate_or_kill(p, *args, **kwargs):
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)

class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode

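# Example (illustrative; assumes `ffmpeg` is on PATH):
#   stdout, stderr, returncode = Popen.run(
#       ['ffmpeg', '-version'], text=True, stdout=subprocess.PIPE)
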
def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding

def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval

_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)

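# Example:
#   timetuple_from_msec(90061250)
#   -> Time(hours=25, minutes=1, seconds=1, milliseconds=250)
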
def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret

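# Examples:
#   formatSeconds(3661)             -> '1:01:01'
#   formatSeconds(61.5, msec=True)  -> '1:01.500'
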
def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)

def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)

def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg

class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)

network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)

class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url

class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass

class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries

class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)

class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info

class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'

class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)

class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'

class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)

class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)

class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected

class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'

class XAttrUnavailableError(YoutubeDLError):
    pass

def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc

def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers

class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response

def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection

class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise

class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True

class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return urllib.request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response

class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return urllib.request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)

def extract_timezone(date_str):
    m = re.search(r'''(?x)
        ^.{8,}?                                          # >=8 char non-TZ prefix, if present
        (?P<tz>Z|                                        # just the UTC Z, or
            (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|               # preceded by 4 digits or hh:mm or
               (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
            [ ]?                                         # optional space
            (?P<sign>\+|-)                               # +/-
            (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})   # hh[:]mm
        $)
    ''', date_str)
    if not m:
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            date_str = date_str[:-len(m.group('tz'))]
        timezone = datetime.timedelta(hours=timezone or 0)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str

def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())

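# Example:
#   parse_iso8601('2021-01-01T00:00:00+01:00')  -> 1609455600
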
def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST

def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return str(upload_date)

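# Example:
#   unified_strdate('Fri, 01 Jan 2021 12:00:00 GMT')  -> '20210101'
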
def unified_timestamp(date_str, day_first=True):
    if date_str is None:
        return None

    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()

def determine_ext(url, default_ext='unknown_video'):
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext

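# Example:
#   determine_ext('http://example.com/foo/bar.mp4?download=1')  -> 'mp4'
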
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)

)
1809 def datetime_from_str(date_str
, precision
='auto', format
='%Y%m%d'):
1811 Return a datetime object from a string.
1813 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1815 @param format strftime format of DATE
1816 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1817 auto: round to the unit provided in date_str (if applicable).
1819 auto_precision
= False
1820 if precision
== 'auto':
1821 auto_precision
= True
1822 precision
= 'microsecond'
1823 today
= datetime_round(datetime
.datetime
.utcnow(), precision
)
1824 if date_str
in ('now', 'today'):
1826 if date_str
== 'yesterday':
1827 return today
- datetime
.timedelta(days
=1)
1829 r
'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1831 if match
is not None:
1832 start_time
= datetime_from_str(match
.group('start'), precision
, format
)
1833 time
= int(match
.group('time')) * (-1 if match
.group('sign') == '-' else 1)
1834 unit
= match
.group('unit')
1835 if unit
== 'month' or unit
== 'year':
1836 new_date
= datetime_add_months(start_time
, time
* 12 if unit
== 'year' else time
)
1842 delta
= datetime
.timedelta(**{unit + 's': time}
)
1843 new_date
= start_time
+ delta
1845 return datetime_round(new_date
, unit
)
1848 return datetime_round(datetime
.datetime
.strptime(date_str
, format
), precision
)
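# Example:
#   datetime_from_str('now-1day', precision='day')
#   -> midnight (UTC) of the previous day
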
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
        raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()

def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months."""
    month = dt.month + months - 1
    year = dt.year + month // 12
    month = month % 12 + 1
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)

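# Example (clamps the day to the target month's length):
#   datetime_add_months(datetime.datetime(2021, 1, 31), 1)
#   -> datetime.datetime(2021, 2, 28, 0, 0)
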
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    roundto = lambda x, n: ((x + n / 2) // n) * n
    timestamp = calendar.timegm(dt.timetuple())
    return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))

def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str

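# Example:
#   hyphenate_date('20210101')  -> '2021-01-01'
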
1901 """Represents a time interval between two dates"""
1903 def __init__(self
, start
=None, end
=None):
1904 """start and end must be strings in the format accepted by date"""
1905 if start
is not None:
1906 self
.start
= date_from_str(start
, strict
=True)
1908 self
.start
= datetime
.datetime
.min.date()
1910 self
.end
= date_from_str(end
, strict
=True)
1912 self
.end
= datetime
.datetime
.max.date()
1913 if self
.start
> self
.end
:
1914 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
1918 """Returns a range that only contains the given day"""
1919 return cls(day
, day
)
1921 def __contains__(self
, date
):
1922 """Check if the date is in the range"""
1923 if not isinstance(date
, datetime
.date
):
1924 date
= date_from_str(date
)
1925 return self
.start
<= date
<= self
.end
1928 return f
'{self.start.isoformat()} - {self.end.isoformat()}'
1930 def __eq__(self
, other
):
1931 return (isinstance(other
, DateRange
)
1932 and self
.start
== other
.start
and self
.end
== other
.end
)
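# Example:
#   '20210115' in DateRange('20210101', '20210131')  -> True
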
def platform_name():
    """ Returns the platform name as a str """
    write_string('DeprecationWarning: yt_dlp.utils.platform_name is deprecated, use platform.platform instead')
    return platform.platform()

def system_identifier():
    python_implementation = platform.python_implementation()
    if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    return 'Python %s (%s %s) - %s %s' % (
        platform.python_version(),
        python_implementation,
        platform.architecture()[0],
        platform.platform(),
        format_field(join_nonempty(*platform.libc_ver(), delim=' '), None, '(%s)'),
    )

1957 def get_windows_version():
1958 ''' Get Windows version. returns () if it's not running on Windows '''
1959 if compat_os_name
== 'nt':
1960 return version_tuple(platform
.win32_ver()[1])
def write_string(s, out=None, encoding=None):
    assert isinstance(s, str)
    out = out or sys.stderr

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    enc, buffer = None, out
    if 'b' in getattr(out, 'mode', ''):
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        buffer = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    buffer.write(s.encode(enc, 'ignore') if enc else s)
    buffer.flush()


def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return struct.pack('%dB' % len(xs), *xs)


class LockingUnsupportedError(OSError):
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
class locked_file:
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'


def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)


def format_bytes(bytes):
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
def lookup_unit_table(unit_table, s):
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)


def parse_filesize(s):
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
def parse_count(s):
    if s is None:
        return None

    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    ret = lookup_unit_table(_UNIT_TABLE, s)
    if ret is not None:
        return ret

    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    if mobj:
        return str_to_int(mobj.group(1))
def parse_resolution(s, *, lenient=False):
    if s is None:
        return {}

    if lenient:
        mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
    else:
        mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if mobj:
        return {
            'width': int(mobj.group('w')),
            'height': int(mobj.group('h')),
        }

    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        return {'height': int(mobj.group(1)) * 540}

    return {}
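
# Illustrative usage of parse_resolution (added examples):
#   parse_resolution('1920x1080')  # -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')       # -> {'height': 720}
#   parse_resolution('4k')         # -> {'height': 2160} (each "k" counts as 540 lines)
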
def parse_bitrate(s):
    if not isinstance(s, str):
        return
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    if mobj:
        return int(mobj.group(1))


def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        return month_names.index(name) + 1
    except ValueError:
        return None


def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None


def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode()
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    return s[len(start):] if s is not None and s.startswith(start) else s


def remove_end(s, end):
    return s[:-len(end)] if s is not None and s.endswith(end) else s


def remove_quotes(s):
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s


def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
def url_basename(url):
    path = urllib.parse.urlparse(url).path
    return path.strip('/').split('/')[-1]


def base_url(url):
    return re.match(r'https?://[^?#&]+/', url).group()


def urljoin(base, path):
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, str) or not path:
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
class HEADRequest(urllib.request.Request):
    def get_method(self):
        return 'HEAD'


class PUTRequest(urllib.request.Request):
    def get_method(self):
        return 'PUT'


def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default


def str_or_none(v, default=None):
    return default if v is None else str(v)


def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    elif isinstance(int_str, str):
        int_str = re.sub(r'[,\.\+]', '', int_str)
        return int_or_none(int_str)


def float_or_none(v, scale=1, invscale=1, default=None):
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default


def bool_or_none(v, default=None):
    return v if isinstance(v, bool) else default


def strip_or_none(v, default=None):
    return v.strip() if isinstance(v, str) else default


def url_or_none(url):
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None


def request_to_url(req):
    if isinstance(req, urllib.request.Request):
        return req.get_full_url()
    else:
        return req
def strftime_or_none(timestamp, date_format, default=None):
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
def parse_duration(s):
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    if ms:
        ms = ms.replace(':', '.')
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
def prepend_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)

    return (
        f'{name}.{ext}{real_ext}'
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else f'{filename}.{ext}')


def replace_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return '{}.{}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)


def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    return exe
def _get_exe_version_output(exe, args, *, to_screen=None):
    if to_screen:
        to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
                                 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return stdout


def detect_exe_version(output, version_re=None, unrecognized='present'):
    assert isinstance(output, str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized


def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    out = _get_exe_version_output(exe, args)
    return detect_exe_version(out, version_re, unrecognized) if out else False
def frange(start=0, stop=None, step=1):
    """Float range"""
    if stop is None:
        start, stop = 0, start
    sign = [-1, 1][step > 0] if step else 0
    while sign * start < sign * stop:
        yield start
        start += step
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        return None if x is None else ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
class PagedList:
    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        page_results = self._cache.get(pagenum)
        if page_results is None:
            page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]
):
2789 """Download pages until a page with less than maximum results"""
2791 def _getslice(self
, start
, end
):
2792 for pagenum
in itertools
.count(start
// self
._pagesize
):
2793 firstid
= pagenum
* self
._pagesize
2794 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
2795 if start
>= nextfirstid
:
2799 start
% self
._pagesize
2800 if firstid
<= start
< nextfirstid
2803 ((end
- 1) % self
._pagesize
) + 1
2804 if (end
is not None and firstid
<= end
<= nextfirstid
)
2808 page_results
= self
.getpage(pagenum
)
2810 self
._pagecount
= pagenum
- 1
2812 if startv
!= 0 or endv
is not None:
2813 page_results
= page_results
[startv
:endv
]
2814 yield from page_results
2816 # A little optimization - if current page is not "full", ie. does
2817 # not contain page_size videos then we can assume that this page
2818 # is the last one - there are no more ids on further pages -
2819 # i.e. no need to query again.
2820 if len(page_results
) + startv
< self
._pagesize
:
2823 # If we got the whole page, but the next page is not interesting,
2824 # break out early as well
2825 if end
== nextfirstid
:
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break
            yield from page_results
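
# Illustrative usage of the paged lists above (hypothetical pagefunc): pages are
# fetched lazily, and a short page ends an OnDemandPagedList early:
#   pl = OnDemandPagedList(lambda n: range(n * 3, min((n + 1) * 3, 8)), 3)
#   pl.getslice(2, 6)  # fetches pages 0 and 1 only -> [2, 3, 4, 5]
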
class PlaylistEntries:
    MissingEntry = object()
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = bool(requested_entries)
        if self.is_incomplete:
            assert self.is_exhausted
            self._entries = [self.MissingEntry] * max(requested_entries)
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # TODO: Add auto-generated fields
                    self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
def uppercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)


def lowercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)


def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")


def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = urllib.parse.urlparse(url)
    return url_parsed._replace(
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()


def parse_qs(url):
    return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
def read_batch_urls(batch_fd):
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', url, 1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]


def urlencode_postdata(*args, **kargs):
    return urllib.parse.urlencode(*args, **kargs).encode('ascii')
def update_url_query(url, query):
    if not query:
        return url
    parsed_url = urllib.parse.urlparse(url)
    qs = urllib.parse.parse_qs(parsed_url.query)
    qs.update(query)
    return urllib.parse.urlunparse(parsed_url._replace(
        query=urllib.parse.urlencode(qs, True)))


def update_Request(req, url=None, data=None, headers=None, query=None):
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
, boundary
):
3080 content_type
= 'multipart/form-data; boundary=%s' % boundary
3083 for k
, v
in data
.items():
3084 out
+= b
'--' + boundary
.encode('ascii') + b
'\r\n'
3085 if isinstance(k
, str):
3087 if isinstance(v
, str):
3089 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3090 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3091 content
= b
'Content-Disposition: form-data; name="' + k
+ b
'"\r\n\r\n' + v
+ b
'\r\n'
3092 if boundary
.encode('ascii') in content
:
3093 raise ValueError('Boundary overlaps with data')
3096 out
+= b
'--' + boundary
.encode('ascii') + b
'--\r\n'
3098 return out
, content_type
3101 def multipart_encode(data
, boundary
=None):
3103 Encode a dict to RFC 7578-compliant form-data
3106 A dict where keys and values can be either Unicode or bytes-like
3109 If specified a Unicode object, it's used as the boundary. Otherwise
3110 a random boundary is generated.
3112 Reference: https://tools.ietf.org/html/rfc7578
3114 has_specified_boundary
= boundary
is not None
3117 if boundary
is None:
3118 boundary
= '---------------' + str(random
.randrange(0x0fffffff, 0xffffffff))
3121 out
, content_type
= _multipart_encode_impl(data
, boundary
)
3124 if has_specified_boundary
:
3128 return out
, content_type
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    for val in map(d.get, variadic(key_or_keys)):
        if val is not None and (val or not skip_false_values):
            return val
    return default


def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    for f in funcs:
        try:
            val = f(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
            pass
        else:
            if expected_type is None or isinstance(val, expected_type):
                return val


def try_get(src, getter, expected_type=None):
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)


def filter_dict(dct, cndn=lambda _, v: v is not None):
    return {k: v for k, v in dct.items() if cndn(k, v)}


def merge_dicts(*dicts):
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if (v is not None and k not in merged
                    or isinstance(v, str) and merged[k] == ''):
                merged[k] = v
    return merged


def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    return string if isinstance(string, str) else str(string, encoding, errors)
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC-17': 18,
}


TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}


def parse_age_limit(s):
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    elif not isinstance(s, str):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if m:
        return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
    return None


def strip_jsonp(code):
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
def js_to_json(code, vars={}, *, strict=False):
    # vars is a dict of var, val pairs to substitute
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ''

        if v[0] in ("'", '"'):
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    return '"%d":' % i if v.endswith(':') else '%d' % i

            if v in vars:
                return json.dumps(vars[v])
            if strict:
                raise ValueError(f'Unknown value: {v}')

        return f'"{v}"'

    def create_map(mobj):
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q


POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s


def version_tuple(v):
    return tuple(int(e) for e in re.split(r'[-.]', v))


def is_outdated_version(version, limit, assume_new=True):
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new


def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    from .update import is_non_updateable

    return not is_non_updateable()


def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(compat_shlex_quote(a) for a in args)


def error_to_compat_str(err):
    return str(err)


def error_to_str(err):
    return f'{type(err).__name__}: {err}'
def mimetype2ext(mt):
    if mt is None:
        return None

    mt, _, params = mt.partition(';')
    mt = mt.strip()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }

    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    _, _, subtype = mt.rpartition('/')
    ext = SUBTYPE_MAP.get(subtype.lower())
    if ext is not None:
        return ext

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    _, _, suffix = subtype.partition('+')
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    return subtype.replace('+', '.')


def ext2mimetype(ext_or_url):
    if not ext_or_url:
        return None
    if '.' not in ext_or_url:
        ext_or_url = f'file.{ext_or_url}'
    return mimetypes.guess_type(ext_or_url)[0]
def parse_codecs(codecs_str):
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if vcodec:
                continue
            vcodec = full_codec
            if parts[0] in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
                          'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            acodec = acodec or full_codec
        elif parts[0] in ('stpp', 'wvtt'):
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a',  # fourcc (m3u8, mpd)
            'h264', 'aacl',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
    vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)

    for ext in preferences or COMPATIBLE_CODECS.keys():
        codec_set = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
            return ext

    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm'},
    )
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext} or any(
                ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
def urlhandle_detect_ext(url_handle):
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))


def encode_data_uri(data, mime_type):
    return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))


def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit


# List of known byte-order-marks (BOM)
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    encoding = 'utf-8'
    for bom, enc in BOMS:
        while first_bytes.startswith(bom):
            encoding, first_bytes = enc, first_bytes[len(bom):]

    return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))


def determine_protocol(info_dict):
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    elif ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned """
    def width(string):
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
def _match_one(filter_part, dct, incomplete):
    # TODO: Generalize code with YoutubeDL._build_format_filter
    STRING_OPERATORS = {
        '*=': operator.contains,
        '^=': lambda attr, value: attr.startswith(value),
        '$=': lambda attr, value: attr.endswith(value),
        '~=': lambda attr, value: re.search(value, attr),
    }
    COMPARISON_OPERATORS = {
        **STRING_OPERATORS,
        '<=': operator.le,  # "<=" must be defined above "<"
        '<': operator.lt,
        '>=': operator.ge,
        '>': operator.gt,
        '=': operator.eq,
    }

    if isinstance(incomplete, bool):
        is_incomplete = lambda _: incomplete
    else:
        is_incomplete = lambda k: k in incomplete

    operator_rex = re.compile(r'''(?x)
        (?P<key>[a-z_]+)
        \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
            (?P<strval>.+?)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.fullmatch(filter_part.strip())
    if m:
        m = m.groupdict()
        unnegated_op = COMPARISON_OPERATORS[m['op']]
        if m['negation']:
            op = lambda attr, value: not unnegated_op(attr, value)
        else:
            op = unnegated_op
        comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
        if m['quote']:
            comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
        actual_value = dct.get(m['key'])
        numeric_comparison = None
        if isinstance(actual_value, (int, float)):
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/ytdl-org/youtube-dl/issues/11082)
            try:
                numeric_comparison = int(comparison_value)
            except ValueError:
                numeric_comparison = parse_filesize(comparison_value)
                if numeric_comparison is None:
                    numeric_comparison = parse_filesize(f'{comparison_value}B')
                if numeric_comparison is None:
                    numeric_comparison = parse_duration(comparison_value)
        if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
            raise ValueError('Operator %s only supports string values!' % m['op'])
        if actual_value is None:
            return is_incomplete(m['key']) or m['none_inclusive']
        return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.fullmatch(filter_part.strip())
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if is_incomplete(m.group('key')) and actual_value is None:
            return True
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    return all(
        _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
        for filter_part in re.split(r'(?<!\\)&', filter_str))
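
# Illustrative usage of match_str (added examples): '&' separates conditions and
# unary '!' tests absence/falseness, as implemented by _match_one above:
#   match_str('duration > 60 & like_count >= 100',
#             {'duration': 90, 'like_count': 150})  # -> True
#   match_str('!is_live', {'is_live': False})       # -> True
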
def match_filter_func(filters):
    if not filters:
        return None
    filters = set(variadic(filters))

    interactive = '-' in filters
    if interactive:
        filters.remove('-')

    def _match_func(info_dict, incomplete=False):
        if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
            return NO_DEFAULT if interactive and not incomplete else None
        else:
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
class download_range_func:
    def __init__(self, chapters, ranges):
        self.chapters, self.ranges = chapters, ranges

    def __call__(self, info_dict, ydl):
        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        for regex in self.chapters or []:
            for i, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': i}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])

    def __eq__(self, other):
        return (isinstance(other, download_range_func)
                and self.chapters == other.chapters and self.ranges == other.ranges)


def parse_dfxp_time_expr(time_expr):
    if not time_expr:
        return

    mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))


def srt_subtitles_timecode(seconds):
    return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)


def ass_subtitles_timecode(seconds):
    time = timetuple_from_msec(seconds * 1000)
    return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration',
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    class TTMLPElementParser:
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
def cli_option(params, command_option, param, separator=None):
    param = params.get(param)
    return ([] if param is None
            else [command_option, str(param)] if separator is None
            else [f'{command_option}{separator}{param}'])


def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    param = params.get(param)
    assert param in (True, False, None)
    return cli_option({True: true_value, False: false_value}, command_option, param, separator)


def cli_valueless_option(params, command_option, param, expected_value=True):
    return [command_option] if params.get(param) == expected_value else []


def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        else:
            argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        arg_list = list(filter(
            lambda x: x is not None,
            [argdict.get(key.lower()) for key in variadic(key_list)]))
        if arg_list:
            return [arg for args in arg_list for arg in args]
    return default


def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    main_key, exe = main_key.lower(), exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    keys = [f'{root_key}{k}' for k in (keys or [''])]
    if root_key in keys:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    else:
        use_compat = False
    return cli_configuration_args(argdict, keys, default, use_compat)
class ISO639Utils:
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        # … (abridged)
        'iw': 'heb',  # Replaced by he in 1989 revision
        'in': 'ind',  # Replaced by id in 1989 revision
        'ji': 'yid',  # Replaced by yi in 1989 revision
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
class ISO3166Utils:
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        # … (abridged)
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())


class GeoUtils:
    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'GA': '41.158.0.0/15',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        addr, preflen = block.split('/')
        addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return str(socket.inet_ntoa(
            struct.pack('!L', random.randint(addr_min, addr_max))))
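
    # Illustrative usage (editor's addition, not part of the original module):
    # a minimal sketch of drawing a random address either from a country's
    # block or from an explicit CIDR block. The result is random, so no exact
    # output is asserted.
    # >>> GeoUtils.random_ipv4('DE')            # some address in Germany's block
    # >>> GeoUtils.random_ipv4('1.128.0.0/11')  # some address in the given block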


class PerRequestProxyHandler(urllib.request.ProxyHandler):
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        urllib.request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do the actual wrapping of the socket with SOCKS
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, type)


# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
# released into Public Domain
# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387

def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # after much testing, this algorithm was deemed to be the fastest
    s = b''
    n = int(n)
    while n > 0:
        s = struct.pack('>I', n & 0xffffffff) + s
        n = n >> 32
    # strip off leading zeros
    for i in range(len(s)):
        if s[i] != b'\000'[0]:
            break
    else:
        # only happens when n == 0
        s = b'\000'
        i = 0
    s = s[i:]
    # add back some pad bytes. this could be done more efficiently w.r.t. the
    # de-padding being done above, but sigh...
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s


def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    acc = 0
    length = len(s)
    if length % 4:
        extra = (4 - length % 4)
        s = b'\000' * extra + s
        length = length + extra
    for i in range(0, length, 4):
        acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
    return acc
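
# Illustrative round trip (editor's addition): the two helpers are inverses of
# each other, modulo front padding.
# >>> long_to_bytes(65537)
# b'\x01\x00\x01'
# >>> bytes_to_long(b'\x01\x00\x01')
# 65537
# >>> long_to_bytes(65537, blocksize=4)
# b'\x00\x01\x00\x01'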


def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted


def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data        input data
    @param {int} length        target length
    @returns {int[]}           padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The padding string must consist of nonzero octets (RFC 8017, EME-PKCS1-v1_5),
    # so draw from 1..255 rather than 0..254
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
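
# Illustrative layout (editor's addition): for a 128-byte target and 13 bytes of
# data, the result is [0x00, 0x02, <112 nonzero random bytes>, 0x00, <13 data bytes>].
# >>> padded = pkcs1pad(list(b'hello - data!'), 128)
# >>> len(padded), padded[0], padded[1]
# (128, 0, 2)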


def _base_n_table(n, table):
    if not table and not n:
        raise ValueError('Either table or n must be specified')
    table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]

    if n and n != len(table):
        raise ValueError(f'base {n} exceeds table length {len(table)}')
    return table


def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string"""
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    result, base = '', len(table)
    while num:
        result = table[num % base] + result
        num = num // base
    return result


def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    table = {char: index for index, char in enumerate(_base_n_table(n, table))}
    result, base = 0, len(table)
    for char in string:
        result = result * base + table[char]
    return result
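
# Illustrative round trip (editor's addition), using the default table:
# >>> encode_base_n(123456, 36)
# '2n9c'
# >>> decode_base_n('2n9c', 36)
# 123456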


def decode_base(value, digits):
    write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
                 'and may be removed in a future version. Use yt_dlp.decode_base_n instead')
    return decode_base_n(value, table=digits)


def decode_packed_codes(code):
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)


def caesar(s, alphabet, shift):
    if shift == 0:
        return s
    l = len(alphabet)
    return ''.join(
        alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
        for c in s)


def rot47(s):
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)


def parse_m3u8_attributes(attrib):
    info = {}
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]
        info[key] = val
    return info


def urshift(val, n):
    return val >> n if val >= 0 else (val + 0x100000000) >> n
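
# Illustrative usage (editor's addition): an unsigned 32-bit right shift, i.e.
# JavaScript's `>>>` operator, applied to a negative Python int.
# >>> urshift(-1, 4)
# 268435455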


# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise OSError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]

    chunks = []

    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data,
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise OSError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels


def write_xattr(path, key, value):
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)


def random_birthday(year_field, month_field, day_field):
    start_date = datetime.date(1950, 1, 1)
    end_date = datetime.date(1995, 12, 31)
    offset = random.randint(0, (end_date - start_date).days)
    random_date = start_date + datetime.timedelta(offset)
    return {
        year_field: str(random_date.year),
        month_field: str(random_date.month),
        day_field: str(random_date.day),
    }
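
# Illustrative usage (editor's addition): the field names are caller-chosen
# keys, e.g. for filling an age-gate form; the values differ on every call.
# >>> random_birthday('birth_year', 'birth_month', 'birth_day')
# {'birth_year': '1987', 'birth_month': '6', 'birth_day': '21'}  # example only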


# Templates for internet shortcut files, which are plain text files.
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}


def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
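
# Illustrative usage (editor's addition): Unicode hostnames become punycode and
# non-ASCII path characters are UTF-8 percent-encoded. The exact output below is
# the editor's own computation, not taken from the original source.
# >>> iri_to_uri('http://müller.example/straße')
# 'http://xn--mller-kva.example/stra%C3%9Fe'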


def to_high_limit_path(path):
    if sys.platform in ['win32', 'cygwin']:
        # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
        return '\\\\?\\' + os.path.abspath(path)

    return path


def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    val = traverse_obj(obj, *variadic(field))
    if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
        return default
    return template % func(val)
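
# Illustrative usage (editor's addition):
# >>> format_field({'width': 1280}, 'width', '%dpx')
# '1280px'
# >>> format_field({}, 'width', '%dpx', default='unknown')
# 'unknown'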


def clean_podcast_url(url):
    return re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/''', '', url)


_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')


def make_dir(path, to_screen=None):
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except OSError as err:
        if callable(to_screen):  # originally `callable(to_screen) is not None`, which is always truthy
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False


def get_executable_path():
    from .update import _get_variant_and_executable_path

    return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))


def load_plugins(name, suffix, namespace):
    classes = {}
    with contextlib.suppress(FileNotFoundError):
        plugins_spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        plugins = importlib.util.module_from_spec(plugins_spec)
        sys.modules[plugins_spec.name] = plugins
        plugins_spec.loader.exec_module(plugins)
        for name in dir(plugins):
            if name in namespace:
                continue
            if not name.endswith(suffix):
                continue
            klass = getattr(plugins, name)
            classes[name] = namespace[name] = klass
    return classes


def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a:
                              - None:     Do nothing
                              - string:   A dictionary key
                              - int:      An index into a list
                              - tuple:    A list of keys all of which will be traversed
                              - Ellipsis: Fetch all values in the object
                              - Function: Takes the key and value as arguments
                                          and returns whether the key matches or not
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    '''
    if not casesense:
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...

            if key is ...:
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    obj = str(obj)
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                with contextlib.suppress(IndexError):
                    obj = obj[key]
        return obj

    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    else:
        type_test = expected_type or IDENTITY

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default


def traverse_dict(dictn, keys, casesense=True):
    write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)


def get_first(obj, keys, **kwargs):
    return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)


def variadic(x, allowed_types=(str, bytes, dict)):
    return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
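
# Illustrative usage (editor's addition): scalars and "atomic" iterables
# (str, bytes, dict by default) are wrapped in a tuple; other iterables pass
# through unchanged.
# >>> variadic('spam')
# ('spam',)
# >>> variadic(['spam', 'eggs'])
# ['spam', 'eggs']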


def time_seconds(**kwargs):
    t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
    return t.timestamp()


# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
    return token


# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    header_b64, payload_b64, signature_b64 = jwt.split('.')
    # urlsafe_b64decode requires padding; restore any '=' stripped by compact JWS
    payload_data = json.loads(base64.urlsafe_b64decode(payload_b64 + '=' * (-len(payload_b64) % 4)))
    return payload_data


WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


@functools.cache
def supports_terminal_sequences(stream):
    if compat_os_name == 'nt':
        if not WINDOWS_VT_MODE:
            return False
    elif not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False


def windows_enable_vt_mode():  # TODO: Do this the proper way https://bugs.python.org/issue30075
    if get_windows_version() < (10, 0, 10586):
        return
    global WINDOWS_VT_MODE
    try:
        Popen.run('', shell=True)
    except Exception:
        return

    WINDOWS_VT_MODE = True
    supports_terminal_sequences.cache_clear()


_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    return _terminal_sequences_re.sub('', string)


def number_of_digits(number):
    return len('%d' % number)


def join_nonempty(*values, delim='-', from_dict=None):
    if from_dict is not None:
        values = (traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(map(str, filter(None, values)))
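
# Illustrative usage (editor's addition): falsy members are dropped before joining.
# >>> join_nonempty('1080p', None, '', 'mp4', delim='.')
# '1080p.mp4'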


def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    max_dimensions = max(
        (tuple(format.get(k) or 0 for k in _keys) for format in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        return thumbnails
    return [
        merge_dicts(
            {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
            dict(zip(_keys, max_dimensions)), thumbnail)
        for thumbnail in thumbnails
    ]


def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    if not range:
        return None, None, None
    crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if not crg:
        return None, None, None
    return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
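
# Illustrative usage (editor's addition): works for both the request ("Range")
# and response ("Content-Range") forms of the header.
# >>> parse_http_range('bytes=500-')
# (500, None, None)
# >>> parse_http_range('bytes 0-499/1234')
# (0, 499, 1234)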


def read_stdin(what):
    eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin.read()


def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)
    """

    # BOM marks are given priority over declarations
    for bom, enc in BOMS:
        if data.startswith(bom):
            return enc, len(bom)

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    data = data.replace(b'\0', b'')
    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
    return mobj.group(1).decode() if mobj else None, 0


class Config:
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)


class WebSocketsWrapper():
    """Wraps websockets module to use in non-async scopes"""
    pool = None

    def __init__(self, url, headers=None, connect=True):
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            self.loop.close()
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: if a new library that uses asyncio needs to be run in non-async code, move these functions out of this class
    @staticmethod
    def run_with_loop(main, loop):
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })


def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
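
# Illustrative usage (editor's addition): keys are normalized via str.title(),
# so differently-cased duplicates collapse and the last dict wins.
# >>> merge_headers({'user-agent': 'A', 'Accept': '*/*'}, {'User-Agent': 'B'})
# {'User-Agent': 'B', 'Accept': '*/*'}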


def cached_method(f):
    """Cache a method"""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        bound_args = signature.bind(self, *args, **kwargs)
        bound_args.apply_defaults()
        key = tuple(bound_args.arguments.values())

        if not hasattr(self, '__cached_method__cache'):
            self.__cached_method__cache = {}
        cache = self.__cached_method__cache.setdefault(f.__name__, {})
        if key not in cache:
            cache[key] = f(self, *args, **kwargs)
        return cache[key]
    return wrapper


class classproperty:
    """property access for class methods"""

    def __init__(self, func):
        functools.update_wrapper(self, func)
        self.func = func

    def __get__(self, _, cls):
        return self.func(cls)


class Namespace(types.SimpleNamespace):
    """Immutable namespace"""

    def __iter__(self):
        return iter(self.__dict__.values())

    @property
    def items_(self):
        return self.__dict__.items()


MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)


class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            if self._error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries"""
        if count > retries:
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)


def make_archive_id(ie, video_id):
    ie_key = ie if isinstance(ie, str) else ie.ie_key()
    return f'{ie_key.lower()} {video_id}'


def truncate_string(s, left, right=0):
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    # Note: a bare `s[-right:]` would return the whole string when right == 0
    return f'{s[:left - 3]}...{s[-right:] if right else ""}'
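
# Illustrative usage (editor's addition):
# >>> truncate_string('abcdefghij', 7, 2)
# 'abcd...ij'
# >>> truncate_string('abcdefghij', 7)
# 'abcd...'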


has_certifi = bool(certifi)
has_websockets = bool(websockets)