import xml.etree.ElementTree

from .compat import asyncio, functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_html_entities_html5,
    compat_HTMLParseError,
    compat_urllib_parse_unquote_plus,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
)
from .dependencies import brotli, certifi, websockets
from .socks import ProxyType, sockssocket
def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate',
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
DATE_FORMATS = (
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M:%S',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d/%m/%Y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    pref = locale.getpreferredencoding()
    return pref
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None
def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its' content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a dictionary of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            return self.raw_decode(s.lstrip())[0]
        return super().decode(s)
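

# Illustrative usage (not part of the original module): json.loads() forwards extra
# keyword arguments to the decoder class, so trailing garbage after the first JSON
# value can be tolerated with ignore_extra=True:
#   json.loads('{"a": 1} trailing junk', cls=LenientJSONDecoder, ignore_extra=True)
#   # -> {'a': 1}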
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
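

# Illustrative usage (not part of the original module): RFC 2822 dates convert to
# epoch seconds, anything unparseable yields None:
#   timeconvert('Thu, 01 Jan 1970 00:00:00 +0000')  # -> 0
#   timeconvert('not a date')                       # -> None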
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
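

# Illustrative usage (not part of the original module), based only on the branches above:
# path separators and pipes become '_', and in restricted mode accented characters are
# transliterated via ACCENT_CHARS:
#   sanitize_filename('A/B|C', restricted=True)  # -> 'A_B_C'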
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
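

# Illustrative usage (not part of the original module):
#   sanitize_url('//example.com/video')      # -> 'http://example.com/video'
#   sanitize_url('rmtp://example.com/live')  # -> 'rtmp://example.com/live'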
def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
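

# Illustrative usage (not part of the original module): credentials embedded in the URL
# are stripped and returned as a ready-to-use Authorization header value:
#   extract_basic_auth('http://user:pass@example.com/feed')
#   # -> ('http://example.com/feed', 'Basic dXNlcjpwYXNz')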
def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )
def process_communicate_or_kill(p, *args, **kwargs):
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)
class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode
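

# Illustrative usage (not part of the original module); assumes an `ffmpeg` binary
# exists on PATH and shows the (stdout, stderr, returncode) convention of Popen.run:
#   out, err, code = Popen.run(
#       ['ffmpeg', '-version'], text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)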
def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding
def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
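

# Illustrative usage (not part of the original module):
#   timetuple_from_msec(3661500)      # -> Time(hours=1, minutes=1, seconds=1, milliseconds=500)
#   formatSeconds(3661.5, msec=True)  # -> '1:01:01.500'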
def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)
def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message(before=';'):
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , '
           'filling out the appropriate issue template. '
           'Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)
network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super().__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None
class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = compat_urllib_request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
.HTTPSHandler
):
1437 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
1438 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
1439 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
1440 self
._params
= params
1442 def https_open(self
, req
):
1444 conn_class
= self
._https
_conn
_class
1446 if hasattr(self
, '_context'): # python > 2.6
1447 kwargs
['context'] = self
._context
1448 if hasattr(self
, '_check_hostname'): # python 3.x
1449 kwargs
['check_hostname'] = self
._check
_hostname
1451 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
1453 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
1454 del req
.headers
['Ytdl-socks-proxy']
1457 return self
.do_open(
1458 functools
.partial(_create_http_connection
, self
, conn_class
, True), req
, **kwargs
)
1459 except urllib
.error
.URLError
as e
:
1460 if (isinstance(e
.reason
, ssl
.SSLError
)
1461 and getattr(e
.reason
, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1462 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise compat_cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)
def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
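

# Illustrative usage (not part of the original module):
#   parse_iso8601('1970-01-01T01:00:00Z')       # -> 3600
#   parse_iso8601('1970-01-01T01:00:00+01:00')  # -> 0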
def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
def unified_timestamp(date_str, day_first=True):
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
def determine_ext(url, default_ext='unknown_video'):
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext


def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
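

# Illustrative usage (not part of the original module):
#   determine_ext('http://example.com/video.mp4?download=1')  # -> 'mp4'
#   determine_ext('http://example.com/video.mp4/?download')   # -> 'mp4'
#   determine_ext('http://example.com/watch')                 # -> 'unknown_video'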
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    """
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
def date_from_str(date_str, format='%Y%m%d', strict=False):
    """
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
        raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
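

# Illustrative usage (not part of the original module):
#   date_from_str('20231201')                  # -> datetime.date(2023, 12, 1)
#   date_from_str('today-1week', strict=True)  # -> the date seven days before today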
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months."""
    month = dt.month + months - 1
    year = dt.year + month // 12
    month = month % 12 + 1
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    roundto = lambda x, n: ((x + n / 2) // n) * n
    timestamp = calendar.timegm(dt.timetuple())
    return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
):
1875 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1876 match
= re
.match(r
'^(\d\d\d\d)(\d\d)(\d\d)$', date_str
)
1877 if match
is not None:
1878 return '-'.join(match
.groups())
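

# Illustrative usage (not part of the original module):
#   datetime_add_months(datetime.date(2020, 1, 31), 1)  # -> datetime.date(2020, 2, 29)
#   hyphenate_date('20230115')                          # -> '2023-01-15'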
1884 """Represents a time interval between two dates"""
1886 def __init__(self
, start
=None, end
=None):
1887 """start and end must be strings in the format accepted by date"""
1888 if start
is not None:
1889 self
.start
= date_from_str(start
, strict
=True)
1891 self
.start
= datetime
.datetime
.min.date()
1893 self
.end
= date_from_str(end
, strict
=True)
1895 self
.end
= datetime
.datetime
.max.date()
1896 if self
.start
> self
.end
:
1897 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
1901 """Returns a range that only contains the given day"""
1902 return cls(day
, day
)
1904 def __contains__(self
, date
):
1905 """Check if the date is in the range"""
1906 if not isinstance(date
, datetime
.date
):
1907 date
= date_from_str(date
)
1908 return self
.start
<= date
<= self
.end
1911 return f
'{self.start.isoformat()} - {self.end.isoformat()}'
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name == 'nt':
        return version_tuple(platform.win32_ver()[1])
    else:
        return ()
def write_string(s, out=None, encoding=None):
    assert isinstance(s, str)
    out = out or sys.stderr

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    enc, buffer = None, out
    if 'b' in getattr(out, 'mode', ''):
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        buffer = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    buffer.write(s.encode(enc, 'ignore') if enc else s)
    out.flush()
def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)
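

# Illustrative usage (not part of the original module):
#   bytes_to_intlist(b'abc')        # -> [97, 98, 99]
#   intlist_to_bytes([97, 98, 99])  # -> b'abc'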
class LockingUnsupportedError(OSError):
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
class locked_file:
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
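# Illustrative usage of locked_file (a sketch, not executed here; 'state.json' is
# a hypothetical filename): the lock is taken in __enter__ and released, and the
# file closed, in __exit__:
#   with locked_file('state.json', 'w', block=True, encoding='utf-8') as f:
#       f.write('{}')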
def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'


def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
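# Illustrative round trip (a sketch, not executed here):
#   >>> smuggled = smuggle_url('https://example.com/video', {'referer': 'https://example.com'})
#   >>> unsmuggle_url(smuggled)
#   ('https://example.com/video', {'referer': 'https://example.com'})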
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)


def format_bytes(bytes):
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
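# Illustrative output (a sketch, not executed here):
#   >>> format_decimal_suffix(1500)
#   '1k'
#   >>> format_bytes(1536)
#   '1.50KiB'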
def lookup_unit_table(unit_table, s):
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
def parse_filesize(s):
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)


def parse_count(s):
    if s is None:
        return None

    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    ret = lookup_unit_table(_UNIT_TABLE, s)
    if ret is not None:
        return ret

    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    if mobj:
        return str_to_int(mobj.group(1))
def parse_resolution(s, *, lenient=False):
    if s is None:
        return {}

    if lenient:
        mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
    else:
        mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if mobj:
        return {
            'width': int(mobj.group('w')),
            'height': int(mobj.group('h')),
        }

    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        return {'height': int(mobj.group(1)) * 540}

    return {}
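# Illustrative results (a sketch, not executed here):
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('720p')
#   {'height': 720}
#   >>> parse_resolution('4K')
#   {'height': 2160}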
def parse_bitrate(s):
    if not isinstance(s, compat_str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    if mobj:
        return int(mobj.group(1))
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        return month_names.index(name) + 1
    except ValueError:
        return None


def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None


def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode()
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    return s[len(start):] if s is not None and s.startswith(start) else s


def remove_end(s, end):
    return s[:-len(end)] if s is not None and s.endswith(end) else s


def remove_quotes(s):
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s
def get_domain(url):
    domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    return domain.group('domain') if domain else None


def url_basename(url):
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').split('/')[-1]


def base_url(url):
    return re.match(r'https?://[^?#&]+/', url).group()
def urljoin(base, path):
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, compat_str) or not path:
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, compat_str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
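# Illustrative behaviour (a sketch, not executed here):
#   >>> urljoin('https://example.com/videos/', 'clip.mp4')
#   'https://example.com/videos/clip.mp4'
#   >>> urljoin('https://example.com/', '//cdn.example.com/clip.mp4')
#   '//cdn.example.com/clip.mp4'
#   >>> urljoin('not a url', 'clip.mp4') is None   # base must look like an http(s) URL
#   True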
class HEADRequest(compat_urllib_request.Request):
    def get_method(self):
        return 'HEAD'


class PUTRequest(compat_urllib_request.Request):
    def get_method(self):
        return 'PUT'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default


def str_or_none(v, default=None):
    return default if v is None else compat_str(v)


def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    elif isinstance(int_str, compat_str):
        int_str = re.sub(r'[,\.\+]', '', int_str)
        return int_or_none(int_str)


def float_or_none(v, scale=1, invscale=1, default=None):
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default


def bool_or_none(v, default=None):
    return v if isinstance(v, bool) else default


def strip_or_none(v, default=None):
    return v.strip() if isinstance(v, compat_str) else default


def url_or_none(url):
    if not url or not isinstance(url, compat_str):
        return None
    url = url.strip()
    return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None


def request_to_url(req):
    if isinstance(req, compat_urllib_request.Request):
        return req.get_full_url()
    else:
        return req
def strftime_or_none(timestamp, date_format, default=None):
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
def parse_duration(s):
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    if ms:
        ms = ms.replace(':', '.')
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
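# Illustrative results (a sketch, not executed here):
#   >>> parse_duration('1:02:03')
#   3723.0
#   >>> parse_duration('2h 30m')
#   9000.0
#   >>> parse_duration('5.5s')
#   5.5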
def prepend_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)

    return (
        f'{name}.{ext}{real_ext}'
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else f'{filename}.{ext}')


def replace_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return '{}.{}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    return exe
def _get_exe_version_output(exe, args, *, to_screen=None):
    if to_screen:
        to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
                                 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return stdout
def detect_exe_version(output, version_re=None, unrecognized='present'):
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized


def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    out = _get_exe_version_output(exe, args)
    return detect_exe_version(out, version_re, unrecognized) if out else False
def frange(start=0, stop=None, step=1):
    """Float range"""
    if stop is None:
        start, stop = 0, start
    sign = [-1, 1][step > 0] if step else 0
    while sign * start < sign * stop:
        yield start
        start += step
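# Illustrative usage (a sketch, not executed here):
#   >>> list(frange(3))
#   [0, 1, 2]
#   >>> list(frange(0, 1, 0.25))
#   [0, 0.25, 0.5, 0.75]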
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        return None if x is None else -(x + 1)

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
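# Illustrative usage (a sketch, not executed here): items are pulled from the
# underlying iterable only as far as indexing requires:
#   >>> lst = LazyList(itertools.count())
#   >>> lst[5]
#   5
#   >>> lst[:3]      # slices are plain lists
#   [0, 1, 2]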
class PagedList:

    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        page_results = self._cache.get(pagenum)
        if page_results is None:
            page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break
            yield from page_results
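# Illustrative usage (a sketch, not executed here; fetch_page is a hypothetical
# page function, not part of this module): only the pages overlapping the
# requested slice are fetched.
#   def fetch_page(pagenum):
#       return range(pagenum * 10, pagenum * 10 + 10)
#   pl = OnDemandPagedList(fetch_page, 10)
#   pl.getslice(3, 8)    # -> [3, 4, 5, 6, 7]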
2818 class PlaylistEntries
:
2819 MissingEntry
= object()
2820 is_exhausted
= False
2822 def __init__(self
, ydl
, info_dict
):
2823 self
.ydl
, self
.info_dict
= ydl
, info_dict
2825 PLAYLIST_ITEMS_RE
= re
.compile(r
'''(?x)
2826 (?P<start>[+-]?\d+)?
2828 (?P<end>[+-]?\d+|inf(?:inite)?)?
2829 (?::(?P<step>[+-]?\d+))?
2833 def parse_playlist_items(cls
, string
):
2834 for segment
in string
.split(','):
2836 raise ValueError('There is two or more consecutive commas')
2837 mobj
= cls
.PLAYLIST_ITEMS_RE
.fullmatch(segment
)
2839 raise ValueError(f
'{segment!r} is not a valid specification')
2840 start
, end
, step
, has_range
= mobj
.group('start', 'end', 'step', 'range')
2841 if int_or_none(step
) == 0:
2842 raise ValueError(f
'Step in {segment!r} cannot be zero')
2843 yield slice(int_or_none(start
), float_or_none(end
), int_or_none(step
)) if has_range
else int(start
)
2845 def get_requested_items(self
):
2846 playlist_items
= self
.ydl
.params
.get('playlist_items')
2847 playlist_start
= self
.ydl
.params
.get('playliststart', 1)
2848 playlist_end
= self
.ydl
.params
.get('playlistend')
2849 # For backwards compatibility, interpret -1 as whole list
2850 if playlist_end
in (-1, None):
2852 if not playlist_items
:
2853 playlist_items
= f
'{playlist_start}:{playlist_end}'
2854 elif playlist_start
!= 1 or playlist_end
:
2855 self
.ydl
.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once
=True)
2857 for index
in self
.parse_playlist_items(playlist_items
):
2858 for i
, entry
in self
[index
]:
2861 # TODO: Add auto-generated fields
2862 self
.ydl
._match
_entry
(entry
, incomplete
=True, silent
=True)
2863 except (ExistingVideoReached
, RejectedVideoReached
):
2867 def full_count(self
):
2868 if self
.info_dict
.get('playlist_count'):
2869 return self
.info_dict
['playlist_count']
2870 elif self
.is_exhausted
and not self
.is_incomplete
:
2872 elif isinstance(self
._entries
, InAdvancePagedList
):
2873 if self
._entries
._pagesize
== 1:
2874 return self
._entries
._pagecount
2876 @functools.cached_property
2878 entries
= self
.info_dict
.get('entries')
2880 raise EntryNotInPlaylist('There are no entries')
2881 elif isinstance(entries
, list):
2882 self
.is_exhausted
= True
2884 indices
= self
.info_dict
.get('requested_entries')
2885 self
.is_incomplete
= bool(indices
)
2886 if self
.is_incomplete
:
2887 assert self
.is_exhausted
2888 ret
= [self
.MissingEntry
] * max(indices
)
2889 for i
, entry
in zip(indices
, entries
):
2893 if isinstance(entries
, (list, PagedList
, LazyList
)):
2895 return LazyList(entries
)
2897 @functools.cached_property
2899 if isinstance(self
._entries
, list):
2902 entry
= self
._entries
[i
]
2904 entry
= self
.MissingEntry
2905 if not self
.is_incomplete
:
2906 raise self
.IndexError()
2907 if entry
is self
.MissingEntry
:
2908 raise EntryNotInPlaylist(f
'Entry {i} cannot be found')
2913 return type(self
.ydl
)._handle
_extraction
_exceptions
(lambda _
, i
: self
._entries
[i
])(self
.ydl
, i
)
2914 except (LazyList
.IndexError, PagedList
.IndexError):
2915 raise self
.IndexError()
2918 def __getitem__(self
, idx
):
2919 if isinstance(idx
, int):
2920 idx
= slice(idx
, idx
)
2922 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2923 step
= 1 if idx
.step
is None else idx
.step
2924 if idx
.start
is None:
2925 start
= 0 if step
> 0 else len(self
) - 1
2927 start
= idx
.start
- 1 if idx
.start
>= 0 else len(self
) + idx
.start
2929 # NB: Do not call len(self) when idx == [:]
2930 if idx
.stop
is None:
2931 stop
= 0 if step
< 0 else float('inf')
2933 stop
= idx
.stop
- 1 if idx
.stop
>= 0 else len(self
) + idx
.stop
2934 stop
+= [-1, 1][step
> 0]
2936 for i
in frange(start
, stop
, step
):
2941 entry
= self
._getter
(i
)
2942 except self
.IndexError:
2943 self
.is_exhausted
= True
2948 if self
.is_exhausted
:
2954 return len(tuple(self
[:]))
2956 class IndexError(IndexError):
def uppercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)


def lowercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)


def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()


def parse_qs(url):
    return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
def read_batch_urls(batch_fd):
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', url, 1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')


def update_url_query(url, query):
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
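# Illustrative usage (a sketch, not executed here):
#   >>> update_url_query('https://example.com/api?page=1', {'format': 'json'})
#   'https://example.com/api?page=1&format=json'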
def update_Request(req, url=None, data=None, headers={}, query={}):
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
def _multipart_encode_impl(data, boundary):
    content_type = 'multipart/form-data; boundary=%s' % boundary

    out = b''
    for k, v in data.items():
        out += b'--' + boundary.encode('ascii') + b'\r\n'
        if isinstance(k, compat_str):
            k = k.encode()
        if isinstance(v, compat_str):
            v = v.encode()
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary.encode('ascii') in content:
            raise ValueError('Boundary overlaps with data')
        out += content

    out += b'--' + boundary.encode('ascii') + b'--\r\n'

    return out, content_type


def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

        try:
            out, content_type = _multipart_encode_impl(data, boundary)
            break
        except ValueError:
            if has_specified_boundary:
                raise
            boundary = None

    return out, content_type
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    for val in map(d.get, variadic(key_or_keys)):
        if val is not None and (val or not skip_false_values):
            return val
    return default


def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    for f in funcs:
        try:
            val = f(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
            pass
        else:
            if expected_type is None or isinstance(val, expected_type):
                return val


def try_get(src, getter, expected_type=None):
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)


def filter_dict(dct, cndn=lambda _, v: v is not None):
    return {k: v for k, v in dct.items() if cndn(k, v)}


def merge_dicts(*dicts):
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if (v is not None and k not in merged
                    or isinstance(v, str) and merged[k] == ''):
                merged[k] = v
    return merged


def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
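# Illustrative usage (a sketch, not executed here):
#   >>> dict_get({'a': None, 'b': 0, 'c': 3}, ('a', 'b', 'c'))
#   3
#   >>> merge_dicts({'title': 'A'}, {'title': 'B', 'id': 'x'})
#   {'title': 'A', 'id': 'x'}
#   >>> try_get({'a': [1]}, lambda x: x['a'][0], int)
#   1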
3151 TV_PARENTAL_GUIDELINES
= {
3161 def parse_age_limit(s
):
3162 # isinstance(False, int) is True. So type() must be used instead
3163 if type(s
) is int: # noqa: E721
3164 return s
if 0 <= s
<= 21 else None
3165 elif not isinstance(s
, str):
3167 m
= re
.match(r
'^(?P<age>\d{1,2})\+?$', s
)
3169 return int(m
.group('age'))
3172 return US_RATINGS
[s
]
3173 m
= re
.match(r
'^TV[_-]?(%s)$' % '|'.join(k
[3:] for k
in TV_PARENTAL_GUIDELINES
), s
)
3175 return TV_PARENTAL_GUIDELINES
['TV-' + m
.group(1)]
3179 def strip_jsonp(code
):
3182 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3183 (?:\s*&&\s*(?P=func_name))?
3184 \s*\(\s*(?P<callback_data>.*)\);?
3185 \s*?(?://[^\n]*)*$''',
3186 r
'\g<callback_data>', code
)
3189 def js_to_json(code
, vars={}):
3190 # vars is a dict of var, val pairs to substitute
3191 COMMENT_RE
= r
'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3192 SKIP_RE
= fr
'\s*(?:{COMMENT_RE})?\s*'
3194 (fr
'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3195 (fr
'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3200 if v
in ('true', 'false', 'null'):
3202 elif v
in ('undefined', 'void 0'):
3204 elif v
.startswith('/*') or v
.startswith('//') or v
.startswith('!') or v
== ',':
3207 if v
[0] in ("'", '"'):
3208 v
= re
.sub(r
'(?s)\\.|"', lambda m
: {
3213 }.get(m
.group(0), m
.group(0)), v
[1:-1])
3215 for regex
, base
in INTEGER_TABLE
:
3216 im
= re
.match(regex
, v
)
3218 i
= int(im
.group(1), base
)
3219 return '"%d":' % i
if v
.endswith(':') else '%d' % i
3226 code
= re
.sub(r
'new Date\((".+")\)', r
'\g<1>', code
)
3228 return re
.sub(r
'''(?sx)
3229 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3230 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3231 {comment}|,(?={skip}[\]}}])|
3232 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3233 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3236 '''.format(comment
=COMMENT_RE
, skip
=SKIP_RE
), fix_kv
, code
)
3239 def qualities(quality_ids
):
3240 """ Get a numeric quality value out of a list of possible values """
3243 return quality_ids
.index(qid
)
3249 POSTPROCESS_WHEN
= ('pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist')
3253 'default': '%(title)s [%(id)s].%(ext)s',
3254 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3260 'description': 'description',
3261 'annotation': 'annotations.xml',
3262 'infojson': 'info.json',
3265 'pl_thumbnail': None,
3266 'pl_description': 'description',
3267 'pl_infojson': 'info.json',
3270 # As of [1] format syntax is:
3271 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3272 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3273 STR_FORMAT_RE_TMPL
= r
'''(?x)
3274 (?<!%)(?P<prefix>(?:%%)*)
3276 (?P<has_key>\((?P<key>{0})\))?
3278 (?P<conversion>[#0\-+ ]+)?
3280 (?P<precision>\.\d+)?
3281 (?P<len_mod>[hlL])? # unused in python
3282 {1} # conversion type
3287 STR_FORMAT_TYPES
= 'diouxXeEfFgGcrs'
3290 def limit_length(s
, length
):
3291 """ Add ellipses to overly long strings """
3296 return s
[:length
- len(ELLIPSES
)] + ELLIPSES
3300 def version_tuple(v
):
3301 return tuple(int(e
) for e
in re
.split(r
'[-.]', v
))
3304 def is_outdated_version(version
, limit
, assume_new
=True):
3306 return not assume_new
3308 return version_tuple(version
) < version_tuple(limit
)
3310 return not assume_new
3313 def ytdl_is_updateable():
3314 """ Returns if yt-dlp can be updated with -U """
3316 from .update
import is_non_updateable
3318 return not is_non_updateable()
3321 def args_to_str(args
):
3322 # Get a short string representation for a subprocess command
3323 return ' '.join(compat_shlex_quote(a
) for a
in args
)
3326 def error_to_compat_str(err
):
3330 def error_to_str(err
):
3331 return f
'{type(err).__name__}: {err}'
3334 def mimetype2ext(mt
):
3338 mt
, _
, params
= mt
.partition(';')
3343 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3344 # it's the most popular one
3345 'audio/mpeg': 'mp3',
3346 'audio/x-wav': 'wav',
3348 'audio/wave': 'wav',
3351 ext
= FULL_MAP
.get(mt
)
3357 'smptett+xml': 'tt',
3361 'x-mp4-fragmented': 'mp4',
3362 'x-ms-sami': 'sami',
3365 'x-mpegurl': 'm3u8',
3366 'vnd.apple.mpegurl': 'm3u8',
3370 'vnd.ms-sstr+xml': 'ism',
3374 'filmstrip+json': 'fs',
3378 _
, _
, subtype
= mt
.rpartition('/')
3379 ext
= SUBTYPE_MAP
.get(subtype
.lower())
3390 _
, _
, suffix
= subtype
.partition('+')
3391 ext
= SUFFIX_MAP
.get(suffix
)
3395 return subtype
.replace('+', '.')
3398 def ext2mimetype(ext_or_url
):
3401 if '.' not in ext_or_url
:
3402 ext_or_url
= f
'file.{ext_or_url}'
3403 return mimetypes
.guess_type(ext_or_url
)[0]
3406 def parse_codecs(codecs_str
):
3407 # http://tools.ietf.org/html/rfc6381
3410 split_codecs
= list(filter(None, map(
3411 str.strip
, codecs_str
.strip().strip(',').split(','))))
3412 vcodec
, acodec
, scodec
, hdr
= None, None, None, None
3413 for full_codec
in split_codecs
:
3414 parts
= full_codec
.split('.')
3415 codec
= parts
[0].replace('0', '')
3416 if codec
in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3417 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3419 vcodec
= '.'.join(parts
[:4]) if codec
in ('vp9', 'av1', 'hvc1') else full_codec
3420 if codec
in ('dvh1', 'dvhe'):
3422 elif codec
== 'av1' and len(parts
) > 3 and parts
[3] == '10':
3424 elif full_codec
.replace('0', '').startswith('vp9.2'):
3426 elif codec
in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3429 elif codec
in ('stpp', 'wvtt',):
3433 write_string(f
'WARNING: Unknown codec {full_codec}\n')
3434 if vcodec
or acodec
or scodec
:
3436 'vcodec': vcodec
or 'none',
3437 'acodec': acodec
or 'none',
3438 'dynamic_range': hdr
,
3439 **({'scodec': scodec}
if scodec
is not None else {}),
3441 elif len(split_codecs
) == 2:
3443 'vcodec': split_codecs
[0],
3444 'acodec': split_codecs
[1],
3449 def urlhandle_detect_ext(url_handle
):
3450 getheader
= url_handle
.headers
.get
3452 cd
= getheader('Content-Disposition')
3454 m
= re
.match(r
'attachment;\s*filename="(?P<filename>[^"]+)"', cd
)
3456 e
= determine_ext(m
.group('filename'), default_ext
=None)
3460 return mimetype2ext(getheader('Content-Type'))
3463 def encode_data_uri(data
, mime_type
):
3464 return 'data:%s;base64,%s' % (mime_type
, base64
.b64encode(data
).decode('ascii'))
3467 def age_restricted(content_limit
, age_limit
):
3468 """ Returns True iff the content should be blocked """
3470 if age_limit
is None: # No limit set
3472 if content_limit
is None:
3473 return False # Content available for everyone
3474 return age_limit
< content_limit
3477 def is_html(first_bytes
):
3478 """ Detect whether a file contains HTML by examining its first bytes. """
3481 (b
'\xef\xbb\xbf', 'utf-8'),
3482 (b
'\x00\x00\xfe\xff', 'utf-32-be'),
3483 (b
'\xff\xfe\x00\x00', 'utf-32-le'),
3484 (b
'\xff\xfe', 'utf-16-le'),
3485 (b
'\xfe\xff', 'utf-16-be'),
3489 for bom
, enc
in BOMS
:
3490 while first_bytes
.startswith(bom
):
3491 encoding
, first_bytes
= enc
, first_bytes
[len(bom
):]
3493 return re
.match(r
'^\s*<', first_bytes
.decode(encoding
, 'replace'))
3496 def determine_protocol(info_dict
):
3497 protocol
= info_dict
.get('protocol')
3498 if protocol
is not None:
3501 url
= sanitize_url(info_dict
['url'])
3502 if url
.startswith('rtmp'):
3504 elif url
.startswith('mms'):
3506 elif url
.startswith('rtsp'):
3509 ext
= determine_ext(url
)
3515 return compat_urllib_parse_urlparse(url
).scheme
3518 def render_table(header_row
, data
, delim
=False, extra_gap
=0, hide_empty
=False):
3519 """ Render a list of rows, each as a list of values.
3520 Text after a \t will be right aligned """
3522 return len(remove_terminal_sequences(string
).replace('\t', ''))
3524 def get_max_lens(table
):
3525 return [max(width(str(v
)) for v
in col
) for col
in zip(*table
)]
3527 def filter_using_list(row
, filterArray
):
3528 return [col
for take
, col
in itertools
.zip_longest(filterArray
, row
, fillvalue
=True) if take
]
3530 max_lens
= get_max_lens(data
) if hide_empty
else []
3531 header_row
= filter_using_list(header_row
, max_lens
)
3532 data
= [filter_using_list(row
, max_lens
) for row
in data
]
3534 table
= [header_row
] + data
3535 max_lens
= get_max_lens(table
)
3538 table
= [header_row
, [delim
* (ml
+ extra_gap
) for ml
in max_lens
]] + data
3539 table
[1][-1] = table
[1][-1][:-extra_gap
* len(delim
)] # Remove extra_gap from end of delimiter
3541 for pos
, text
in enumerate(map(str, row
)):
3543 row
[pos
] = text
.replace('\t', ' ' * (max_lens
[pos
] - width(text
))) + ' ' * extra_gap
3545 row
[pos
] = text
+ ' ' * (max_lens
[pos
] - width(text
) + extra_gap
)
3546 ret
= '\n'.join(''.join(row
).rstrip() for row
in table
)
3550 def _match_one(filter_part
, dct
, incomplete
):
3551 # TODO: Generalize code with YoutubeDL._build_format_filter
3552 STRING_OPERATORS
= {
3553 '*=': operator
.contains
,
3554 '^=': lambda attr
, value
: attr
.startswith(value
),
3555 '$=': lambda attr
, value
: attr
.endswith(value
),
3556 '~=': lambda attr
, value
: re
.search(value
, attr
),
3558 COMPARISON_OPERATORS
= {
3560 '<=': operator
.le
, # "<=" must be defined above "<"
3567 if isinstance(incomplete
, bool):
3568 is_incomplete
= lambda _
: incomplete
3570 is_incomplete
= lambda k
: k
in incomplete
3572 operator_rex
= re
.compile(r
'''(?x)
3574 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3576 (?P<quote>["\'])(?P
<quotedstrval
>.+?
)(?P
=quote
)|
3579 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3580 m = operator_rex.fullmatch(filter_part.strip())
3583 unnegated_op = COMPARISON_OPERATORS[m['op']]
3585 op = lambda attr, value: not unnegated_op(attr, value)
3588 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3590 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3591 actual_value = dct.get(m['key'])
3592 numeric_comparison = None
3593 if isinstance(actual_value, (int, float)):
3594 # If the original field is a string and matching comparisonvalue is
3595 # a number we should respect the origin of the original field
3596 # and process comparison value as a string (see
3597 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3599 numeric_comparison = int(comparison_value)
3601 numeric_comparison = parse_filesize(comparison_value)
3602 if numeric_comparison is None:
3603 numeric_comparison = parse_filesize(f'{comparison_value}B')
3604 if numeric_comparison is None:
3605 numeric_comparison = parse_duration(comparison_value)
3606 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3607 raise ValueError('Operator %s only supports string values!' % m['op'])
3608 if actual_value is None:
3609 return is_incomplete(m['key']) or m['none_inclusive']
3610 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3613 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3614 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3616 operator_rex = re.compile(r'''(?x
)
3617 (?P
<op
>%s)\s
*(?P
<key
>[a
-z_
]+)
3618 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3619 m = operator_rex.fullmatch(filter_part.strip())
3621 op = UNARY_OPERATORS[m.group('op')]
3622 actual_value = dct.get(m.group('key'))
3623 if is_incomplete(m.group('key')) and actual_value is None:
3625 return op(actual_value)
3627 raise ValueError('Invalid filter part %r' % filter_part)
3630 def match_str(filter_str, dct, incomplete=False):
3631 """ Filter a dictionary with a simple string syntax.
3632 @returns Whether the filter passes
3633 @param incomplete Set of keys that is expected to be missing from dct.
3634 Can be True/False to indicate all/none of the keys may be missing.
3635 All conditions on incomplete keys pass if the key is missing
3638 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3639 for filter_part in re.split(r'(?<!\\)&', filter_str))
3642 def match_filter_func(filters):
3645 filters = set(variadic(filters))
3647 interactive = '-' in filters
3651 def _match_func(info_dict, incomplete=False):
3652 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3653 return NO_DEFAULT if interactive and not incomplete else None
3655 video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3656 filter_str = ') | ('.join(map(str.strip, filters))
3657 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3661 def download_range_func(chapters, ranges):
3662 def inner(info_dict, ydl):
3663 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3664 else 'Cannot match chapters since chapter information is unavailable')
3665 for regex in chapters or []:
3666 for i, chapter in enumerate(info_dict.get('chapters') or []):
3667 if re.search(regex, chapter['title']):
3669 yield {**chapter, 'index': i}
3670 if chapters and warning:
3671 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3673 yield from ({'start_time': start, 'end_time': end} for start, end in ranges or [])
3678 def parse_dfxp_time_expr(time_expr):
3682 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3684 return float(mobj.group('time_offset'))
3686 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3688 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3691 def srt_subtitles_timecode(seconds):
3692 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3695 def ass_subtitles_timecode(seconds):
3696 time = timetuple_from_msec(seconds * 1000)
3697 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3700 def dfxp2srt(dfxp_data):
3702 @param dfxp_data A
bytes-like
object containing DFXP data
3703 @returns A
unicode object containing converted SRT data
3705 LEGACY_NAMESPACES = (
3706 (b'http://www.w3.org/ns/ttml', [
3707 b'http://www.w3.org/2004/11/ttaf1',
3708 b'http://www.w3.org/2006/04/ttaf1',
3709 b'http://www.w3.org/2006/10/ttaf1',
3711 (b'http://www.w3.org/ns/ttml#styling', [
3712 b'http://www.w3.org/ns/ttml#style',
3716 SUPPORTED_STYLING = [
3725 _x = functools.partial(xpath_with_ns, ns_map={
3726 'xml': 'http://www.w3.org/XML/1998/namespace',
3727 'ttml': 'http://www.w3.org/ns/ttml',
3728 'tts': 'http://www.w3.org/ns/ttml#styling',
3734 class TTMLPElementParser:
3736 _unclosed_elements = []
3737 _applied_styles = []
3739 def start(self, tag, attrib):
3740 if tag in (_x('ttml:br'), 'br'):
3743 unclosed_elements = []
3745 element_style_id = attrib.get('style')
3747 style.update(default_style)
3748 if element_style_id:
3749 style.update(styles.get(element_style_id, {}))
3750 for prop in SUPPORTED_STYLING:
3751 prop_val = attrib.get(_x('tts:' + prop))
3753 style[prop] = prop_val
3756 for k, v in sorted(style.items()):
3757 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3760 font += ' color="%s"' % v
3761 elif k == 'fontSize':
3762 font += ' size="%s"' % v
3763 elif k == 'fontFamily':
3764 font += ' face="%s"' % v
3765 elif k == 'fontWeight' and v == 'bold':
3767 unclosed_elements.append('b')
3768 elif k == 'fontStyle' and v == 'italic':
3770 unclosed_elements.append('i')
3771 elif k == 'textDecoration' and v == 'underline':
3773 unclosed_elements.append('u')
3775 self._out += '<font' + font + '>'
3776 unclosed_elements.append('font')
3778 if self._applied_styles:
3779 applied_style.update(self._applied_styles[-1])
3780 applied_style.update(style)
3781 self._applied_styles.append(applied_style)
3782 self._unclosed_elements.append(unclosed_elements)
3785 if tag not in (_x('ttml:br'), 'br'):
3786 unclosed_elements = self._unclosed_elements.pop()
3787 for element in reversed(unclosed_elements):
3788 self._out += '</%s>' % element
3789 if unclosed_elements and self._applied_styles:
3790 self._applied_styles.pop()
3792 def data(self, data):
3796 return self._out.strip()
3798 def parse_node(node):
3799 target = TTMLPElementParser()
3800 parser = xml.etree.ElementTree.XMLParser(target=target)
3801 parser.feed(xml.etree.ElementTree.tostring(node))
3802 return parser.close()
3804 for k, v in LEGACY_NAMESPACES:
3806 dfxp_data = dfxp_data.replace(ns, k)
3808 dfxp = compat_etree_fromstring(dfxp_data)
3810 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3813 raise ValueError('Invalid dfxp/TTML subtitle')
3817 for style in dfxp.findall(_x('.//ttml:style')):
3818 style_id = style.get('id') or style.get(_x('xml:id'))
3821 parent_style_id = style.get('style')
3823 if parent_style_id not in styles:
3826 styles[style_id] = styles[parent_style_id].copy()
3827 for prop in SUPPORTED_STYLING:
3828 prop_val = style.get(_x('tts:' + prop))
3830 styles.setdefault(style_id, {})[prop] = prop_val
3836 for p in ('body', 'div'):
3837 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3840 style = styles.get(ele.get('style'))
3843 default_style.update(style)
3845 for para, index in zip(paras, itertools.count(1)):
3846 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3847 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3848 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3849 if begin_time is None:
3854 end_time = begin_time + dur
3855 out.append('%d\n%s --> %s\n%s\n\n' % (
3857 srt_subtitles_timecode(begin_time),
3858 srt_subtitles_timecode(end_time),
3864 def cli_option(params, command_option, param, separator=None):
3865 param = params.get(param)
3866 return ([] if param is None
3867 else [command_option, str(param)] if separator is None
3868 else [f'{command_option}{separator}{param}'])
3871 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3872 param = params.get(param)
3873 assert param in (True, False, None)
3874 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3877 def cli_valueless_option(params, command_option, param, expected_value=True):
3878 return [command_option] if params.get(param) == expected_value else []
3881 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3882 if isinstance(argdict, (list, tuple)): # for backward compatibility
3889 assert isinstance(argdict, dict)
3891 assert isinstance(keys, (list, tuple))
3892 for key_list in keys:
3893 arg_list = list(filter(
3894 lambda x: x is not None,
3895 [argdict.get(key.lower()) for key in variadic(key_list)]))
3897 return [arg for args in arg_list for arg in args]
3901 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3902 main_key, exe = main_key.lower(), exe.lower()
3903 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3904 keys = [f'{root_key}{k}' for k in (keys or [''])]
3905 if root_key in keys:
3907 keys.append((main_key, exe))
3908 keys.append('default')
3911 return cli_configuration_args(argdict, keys, default, use_compat)
3915 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3974 'iw': 'heb', # Replaced by he in 1989 revision
3984 'in': 'ind', # Replaced by id in 1989 revision
4099 'ji': 'yid', # Replaced by yi in 1989 revision
4107 def short2long(cls, code):
4108 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4109 return cls._lang_map.get(code[:2])
4112 def long2short(cls, code):
4113 """Convert language code from ISO 639-2/T to ISO 639-1"""
4114 for short_name, long_name in cls._lang_map.items():
4115 if long_name == code:
4120 # From http://data.okfn.org/data/core/country-list
4122 'AF': 'Afghanistan',
4123 'AX': 'Åland Islands',
4126 'AS': 'American Samoa',
4131 'AG': 'Antigua and Barbuda',
4148 'BO': 'Bolivia, Plurinational State of',
4149 'BQ': 'Bonaire, Sint Eustatius and Saba',
4150 'BA': 'Bosnia and Herzegovina',
4152 'BV': 'Bouvet Island',
4154 'IO': 'British Indian Ocean Territory',
4155 'BN': 'Brunei Darussalam',
4157 'BF': 'Burkina Faso',
4163 'KY': 'Cayman Islands',
4164 'CF': 'Central African Republic',
4168 'CX': 'Christmas Island',
4169 'CC': 'Cocos (Keeling) Islands',
4173 'CD': 'Congo, the Democratic Republic of the',
4174 'CK': 'Cook Islands',
4176 'CI': 'Côte d\'Ivoire',
4181 'CZ': 'Czech Republic',
4185 'DO': 'Dominican Republic',
4188 'SV': 'El Salvador',
4189 'GQ': 'Equatorial Guinea',
4193 'FK': 'Falkland Islands (Malvinas)',
4194 'FO': 'Faroe Islands',
4198 'GF': 'French Guiana',
4199 'PF': 'French Polynesia',
4200 'TF': 'French Southern Territories',
4215 'GW': 'Guinea-Bissau',
4218 'HM': 'Heard Island and McDonald Islands',
4219 'VA': 'Holy See (Vatican City State)',
4226 'IR': 'Iran, Islamic Republic of',
4229 'IM': 'Isle of Man',
4239 'KP': 'Korea, Democratic People\'s Republic of',
4240 'KR': 'Korea, Republic of',
4243 'LA': 'Lao People\'s Democratic Republic',
4249 'LI': 'Liechtenstein',
4253 'MK': 'Macedonia, the Former Yugoslav Republic of',
4260 'MH': 'Marshall Islands',
4266 'FM': 'Micronesia, Federated States of',
4267 'MD': 'Moldova, Republic of',
4278 'NL': 'Netherlands',
4279 'NC': 'New Caledonia',
4280 'NZ': 'New Zealand',
4285 'NF': 'Norfolk Island',
4286 'MP': 'Northern Mariana Islands',
4291 'PS': 'Palestine, State of',
4293 'PG': 'Papua New Guinea',
4296 'PH': 'Philippines',
4300 'PR': 'Puerto Rico',
4304 'RU': 'Russian Federation',
4306 'BL': 'Saint Barthélemy',
4307 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4308 'KN': 'Saint Kitts and Nevis',
4309 'LC': 'Saint Lucia',
4310 'MF': 'Saint Martin (French part)',
4311 'PM': 'Saint Pierre and Miquelon',
4312 'VC': 'Saint Vincent and the Grenadines',
4315 'ST': 'Sao Tome and Principe',
4316 'SA': 'Saudi Arabia',
4320 'SL': 'Sierra Leone',
4322 'SX': 'Sint Maarten (Dutch part)',
4325 'SB': 'Solomon Islands',
4327 'ZA': 'South Africa',
4328 'GS': 'South Georgia and the South Sandwich Islands',
4329 'SS': 'South Sudan',
4334 'SJ': 'Svalbard and Jan Mayen',
4337 'CH': 'Switzerland',
4338 'SY': 'Syrian Arab Republic',
4339 'TW': 'Taiwan, Province of China',
4341 'TZ': 'Tanzania, United Republic of',
4343 'TL': 'Timor-Leste',
4347 'TT': 'Trinidad and Tobago',
4350 'TM': 'Turkmenistan',
4351 'TC': 'Turks and Caicos Islands',
4355 'AE': 'United Arab Emirates',
4356 'GB': 'United Kingdom',
4357 'US': 'United States',
4358 'UM': 'United States Minor Outlying Islands',
4362 'VE': 'Venezuela, Bolivarian Republic of',
4364 'VG': 'Virgin Islands, British',
4365 'VI': 'Virgin Islands, U.S.',
4366 'WF': 'Wallis and Futuna',
4367 'EH': 'Western Sahara',
4371 # Not ISO 3166 codes, but used for IP blocks
4372 'AP': 'Asia/Pacific Region',
4377 def short2full(cls, code):
4378 """Convert an ISO 3166-2 country code to the corresponding full name"""
4379 return cls._country_map.get(code.upper())
4383 # Major IPv4 address blocks per country
4385 'AD': '46.172.224.0/19',
4386 'AE': '94.200.0.0/13',
4387 'AF': '149.54.0.0/17',
4388 'AG': '209.59.64.0/18',
4389 'AI': '204.14.248.0/21',
4390 'AL': '46.99.0.0/16',
4391 'AM': '46.70.0.0/15',
4392 'AO': '105.168.0.0/13',
4393 'AP': '182.50.184.0/21',
4394 'AQ': '23.154.160.0/24',
4395 'AR': '181.0.0.0/12',
4396 'AS': '202.70.112.0/20',
4397 'AT': '77.116.0.0/14',
4398 'AU': '1.128.0.0/11',
4399 'AW': '181.41.0.0/18',
4400 'AX': '185.217.4.0/22',
4401 'AZ': '5.197.0.0/16',
4402 'BA': '31.176.128.0/17',
4403 'BB': '65.48.128.0/17',
4404 'BD': '114.130.0.0/16',
4406 'BF': '102.178.0.0/15',
4407 'BG': '95.42.0.0/15',
4408 'BH': '37.131.0.0/17',
4409 'BI': '154.117.192.0/18',
4410 'BJ': '137.255.0.0/16',
4411 'BL': '185.212.72.0/23',
4412 'BM': '196.12.64.0/18',
4413 'BN': '156.31.0.0/16',
4414 'BO': '161.56.0.0/16',
4415 'BQ': '161.0.80.0/20',
4416 'BR': '191.128.0.0/12',
4417 'BS': '24.51.64.0/18',
4418 'BT': '119.2.96.0/19',
4419 'BW': '168.167.0.0/16',
4420 'BY': '178.120.0.0/13',
4421 'BZ': '179.42.192.0/18',
4422 'CA': '99.224.0.0/11',
4423 'CD': '41.243.0.0/16',
4424 'CF': '197.242.176.0/21',
4425 'CG': '160.113.0.0/16',
4426 'CH': '85.0.0.0/13',
4427 'CI': '102.136.0.0/14',
4428 'CK': '202.65.32.0/19',
4429 'CL': '152.172.0.0/14',
4430 'CM': '102.244.0.0/14',
4431 'CN': '36.128.0.0/10',
4432 'CO': '181.240.0.0/12',
4433 'CR': '201.192.0.0/12',
4434 'CU': '152.206.0.0/15',
4435 'CV': '165.90.96.0/19',
4436 'CW': '190.88.128.0/17',
4437 'CY': '31.153.0.0/16',
4438 'CZ': '88.100.0.0/14',
4440 'DJ': '197.241.0.0/17',
4441 'DK': '87.48.0.0/12',
4442 'DM': '192.243.48.0/20',
4443 'DO': '152.166.0.0/15',
4444 'DZ': '41.96.0.0/12',
4445 'EC': '186.68.0.0/15',
4446 'EE': '90.190.0.0/15',
4447 'EG': '156.160.0.0/11',
4448 'ER': '196.200.96.0/20',
4449 'ES': '88.0.0.0/11',
4450 'ET': '196.188.0.0/14',
4451 'EU': '2.16.0.0/13',
4452 'FI': '91.152.0.0/13',
4453 'FJ': '144.120.0.0/16',
4454 'FK': '80.73.208.0/21',
4455 'FM': '119.252.112.0/20',
4456 'FO': '88.85.32.0/19',
4458 'GA': '41.158.0.0/15',
'GD': '74.122.88.0/21',
'GE': '31.146.0.0/16',
'GF': '161.22.64.0/18',
'GG': '62.68.160.0/19',
'GH': '154.160.0.0/12',
'GI': '95.164.0.0/16',
'GL': '88.83.0.0/19',
'GM': '160.182.0.0/15',
'GN': '197.149.192.0/18',
'GP': '104.250.0.0/19',
'GQ': '105.235.224.0/20',
'GR': '94.64.0.0/13',
'GT': '168.234.0.0/16',
'GU': '168.123.0.0/16',
'GW': '197.214.80.0/20',
'GY': '181.41.64.0/18',
'HK': '113.252.0.0/14',
'HN': '181.210.0.0/16',
'HR': '93.136.0.0/13',
'HT': '148.102.128.0/17',
'HU': '84.0.0.0/14',
'ID': '39.192.0.0/10',
'IE': '87.32.0.0/12',
'IL': '79.176.0.0/13',
'IM': '5.62.80.0/20',
'IN': '117.192.0.0/10',
'IO': '203.83.48.0/21',
'IQ': '37.236.0.0/14',
'IR': '2.176.0.0/12',
'IS': '82.221.0.0/16',
'IT': '79.0.0.0/10',
'JE': '87.244.64.0/18',
'JM': '72.27.0.0/17',
'JO': '176.29.0.0/16',
'JP': '133.0.0.0/8',
'KE': '105.48.0.0/12',
'KG': '158.181.128.0/17',
'KH': '36.37.128.0/17',
'KI': '103.25.140.0/22',
'KM': '197.255.224.0/20',
'KN': '198.167.192.0/19',
'KP': '175.45.176.0/22',
'KR': '175.192.0.0/10',
'KW': '37.36.0.0/14',
'KY': '64.96.0.0/15',
'KZ': '2.72.0.0/13',
'LA': '115.84.64.0/18',
'LB': '178.135.0.0/16',
'LC': '24.92.144.0/20',
'LI': '82.117.0.0/19',
'LK': '112.134.0.0/15',
'LR': '102.183.0.0/16',
'LS': '129.232.0.0/17',
'LT': '78.56.0.0/13',
'LU': '188.42.0.0/16',
'LV': '46.109.0.0/16',
'LY': '41.252.0.0/14',
'MA': '105.128.0.0/11',
'MC': '88.209.64.0/18',
'MD': '37.246.0.0/16',
'ME': '178.175.0.0/17',
'MF': '74.112.232.0/21',
'MG': '154.126.0.0/17',
'MH': '117.103.88.0/21',
'MK': '77.28.0.0/15',
'ML': '154.118.128.0/18',
'MM': '37.111.0.0/17',
'MN': '49.0.128.0/17',
'MO': '60.246.0.0/16',
'MP': '202.88.64.0/20',
'MQ': '109.203.224.0/19',
'MR': '41.188.64.0/18',
'MS': '208.90.112.0/22',
'MT': '46.11.0.0/16',
'MU': '105.16.0.0/12',
'MV': '27.114.128.0/18',
'MW': '102.70.0.0/15',
'MX': '187.192.0.0/11',
'MY': '175.136.0.0/13',
'MZ': '197.218.0.0/15',
'NA': '41.182.0.0/16',
'NC': '101.101.0.0/18',
'NE': '197.214.0.0/18',
'NF': '203.17.240.0/22',
'NG': '105.112.0.0/12',
'NI': '186.76.0.0/15',
'NL': '145.96.0.0/11',
'NO': '84.208.0.0/13',
'NP': '36.252.0.0/15',
'NR': '203.98.224.0/19',
'NU': '49.156.48.0/22',
'NZ': '49.224.0.0/14',
'OM': '5.36.0.0/15',
'PA': '186.72.0.0/15',
'PE': '186.160.0.0/14',
'PF': '123.50.64.0/18',
'PG': '124.240.192.0/19',
'PH': '49.144.0.0/13',
'PK': '39.32.0.0/11',
'PL': '83.0.0.0/11',
'PM': '70.36.0.0/20',
'PR': '66.50.0.0/16',
'PS': '188.161.0.0/16',
'PT': '85.240.0.0/13',
'PW': '202.124.224.0/20',
'PY': '181.120.0.0/14',
'QA': '37.210.0.0/15',
'RE': '102.35.0.0/16',
'RO': '79.112.0.0/13',
'RS': '93.86.0.0/15',
'RU': '5.136.0.0/13',
'RW': '41.186.0.0/16',
'SA': '188.48.0.0/13',
'SB': '202.1.160.0/19',
'SC': '154.192.0.0/11',
'SD': '102.120.0.0/13',
'SE': '78.64.0.0/12',
'SG': '8.128.0.0/10',
'SI': '188.196.0.0/14',
'SK': '78.98.0.0/15',
'SL': '102.143.0.0/17',
'SM': '89.186.32.0/19',
'SN': '41.82.0.0/15',
'SO': '154.115.192.0/18',
'SR': '186.179.128.0/17',
'SS': '105.235.208.0/21',
'ST': '197.159.160.0/19',
'SV': '168.243.0.0/16',
'SX': '190.102.0.0/20',
'SZ': '41.84.224.0/19',
'TC': '65.255.48.0/20',
'TD': '154.68.128.0/19',
'TG': '196.168.0.0/14',
'TH': '171.96.0.0/13',
'TJ': '85.9.128.0/18',
'TK': '27.96.24.0/21',
'TL': '180.189.160.0/20',
'TM': '95.85.96.0/19',
'TN': '197.0.0.0/11',
'TO': '175.176.144.0/21',
'TR': '78.160.0.0/11',
'TT': '186.44.0.0/15',
'TV': '202.2.96.0/19',
'TW': '120.96.0.0/11',
'TZ': '156.156.0.0/14',
'UA': '37.52.0.0/14',
'UG': '102.80.0.0/13',
'UY': '167.56.0.0/13',
'UZ': '84.54.64.0/18',
'VA': '212.77.0.0/19',
'VC': '207.191.240.0/21',
'VE': '186.88.0.0/13',
'VG': '66.81.192.0/20',
'VI': '146.226.0.0/16',
'VN': '14.160.0.0/11',
'VU': '202.80.32.0/20',
'WF': '117.20.32.0/21',
'WS': '202.4.32.0/19',
'YE': '134.35.0.0/16',
'YT': '41.242.116.0/22',
'ZA': '41.0.0.0/11',
'ZM': '102.144.0.0/13',
'ZW': '102.177.192.0/18',
@classmethod
def random_ipv4(cls, code_or_block):
    if len(code_or_block) == 2:
        block = cls._country_ip_map.get(code_or_block.upper())
        if not block:
            return None
    else:
        block = code_or_block
    addr, preflen = block.split('/')
    addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
    addr_max = addr_min | (0xffffffff >> int(preflen))
    return compat_str(socket.inet_ntoa(
        compat_struct_pack('!L', random.randint(addr_min, addr_max))))
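
# Illustrative usage (editor's example; assumes the classmethod above is called on the
# enclosing geo-helper class that owns _country_ip_map):
#   random_ipv4('GR')              -> a random address inside the mapped GR block
#   random_ipv4('203.0.113.0/24')  -> a random address inside the given CIDR block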
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do the wrapping of the socket with SOCKS
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
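
# Illustrative usage (editor's sketch): install the handler, then override the proxy for a
# single request via the 'Ytdl-request-proxy' header instead of the opener-wide setting.
#   opener = compat_urllib_request.build_opener(PerRequestProxyHandler({'http': 'http://proxy:3128'}))
#   req = compat_urllib_request.Request('http://example.com')
#   req.add_header('Ytdl-request-proxy', '__noproxy__')  # bypass the proxy for this request only
#   opener.open(req)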
# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
# released into Public Domain
# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387

def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # after much testing, this algorithm was deemed to be the fastest
    s = b''
    n = int(n)
    while n > 0:
        s = compat_struct_pack('>I', n & 0xffffffff) + s
        n = n >> 32
    # strip off leading zeros
    for i in range(len(s)):
        if s[i] != b'\000'[0]:
            break
    else:
        # only happens when n == 0
        s = b'\000'
        i = 0
    s = s[i:]
    # add back some pad bytes. this could be done more efficiently w.r.t. the
    # de-padding being done above, but sigh...
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s


def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    acc = 0
    length = len(s)
    if length % 4:
        extra = (4 - length % 4)
        s = b'\000' * extra + s
        length = length + extra
    for i in range(0, length, 4):
        acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
    return acc
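
# Illustrative round-trip (editor's example):
#   bytes_to_long(b'\x01\x00')       -> 256
#   long_to_bytes(256)               -> b'\x01\x00'
#   long_to_bytes(256, blocksize=4)  -> b'\x00\x00\x01\x00'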
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data        input data
    @param {int} length        target length
    @returns {int[]}           padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
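
# Illustrative example (editor's sketch): pad a 2-byte message to a 16-byte block; only the
# marker bytes are deterministic, the filler bytes are random.
#   padded = pkcs1pad([0x01, 0x02], 16)
#   len(padded) == 16 and padded[:2] == [0, 2] and padded[-3:] == [0, 0x01, 0x02]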
def encode_base_n(num, n, table=None):
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret
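
# Illustrative examples (editor's sketch; default table is digits, then lowercase, then uppercase):
#   encode_base_n(35, 36) -> 'z'
#   encode_base_n(36, 36) -> '10'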
def decode_packed_codes(code):
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
def caesar(s, alphabet, shift):
    if shift == 0:
        return s
    l = len(alphabet)
    return ''.join(
        alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
        for c in s)


def rot47(s):
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
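
# Illustrative example (editor's sketch): characters outside the alphabet pass through unchanged.
#   caesar('ab-c', 'abc', 1) -> 'bc-a'
#   rot47(...) applies the same shift of 47 over the full printable-ASCII alphabet above.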
def parse_m3u8_attributes(attrib):
    info = {}
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]
        info[key] = val
    return info


def urshift(val, n):
    return val >> n if val >= 0 else (val + 0x100000000) >> n
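
# Illustrative examples (editor's sketch):
#   parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401f"')
#       -> {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401f'}
#   urshift(-1, 1) -> 0x7fffffff   (logical right shift on a 32-bit view of the value)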
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise OSError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data,
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise OSError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c
                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules
    from .dependencies import xattr

    if xattr:
        if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
            # Unicode arguments are not supported in pyxattr until version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            if version_tuple(xattr.__version__) >= (0, 5, 0):
                setxattr = xattr.set
            else:
                setxattr = xattr.setxattr
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
def random_birthday(year_field, month_field, day_field):
    start_date = datetime.date(1950, 1, 1)
    end_date = datetime.date(1995, 12, 31)
    offset = random.randint(0, (end_date - start_date).days)
    random_date = start_date + datetime.timedelta(offset)
    return {
        year_field: str(random_date.year),
        month_field: str(random_date.month),
        day_field: str(random_date.day),
    }
# Templates for internet shortcut files, which are plain text files.
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.


def to_high_limit_path(path):
    if sys.platform in ['win32', 'cygwin']:
        # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
        return '\\\\?\\' + os.path.abspath(path)

    return path
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=None):
    val = traverse_obj(obj, *variadic(field))
    if (not val and val != 0) if ignore is NO_DEFAULT else val in ignore:
        return default
    return template % (func(val) if func else val)
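
# Illustrative examples (editor's sketch):
#   format_field({'width': 1920}, 'width', '%dpx')                    -> '1920px'
#   format_field({'width': None}, 'width', '%dpx', default='unknown') -> 'unknown'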
def clean_podcast_url(url):
    return re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/''', '', url)


_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
def make_dir(path, to_screen=None):
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except OSError as err:
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False


def get_executable_path():
    from .update import _get_variant_and_executable_path

    return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
def load_plugins(name, suffix, namespace):
    classes = {}
    with contextlib.suppress(FileNotFoundError):
        plugins_spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        plugins = importlib.util.module_from_spec(plugins_spec)
        sys.modules[plugins_spec.name] = plugins
        plugins_spec.loader.exec_module(plugins)
        for name in dir(plugins):
            if name in namespace:
                continue
            if not name.endswith(suffix):
                continue
            klass = getattr(plugins, name)
            classes[name] = namespace[name] = klass
    return classes
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a:
                              - string: A dictionary key
                              - int: An index into a list
                              - tuple: A list of keys all of which will be traversed
                              - Ellipsis: Fetch all values in the object
                              - Function: Takes the key and value as arguments
                                and returns whether the key matches or not
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    '''
    if not casesense:
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    obj = enumerate(str(obj))
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
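
# Illustrative examples (editor's sketch of the semantics documented above):
#   traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', ..., 'b'))       -> [1, 2]
#   traverse_obj({'a': {'b': None}}, ('a', 'b'), default='missing')  -> 'missing'
#   traverse_obj({'A': 1}, 'a', casesense=False)                     -> 1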
def traverse_dict(dictn, keys, casesense=True):
    write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
def get_first(obj, keys, **kwargs):
    return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)


def variadic(x, allowed_types=(str, bytes, dict)):
    return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
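
# Illustrative examples (editor's sketch):
#   variadic('spam')  -> ('spam',)   # strings are treated as scalars, not iterated
#   variadic([1, 2])  -> [1, 2]
#   get_first([{'id': None}, {'id': 'x'}], 'id') -> 'x'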
def decode_base(value, digits):
    # This will convert given base-x string to scalar (long or int)
    table = {char: index for index, char in enumerate(digits)}
    result = 0
    base = len(digits)
    for chr in value:
        result *= base
        result += table[chr]
    return result
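
# Illustrative example (editor's sketch): decode_base inverts encode_base_n for the same digit table.
#   decode_base('10', '0123456789abcdefghijklmnopqrstuvwxyz') -> 36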
def time_seconds(**kwargs):
    t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
    return t.timestamp()
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
    return token
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    header_b64, payload_b64, signature_b64 = jwt.split('.')
    payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
    return payload_data
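
# Illustrative round-trip (editor's example; the encoder returns bytes in JWS Compact Serialization):
#   token = jwt_encode_hs256({'uid': 123}, 'secret-key')  # b'<header>.<payload>.<signature>'
#   jwt_decode_hs256(token.decode())                      -> {'uid': 123}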
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


def supports_terminal_sequences(stream):
    if compat_os_name == 'nt':
        if not WINDOWS_VT_MODE:
            return False
    elif not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
def windows_enable_vt_mode():  # TODO: Do this the proper way https://bugs.python.org/issue30075
    if get_windows_version() < (10, 0, 10586):
        return
    global WINDOWS_VT_MODE
    try:
        Popen.run('', shell=True)
    except Exception:
        return

    WINDOWS_VT_MODE = True
    supports_terminal_sequences.cache_clear()
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    return _terminal_sequences_re.sub('', string)


def number_of_digits(number):
    return len('%d' % number)
def join_nonempty(*values, delim='-', from_dict=None):
    if from_dict is not None:
        values = map(from_dict.get, values)
    return delim.join(map(str, filter(None, values)))
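
# Illustrative examples (editor's sketch): falsy values are dropped before joining.
#   join_nonempty('mp4', None, '', 'h264')                                  -> 'mp4-h264'
#   join_nonempty('width', 'height', from_dict={'width': 1920}, delim='x')  -> '1920'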
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    max_dimensions = max(
        (tuple(format.get(k) or 0 for k in _keys) for format in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        return thumbnails
    return [
        merge_dicts(
            {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
            dict(zip(_keys, max_dimensions)), thumbnail)
        for thumbnail in thumbnails
    ]
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    if not range:
        return None, None, None
    crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if not crg:
        return None, None, None
    return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
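
# Illustrative examples (editor's sketch):
#   parse_http_range('bytes=0-499')         -> (0, 499, None)
#   parse_http_range('bytes 500-999/1234')  -> (500, 999, 1234)
#   parse_http_range(None)                  -> (None, None, None)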
def read_stdin(what):
    eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
class Config:
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        assert not self.__initialized
        directory = ''
        if filename:
            location = os.path.realpath(filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.own_args, self.__initialized = args, True
        opts, _ = self.parser.parse_known_args(args)
        self.parsed_args, self.filename = args, filename

        for location in opts.config_locations or []:
            if location == '-':
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        try:
            optionf = open(filename)
        except OSError:
            return default  # silently skip if file is not present
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read()
            res = shlex.split(contents, comments=True)
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
class WebSocketsWrapper():
    """Wraps websockets module to use in non-async scopes"""
    pool = None

    def __init__(self, url, headers=None, connect=True):
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            self.loop.close()
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: if a new library that uses asyncio needs to be run in a non-async scope, move these functions out of this class
    @staticmethod
    def run_with_loop(main, loop):
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
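
# Illustrative example (editor's sketch): keys are normalized with str.title(), later dicts win.
#   merge_headers({'user-agent': 'UA1', 'Accept': '*/*'}, {'User-Agent': 'UA2'})
#       -> {'User-Agent': 'UA2', 'Accept': '*/*'}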
class classproperty:
    """classmethod(property(func)) that works in py < 3.9"""

    def __init__(self, func):
        functools.update_wrapper(self, func)
        self.func = func

    def __get__(self, _, cls):
        return self.func(cls)
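
# Illustrative usage (editor's sketch): the decorated function receives the class, not an instance.
#   class Extractor:
#       @classproperty
#       def name(cls):
#           return cls.__name__
#   Extractor.name  -> 'Extractor'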
class Namespace(types.SimpleNamespace):
    """Immutable namespace"""

    def __iter__(self):
        return iter(self.__dict__.values())

    @property
    def items_(self):
        return self.__dict__.items()


has_certifi = bool(certifi)
has_websockets = bool(websockets)