import xml.etree.ElementTree

from .compat import (
    compat_etree_fromstring,
    compat_html_entities_html5,
    compat_HTMLParseError,
    compat_urllib_parse_unquote_plus,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
)
from .dependencies import brotli, certifi, websockets
from .socks import ProxyType, sockssocket
def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
NO_DEFAULT = object()
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'f4f', 'f4m', 'm3u8', 'smil')
# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M:%S',
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
NUMBER_RE = r'\d+(?:\.\d+)?'
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'
    return pref
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
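# Illustrative usage (not part of the original source): xpath_with_ns expands
# namespace prefixes using the supplied mapping, e.g.
#   xpath_with_ns('ns:video/ns:id', {'ns': 'http://example.com/media'})
#   -> '{http://example.com/media}video/{http://example.com/media}id'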
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        return None
    return n
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        return None
    return n.text
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        return None
    return n.attrib[key]
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
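# Illustrative usage (not part of the original source): the class helpers above
# match a whole class token inside the attribute value, so for an assumed fragment
#   get_element_by_class('title', '<span class="title main">Hello</span>')
# would return 'Hello', while get_element_html_by_class returns the whole <span> tag.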
def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole,
        )
class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.tagstack.clear()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a dictionary of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except LockingUnsupportedError:
                stream = open(filename, open_mode)
            return (stream, filename)
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = '(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
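# Illustrative behaviour (not part of the original source), based on the rules above:
#   sanitize_filename('Foo/Bar: Baz')                -> 'Foo_Bar - Baz'
#   sanitize_filename('Foo: Bar?', restricted=True)  -> 'Foo_-_Bar'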
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
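# Illustrative usage (not part of the original source):
#   sanitize_url('//cdn.example.com/v.mp4')  -> 'http://cdn.example.com/v.mp4'
#   sanitize_url('rmtp://host/app/stream')   -> 'rtmp://host/app/stream'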
def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
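# Illustrative usage (not part of the original source): credentials are stripped from
# the netloc and returned as a ready-made Authorization value, e.g.
#   extract_basic_auth('http://user:pass@example.com/feed')
#   -> ('http://example.com/feed', 'Basic dXNlcjpwYXNz')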
def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )
def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise
class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        return process_communicate_or_kill(self, *args, **kwargs)
def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding
def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)
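# Illustrative usage (not part of the original source):
#   timetuple_from_msec(90_061_500) -> Time(hours=25, minutes=1, seconds=1, milliseconds=500)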
def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
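# Illustrative usage (not part of the original source):
#   formatSeconds(3661.5, msec=True) -> '1:01:01.500'
#   formatSeconds(75)                -> '1:15'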
def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)
def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()
    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message(before=';'):
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , '
           'filling out the appropriate issue template. '
           'Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)
network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super().__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None
class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)
    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return req
    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = compat_urllib_request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))
    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file
    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value)))
    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)
    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise compat_cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)
def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                        # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                      # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|             # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))  # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                       # optional space
                (?P<sign>\+|-)                             # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
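# Illustrative usage (not part of the original source):
#   parse_iso8601('2023-01-01T00:00:00Z')      -> 1672531200
#   parse_iso8601('2023-01-01T01:00:00+01:00') -> 1672531200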
def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
def unified_timestamp(date_str, day_first=True):
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
def determine_ext(url, default_ext='unknown_video'):
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
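# Illustrative usage (not part of the original source):
#   determine_ext('http://example.com/video.mp4?dl=1')  -> 'mp4'
#   determine_ext('http://example.com/path/', 'mp3')    -> 'mp3'  (falls back to default_ext)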
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    """
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
def date_from_str(date_str, format='%Y%m%d', strict=False):
    """
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
        raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months."""
    month = dt.month + months - 1
    year = dt.year + month // 12
    month = month % 12 + 1
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)
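# Illustrative usage (not part of the original source): the day is clamped to the
# length of the target month, e.g.
#   datetime_add_months(datetime.date(2020, 1, 31), 1) -> datetime.date(2020, 2, 29)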
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    roundto = lambda x, n: ((x + n / 2) // n) * n
    timestamp = calendar.timegm(dt.timetuple())
    return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str
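# Illustrative usage (not part of the original source):
#   hyphenate_date('20230105') -> '2023-01-05'
#   hyphenate_date('n/a')      -> 'n/a'  (returned unchanged when the pattern does not match)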
1844 """Represents a time interval between two dates"""
1846 def __init__(self
, start
=None, end
=None):
1847 """start and end must be strings in the format accepted by date"""
1848 if start
is not None:
1849 self
.start
= date_from_str(start
, strict
=True)
1851 self
.start
= datetime
.datetime
.min.date()
1853 self
.end
= date_from_str(end
, strict
=True)
1855 self
.end
= datetime
.datetime
.max.date()
1856 if self
.start
> self
.end
:
1857 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
1861 """Returns a range that only contains the given day"""
1862 return cls(day
, day
)
1864 def __contains__(self
, date
):
1865 """Check if the date is in the range"""
1866 if not isinstance(date
, datetime
.date
):
1867 date
= date_from_str(date
)
1868 return self
.start
<= date
<= self
.end
1871 return f
'{self.start.isoformat()} - {self.end.isoformat()}'
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res
def get_windows_version():
    ''' Get Windows version. None if it's not running on Windows '''
    if compat_os_name == 'nt':
        return version_tuple(platform.win32_ver()[1])
    else:
        return None
def write_string(s, out=None, encoding=None):
    assert isinstance(s, str)
    out = out or sys.stderr

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    if 'b' in getattr(out, 'mode', ''):
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)
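# Illustrative usage (not part of the original source):
#   bytes_to_intlist(b'abc')       -> [97, 98, 99]
#   intlist_to_bytes([97, 98, 99]) -> b'abc'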
class LockingUnsupportedError(IOError):
    msg = 'File locking is not supported on this platform'

    def __init__(self):
        super().__init__(self.msg)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
class locked_file:
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno != 29:  # Illegal seek, expected when self.f is a FIFO
                    raise
        return self

    def unlock(self):
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    def __getattr__(self, attr):
        return getattr(self.f, attr)
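# Illustrative usage sketch (hypothetical file name): locked_file is meant to
# be used as a context manager, so the lock is taken in __enter__ and released,
# with the handle closed, in __exit__:
#
#     >>> with locked_file('archive.txt', 'a', block=True) as f:
#     ...     f.write('example line\n')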
def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'
def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
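# Illustrative round trip (hypothetical URL): smuggle_url() appends the extra
# data as a '#__youtubedl_smuggle=...' fragment and unsmuggle_url() recovers it:
#
#     >>> smugged = smuggle_url('https://example.com/video', {'referer': 'https://example.com/'})
#     >>> unsmuggle_url(smugged)
#     ('https://example.com/video', {'referer': 'https://example.com/'})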
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)


def format_bytes(bytes):
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
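# Quick sanity examples (illustrative): with the default factor of 1000 the
# suffix comes from 'kMGTPEZY', while format_bytes() uses factor=1024 and the
# binary 'KiB'/'MiB' style:
#
#     >>> format_decimal_suffix(1234000)
#     '1M'
#     >>> format_bytes(1048576)
#     '1.00MiB'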
def lookup_unit_table(unit_table, s):
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
def parse_filesize(s):
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    # (only the larger spelled-out units survive in this excerpt; the full
    # table also covers bytes, kilobytes, kibibytes and friends)
    _UNIT_TABLE = {
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
def parse_count(s):
    if s is None:
        return None

    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    ret = lookup_unit_table(_UNIT_TABLE, s)
    if ret is not None:
        return ret

    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    if mobj:
        return str_to_int(mobj.group(1))
def parse_resolution(s, *, lenient=False):
    if s is None:
        return {}

    if lenient:
        mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
    else:
        mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if mobj:
        return {
            'width': int(mobj.group('w')),
            'height': int(mobj.group('h')),
        }

    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        return {'height': int(mobj.group(1)) * 540}

    return {}
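# Illustrative examples: dimensions are pulled from free-form strings, with
# fallbacks for 'NNNp' notation and the '4k'/'8k' shorthands:
#
#     >>> parse_resolution('1920x1080')
#     {'width': 1920, 'height': 1080}
#     >>> parse_resolution('720p')
#     {'height': 720}
#     >>> parse_resolution('4k')
#     {'height': 2160}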
def parse_bitrate(s):
    if not isinstance(s, compat_str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    if mobj:
        return int(mobj.group(1))
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        return month_names.index(name) + 1
    except ValueError:
        return None


def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviation """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode()
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    return s[len(start):] if s is not None and s.startswith(start) else s


def remove_end(s, end):
    return s[:-len(end)] if s is not None and s.endswith(end) else s


def remove_quotes(s):
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s
def get_domain(url):
    domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    return domain.group('domain') if domain else None


def url_basename(url):
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').split('/')[-1]


def base_url(url):
    return re.match(r'https?://[^?#&]+/', url).group()
def urljoin(base, path):
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, compat_str) or not path:
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, compat_str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
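# Illustrative behaviour (hypothetical URLs): already-absolute paths are
# returned unchanged, relative paths are resolved against the base:
#
#     >>> urljoin('https://example.com/a/', 'b/c.mp4')
#     'https://example.com/a/b/c.mp4'
#     >>> urljoin('https://example.com/a/', '//cdn.example.com/d.mp4')
#     '//cdn.example.com/d.mp4'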
class HEADRequest(compat_urllib_request.Request):
    def get_method(self):
        return 'HEAD'


class PUTRequest(compat_urllib_request.Request):
    def get_method(self):
        return 'PUT'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default


def str_or_none(v, default=None):
    return default if v is None else compat_str(v)


def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    elif isinstance(int_str, compat_str):
        int_str = re.sub(r'[,\.\+]', '', int_str)
        return int_or_none(int_str)


def float_or_none(v, scale=1, invscale=1, default=None):
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default


def bool_or_none(v, default=None):
    return v if isinstance(v, bool) else default


def strip_or_none(v, default=None):
    return v.strip() if isinstance(v, compat_str) else default
def url_or_none(url):
    if not url or not isinstance(url, compat_str):
        return None
    url = url.strip()
    return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None


def request_to_url(req):
    if isinstance(req, compat_urllib_request.Request):
        return req.get_full_url()
    else:
        return req
def strftime_or_none(timestamp, date_format, default=None):
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
def parse_duration(s):
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    if ms:
        ms = ms.replace(':', '.')
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
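# Illustrative examples: both clock-style and spelled-out durations are
# accepted, and the result is a float number of seconds:
#
#     >>> parse_duration('1:02:03')
#     3723.0
#     >>> parse_duration('2h 30m')
#     9000.0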
def prepend_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return (
        f'{name}.{ext}{real_ext}'
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else f'{filename}.{ext}')


def replace_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return '{}.{}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
2524 def check_executable(exe
, args
=[]):
2525 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2526 args can be a list of arguments for a short output (like -version) """
2528 Popen([exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
).communicate_or_kill()
2534 def _get_exe_version_output(exe
, args
, *, to_screen
=None):
2536 to_screen(f
'Checking exe version: {shell_quote([exe] + args)}')
2538 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2539 # SIGTTOU if yt-dlp is run in the background.
2540 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2542 [encodeArgument(exe
)] + args
, stdin
=subprocess
.PIPE
,
2543 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
).communicate_or_kill()
2546 if isinstance(out
, bytes): # Python 2.x
2547 out
= out
.decode('ascii', 'ignore')
2551 def detect_exe_version(output
, version_re
=None, unrecognized
='present'):
2552 assert isinstance(output
, compat_str
)
2553 if version_re
is None:
2554 version_re
= r
'version\s+([-0-9._a-zA-Z]+)'
2555 m
= re
.search(version_re
, output
)
2562 def get_exe_version(exe
, args
=['--version'],
2563 version_re
=None, unrecognized
='present'):
2564 """ Returns the version of the specified executable,
2565 or False if the executable is not present """
2566 out
= _get_exe_version_output(exe
, args
)
2567 return detect_exe_version(out
, version_re
, unrecognized
) if out
else False
2570 class LazyList(collections
.abc
.Sequence
):
2571 """Lazy immutable list from an iterable
2572 Note that slices of a LazyList are lists and not LazyList"""
2574 class IndexError(IndexError):
2577 def __init__(self
, iterable
, *, reverse
=False, _cache
=None):
2578 self
._iterable
= iter(iterable
)
2579 self
._cache
= [] if _cache
is None else _cache
2580 self
._reversed
= reverse
2584 # We need to consume the entire iterable to iterate in reverse
2585 yield from self
.exhaust()
2587 yield from self
._cache
2588 for item
in self
._iterable
:
2589 self
._cache
.append(item
)
2593 self
._cache
.extend(self
._iterable
)
2594 self
._iterable
= [] # Discard the emptied iterable to make it pickle-able
2598 """Evaluate the entire iterable"""
2599 return self
._exhaust
()[::-1 if self
._reversed
else 1]
2602 def _reverse_index(x
):
2603 return None if x
is None else -(x
+ 1)
2605 def __getitem__(self
, idx
):
2606 if isinstance(idx
, slice):
2608 idx
= slice(self
._reverse
_index
(idx
.start
), self
._reverse
_index
(idx
.stop
), -(idx
.step
or 1))
2609 start
, stop
, step
= idx
.start
, idx
.stop
, idx
.step
or 1
2610 elif isinstance(idx
, int):
2612 idx
= self
._reverse
_index
(idx
)
2613 start
, stop
, step
= idx
, idx
, 0
2615 raise TypeError('indices must be integers or slices')
2616 if ((start
or 0) < 0 or (stop
or 0) < 0
2617 or (start
is None and step
< 0)
2618 or (stop
is None and step
> 0)):
2619 # We need to consume the entire iterable to be able to slice from the end
2620 # Obviously, never use this with infinite iterables
2623 return self
._cache
[idx
]
2624 except IndexError as e
:
2625 raise self
.IndexError(e
) from e
2626 n
= max(start
or 0, stop
or 0) - len(self
._cache
) + 1
2628 self
._cache
.extend(itertools
.islice(self
._iterable
, n
))
2630 return self
._cache
[idx
]
2631 except IndexError as e
:
2632 raise self
.IndexError(e
) from e
2636 self
[-1] if self
._reversed
else self
[0]
2637 except self
.IndexError:
2643 return len(self
._cache
)
2645 def __reversed__(self
):
2646 return type(self
)(self
._iterable
, reverse
=not self
._reversed
, _cache
=self
._cache
)
2649 return type(self
)(self
._iterable
, reverse
=self
._reversed
, _cache
=self
._cache
)
2652 # repr and str should mimic a list. So we exhaust the iterable
2653 return repr(self
.exhaust())
2656 return repr(self
.exhaust())
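# Illustrative usage (any iterable works): items are consumed lazily and
# cached, and slicing returns a plain list rather than a LazyList:
#
#     >>> lazy = LazyList(itertools.count())
#     >>> lazy[10]
#     10
#     >>> lazy[:3]
#     [0, 1, 2]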
2661 class IndexError(IndexError):
2665 # This is only useful for tests
2666 return len(self
.getslice())
2668 def __init__(self
, pagefunc
, pagesize
, use_cache
=True):
2669 self
._pagefunc
= pagefunc
2670 self
._pagesize
= pagesize
2671 self
._pagecount
= float('inf')
2672 self
._use
_cache
= use_cache
2675 def getpage(self
, pagenum
):
2676 page_results
= self
._cache
.get(pagenum
)
2677 if page_results
is None:
2678 page_results
= [] if pagenum
> self
._pagecount
else list(self
._pagefunc
(pagenum
))
2680 self
._cache
[pagenum
] = page_results
2683 def getslice(self
, start
=0, end
=None):
2684 return list(self
._getslice
(start
, end
))
2686 def _getslice(self
, start
, end
):
2687 raise NotImplementedError('This method must be implemented by subclasses')
2689 def __getitem__(self
, idx
):
2690 assert self
._use
_cache
, 'Indexing PagedList requires cache'
2691 if not isinstance(idx
, int) or idx
< 0:
2692 raise TypeError('indices must be non-negative integers')
2693 entries
= self
.getslice(idx
, idx
+ 1)
2695 raise self
.IndexError()
2699 class OnDemandPagedList(PagedList
):
2700 """Download pages until a page with less than maximum results"""
2702 def _getslice(self
, start
, end
):
2703 for pagenum
in itertools
.count(start
// self
._pagesize
):
2704 firstid
= pagenum
* self
._pagesize
2705 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
2706 if start
>= nextfirstid
:
2710 start
% self
._pagesize
2711 if firstid
<= start
< nextfirstid
2714 ((end
- 1) % self
._pagesize
) + 1
2715 if (end
is not None and firstid
<= end
<= nextfirstid
)
2719 page_results
= self
.getpage(pagenum
)
2721 self
._pagecount
= pagenum
- 1
2723 if startv
!= 0 or endv
is not None:
2724 page_results
= page_results
[startv
:endv
]
2725 yield from page_results
2727 # A little optimization - if current page is not "full", ie. does
2728 # not contain page_size videos then we can assume that this page
2729 # is the last one - there are no more ids on further pages -
2730 # i.e. no need to query again.
2731 if len(page_results
) + startv
< self
._pagesize
:
2734 # If we got the whole page, but the next page is not interesting,
2735 # break out early as well
2736 if end
== nextfirstid
:
2740 class InAdvancePagedList(PagedList
):
2741 """PagedList with total number of pages known in advance"""
2743 def __init__(self
, pagefunc
, pagecount
, pagesize
):
2744 PagedList
.__init
__(self
, pagefunc
, pagesize
, True)
2745 self
._pagecount
= pagecount
2747 def _getslice(self
, start
, end
):
2748 start_page
= start
// self
._pagesize
2749 end_page
= self
._pagecount
if end
is None else min(self
._pagecount
, end
// self
._pagesize
+ 1)
2750 skip_elems
= start
- start_page
* self
._pagesize
2751 only_more
= None if end
is None else end
- start
2752 for pagenum
in range(start_page
, end_page
):
2753 page_results
= self
.getpage(pagenum
)
2755 page_results
= page_results
[skip_elems
:]
2757 if only_more
is not None:
2758 if len(page_results
) < only_more
:
2759 only_more
-= len(page_results
)
2761 yield from page_results
[:only_more
]
2763 yield from page_results
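# Illustrative sketch (hypothetical page function): OnDemandPagedList keeps
# requesting pages until one comes back short of the page size, so three full
# pages of 10 entries followed by an empty page yield 30 entries in total:
#
#     >>> pages = OnDemandPagedList(lambda n: list(range(n * 10, n * 10 + 10)) if n < 3 else [], 10)
#     >>> len(pages.getslice())
#     30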
2766 def uppercase_escape(s
):
2767 unicode_escape
= codecs
.getdecoder('unicode_escape')
2769 r
'\\U[0-9a-fA-F]{8}',
2770 lambda m
: unicode_escape(m
.group(0))[0],
2774 def lowercase_escape(s
):
2775 unicode_escape
= codecs
.getdecoder('unicode_escape')
2777 r
'\\u[0-9a-fA-F]{4}',
2778 lambda m
: unicode_escape(m
.group(0))[0],
2782 def escape_rfc3986(s
):
2783 """Escape non-ASCII characters as suggested by RFC 3986"""
2784 return urllib
.parse
.quote(s
, b
"%/;:@&=+$,!~*'()?#[]")
2787 def escape_url(url
):
2788 """Escape URL as suggested by RFC 3986"""
2789 url_parsed
= compat_urllib_parse_urlparse(url
)
2790 return url_parsed
._replace
(
2791 netloc
=url_parsed
.netloc
.encode('idna').decode('ascii'),
2792 path
=escape_rfc3986(url_parsed
.path
),
2793 params
=escape_rfc3986(url_parsed
.params
),
2794 query
=escape_rfc3986(url_parsed
.query
),
2795 fragment
=escape_rfc3986(url_parsed
.fragment
)
2800 return compat_parse_qs(compat_urllib_parse_urlparse(url
).query
)
2803 def read_batch_urls(batch_fd
):
2805 if not isinstance(url
, compat_str
):
2806 url
= url
.decode('utf-8', 'replace')
2807 BOM_UTF8
= ('\xef\xbb\xbf', '\ufeff')
2808 for bom
in BOM_UTF8
:
2809 if url
.startswith(bom
):
2810 url
= url
[len(bom
):]
2812 if not url
or url
.startswith(('#', ';', ']')):
2814 # "#" cannot be stripped out since it is part of the URI
2815 # However, it can be safely stipped out if follwing a whitespace
2816 return re
.split(r
'\s#', url
, 1)[0].rstrip()
2818 with contextlib
.closing(batch_fd
) as fd
:
2819 return [url
for url
in map(fixup
, fd
) if url
]
2822 def urlencode_postdata(*args
, **kargs
):
2823 return compat_urllib_parse_urlencode(*args
, **kargs
).encode('ascii')
2826 def update_url_query(url
, query
):
2829 parsed_url
= compat_urlparse
.urlparse(url
)
2830 qs
= compat_parse_qs(parsed_url
.query
)
2832 return compat_urlparse
.urlunparse(parsed_url
._replace
(
2833 query
=compat_urllib_parse_urlencode(qs
, True)))
2836 def update_Request(req
, url
=None, data
=None, headers
={}, query={}
):
2837 req_headers
= req
.headers
.copy()
2838 req_headers
.update(headers
)
2839 req_data
= data
or req
.data
2840 req_url
= update_url_query(url
or req
.get_full_url(), query
)
2841 req_get_method
= req
.get_method()
2842 if req_get_method
== 'HEAD':
2843 req_type
= HEADRequest
2844 elif req_get_method
== 'PUT':
2845 req_type
= PUTRequest
2847 req_type
= compat_urllib_request
.Request
2849 req_url
, data
=req_data
, headers
=req_headers
,
2850 origin_req_host
=req
.origin_req_host
, unverifiable
=req
.unverifiable
)
2851 if hasattr(req
, 'timeout'):
2852 new_req
.timeout
= req
.timeout
2856 def _multipart_encode_impl(data
, boundary
):
2857 content_type
= 'multipart/form-data; boundary=%s' % boundary
2860 for k
, v
in data
.items():
2861 out
+= b
'--' + boundary
.encode('ascii') + b
'\r\n'
2862 if isinstance(k
, compat_str
):
2864 if isinstance(v
, compat_str
):
2866 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2867 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2868 content
= b
'Content-Disposition: form-data; name="' + k
+ b
'"\r\n\r\n' + v
+ b
'\r\n'
2869 if boundary
.encode('ascii') in content
:
2870 raise ValueError('Boundary overlaps with data')
2873 out
+= b
'--' + boundary
.encode('ascii') + b
'--\r\n'
2875 return out
, content_type
2878 def multipart_encode(data
, boundary
=None):
2880 Encode a dict to RFC 7578-compliant form-data
2883 A dict where keys and values can be either Unicode or bytes-like
2886 If specified a Unicode object, it's used as the boundary. Otherwise
2887 a random boundary is generated.
2889 Reference: https://tools.ietf.org/html/rfc7578
2891 has_specified_boundary
= boundary
is not None
2894 if boundary
is None:
2895 boundary
= '---------------' + str(random
.randrange(0x0fffffff, 0xffffffff))
2898 out
, content_type
= _multipart_encode_impl(data
, boundary
)
2901 if has_specified_boundary
:
2905 return out
, content_type
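# Illustrative usage (hypothetical form fields): the helper returns the encoded
# body together with the matching Content-Type value:
#
#     >>> body, content_type = multipart_encode({b'field': b'value'}, boundary='AaB03x')
#     >>> content_type
#     'multipart/form-data; boundary=AaB03x'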
2908 def dict_get(d
, key_or_keys
, default
=None, skip_false_values
=True):
2909 for val
in map(d
.get
, variadic(key_or_keys
)):
2910 if val
is not None and (val
or not skip_false_values
):
2915 def try_call(*funcs
, expected_type
=None, args
=[], kwargs
={}):
2918 val
= f(*args
, **kwargs
)
2919 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
2922 if expected_type
is None or isinstance(val
, expected_type
):
2926 def try_get(src
, getter
, expected_type
=None):
2927 return try_call(*variadic(getter
), args
=(src
,), expected_type
=expected_type
)
2930 def filter_dict(dct
, cndn
=lambda _
, v
: v
is not None):
2931 return {k: v for k, v in dct.items() if cndn(k, v)}
2934 def merge_dicts(*dicts
):
2936 for a_dict
in dicts
:
2937 for k
, v
in a_dict
.items():
2938 if (v
is not None and k
not in merged
2939 or isinstance(v
, str) and merged
[k
] == ''):
2944 def encode_compat_str(string
, encoding
=preferredencoding(), errors
='strict'):
2945 return string
if isinstance(string
, compat_str
) else compat_str(string
, encoding
, errors
)
2957 TV_PARENTAL_GUIDELINES
= {
2967 def parse_age_limit(s
):
2968 # isinstance(False, int) is True. So type() must be used instead
2970 return s
if 0 <= s
<= 21 else None
2971 elif not isinstance(s
, str):
2973 m
= re
.match(r
'^(?P<age>\d{1,2})\+?$', s
)
2975 return int(m
.group('age'))
2978 return US_RATINGS
[s
]
2979 m
= re
.match(r
'^TV[_-]?(%s)$' % '|'.join(k
[3:] for k
in TV_PARENTAL_GUIDELINES
), s
)
2981 return TV_PARENTAL_GUIDELINES
['TV-' + m
.group(1)]
2985 def strip_jsonp(code
):
2988 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
2989 (?:\s*&&\s*(?P=func_name))?
2990 \s*\(\s*(?P<callback_data>.*)\);?
2991 \s*?(?://[^\n]*)*$''',
2992 r
'\g<callback_data>', code
)
2995 def js_to_json(code
, vars={}):
2996 # vars is a dict of var, val pairs to substitute
2997 COMMENT_RE
= r
'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
2998 SKIP_RE
= fr
'\s*(?:{COMMENT_RE})?\s*'
3000 (fr
'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3001 (fr
'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3006 if v
in ('true', 'false', 'null'):
3008 elif v
in ('undefined', 'void 0'):
3010 elif v
.startswith('/*') or v
.startswith('//') or v
.startswith('!') or v
== ',':
3013 if v
[0] in ("'", '"'):
3014 v
= re
.sub(r
'(?s)\\.|"', lambda m
: {
3019 }.get(m
.group(0), m
.group(0)), v
[1:-1])
3021 for regex
, base
in INTEGER_TABLE
:
3022 im
= re
.match(regex
, v
)
3024 i
= int(im
.group(1), base
)
3025 return '"%d":' % i
if v
.endswith(':') else '%d' % i
3032 code
= re
.sub(r
'new Date\((".+")\)', r
'\g<1>', code
)
3034 return re
.sub(r
'''(?sx)
3035 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3036 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3037 {comment}|,(?={skip}[\]}}])|
3038 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3039 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3042 '''.format(comment
=COMMENT_RE
, skip
=SKIP_RE
), fix_kv
, code
)
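# Illustrative example: unquoted keys, single-quoted strings and trailing
# commas are rewritten into strict JSON:
#
#     >>> js_to_json("{a: 1, b: 'two',}")
#     '{"a": 1, "b": "two"}'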
3045 def qualities(quality_ids
):
3046 """ Get a numeric quality value out of a list of possible values """
3049 return quality_ids
.index(qid
)
3055 POSTPROCESS_WHEN
= ('pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist')
3059 'default': '%(title)s [%(id)s].%(ext)s',
3060 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3066 'description': 'description',
3067 'annotation': 'annotations.xml',
3068 'infojson': 'info.json',
3071 'pl_thumbnail': None,
3072 'pl_description': 'description',
3073 'pl_infojson': 'info.json',
3076 # As of [1] format syntax is:
3077 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3078 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3079 STR_FORMAT_RE_TMPL
= r
'''(?x)
3080 (?<!%)(?P<prefix>(?:%%)*)
3082 (?P<has_key>\((?P<key>{0})\))?
3084 (?P<conversion>[#0\-+ ]+)?
3086 (?P<precision>\.\d+)?
3087 (?P<len_mod>[hlL])? # unused in python
3088 {1} # conversion type
3093 STR_FORMAT_TYPES
= 'diouxXeEfFgGcrs'
3096 def limit_length(s
, length
):
3097 """ Add ellipses to overly long strings """
3102 return s
[:length
- len(ELLIPSES
)] + ELLIPSES
3106 def version_tuple(v
):
3107 return tuple(int(e
) for e
in re
.split(r
'[-.]', v
))
3110 def is_outdated_version(version
, limit
, assume_new
=True):
3112 return not assume_new
3114 return version_tuple(version
) < version_tuple(limit
)
3116 return not assume_new
3119 def ytdl_is_updateable():
3120 """ Returns if yt-dlp can be updated with -U """
3122 from .update
import is_non_updateable
3124 return not is_non_updateable()
3127 def args_to_str(args
):
3128 # Get a short string representation for a subprocess command
3129 return ' '.join(compat_shlex_quote(a
) for a
in args
)
3132 def error_to_compat_str(err
):
3136 def error_to_str(err
):
3137 return f
'{type(err).__name__}: {err}'
3140 def mimetype2ext(mt
):
3144 mt
, _
, params
= mt
.partition(';')
3149 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3150 # it's the most popular one
3151 'audio/mpeg': 'mp3',
3152 'audio/x-wav': 'wav',
3154 'audio/wave': 'wav',
3157 ext
= FULL_MAP
.get(mt
)
3163 'smptett+xml': 'tt',
3167 'x-mp4-fragmented': 'mp4',
3168 'x-ms-sami': 'sami',
3171 'x-mpegurl': 'm3u8',
3172 'vnd.apple.mpegurl': 'm3u8',
3176 'vnd.ms-sstr+xml': 'ism',
3180 'filmstrip+json': 'fs',
3184 _
, _
, subtype
= mt
.rpartition('/')
3185 ext
= SUBTYPE_MAP
.get(subtype
.lower())
3196 _
, _
, suffix
= subtype
.partition('+')
3197 ext
= SUFFIX_MAP
.get(suffix
)
3201 return subtype
.replace('+', '.')
3204 def ext2mimetype(ext_or_url
):
3207 if '.' not in ext_or_url
:
3208 ext_or_url
= f
'file.{ext_or_url}'
3209 return mimetypes
.guess_type(ext_or_url
)[0]
3212 def parse_codecs(codecs_str
):
3213 # http://tools.ietf.org/html/rfc6381
3216 split_codecs
= list(filter(None, map(
3217 str.strip
, codecs_str
.strip().strip(',').split(','))))
3218 vcodec
, acodec
, scodec
, hdr
= None, None, None, None
3219 for full_codec
in split_codecs
:
3220 parts
= full_codec
.split('.')
3221 codec
= parts
[0].replace('0', '')
3222 if codec
in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3223 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3225 vcodec
= '.'.join(parts
[:4]) if codec
in ('vp9', 'av1', 'hvc1') else full_codec
3226 if codec
in ('dvh1', 'dvhe'):
3228 elif codec
== 'av1' and len(parts
) > 3 and parts
[3] == '10':
3230 elif full_codec
.replace('0', '').startswith('vp9.2'):
3232 elif codec
in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3235 elif codec
in ('stpp', 'wvtt',):
3239 write_string(f
'WARNING: Unknown codec {full_codec}\n')
3240 if vcodec
or acodec
or scodec
:
3242 'vcodec': vcodec
or 'none',
3243 'acodec': acodec
or 'none',
3244 'dynamic_range': hdr
,
3245 **({'scodec': scodec}
if scodec
is not None else {}),
3247 elif len(split_codecs
) == 2:
3249 'vcodec': split_codecs
[0],
3250 'acodec': split_codecs
[1],
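# Illustrative example (hypothetical codecs string): a combined RFC 6381
# codecs attribute is split into separate video and audio codec fields:
#
#     >>> parse_codecs('avc1.64001f,mp4a.40.2')
#     {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}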
3255 def urlhandle_detect_ext(url_handle
):
3256 getheader
= url_handle
.headers
.get
3258 cd
= getheader('Content-Disposition')
3260 m
= re
.match(r
'attachment;\s*filename="(?P<filename>[^"]+)"', cd
)
3262 e
= determine_ext(m
.group('filename'), default_ext
=None)
3266 return mimetype2ext(getheader('Content-Type'))
3269 def encode_data_uri(data
, mime_type
):
3270 return 'data:%s;base64,%s' % (mime_type
, base64
.b64encode(data
).decode('ascii'))
3273 def age_restricted(content_limit
, age_limit
):
3274 """ Returns True iff the content should be blocked """
3276 if age_limit
is None: # No limit set
3278 if content_limit
is None:
3279 return False # Content available for everyone
3280 return age_limit
< content_limit
3283 def is_html(first_bytes
):
3284 """ Detect whether a file contains HTML by examining its first bytes. """
3287 (b
'\xef\xbb\xbf', 'utf-8'),
3288 (b
'\x00\x00\xfe\xff', 'utf-32-be'),
3289 (b
'\xff\xfe\x00\x00', 'utf-32-le'),
3290 (b
'\xff\xfe', 'utf-16-le'),
3291 (b
'\xfe\xff', 'utf-16-be'),
3293 for bom
, enc
in BOMS
:
3294 if first_bytes
.startswith(bom
):
3295 s
= first_bytes
[len(bom
):].decode(enc
, 'replace')
3298 s
= first_bytes
.decode('utf-8', 'replace')
3300 return re
.match(r
'^\s*<', s
)
3303 def determine_protocol(info_dict
):
3304 protocol
= info_dict
.get('protocol')
3305 if protocol
is not None:
3308 url
= sanitize_url(info_dict
['url'])
3309 if url
.startswith('rtmp'):
3311 elif url
.startswith('mms'):
3313 elif url
.startswith('rtsp'):
3316 ext
= determine_ext(url
)
3322 return compat_urllib_parse_urlparse(url
).scheme
3325 def render_table(header_row
, data
, delim
=False, extra_gap
=0, hide_empty
=False):
3326 """ Render a list of rows, each as a list of values.
3327 Text after a \t will be right aligned """
3329 return len(remove_terminal_sequences(string
).replace('\t', ''))
3331 def get_max_lens(table
):
3332 return [max(width(str(v
)) for v
in col
) for col
in zip(*table
)]
3334 def filter_using_list(row
, filterArray
):
3335 return [col
for take
, col
in itertools
.zip_longest(filterArray
, row
, fillvalue
=True) if take
]
3337 max_lens
= get_max_lens(data
) if hide_empty
else []
3338 header_row
= filter_using_list(header_row
, max_lens
)
3339 data
= [filter_using_list(row
, max_lens
) for row
in data
]
3341 table
= [header_row
] + data
3342 max_lens
= get_max_lens(table
)
3345 table
= [header_row
, [delim
* (ml
+ extra_gap
) for ml
in max_lens
]] + data
3346 table
[1][-1] = table
[1][-1][:-extra_gap
* len(delim
)] # Remove extra_gap from end of delimiter
3348 for pos
, text
in enumerate(map(str, row
)):
3350 row
[pos
] = text
.replace('\t', ' ' * (max_lens
[pos
] - width(text
))) + ' ' * extra_gap
3352 row
[pos
] = text
+ ' ' * (max_lens
[pos
] - width(text
) + extra_gap
)
3353 ret
= '\n'.join(''.join(row
).rstrip() for row
in table
)
3357 def _match_one(filter_part
, dct
, incomplete
):
3358 # TODO: Generalize code with YoutubeDL._build_format_filter
3359 STRING_OPERATORS
= {
3360 '*=': operator
.contains
,
3361 '^=': lambda attr
, value
: attr
.startswith(value
),
3362 '$=': lambda attr
, value
: attr
.endswith(value
),
3363 '~=': lambda attr
, value
: re
.search(value
, attr
),
3365 COMPARISON_OPERATORS
= {
3367 '<=': operator
.le
, # "<=" must be defined above "<"
3374 if isinstance(incomplete
, bool):
3375 is_incomplete
= lambda _
: incomplete
3377 is_incomplete
= lambda k
: k
in incomplete
3379 operator_rex
= re
.compile(r
'''(?x)\s*
3381 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3383 (?P<quote>["\'])(?P
<quotedstrval
>.+?
)(?P
=quote
)|
3387 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3388 m = operator_rex.search(filter_part)
3391 unnegated_op = COMPARISON_OPERATORS[m['op']]
3393 op = lambda attr, value: not unnegated_op(attr, value)
3396 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3398 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3399 actual_value = dct.get(m['key'])
3400 numeric_comparison = None
3401 if isinstance(actual_value, (int, float)):
3402 # If the original field is a string and matching comparisonvalue is
3403 # a number we should respect the origin of the original field
3404 # and process comparison value as a string (see
3405 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3407 numeric_comparison = int(comparison_value)
3409 numeric_comparison = parse_filesize(comparison_value)
3410 if numeric_comparison is None:
3411 numeric_comparison = parse_filesize(f'{comparison_value}B')
3412 if numeric_comparison is None:
3413 numeric_comparison = parse_duration(comparison_value)
3414 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3415 raise ValueError('Operator %s only supports string values!' % m['op'])
3416 if actual_value is None:
3417 return is_incomplete(m['key']) or m['none_inclusive']
3418 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3421 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3422 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3424 operator_rex = re.compile(r'''(?x
)\s
*
3425 (?P
<op
>%s)\s
*(?P
<key
>[a
-z_
]+)
3427 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3428 m = operator_rex.search(filter_part)
3430 op = UNARY_OPERATORS[m.group('op')]
3431 actual_value = dct.get(m.group('key'))
3432 if is_incomplete(m.group('key')) and actual_value is None:
3434 return op(actual_value)
3436 raise ValueError('Invalid filter part %r' % filter_part)
3439 def match_str(filter_str, dct, incomplete=False):
3440 """ Filter a dictionary with a simple string syntax.
3441 @returns Whether the filter passes
3442 @param incomplete Set of keys that is expected to be missing from dct.
3443 Can be True/False to indicate all/none of the keys may be missing.
3444 All conditions on incomplete keys pass if the key is missing
3447 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3448 for filter_part in re.split(r'(?<!\\)&', filter_str))
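# Illustrative examples: '&'-separated clauses must all pass; a trailing '?'
# on the operator makes a clause tolerant of missing fields:
#
#     >>> match_str('duration > 60 & like_count > ?100', {'duration': 120})
#     True
#     >>> match_str('duration > 60', {'duration': 30})
#     False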
3451 def match_filter_func(filters):
3454 filters = set(variadic(filters))
3456 interactive = '-' in filters
3460 def _match_func(info_dict, incomplete=False):
3461 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3462 return NO_DEFAULT if interactive and not incomplete else None
3464 video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3465 filter_str = ') | ('.join(map(str.strip, filters))
3466 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3470 def parse_dfxp_time_expr(time_expr):
3474 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3476 return float(mobj.group('time_offset'))
3478 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3480 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3483 def srt_subtitles_timecode(seconds):
3484 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3487 def ass_subtitles_timecode(seconds):
3488 time = timetuple_from_msec(seconds * 1000)
3489 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3492 def dfxp2srt(dfxp_data):
3494 @param dfxp_data A
bytes-like
object containing DFXP data
3495 @returns A
unicode object containing converted SRT data
3497 LEGACY_NAMESPACES = (
3498 (b'http://www.w3.org/ns/ttml', [
3499 b'http://www.w3.org/2004/11/ttaf1',
3500 b'http://www.w3.org/2006/04/ttaf1',
3501 b'http://www.w3.org/2006/10/ttaf1',
3503 (b'http://www.w3.org/ns/ttml#styling', [
3504 b'http://www.w3.org/ns/ttml#style',
3508 SUPPORTED_STYLING = [
3517 _x = functools.partial(xpath_with_ns, ns_map={
3518 'xml': 'http://www.w3.org/XML/1998/namespace',
3519 'ttml': 'http://www.w3.org/ns/ttml',
3520 'tts': 'http://www.w3.org/ns/ttml#styling',
3526 class TTMLPElementParser:
3528 _unclosed_elements = []
3529 _applied_styles = []
3531 def start(self, tag, attrib):
3532 if tag in (_x('ttml:br'), 'br'):
3535 unclosed_elements = []
3537 element_style_id = attrib.get('style')
3539 style.update(default_style)
3540 if element_style_id:
3541 style.update(styles.get(element_style_id, {}))
3542 for prop in SUPPORTED_STYLING:
3543 prop_val = attrib.get(_x('tts:' + prop))
3545 style[prop] = prop_val
3548 for k, v in sorted(style.items()):
3549 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3552 font += ' color="%s"' % v
3553 elif k == 'fontSize':
3554 font += ' size="%s"' % v
3555 elif k == 'fontFamily':
3556 font += ' face="%s"' % v
3557 elif k == 'fontWeight' and v == 'bold':
3559 unclosed_elements.append('b')
3560 elif k == 'fontStyle' and v == 'italic':
3562 unclosed_elements.append('i')
3563 elif k == 'textDecoration' and v == 'underline':
3565 unclosed_elements.append('u')
3567 self._out += '<font' + font + '>'
3568 unclosed_elements.append('font')
3570 if self._applied_styles:
3571 applied_style.update(self._applied_styles[-1])
3572 applied_style.update(style)
3573 self._applied_styles.append(applied_style)
3574 self._unclosed_elements.append(unclosed_elements)
3577 if tag not in (_x('ttml:br'), 'br'):
3578 unclosed_elements = self._unclosed_elements.pop()
3579 for element in reversed(unclosed_elements):
3580 self._out += '</%s>' % element
3581 if unclosed_elements and self._applied_styles:
3582 self._applied_styles.pop()
3584 def data(self, data):
3588 return self._out.strip()
3590 def parse_node(node):
3591 target = TTMLPElementParser()
3592 parser = xml.etree.ElementTree.XMLParser(target=target)
3593 parser.feed(xml.etree.ElementTree.tostring(node))
3594 return parser.close()
3596 for k, v in LEGACY_NAMESPACES:
3598 dfxp_data = dfxp_data.replace(ns, k)
3600 dfxp = compat_etree_fromstring(dfxp_data)
3602 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3605 raise ValueError('Invalid dfxp/TTML subtitle')
3609 for style in dfxp.findall(_x('.//ttml:style')):
3610 style_id = style.get('id') or style.get(_x('xml:id'))
3613 parent_style_id = style.get('style')
3615 if parent_style_id not in styles:
3618 styles[style_id] = styles[parent_style_id].copy()
3619 for prop in SUPPORTED_STYLING:
3620 prop_val = style.get(_x('tts:' + prop))
3622 styles.setdefault(style_id, {})[prop] = prop_val
3628 for p in ('body', 'div'):
3629 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3632 style = styles.get(ele.get('style'))
3635 default_style.update(style)
3637 for para, index in zip(paras, itertools.count(1)):
3638 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3639 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3640 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3641 if begin_time is None:
3646 end_time = begin_time + dur
3647 out.append('%d\n%s --> %s\n%s\n\n' % (
3649 srt_subtitles_timecode(begin_time),
3650 srt_subtitles_timecode(end_time),
3656 def cli_option(params, command_option, param):
3657 param = params.get(param)
3659 param = compat_str(param)
3660 return [command_option, param] if param is not None else []
3663 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3664 param = params.get(param)
3667 assert isinstance(param, bool)
3669 return [command_option + separator + (true_value if param else false_value)]
3670 return [command_option, true_value if param else false_value]
3673 def cli_valueless_option(params, command_option, param, expected_value=True):
3674 param = params.get(param)
3675 return [command_option] if param == expected_value else []
3678 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3679 if isinstance(argdict, (list, tuple)): # for backward compatibility
3686 assert isinstance(argdict, dict)
3688 assert isinstance(keys, (list, tuple))
3689 for key_list in keys:
3690 arg_list = list(filter(
3691 lambda x: x is not None,
3692 [argdict.get(key.lower()) for key in variadic(key_list)]))
3694 return [arg for args in arg_list for arg in args]
3698 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3699 main_key, exe = main_key.lower(), exe.lower()
3700 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3701 keys = [f'{root_key}{k}' for k in (keys or [''])]
3702 if root_key in keys:
3704 keys.append((main_key, exe))
3705 keys.append('default')
3708 return cli_configuration_args(argdict, keys, default, use_compat)
3712 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3771 'iw': 'heb', # Replaced by he in 1989 revision
3781 'in': 'ind', # Replaced by id in 1989 revision
3896 'ji': 'yid', # Replaced by yi in 1989 revision
3904 def short2long(cls, code):
3905 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3906 return cls._lang_map.get(code[:2])
3909 def long2short(cls, code):
3910 """Convert language code from ISO 639-2/T to ISO 639-1"""
3911 for short_name, long_name in cls._lang_map.items():
3912 if long_name == code:
3917 # From http://data.okfn.org/data/core/country-list
3919 'AF': 'Afghanistan',
3920 'AX': 'Åland Islands',
3923 'AS': 'American Samoa',
3928 'AG': 'Antigua and Barbuda',
3945 'BO': 'Bolivia, Plurinational State of',
3946 'BQ': 'Bonaire, Sint Eustatius and Saba',
3947 'BA': 'Bosnia and Herzegovina',
3949 'BV': 'Bouvet Island',
3951 'IO': 'British Indian Ocean Territory',
3952 'BN': 'Brunei Darussalam',
3954 'BF': 'Burkina Faso',
3960 'KY': 'Cayman Islands',
3961 'CF': 'Central African Republic',
3965 'CX': 'Christmas Island',
3966 'CC': 'Cocos (Keeling) Islands',
3970 'CD': 'Congo, the Democratic Republic of the',
3971 'CK': 'Cook Islands',
3973 'CI': 'Côte d\'Ivoire',
3978 'CZ': 'Czech Republic',
3982 'DO': 'Dominican Republic',
3985 'SV': 'El Salvador',
3986 'GQ': 'Equatorial Guinea',
3990 'FK': 'Falkland Islands (Malvinas)',
3991 'FO': 'Faroe Islands',
3995 'GF': 'French Guiana',
3996 'PF': 'French Polynesia',
3997 'TF': 'French Southern Territories',
4012 'GW': 'Guinea-Bissau',
4015 'HM': 'Heard Island and McDonald Islands',
4016 'VA': 'Holy See (Vatican City State)',
4023 'IR': 'Iran, Islamic Republic of',
4026 'IM': 'Isle of Man',
4036 'KP': 'Korea, Democratic People\'s Republic of',
4037 'KR': 'Korea, Republic of',
4040 'LA': 'Lao People\'s Democratic Republic',
4046 'LI': 'Liechtenstein',
4050 'MK': 'Macedonia, the Former Yugoslav Republic of',
4057 'MH': 'Marshall Islands',
4063 'FM': 'Micronesia, Federated States of',
4064 'MD': 'Moldova, Republic of',
4075 'NL': 'Netherlands',
4076 'NC': 'New Caledonia',
4077 'NZ': 'New Zealand',
4082 'NF': 'Norfolk Island',
4083 'MP': 'Northern Mariana Islands',
4088 'PS': 'Palestine, State of',
4090 'PG': 'Papua New Guinea',
4093 'PH': 'Philippines',
4097 'PR': 'Puerto Rico',
4101 'RU': 'Russian Federation',
4103 'BL': 'Saint Barthélemy',
4104 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4105 'KN': 'Saint Kitts and Nevis',
4106 'LC': 'Saint Lucia',
4107 'MF': 'Saint Martin (French part)',
4108 'PM': 'Saint Pierre and Miquelon',
4109 'VC': 'Saint Vincent and the Grenadines',
4112 'ST': 'Sao Tome and Principe',
4113 'SA': 'Saudi Arabia',
4117 'SL': 'Sierra Leone',
4119 'SX': 'Sint Maarten (Dutch part)',
4122 'SB': 'Solomon Islands',
4124 'ZA': 'South Africa',
4125 'GS': 'South Georgia and the South Sandwich Islands',
4126 'SS': 'South Sudan',
4131 'SJ': 'Svalbard and Jan Mayen',
4134 'CH': 'Switzerland',
4135 'SY': 'Syrian Arab Republic',
4136 'TW': 'Taiwan, Province of China',
4138 'TZ': 'Tanzania, United Republic of',
4140 'TL': 'Timor-Leste',
4144 'TT': 'Trinidad and Tobago',
4147 'TM': 'Turkmenistan',
4148 'TC': 'Turks and Caicos Islands',
4152 'AE': 'United Arab Emirates',
4153 'GB': 'United Kingdom',
4154 'US': 'United States',
4155 'UM': 'United States Minor Outlying Islands',
4159 'VE': 'Venezuela, Bolivarian Republic of',
4161 'VG': 'Virgin Islands, British',
4162 'VI': 'Virgin Islands, U.S.',
4163 'WF': 'Wallis and Futuna',
4164 'EH': 'Western Sahara',
4171 def short2full(cls, code):
4172 """Convert an ISO 3166-2 country code to the corresponding full name"""
4173 return cls._country_map.get(code.upper())
4177 # Major IPv4 address blocks per country
4179 'AD': '46.172.224.0/19',
4180 'AE': '94.200.0.0/13',
4181 'AF': '149.54.0.0/17',
4182 'AG': '209.59.64.0/18',
4183 'AI': '204.14.248.0/21',
4184 'AL': '46.99.0.0/16',
4185 'AM': '46.70.0.0/15',
4186 'AO': '105.168.0.0/13',
4187 'AP': '182.50.184.0/21',
4188 'AQ': '23.154.160.0/24',
4189 'AR': '181.0.0.0/12',
4190 'AS': '202.70.112.0/20',
4191 'AT': '77.116.0.0/14',
4192 'AU': '1.128.0.0/11',
4193 'AW': '181.41.0.0/18',
4194 'AX': '185.217.4.0/22',
4195 'AZ': '5.197.0.0/16',
4196 'BA': '31.176.128.0/17',
4197 'BB': '65.48.128.0/17',
4198 'BD': '114.130.0.0/16',
4200 'BF': '102.178.0.0/15',
4201 'BG': '95.42.0.0/15',
4202 'BH': '37.131.0.0/17',
4203 'BI': '154.117.192.0/18',
4204 'BJ': '137.255.0.0/16',
4205 'BL': '185.212.72.0/23',
4206 'BM': '196.12.64.0/18',
4207 'BN': '156.31.0.0/16',
4208 'BO': '161.56.0.0/16',
4209 'BQ': '161.0.80.0/20',
4210 'BR': '191.128.0.0/12',
4211 'BS': '24.51.64.0/18',
4212 'BT': '119.2.96.0/19',
4213 'BW': '168.167.0.0/16',
4214 'BY': '178.120.0.0/13',
4215 'BZ': '179.42.192.0/18',
4216 'CA': '99.224.0.0/11',
4217 'CD': '41.243.0.0/16',
4218 'CF': '197.242.176.0/21',
4219 'CG': '160.113.0.0/16',
4220 'CH': '85.0.0.0/13',
4221 'CI': '102.136.0.0/14',
4222 'CK': '202.65.32.0/19',
4223 'CL': '152.172.0.0/14',
4224 'CM': '102.244.0.0/14',
4225 'CN': '36.128.0.0/10',
4226 'CO': '181.240.0.0/12',
4227 'CR': '201.192.0.0/12',
4228 'CU': '152.206.0.0/15',
4229 'CV': '165.90.96.0/19',
4230 'CW': '190.88.128.0/17',
4231 'CY': '31.153.0.0/16',
4232 'CZ': '88.100.0.0/14',
4234 'DJ': '197.241.0.0/17',
4235 'DK': '87.48.0.0/12',
4236 'DM': '192.243.48.0/20',
4237 'DO': '152.166.0.0/15',
4238 'DZ': '41.96.0.0/12',
4239 'EC': '186.68.0.0/15',
4240 'EE': '90.190.0.0/15',
4241 'EG': '156.160.0.0/11',
4242 'ER': '196.200.96.0/20',
4243 'ES': '88.0.0.0/11',
4244 'ET': '196.188.0.0/14',
4245 'EU': '2.16.0.0/13',
4246 'FI': '91.152.0.0/13',
4247 'FJ': '144.120.0.0/16',
4248 'FK': '80.73.208.0/21',
4249 'FM': '119.252.112.0/20',
4250 'FO': '88.85.32.0/19',
4252 'GA': '41.158.0.0/15',
4254 'GD': '74.122.88.0/21',
4255 'GE': '31.146.0.0/16',
4256 'GF': '161.22.64.0/18',
4257 'GG': '62.68.160.0/19',
4258 'GH': '154.160.0.0/12',
4259 'GI': '95.164.0.0/16',
4260 'GL': '88.83.0.0/19',
4261 'GM': '160.182.0.0/15',
4262 'GN': '197.149.192.0/18',
4263 'GP': '104.250.0.0/19',
4264 'GQ': '105.235.224.0/20',
4265 'GR': '94.64.0.0/13',
4266 'GT': '168.234.0.0/16',
4267 'GU': '168.123.0.0/16',
4268 'GW': '197.214.80.0/20',
4269 'GY': '181.41.64.0/18',
4270 'HK': '113.252.0.0/14',
4271 'HN': '181.210.0.0/16',
4272 'HR': '93.136.0.0/13',
4273 'HT': '148.102.128.0/17',
4274 'HU': '84.0.0.0/14',
4275 'ID': '39.192.0.0/10',
4276 'IE': '87.32.0.0/12',
4277 'IL': '79.176.0.0/13',
4278 'IM': '5.62.80.0/20',
4279 'IN': '117.192.0.0/10',
4280 'IO': '203.83.48.0/21',
4281 'IQ': '37.236.0.0/14',
4282 'IR': '2.176.0.0/12',
4283 'IS': '82.221.0.0/16',
4284 'IT': '79.0.0.0/10',
4285 'JE': '87.244.64.0/18',
4286 'JM': '72.27.0.0/17',
4287 'JO': '176.29.0.0/16',
4288 'JP': '133.0.0.0/8',
4289 'KE': '105.48.0.0/12',
4290 'KG': '158.181.128.0/17',
4291 'KH': '36.37.128.0/17',
4292 'KI': '103.25.140.0/22',
4293 'KM': '197.255.224.0/20',
4294 'KN': '198.167.192.0/19',
4295 'KP': '175.45.176.0/22',
4296 'KR': '175.192.0.0/10',
4297 'KW': '37.36.0.0/14',
4298 'KY': '64.96.0.0/15',
4299 'KZ': '2.72.0.0/13',
4300 'LA': '115.84.64.0/18',
4301 'LB': '178.135.0.0/16',
4302 'LC': '24.92.144.0/20',
4303 'LI': '82.117.0.0/19',
4304 'LK': '112.134.0.0/15',
4305 'LR': '102.183.0.0/16',
4306 'LS': '129.232.0.0/17',
4307 'LT': '78.56.0.0/13',
4308 'LU': '188.42.0.0/16',
4309 'LV': '46.109.0.0/16',
4310 'LY': '41.252.0.0/14',
4311 'MA': '105.128.0.0/11',
4312 'MC': '88.209.64.0/18',
4313 'MD': '37.246.0.0/16',
4314 'ME': '178.175.0.0/17',
4315 'MF': '74.112.232.0/21',
4316 'MG': '154.126.0.0/17',
4317 'MH': '117.103.88.0/21',
4318 'MK': '77.28.0.0/15',
4319 'ML': '154.118.128.0/18',
4320 'MM': '37.111.0.0/17',
4321 'MN': '49.0.128.0/17',
4322 'MO': '60.246.0.0/16',
4323 'MP': '202.88.64.0/20',
4324 'MQ': '109.203.224.0/19',
4325 'MR': '41.188.64.0/18',
4326 'MS': '208.90.112.0/22',
4327 'MT': '46.11.0.0/16',
4328 'MU': '105.16.0.0/12',
4329 'MV': '27.114.128.0/18',
4330 'MW': '102.70.0.0/15',
4331 'MX': '187.192.0.0/11',
4332 'MY': '175.136.0.0/13',
4333 'MZ': '197.218.0.0/15',
4334 'NA': '41.182.0.0/16',
4335 'NC': '101.101.0.0/18',
4336 'NE': '197.214.0.0/18',
4337 'NF': '203.17.240.0/22',
4338 'NG': '105.112.0.0/12',
4339 'NI': '186.76.0.0/15',
4340 'NL': '145.96.0.0/11',
4341 'NO': '84.208.0.0/13',
4342 'NP': '36.252.0.0/15',
4343 'NR': '203.98.224.0/19',
4344 'NU': '49.156.48.0/22',
4345 'NZ': '49.224.0.0/14',
4346 'OM': '5.36.0.0/15',
4347 'PA': '186.72.0.0/15',
4348 'PE': '186.160.0.0/14',
4349 'PF': '123.50.64.0/18',
4350 'PG': '124.240.192.0/19',
4351 'PH': '49.144.0.0/13',
4352 'PK': '39.32.0.0/11',
4353 'PL': '83.0.0.0/11',
4354 'PM': '70.36.0.0/20',
4355 'PR': '66.50.0.0/16',
4356 'PS': '188.161.0.0/16',
4357 'PT': '85.240.0.0/13',
4358 'PW': '202.124.224.0/20',
4359 'PY': '181.120.0.0/14',
4360 'QA': '37.210.0.0/15',
4361 'RE': '102.35.0.0/16',
4362 'RO': '79.112.0.0/13',
4363 'RS': '93.86.0.0/15',
4364 'RU': '5.136.0.0/13',
4365 'RW': '41.186.0.0/16',
4366 'SA': '188.48.0.0/13',
4367 'SB': '202.1.160.0/19',
4368 'SC': '154.192.0.0/11',
4369 'SD': '102.120.0.0/13',
4370 'SE': '78.64.0.0/12',
4371 'SG': '8.128.0.0/10',
4372 'SI': '188.196.0.0/14',
4373 'SK': '78.98.0.0/15',
4374 'SL': '102.143.0.0/17',
4375 'SM': '89.186.32.0/19',
4376 'SN': '41.82.0.0/15',
4377 'SO': '154.115.192.0/18',
4378 'SR': '186.179.128.0/17',
4379 'SS': '105.235.208.0/21',
4380 'ST': '197.159.160.0/19',
4381 'SV': '168.243.0.0/16',
4382 'SX': '190.102.0.0/20',
4384 'SZ': '41.84.224.0/19',
4385 'TC': '65.255.48.0/20',
4386 'TD': '154.68.128.0/19',
4387 'TG': '196.168.0.0/14',
4388 'TH': '171.96.0.0/13',
4389 'TJ': '85.9.128.0/18',
4390 'TK': '27.96.24.0/21',
4391 'TL': '180.189.160.0/20',
4392 'TM': '95.85.96.0/19',
4393 'TN': '197.0.0.0/11',
4394 'TO': '175.176.144.0/21',
4395 'TR': '78.160.0.0/11',
4396 'TT': '186.44.0.0/15',
4397 'TV': '202.2.96.0/19',
4398 'TW': '120.96.0.0/11',
4399 'TZ': '156.156.0.0/14',
4400 'UA': '37.52.0.0/14',
4401 'UG': '102.80.0.0/13',
4403 'UY': '167.56.0.0/13',
4404 'UZ': '84.54.64.0/18',
4405 'VA': '212.77.0.0/19',
4406 'VC': '207.191.240.0/21',
4407 'VE': '186.88.0.0/13',
4408 'VG': '66.81.192.0/20',
4409 'VI': '146.226.0.0/16',
4410 'VN': '14.160.0.0/11',
4411 'VU': '202.80.32.0/20',
4412 'WF': '117.20.32.0/21',
4413 'WS': '202.4.32.0/19',
4414 'YE': '134.35.0.0/16',
4415 'YT': '41.242.116.0/22',
4416 'ZA': '41.0.0.0/11',
4417 'ZM': '102.144.0.0/13',
4418 'ZW': '102.177.192.0/18',
4422 def random_ipv4(cls, code_or_block):
4423 if len(code_or_block) == 2:
4424 block = cls._country_ip_map.get(code_or_block.upper())
4428 block = code_or_block
4429 addr, preflen = block.split('/')
4430 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4431 addr_max = addr_min | (0xffffffff >> int(preflen))
4432 return compat_str(socket.inet_ntoa(
4433 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
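# Illustrative usage (the enclosing class is GeoUtils in yt-dlp): pass either a
# two-letter country code or an explicit CIDR block:
#
#     >>> GeoUtils.random_ipv4('JP').startswith('133.')  # JP block is 133.0.0.0/8
#     True
#     >>> GeoUtils.random_ipv4('198.51.100.0/24').startswith('198.51.100.')
#     True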
4436 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4437 def __init__(self, proxies=None):
4438 # Set default handlers
4439 for type in ('http', 'https'):
4440 setattr(self, '%s_open' % type,
4441 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4442 meth(r, proxy, type))
4443 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4445 def proxy_open(self, req, proxy, type):
4446 req_proxy = req.headers.get('Ytdl-request-proxy')
4447 if req_proxy is not None:
4449 del req.headers['Ytdl-request-proxy']
4451 if proxy == '__noproxy__':
4452 return None # No Proxy
4453 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4454 req.add_header('Ytdl-socks-proxy', proxy)
4455 # yt-dlp's http/https handlers do wrapping the socket with socks
4457 return compat_urllib_request.ProxyHandler.proxy_open(
4458 self, req, proxy, type)
4461 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4462 # released into Public Domain
4463 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4465 def long_to_bytes(n, blocksize=0):
4466 """long_to_bytes(n:long, blocksize:int) : string
4467 Convert a long integer to a byte string.
4469 If optional blocksize is given and greater than zero, pad the front of the
4470 byte string with binary zeros so that the length is a multiple of
4473 # after much testing, this algorithm was deemed to be the fastest
4477 s = compat_struct_pack('>I', n & 0xffffffff) + s
4479 # strip off leading zeros
4480 for i in range(len(s)):
4481 if s[i] != b'\000'[0]:
4484 # only happens when n == 0
4488 # add back some pad bytes. this could be done more efficiently w.r.t. the
4489 # de-padding being done above, but sigh...
4490 if blocksize > 0 and len(s) % blocksize:
4491 s = (blocksize - len(s) % blocksize) * b'\000' + s
4495 def bytes_to_long(s):
4496 """bytes_to_long(string) : long
4497 Convert a byte string to a long integer.
4499 This is (essentially) the inverse of long_to_bytes().
4504 extra = (4 - length % 4)
4505 s = b'\000' * extra + s
4506 length = length + extra
4507 for i in range(0, length, 4):
4508 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4512 def ohdave_rsa_encrypt(data, exponent, modulus):
4514 Implement OHDave
's RSA algorithm. See http://www.ohdave.com/rsa/
4517 data: data to encrypt, bytes-like object
4518 exponent, modulus: parameter e and N of RSA algorithm, both integer
4519 Output: hex string of encrypted data
4521 Limitation: supports one block encryption only
4524 payload = int(binascii.hexlify(data[::-1]), 16)
4525 encrypted = pow(payload, exponent, modulus)
4526 return '%x' % encrypted
4529 def pkcs1pad(data, length):
4531 Padding input data with PKCS#1 scheme
4533 @param {int[]} data input data
4534 @param {int} length target length
4535 @returns {int[]} padded data
4537 if len(data) > length - 11:
4538 raise ValueError('Input data too
long for PKCS
#1 padding')
4540 pseudo_random
= [random
.randint(0, 254) for _
in range(length
- len(data
) - 3)]
4541 return [0, 2] + pseudo_random
+ [0] + data
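# Illustrative check: the padded block starts with the 0x00 0x02 marker, keeps
# a 0x00 separator before the payload, and has exactly the requested length:
#
#     >>> padded = pkcs1pad([1, 2, 3], 16)
#     >>> len(padded), padded[:2], padded[-4:]
#     (16, [0, 2], [0, 1, 2, 3])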
4544 def encode_base_n(num
, n
, table
=None):
4545 FULL_TABLE
= '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4547 table
= FULL_TABLE
[:n
]
4550 raise ValueError('base %d exceeds table length %d' % (n
, len(table
)))
4557 ret
= table
[num
% n
] + ret
4562 def decode_packed_codes(code
):
4563 mobj
= re
.search(PACKED_CODES_RE
, code
)
4564 obfuscated_code
, base
, count
, symbols
= mobj
.groups()
4567 symbols
= symbols
.split('|')
4572 base_n_count
= encode_base_n(count
, base
)
4573 symbol_table
[base_n_count
] = symbols
[count
] or base_n_count
4576 r
'\b(\w+)\b', lambda mobj
: symbol_table
[mobj
.group(0)],
4580 def caesar(s
, alphabet
, shift
):
4585 alphabet
[(alphabet
.index(c
) + shift
) % l
] if c
in alphabet
else c
4590 return caesar(s
, r
'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
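# Illustrative examples: caesar() only shifts characters present in the given
# alphabet, and the enclosing rot-47 helper (rot47 in yt-dlp) is its own
# inverse over the printable ASCII range:
#
#     >>> caesar('abz', 'abcdefghijklmnopqrstuvwxyz', 1)
#     'bca'
#     >>> rot47(rot47('yt-dlp'))
#     'yt-dlp'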
4593 def parse_m3u8_attributes(attrib
):
4595 for (key
, val
) in re
.findall(r
'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib
):
4596 if val
.startswith('"'):
4602 def urshift(val
, n
):
4603 return val
>> n
if val
>= 0 else (val
+ 0x100000000) >> n
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise OSError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise OSError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
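
# Illustrative usage (a sketch, not part of the original module): the return
# value is (width, height, pixels) where each row in `pixels` is a flat list
# of interleaved R, G, B byte values:
#   width, height, pixels = decode_png(png_bytes)
#   r, g, b = pixels[0][0:3]  # top-left pixel
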
def write_xattr(path, key, value):
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules
    from .dependencies import xattr

    if xattr:
        if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
            # Unicode arguments are not supported in pyxattr until version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            if version_tuple(xattr.__version__) >= (0, 5, 0):
                setxattr = xattr.set
            else:
                setxattr = xattr.setxattr
        else:
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    value = value.decode()
    try:
        p = Popen(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    stderr = p.communicate_or_kill()[1].decode('utf-8', 'replace')
    if p.returncode:
        raise XAttrMetadataError(p.returncode, stderr)
def random_birthday(year_field, month_field, day_field):
    start_date = datetime.date(1950, 1, 1)
    end_date = datetime.date(1995, 12, 31)
    offset = random.randint(0, (end_date - start_date).days)
    random_date = start_date + datetime.timedelta(offset)
    return {
        year_field: str(random_date.year),
        month_field: str(random_date.month),
        day_field: str(random_date.day),
    }
# Templates for internet shortcut files, which are plain text files.
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
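
# Illustrative behaviour (a sketch, not part of the original module): Unicode
# path/query characters are percent-encoded as UTF-8, while existing %XX
# escapes are left untouched, e.g.
#   iri_to_uri('https://example.com/föö?q=bär')
#   -> 'https://example.com/f%C3%B6%C3%B6?q=b%C3%A4r'
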
def to_high_limit_path(path):
    if sys.platform in ['win32', 'cygwin']:
        # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
        return '\\\\?\\' + os.path.abspath(path)

    return path
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    val = traverse_obj(obj, *variadic(field))
    if val in ignore:
        return default
    return template % (func(val) if func else val)
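
# Illustrative usage (a sketch, not part of the original module):
#   format_field({'height': 1080}, 'height', '%sp')  -> '1080p'
#   format_field({'height': None}, 'height', '%sp')  -> ''   (ignored value yields the default)
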
def clean_podcast_url(url):
    return re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/''', '', url)
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
def make_dir(path, to_screen=None):
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except OSError as err:
        if callable(to_screen):  # report the failure only if a callable was passed
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
def get_executable_path():
    from zipimport import zipimporter
    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        path = os.path.dirname(sys.executable)
    elif isinstance(__loader__, zipimporter):  # Running from ZIP
        path = os.path.join(os.path.dirname(__file__), '../..')
    else:
        path = os.path.join(os.path.dirname(__file__), '..')
    return os.path.abspath(path)
def load_plugins(name, suffix, namespace):
    classes = {}
    with contextlib.suppress(FileNotFoundError):
        plugins_spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        plugins = importlib.util.module_from_spec(plugins_spec)
        sys.modules[plugins_spec.name] = plugins
        plugins_spec.loader.exec_module(plugins)
        for name in dir(plugins):
            if name in namespace:
                continue
            if not name.endswith(suffix):
                continue
            klass = getattr(plugins, name)
            classes[name] = namespace[name] = klass
    return classes
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a:
                              - None:     Do nothing
                              - string:   A dictionary key
                              - int:      An index into a list
                              - tuple:    A list of keys all of which will be traversed
                              - Ellipsis: Fetch all values in the object
                              - Function: Takes the key and value as arguments
                                          and returns whether the key matches or not
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    '''
    if not casesense:
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    obj = str(obj)
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
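
# Illustrative usage (a sketch, not part of the original module):
#   traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', ..., 'b'))           -> [1, 2]
#   traverse_obj({'a': {'b': None}}, ('a', 'b'), ('a', 'c'), default=0)  -> 0
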
def traverse_dict(dictn, keys, casesense=True):
    write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
def get_first(obj, keys, **kwargs):
    return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
def variadic(x, allowed_types=(str, bytes, dict)):
    return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
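
# Illustrative usage (a sketch, not part of the original module):
#   variadic('spam')    -> ('spam',)   # strings are not treated as iterables here
#   variadic(['spam'])  -> ['spam']
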
def decode_base(value, digits):
    # This will convert given base-x string to scalar (long or int)
    table = {char: index for index, char in enumerate(digits)}
    result = 0
    base = len(digits)
    for chr in value:
        result *= base
        result += table[chr]
    return result
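
# Illustrative usage (a sketch, not part of the original module): inverse of
# encode_base_n for the same digit table, e.g.
#   decode_base('2a', '0123456789abcdef')  -> 42
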
def time_seconds(**kwargs):
    t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
    return t.timestamp()
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
    return token
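
# Illustrative shape of the result (a sketch, not part of the original module):
#   jwt_encode_hs256({'sub': 'abc'}, 'secret')
#   -> b'<base64(header)>.<base64(payload)>.<base64(HMAC-SHA256 signature)>'
# Note that standard base64 (not the URL-safe variant) is used here.
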
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    header_b64, payload_b64, signature_b64 = jwt.split('.')
    payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
    return payload_data
def supports_terminal_sequences(stream):
    if compat_os_name == 'nt':
        from .compat import WINDOWS_VT_MODE  # Must be imported locally
        if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
            return False
    elif not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False


_terminal_sequences_re = re.compile('\033\\[[^m]+m')
def remove_terminal_sequences(string):
    return _terminal_sequences_re.sub('', string)


def number_of_digits(number):
    return len('%d' % number)
def join_nonempty(*values, delim='-', from_dict=None):
    if from_dict is not None:
        values = map(from_dict.get, values)
    return delim.join(map(str, filter(None, values)))
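
# Illustrative usage (a sketch, not part of the original module):
#   join_nonempty('1080p', None, 'mp4')  -> '1080p-mp4'
#   join_nonempty('width', 'height', from_dict={'width': 1920, 'height': 1080}, delim='x')  -> '1920x1080'
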
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    max_dimensions = max(
        (tuple(format.get(k) or 0 for k in _keys) for format in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        return thumbnails
    return [
        merge_dicts(
            {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
            dict(zip(_keys, max_dimensions)), thumbnail)
        for thumbnail in thumbnails
    ]
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    if not range:
        return None, None, None
    crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if not crg:
        return None, None, None
    return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
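
# Illustrative usage (a sketch, not part of the original module):
#   parse_http_range('bytes=0-499')       -> (0, 499, None)
#   parse_http_range('bytes 0-499/1234')  -> (0, 499, 1234)
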
class Config:
    own_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self._parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        assert not self.__initialized
        directory = ''
        if filename:
            location = os.path.realpath(filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        self.own_args, self.filename = args, filename
        for location in self._parser.parse_args(args)[0].config_locations or []:
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self._parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        try:
            optionf = open(filename)
        except OSError:
            return default  # silently skip if file is not present
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read()
            res = shlex.split(contents, comments=True)
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        config = type(self)(self._parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.own_args or []

    def parse_args(self):
        return self._parser.parse_args(self.all_args)
class WebSocketsWrapper():
    """Wraps websockets module to use in non-async scopes"""
    pool = None

    def __init__(self, url, headers=None, connect=True):
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            self.loop.close()
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: if any new library that uses asyncio needs to be run in non-async code, move these functions out of this class
    @staticmethod
    def run_with_loop(main, loop):
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
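
# Illustrative usage (a sketch, not part of the original module):
#   merge_headers({'X-Forwarded-For': '1.2.3.4'}, {'x-forwarded-for': '5.6.7.8'})
#   -> {'X-Forwarded-For': '5.6.7.8'}
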
class classproperty:
    def __init__(self, f):
        functools.update_wrapper(self, f)
        self.f = f

    def __get__(self, _, cls):
        return self.f(cls)


class Namespace:
    """Immutable namespace"""

    def __init__(self, **kwargs):
        self._dict = kwargs

    def __getattr__(self, attr):
        return self._dict[attr]

    def __contains__(self, item):
        return item in self._dict.values()

    def __iter__(self):
        return iter(self._dict.items())

    def __repr__(self):
        return f'{type(self).__name__}({", ".join(f"{k}={v}" for k, v in self)})'
has_certifi = bool(certifi)
has_websockets = bool(websockets)