39 import xml
.etree
.ElementTree
46 compat_etree_fromstring
,
49 compat_html_entities_html5
,
50 compat_HTMLParseError
,
61 compat_urllib_parse_unquote_plus
,
62 compat_urllib_parse_urlencode
,
63 compat_urllib_parse_urlparse
,
64 compat_urllib_request
,
67 from .dependencies
import brotli
, certifi
, websockets
68 from .socks
import ProxyType
, sockssocket
def register_socks_protocols():
    """Register the SOCKS URL schemes as netloc-bearing schemes.

    Appends each of ``socks``, ``socks4``, ``socks4a`` and ``socks5`` to
    ``compat_urlparse.uses_netloc`` unless it is already listed, so calling
    this function repeatedly does not add duplicates.
    """
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)
# This is not clearly defined otherwise
# (type of the object returned by re.compile(), usable with isinstance())
compiled_regex_type = type(re.compile(''))
84 def random_user_agent():
85 _USER_AGENT_TPL
= 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
126 return _USER_AGENT_TPL
% random
.choice(_CHROME_VERSIONS
)
129 SUPPORTED_ENCODINGS
= [
133 SUPPORTED_ENCODINGS
.append('br')
136 'User-Agent': random_user_agent(),
137 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
138 'Accept-Language': 'en-us,en;q=0.5',
139 'Sec-Fetch-Mode': 'navigate',
144 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# Unique sentinel for "no default value supplied"; callers test it by
# identity (``default is not NO_DEFAULT``) so that None stays a usable default
NO_DEFAULT = object()

# English month names in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
155 'en': ENGLISH_MONTH_NAMES
,
157 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
158 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
162 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
163 'flv', 'f4v', 'f4a', 'f4b',
164 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
165 'mkv', 'mka', 'mk3d',
174 'f4f', 'f4m', 'm3u8', 'smil')
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement; most replacements
# are a single letter, the list items ('AE', 'OE', 'TH', 'ss', ...) are the
# multi-letter ones.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
209 '%Y-%m-%d %H:%M:%S.%f',
210 '%Y-%m-%d %H:%M:%S:%f',
213 '%Y-%m-%dT%H:%M:%SZ',
214 '%Y-%m-%dT%H:%M:%S.%fZ',
215 '%Y-%m-%dT%H:%M:%S.%f0Z',
217 '%Y-%m-%dT%H:%M:%S.%f',
220 '%b %d %Y at %H:%M:%S',
222 '%B %d %Y at %H:%M:%S',
226 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
227 DATE_FORMATS_DAY_FIRST
.extend([
236 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
237 DATE_FORMATS_MONTH_FIRST
.extend([
# Captures the four arguments (payload, radix, count, '|'-separated symbols)
# of a packed-script call of the form }('...',N,N,'...'.split('|')
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Captures the body of a <script type="application/ld+json"> element in the
# named group ``json_ld``; the type attribute may be quoted or unquoted
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'

# Unsigned decimal number with an optional fractional part
NUMBER_RE = r'\d+(?:\.\d+)?'
251 def preferredencoding():
252 """Get preferred encoding.
254 Returns the best encoding scheme for the system, based on
255 locale.getpreferredencoding() and some further tweaks.
258 pref = locale.getpreferredencoding()
266 def write_json_file(obj, fn):
267 """ Encode obj as JSON and write it to fn, atomically if possible """
269 tf = tempfile.NamedTemporaryFile(
270 prefix=f'{os.path.basename(fn)}
.', dir=os.path.dirname(fn),
271 suffix='.tmp
', delete=False, mode='w
', encoding='utf
-8')
275 json.dump(obj, tf, ensure_ascii=False)
276 if sys.platform == 'win32
':
277 # Need to remove existing file on Windows, else os.rename raises
278 # WindowsError or FileExistsError.
279 with contextlib.suppress(OSError):
281 with contextlib.suppress(OSError):
284 os.chmod(tf.name, 0o666 & ~mask)
285 os.rename(tf.name, fn)
287 with contextlib.suppress(OSError):
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val]

    @param node   Element whose ``find()`` method is used for the lookup
    @param xpath  Path expression the attribute predicate is appended to
    @param key    Attribute name; must consist only of letters, ``_`` and ``-``
    @param val    Required attribute value, or None to only require that the
                  attribute is present
    Returns whatever ``node.find()`` returns for the combined expression.
    """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = '[@%s]' % key
    else:
        text = f'{val}'
        # The predicate syntax has no escape sequence, so a value containing
        # a single quote must be delimited by double quotes instead
        quote = '"' if "'" in text and '"' not in text else "'"
        predicate = f'[@{key}={quote}{text}{quote}]'
    return node.find(xpath + predicate)
298 # On python2.6 the xml.etree.ElementTree.Element methods don't support
299 # the namespace parameter
302 def xpath_with_ns(path
, ns_map
):
303 components
= [c
.split(':') for c
in path
.split('/')]
307 replaced
.append(c
[0])
310 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
311 return '/'.join(replaced
)
314 def xpath_element(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
315 def _find_xpath(xpath
):
316 return node
.find(xpath
)
318 if isinstance(xpath
, (str, compat_str
)):
319 n
= _find_xpath(xpath
)
327 if default
is not NO_DEFAULT
:
330 name
= xpath
if name
is None else name
331 raise ExtractorError('Could not find XML element %s' % name
)
337 def xpath_text(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
338 n
= xpath_element(node
, xpath
, name
, fatal
=fatal
, default
=default
)
339 if n
is None or n
== default
:
342 if default
is not NO_DEFAULT
:
345 name
= xpath
if name
is None else name
346 raise ExtractorError('Could not find XML element\'s text %s' % name
)
352 def xpath_attr(node
, xpath
, key
, name
=None, fatal
=False, default
=NO_DEFAULT
):
353 n
= find_xpath_attr(node
, xpath
, key
)
355 if default
is not NO_DEFAULT
:
358 name
= f
'{xpath}[@{key}]' if name
is None else name
359 raise ExtractorError('Could not find XML attribute %s' % name
)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper: an ID lookup is an attribute lookup on "id"
    return get_element_by_attribute('id', id, html)
def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper: an ID lookup is an attribute lookup on "id"
    return get_element_html_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document

    Returns None when get_elements_by_class() yields no match.
    """
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document

    Returns None when get_elements_html_by_class() yields no match.
    """
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag with the specified attribute in the passed HTML document

    The arguments are forwarded unchanged to get_elements_by_attribute();
    None is returned when that yields no match.
    """
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None
def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    """Return the html of the first tag with the specified attribute in the passed HTML document

    The arguments are forwarded unchanged to get_elements_html_by_attribute();
    None is returned when that yields no match.
    """
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # The class attribute may list several names, so match class_name as a
    # whole word anywhere inside the attribute value. The value passed on is
    # already a regex, hence escape_value=False.
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    # The class attribute may list several names, so match class_name as a
    # whole word anywhere inside the attribute value. The value passed on is
    # already a regex, hence escape_value=False.
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document as a list

    All arguments are forwarded to get_elements_text_and_html_by_attribute();
    only the first item of each (content, whole) pair is kept.
    """
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document as a list

    All arguments are forwarded to get_elements_text_and_html_by_attribute();
    only the second item of each (content, whole) pair is kept.
    """
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
421 def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
423 Return the text (content) and the html (whole) of the tag with the specified
424 attribute in the passed HTML document
427 quote = '' if re.match(r'''[\s"'`
=<>]''', value) else '?'
429 value = re.escape(value) if escape_value else value
431 partial_element_re = rf'''(?x
)
432 <(?P
<tag
>[a
-zA
-Z0
-9:._-]+)
433 (?
:\
s(?
:[^
>"']|"[^
"]*"|
'[^']*')*)?
434 \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
437 for m in re.finditer(partial_element_re, html):
438 content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
441 unescapeHTML(re.sub(r'^(?P<q>["\'])(?P
<content
>.*)(?P
=q
)$
', r'\g
<content
>', content, flags=re.DOTALL)),
446 class HTMLBreakOnClosingTagParser(compat_HTMLParser):
448 HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
449 closing tag for the first opening tag it has encountered, and can be used
453 class HTMLBreakOnClosingTagException(Exception):
457 self.tagstack = collections.deque()
458 compat_HTMLParser.__init__(self)
463 def __exit__(self, *_):
467 # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
468 # so data remains buffered; we no longer have any interest in it, thus
469 # override this method to discard it
472 def handle_starttag(self, tag, _):
473 self.tagstack.append(tag)
475 def handle_endtag(self, tag):
476 if not self.tagstack:
477 raise compat_HTMLParseError('no tags
in the stack
')
479 inner_tag = self.tagstack.pop()
483 raise compat_HTMLParseError(f'matching opening tag
for closing {tag} tag
not found
')
484 if not self.tagstack:
485 raise self.HTMLBreakOnClosingTagException()
488 def get_element_text_and_html_by_tag(tag, html):
490 For the first element with the specified tag in the passed HTML document
491 return its' content (text
) and the whole
element (html
)
493 def find_or_raise(haystack, needle, exc):
495 return haystack.index(needle)
498 closing_tag = f'</{tag}>'
499 whole_start = find_or_raise(
500 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
501 content_start = find_or_raise(
502 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
503 content_start += whole_start + 1
504 with HTMLBreakOnClosingTagParser() as parser:
505 parser.feed(html[whole_start:content_start])
506 if not parser.tagstack or parser.tagstack[0] != tag:
507 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
508 offset = content_start
509 while offset < len(html):
510 next_closing_tag_start = find_or_raise(
511 html[offset:], closing_tag,
512 compat_HTMLParseError(f'closing {tag} tag not found'))
513 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
515 parser.feed(html[offset:offset + next_closing_tag_end])
516 offset += next_closing_tag_end
517 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
518 return html[content_start:offset + next_closing_tag_start], \
519 html[whole_start:offset + next_closing_tag_end]
520 raise compat_HTMLParseError('unexpected end of html')
523 class HTMLAttributeParser(compat_HTMLParser):
524 """Trivial HTML parser to gather the attributes
for a single element
"""
528 compat_HTMLParser.__init__(self)
530 def handle_starttag(self, tag, attrs):
531 self.attrs = dict(attrs)
534 class HTMLListAttrsParser(compat_HTMLParser):
535 """HTML parser to gather the attributes
for the elements of a
list"""
538 compat_HTMLParser.__init__(self)
542 def handle_starttag(self, tag, attrs):
543 if tag == 'li' and self._level == 0:
544 self.items.append(dict(attrs))
547 def handle_endtag(self, tag):
551 def extract_attributes(html_element):
552 """Given a string
for an HTML element such
as
554 a
="foo" B
="bar" c
="&98;az" d
=boz
555 empty
= noval entity
="&"
558 Decode
and return a dictionary of attributes
.
560 'a': 'foo', 'b': 'bar', c
: 'baz', d
: 'boz',
561 'empty': '', 'noval': None, 'entity': '&',
562 'sq': '"', 'dq': '\''
565 parser = HTMLAttributeParser()
566 with contextlib.suppress(compat_HTMLParseError):
567 parser.feed(html_element)
572 def parse_list(webpage):
573 """Given a string
for an series of HTML
<li
> elements
,
574 return a dictionary of their attributes
"""
575 parser = HTMLListAttrsParser()
581 def clean_html(html):
582 """Clean an HTML snippet into a readable string
"""
584 if html is None: # Convenience for sanitizing descriptions etc.
587 html = re.sub(r'\s+', ' ', html)
588 html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
589 html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
591 html = re.sub('<.*?>', '', html)
592 # Replace html entities
593 html = unescapeHTML(html)
597 def sanitize_open(filename, open_mode):
598 """Try to
open the given filename
, and slightly tweak it
if this fails
.
600 Attempts to
open the given filename
. If this fails
, it tries to change
601 the filename slightly
, step by step
, until it
's either able to open it
602 or it fails and raises a final exception, like the standard open()
605 It returns the tuple (stream, definitive_file_name).
608 if sys.platform == 'win32
':
610 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
611 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
613 for attempt in range(2):
616 if sys.platform == 'win32
':
617 # FIXME: An exclusive lock also locks the file from being read.
618 # Since windows locks are mandatory, don't lock the
file on
windows (for now
).
619 # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
620 raise LockingUnsupportedError()
621 stream
= locked_file(filename
, open_mode
, block
=False).__enter
__()
622 except LockingUnsupportedError
:
623 stream
= open(filename
, open_mode
)
624 return (stream
, filename
)
625 except OSError as err
:
626 if attempt
or err
.errno
in (errno
.EACCES
,):
628 old_filename
, filename
= filename
, sanitize_path(filename
)
629 if old_filename
== filename
:
633 def timeconvert(timestr
):
634 """Convert RFC 2822 defined time string into system timestamp"""
636 timetuple
= email
.utils
.parsedate_tz(timestr
)
637 if timetuple
is not None:
638 timestamp
= email
.utils
.mktime_tz(timetuple
)
642 def sanitize_filename(s
, restricted
=False, is_id
=NO_DEFAULT
):
643 """Sanitizes a string so it could be used as part of a filename.
644 @param restricted Use a stricter subset of allowed characters
645 @param is_id Whether this is an ID that should be kept unchanged if possible.
646 If unset, yt-dlp's new sanitization rules are in effect
651 def replace_insane(char
):
652 if restricted
and char
in ACCENT_CHARS
:
653 return ACCENT_CHARS
[char
]
654 elif not restricted
and char
== '\n':
656 elif char
== '?' or ord(char
) < 32 or ord(char
) == 127:
659 return '' if restricted
else '\''
661 return '\0_\0-' if restricted
else '\0 \0-'
662 elif char
in '\\/|*<>':
664 if restricted
and (char
in '!&\'()[]{}$;`^,#' or char
.isspace() or ord(char
) > 127):
668 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
) # Handle timestamps
669 result
= ''.join(map(replace_insane
, s
))
670 if is_id
is NO_DEFAULT
:
671 result
= re
.sub('(\0.)(?:(?=\\1)..)+', r
'\1', result
) # Remove repeated substitute chars
672 STRIP_RE
= '(?:\0.|[ _-])*'
673 result
= re
.sub(f
'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result
) # Remove substitute chars from start/end
674 result
= result
.replace('\0', '') or '_'
677 while '__' in result
:
678 result
= result
.replace('__', '_')
679 result
= result
.strip('_')
680 # Common case of "Foreign band name - English song title"
681 if restricted
and result
.startswith('-_'):
683 if result
.startswith('-'):
684 result
= '_' + result
[len('-'):]
685 result
= result
.lstrip('.')
691 def sanitize_path(s
, force
=False):
692 """Sanitizes and normalizes path on Windows"""
693 if sys
.platform
== 'win32':
695 drive_or_unc
, _
= os
.path
.splitdrive(s
)
701 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
705 path_part
if path_part
in ['.', '..'] else re
.sub(r
'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part
)
706 for path_part
in norm_path
]
708 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
709 elif force
and s
and s
[0] == os
.path
.sep
:
710 sanitized_path
.insert(0, os
.path
.sep
)
711 return os
.path
.join(*sanitized_path
)
714 def sanitize_url(url
):
715 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
716 # the number of unwanted failures due to missing protocol
717 if url
.startswith('//'):
718 return 'http:%s' % url
719 # Fix some common typos seen so far
721 # https://github.com/ytdl-org/youtube-dl/issues/15649
722 (r
'^httpss://', r
'https://'),
723 # https://bx1.be/lives/direct-tv/
724 (r
'^rmtp([es]?)://', r
'rtmp\1://'),
726 for mistake
, fixup
in COMMON_TYPOS
:
727 if re
.match(mistake
, url
):
728 return re
.sub(mistake
, fixup
, url
)
732 def extract_basic_auth(url
):
733 parts
= compat_urlparse
.urlsplit(url
)
734 if parts
.username
is None:
736 url
= compat_urlparse
.urlunsplit(parts
._replace
(netloc
=(
737 parts
.hostname
if parts
.port
is None
738 else '%s:%d' % (parts
.hostname
, parts
.port
))))
739 auth_payload
= base64
.b64encode(
740 ('%s:%s' % (parts
.username
, parts
.password
or '')).encode())
741 return url
, f
'Basic {auth_payload.decode()}'
def sanitized_Request(url, *args, **kwargs):
    """Build a ``compat_urllib_request.Request`` from a cleaned-up URL.

    The URL is passed through sanitize_url(), escape_url() and
    extract_basic_auth(). When the latter returns an auth header, it is
    stored under ``Authorization`` in the request headers: the second
    positional argument if given, otherwise the ``headers`` keyword (created
    when absent). NOTE: a headers mapping supplied by the caller is modified
    in place.
    """
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)
753 """Expand shell variables and ~"""
754 return os
.path
.expandvars(compat_expanduser(s
))
757 def orderedSet(iterable
):
758 """ Remove all duplicates from the input iterable """
766 def _htmlentity_transform(entity_with_semicolon
):
767 """Transforms an HTML entity to a character."""
768 entity
= entity_with_semicolon
[:-1]
770 # Known non-numeric HTML entity
771 if entity
in compat_html_entities
.name2codepoint
:
772 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
774 # TODO: HTML5 allows entities without a semicolon. For example,
775 # 'Éric' should be decoded as 'Éric'.
776 if entity_with_semicolon
in compat_html_entities_html5
:
777 return compat_html_entities_html5
[entity_with_semicolon
]
779 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
781 numstr
= mobj
.group(1)
782 if numstr
.startswith('x'):
784 numstr
= '0%s' % numstr
787 # See https://github.com/ytdl-org/youtube-dl/issues/7518
788 with contextlib
.suppress(ValueError):
789 return compat_chr(int(numstr
, base
))
791 # Unknown entity in name, return its literal representation
792 return '&%s;' % entity
798 assert isinstance(s
, str)
801 r
'&([^&;]+;)', lambda m
: _htmlentity_transform(m
.group(1)), s
)
804 def escapeHTML(text
):
807 .replace('&', '&')
808 .replace('<', '<')
809 .replace('>', '>')
810 .replace('"', '"')
811 .replace("'", ''')
815 def process_communicate_or_kill(p
, *args
, **kwargs
):
817 return p
.communicate(*args
, **kwargs
)
818 except BaseException
: # Including KeyboardInterrupt
824 class Popen(subprocess
.Popen
):
825 if sys
.platform
== 'win32':
826 _startupinfo
= subprocess
.STARTUPINFO()
827 _startupinfo
.dwFlags |
= subprocess
.STARTF_USESHOWWINDOW
831 def __init__(self
, *args
, **kwargs
):
832 super().__init
__(*args
, **kwargs
, startupinfo
=self
._startupinfo
)
834 def communicate_or_kill(self
, *args
, **kwargs
):
835 return process_communicate_or_kill(self
, *args
, **kwargs
)
838 def get_subprocess_encoding():
839 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
840 # For subprocess calls, encode with locale encoding
841 # Refer to http://stackoverflow.com/a/9951851/35070
842 encoding
= preferredencoding()
844 encoding
= sys
.getfilesystemencoding()
850 def encodeFilename(s
, for_subprocess
=False):
851 assert isinstance(s
, str)
855 def decodeFilename(b
, for_subprocess
=False):
def encodeArgument(s):
    """Return *s* as ``str``: strings pass through, anything else is ASCII-decoded via ``s.decode``"""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')
866 def decodeArgument(b
):
870 def decodeOption(optval
):
873 if isinstance(optval
, bytes):
874 optval
= optval
.decode(preferredencoding())
876 assert isinstance(optval
, compat_str
)
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a duration in milliseconds into a ``Time`` namedtuple.

    @param msec  Duration in milliseconds
    Returns ``Time(hours, minutes, seconds, milliseconds)``; hours are not
    capped, so durations of a day or more keep accumulating in ``hours``.
    """
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)
890 def formatSeconds(secs
, delim
=':', msec
=False):
891 time
= timetuple_from_msec(secs
* 1000)
893 ret
= '%d%s%02d%s%02d' % (time
.hours
, delim
, time
.minutes
, delim
, time
.seconds
)
895 ret
= '%d%s%02d' % (time
.minutes
, delim
, time
.seconds
)
897 ret
= '%d' % time
.seconds
898 return '%s.%03d' % (ret
, time
.milliseconds
) if msec
else ret
901 def _ssl_load_windows_store_certs(ssl_context
, storename
):
902 # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
904 certs
= [cert
for cert
, encoding
, trust
in ssl
.enum_certificates(storename
)
905 if encoding
== 'x509_asn' and (
906 trust
is True or ssl
.Purpose
.SERVER_AUTH
.oid
in trust
)]
907 except PermissionError
:
910 with contextlib
.suppress(ssl
.SSLError
):
911 ssl_context
.load_verify_locations(cadata
=cert
)
914 def make_HTTPS_handler(params
, **kwargs
):
915 opts_check_certificate
= not params
.get('nocheckcertificate')
916 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLS_CLIENT
)
917 context
.check_hostname
= opts_check_certificate
918 if params
.get('legacyserverconnect'):
919 context
.options |
= 4 # SSL_OP_LEGACY_SERVER_CONNECT
920 # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
921 context
.set_ciphers('DEFAULT')
922 context
.verify_mode
= ssl
.CERT_REQUIRED
if opts_check_certificate
else ssl
.CERT_NONE
923 if opts_check_certificate
:
924 if has_certifi
and 'no-certifi' not in params
.get('compat_opts', []):
925 context
.load_verify_locations(cafile
=certifi
.where())
928 context
.load_default_certs()
929 # Work around the issue in load_default_certs when there are bad certificates. See:
930 # https://github.com/yt-dlp/yt-dlp/issues/1060,
931 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
933 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
934 if sys
.platform
== 'win32' and hasattr(ssl
, 'enum_certificates'):
935 for storename
in ('CA', 'ROOT'):
936 _ssl_load_windows_store_certs(context
, storename
)
937 context
.set_default_verify_paths()
938 client_certfile
= params
.get('client_certificate')
941 context
.load_cert_chain(
942 client_certfile
, keyfile
=params
.get('client_certificate_key'),
943 password
=params
.get('client_certificate_password'))
945 raise YoutubeDLError('Unable to load client certificate')
946 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
def bug_reports_message(before=';'):
    """Return the standard "please report this issue" text.

    @param before  Text the message is appended to. Trailing whitespace is
                   stripped; when the result is non-empty it is joined to the
                   message with a single space.
    The message starts with a capital letter when *before* is empty or ends
    with ``.``, ``!`` or ``?`` (i.e. when it begins a new sentence).
    """
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , '
           'filling out the appropriate issue template. '
           'Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg
961 class YoutubeDLError(Exception):
962 """Base exception for YoutubeDL errors."""
965 def __init__(self
, msg
=None):
968 elif self
.msg
is None:
969 self
.msg
= type(self
).__name
__
970 super().__init
__(self
.msg
)
# Exception types treated as network failures. Built as a list so that
# ssl.CertificateError can be added when available, then frozen into a tuple
# (usable directly in ``except`` clauses and ``in`` tests).
network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)
979 class ExtractorError(YoutubeDLError
):
980 """Error during info extraction."""
982 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None, ie
=None):
983 """ tb, if given, is the original traceback (so that it can be printed out).
984 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
986 if sys
.exc_info()[0] in network_exceptions
:
989 self
.orig_msg
= str(msg
)
991 self
.expected
= expected
993 self
.video_id
= video_id
995 self
.exc_info
= sys
.exc_info() # preserve original exception
997 super().__init
__(''.join((
998 format_field(ie
, template
='[%s] '),
999 format_field(video_id
, template
='%s: '),
1001 format_field(cause
, template
=' (caused by %r)'),
1002 '' if expected
else bug_reports_message())))
1004 def format_traceback(self
):
1005 return join_nonempty(
1006 self
.traceback
and ''.join(traceback
.format_tb(self
.traceback
)),
1007 self
.cause
and ''.join(traceback
.format_exception(None, self
.cause
, self
.cause
.__traceback
__)[1:]),
1011 class UnsupportedError(ExtractorError
):
1012 def __init__(self
, url
):
1014 'Unsupported URL: %s' % url
, expected
=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        """ countries, if given, is stored unchanged on the exception as self.countries. """
        # Always reported as an expected error, whatever the caller passed
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. The message passed as msg becomes the
    exception's message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """ filename, if given, is appended to the message after a colon. """
        if filename is not None:
            # Augmented assignment reads the class-level msg and stores the
            # extended text on this instance only
            self.msg += f': {filename}'
        super().__init__(self.msg)
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        """ expected is stored unchanged on the exception as self.expected. """
        super().__init__(msg)
        self.expected = expected
1109 class ThrottledDownload(ReExtractInfo
):
1110 """ Download speed below --throttled-rate. """
1111 msg
= 'The download speed is below throttle limit'
1114 super().__init
__(self
.msg
, expected
=False)
1117 class UnavailableVideoError(YoutubeDLError
):
1118 """Unavailable Format exception.
1120 This exception will be thrown when a video is requested
1121 in a format that is not available for that video.
1123 msg
= 'Unable to download video'
1125 def __init__(self
, err
=None):
1127 self
.msg
+= f
': {err}'
1128 super().__init
__(self
.msg
)
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """ downloaded and expected are byte counts; both are kept as attributes. """
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        self.downloaded = downloaded
        self.expected = expected
1146 class XAttrMetadataError(YoutubeDLError
):
1147 def __init__(self
, code
=None, msg
='Unknown error'):
1148 super().__init
__(msg
)
1152 # Parsing code and msg
1153 if (self
.code
in (errno
.ENOSPC
, errno
.EDQUOT
)
1154 or 'No space left' in self
.msg
or 'Disk quota exceeded' in self
.msg
):
1155 self
.reason
= 'NO_SPACE'
1156 elif self
.code
== errno
.E2BIG
or 'Argument list too long' in self
.msg
:
1157 self
.reason
= 'VALUE_TOO_LONG'
1159 self
.reason
= 'NOT_SUPPORTED'
1162 class XAttrUnavailableError(YoutubeDLError
):
1166 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
1167 hc
= http_class(*args
, **kwargs
)
1168 source_address
= ydl_handler
._params
.get('source_address')
1170 if source_address
is not None:
1171 # This is to workaround _create_connection() from socket where it will try all
1172 # address data from getaddrinfo() including IPv6. This filters the result from
1173 # getaddrinfo() based on the source_address value.
1174 # This is based on the cpython socket.create_connection() function.
1175 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1176 def _create_connection(address
, timeout
=socket
._GLOBAL
_DEFAULT
_TIMEOUT
, source_address
=None):
1177 host
, port
= address
1179 addrs
= socket
.getaddrinfo(host
, port
, 0, socket
.SOCK_STREAM
)
1180 af
= socket
.AF_INET
if '.' in source_address
[0] else socket
.AF_INET6
1181 ip_addrs
= [addr
for addr
in addrs
if addr
[0] == af
]
1182 if addrs
and not ip_addrs
:
1183 ip_version
= 'v4' if af
== socket
.AF_INET
else 'v6'
1185 "No remote IP%s addresses available for connect, can't use '%s' as source address"
1186 % (ip_version
, source_address
[0]))
1187 for res
in ip_addrs
:
1188 af
, socktype
, proto
, canonname
, sa
= res
1191 sock
= socket
.socket(af
, socktype
, proto
)
1192 if timeout
is not socket
._GLOBAL
_DEFAULT
_TIMEOUT
:
1193 sock
.settimeout(timeout
)
1194 sock
.bind(source_address
)
1196 err
= None # Explicitly break reference cycle
1198 except OSError as _
:
1200 if sock
is not None:
1205 raise OSError('getaddrinfo returns an empty list')
1206 if hasattr(hc
, '_create_connection'):
1207 hc
._create
_connection
= _create_connection
1208 hc
.source_address
= (source_address
, 0)
def handle_youtubedl_headers(headers):
    """Apply the internal ``Youtubedl-no-compression`` marker header.

    When the marker is absent, *headers* is returned as-is (same object).
    When it is present, a new dict is returned without the marker and
    without any ``Accept-Encoding`` header (compared case-insensitively);
    the mapping passed in is left unmodified.
    """
    marker = 'Youtubedl-no-compression'
    if marker not in headers:
        return headers

    return {
        name: value for name, value in headers.items()
        if name != marker and name.lower() != 'accept-encoding'}
1223 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
1224 """Handler for HTTP requests and responses.
1226 This class, when installed with an OpenerDirector, automatically adds
1227 the standard headers to every HTTP request and handles gzipped and
1228 deflated responses from web servers. If compression is to be avoided in
1229 a particular request, the original request in the program code only has
1230 to include the HTTP header "Youtubedl-no-compression", which will be
1231 removed before making the real request.
1233 Part of this code was copied from:
1235 http://techknack.net/python-urllib2-handlers/
1237 Andrew Rowls, the author of that code, agreed to release it to the
1241 def __init__(self
, params
, *args
, **kwargs
):
1242 compat_urllib_request
.HTTPHandler
.__init
__(self
, *args
, **kwargs
)
1243 self
._params
= params
1245 def http_open(self
, req
):
1246 conn_class
= compat_http_client
.HTTPConnection
1248 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
1250 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
1251 del req
.headers
['Ytdl-socks-proxy']
1253 return self
.do_open(functools
.partial(
1254 _create_http_connection
, self
, conn_class
, False),
1262 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
1264 return zlib
.decompress(data
)
1270 return brotli
.decompress(data
)
1272 def http_request(self
, req
):
1273 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1274 # always respected by websites, some tend to give out URLs with non percent-encoded
1275 # non-ASCII characters (see telemb.py, ard.py [#3412])
1276 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1277 # To work around aforementioned issue we will replace request's original URL with
1278 # percent-encoded one
1279 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1280 # the code of this workaround has been moved here from YoutubeDL.urlopen()
1281 url
= req
.get_full_url()
1282 url_escaped
= escape_url(url
)
1284 # Substitute URL if any change after escaping
1285 if url
!= url_escaped
:
1286 req
= update_Request(req
, url
=url_escaped
)
1288 for h
, v
in self
._params
.get('http_headers', std_headers
).items():
1289 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1290 # The dict keys are capitalized because of this bug by urllib
1291 if h
.capitalize() not in req
.headers
:
1292 req
.add_header(h
, v
)
1294 if 'Accept-encoding' not in req
.headers
:
1295 req
.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS
))
1297 req
.headers
= handle_youtubedl_headers(req
.headers
)
1301 def http_response(self
, req
, resp
):
1304 if resp
.headers
.get('Content-encoding', '') == 'gzip':
1305 content
= resp
.read()
1306 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
1308 uncompressed
= io
.BytesIO(gz
.read())
1309 except OSError as original_ioerror
:
1310 # There may be junk add the end of the file
1311 # See http://stackoverflow.com/q/4928560/35070 for details
1312 for i
in range(1, 1024):
1314 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
1315 uncompressed
= io
.BytesIO(gz
.read())
1320 raise original_ioerror
1321 resp
= compat_urllib_request
.addinfourl(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
1322 resp
.msg
= old_resp
.msg
1323 del resp
.headers
['Content-encoding']
1325 if resp
.headers
.get('Content-encoding', '') == 'deflate':
1326 gz
= io
.BytesIO(self
.deflate(resp
.read()))
1327 resp
= compat_urllib_request
.addinfourl(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
1328 resp
.msg
= old_resp
.msg
1329 del resp
.headers
['Content-encoding']
1331 if resp
.headers
.get('Content-encoding', '') == 'br':
1332 resp
= compat_urllib_request
.addinfourl(
1333 io
.BytesIO(self
.brotli(resp
.read())), old_resp
.headers
, old_resp
.url
, old_resp
.code
)
1334 resp
.msg
= old_resp
.msg
1335 del resp
.headers
['Content-encoding']
1336 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1337 # https://github.com/ytdl-org/youtube-dl/issues/6457).
1338 if 300 <= resp
.code
< 400:
1339 location
= resp
.headers
.get('Location')
1341 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1342 location
= location
.encode('iso-8859-1').decode()
1343 location_escaped
= escape_url(location
)
1344 if location
!= location_escaped
:
1345 del resp
.headers
['Location']
1346 resp
.headers
['Location'] = location_escaped
1349 https_request
= http_request
1350 https_response
= http_response
1353 def make_socks_conn_class(base_class
, socks_proxy
):
1354 assert issubclass(base_class
, (
1355 compat_http_client
.HTTPConnection
, compat_http_client
.HTTPSConnection
))
1357 url_components
= compat_urlparse
.urlparse(socks_proxy
)
1358 if url_components
.scheme
.lower() == 'socks5':
1359 socks_type
= ProxyType
.SOCKS5
1360 elif url_components
.scheme
.lower() in ('socks', 'socks4'):
1361 socks_type
= ProxyType
.SOCKS4
1362 elif url_components
.scheme
.lower() == 'socks4a':
1363 socks_type
= ProxyType
.SOCKS4A
1365 def unquote_if_non_empty(s
):
1368 return compat_urllib_parse_unquote_plus(s
)
1372 url_components
.hostname
, url_components
.port
or 1080,
1374 unquote_if_non_empty(url_components
.username
),
1375 unquote_if_non_empty(url_components
.password
),
1378 class SocksConnection(base_class
):
1380 self
.sock
= sockssocket()
1381 self
.sock
.setproxy(*proxy_args
)
1382 if isinstance(self
.timeout
, (int, float)):
1383 self
.sock
.settimeout(self
.timeout
)
1384 self
.sock
.connect((self
.host
, self
.port
))
1386 if isinstance(self
, compat_http_client
.HTTPSConnection
):
1387 if hasattr(self
, '_context'): # Python > 2.6
1388 self
.sock
= self
._context
.wrap_socket(
1389 self
.sock
, server_hostname
=self
.host
)
1391 self
.sock
= ssl
.wrap_socket(self
.sock
)
1393 return SocksConnection
1396 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
1397 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
1398 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
1399 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
1400 self
._params
= params
1402 def https_open(self
, req
):
1404 conn_class
= self
._https
_conn
_class
1406 if hasattr(self
, '_context'): # python > 2.6
1407 kwargs
['context'] = self
._context
1408 if hasattr(self
, '_check_hostname'): # python 3.x
1409 kwargs
['check_hostname'] = self
._check
_hostname
1411 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
1413 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
1414 del req
.headers
['Ytdl-socks-proxy']
1417 return self
.do_open(
1418 functools
.partial(_create_http_connection
, self
, conn_class
, True), req
, **kwargs
)
1419 except urllib
.error
.URLError
as e
:
1420 if (isinstance(e
.reason
, ssl
.SSLError
)
1421 and getattr(e
.reason
, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1422 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1426 class YoutubeDLCookieJar(compat_cookiejar
.MozillaCookieJar
):
1428 See [1] for cookie file format.
1430 1. https://curl.haxx.se/docs/http-cookies.html
1432 _HTTPONLY_PREFIX
= '#HttpOnly_'
1434 _HEADER
= '''# Netscape HTTP Cookie File
1435 # This file is generated by yt-dlp. Do not edit.
1438 _CookieFileEntry
= collections
.namedtuple(
1440 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
1442 def __init__(self
, filename
=None, *args
, **kwargs
):
1443 super().__init
__(None, *args
, **kwargs
)
1444 if self
.is_path(filename
):
1445 filename
= os
.fspath(filename
)
1446 self
.filename
= filename
1449 def _true_or_false(cndn
):
1450 return 'TRUE' if cndn
else 'FALSE'
1454 return isinstance(file, (str, bytes, os
.PathLike
))
1456 @contextlib.contextmanager
1457 def open(self
, file, *, write
=False):
1458 if self
.is_path(file):
1459 with open(file, 'w' if write
else 'r', encoding
='utf-8') as f
:
1466 def _really_save(self
, f
, ignore_discard
=False, ignore_expires
=False):
1469 if (not ignore_discard
and cookie
.discard
1470 or not ignore_expires
and cookie
.is_expired(now
)):
1472 name
, value
= cookie
.name
, cookie
.value
1474 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1475 # with no name, whereas http.cookiejar regards it as a
1476 # cookie with no value.
1477 name
, value
= '', name
1478 f
.write('%s\n' % '\t'.join((
1480 self
._true
_or
_false
(cookie
.domain
.startswith('.')),
1482 self
._true
_or
_false
(cookie
.secure
),
1483 str_or_none(cookie
.expires
, default
=''),
1487 def save(self
, filename
=None, *args
, **kwargs
):
1489 Save cookies to a file.
1490 Code is taken from CPython 3.6
1491 https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
1493 if filename
is None:
1494 if self
.filename
is not None:
1495 filename
= self
.filename
1497 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
1499 # Store session cookies with `expires` set to 0 instead of an empty string
1501 if cookie
.expires
is None:
1504 with self
.open(filename
, write
=True) as f
:
1505 f
.write(self
._HEADER
)
1506 self
._really
_save
(f
, *args
, **kwargs
)
1508 def load(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
1509 """Load cookies from a file."""
1510 if filename
is None:
1511 if self
.filename
is not None:
1512 filename
= self
.filename
1514 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
1516 def prepare_line(line
):
1517 if line
.startswith(self
._HTTPONLY
_PREFIX
):
1518 line
= line
[len(self
._HTTPONLY
_PREFIX
):]
1519 # comments and empty lines are fine
1520 if line
.startswith('#') or not line
.strip():
1522 cookie_list
= line
.split('\t')
1523 if len(cookie_list
) != self
._ENTRY
_LEN
:
1524 raise compat_cookiejar
.LoadError('invalid length %d' % len(cookie_list
))
1525 cookie
= self
._CookieFileEntry
(*cookie_list
)
1526 if cookie
.expires_at
and not cookie
.expires_at
.isdigit():
1527 raise compat_cookiejar
.LoadError('invalid expires at %s' % cookie
.expires_at
)
1531 with self
.open(filename
) as f
:
1534 cf
.write(prepare_line(line
))
1535 except compat_cookiejar
.LoadError
as e
:
1536 if f
'{line.strip()} '[0] in '[{"':
1537 raise compat_cookiejar
.LoadError(
1538 'Cookies file must be Netscape formatted, not JSON. See '
1539 'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
1540 write_string(f
'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
1543 self
._really
_load
(cf
, filename
, ignore_discard
, ignore_expires
)
1544 # Session cookies are denoted by either `expires` field set to
1545 # an empty string or 0. MozillaCookieJar only recognizes the former
1546 # (see [1]). So we need force the latter to be recognized as session
1547 # cookies on our own.
1548 # Session cookies may be important for cookies-based authentication,
1549 # e.g. usually, when user does not check 'Remember me' check box while
1550 # logging in on a site, some important cookies are stored as session
1551 # cookies so that not recognizing them will result in failed login.
1552 # 1. https://bugs.python.org/issue17164
1554 # Treat `expires=0` cookies as session cookies
1555 if cookie
.expires
== 0:
1556 cookie
.expires
= None
1557 cookie
.discard
= True
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that handles HTTPS exactly like HTTP.

    ``https_request`` is the stock ``http_request`` of the base class and
    ``https_response`` is this class's own ``http_response``.
    """

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        # Plain delegation to the base implementation
        return super().http_response(request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    Modelled on the HTTPRedirectHandler implementation from CPython [1].

    Compared to the stock handler it:
     - routes 301, 303, 307 and 308 responses through the 302 handler, which
       adds support for 308 Permanent Redirect [2] as used by some sites [3]
     - rewrites the request method on redirect the way browsers do

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        Called by the http_error_30x methods when a redirection response is
        received. Returns the new Request to follow, or raises HTTPError when
        the status code / request method combination must not be redirected.
        """
        method = req.get_method()
        # Strictly (according to RFC 2616), 301 or 302 in response to a POST
        # MUST NOT cause a redirection without confirmation from the user.
        # In practice essentially all clients redirect anyway, so we do too.
        redirectable = (
            code in (301, 302, 303, 307, 308) and method in ("GET", "HEAD")
            or code in (301, 302, 303) and method == "POST")
        if not redirectable:
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        # Headers describing the old request body are not carried over
        content_headers = ("content-length", "content-type")
        newheaders = {}
        for key, value in req.headers.items():
            if key.lower() not in content_headers:
                newheaders[key] = value

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if (code == 303 and method != 'HEAD') or (code in (301, 302) and method == 'POST'):
            method = 'GET'

        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=method)
1633 def extract_timezone(date_str
):
1636 ^.{8,}? # >=8 char non-TZ prefix, if present
1637 (?P<tz>Z| # just the UTC Z, or
1638 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1639 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1640 [ ]? # optional space
1641 (?P<sign>\+|-) # +/-
1642 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1646 timezone
= datetime
.timedelta()
1648 date_str
= date_str
[:-len(m
.group('tz'))]
1649 if not m
.group('sign'):
1650 timezone
= datetime
.timedelta()
1652 sign
= 1 if m
.group('sign') == '+' else -1
1653 timezone
= datetime
.timedelta(
1654 hours
=sign
* int(m
.group('hours')),
1655 minutes
=sign
* int(m
.group('minutes')))
1656 return timezone
, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    @param delimiter  Separator between the date and the time part
    @param timezone   timedelta subtracted from the parsed time; when None
                      it is taken from date_str via extract_timezone
    Returns None when date_str is None or does not match the expected layout.
    """
    if date_str is None:
        return None

    # Fractional seconds are not part of the strptime format used below
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        parsed = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S')
        return calendar.timegm((parsed - timezone).timetuple())
    except ValueError:
        return None
def date_formats(day_first=True):
    """Return the module's day-first or month-first date format collection."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    upload_date = None
    # Every format is tried; the last one that parses determines the result
    for fmt in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Fall back to the RFC 2822 style parser from the email package
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if upload_date is None:
        return None
    return compat_str(upload_date)
def unified_timestamp(date_str, day_first=True):
    """Parse a loosely formatted date/time string into a UNIX timestamp.

    Returns None when date_str is None or no known format matches.
    """
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    # A "PM" marker anywhere in the string shifts the result by twelve hours
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    tz_match = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if tz_match:
        date_str = date_str[:-len(tz_match.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    ns_match = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if ns_match:
        date_str = ns_match.group(1)

    # The first format that parses wins
    for fmt in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, fmt) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            continue

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
    return None
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from *url*, falling back to *default_ext*.

    The text after the last '.' of the part before any '?' is used when it is
    purely alphanumeric; otherwise it is accepted only if, with trailing
    slashes removed, it is one of KNOWN_EXTENSIONS.
    """
    if url is None or '.' not in url:
        return default_ext

    before_query = url.partition('?')[0]
    candidate = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    return trimmed if trimmed in KNOWN_EXTENSIONS else default_ext
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return replace_extension(filename, '<sub_lang>.<sub_format>', expected_real_ext)."""
    sub_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, sub_ext, expected_real_ext)
1752 def datetime_from_str(date_str
, precision
='auto', format
='%Y%m%d'):
1754 Return a datetime object from a string.
1756 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1758 @param format strftime format of DATE
1759 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1760 auto: round to the unit provided in date_str (if applicable).
1762 auto_precision
= False
1763 if precision
== 'auto':
1764 auto_precision
= True
1765 precision
= 'microsecond'
1766 today
= datetime_round(datetime
.datetime
.utcnow(), precision
)
1767 if date_str
in ('now', 'today'):
1769 if date_str
== 'yesterday':
1770 return today
- datetime
.timedelta(days
=1)
1772 r
'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1774 if match
is not None:
1775 start_time
= datetime_from_str(match
.group('start'), precision
, format
)
1776 time
= int(match
.group('time')) * (-1 if match
.group('sign') == '-' else 1)
1777 unit
= match
.group('unit')
1778 if unit
== 'month' or unit
== 'year':
1779 new_date
= datetime_add_months(start_time
, time
* 12 if unit
== 'year' else time
)
1785 delta
= datetime
.timedelta(**{unit + 's': time}
)
1786 new_date
= start_time
+ delta
1788 return datetime_round(new_date
, unit
)
1791 return datetime_round(datetime
.datetime
.strptime(date_str
, format
), precision
)
def date_from_str(date_str, format='%Y%m%d', strict=False):
    r"""
    Return a date object from a string using datetime_from_str

    @param format   strftime format of DATE, passed on to datetime_from_str
    @param strict   Restrict allowed patterns to "YYYYMMDD" and
                    (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    Raises ValueError when strict is set and date_str matches neither pattern.
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if re.fullmatch(allowed, date_str) is None:
            raise ValueError(f'Invalid date format "{date_str}"')

    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the target month, so e.g.
    January 31st plus one month yields the last day of February.
    """
    # Work with a zero-based month index so divmod handles year rollover
    year_shift, month_index = divmod(dt.month + months - 1, 12)
    year, month = dt.year + year_shift, month_index + 1
    last_day = calendar.monthrange(year, month)[1]
    return dt.replace(year, month, min(dt.day, last_day))
1815 def datetime_round(dt
, precision
='day'):
1817 Round a datetime object's time to a specific precision
1819 if precision
== 'microsecond':
1828 roundto
= lambda x
, n
: ((x
+ n
/ 2) // n
) * n
1829 timestamp
= calendar
.timegm(dt
.timetuple())
1830 return datetime
.datetime
.utcfromtimestamp(roundto(timestamp
, unit_seconds
[precision
]))
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    year, month, day = match.groups()
    return f'{year}-{month}-{day}'
1844 """Represents a time interval between two dates"""
1846 def __init__(self
, start
=None, end
=None):
1847 """start and end must be strings in the format accepted by date"""
1848 if start
is not None:
1849 self
.start
= date_from_str(start
, strict
=True)
1851 self
.start
= datetime
.datetime
.min.date()
1853 self
.end
= date_from_str(end
, strict
=True)
1855 self
.end
= datetime
.datetime
.max.date()
1856 if self
.start
> self
.end
:
1857 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
1861 """Returns a range that only contains the given day"""
1862 return cls(day
, day
)
1864 def __contains__(self
, date
):
1865 """Check if the date is in the range"""
1866 if not isinstance(date
, datetime
.date
):
1867 date
= date_from_str(date
)
1868 return self
.start
<= date
<= self
.end
1871 return f
'{self.start.isoformat()} - {self.end.isoformat()}'
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte strings are decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
def get_windows_version():
    ''' Get Windows version. None if it's not running on Windows '''
    if compat_os_name != 'nt':
        return None
    # Second field of win32_ver() is handed to version_tuple
    return version_tuple(platform.win32_ver()[1])
def write_string(s, out=None, encoding=None):
    """Write the str *s* to *out* (default: sys.stderr) and flush it.

    Streams whose ``mode`` contains 'b' receive encoded bytes directly;
    streams exposing a ``buffer`` receive encoded bytes through that buffer;
    anything else receives the string itself. Characters that cannot be
    encoded are dropped ('ignore').
    """
    assert isinstance(s, str)
    stream = out or sys.stderr

    if compat_os_name == 'nt' and supports_terminal_sequences(stream):
        # On Windows terminals a space is inserted before each newline run
        s = re.sub(r'([\r\n]+)', r' \1', s)

    if 'b' in getattr(stream, 'mode', ''):
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        codec = encoding or getattr(stream, 'encoding', None) or preferredencoding()
        stream.buffer.write(s.encode(codec, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
def bytes_to_intlist(bs):
    """Return the items of *bs* as a list of integers.

    Sequences that already yield ints (e.g. bytes) are copied into a list;
    otherwise every item is converted with ord(). Empty input gives [].
    """
    if not bs:
        return []
    first = bs[0]
    if not isinstance(first, int):
        return [ord(ch) for ch in bs]
    return list(bs)  # Python 3 bytes already iterate as ints
def intlist_to_bytes(xs):
    """Pack a sequence of integers (each 0-255) into a bytes object."""
    if xs:
        # One unsigned byte ('B') per element
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
class LockingUnsupportedError(OSError):
    """Error carrying a fixed message saying file locking is unsupported."""

    # IOError is an alias of OSError, so the base class is unchanged
    msg = 'File locking is not supported on this platform'

    def __init__(self):
        super().__init__(self.msg)
1933 # Cross-platform file locking
1934 if sys
.platform
== 'win32':
1935 import ctypes
.wintypes
1938 class OVERLAPPED(ctypes
.Structure
):
1940 ('Internal', ctypes
.wintypes
.LPVOID
),
1941 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
1942 ('Offset', ctypes
.wintypes
.DWORD
),
1943 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
1944 ('hEvent', ctypes
.wintypes
.HANDLE
),
1947 kernel32
= ctypes
.windll
.kernel32
1948 LockFileEx
= kernel32
.LockFileEx
1949 LockFileEx
.argtypes
= [
1950 ctypes
.wintypes
.HANDLE
, # hFile
1951 ctypes
.wintypes
.DWORD
, # dwFlags
1952 ctypes
.wintypes
.DWORD
, # dwReserved
1953 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
1954 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
1955 ctypes
.POINTER(OVERLAPPED
) # Overlapped
1957 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
1958 UnlockFileEx
= kernel32
.UnlockFileEx
1959 UnlockFileEx
.argtypes
= [
1960 ctypes
.wintypes
.HANDLE
, # hFile
1961 ctypes
.wintypes
.DWORD
, # dwReserved
1962 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
1963 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
1964 ctypes
.POINTER(OVERLAPPED
) # Overlapped
1966 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
1967 whole_low
= 0xffffffff
1968 whole_high
= 0x7fffffff
1970 def _lock_file(f
, exclusive
, block
):
1971 overlapped
= OVERLAPPED()
1972 overlapped
.Offset
= 0
1973 overlapped
.OffsetHigh
= 0
1974 overlapped
.hEvent
= 0
1975 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
1977 if not LockFileEx(msvcrt
.get_osfhandle(f
.fileno()),
1978 (0x2 if exclusive
else 0x0) |
(0x0 if block
else 0x1),
1979 0, whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
1980 raise BlockingIOError('Locking file failed: %r' % ctypes
.FormatError())
1982 def _unlock_file(f
):
1983 assert f
._lock
_file
_overlapped
_p
1984 handle
= msvcrt
.get_osfhandle(f
.fileno())
1985 if not UnlockFileEx(handle
, 0, whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
1986 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
1992 def _lock_file(f
, exclusive
, block
):
1993 flags
= fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
1995 flags |
= fcntl
.LOCK_NB
1997 fcntl
.flock(f
, flags
)
1998 except BlockingIOError
:
2000 except OSError: # AOSP does not have flock()
2001 fcntl
.lockf(f
, flags
)
2003 def _unlock_file(f
):
2005 fcntl
.flock(f
, fcntl
.LOCK_UN
)
2007 fcntl
.lockf(f
, fcntl
.LOCK_UN
)
2011 def _lock_file(f
, exclusive
, block
):
2012 raise LockingUnsupportedError()
2014 def _unlock_file(f
):
2015 raise LockingUnsupportedError()
2021 def __init__(self
, filename
, mode
, block
=True, encoding
=None):
2022 if mode
not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}
:
2023 raise NotImplementedError(mode
)
2024 self
.mode
, self
.block
= mode
, block
2026 writable
= any(f
in mode
for f
in 'wax+')
2027 readable
= any(f
in mode
for f
in 'r+')
2028 flags
= functools
.reduce(operator
.ior
, (
2029 getattr(os
, 'O_CLOEXEC', 0), # UNIX only
2030 getattr(os
, 'O_BINARY', 0), # Windows only
2031 getattr(os
, 'O_NOINHERIT', 0), # Windows only
2032 os
.O_CREAT
if writable
else 0, # O_TRUNC only after locking
2033 os
.O_APPEND
if 'a' in mode
else 0,
2034 os
.O_EXCL
if 'x' in mode
else 0,
2035 os
.O_RDONLY
if not writable
else os
.O_RDWR
if readable
else os
.O_WRONLY
,
2038 self
.f
= os
.fdopen(os
.open(filename
, flags
, 0o666), mode
, encoding
=encoding
)
2040 def __enter__(self
):
2041 exclusive
= 'r' not in self
.mode
2043 _lock_file(self
.f
, exclusive
, self
.block
)
2048 if 'w' in self
.mode
:
2051 except OSError as e
:
2052 if e
.errno
!= 29: # Illegal seek, expected when self.f is a FIFO
2060 _unlock_file(self
.f
)
2064 def __exit__(self
, *_
):
2073 def __getattr__(self
, attr
):
2074 return getattr(self
.f
, attr
)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
def shell_quote(args):
    """Quote every item of *args* and join the results with single spaces.

    bytes items are first decoded with the filesystem encoding.
    """
    encoding = get_filesystem_encoding()

    def _as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join([compat_shlex_quote(_as_text(arg)) for arg in args])
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    Data already smuggled into *url* is merged into *data* (in place, with
    the already-smuggled values taking precedence) and the result is appended
    to the bare URL as a JSON-carrying fragment.
    """
    bare_url, existing = unsmuggle_url(url, {})
    data.update(existing)
    payload = json.dumps(data)
    fragment = compat_urllib_parse_urlencode({'__youtubedl_smuggle': payload})
    return bare_url + '#' + fragment
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into ``(url, data)``.

    URLs without the '#__youtubedl_smuggle' marker are returned unchanged,
    paired with *default*.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default

    # The payload lives in the last fragment of the URL
    bare_url, _, fragment = smug_url.rpartition('#')
    encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return bare_url, json.loads(encoded)
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc

    @param num     Number to format; converted with float_or_none
    @param fmt     %-format taking the scaled number and the suffix
    @param factor  Base of the suffix steps; 1024 switches to binary
                   suffixes (Ki, Mi, ...)
    Returns None when num cannot be converted or is negative.
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp to [0, len(suffixes)]. Very small positive numbers have a
        # negative logarithm, which would otherwise index the suffix list
        # from its end and scale the value up (e.g. 0.0005 -> "0Y")
        exponent = max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
def format_bytes(bytes):
    """Format a byte count with 1024-based suffixes, or 'N/A' if that fails."""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2133 def lookup_unit_table(unit_table
, s
):
2134 units_re
= '|'.join(re
.escape(u
) for u
in unit_table
)
2136 r
'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re
, s
)
2139 num_str
= m
.group('num').replace(',', '.')
2140 mult
= unit_table
[m
.group('unit')]
2141 return int(float(num_str
) * mult
)
2144 def parse_filesize(s
):
2148 # The lower-case forms are of course incorrect and unofficial,
2149 # but we support those too
2166 'megabytes': 1000 ** 2,
2167 'mebibytes': 1024 ** 2,
2173 'gigabytes': 1000 ** 3,
2174 'gibibytes': 1024 ** 3,
2180 'terabytes': 1000 ** 4,
2181 'tebibytes': 1024 ** 4,
2187 'petabytes': 1000 ** 5,
2188 'pebibytes': 1024 ** 5,
2194 'exabytes': 1000 ** 6,
2195 'exbibytes': 1024 ** 6,
2201 'zettabytes': 1000 ** 7,
2202 'zebibytes': 1024 ** 7,
2208 'yottabytes': 1000 ** 8,
2209 'yobibytes': 1024 ** 8,
2212 return lookup_unit_table(_UNIT_TABLE
, s
)
2219 s
= re
.sub(r
'^[^\d]+\s', '', s
).strip()
2221 if re
.match(r
'^[\d,.]+$', s
):
2222 return str_to_int(s
)
2235 ret
= lookup_unit_table(_UNIT_TABLE
, s
)
2239 mobj
= re
.match(r
'([\d,.]+)(?:$|\s)', s
)
2241 return str_to_int(mobj
.group(1))
def parse_resolution(s, *, lenient=False):
    """Extract width/height information from a free-form string.

    Recognised, in this order: "<w>x<h>" (also with X, × or a comma),
    "<h>p" / "<h>i", and "4k" / "8k". Unless *lenient* is set, the
    "<w>x<h>" form must not be directly surrounded by letters or digits.
    Returns a dict with 'width' and/or 'height', or {} when nothing matches.
    """
    if s is None:
        return {}

    dimensions_re = (
        r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
        else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    mobj = re.search(dimensions_re, s)
    if mobj:
        return {'width': int(mobj.group('w')), 'height': int(mobj.group('h'))}

    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(mobj.group(1)) * 540}

    return {}
def parse_bitrate(s):
    """Return the integer in front of 'kbps' in *s*, or None if there is none.

    Non-string input yields None.
    """
    if not isinstance(s, compat_str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    return int(mobj.group(1)) if mobj else None
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name

    Names are looked up in MONTH_NAMES[lang], falling back to the English
    list for unknown languages. Returns None for an unknown name.
    """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name not in names:
        return None
    return names.index(name) + 1
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviation, i.e. the first three letters of its name.

    Returns None for an unknown abbreviation.
    """
    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
2298 def fix_xml_ampersands(xml_str
):
2299 """Replace all the '&' by '&' in XML"""
2301 r
'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2306 def setproctitle(title
):
2307 assert isinstance(title
, compat_str
)
2309 # ctypes in Jython is not complete
2310 # http://bugs.jython.org/issue2148
2311 if sys
.platform
.startswith('java'):
2315 libc
= ctypes
.cdll
.LoadLibrary('libc.so.6')
2319 # LoadLibrary in Windows Python 2.7.13 only expects
2320 # a bytestring, but since unicode_literals turns
2321 # every string into a unicode string, it fails.
2323 title_bytes
= title
.encode()
2324 buf
= ctypes
.create_string_buffer(len(title_bytes
))
2325 buf
.value
= title_bytes
2327 libc
.prctl(15, buf
, 0, 0, 0)
2328 except AttributeError:
2329 return # Strange libc, just skip this
def remove_start(s, start):
    """Return *s* without the prefix *start*; None and non-matching *s* pass through."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Return *s* without the suffix *end*; None and non-matching *s* pass through.

    An empty *end* leaves *s* unchanged.
    """
    if s is None or not s.endswith(end):
        return s
    # Slice by absolute position: s[:-len(end)] would collapse to s[:0]
    # (the empty string) for an empty suffix
    return s[:len(s) - len(end)]
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding *s*.

    None, strings shorter than two characters and unquoted strings are
    returned unchanged.
    """
    if s is None or len(s) < 2:
        return s
    opening, closing = s[0], s[-1]
    if opening == closing and opening in ('"', "'"):
        return s[1:-1]
    return s
def get_domain(url):
    """Return the host part of *url* without scheme and leading 'www.'.

    Returns None when *url* contains no dotted host at its start.
    """
    mobj = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if mobj is None:
        return None
    return mobj.group('domain')
def url_basename(url):
    """Return the last segment of the path of *url* ('' if the path is empty)."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rsplit('/', 1)[-1]
2360 return re
.match(r
'https?://[^?#&]+/', url
).group()
def urljoin(base, path):
    """Join *path* onto *base*, tolerating bytes and invalid input.

    Returns None when *path* is empty or not a string. A *path* that already
    carries a scheme or starts with '//' is returned as is. Otherwise *base*
    must be an http(s) or protocol-relative URL, else None is returned.
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, compat_str) or not path:
        return None

    path_is_absolute = re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path)
    if path_is_absolute:
        return path

    # base is only inspected once we know it is actually needed
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, compat_str):
        return None
    if not re.match(r'^(?:https?:)?//', base):
        return None

    return compat_urlparse.urljoin(base, path)
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
class PUTRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Return ``int(v) * invscale // scale``, or *default* if v is not convertible.

    @param get_attr  If set (and v is not None), the attribute of this name
                     is read from v first; a missing attribute counts as None
    """
    if v is not None and get_attr:
        v = getattr(v, get_attr, None)
    try:
        result = int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
    return result
def str_or_none(v, default=None):
    """Convert `v` with compat_str, or return `default` when `v` is None."""
    if v is None:
        return default
    return compat_str(v)
2401 def str_to_int(int_str
):
2402 """ A more relaxed version of int_or_none """
2403 if isinstance(int_str
, int):
2405 elif isinstance(int_str
, compat_str
):
2406 int_str
= re
.sub(r
'[,\.\+]', '', int_str
)
2407 return int_or_none(int_str
)
2410 def float_or_none(v
, scale
=1, invscale
=1, default
=None):
2414 return float(v
) * invscale
/ scale
2415 except (ValueError, TypeError):
def bool_or_none(v, default=None):
    """Return `v` only if it is an actual bool; otherwise return `default`."""
    if isinstance(v, bool):
        return v
    return default
def strip_or_none(v, default=None):
    """Return `v.strip()` when `v` is a compat_str; otherwise return `default`."""
    if not isinstance(v, compat_str):
        return default
    return v.strip()
2427 def url_or_none(url
):
2428 if not url
or not isinstance(url
, compat_str
):
2431 return url
if re
.match(r
'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url
) else None
2434 def request_to_url(req
):
2435 if isinstance(req
, compat_urllib_request
.Request
):
2436 return req
.get_full_url()
2441 def strftime_or_none(timestamp
, date_format
, default
=None):
2442 datetime_object
= None
2444 if isinstance(timestamp
, (int, float)): # unix timestamp
2445 datetime_object
= datetime
.datetime
.utcfromtimestamp(timestamp
)
2446 elif isinstance(timestamp
, compat_str
): # assume YYYYMMDD
2447 datetime_object
= datetime
.datetime
.strptime(timestamp
, '%Y%m%d')
2448 return datetime_object
.strftime(date_format
)
2449 except (ValueError, TypeError, AttributeError):
2453 def parse_duration(s
):
2454 if not isinstance(s
, str):
2460 days
, hours
, mins
, secs
, ms
= [None] * 5
2461 m
= re
.match(r
'''(?x)
2463 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2464 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2465 (?P<ms>[.:][0-9]+)?Z?$
2468 days
, hours
, mins
, secs
, ms
= m
.group('days', 'hours', 'mins', 'secs', 'ms')
2473 [0-9]+\s*y(?:ears?)?,?\s*
2476 [0-9]+\s*m(?:onths?)?,?\s*
2479 [0-9]+\s*w(?:eeks?)?,?\s*
2482 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2486 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2489 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2492 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2495 days
, hours
, mins
, secs
, ms
= m
.groups()
2497 m
= re
.match(r
'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s
)
2499 hours
, mins
= m
.groups()
2504 ms
= ms
.replace(':', '.')
2505 return sum(float(part
or 0) * mult
for part
, mult
in (
2506 (days
, 86400), (hours
, 3600), (mins
, 60), (secs
, 1), (ms
, 1)))
2509 def prepend_extension(filename
, ext
, expected_real_ext
=None):
2510 name
, real_ext
= os
.path
.splitext(filename
)
2512 f
'{name}.{ext}{real_ext}'
2513 if not expected_real_ext
or real_ext
[1:] == expected_real_ext
2514 else f
'{filename}.{ext}')
2517 def replace_extension(filename
, ext
, expected_real_ext
=None):
2518 name
, real_ext
= os
.path
.splitext(filename
)
2519 return '{}.{}'.format(
2520 name
if not expected_real_ext
or real_ext
[1:] == expected_real_ext
else filename
,
2524 def check_executable(exe
, args
=[]):
2525 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2526 args can be a list of arguments for a short output (like -version) """
2528 Popen([exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
).communicate_or_kill()
2534 def _get_exe_version_output(exe
, args
, *, to_screen
=None):
2536 to_screen(f
'Checking exe version: {shell_quote([exe] + args)}')
2538 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2539 # SIGTTOU if yt-dlp is run in the background.
2540 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2542 [encodeArgument(exe
)] + args
, stdin
=subprocess
.PIPE
,
2543 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
).communicate_or_kill()
2546 if isinstance(out
, bytes): # Python 2.x
2547 out
= out
.decode('ascii', 'ignore')
2551 def detect_exe_version(output
, version_re
=None, unrecognized
='present'):
2552 assert isinstance(output
, compat_str
)
2553 if version_re
is None:
2554 version_re
= r
'version\s+([-0-9._a-zA-Z]+)'
2555 m
= re
.search(version_re
, output
)
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    The output of _get_exe_version_output(exe, args) is handed to
    detect_exe_version together with `version_re` and `unrecognized`;
    a falsy output short-circuits to False. """
    out = _get_exe_version_output(exe, args)
    if not out:
        return False
    return detect_exe_version(out, version_re, unrecognized)
2570 class LazyList(collections
.abc
.Sequence
):
2571 """Lazy immutable list from an iterable
2572 Note that slices of a LazyList are lists and not LazyList"""
2574 class IndexError(IndexError):
2577 def __init__(self
, iterable
, *, reverse
=False, _cache
=None):
2578 self
._iterable
= iter(iterable
)
2579 self
._cache
= [] if _cache
is None else _cache
2580 self
._reversed
= reverse
2584 # We need to consume the entire iterable to iterate in reverse
2585 yield from self
.exhaust()
2587 yield from self
._cache
2588 for item
in self
._iterable
:
2589 self
._cache
.append(item
)
2593 self
._cache
.extend(self
._iterable
)
2594 self
._iterable
= [] # Discard the emptied iterable to make it pickle-able
2598 """Evaluate the entire iterable"""
2599 return self
._exhaust
()[::-1 if self
._reversed
else 1]
2602 def _reverse_index(x
):
2603 return None if x
is None else -(x
+ 1)
2605 def __getitem__(self
, idx
):
2606 if isinstance(idx
, slice):
2608 idx
= slice(self
._reverse
_index
(idx
.start
), self
._reverse
_index
(idx
.stop
), -(idx
.step
or 1))
2609 start
, stop
, step
= idx
.start
, idx
.stop
, idx
.step
or 1
2610 elif isinstance(idx
, int):
2612 idx
= self
._reverse
_index
(idx
)
2613 start
, stop
, step
= idx
, idx
, 0
2615 raise TypeError('indices must be integers or slices')
2616 if ((start
or 0) < 0 or (stop
or 0) < 0
2617 or (start
is None and step
< 0)
2618 or (stop
is None and step
> 0)):
2619 # We need to consume the entire iterable to be able to slice from the end
2620 # Obviously, never use this with infinite iterables
2623 return self
._cache
[idx
]
2624 except IndexError as e
:
2625 raise self
.IndexError(e
) from e
2626 n
= max(start
or 0, stop
or 0) - len(self
._cache
) + 1
2628 self
._cache
.extend(itertools
.islice(self
._iterable
, n
))
2630 return self
._cache
[idx
]
2631 except IndexError as e
:
2632 raise self
.IndexError(e
) from e
2636 self
[-1] if self
._reversed
else self
[0]
2637 except self
.IndexError:
2643 return len(self
._cache
)
2645 def __reversed__(self
):
2646 return type(self
)(self
._iterable
, reverse
=not self
._reversed
, _cache
=self
._cache
)
2649 return type(self
)(self
._iterable
, reverse
=self
._reversed
, _cache
=self
._cache
)
2652 # repr and str should mimic a list. So we exhaust the iterable
2653 return repr(self
.exhaust())
2656 return repr(self
.exhaust())
2661 class IndexError(IndexError):
2665 # This is only useful for tests
2666 return len(self
.getslice())
2668 def __init__(self
, pagefunc
, pagesize
, use_cache
=True):
2669 self
._pagefunc
= pagefunc
2670 self
._pagesize
= pagesize
2671 self
._pagecount
= float('inf')
2672 self
._use
_cache
= use_cache
2675 def getpage(self
, pagenum
):
2676 page_results
= self
._cache
.get(pagenum
)
2677 if page_results
is None:
2678 page_results
= [] if pagenum
> self
._pagecount
else list(self
._pagefunc
(pagenum
))
2680 self
._cache
[pagenum
] = page_results
2683 def getslice(self
, start
=0, end
=None):
2684 return list(self
._getslice
(start
, end
))
2686 def _getslice(self
, start
, end
):
2687 raise NotImplementedError('This method must be implemented by subclasses')
2689 def __getitem__(self
, idx
):
2690 assert self
._use
_cache
, 'Indexing PagedList requires cache'
2691 if not isinstance(idx
, int) or idx
< 0:
2692 raise TypeError('indices must be non-negative integers')
2693 entries
= self
.getslice(idx
, idx
+ 1)
2695 raise self
.IndexError()
2699 class OnDemandPagedList(PagedList
):
2700 """Download pages until a page with less than maximum results"""
2702 def _getslice(self
, start
, end
):
2703 for pagenum
in itertools
.count(start
// self
._pagesize
):
2704 firstid
= pagenum
* self
._pagesize
2705 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
2706 if start
>= nextfirstid
:
2710 start
% self
._pagesize
2711 if firstid
<= start
< nextfirstid
2714 ((end
- 1) % self
._pagesize
) + 1
2715 if (end
is not None and firstid
<= end
<= nextfirstid
)
2719 page_results
= self
.getpage(pagenum
)
2721 self
._pagecount
= pagenum
- 1
2723 if startv
!= 0 or endv
is not None:
2724 page_results
= page_results
[startv
:endv
]
2725 yield from page_results
2727 # A little optimization - if current page is not "full", ie. does
2728 # not contain page_size videos then we can assume that this page
2729 # is the last one - there are no more ids on further pages -
2730 # i.e. no need to query again.
2731 if len(page_results
) + startv
< self
._pagesize
:
2734 # If we got the whole page, but the next page is not interesting,
2735 # break out early as well
2736 if end
== nextfirstid
:
2740 class InAdvancePagedList(PagedList
):
2741 """PagedList with total number of pages known in advance"""
2743 def __init__(self
, pagefunc
, pagecount
, pagesize
):
2744 PagedList
.__init
__(self
, pagefunc
, pagesize
, True)
2745 self
._pagecount
= pagecount
2747 def _getslice(self
, start
, end
):
2748 start_page
= start
// self
._pagesize
2749 end_page
= self
._pagecount
if end
is None else min(self
._pagecount
, end
// self
._pagesize
+ 1)
2750 skip_elems
= start
- start_page
* self
._pagesize
2751 only_more
= None if end
is None else end
- start
2752 for pagenum
in range(start_page
, end_page
):
2753 page_results
= self
.getpage(pagenum
)
2755 page_results
= page_results
[skip_elems
:]
2757 if only_more
is not None:
2758 if len(page_results
) < only_more
:
2759 only_more
-= len(page_results
)
2761 yield from page_results
[:only_more
]
2763 yield from page_results
2766 def uppercase_escape(s
):
2767 unicode_escape
= codecs
.getdecoder('unicode_escape')
2769 r
'\\U[0-9a-fA-F]{8}',
2770 lambda m
: unicode_escape(m
.group(0))[0],
2774 def lowercase_escape(s
):
2775 unicode_escape
= codecs
.getdecoder('unicode_escape')
2777 r
'\\u[0-9a-fA-F]{4}',
2778 lambda m
: unicode_escape(m
.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters left untouched by the percent-encoding
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe=safe_chars)
2787 def escape_url(url
):
2788 """Escape URL as suggested by RFC 3986"""
2789 url_parsed
= compat_urllib_parse_urlparse(url
)
2790 return url_parsed
._replace
(
2791 netloc
=url_parsed
.netloc
.encode('idna').decode('ascii'),
2792 path
=escape_rfc3986(url_parsed
.path
),
2793 params
=escape_rfc3986(url_parsed
.params
),
2794 query
=escape_rfc3986(url_parsed
.query
),
2795 fragment
=escape_rfc3986(url_parsed
.fragment
)
2800 return compat_parse_qs(compat_urllib_parse_urlparse(url
).query
)
2803 def read_batch_urls(batch_fd
):
2805 if not isinstance(url
, compat_str
):
2806 url
= url
.decode('utf-8', 'replace')
2807 BOM_UTF8
= ('\xef\xbb\xbf', '\ufeff')
2808 for bom
in BOM_UTF8
:
2809 if url
.startswith(bom
):
2810 url
= url
[len(bom
):]
2812 if not url
or url
.startswith(('#', ';', ']')):
2814 # "#" cannot be stripped out since it is part of the URI
2815 # However, it can be safely stripped out if following a whitespace
2816 return re
.split(r
'\s#', url
, 1)[0].rstrip()
2818 with contextlib
.closing(batch_fd
) as fd
:
2819 return [url
for url
in map(fixup
, fd
) if url
]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given arguments and return the result as ASCII bytes.

    All arguments are forwarded unchanged to compat_urllib_parse_urlencode.
    """
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2826 def update_url_query(url
, query
):
2829 parsed_url
= compat_urlparse
.urlparse(url
)
2830 qs
= compat_parse_qs(parsed_url
.query
)
2832 return compat_urlparse
.urlunparse(parsed_url
._replace
(
2833 query
=compat_urllib_parse_urlencode(qs
, True)))
2836 def update_Request(req
, url
=None, data
=None, headers
={}, query={}
):
2837 req_headers
= req
.headers
.copy()
2838 req_headers
.update(headers
)
2839 req_data
= data
or req
.data
2840 req_url
= update_url_query(url
or req
.get_full_url(), query
)
2841 req_get_method
= req
.get_method()
2842 if req_get_method
== 'HEAD':
2843 req_type
= HEADRequest
2844 elif req_get_method
== 'PUT':
2845 req_type
= PUTRequest
2847 req_type
= compat_urllib_request
.Request
2849 req_url
, data
=req_data
, headers
=req_headers
,
2850 origin_req_host
=req
.origin_req_host
, unverifiable
=req
.unverifiable
)
2851 if hasattr(req
, 'timeout'):
2852 new_req
.timeout
= req
.timeout
2856 def _multipart_encode_impl(data
, boundary
):
2857 content_type
= 'multipart/form-data; boundary=%s' % boundary
2860 for k
, v
in data
.items():
2861 out
+= b
'--' + boundary
.encode('ascii') + b
'\r\n'
2862 if isinstance(k
, compat_str
):
2864 if isinstance(v
, compat_str
):
2866 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2867 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2868 content
= b
'Content-Disposition: form-data; name="' + k
+ b
'"\r\n\r\n' + v
+ b
'\r\n'
2869 if boundary
.encode('ascii') in content
:
2870 raise ValueError('Boundary overlaps with data')
2873 out
+= b
'--' + boundary
.encode('ascii') + b
'--\r\n'
2875 return out
, content_type
2878 def multipart_encode(data
, boundary
=None):
2880 Encode a dict to RFC 7578-compliant form-data
2883 A dict where keys and values can be either Unicode or bytes-like
2886 If specified a Unicode object, it's used as the boundary. Otherwise
2887 a random boundary is generated.
2889 Reference: https://tools.ietf.org/html/rfc7578
2891 has_specified_boundary
= boundary
is not None
2894 if boundary
is None:
2895 boundary
= '---------------' + str(random
.randrange(0x0fffffff, 0xffffffff))
2898 out
, content_type
= _multipart_encode_impl(data
, boundary
)
2901 if has_specified_boundary
:
2905 return out
, content_type
2908 def dict_get(d
, key_or_keys
, default
=None, skip_false_values
=True):
2909 for val
in map(d
.get
, variadic(key_or_keys
)):
2910 if val
is not None and (val
or not skip_false_values
):
2915 def try_call(*funcs
, expected_type
=None, args
=[], kwargs
={}):
2918 val
= f(*args
, **kwargs
)
2919 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
2922 if expected_type
is None or isinstance(val
, expected_type
):
def try_get(src, getter, expected_type=None):
    """Call try_call with `src` as the sole positional argument.

    `getter` may be a single callable or several; it is normalised with
    variadic() before being passed on. `expected_type` is forwarded as-is.
    """
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict holding the items of `dct` for which `cndn(key, value)` is truthy.

    With the default condition, entries whose value is None are dropped.
    """
    kept = {}
    for key, value in dct.items():
        if cndn(key, value):
            kept[key] = value
    return kept
2934 def merge_dicts(*dicts
):
2936 for a_dict
in dicts
:
2937 for k
, v
in a_dict
.items():
2938 if (v
is not None and k
not in merged
2939 or isinstance(v
, str) and merged
[k
] == ''):
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as a compat_str.

    A value that already is a compat_str is returned untouched; anything else
    is converted with compat_str(string, encoding, errors). Note that the
    default `encoding` is computed once, when this function is defined.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2957 TV_PARENTAL_GUIDELINES
= {
2967 def parse_age_limit(s
):
2968 # isinstance(False, int) is True. So type() must be used instead
2970 return s
if 0 <= s
<= 21 else None
2971 elif not isinstance(s
, str):
2973 m
= re
.match(r
'^(?P<age>\d{1,2})\+?$', s
)
2975 return int(m
.group('age'))
2978 return US_RATINGS
[s
]
2979 m
= re
.match(r
'^TV[_-]?(%s)$' % '|'.join(k
[3:] for k
in TV_PARENTAL_GUIDELINES
), s
)
2981 return TV_PARENTAL_GUIDELINES
['TV-' + m
.group(1)]
2985 def strip_jsonp(code
):
2988 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
2989 (?:\s*&&\s*(?P=func_name))?
2990 \s*\(\s*(?P<callback_data>.*)\);?
2991 \s*?(?://[^\n]*)*$''',
2992 r
'\g<callback_data>', code
)
2995 def js_to_json(code
, vars={}):
2996 # vars is a dict of var, val pairs to substitute
2997 COMMENT_RE
= r
'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
2998 SKIP_RE
= fr
'\s*(?:{COMMENT_RE})?\s*'
3000 (fr
'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3001 (fr
'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3006 if v
in ('true', 'false', 'null'):
3008 elif v
in ('undefined', 'void 0'):
3010 elif v
.startswith('/*') or v
.startswith('//') or v
.startswith('!') or v
== ',':
3013 if v
[0] in ("'", '"'):
3014 v
= re
.sub(r
'(?s)\\.|"', lambda m
: {
3019 }.get(m
.group(0), m
.group(0)), v
[1:-1])
3021 for regex
, base
in INTEGER_TABLE
:
3022 im
= re
.match(regex
, v
)
3024 i
= int(im
.group(1), base
)
3025 return '"%d":' % i
if v
.endswith(':') else '%d' % i
3032 code
= re
.sub(r
'new Date\((".+")\)', r
'\g<1>', code
)
3034 return re
.sub(r
'''(?sx)
3035 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3036 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3037 {comment}|,(?={skip}[\]}}])|
3038 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3039 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3042 '''.format(comment
=COMMENT_RE
, skip
=SKIP_RE
), fix_kv
, code
)
3045 def qualities(quality_ids
):
3046 """ Get a numeric quality value out of a list of possible values """
3049 return quality_ids
.index(qid
)
3055 POSTPROCESS_WHEN
= ('pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist')
3059 'default': '%(title)s [%(id)s].%(ext)s',
3060 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3066 'description': 'description',
3067 'annotation': 'annotations.xml',
3068 'infojson': 'info.json',
3071 'pl_thumbnail': None,
3072 'pl_description': 'description',
3073 'pl_infojson': 'info.json',
3076 # As of [1] format syntax is:
3077 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3078 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3079 STR_FORMAT_RE_TMPL
= r
'''(?x)
3080 (?<!%)(?P<prefix>(?:%%)*)
3082 (?P<has_key>\((?P<key>{0})\))?
3084 (?P<conversion>[#0\-+ ]+)?
3086 (?P<precision>\.\d+)?
3087 (?P<len_mod>[hlL])? # unused in python
3088 {1} # conversion type
3093 STR_FORMAT_TYPES
= 'diouxXeEfFgGcrs'
3096 def limit_length(s
, length
):
3097 """ Add ellipses to overly long strings """
3102 return s
[:length
- len(ELLIPSES
)] + ELLIPSES
def version_tuple(v):
    """Split the version string `v` on '.' and '-' and return its parts as a tuple of ints."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
3110 def is_outdated_version(version
, limit
, assume_new
=True):
3112 return not assume_new
3114 return version_tuple(version
) < version_tuple(limit
)
3116 return not assume_new
3119 def ytdl_is_updateable():
3120 """ Returns if yt-dlp can be updated with -U """
3122 from .update
import is_non_updateable
3124 return not is_non_updateable()
def args_to_str(args):
    """Get a short string representation for a subprocess command.

    Each argument is quoted with compat_shlex_quote and the results are
    joined with single spaces.
    """
    quoted = map(compat_shlex_quote, args)
    return ' '.join(quoted)
3132 def error_to_compat_str(err
):
def error_to_str(err):
    """Format `err` as '<exception class name>: <str(err)>'."""
    return f'{type(err).__name__}: {err}'
3140 def mimetype2ext(mt
):
3144 mt
, _
, params
= mt
.partition(';')
3149 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3150 # it's the most popular one
3151 'audio/mpeg': 'mp3',
3152 'audio/x-wav': 'wav',
3154 'audio/wave': 'wav',
3157 ext
= FULL_MAP
.get(mt
)
3163 'smptett+xml': 'tt',
3167 'x-mp4-fragmented': 'mp4',
3168 'x-ms-sami': 'sami',
3171 'x-mpegurl': 'm3u8',
3172 'vnd.apple.mpegurl': 'm3u8',
3176 'vnd.ms-sstr+xml': 'ism',
3180 'filmstrip+json': 'fs',
3184 _
, _
, subtype
= mt
.rpartition('/')
3185 ext
= SUBTYPE_MAP
.get(subtype
.lower())
3196 _
, _
, suffix
= subtype
.partition('+')
3197 ext
= SUFFIX_MAP
.get(suffix
)
3201 return subtype
.replace('+', '.')
3204 def ext2mimetype(ext_or_url
):
3207 if '.' not in ext_or_url
:
3208 ext_or_url
= f
'file.{ext_or_url}'
3209 return mimetypes
.guess_type(ext_or_url
)[0]
3212 def parse_codecs(codecs_str
):
3213 # http://tools.ietf.org/html/rfc6381
3216 split_codecs
= list(filter(None, map(
3217 str.strip
, codecs_str
.strip().strip(',').split(','))))
3218 vcodec
, acodec
, scodec
, hdr
= None, None, None, None
3219 for full_codec
in split_codecs
:
3220 parts
= full_codec
.split('.')
3221 codec
= parts
[0].replace('0', '')
3222 if codec
in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3223 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3225 vcodec
= '.'.join(parts
[:4]) if codec
in ('vp9', 'av1', 'hvc1') else full_codec
3226 if codec
in ('dvh1', 'dvhe'):
3228 elif codec
== 'av1' and len(parts
) > 3 and parts
[3] == '10':
3230 elif full_codec
.replace('0', '').startswith('vp9.2'):
3232 elif codec
in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3235 elif codec
in ('stpp', 'wvtt',):
3239 write_string(f
'WARNING: Unknown codec {full_codec}\n')
3240 if vcodec
or acodec
or scodec
:
3242 'vcodec': vcodec
or 'none',
3243 'acodec': acodec
or 'none',
3244 'dynamic_range': hdr
,
3245 **({'scodec': scodec}
if scodec
is not None else {}),
3247 elif len(split_codecs
) == 2:
3249 'vcodec': split_codecs
[0],
3250 'acodec': split_codecs
[1],
3255 def urlhandle_detect_ext(url_handle
):
3256 getheader
= url_handle
.headers
.get
3258 cd
= getheader('Content-Disposition')
3260 m
= re
.match(r
'attachment;\s*filename="(?P<filename>[^"]+)"', cd
)
3262 e
= determine_ext(m
.group('filename'), default_ext
=None)
3266 return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build a base64 `data:` URI from the bytes `data` and the given `mime_type`."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3273 def age_restricted(content_limit
, age_limit
):
3274 """ Returns True iff the content should be blocked """
3276 if age_limit
is None: # No limit set
3278 if content_limit
is None:
3279 return False # Content available for everyone
3280 return age_limit
< content_limit
3283 def is_html(first_bytes
):
3284 """ Detect whether a file contains HTML by examining its first bytes. """
3287 (b
'\xef\xbb\xbf', 'utf-8'),
3288 (b
'\x00\x00\xfe\xff', 'utf-32-be'),
3289 (b
'\xff\xfe\x00\x00', 'utf-32-le'),
3290 (b
'\xff\xfe', 'utf-16-le'),
3291 (b
'\xfe\xff', 'utf-16-be'),
3295 for bom
, enc
in BOMS
:
3296 while first_bytes
.startswith(bom
):
3297 encoding
, first_bytes
= enc
, first_bytes
[len(bom
):]
3299 return re
.match(r
'^\s*<', first_bytes
.decode(encoding
, 'replace'))
3302 def determine_protocol(info_dict
):
3303 protocol
= info_dict
.get('protocol')
3304 if protocol
is not None:
3307 url
= sanitize_url(info_dict
['url'])
3308 if url
.startswith('rtmp'):
3310 elif url
.startswith('mms'):
3312 elif url
.startswith('rtsp'):
3315 ext
= determine_ext(url
)
3321 return compat_urllib_parse_urlparse(url
).scheme
3324 def render_table(header_row
, data
, delim
=False, extra_gap
=0, hide_empty
=False):
3325 """ Render a list of rows, each as a list of values.
3326 Text after a \t will be right aligned """
3328 return len(remove_terminal_sequences(string
).replace('\t', ''))
3330 def get_max_lens(table
):
3331 return [max(width(str(v
)) for v
in col
) for col
in zip(*table
)]
3333 def filter_using_list(row
, filterArray
):
3334 return [col
for take
, col
in itertools
.zip_longest(filterArray
, row
, fillvalue
=True) if take
]
3336 max_lens
= get_max_lens(data
) if hide_empty
else []
3337 header_row
= filter_using_list(header_row
, max_lens
)
3338 data
= [filter_using_list(row
, max_lens
) for row
in data
]
3340 table
= [header_row
] + data
3341 max_lens
= get_max_lens(table
)
3344 table
= [header_row
, [delim
* (ml
+ extra_gap
) for ml
in max_lens
]] + data
3345 table
[1][-1] = table
[1][-1][:-extra_gap
* len(delim
)] # Remove extra_gap from end of delimiter
3347 for pos
, text
in enumerate(map(str, row
)):
3349 row
[pos
] = text
.replace('\t', ' ' * (max_lens
[pos
] - width(text
))) + ' ' * extra_gap
3351 row
[pos
] = text
+ ' ' * (max_lens
[pos
] - width(text
) + extra_gap
)
3352 ret
= '\n'.join(''.join(row
).rstrip() for row
in table
)
3356 def _match_one(filter_part
, dct
, incomplete
):
3357 # TODO: Generalize code with YoutubeDL._build_format_filter
3358 STRING_OPERATORS
= {
3359 '*=': operator
.contains
,
3360 '^=': lambda attr
, value
: attr
.startswith(value
),
3361 '$=': lambda attr
, value
: attr
.endswith(value
),
3362 '~=': lambda attr
, value
: re
.search(value
, attr
),
3364 COMPARISON_OPERATORS
= {
3366 '<=': operator
.le
, # "<=" must be defined above "<"
3373 if isinstance(incomplete
, bool):
3374 is_incomplete
= lambda _
: incomplete
3376 is_incomplete
= lambda k
: k
in incomplete
3378 operator_rex
= re
.compile(r
'''(?x)\s*
3380 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3382 (?P<quote>["\'])(?P
<quotedstrval
>.+?
)(?P
=quote
)|
3386 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3387 m = operator_rex.search(filter_part)
3390 unnegated_op = COMPARISON_OPERATORS[m['op']]
3392 op = lambda attr, value: not unnegated_op(attr, value)
3395 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3397 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3398 actual_value = dct.get(m['key'])
3399 numeric_comparison = None
3400 if isinstance(actual_value, (int, float)):
3401 # If the original field is a string and matching comparisonvalue is
3402 # a number we should respect the origin of the original field
3403 # and process comparison value as a string (see
3404 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3406 numeric_comparison = int(comparison_value)
3408 numeric_comparison = parse_filesize(comparison_value)
3409 if numeric_comparison is None:
3410 numeric_comparison = parse_filesize(f'{comparison_value}B')
3411 if numeric_comparison is None:
3412 numeric_comparison = parse_duration(comparison_value)
3413 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3414 raise ValueError('Operator %s only supports string values!' % m['op'])
3415 if actual_value is None:
3416 return is_incomplete(m['key']) or m['none_inclusive']
3417 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3420 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3421 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3423 operator_rex = re.compile(r'''(?x
)\s
*
3424 (?P
<op
>%s)\s
*(?P
<key
>[a
-z_
]+)
3426 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3427 m = operator_rex.search(filter_part)
3429 op = UNARY_OPERATORS[m.group('op')]
3430 actual_value = dct.get(m.group('key'))
3431 if is_incomplete(m.group('key')) and actual_value is None:
3433 return op(actual_value)
3435 raise ValueError('Invalid filter part %r' % filter_part)
3438 def match_str(filter_str, dct, incomplete=False):
3439 """ Filter a dictionary with a simple string syntax.
3440 @returns Whether the filter passes
3441 @param incomplete Set of keys that is expected to be missing from dct.
3442 Can be True/False to indicate all/none of the keys may be missing.
3443 All conditions on incomplete keys pass if the key is missing
3446 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3447 for filter_part in re.split(r'(?<!\\)&', filter_str))
3450 def match_filter_func(filters):
3453 filters = set(variadic(filters))
3455 interactive = '-' in filters
3459 def _match_func(info_dict, incomplete=False):
3460 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3461 return NO_DEFAULT if interactive and not incomplete else None
3463 video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3464 filter_str = ') | ('.join(map(str.strip, filters))
3465 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3469 def parse_dfxp_time_expr(time_expr):
3473 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3475 return float(mobj.group('time_offset'))
3477 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3479 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format `seconds` as an SRT timecode ('HH:MM:SS,mmm').

    The value is converted to milliseconds and split by timetuple_from_msec,
    whose four fields fill the format string in order.
    """
    msec = seconds * 1000
    return '%02d:%02d:%02d,%03d' % timetuple_from_msec(msec)
def ass_subtitles_timecode(seconds):
    """Format `seconds` as an ASS timecode ('H:MM:SS.cc').

    All fields but the last of timetuple_from_msec's result are used as-is;
    the milliseconds field is divided by 10 to give the final two digits.
    """
    parts = timetuple_from_msec(seconds * 1000)
    centiseconds = parts.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*parts[:-1], centiseconds)
3491 def dfxp2srt(dfxp_data):
3493 @param dfxp_data A
bytes-like
object containing DFXP data
3494 @returns A
unicode object containing converted SRT data
3496 LEGACY_NAMESPACES = (
3497 (b'http://www.w3.org/ns/ttml', [
3498 b'http://www.w3.org/2004/11/ttaf1',
3499 b'http://www.w3.org/2006/04/ttaf1',
3500 b'http://www.w3.org/2006/10/ttaf1',
3502 (b'http://www.w3.org/ns/ttml#styling', [
3503 b'http://www.w3.org/ns/ttml#style',
3507 SUPPORTED_STYLING = [
3516 _x = functools.partial(xpath_with_ns, ns_map={
3517 'xml': 'http://www.w3.org/XML/1998/namespace',
3518 'ttml': 'http://www.w3.org/ns/ttml',
3519 'tts': 'http://www.w3.org/ns/ttml#styling',
3525 class TTMLPElementParser:
3527 _unclosed_elements = []
3528 _applied_styles = []
3530 def start(self, tag, attrib):
3531 if tag in (_x('ttml:br'), 'br'):
3534 unclosed_elements = []
3536 element_style_id = attrib.get('style')
3538 style.update(default_style)
3539 if element_style_id:
3540 style.update(styles.get(element_style_id, {}))
3541 for prop in SUPPORTED_STYLING:
3542 prop_val = attrib.get(_x('tts:' + prop))
3544 style[prop] = prop_val
3547 for k, v in sorted(style.items()):
3548 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3551 font += ' color="%s"' % v
3552 elif k == 'fontSize':
3553 font += ' size="%s"' % v
3554 elif k == 'fontFamily':
3555 font += ' face="%s"' % v
3556 elif k == 'fontWeight' and v == 'bold':
3558 unclosed_elements.append('b')
3559 elif k == 'fontStyle' and v == 'italic':
3561 unclosed_elements.append('i')
3562 elif k == 'textDecoration' and v == 'underline':
3564 unclosed_elements.append('u')
3566 self._out += '<font' + font + '>'
3567 unclosed_elements.append('font')
3569 if self._applied_styles:
3570 applied_style.update(self._applied_styles[-1])
3571 applied_style.update(style)
3572 self._applied_styles.append(applied_style)
3573 self._unclosed_elements.append(unclosed_elements)
3576 if tag not in (_x('ttml:br'), 'br'):
3577 unclosed_elements = self._unclosed_elements.pop()
3578 for element in reversed(unclosed_elements):
3579 self._out += '</%s>' % element
3580 if unclosed_elements and self._applied_styles:
3581 self._applied_styles.pop()
3583 def data(self, data):
3587 return self._out.strip()
3589 def parse_node(node):
3590 target = TTMLPElementParser()
3591 parser = xml.etree.ElementTree.XMLParser(target=target)
3592 parser.feed(xml.etree.ElementTree.tostring(node))
3593 return parser.close()
3595 for k, v in LEGACY_NAMESPACES:
3597 dfxp_data = dfxp_data.replace(ns, k)
3599 dfxp = compat_etree_fromstring(dfxp_data)
3601 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3604 raise ValueError('Invalid dfxp/TTML subtitle')
3608 for style in dfxp.findall(_x('.//ttml:style')):
3609 style_id = style.get('id') or style.get(_x('xml:id'))
3612 parent_style_id = style.get('style')
3614 if parent_style_id not in styles:
3617 styles[style_id] = styles[parent_style_id].copy()
3618 for prop in SUPPORTED_STYLING:
3619 prop_val = style.get(_x('tts:' + prop))
3621 styles.setdefault(style_id, {})[prop] = prop_val
3627 for p in ('body', 'div'):
3628 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3631 style = styles.get(ele.get('style'))
3634 default_style.update(style)
3636 for para, index in zip(paras, itertools.count(1)):
3637 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3638 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3639 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3640 if begin_time is None:
3645 end_time = begin_time + dur
3646 out.append('%d\n%s --> %s\n%s\n\n' % (
3648 srt_subtitles_timecode(begin_time),
3649 srt_subtitles_timecode(end_time),
3655 def cli_option(params, command_option, param):
3656 param = params.get(param)
3658 param = compat_str(param)
3659 return [command_option, param] if param is not None else []
3662 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3663 param = params.get(param)
3666 assert isinstance(param, bool)
3668 return [command_option + separator + (true_value if param else false_value)]
3669 return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Build the argument list for a command-line flag that takes no value.

    @param params           Mapping of option names to their values
    @param command_option   The flag to emit, e.g. '--quiet'
    @param param            Key to look up in `params`
    @param expected_value   Value that `params[param]` must equal for the flag
                            to be emitted
    @returns                [command_option] if the stored value equals
                            `expected_value`, otherwise an empty list
    """
    if params.get(param) == expected_value:
        return [command_option]
    return []
3677 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3678 if isinstance(argdict, (list, tuple)): # for backward compatibility
3685 assert isinstance(argdict, dict)
3687 assert isinstance(keys, (list, tuple))
3688 for key_list in keys:
3689 arg_list = list(filter(
3690 lambda x: x is not None,
3691 [argdict.get(key.lower()) for key in variadic(key_list)]))
3693 return [arg for args in arg_list for arg in args]
3697 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3698 main_key, exe = main_key.lower(), exe.lower()
3699 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3700 keys = [f'{root_key}{k}' for k in (keys or [''])]
3701 if root_key in keys:
3703 keys.append((main_key, exe))
3704 keys.append('default')
3707 return cli_configuration_args(argdict, keys, default, use_compat)
3711 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3770 'iw': 'heb', # Replaced by he in 1989 revision
3780 'in': 'ind', # Replaced by id in 1989 revision
3895 'ji': 'yid', # Replaced by yi in 1989 revision
3903 def short2long(cls, code):
3904 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3905 return cls._lang_map.get(code[:2])
3908 def long2short(cls, code):
3909 """Convert language code from ISO 639-2/T to ISO 639-1"""
3910 for short_name, long_name in cls._lang_map.items():
3911 if long_name == code:
3916 # From http://data.okfn.org/data/core/country-list
3918 'AF': 'Afghanistan',
3919 'AX': 'Åland Islands',
3922 'AS': 'American Samoa',
3927 'AG': 'Antigua and Barbuda',
3944 'BO': 'Bolivia, Plurinational State of',
3945 'BQ': 'Bonaire, Sint Eustatius and Saba',
3946 'BA': 'Bosnia and Herzegovina',
3948 'BV': 'Bouvet Island',
3950 'IO': 'British Indian Ocean Territory',
3951 'BN': 'Brunei Darussalam',
3953 'BF': 'Burkina Faso',
3959 'KY': 'Cayman Islands',
3960 'CF': 'Central African Republic',
3964 'CX': 'Christmas Island',
3965 'CC': 'Cocos (Keeling) Islands',
3969 'CD': 'Congo, the Democratic Republic of the',
3970 'CK': 'Cook Islands',
3972 'CI': 'Côte d\'Ivoire',
3977 'CZ': 'Czech Republic',
3981 'DO': 'Dominican Republic',
3984 'SV': 'El Salvador',
3985 'GQ': 'Equatorial Guinea',
3989 'FK': 'Falkland Islands (Malvinas)',
3990 'FO': 'Faroe Islands',
3994 'GF': 'French Guiana',
3995 'PF': 'French Polynesia',
3996 'TF': 'French Southern Territories',
4011 'GW': 'Guinea-Bissau',
4014 'HM': 'Heard Island and McDonald Islands',
4015 'VA': 'Holy See (Vatican City State)',
4022 'IR': 'Iran, Islamic Republic of',
4025 'IM': 'Isle of Man',
4035 'KP': 'Korea, Democratic People\'s Republic of',
4036 'KR': 'Korea, Republic of',
4039 'LA': 'Lao People\'s Democratic Republic',
4045 'LI': 'Liechtenstein',
4049 'MK': 'Macedonia, the Former Yugoslav Republic of',
4056 'MH': 'Marshall Islands',
4062 'FM': 'Micronesia, Federated States of',
4063 'MD': 'Moldova, Republic of',
4074 'NL': 'Netherlands',
4075 'NC': 'New Caledonia',
4076 'NZ': 'New Zealand',
4081 'NF': 'Norfolk Island',
4082 'MP': 'Northern Mariana Islands',
4087 'PS': 'Palestine, State of',
4089 'PG': 'Papua New Guinea',
4092 'PH': 'Philippines',
4096 'PR': 'Puerto Rico',
4100 'RU': 'Russian Federation',
4102 'BL': 'Saint Barthélemy',
4103 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4104 'KN': 'Saint Kitts and Nevis',
4105 'LC': 'Saint Lucia',
4106 'MF': 'Saint Martin (French part)',
4107 'PM': 'Saint Pierre and Miquelon',
4108 'VC': 'Saint Vincent and the Grenadines',
4111 'ST': 'Sao Tome and Principe',
4112 'SA': 'Saudi Arabia',
4116 'SL': 'Sierra Leone',
4118 'SX': 'Sint Maarten (Dutch part)',
4121 'SB': 'Solomon Islands',
4123 'ZA': 'South Africa',
4124 'GS': 'South Georgia and the South Sandwich Islands',
4125 'SS': 'South Sudan',
4130 'SJ': 'Svalbard and Jan Mayen',
4133 'CH': 'Switzerland',
4134 'SY': 'Syrian Arab Republic',
4135 'TW': 'Taiwan, Province of China',
4137 'TZ': 'Tanzania, United Republic of',
4139 'TL': 'Timor-Leste',
4143 'TT': 'Trinidad and Tobago',
4146 'TM': 'Turkmenistan',
4147 'TC': 'Turks and Caicos Islands',
4151 'AE': 'United Arab Emirates',
4152 'GB': 'United Kingdom',
4153 'US': 'United States',
4154 'UM': 'United States Minor Outlying Islands',
4158 'VE': 'Venezuela, Bolivarian Republic of',
4160 'VG': 'Virgin Islands, British',
4161 'VI': 'Virgin Islands, U.S.',
4162 'WF': 'Wallis and Futuna',
4163 'EH': 'Western Sahara',
4170 def short2full(cls, code):
4171 """Convert an ISO 3166-2 country code to the corresponding full name"""
4172 return cls._country_map.get(code.upper())
4176 # Major IPv4 address blocks per country
4178 'AD': '46.172.224.0/19',
4179 'AE': '94.200.0.0/13',
4180 'AF': '149.54.0.0/17',
4181 'AG': '209.59.64.0/18',
4182 'AI': '204.14.248.0/21',
4183 'AL': '46.99.0.0/16',
4184 'AM': '46.70.0.0/15',
4185 'AO': '105.168.0.0/13',
4186 'AP': '182.50.184.0/21',
4187 'AQ': '23.154.160.0/24',
4188 'AR': '181.0.0.0/12',
4189 'AS': '202.70.112.0/20',
4190 'AT': '77.116.0.0/14',
4191 'AU': '1.128.0.0/11',
4192 'AW': '181.41.0.0/18',
4193 'AX': '185.217.4.0/22',
4194 'AZ': '5.197.0.0/16',
4195 'BA': '31.176.128.0/17',
4196 'BB': '65.48.128.0/17',
4197 'BD': '114.130.0.0/16',
4199 'BF': '102.178.0.0/15',
4200 'BG': '95.42.0.0/15',
4201 'BH': '37.131.0.0/17',
4202 'BI': '154.117.192.0/18',
4203 'BJ': '137.255.0.0/16',
4204 'BL': '185.212.72.0/23',
4205 'BM': '196.12.64.0/18',
4206 'BN': '156.31.0.0/16',
4207 'BO': '161.56.0.0/16',
4208 'BQ': '161.0.80.0/20',
4209 'BR': '191.128.0.0/12',
4210 'BS': '24.51.64.0/18',
4211 'BT': '119.2.96.0/19',
4212 'BW': '168.167.0.0/16',
4213 'BY': '178.120.0.0/13',
4214 'BZ': '179.42.192.0/18',
4215 'CA': '99.224.0.0/11',
4216 'CD': '41.243.0.0/16',
4217 'CF': '197.242.176.0/21',
4218 'CG': '160.113.0.0/16',
4219 'CH': '85.0.0.0/13',
4220 'CI': '102.136.0.0/14',
4221 'CK': '202.65.32.0/19',
4222 'CL': '152.172.0.0/14',
4223 'CM': '102.244.0.0/14',
4224 'CN': '36.128.0.0/10',
4225 'CO': '181.240.0.0/12',
4226 'CR': '201.192.0.0/12',
4227 'CU': '152.206.0.0/15',
4228 'CV': '165.90.96.0/19',
4229 'CW': '190.88.128.0/17',
4230 'CY': '31.153.0.0/16',
4231 'CZ': '88.100.0.0/14',
4233 'DJ': '197.241.0.0/17',
4234 'DK': '87.48.0.0/12',
4235 'DM': '192.243.48.0/20',
4236 'DO': '152.166.0.0/15',
4237 'DZ': '41.96.0.0/12',
4238 'EC': '186.68.0.0/15',
4239 'EE': '90.190.0.0/15',
4240 'EG': '156.160.0.0/11',
4241 'ER': '196.200.96.0/20',
4242 'ES': '88.0.0.0/11',
4243 'ET': '196.188.0.0/14',
4244 'EU': '2.16.0.0/13',
4245 'FI': '91.152.0.0/13',
4246 'FJ': '144.120.0.0/16',
4247 'FK': '80.73.208.0/21',
4248 'FM': '119.252.112.0/20',
4249 'FO': '88.85.32.0/19',
4251 'GA': '41.158.0.0/15',
4253 'GD': '74.122.88.0/21',
4254 'GE': '31.146.0.0/16',
4255 'GF': '161.22.64.0/18',
4256 'GG': '62.68.160.0/19',
4257 'GH': '154.160.0.0/12',
4258 'GI': '95.164.0.0/16',
4259 'GL': '88.83.0.0/19',
4260 'GM': '160.182.0.0/15',
4261 'GN': '197.149.192.0/18',
4262 'GP': '104.250.0.0/19',
4263 'GQ': '105.235.224.0/20',
4264 'GR': '94.64.0.0/13',
4265 'GT': '168.234.0.0/16',
4266 'GU': '168.123.0.0/16',
4267 'GW': '197.214.80.0/20',
4268 'GY': '181.41.64.0/18',
4269 'HK': '113.252.0.0/14',
4270 'HN': '181.210.0.0/16',
4271 'HR': '93.136.0.0/13',
4272 'HT': '148.102.128.0/17',
4273 'HU': '84.0.0.0/14',
4274 'ID': '39.192.0.0/10',
4275 'IE': '87.32.0.0/12',
4276 'IL': '79.176.0.0/13',
4277 'IM': '5.62.80.0/20',
4278 'IN': '117.192.0.0/10',
4279 'IO': '203.83.48.0/21',
4280 'IQ': '37.236.0.0/14',
4281 'IR': '2.176.0.0/12',
4282 'IS': '82.221.0.0/16',
4283 'IT': '79.0.0.0/10',
4284 'JE': '87.244.64.0/18',
4285 'JM': '72.27.0.0/17',
4286 'JO': '176.29.0.0/16',
4287 'JP': '133.0.0.0/8',
4288 'KE': '105.48.0.0/12',
4289 'KG': '158.181.128.0/17',
4290 'KH': '36.37.128.0/17',
4291 'KI': '103.25.140.0/22',
4292 'KM': '197.255.224.0/20',
4293 'KN': '198.167.192.0/19',
4294 'KP': '175.45.176.0/22',
4295 'KR': '175.192.0.0/10',
4296 'KW': '37.36.0.0/14',
4297 'KY': '64.96.0.0/15',
4298 'KZ': '2.72.0.0/13',
4299 'LA': '115.84.64.0/18',
4300 'LB': '178.135.0.0/16',
4301 'LC': '24.92.144.0/20',
4302 'LI': '82.117.0.0/19',
4303 'LK': '112.134.0.0/15',
4304 'LR': '102.183.0.0/16',
4305 'LS': '129.232.0.0/17',
4306 'LT': '78.56.0.0/13',
4307 'LU': '188.42.0.0/16',
4308 'LV': '46.109.0.0/16',
4309 'LY': '41.252.0.0/14',
4310 'MA': '105.128.0.0/11',
4311 'MC': '88.209.64.0/18',
4312 'MD': '37.246.0.0/16',
4313 'ME': '178.175.0.0/17',
4314 'MF': '74.112.232.0/21',
4315 'MG': '154.126.0.0/17',
4316 'MH': '117.103.88.0/21',
4317 'MK': '77.28.0.0/15',
4318 'ML': '154.118.128.0/18',
4319 'MM': '37.111.0.0/17',
4320 'MN': '49.0.128.0/17',
4321 'MO': '60.246.0.0/16',
4322 'MP': '202.88.64.0/20',
4323 'MQ': '109.203.224.0/19',
4324 'MR': '41.188.64.0/18',
4325 'MS': '208.90.112.0/22',
4326 'MT': '46.11.0.0/16',
4327 'MU': '105.16.0.0/12',
4328 'MV': '27.114.128.0/18',
4329 'MW': '102.70.0.0/15',
4330 'MX': '187.192.0.0/11',
4331 'MY': '175.136.0.0/13',
4332 'MZ': '197.218.0.0/15',
4333 'NA': '41.182.0.0/16',
4334 'NC': '101.101.0.0/18',
4335 'NE': '197.214.0.0/18',
4336 'NF': '203.17.240.0/22',
4337 'NG': '105.112.0.0/12',
4338 'NI': '186.76.0.0/15',
4339 'NL': '145.96.0.0/11',
4340 'NO': '84.208.0.0/13',
4341 'NP': '36.252.0.0/15',
4342 'NR': '203.98.224.0/19',
4343 'NU': '49.156.48.0/22',
4344 'NZ': '49.224.0.0/14',
4345 'OM': '5.36.0.0/15',
4346 'PA': '186.72.0.0/15',
4347 'PE': '186.160.0.0/14',
4348 'PF': '123.50.64.0/18',
4349 'PG': '124.240.192.0/19',
4350 'PH': '49.144.0.0/13',
4351 'PK': '39.32.0.0/11',
4352 'PL': '83.0.0.0/11',
4353 'PM': '70.36.0.0/20',
4354 'PR': '66.50.0.0/16',
4355 'PS': '188.161.0.0/16',
4356 'PT': '85.240.0.0/13',
4357 'PW': '202.124.224.0/20',
4358 'PY': '181.120.0.0/14',
4359 'QA': '37.210.0.0/15',
4360 'RE': '102.35.0.0/16',
4361 'RO': '79.112.0.0/13',
4362 'RS': '93.86.0.0/15',
4363 'RU': '5.136.0.0/13',
4364 'RW': '41.186.0.0/16',
4365 'SA': '188.48.0.0/13',
4366 'SB': '202.1.160.0/19',
4367 'SC': '154.192.0.0/11',
4368 'SD': '102.120.0.0/13',
4369 'SE': '78.64.0.0/12',
4370 'SG': '8.128.0.0/10',
4371 'SI': '188.196.0.0/14',
4372 'SK': '78.98.0.0/15',
4373 'SL': '102.143.0.0/17',
4374 'SM': '89.186.32.0/19',
4375 'SN': '41.82.0.0/15',
4376 'SO': '154.115.192.0/18',
4377 'SR': '186.179.128.0/17',
4378 'SS': '105.235.208.0/21',
4379 'ST': '197.159.160.0/19',
4380 'SV': '168.243.0.0/16',
4381 'SX': '190.102.0.0/20',
4383 'SZ': '41.84.224.0/19',
4384 'TC': '65.255.48.0/20',
4385 'TD': '154.68.128.0/19',
4386 'TG': '196.168.0.0/14',
4387 'TH': '171.96.0.0/13',
4388 'TJ': '85.9.128.0/18',
4389 'TK': '27.96.24.0/21',
4390 'TL': '180.189.160.0/20',
4391 'TM': '95.85.96.0/19',
4392 'TN': '197.0.0.0/11',
4393 'TO': '175.176.144.0/21',
4394 'TR': '78.160.0.0/11',
4395 'TT': '186.44.0.0/15',
4396 'TV': '202.2.96.0/19',
4397 'TW': '120.96.0.0/11',
4398 'TZ': '156.156.0.0/14',
4399 'UA': '37.52.0.0/14',
4400 'UG': '102.80.0.0/13',
4402 'UY': '167.56.0.0/13',
4403 'UZ': '84.54.64.0/18',
4404 'VA': '212.77.0.0/19',
4405 'VC': '207.191.240.0/21',
4406 'VE': '186.88.0.0/13',
4407 'VG': '66.81.192.0/20',
4408 'VI': '146.226.0.0/16',
4409 'VN': '14.160.0.0/11',
4410 'VU': '202.80.32.0/20',
4411 'WF': '117.20.32.0/21',
4412 'WS': '202.4.32.0/19',
4413 'YE': '134.35.0.0/16',
4414 'YT': '41.242.116.0/22',
4415 'ZA': '41.0.0.0/11',
4416 'ZM': '102.144.0.0/13',
4417 'ZW': '102.177.192.0/18',
4421 def random_ipv4(cls, code_or_block):
4422 if len(code_or_block) == 2:
4423 block = cls._country_ip_map.get(code_or_block.upper())
4427 block = code_or_block
4428 addr, preflen = block.split('/')
4429 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4430 addr_max = addr_min | (0xffffffff >> int(preflen))
4431 return compat_str(socket.inet_ntoa(
4432 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4435 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4436 def __init__(self, proxies=None):
4437 # Set default handlers
4438 for type in ('http', 'https'):
4439 setattr(self, '%s_open' % type,
4440 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4441 meth(r, proxy, type))
4442 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4444 def proxy_open(self, req, proxy, type):
4445 req_proxy = req.headers.get('Ytdl-request-proxy')
4446 if req_proxy is not None:
4448 del req.headers['Ytdl-request-proxy']
4450 if proxy == '__noproxy__':
4451 return None # No Proxy
4452 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4453 req.add_header('Ytdl-socks-proxy', proxy)
4454 # yt-dlp's http/https handlers do wrapping the socket with socks
4456 return compat_urllib_request.ProxyHandler.proxy_open(
4457 self, req, proxy, type)
4460 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4461 # released into Public Domain
4462 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4464 def long_to_bytes(n, blocksize=0):
4465 """long_to_bytes(n:long, blocksize:int) : string
4466 Convert a long integer to a byte string.
4468 If optional blocksize is given and greater than zero, pad the front of the
4469 byte string with binary zeros so that the length is a multiple of
4472 # after much testing, this algorithm was deemed to be the fastest
4476 s = compat_struct_pack('>I', n & 0xffffffff) + s
4478 # strip off leading zeros
4479 for i in range(len(s)):
4480 if s[i] != b'\000'[0]:
4483 # only happens when n == 0
4487 # add back some pad bytes. this could be done more efficiently w.r.t. the
4488 # de-padding being done above, but sigh...
4489 if blocksize > 0 and len(s) % blocksize:
4490 s = (blocksize - len(s) % blocksize) * b'\000' + s
4494 def bytes_to_long(s):
4495 """bytes_to_long(string) : long
4496 Convert a byte string to a long integer.
4498 This is (essentially) the inverse of long_to_bytes().
4503 extra = (4 - length % 4)
4504 s = b'\000' * extra + s
4505 length = length + extra
4506 for i in range(0, length, 4):
4507 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt a single block using OHDave's RSA scheme. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The byte order is reversed before the data is read as one big hex number
    reversed_hex = binascii.hexlify(data[::-1])
    plaintext = int(reversed_hex, 16)
    ciphertext = pow(plaintext, exponent, modulus)
    return format(ciphertext, 'x')
def pkcs1pad(data, length):
    """
    Pad input data with the PKCS#1 v1.5 encryption scheme (block type 2)

    The result has the layout `00 02 <non-zero padding> 00 <data>`.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if fewer than 8 padding octets would fit
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Padding octets must be non-zero: the first zero octet after the
    # `00 02` prefix is what marks where the actual data begins
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4543 def encode_base_n(num
, n
, table
=None):
4544 FULL_TABLE
= '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4546 table
= FULL_TABLE
[:n
]
4549 raise ValueError('base %d exceeds table length %d' % (n
, len(table
)))
4556 ret
= table
[num
% n
] + ret
4561 def decode_packed_codes(code
):
4562 mobj
= re
.search(PACKED_CODES_RE
, code
)
4563 obfuscated_code
, base
, count
, symbols
= mobj
.groups()
4566 symbols
= symbols
.split('|')
4571 base_n_count
= encode_base_n(count
, base
)
4572 symbol_table
[base_n_count
] = symbols
[count
] or base_n_count
4575 r
'\b(\w+)\b', lambda mobj
: symbol_table
[mobj
.group(0)],
4579 def caesar(s
, alphabet
, shift
):
4584 alphabet
[(alphabet
.index(c
) + shift
) % l
] if c
in alphabet
else c
4589 return caesar(s
, r
'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4592 def parse_m3u8_attributes(attrib
):
4594 for (key
, val
) in re
.findall(r
'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib
):
4595 if val
.startswith('"'):
def urshift(val, n):
    """Shift `val` right by `n` bits as an unsigned 32-bit value.

    Negative inputs are first offset by 2**32 so that the sign bit is
    shifted in as data instead of being propagated.

    @param val  Integer to shift
    @param n    Number of bit positions
    @returns    The shifted integer
    """
    if val < 0:
        val += 0x100000000
    return val >> n
4605 # Based on png2str() written by @gdkchan and improved by @yokrysty
4606 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4607 def decode_png(png_data
):
4608 # Reference: https://www.w3.org/TR/PNG/
4609 header
= png_data
[8:]
4611 if png_data
[:8] != b
'\x89PNG\x0d\x0a\x1a\x0a' or header
[4:8] != b
'IHDR':
4612 raise OSError('Not a valid PNG file.')
4614 int_map
= {1: '>B', 2: '>H', 4: '>I'}
4615 unpack_integer
= lambda x
: compat_struct_unpack(int_map
[len(x
)], x
)[0]
4620 length
= unpack_integer(header
[:4])
4623 chunk_type
= header
[:4]
4626 chunk_data
= header
[:length
]
4627 header
= header
[length
:]
4629 header
= header
[4:] # Skip CRC
4637 ihdr
= chunks
[0]['data']
4639 width
= unpack_integer(ihdr
[:4])
4640 height
= unpack_integer(ihdr
[4:8])
4644 for chunk
in chunks
:
4645 if chunk
['type'] == b
'IDAT':
4646 idat
+= chunk
['data']
4649 raise OSError('Unable to read PNG data.')
4651 decompressed_data
= bytearray(zlib
.decompress(idat
))
4656 def _get_pixel(idx
):
4661 for y
in range(height
):
4662 basePos
= y
* (1 + stride
)
4663 filter_type
= decompressed_data
[basePos
]
4667 pixels
.append(current_row
)
4669 for x
in range(stride
):
4670 color
= decompressed_data
[1 + basePos
+ x
]
4671 basex
= y
* stride
+ x
4676 left
= _get_pixel(basex
- 3)
4678 up
= _get_pixel(basex
- stride
)
4680 if filter_type
== 1: # Sub
4681 color
= (color
+ left
) & 0xff
4682 elif filter_type
== 2: # Up
4683 color
= (color
+ up
) & 0xff
4684 elif filter_type
== 3: # Average
4685 color
= (color
+ ((left
+ up
) >> 1)) & 0xff
4686 elif filter_type
== 4: # Paeth
4692 c
= _get_pixel(basex
- stride
- 3)
4700 if pa
<= pb
and pa
<= pc
:
4701 color
= (color
+ a
) & 0xff
4703 color
= (color
+ b
) & 0xff
4705 color
= (color
+ c
) & 0xff
4707 current_row
.append(color
)
4709 return width
, height
, pixels
4712 def write_xattr(path
, key
, value
):
4713 # Windows: Write xattrs to NTFS Alternate Data Streams:
4714 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4715 if compat_os_name
== 'nt':
4716 assert ':' not in key
4717 assert os
.path
.exists(path
)
4720 with open(f
'{path}:{key}', 'wb') as f
:
4722 except OSError as e
:
4723 raise XAttrMetadataError(e
.errno
, e
.strerror
)
4726 # UNIX Method 1. Use xattrs/pyxattrs modules
4727 from .dependencies
import xattr
4730 if getattr(xattr
, '_yt_dlp__identifier', None) == 'pyxattr':
4731 # Unicode arguments are not supported in pyxattr until version 0.5.0
4732 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4733 if version_tuple(xattr
.__version
__) >= (0, 5, 0):
4734 setxattr
= xattr
.set
4736 setxattr
= xattr
.setxattr
4740 setxattr(path
, key
, value
)
4741 except OSError as e
:
4742 raise XAttrMetadataError(e
.errno
, e
.strerror
)
4745 # UNIX Method 2. Use setfattr/xattr executables
4746 exe
= ('setfattr' if check_executable('setfattr', ['--version'])
4747 else 'xattr' if check_executable('xattr', ['-h']) else None)
4749 raise XAttrUnavailableError(
4750 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4751 + ('"xattr" binary' if sys
.platform
!= 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4753 value
= value
.decode()
4756 [exe
, '-w', key
, value
, path
] if exe
== 'xattr' else [exe
, '-n', key
, '-v', value
, path
],
4757 stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
, stdin
=subprocess
.PIPE
)
4758 except OSError as e
:
4759 raise XAttrMetadataError(e
.errno
, e
.strerror
)
4760 stderr
= p
.communicate_or_kill()[1].decode('utf-8', 'replace')
4762 raise XAttrMetadataError(p
.returncode
, stderr
)
def random_birthday(year_field, month_field, day_field):
    """Pick a random date between 1950-01-01 and 1995-12-31 (both inclusive).

    @param year_field   Key under which the year is stored
    @param month_field  Key under which the month is stored
    @param day_field    Key under which the day of month is stored
    @returns            dict mapping the three given keys to the year,
                        month and day as decimal strings (no zero padding)
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_in_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, span_in_days))
    return {
        year_field: str(birthday.year),
        month_field: str(birthday.month),
        day_field: str(birthday.day),
    }
4777 # Templates for internet shortcut files, which are plain text files.
4778 DOT_URL_LINK_TEMPLATE
= '''\
4783 DOT_WEBLOC_LINK_TEMPLATE
= '''\
4784 <?xml version="1.0" encoding="UTF-8"?>
4785 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4786 <plist version="1.0">
4789 \t<string>%(url)s</string>
4794 DOT_DESKTOP_LINK_TEMPLATE
= '''\
4804 'url': DOT_URL_LINK_TEMPLATE
,
4805 'desktop': DOT_DESKTOP_LINK_TEMPLATE
,
4806 'webloc': DOT_WEBLOC_LINK_TEMPLATE
,
4810 def iri_to_uri(iri
):
4812 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4814 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4817 iri_parts
= compat_urllib_parse_urlparse(iri
)
4819 if '[' in iri_parts
.netloc
:
4820 raise ValueError('IPv6 URIs are not, yet, supported.')
4821 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4823 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4826 if iri_parts
.username
:
4827 net_location
+= urllib
.parse
.quote(iri_parts
.username
, safe
=r
"!$%&'()*+,~")
4828 if iri_parts
.password
is not None:
4829 net_location
+= ':' + urllib
.parse
.quote(iri_parts
.password
, safe
=r
"!$%&'()*+,~")
4832 net_location
+= iri_parts
.hostname
.encode('idna').decode() # Punycode for Unicode hostnames.
4833 # The 'idna' encoding produces ASCII text.
4834 if iri_parts
.port
is not None and iri_parts
.port
!= 80:
4835 net_location
+= ':' + str(iri_parts
.port
)
4837 return urllib
.parse
.urlunparse(
4841 urllib
.parse
.quote_plus(iri_parts
.path
, safe
=r
"!$%&'()*+,/:;=@|~"),
4843 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4844 urllib
.parse
.quote_plus(iri_parts
.params
, safe
=r
"!$%&'()*+,/:;=@|~"),
4846 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4847 urllib
.parse
.quote_plus(iri_parts
.query
, safe
=r
"!$%&'()*+,/:;=?@{|}~"),
4849 urllib
.parse
.quote_plus(iri_parts
.fragment
, safe
=r
"!#$%&'()*+,/:;=?@{|}~")))
4851 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4854 def to_high_limit_path(path
):
4855 if sys
.platform
in ['win32', 'cygwin']:
4856 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4857 return '\\\\?\\' + os
.path
.abspath(path
)
4862 def format_field(obj
, field
=None, template
='%s', ignore
=(None, ''), default
='', func
=None):
4863 val
= traverse_obj(obj
, *variadic(field
))
4866 return template
% (func(val
) if func
else val
)
4869 def clean_podcast_url(url
):
4870 return re
.sub(r
'''(?x)
4874 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4877 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4880 cn\.co| # https://podcorn.com/analytics-prefix/
4881 st\.fm # https://podsights.com/docs/
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Generate a random version 4 UUID in its canonical lowercase text form.

    In the template, each `x` is replaced by a random hex digit, while `y`
    holds the variant bits and is therefore restricted to 8, 9, a or b
    (RFC 4122, section 4.4).

    @returns    String of the form xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx
    """
    def _random_digit(mobj):
        if mobj.group(0) == 'y':
            # Variant "10xx": only indices 8..11 of the hex table are valid
            return _HEX_TABLE[random.randint(8, 11)]
        return _HEX_TABLE[random.randint(0, 15)]

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
def make_dir(path, to_screen=None):
    """Ensure that the directory which should contain `path` exists.

    @param path       Path of a file; only its directory part is created
    @param to_screen  Optional callable that is given an error message when
                      the directory cannot be created
    @returns          True if nothing had to be done or the directory was
                      created, False if an OSError occurred
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            # exist_ok guards against the directory appearing between the
            # existence check and its creation
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # callable() returns a bool, so comparing its result against None
        # was always true and made the default `to_screen=None` crash here
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
def get_executable_path():
    """Return the absolute path of the directory the program is run from.

    Three cases are distinguished: a frozen (PyInstaller) executable, a
    module loaded through a zipimporter, and a regular source file.
    """
    from zipimport import zipimporter

    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        base = os.path.dirname(sys.executable)
    else:
        module_dir = os.path.dirname(__file__)
        if isinstance(__loader__, zipimporter):  # Running from ZIP
            base = os.path.join(module_dir, '../..')
        else:
            base = os.path.join(module_dir, '..')
    return os.path.abspath(base)
4916 def load_plugins(name
, suffix
, namespace
):
4918 with contextlib
.suppress(FileNotFoundError
):
4919 plugins_spec
= importlib
.util
.spec_from_file_location(
4920 name
, os
.path
.join(get_executable_path(), 'ytdlp_plugins', name
, '__init__.py'))
4921 plugins
= importlib
.util
.module_from_spec(plugins_spec
)
4922 sys
.modules
[plugins_spec
.name
] = plugins
4923 plugins_spec
.loader
.exec_module(plugins
)
4924 for name
in dir(plugins
):
4925 if name
in namespace
:
4927 if not name
.endswith(suffix
):
4929 klass
= getattr(plugins
, name
)
4930 classes
[name
] = namespace
[name
] = klass
4935 obj
, *path_list
, default
=None, expected_type
=None, get_all
=True,
4936 casesense
=True, is_user_input
=False, traverse_string
=False):
4937 ''' Traverse nested list/dict/tuple
4938 @param path_list A list of paths which are checked one by one.
4939 Each path is a list of keys where each key is a:
4941 - string: A dictionary key
4942 - int: An index into a list
4943 - tuple: A list of keys all of which will be traversed
4944 - Ellipsis: Fetch all values in the object
4945 - Function: Takes the key and value as arguments
4946 and returns whether the key matches or not
4947 @param default Default value to return
4948 @param expected_type Only accept final value of this type (Can also be any callable)
4949 @param get_all Return all the values obtained from a path or only the first one
4950 @param casesense Whether to consider dictionary keys as case sensitive
4951 @param is_user_input Whether the keys are generated from user input. If True,
4952 strings are converted to int/slice if necessary
4953 @param traverse_string Whether to traverse inside strings. If True, any
4954 non-compatible object will also be converted into a string
4958 _lower
= lambda k
: (k
.lower() if isinstance(k
, str) else k
)
4959 path_list
= (map(_lower
, variadic(path
)) for path
in path_list
)
4961 def _traverse_obj(obj
, path
, _current_depth
=0):
4963 path
= tuple(variadic(path
))
4964 for i
, key
in enumerate(path
):
4965 if None in (key
, obj
):
4967 if isinstance(key
, (list, tuple)):
4968 obj
= [_traverse_obj(obj
, sub_key
, _current_depth
) for sub_key
in key
]
4971 obj
= (obj
.values() if isinstance(obj
, dict)
4972 else obj
if isinstance(obj
, (list, tuple, LazyList
))
4973 else str(obj
) if traverse_string
else [])
4975 depth
= max(depth
, _current_depth
)
4976 return [_traverse_obj(inner_obj
, path
[i
+ 1:], _current_depth
) for inner_obj
in obj
]
4978 if isinstance(obj
, (list, tuple, LazyList
)):
4979 obj
= enumerate(obj
)
4980 elif isinstance(obj
, dict):
4983 if not traverse_string
:
4987 depth
= max(depth
, _current_depth
)
4988 return [_traverse_obj(v
, path
[i
+ 1:], _current_depth
) for k
, v
in obj
if try_call(key
, args
=(k
, v
))]
4989 elif isinstance(obj
, dict) and not (is_user_input
and key
== ':'):
4990 obj
= (obj
.get(key
) if casesense
or (key
in obj
)
4991 else next((v
for k
, v
in obj
.items() if _lower(k
) == key
), None))
4994 key
= (int_or_none(key
) if ':' not in key
4995 else slice(*map(int_or_none
, key
.split(':'))))
4996 if key
== slice(None):
4997 return _traverse_obj(obj
, (..., *path
[i
+ 1:]), _current_depth
)
4998 if not isinstance(key
, (int, slice)):
5000 if not isinstance(obj
, (list, tuple, LazyList
)):
5001 if not traverse_string
:
5010 if isinstance(expected_type
, type):
5011 type_test
= lambda val
: val
if isinstance(val
, expected_type
) else None
5012 elif expected_type
is not None:
5013 type_test
= expected_type
5015 type_test
= lambda val
: val
5017 for path
in path_list
:
5019 val
= _traverse_obj(obj
, path
)
5022 for _
in range(depth
- 1):
5023 val
= itertools
.chain
.from_iterable(v
for v
in val
if v
is not None)
5024 val
= [v
for v
in map(type_test
, val
) if v
is not None]
5026 return val
if get_all
else val
[0]
5028 val
= type_test(val
)
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated wrapper around traverse_obj.

    Writes a deprecation notice via write_string and then delegates to
    traverse_obj with `is_user_input` and `traverse_string` enabled.

    @param dictn      Object to traverse
    @param keys       Path passed on to traverse_obj
    @param casesense  Passed on to traverse_obj unchanged
    """
    deprecation_notice = (
        'DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
        'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    write_string(deprecation_notice)
    legacy_options = {
        'casesense': casesense,
        'is_user_input': True,
        'traverse_string': True,
    }
    return traverse_obj(dictn, keys, **legacy_options)
def get_first(obj, keys, **kwargs):
    """Look up `keys` in every element of `obj` and return the first result.

    The path given to traverse_obj is an Ellipsis followed by `keys`, and
    `get_all=False` is always passed.

    @param obj     Object whose elements are searched
    @param keys    A single key or an iterable of keys forming the path
    @param kwargs  Further keyword arguments for traverse_obj
                   (`get_all` must not be among them)
    """
    search_path = (Ellipsis, *variadic(keys))
    return traverse_obj(obj, search_path, **kwargs, get_all=False)
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return `x` unchanged if it is a usable iterable, else wrap it in a 1-tuple.

    @param x              Any object
    @param allowed_types  Iterable types that are nevertheless treated as a
                          single value and therefore get wrapped
    @returns              `x` itself, or the tuple `(x,)`
    """
    if not isinstance(x, collections.abc.Iterable) or isinstance(x, allowed_types):
        return (x,)
    return x
5048 def decode_base(value
, digits
):
5049 # This will convert given base-x string to scalar (long or int)
5050 table
= {char: index for index, char in enumerate(digits)}
5055 result
+= table
[chr]
def time_seconds(**kwargs):
    """Return the current time as a POSIX timestamp (float seconds).

    @param kwargs  Keyword arguments for datetime.timedelta; they define the
                   UTC offset of the timezone the current time is taken in

    NOTE: the timestamp of a timezone-aware datetime denotes an absolute
    instant, so the offset does not shift the returned number
    """
    utc_offset = datetime.timedelta(**kwargs)
    tz = datetime.timezone(utc_offset)
    now = datetime.datetime.now(tz)
    return now.timestamp()
5064 # create a JSON Web Signature (jws) with HS256 algorithm
5065 # the resulting format is in JWS Compact Serialization
5066 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5067 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5068 def jwt_encode_hs256(payload_data
, key
, headers
={}):
5074 header_data
.update(headers
)
5075 header_b64
= base64
.b64encode(json
.dumps(header_data
).encode())
5076 payload_b64
= base64
.b64encode(json
.dumps(payload_data
).encode())
5077 h
= hmac
.new(key
.encode(), header_b64
+ b
'.' + payload_b64
, hashlib
.sha256
)
5078 signature_b64
= base64
.b64encode(h
.digest())
5079 token
= header_b64
+ b
'.' + payload_b64
+ b
'.' + signature_b64
5083 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded payload of a token in JWS Compact Serialization.

    Only the payload part is decoded; the header is ignored and the
    signature is NOT verified.

    @param jwt       Token string of the form `header.payload.signature`
    @returns         The JSON-decoded payload
    @raises ValueError  if the token does not have exactly three parts or
                        the payload is not valid base64url-encoded JSON
    """
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # JWS omits the trailing '=' padding of base64url (RFC 7515, section 2),
    # but the decoder rejects input whose length is not a multiple of 4
    payload_b64 += '=' * (-len(payload_b64) % 4)
    payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
    return payload_data
5090 def supports_terminal_sequences(stream
):
5091 if compat_os_name
== 'nt':
5092 from .compat
import WINDOWS_VT_MODE
# Must be imported locally
5093 if not WINDOWS_VT_MODE
or get_windows_version() < (10, 0, 10586):
5095 elif not os
.getenv('TERM'):
5098 return stream
.isatty()
5099 except BaseException
:
# ESC, '[', one or more characters other than 'm', then a closing 'm'
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with every match of _terminal_sequences_re deleted."""
    return re.sub(_terminal_sequences_re, '', string)
def number_of_digits(number):
    """Return the length of `number` when formatted with '%d'.

    Note that the count includes the '-' sign of negative numbers, and that
    the fractional part of a float is dropped by the '%d' formatting.
    """
    rendered = '%d' % number
    return len(rendered)
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the truthy `values` with `delim`, converting each one with str().

    If `from_dict` is given, `values` are treated as keys and replaced by
    `from_dict.get(key)` before the falsy entries are dropped.
    """
    if from_dict is not None:
        values = (from_dict.get(key) for key in values)
    return delim.join([str(value) for value in values if value])
5120 def scale_thumbnails_to_max_format_width(formats
, thumbnails
, url_width_re
):
5122 Find the largest format dimensions in terms of video width and, for each thumbnail:
5123 * Modify the URL: Match the width with the provided regex and replace with the former width
5126 This function is useful with video services that scale the provided thumbnails on demand
5128 _keys
= ('width', 'height')
5129 max_dimensions
= max(
5130 (tuple(format
.get(k
) or 0 for k
in _keys
) for format
in formats
),
5132 if not max_dimensions
[0]:
5136 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])}
,
5137 dict(zip(_keys
, max_dimensions
)), thumbnail
)
5138 for thumbnail
in thumbnails
5142 def parse_http_range(range):
5143 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5145 return None, None, None
5146 crg
= re
.search(r
'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5148 return None, None, None
5149 return int(crg
.group(1)), int_or_none(crg
.group(2)), int_or_none(crg
.group(3))
5155 __initialized
= False
def __init__(self, parser, label=None):
    """Store the option parser and label, and start with no loaded paths or child configs."""
    self._parser = parser
    self.label = label
    # Resolved paths of files already loaded (checked in init() to skip repeats)
    self._loaded_paths = set()
    # Child configs added through append_config()
    self.configs = []
5161 def init(self
, args
=None, filename
=None):
5162 assert not self
.__initialized
5165 location
= os
.path
.realpath(filename
)
5166 directory
= os
.path
.dirname(location
)
5167 if location
in self
._loaded
_paths
:
5169 self
._loaded
_paths
.add(location
)
5171 self
.__initialized
= True
5172 self
.own_args
, self
.filename
= args
, filename
5173 for location
in self
._parser
.parse_args(args
)[0].config_locations
or []:
5174 location
= os
.path
.join(directory
, expand_path(location
))
5175 if os
.path
.isdir(location
):
5176 location
= os
.path
.join(location
, 'yt-dlp.conf')
5177 if not os
.path
.exists(location
):
5178 self
._parser
.error(f
'config location {location} does not exist')
5179 self
.append_config(self
.read_file(location
), location
)
5183 label
= join_nonempty(
5184 self
.label
, 'config', f
'"{self.filename}"' if self
.filename
else '',
5186 return join_nonempty(
5187 self
.own_args
is not None and f
'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5188 *(f
'\n{c}'.replace('\n', '\n| ')[1:] for c
in self
.configs
),
5192 def read_file(filename
, default
=[]):
5194 optionf
= open(filename
)
5196 return default
# silently skip if file is not present
5198 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5199 contents
= optionf
.read()
5200 res
= shlex
.split(contents
, comments
=True)
5206 def hide_login_info(opts
):
5207 PRIVATE_OPTS
= {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5208 eqre
= re
.compile('^(?P<key>' + ('|'.join(re
.escape(po
) for po
in PRIVATE_OPTS
)) + ')=.+$')
5213 return m
.group('key') + '=PRIVATE'
5217 opts
= list(map(_scrub_eq
, opts
))
5218 for idx
, opt
in enumerate(opts
):
5219 if opt
in PRIVATE_OPTS
and idx
+ 1 < len(opts
):
5220 opts
[idx
+ 1] = 'PRIVATE'
def append_config(self, *args, label=None):
    """Create a child config of the same type and keep it if it initialises.

    The child gets this object's parser and the given `label`, and shares
    (not copies) this object's `_loaded_paths` set. `args` are passed to the
    child's init(); the child is appended to `self.configs` only when init()
    returns a truthy value.
    """
    child = type(self)(self._parser, label)
    child._loaded_paths = self._loaded_paths
    if not child.init(*args):
        return
    self.configs.append(child)
5231 for config
in reversed(self
.configs
):
5232 yield from config
.all_args
5233 yield from self
.own_args
or []
def parse_args(self):
    """Feed `self.all_args` to the stored parser's parse_args() and return its result."""
    parser = self._parser
    return parser.parse_args(self.all_args)
5239 class WebSocketsWrapper():
5240 """Wraps websockets module to use in non-async scopes"""
5243 def __init__(self
, url
, headers
=None, connect
=True):
5244 self
.loop
= asyncio
.new_event_loop()
5245 # XXX: "loop" is deprecated
5246 self
.conn
= websockets
.connect(
5247 url
, extra_headers
=headers
, ping_interval
=None,
5248 close_timeout
=float('inf'), loop
=self
.loop
, ping_timeout
=float('inf'))
5251 atexit
.register(self
.__exit
__, None, None, None)
5253 def __enter__(self
):
5255 self
.pool
= self
.run_with_loop(self
.conn
.__aenter
__(), self
.loop
)
def send(self, *args):
    """Run `self.pool.send(*args)` to completion on `self.loop`; returns None."""
    runner = self.run_with_loop
    runner(self.pool.send(*args), self.loop)
def recv(self, *args):
    """Run `self.pool.recv(*args)` to completion on `self.loop` and return the result."""
    runner = self.run_with_loop
    return runner(self.pool.recv(*args), self.loop)
5264 def __exit__(self
, type, value
, traceback
):
5266 return self
.run_with_loop(self
.conn
.__aexit
__(type, value
, traceback
), self
.loop
)
5269 self
._cancel
_all
_tasks
(self
.loop
)
5271 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
# for contributors: if any other library that uses asyncio needs to be run from non-async code, move these functions out of this class
5274 def run_with_loop(main
, loop
):
5275 if not asyncio
.iscoroutine(main
):
5276 raise ValueError(f
'a coroutine was expected, got {main!r}')
5279 return loop
.run_until_complete(main
)
5281 loop
.run_until_complete(loop
.shutdown_asyncgens())
5282 if hasattr(loop
, 'shutdown_default_executor'):
5283 loop
.run_until_complete(loop
.shutdown_default_executor())
5286 def _cancel_all_tasks(loop
):
5287 to_cancel
= asyncio
.all_tasks(loop
)
5292 for task
in to_cancel
:
5295 # XXX: "loop" is removed in python 3.10+
5296 loop
.run_until_complete(
5297 asyncio
.gather(*to_cancel
, loop
=loop
, return_exceptions
=True))
5299 for task
in to_cancel
:
5300 if task
.cancelled():
5302 if task
.exception() is not None:
5303 loop
.call_exception_handler({
5304 'message': 'unhandled exception during asyncio.run() shutdown',
5305 'exception': task
.exception(),
def merge_headers(*dicts):
    """Combine HTTP header dicts into one, treating header names case-insensitively.

    Every name is normalised with str.title(); when the same header occurs
    more than once, the value from the later dict wins.
    """
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            merged[name.title()] = value
    return merged
5315 class classproperty
:
5316 def __init__(self
, f
):
5317 functools
.update_wrapper(self
, f
)
5320 def __get__(self
, _
, cls
):
5325 """Immutable namespace"""
5327 def __init__(self
, **kwargs
):
5330 def __getattr__(self
, attr
):
5331 return self
._dict
[attr
]
5333 def __contains__(self
, item
):
5334 return item
in self
._dict
.values()
5337 return iter(self
._dict
.items())
5340 return f
'{type(self).__name__}({", ".join(f"{k}={v}" for k, v in self)})'
# Boolean flags derived from the truthiness of the optional modules imported
# from .dependencies (presumably falsy when the package is unavailable -
# confirm in .dependencies)
has_certifi = bool(certifi)
has_websockets = bool(websockets)