import xml.etree.ElementTree

from .compat import asyncio, functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_html_entities_html5,
    compat_HTMLParseError,
    compat_urllib_parse_unquote_plus,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
)
from .dependencies import brotli, certifi, websockets
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate',
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}

USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))


DATE_FORMATS = (
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M:%S',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d %b %Y',
    '%d %B %Y',
    '%d/%m/%Y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
NUMBER_RE = r'\d+(?:\.\d+)?'


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)


# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
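
# Illustrative usage of the xpath helpers above (a sketch, not from the
# original module; the sample document and values are assumed):
#   doc = compat_etree_fromstring('<root><a foo="bar">baz</a></root>')
#   xpath_text(doc, './a')         # -> 'baz'
#   xpath_attr(doc, './a', 'foo')  # -> 'bar'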


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None
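
# Illustrative usage (a sketch, not from the original module; sample HTML assumed):
#   get_element_by_class('foo', '<div class="foo">bar</div>')  # -> 'bar'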
381 def get_element_html_by_class(class_name
, html
):
382 """Return the html of the first tag with the specified class in the passed HTML document"""
383 retval
= get_elements_html_by_class(class_name
, html
)
384 return retval
[0] if retval
else None
387 def get_element_by_attribute(attribute
, value
, html
, **kwargs
):
388 retval
= get_elements_by_attribute(attribute
, value
, html
, **kwargs
)
389 return retval
[0] if retval
else None
392 def get_element_html_by_attribute(attribute
, value
, html
, **kargs
):
393 retval
= get_elements_html_by_attribute(attribute
, value
, html
, **kargs
)
394 return retval
[0] if retval
else None
397 def get_elements_by_class(class_name
, html
, **kargs
):
398 """Return the content of all tags with the specified class in the passed HTML document as a list"""
399 return get_elements_by_attribute(
400 'class', r
'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
401 html, escape_value=False)
404 def get_elements_html_by_class(class_name, html):
405 """Return the html of all tags with the specified class in the passed HTML document as a list"""
406 return get_elements_html_by_attribute(
407 'class', r'[^
\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
408 html, escape_value=False)
411 def get_elements_by_attribute(*args, **kwargs):
412 """Return the content of the tag with the specified attribute in the passed HTML document"""
413 return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
416 def get_elements_html_by_attribute(*args, **kwargs):
417 """Return the html of the tag with the specified attribute in the passed HTML document"""
418 return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole,
        )


class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.tagstack.clear()

    # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
    # so data remains buffered; we no longer have any interest in it, thus
    # override this method to discard it
    def goahead(self, _):
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """

    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc

    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
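
# Illustrative usage (a sketch, not from the original module; sample HTML assumed):
#   get_element_text_and_html_by_tag('span', '<div><span>x</span></div>')
#   # -> ('x', '<span>x</span>')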


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
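
# Illustrative usage (a sketch, not from the original module; sample element assumed):
#   extract_attributes('<e x="y" z>')  # -> {'x': 'y', 'z': None}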


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = '(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
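
# Illustrative usage (a sketch, not from the original module; outputs assumed):
#   sanitize_filename('Foo: Bar?', restricted=True)  # -> 'Foo_-_Bar'
#   sanitize_filename('01:23:45')                    # -> '01_23_45'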


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
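
# Illustrative usage (a sketch, not from the original module):
#   sanitize_url('//example.com/clip')     # -> 'http://example.com/clip'
#   sanitize_url('httpss://example.com/')  # -> 'https://example.com/'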


def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
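
# Illustrative usage (a sketch, not from the original module; credentials invented):
#   extract_basic_auth('http://user:pass@example.com/')
#   # -> ('http://example.com/', 'Basic dXNlcjpwYXNz')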


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
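
# Illustrative usage (a sketch, not from the original module):
#   unescapeHTML('&amp;x &#34;y&#34;')  # -> '&x "y"'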


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill()
            self.wait()
            raise


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
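
# Illustrative usage (a sketch, not from the original module):
#   formatSeconds(3661)             # -> '1:01:01'
#   formatSeconds(61.5, msec=True)  # -> '1:01.500'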


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , '
           'filling out the appropriate issue template. '
           'Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super().__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = compat_urllib_request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise compat_cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
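
# Illustrative usage (a sketch, not from the original module; value assumed):
#   parse_iso8601('2014-03-23T23:04:26+0100')  # -> 1395612266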


def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
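
# Illustrative usage (a sketch, not from the original module; value assumed):
#   unified_strdate('2010-12-21 15:30:00.123456')  # -> '20101221'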


def unified_timestamp(date_str, day_first=True):
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600


def determine_ext(url, default_ext='unknown_video'):
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
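
# Illustrative usage (a sketch, not from the original module):
#   determine_ext('http://example.com/video.mp4?download=1')  # -> 'mp4'
#   determine_ext('http://example.com/foo/bar.m3u8/?x')       # -> 'm3u8'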


def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)


def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Date can be 'YYYYMMDD' or in the format:
    (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)


def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
        raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
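
# Illustrative usage (a sketch, not from the original module; dates assumed):
#   date_from_str('20231215')   # -> datetime.date(2023, 12, 15)
#   date_from_str('now-1week')  # -> the date one week before today (UTC)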


def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months."""
    month = dt.month + months - 1
    year = dt.year + month // 12
    month = month % 12 + 1
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)
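
# Illustrative usage (a sketch, not from the original module):
#   datetime_add_months(datetime.datetime(2021, 1, 31), 1)
#   # -> datetime.datetime(2021, 2, 28, 0, 0)  (day clamped to the month length)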


def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    roundto = lambda x, n: ((x + n / 2) // n) * n
    timestamp = calendar.timegm(dt.timetuple())
    return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str
1856 """Represents a time interval between two dates"""
1858 def __init__(self
, start
=None, end
=None):
1859 """start and end must be strings in the format accepted by date"""
1860 if start
is not None:
1861 self
.start
= date_from_str(start
, strict
=True)
1863 self
.start
= datetime
.datetime
.min.date()
1865 self
.end
= date_from_str(end
, strict
=True)
1867 self
.end
= datetime
.datetime
.max.date()
1868 if self
.start
> self
.end
:
1869 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
1873 """Returns a range that only contains the given day"""
1874 return cls(day
, day
)
1876 def __contains__(self
, date
):
1877 """Check if the date is in the range"""
1878 if not isinstance(date
, datetime
.date
):
1879 date
= date_from_str(date
)
1880 return self
.start
<= date
<= self
.end
1883 return f
'{self.start.isoformat()} - {self.end.isoformat()}'
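
# Illustrative usage (a sketch, not from the original module):
#   '20200115' in DateRange('20200101', '20200201')  # -> True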


def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res


def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name == 'nt':
        return version_tuple(platform.win32_ver()[1])
    else:
        return ()


def write_string(s, out=None, encoding=None):
    assert isinstance(s, str)
    out = out or sys.stderr

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    enc, buffer = None, out
    if 'b' in getattr(out, 'mode', ''):
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        buffer = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    buffer.write(s.encode(enc, 'ignore') if enc else s)
    out.flush()


def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)


class LockingUnsupportedError(OSError):
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)


# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
class locked_file:
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)


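# Illustrative locked_file usage (editor's example; 'progress.txt' is a
# hypothetical file name):
#   with locked_file('progress.txt', 'w', block=True) as f:
#       f.write('state')  # written while holding an exclusive lock

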
def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'


def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)


def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data


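# Illustrative smuggle/unsmuggle round trip (editor's example, not from the
# original source):
#   >>> url = smuggle_url('http://example.com/v', {'a': 1})
#   >>> unsmuggle_url(url)
#   ('http://example.com/v', {'a': 1})

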
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)


def format_bytes(bytes):
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'


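# Illustrative usage (editor's example, not from the original source):
#   >>> format_decimal_suffix(1234)
#   '1k'
#   >>> format_bytes(1536)
#   '1.50KiB'

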
def lookup_unit_table(unit_table, s):
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)


def parse_filesize(s):
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        # (table abridged in this excerpt; the short forms such as 'B',
        # 'KB'/'KiB', 'MB'/'MiB', ... are omitted here)
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)


def parse_count(s):
    if s is None:
        return None

    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        # (table abridged in this excerpt; maps count suffixes such as
        # 'k'/'K', 'm'/'M' and 'b'/'B' to their multipliers)
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    ret = lookup_unit_table(_UNIT_TABLE, s)
    if ret is not None:
        return ret

    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    if mobj:
        return str_to_int(mobj.group(1))


def parse_resolution(s, *, lenient=False):
    if s is None:
        return {}

    if lenient:
        mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
    else:
        mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if mobj:
        return {
            'width': int(mobj.group('w')),
            'height': int(mobj.group('h')),
        }

    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        return {'height': int(mobj.group(1)) * 540}

    return {}


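# Illustrative usage (editor's example, not from the original source):
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('720p'), parse_resolution('4k')
#   ({'height': 720}, {'height': 2160})

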
def parse_bitrate(s):
    if not isinstance(s, compat_str):
        return
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    if mobj:
        return int(mobj.group(1))


def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        return month_names.index(name) + 1
    except ValueError:
        return None


def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviation """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None


def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)


def setproctitle(title):
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode()
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this


def remove_start(s, start):
    return s[len(start):] if s is not None and s.startswith(start) else s


def remove_end(s, end):
    return s[:-len(end)] if s is not None and s.endswith(end) else s


def remove_quotes(s):
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s


def get_domain(url):
    domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    return domain.group('domain') if domain else None


def url_basename(url):
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').split('/')[-1]


def base_url(url):
    return re.match(r'https?://[^?#&]+/', url).group()


def urljoin(base, path):
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, compat_str) or not path:
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, compat_str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)


class HEADRequest(compat_urllib_request.Request):
    def get_method(self):
        return 'HEAD'


class PUTRequest(compat_urllib_request.Request):
    def get_method(self):
        return 'PUT'


def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default


def str_or_none(v, default=None):
    return default if v is None else compat_str(v)


def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    elif isinstance(int_str, compat_str):
        int_str = re.sub(r'[,\.\+]', '', int_str)
        return int_or_none(int_str)


def float_or_none(v, scale=1, invscale=1, default=None):
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default


def bool_or_none(v, default=None):
    return v if isinstance(v, bool) else default


def strip_or_none(v, default=None):
    return v.strip() if isinstance(v, compat_str) else default


def url_or_none(url):
    if not url or not isinstance(url, compat_str):
        return None
    url = url.strip()
    return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None


def request_to_url(req):
    if isinstance(req, compat_urllib_request.Request):
        return req.get_full_url()
    else:
        return req


def strftime_or_none(timestamp, date_format, default=None):
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default


def parse_duration(s):
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    if ms:
        ms = ms.replace(':', '.')
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))


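# Illustrative usage (editor's example, not from the original source):
#   >>> parse_duration('01:30')
#   90.0
#   >>> parse_duration('1h 30m')
#   5400.0

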
def prepend_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)

    return (
        f'{name}.{ext}{real_ext}'
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else f'{filename}.{ext}')


def replace_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return '{}.{}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)


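# Illustrative usage (editor's example, not from the original source):
#   >>> prepend_extension('video.mp4', 'temp')
#   'video.temp.mp4'
#   >>> replace_extension('video.mp4', 'mkv')
#   'video.mkv'

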
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
    except OSError:
        return False
    return exe


def _get_exe_version_output(exe, args, *, to_screen=None):
    if to_screen:
        to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        out, _ = Popen(
            [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return out


def detect_exe_version(output, version_re=None, unrecognized='present'):
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized


def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    out = _get_exe_version_output(exe, args)
    return detect_exe_version(out, version_re, unrecognized) if out else False


class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        return None if x is None else -(x + 1)

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())


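# Illustrative LazyList usage (editor's example, not from the original source):
#   >>> lst = LazyList(itertools.count())  # infinite iterable
#   >>> lst[10]                            # consumes only the first 11 items
#   10

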
class PagedList:

    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        page_results = self._cache.get(pagenum)
        if page_results is None:
            page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]


class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break


class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break
            yield from page_results


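# Illustrative PagedList usage (editor's example; the page function below is
# hypothetical):
#   >>> pages = OnDemandPagedList(lambda n: list(range(n * 3, n * 3 + 3)), 3)
#   >>> pages.getslice(2, 5)
#   [2, 3, 4]

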
def uppercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)


def lowercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)


def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")


def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()


def parse_qs(url):
    return compat_parse_qs(compat_urllib_parse_urlparse(url).query)


def read_batch_urls(batch_fd):
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', url, 1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]


def urlencode_postdata(*args, **kargs):
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')


def update_url_query(url, query):
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))


def update_Request(req, url=None, data=None, headers={}, query={}):
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req


def _multipart_encode_impl(data, boundary):
    content_type = 'multipart/form-data; boundary=%s' % boundary

    out = b''
    for k, v in data.items():
        out += b'--' + boundary.encode('ascii') + b'\r\n'
        if isinstance(k, compat_str):
            k = k.encode()
        if isinstance(v, compat_str):
            v = v.encode()
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary.encode('ascii') in content:
            raise ValueError('Boundary overlaps with data')
        out += content

    out += b'--' + boundary.encode('ascii') + b'--\r\n'

    return out, content_type


def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

        try:
            out, content_type = _multipart_encode_impl(data, boundary)
            break
        except ValueError:
            if has_specified_boundary:
                raise
            boundary = None

    return out, content_type


def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    for val in map(d.get, variadic(key_or_keys)):
        if val is not None and (val or not skip_false_values):
            return val
    return default


def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    for f in funcs:
        try:
            val = f(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
            pass
        else:
            if expected_type is None or isinstance(val, expected_type):
                return val


def try_get(src, getter, expected_type=None):
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)


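# Illustrative try_get usage (editor's example, not from the original source):
#   >>> try_get({'a': {'b': 42}}, lambda x: x['a']['b'], int)
#   42
#   >>> try_get({}, lambda x: x['a']['b'], int) is None
#   True

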
def filter_dict(dct, cndn=lambda _, v: v is not None):
    return {k: v for k, v in dct.items() if cndn(k, v)}


def merge_dicts(*dicts):
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if (v is not None and k not in merged
                    or isinstance(v, str) and merged[k] == ''):
                merged[k] = v
    return merged


def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)


TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}


def parse_age_limit(s):
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    elif not isinstance(s, str):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if m:
        return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
    return None


def strip_jsonp(code):
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)


def js_to_json(code, vars={}):
    # vars is a dict of var, val pairs to substitute
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ''

        if v[0] in ("'", '"'):
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    return '"%d":' % i if v.endswith(':') else '%d' % i

            if v in vars:
                return vars[v]

        return '"%s"' % v

    code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)


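# Illustrative js_to_json usage (editor's example, not from the original source):
#   >>> js_to_json("{abc: true, 'def': 0x10}")
#   '{"abc": true, "def": 16}'

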
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q


POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist')


DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}


# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'


def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s


def version_tuple(v):
    return tuple(int(e) for e in re.split(r'[-.]', v))


def is_outdated_version(version, limit, assume_new=True):
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new


def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    from .update import is_non_updateable

    return not is_non_updateable()


def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(compat_shlex_quote(a) for a in args)


def error_to_compat_str(err):
    return str(err)


def error_to_str(err):
    return f'{type(err).__name__}: {err}'


def mimetype2ext(mt):
    if mt is None:
        return None

    mt, _, params = mt.partition(';')
    mt = mt.strip()

    FULL_MAP = {
        # (map abridged in this excerpt)
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wave': 'wav',
    }

    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    SUBTYPE_MAP = {
        # (map abridged in this excerpt)
        'smptett+xml': 'tt',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'filmstrip+json': 'fs',
    }

    _, _, subtype = mt.rpartition('/')
    ext = SUBTYPE_MAP.get(subtype.lower())
    if ext is not None:
        return ext

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    _, _, suffix = subtype.partition('+')
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    return subtype.replace('+', '.')


def ext2mimetype(ext_or_url):
    if not ext_or_url:
        return None
    if '.' not in ext_or_url:
        ext_or_url = f'file.{ext_or_url}'
    return mimetypes.guess_type(ext_or_url)[0]


def parse_codecs(codecs_str):
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        parts = full_codec.split('.')
        codec = parts[0].replace('0', '')
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                     'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if not vcodec:
                vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
                if codec in ('dvh1', 'dvhe'):
                    hdr = 'DV'
                elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
                    hdr = 'HDR10'
                elif full_codec.replace('0', '').startswith('vp9.2'):
                    hdr = 'HDR10'
        elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        elif codec in ('stpp', 'wvtt',):
            if not scodec:
                scodec = full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}


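# Illustrative parse_codecs usage (editor's example, not from the original source):
#   >>> parse_codecs('avc1.64001f, mp4a.40.2')
#   {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}

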
def urlhandle_detect_ext(url_handle):
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))


def encode_data_uri(data, mime_type):
    return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))


def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit


def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]

    encoding = 'utf-8'
    for bom, enc in BOMS:
        while first_bytes.startswith(bom):
            encoding, first_bytes = enc, first_bytes[len(bom):]

    return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))


def determine_protocol(info_dict):
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme


def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned """
    def width(string):
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret


def _match_one(filter_part, dct, incomplete):
    # TODO: Generalize code with YoutubeDL._build_format_filter
    STRING_OPERATORS = {
        '*=': operator.contains,
        '^=': lambda attr, value: attr.startswith(value),
        '$=': lambda attr, value: attr.endswith(value),
        '~=': lambda attr, value: re.search(value, attr),
    }
    COMPARISON_OPERATORS = {
        **STRING_OPERATORS,
        '<=': operator.le,  # "<=" must be defined above "<"
        '<': operator.lt,
        '>=': operator.ge,
        '>': operator.gt,
        '=': operator.eq,
    }

    if isinstance(incomplete, bool):
        is_incomplete = lambda _: incomplete
    else:
        is_incomplete = lambda k: k in incomplete

    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
            (?P<intval>\d+)|
            (?P<strval>.+?)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        unnegated_op = COMPARISON_OPERATORS[m['op']]
        if m['negation']:
            op = lambda attr, value: not unnegated_op(attr, value)
        else:
            op = unnegated_op
        comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
        if m['quote']:
            comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
        actual_value = dct.get(m['key'])
        numeric_comparison = None
        if isinstance(actual_value, (int, float)):
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/ytdl-org/youtube-dl/issues/11082)
            try:
                numeric_comparison = int(comparison_value)
            except ValueError:
                numeric_comparison = parse_filesize(comparison_value)
                if numeric_comparison is None:
                    numeric_comparison = parse_filesize(f'{comparison_value}B')
                if numeric_comparison is None:
                    numeric_comparison = parse_duration(comparison_value)
        if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
            raise ValueError('Operator %s only supports string values!' % m['op'])
        if actual_value is None:
            return is_incomplete(m['key']) or m['none_inclusive']
        return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if is_incomplete(m.group('key')) and actual_value is None:
            return True
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)


def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    return all(
        _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
        for filter_part in re.split(r'(?<!\\)&', filter_str))


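# Illustrative match_str usage (editor's example, not from the original source):
#   >>> match_str('duration > 60 & title *= test', {'duration': 90, 'title': 'a test'})
#   True

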
def match_filter_func(filters):
    if not filters:
        return None
    filters = set(variadic(filters))

    interactive = '-' in filters
    if interactive:
        filters.remove('-')

    def _match_func(info_dict, incomplete=False):
        if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
            return NO_DEFAULT if interactive and not incomplete else None
        else:
            video_title = info_dict.get('title') or info_dict.get('id') or 'video'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func


def parse_dfxp_time_expr(time_expr):
    if not time_expr:
        return

    mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))


def srt_subtitles_timecode(seconds):
    return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)


def ass_subtitles_timecode(seconds):
    time = timetuple_from_msec(seconds * 1000)
    return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)


def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration',
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    class TTMLPElementParser:
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)


def cli_option(params, command_option, param, separator=None):
    param = params.get(param)
    return ([] if param is None
            else [command_option, str(param)] if separator is None
            else [f'{command_option}{separator}{param}'])


def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    param = params.get(param)
    assert param in (True, False, None)
    return cli_option({True: true_value, False: false_value}, command_option, param, separator)


def cli_valueless_option(params, command_option, param, expected_value=True):
    return [command_option] if params.get(param) == expected_value else []


def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        else:
            argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        arg_list = list(filter(
            lambda x: x is not None,
            [argdict.get(key.lower()) for key in variadic(key_list)]))
        if arg_list:
            return [arg for args in arg_list for arg in args]
    return default


def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    main_key, exe = main_key.lower(), exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    keys = [f'{root_key}{k}' for k in (keys or [''])]
    if root_key in keys:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    else:
        use_compat = False
    return cli_configuration_args(argdict, keys, default, use_compat)


class ISO639Utils:
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        # (table abridged in this excerpt)
        'iw': 'heb',  # Replaced by he in 1989 revision
        'in': 'ind',  # Replaced by id in 1989 revision
        'ji': 'yid',  # Replaced by yi in 1989 revision
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name


class ISO3166Utils:
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        # (table abridged in this excerpt)
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())


class GeoUtils:
    # Major IPv4 address blocks per country
    _country_ip_map = {
        # (table abridged in this excerpt)
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'GA': '41.158.0.0/15',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        addr, preflen = block.split('/')
        addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return compat_str(socket.inet_ntoa(
            compat_struct_pack('!L', random.randint(addr_min, addr_max))))


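# Illustrative GeoUtils usage (editor's example; the result is random within
# the country's block, e.g. some address inside 145.96.0.0/11 for 'NL'):
#   >>> GeoUtils.random_ipv4('NL')
#   '145.101.23.45'

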
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers wrap the socket with SOCKS themselves
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)


# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
# released into Public Domain
# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387

def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # after much testing, this algorithm was deemed to be the fastest
    s = b''
    n = int(n)
    while n > 0:
        s = compat_struct_pack('>I', n & 0xffffffff) + s
        n = n >> 32
    # strip off leading zeros
    for i in range(len(s)):
        if s[i] != b'\000'[0]:
            break
    else:
        # only happens when n == 0
        s = b'\000'
        i = 0
    s = s[i:]
    # add back some pad bytes. this could be done more efficiently w.r.t. the
    # de-padding being done above, but sigh...
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s


def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    acc = 0
    length = len(s)
    if length % 4:
        extra = (4 - length % 4)
        s = b'\000' * extra + s
        length = length + extra
    for i in range(0, length, 4):
        acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
    return acc


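# Illustrative round trip (editor's example, not from the original source):
#   >>> long_to_bytes(256)
#   b'\x01\x00'
#   >>> bytes_to_long(b'\x01\x00')
#   256

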
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted


def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data


def encode_base_n(num, n, table=None):
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret


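# Illustrative usage (editor's example, not from the original source):
#   >>> encode_base_n(52, 16)  # hexadecimal
#   '34'
#   >>> encode_base_n(255, 62)
#   '47'

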
def decode_packed_codes(code):
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)


def caesar(s, alphabet, shift):
    if shift == 0:
        return s
    l = len(alphabet)
    return ''.join(
        alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
        for c in s)


def rot47(s):
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)


def parse_m3u8_attributes(attrib):
    info = {}
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]
        info[key] = val
    return info

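# Illustrative example with a typical (made-up) EXT-X-STREAM-INF attribute list; quoted
# values keep their embedded commas, unquoted values end at the next comma:
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2",RESOLUTION=640x360')
#   {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2', 'RESOLUTION': '640x360'}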
def urshift(val, n):
    return val >> n if val >= 0 else (val + 0x100000000) >> n

# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise OSError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data,
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise OSError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c
                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels

def write_xattr(path, key, value):
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules
    from .dependencies import xattr

    if xattr:
        if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
            # Unicode arguments are not supported in pyxattr until version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            if version_tuple(xattr.__version__) >= (0, 5, 0):
                setxattr = xattr.set
            else:  # older pyxattr
                setxattr = xattr.setxattr
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    value = value.decode()
    try:
        p = Popen(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    stderr = p.communicate_or_kill()[1].decode('utf-8', 'replace')
    if p.returncode:
        raise XAttrMetadataError(p.returncode, stderr)

def random_birthday(year_field, month_field, day_field):
    start_date = datetime.date(1950, 1, 1)
    end_date = datetime.date(1995, 12, 31)
    offset = random.randint(0, (end_date - start_date).days)
    random_date = start_date + datetime.timedelta(offset)
    return {
        year_field: str(random_date.year),
        month_field: str(random_date.month),
        day_field: str(random_date.day),
    }

# Templates for internet shortcut files, which are plain text files.
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}

def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not yet supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

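# Illustrative example with a made-up host and path: the hostname goes through IDNA
# (Punycode) while path and query characters get UTF-8 percent-encoding.
#   >>> iri_to_uri('http://exämple.com/päth?q=ü')
#   'http://xn--exmple-cua.com/p%C3%A4th?q=%C3%BC'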
def to_high_limit_path(path):
    if sys.platform in ['win32', 'cygwin']:
        # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
        return '\\\\?\\' + os.path.abspath(path)

    return path

def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    val = traverse_obj(obj, *variadic(field))
    if val in ignore:
        return default
    return template % (func(val) if func else val)

def clean_podcast_url(url):
    return re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/''', '', url)

_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')

def make_dir(path, to_screen=None):
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except OSError as err:
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False

def get_executable_path():
    from .update import _get_variant_and_executable_path

    return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))

def load_plugins(name, suffix, namespace):
    classes = {}
    with contextlib.suppress(FileNotFoundError):
        plugins_spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        plugins = importlib.util.module_from_spec(plugins_spec)
        sys.modules[plugins_spec.name] = plugins
        plugins_spec.loader.exec_module(plugins)
        for name in dir(plugins):
            if name in namespace:
                continue
            if not name.endswith(suffix):
                continue
            klass = getattr(plugins, name)
            classes[name] = namespace[name] = klass
    return classes

def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a:
                              - None:     Do nothing
                              - string:   A dictionary key
                              - int:      An index into a list
                              - tuple:    A list of keys all of which will be traversed
                              - Ellipsis: Fetch all values in the object
                              - Function: Takes the key and value as arguments
                                          and returns whether the key matches or not
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    '''
    if not casesense:
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    obj = str(obj)
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default

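# Illustrative sketch with made-up sample data showing the main path elements
# (plain keys/indices, Ellipsis to fan out, and case-insensitive lookup):
#   >>> data = {'items': [{'id': 'a', 'meta': {'Title': 'One'}}, {'id': 'b'}]}
#   >>> traverse_obj(data, ('items', 0, 'id'))
#   'a'
#   >>> traverse_obj(data, ('items', ..., 'id'))
#   ['a', 'b']
#   >>> traverse_obj(data, ('items', 0, 'meta', 'title'), casesense=False)
#   'One'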
def traverse_dict(dictn, keys, casesense=True):
    write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)

def get_first(obj, keys, **kwargs):
    return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)

def variadic(x, allowed_types=(str, bytes, dict)):
    return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)

def decode_base(value, digits):
    # This will convert given base-x string to scalar (long or int)
    table = {char: index for index, char in enumerate(digits)}
    result = 0
    base = len(digits)
    for chr in value:
        result *= base
        result += table[chr]
    return result

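# Illustrative examples (made-up inputs): the digit table defines both the base and
# the symbol-to-value mapping.
#   >>> decode_base('1101', '01')
#   13
#   >>> decode_base('ff', '0123456789abcdef')
#   255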
def time_seconds(**kwargs):
    t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
    return t.timestamp()

# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
    return token

# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    header_b64, payload_b64, signature_b64 = jwt.split('.')
    payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
    return payload_data

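# Illustrative round trip with a made-up key and claim; the encoder returns bytes, so
# decode to str before splitting on '.'. Note the decoder does not verify the signature.
#   >>> token = jwt_encode_hs256({'uid': 42}, 'secret-key')
#   >>> jwt_decode_hs256(token.decode())
#   {'uid': 42}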
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


@functools.cache
def supports_terminal_sequences(stream):
    if compat_os_name == 'nt':
        if not WINDOWS_VT_MODE:
            return False
    elif not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False

def windows_enable_vt_mode():  # TODO: Do this the proper way https://bugs.python.org/issue30075
    if get_windows_version() < (10, 0, 10586):
        return
    global WINDOWS_VT_MODE
    startupinfo = subprocess.STARTUPINFO()
    startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    try:
        subprocess.Popen('', shell=True, startupinfo=startupinfo).wait()
    except Exception:
        return

    WINDOWS_VT_MODE = True
    supports_terminal_sequences.cache_clear()

_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    return _terminal_sequences_re.sub('', string)

def number_of_digits(number):
    return len('%d' % number)

def join_nonempty(*values, delim='-', from_dict=None):
    if from_dict is not None:
        values = map(from_dict.get, values)
    return delim.join(map(str, filter(None, values)))

def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    max_dimensions = max(
        (tuple(format.get(k) or 0 for k in _keys) for format in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        return thumbnails
    return [
        merge_dicts(
            {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
            dict(zip(_keys, max_dimensions)), thumbnail)
        for thumbnail in thumbnails
    ]

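# Illustrative sketch with made-up formats and thumbnail URLs (the width is assumed to
# be embedded in the URL and matchable by url_width_re):
#   formats = [{'width': 640, 'height': 360}, {'width': 1280, 'height': 720}]
#   thumbnails = [{'url': 'https://example.com/thumb/320.jpg', 'id': '0'}]
#   scale_thumbnails_to_max_format_width(formats, thumbnails, r'\d+')
#   # -> each thumbnail URL is rewritten with 1280 and gets width=1280, height=720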
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    if not range:
        return None, None, None
    crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if not crg:
        return None, None, None
    return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))

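# Illustrative examples with made-up header values:
#   >>> parse_http_range('bytes=500-999')
#   (500, 999, None)
#   >>> parse_http_range('bytes 0-499/1234')
#   (0, 499, 1234)
#   >>> parse_http_range(None)
#   (None, None, None)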
def read_stdin(what):
    eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin.read()

class Config:
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []
    def init(self, args=None, filename=None):
        assert not self.__initialized
        directory = ''
        if filename:
            location = os.path.realpath(filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.own_args, self.__initialized = args, True
        opts, _ = self.parser.parse_known_args(args)
        self.parsed_args, self.filename = args, filename

        for location in opts.config_locations or []:
            if location == '-':
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True
    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')
    @staticmethod
    def read_file(filename, default=[]):
        try:
            optionf = open(filename)
        except OSError:
            return default  # silently skip if file is not present
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read()
            res = shlex.split(contents, comments=True)
        finally:
            optionf.close()
        return res
    @staticmethod
    def hide_login_info(opts):
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts
    def append_config(self, *args, label=None):
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)
    @property
    def all_args(self):
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []
    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)

class WebSocketsWrapper():
    """Wraps websockets module to use in non-async scopes"""
    pool = None

    def __init__(self, url, headers=None, connect=True):
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            self.loop.close()
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: if any new library that uses asyncio needs to be run in non-async code, move these functions out of this class
    @staticmethod
    def run_with_loop(main, loop):
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })

def merge_headers(*dicts):
    """Merge dicts of HTTP headers case-insensitively, prioritizing the latter ones"""
    return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}

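# Illustrative example with made-up headers: keys are normalised with str.title(), so
# differently-cased duplicates collapse and later dicts win.
#   >>> merge_headers({'user-agent': 'A', 'Accept': '*/*'}, {'User-Agent': 'B'})
#   {'User-Agent': 'B', 'Accept': '*/*'}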
class classproperty:
    """classmethod(property(func)) that works in py < 3.9"""

    def __init__(self, func):
        functools.update_wrapper(self, func)
        self.func = func

    def __get__(self, _, cls):
        return self.func(cls)

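# Illustrative usage on a hypothetical class: the property is computed from the class
# itself, without instantiating it.
#   class Example:
#       _name = 'example'
#
#       @classproperty
#       def name(cls):
#           return cls._name
#
#   Example.name  # -> 'example'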
5369 """Immutable namespace"""
5371 def __init__(self
, **kwargs
):
5374 def __getattr__(self
, attr
):
5375 return self
._dict
[attr
]
5377 def __contains__(self
, item
):
5378 return item
in self
._dict
.values()
5381 return iter(self
._dict
.items())
5384 return f
'{type(self).__name__}({", ".join(f"{k}={v}" for k, v in self)})'
has_certifi = bool(certifi)
has_websockets = bool(websockets)