from __future__ import unicode_literals

import xml.etree.ElementTree

from .compat import (
    compat_HTMLParseError,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_html_entities_html5,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_urlunparse,
    compat_urllib_parse_quote,
    compat_urllib_parse_quote_plus,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
)
def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}

USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
DATE_FORMATS = (
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M:%S',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    pref = locale.getpreferredencoding()
    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    json.dump(obj, tf, ensure_ascii=False)
    if sys.platform == 'win32':
        # Need to remove existing file on Windows, else os.rename raises
        # WindowsError or FileExistsError.
        os.unlink(fn)
    mask = os.umask(0)
    os.umask(mask)
    os.chmod(tf.name, 0o666 & ~mask)
    os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
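# Illustrative example (not part of the original source): xpath_with_ns expands
# namespace prefixes into ElementTree's '{uri}tag' notation, e.g.
#   xpath_with_ns('ns:media/ns:title', {'ns': 'http://example.com/ns'})
#   -> '{http://example.com/ns}media/{http://example.com/ns}title'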
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
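# Illustrative example (not part of the original source), assuming a node parsed
# from '<root><a foo="bar">baz</a></root>' with compat_etree_fromstring:
#   xpath_text(node, 'a')                -> 'baz'
#   xpath_attr(node, 'a', 'foo')         -> 'bar'
#   xpath_text(node, 'b', default=None)  -> None (element is missing)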
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
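# Illustrative example (not part of the original source):
#   get_element_by_class('foo', '<div class="foo bar">hello</div>')           -> 'hello'
#   get_elements_by_class('foo', '<p class="foo">a</p><p class="foo">b</p>')  -> ['a', 'b']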
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = r'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
        ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole,
        )


class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
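# Illustrative example (not part of the original source):
#   get_element_text_and_html_by_tag('span', '<div><span class="a">text</span></div>')
#   -> ('text', '<span class="a">text</span>')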
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs
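# Illustrative example (not part of the original source):
#   extract_attributes('<a href="page.html" class=link data-id=42>')
#   -> {'href': 'page.html', 'class': 'link', 'data-id': '42'}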
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a dictionary of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
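# Illustrative example (not part of the original source):
#   clean_html('<p>foo<br/>bar</p>')  -> 'foo\nbar'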
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = locked_file(filename, open_mode, block=False).open()
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = locked_file(filename, open_mode, block=False).open()
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
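# Illustrative example (not part of the original source):
#   timeconvert('Thu, 01 Jan 1970 00:00:00 +0000')  -> 0
#   timeconvert('not a date')                       -> None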
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return ' '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
    return result
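# Illustrative examples (not part of the original source; these mirror cases
# from the project's test suite):
#   sanitize_filename('New World record at 0:12:34')  -> 'New World record at 0_12_34'
#   sanitize_filename('aäb中国的c', restricted=True)    -> 'aab_c'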
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        drive_or_unc, _ = os.path.splitdrive(s)
        if sys.version_info < (2, 7) and not drive_or_unc:
            drive_or_unc, _ = os.path.splitunc(s)

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
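# Illustrative example (not part of the original source):
#   sanitize_url('//cdn.example.com/v.mp4')  -> 'http://cdn.example.com/v.mp4'
#   sanitize_url('httpss://example.com/')    -> 'https://example.com/'
#   sanitize_url('rmtp://example.com/live')  -> 'rtmp://example.com/live'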
def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
    return url, 'Basic ' + auth_payload.decode('utf-8')
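# Illustrative example (not part of the original source):
#   extract_basic_auth('http://user:pass@example.com/feed')
#   -> ('http://example.com/feed', 'Basic dXNlcjpwYXNz')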
def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
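# Illustrative example (not part of the original source):
#   unescapeHTML('&amp; &eacute; &#38; &#x26;')  -> '& é & &'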
def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        return process_communicate_or_kill(self, *args, **kwargs)


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):
    if sys.version_info >= (3, 0):
        return b
    if not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
, storename
):
984 # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
986 certs
= [cert
for cert
, encoding
, trust
in ssl
.enum_certificates(storename
)
987 if encoding
== 'x509_asn' and (
988 trust
is True or ssl
.Purpose
.SERVER_AUTH
.oid
in trust
)]
989 except PermissionError
:
993 ssl_context
.load_verify_locations(cadata
=cert
)
998 def make_HTTPS_handler(params
, **kwargs
):
999 opts_check_certificate
= not params
.get('nocheckcertificate')
1000 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLS_CLIENT
)
1001 context
.check_hostname
= opts_check_certificate
1002 if params
.get('legacyserverconnect'):
1003 context
.options |
= 4 # SSL_OP_LEGACY_SERVER_CONNECT
1004 context
.verify_mode
= ssl
.CERT_REQUIRED
if opts_check_certificate
else ssl
.CERT_NONE
1005 if opts_check_certificate
:
1007 context
.load_default_certs()
1008 # Work around the issue in load_default_certs when there are bad certificates. See:
1009 # https://github.com/yt-dlp/yt-dlp/issues/1060,
1010 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
1011 except ssl
.SSLError
:
1012 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
1013 if sys
.platform
== 'win32' and hasattr(ssl
, 'enum_certificates'):
1014 # Create a new context to discard any certificates that were already loaded
1015 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLS_CLIENT
)
1016 context
.check_hostname
, context
.verify_mode
= True, ssl
.CERT_REQUIRED
1017 for storename
in ('CA', 'ROOT'):
1018 _ssl_load_windows_store_certs(context
, storename
)
1019 context
.set_default_verify_paths()
1020 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
def bug_reports_message(before=';'):
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp , '
           'filling out the "Broken site" issue template properly. '
           'Confirm you are on the latest version using -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super(ExtractorError, self).__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(self.cause)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super(GeoRestrictedError, self).__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
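# Illustrative example (not part of the original source):
#   handle_youtubedl_headers({'User-Agent': 'UA', 'Accept-Encoding': 'gzip',
#                             'Youtubedl-no-compression': 'True'})
#   -> {'User-Agent': 'UA'}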
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req
    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #         if set_cookie != set_cookie_escaped:
        #             del response.headers[set_cookie_header]
        #             response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # On python 2 urlh.geturl() may sometimes return redirect URL
        # as byte string instead of unicode. This workaround allows
        # to force it always return unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)
def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass
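# Illustrative example (not part of the original source):
#   parse_iso8601('2014-01-01T12:00:00Z')       -> 1388577600
#   parse_iso8601('2014-01-01T05:00:00-07:00')  -> 1388577600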
def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
, day_first
=True):
1787 if date_str
is None:
1790 date_str
= re
.sub(r
'[,|]', '', date_str
)
1792 pm_delta
= 12 if re
.search(r
'(?i)PM', date_str
) else 0
1793 timezone
, date_str
= extract_timezone(date_str
)
1795 # Remove AM/PM + timezone
1796 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
1798 # Remove unrecognized timezones from ISO 8601 alike timestamps
1799 m
= re
.search(r
'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str
)
1801 date_str
= date_str
[:-len(m
.group('tz'))]
1803 # Python only supports microseconds, so remove nanoseconds
1804 m
= re
.search(r
'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str
)
1806 date_str
= m
.group(1)
1808 for expression
in date_formats(day_first
):
1810 dt
= datetime
.datetime
.strptime(date_str
, expression
) - timezone
+ datetime
.timedelta(hours
=pm_delta
)
1811 return calendar
.timegm(dt
.timetuple())
1814 timetuple
= email
.utils
.parsedate_tz(date_str
)
1816 return calendar
.timegm(timetuple
) + pm_delta
* 3600
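# Illustrative example (not part of the original source):
#   unified_timestamp('2014-01-01T12:00:00.000Z')  -> 1388577600
#   unified_timestamp('December 21, 2010')         -> 1292889600 (relies on entries of
#   the full DATE_FORMATS table, parts of which are not shown above)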
def determine_ext(url, default_ext='unknown_video'):
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
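# Illustrative example (not part of the original source):
#   determine_ext('http://example.com/video.mp4?dl=1')          -> 'mp4'
#   determine_ext('http://example.com/foo/bar.webm/?download')  -> 'webm'
#   determine_ext('http://example.com/page')                    -> 'unknown_video'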
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)


def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: string date format used to return datetime object from
    precision: round the time portion of a datetime object.
                auto|microsecond|second|minute|hour|day.
                auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
        date_str)
    if match is not None:
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
        else:
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)


def date_from_str(date_str, format='%Y%m%d', strict=False):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    If "strict", only (now|today)[+-][0-9](day|week|month|year)(s)? is allowed

    format: string date format used to return datetime object from
    """
    if strict and not re.fullmatch(r'\d{8}|(now|today)[+-]\d+(day|week|month|year)(s)?', date_str):
        raise ValueError(f'Invalid date format {date_str}')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
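# Illustrative example (not part of the original source):
#   date_from_str('20200315')     -> datetime.date(2020, 3, 15)
#   date_from_str('today-1week')  -> the date seven days before today (UTC)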
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months."""
    month = dt.month + months - 1
    year = dt.year + month // 12
    month = month % 12 + 1
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)
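# Illustrative example (not part of the original source):
#   datetime_add_months(datetime.date(2020, 1, 31), 1)   -> datetime.date(2020, 2, 29)
#   datetime_add_months(datetime.date(2020, 3, 15), -3)  -> datetime.date(2019, 12, 15)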
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    roundto = lambda x, n: ((x + n / 2) // n) * n
    timestamp = calendar.timegm(dt.timetuple())
    return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str
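# Illustrative example (not part of the original source):
#   hyphenate_date('20230102')  -> '2023-01-02'
#   hyphenate_date('invalid')   -> 'invalid' (returned unchanged)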
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start, strict=True)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end, strict=True)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
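# Illustrative example (not part of the original source):
#   rng = DateRange('20200101', '20200630')
#   '20200315' in rng               -> True
#   '20200801' in rng               -> False
#   str(DateRange.day('20200101'))  -> '2020-01-01 - 2020-01-01'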
1960 def platform_name():
1961 """ Returns the platform name as a compat_str """
1962 res
= platform
.platform()
1963 if isinstance(res
, bytes):
1964 res
= res
.decode(preferredencoding())
1966 assert isinstance(res
, compat_str
)
1970 def get_windows_version():
1971 ''' Get Windows version. None if it's not running on Windows '''
1972 if compat_os_name
== 'nt':
1973 return version_tuple(platform
.win32_ver()[1])


def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True


def write_string(s, out=None, encoding=None):
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '')
            or sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()


def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)


# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            try:
                fcntl.flock(f,
                            fcntl.LOCK_SH if not exclusive
                            else fcntl.LOCK_EX if block
                            else fcntl.LOCK_EX | fcntl.LOCK_NB)
            except BlockingIOError:
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f,
                            fcntl.LOCK_SH if not exclusive
                            else fcntl.LOCK_EX if block
                            else fcntl.LOCK_EX | fcntl.LOCK_NB)

        def _unlock_file(f):
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive, block):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)


class locked_file(object):
    _closed = False

    def __init__(self, filename, mode, block=True, encoding=None):
        assert mode in ['r', 'rb', 'a', 'ab', 'w', 'wb']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode
        self.block = block

    def __enter__(self):
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            if not self._closed:
                _unlock_file(self.f)
        finally:
            self.f.close()
            self._closed = True

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)

    def open(self):
        return self.__enter__()

    def close(self, *args):
        self.__exit__(self, *args, value=False, traceback=False)


def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'
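

# Usage sketch for locked_file above ('archive.txt' is only an example path):
#   with locked_file('archive.txt', 'a', block=True, encoding='utf-8') as f:
#       f.write('some-video-id\n')
# The lock is taken in __enter__ and released (and the file closed) in __exit__.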


def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)


def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
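

# Usage sketch (illustrative values): smuggle_url/unsmuggle_url round-trip extra
# data through the URL fragment.
#   smuggled = smuggle_url('https://example.com/video', {'referer': 'https://example.com/'})
#   # -> 'https://example.com/video#__youtubedl_smuggle=%7B%22referer%22...'
#   unsmuggle_url(smuggled)
#   # -> ('https://example.com/video', {'referer': 'https://example.com/'})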


def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    exponent = 0 if num == 0 else int(math.log(num, factor))
    suffix = ['', *'kMGTPEZY'][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)


def format_bytes(bytes):
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
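

# Usage sketch (illustrative values):
#   format_bytes(1536)                       -> '1.50KiB'
#   format_decimal_suffix(123456, '%.1f%s')  -> '123.5k'
#   format_bytes(None)                       -> 'N/A'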


def lookup_unit_table(unit_table, s):
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)


def parse_filesize(s):
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)


def parse_count(s):
    if s is None:
        return None

    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    ret = lookup_unit_table(_UNIT_TABLE, s)
    if ret is not None:
        return ret

    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    if mobj:
        return str_to_int(mobj.group(1))


def parse_resolution(s):
    if s is None:
        return {}

    mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if mobj:
        return {
            'width': int(mobj.group('w')),
            'height': int(mobj.group('h')),
        }

    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        return {'height': int(mobj.group(1)) * 540}

    return {}


def parse_bitrate(s):
    if not isinstance(s, compat_str):
        return
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    if mobj:
        return int(mobj.group(1))


def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        return month_names.index(name) + 1
    except ValueError:
        return None


def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None


def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
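

# Usage sketch (illustrative value): bare ampersands are escaped, existing entities kept.
#   fix_xml_ampersands('<a href="?x=1&y=2&amp;z=3"/>') -> '<a href="?x=1&amp;y=2&amp;z=3"/>'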


def setproctitle(title):
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this


def remove_start(s, start):
    return s[len(start):] if s is not None and s.startswith(start) else s


def remove_end(s, end):
    return s[:-len(end)] if s is not None and s.endswith(end) else s


def remove_quotes(s):
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s


def get_domain(url):
    domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    return domain.group('domain') if domain else None


def url_basename(url):
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').split('/')[-1]


def base_url(url):
    return re.match(r'https?://[^?#&]+/', url).group()
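

# Usage sketch (illustrative values):
#   url_basename('https://example.com/a/b/video.mp4?t=1')  -> 'video.mp4'
#   base_url('https://example.com/a/b/video.mp4')          -> 'https://example.com/a/b/'
#   get_domain('https://www.example.com/watch?v=xyz')      -> 'example.com'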


def urljoin(base, path):
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not isinstance(path, compat_str) or not path:
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode('utf-8')
    if not isinstance(base, compat_str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)


class HEADRequest(compat_urllib_request.Request):
    def get_method(self):
        return 'HEAD'


class PUTRequest(compat_urllib_request.Request):
    def get_method(self):
        return 'PUT'


def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default


def str_or_none(v, default=None):
    return default if v is None else compat_str(v)


def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, compat_integer_types):
        return int_str
    elif isinstance(int_str, compat_str):
        int_str = re.sub(r'[,\.\+]', '', int_str)
        return int_or_none(int_str)


def float_or_none(v, scale=1, invscale=1, default=None):
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default


def bool_or_none(v, default=None):
    return v if isinstance(v, bool) else default


def strip_or_none(v, default=None):
    return v.strip() if isinstance(v, compat_str) else default


def url_or_none(url):
    if not url or not isinstance(url, compat_str):
        return None
    url = url.strip()
    return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None


def request_to_url(req):
    if isinstance(req, compat_urllib_request.Request):
        return req.get_full_url()
    else:
        return req
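

# Usage sketch (illustrative values):
#   str_to_int('123,456')                   -> 123456
#   int_or_none('42', invscale=1000)        -> 42000
#   url_or_none('//cdn.example.com/v.mp4')  -> '//cdn.example.com/v.mp4'
#   url_or_none('not a url')                -> None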


def strftime_or_none(timestamp, date_format, default=None):
    datetime_object = None
    try:
        if isinstance(timestamp, compat_numeric_types):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default


def parse_duration(s):
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms.replace(':', '.'))
    return duration
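

# Usage sketch (illustrative values):
#   parse_duration('1:02:03.5')  -> 3723.5
#   parse_duration('2h 30m')     -> 9000.0
#   parse_duration('PT5M10S')    -> 310.0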


def prepend_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))


def replace_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)


def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
    except OSError:
        return False
    return exe


def _get_exe_version_output(exe, args):
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        out, _ = Popen(
            [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return out


def detect_exe_version(output, version_re=None, unrecognized='present'):
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized


def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    out = _get_exe_version_output(exe, args)
    return detect_exe_version(out, version_re, unrecognized) if out else False


class LazyList(collections.abc.Sequence):
    ''' Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList'''

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self.__iterable = iter(iterable)
        self.__cache = [] if _cache is None else _cache
        self.__reversed = reverse

    def __iter__(self):
        if self.__reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self.__cache
        for item in self.__iterable:
            self.__cache.append(item)
            yield item

    def __exhaust(self):
        self.__cache.extend(self.__iterable)
        # Discard the emptied iterable to make it pickle-able
        self.__iterable = []
        return self.__cache

    def exhaust(self):
        ''' Evaluate the entire iterable '''
        return self.__exhaust()[::-1 if self.__reversed else 1]

    @staticmethod
    def __reverse_index(x):
        return None if x is None else -(x + 1)

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self.__reversed:
                idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self.__reversed:
                idx = self.__reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self.__exhaust()
            try:
                return self.__cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        n = max(start or 0, stop or 0) - len(self.__cache) + 1
        if n > 0:
            self.__cache.extend(itertools.islice(self.__iterable, n))
        try:
            return self.__cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            self[-1] if self.__reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self.__exhaust()
        return len(self.__cache)

    def __reversed__(self):
        return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)

    def __copy__(self):
        return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
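

# Usage sketch (illustrative values): LazyList only consumes as much of the
# iterable as indexing requires, and caches what it has seen.
#   lazy = LazyList(itertools.count())        # nothing is consumed yet
#   lazy[3]                                   -> 3 (only the first four items are cached)
#   list(LazyList(range(5), reverse=True))    -> [4, 3, 2, 1, 0]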
2807 class IndexError(IndexError):
2811 # This is only useful for tests
2812 return len(self
.getslice())
2814 def __init__(self
, pagefunc
, pagesize
, use_cache
=True):
2815 self
._pagefunc
= pagefunc
2816 self
._pagesize
= pagesize
2817 self
._pagecount
= float('inf')
2818 self
._use
_cache
= use_cache
2821 def getpage(self
, pagenum
):
2822 page_results
= self
._cache
.get(pagenum
)
2823 if page_results
is None:
2824 page_results
= [] if pagenum
> self
._pagecount
else list(self
._pagefunc
(pagenum
))
2826 self
._cache
[pagenum
] = page_results
2829 def getslice(self
, start
=0, end
=None):
2830 return list(self
._getslice
(start
, end
))
2832 def _getslice(self
, start
, end
):
2833 raise NotImplementedError('This method must be implemented by subclasses')
2835 def __getitem__(self
, idx
):
2836 assert self
._use
_cache
, 'Indexing PagedList requires cache'
2837 if not isinstance(idx
, int) or idx
< 0:
2838 raise TypeError('indices must be non-negative integers')
2839 entries
= self
.getslice(idx
, idx
+ 1)
2841 raise self
.IndexError()
2845 class OnDemandPagedList(PagedList
):
2846 def _getslice(self
, start
, end
):
2847 for pagenum
in itertools
.count(start
// self
._pagesize
):
2848 firstid
= pagenum
* self
._pagesize
2849 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
2850 if start
>= nextfirstid
:
2854 start
% self
._pagesize
2855 if firstid
<= start
< nextfirstid
2858 ((end
- 1) % self
._pagesize
) + 1
2859 if (end
is not None and firstid
<= end
<= nextfirstid
)
2863 page_results
= self
.getpage(pagenum
)
2865 self
._pagecount
= pagenum
- 1
2867 if startv
!= 0 or endv
is not None:
2868 page_results
= page_results
[startv
:endv
]
2869 yield from page_results
2871 # A little optimization - if current page is not "full", ie. does
2872 # not contain page_size videos then we can assume that this page
2873 # is the last one - there are no more ids on further pages -
2874 # i.e. no need to query again.
2875 if len(page_results
) + startv
< self
._pagesize
:
2878 # If we got the whole page, but the next page is not interesting,
2879 # break out early as well
2880 if end
== nextfirstid
:
2884 class InAdvancePagedList(PagedList
):
2885 def __init__(self
, pagefunc
, pagecount
, pagesize
):
2886 PagedList
.__init
__(self
, pagefunc
, pagesize
, True)
2887 self
._pagecount
= pagecount
2889 def _getslice(self
, start
, end
):
2890 start_page
= start
// self
._pagesize
2891 end_page
= self
._pagecount
if end
is None else min(self
._pagecount
, end
// self
._pagesize
+ 1)
2892 skip_elems
= start
- start_page
* self
._pagesize
2893 only_more
= None if end
is None else end
- start
2894 for pagenum
in range(start_page
, end_page
):
2895 page_results
= self
.getpage(pagenum
)
2897 page_results
= page_results
[skip_elems
:]
2899 if only_more
is not None:
2900 if len(page_results
) < only_more
:
2901 only_more
-= len(page_results
)
2903 yield from page_results
[:only_more
]
2905 yield from page_results
2908 def uppercase_escape(s
):
2909 unicode_escape
= codecs
.getdecoder('unicode_escape')
2911 r
'\\U[0-9a-fA-F]{8}',
2912 lambda m
: unicode_escape(m
.group(0))[0],
2916 def lowercase_escape(s
):
2917 unicode_escape
= codecs
.getdecoder('unicode_escape')
2919 r
'\\u[0-9a-fA-F]{4}',
2920 lambda m
: unicode_escape(m
.group(0))[0],
2924 def escape_rfc3986(s
):
2925 """Escape non-ASCII characters as suggested by RFC 3986"""
2926 if sys
.version_info
< (3, 0) and isinstance(s
, compat_str
):
2927 s
= s
.encode('utf-8')
2928 return compat_urllib_parse
.quote(s
, b
"%/;:@&=+$,!~*'()?#[]")
2931 def escape_url(url
):
2932 """Escape URL as suggested by RFC 3986"""
2933 url_parsed
= compat_urllib_parse_urlparse(url
)
2934 return url_parsed
._replace
(
2935 netloc
=url_parsed
.netloc
.encode('idna').decode('ascii'),
2936 path
=escape_rfc3986(url_parsed
.path
),
2937 params
=escape_rfc3986(url_parsed
.params
),
2938 query
=escape_rfc3986(url_parsed
.query
),
2939 fragment
=escape_rfc3986(url_parsed
.fragment
)
2944 return compat_parse_qs(compat_urllib_parse_urlparse(url
).query
)
2947 def read_batch_urls(batch_fd
):
2949 if not isinstance(url
, compat_str
):
2950 url
= url
.decode('utf-8', 'replace')
2951 BOM_UTF8
= ('\xef\xbb\xbf', '\ufeff')
2952 for bom
in BOM_UTF8
:
2953 if url
.startswith(bom
):
2954 url
= url
[len(bom
):]
2956 if not url
or url
.startswith(('#', ';', ']')):
2958 # "#" cannot be stripped out since it is part of the URI
2959 # However, it can be safely stipped out if follwing a whitespace
2960 return re
.split(r
'\s#', url
, 1)[0].rstrip()
2962 with contextlib
.closing(batch_fd
) as fd
:
2963 return [url
for url
in map(fixup
, fd
) if url
]
2966 def urlencode_postdata(*args
, **kargs
):
2967 return compat_urllib_parse_urlencode(*args
, **kargs
).encode('ascii')
2970 def update_url_query(url
, query
):
2973 parsed_url
= compat_urlparse
.urlparse(url
)
2974 qs
= compat_parse_qs(parsed_url
.query
)
2976 return compat_urlparse
.urlunparse(parsed_url
._replace
(
2977 query
=compat_urllib_parse_urlencode(qs
, True)))
2980 def update_Request(req
, url
=None, data
=None, headers
={}, query={}
):
2981 req_headers
= req
.headers
.copy()
2982 req_headers
.update(headers
)
2983 req_data
= data
or req
.data
2984 req_url
= update_url_query(url
or req
.get_full_url(), query
)
2985 req_get_method
= req
.get_method()
2986 if req_get_method
== 'HEAD':
2987 req_type
= HEADRequest
2988 elif req_get_method
== 'PUT':
2989 req_type
= PUTRequest
2991 req_type
= compat_urllib_request
.Request
2993 req_url
, data
=req_data
, headers
=req_headers
,
2994 origin_req_host
=req
.origin_req_host
, unverifiable
=req
.unverifiable
)
2995 if hasattr(req
, 'timeout'):
2996 new_req
.timeout
= req
.timeout
3000 def _multipart_encode_impl(data
, boundary
):
3001 content_type
= 'multipart/form-data; boundary=%s' % boundary
3004 for k
, v
in data
.items():
3005 out
+= b
'--' + boundary
.encode('ascii') + b
'\r\n'
3006 if isinstance(k
, compat_str
):
3007 k
= k
.encode('utf-8')
3008 if isinstance(v
, compat_str
):
3009 v
= v
.encode('utf-8')
3010 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3011 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3012 content
= b
'Content-Disposition: form-data; name="' + k
+ b
'"\r\n\r\n' + v
+ b
'\r\n'
3013 if boundary
.encode('ascii') in content
:
3014 raise ValueError('Boundary overlaps with data')
3017 out
+= b
'--' + boundary
.encode('ascii') + b
'--\r\n'
3019 return out
, content_type
3022 def multipart_encode(data
, boundary
=None):
3024 Encode a dict to RFC 7578-compliant form-data
3027 A dict where keys and values can be either Unicode or bytes-like
3030 If specified a Unicode object, it's used as the boundary. Otherwise
3031 a random boundary is generated.
3033 Reference: https://tools.ietf.org/html/rfc7578
3035 has_specified_boundary
= boundary
is not None
3038 if boundary
is None:
3039 boundary
= '---------------' + str(random
.randrange(0x0fffffff, 0xffffffff))
3042 out
, content_type
= _multipart_encode_impl(data
, boundary
)
3045 if has_specified_boundary
:
3049 return out
, content_type
3052 def dict_get(d
, key_or_keys
, default
=None, skip_false_values
=True):
3053 if isinstance(key_or_keys
, (list, tuple)):
3054 for key
in key_or_keys
:
3055 if key
not in d
or d
[key
] is None or skip_false_values
and not d
[key
]:
3059 return d
.get(key_or_keys
, default
)
3062 def try_get(src
, getter
, expected_type
=None):
3063 for get
in variadic(getter
):
3066 except (AttributeError, KeyError, TypeError, IndexError):
3069 if expected_type
is None or isinstance(v
, expected_type
):
3073 def merge_dicts(*dicts
):
3075 for a_dict
in dicts
:
3076 for k
, v
in a_dict
.items():
3080 or (isinstance(v
, compat_str
) and v
3081 and isinstance(merged
[k
], compat_str
)
3082 and not merged
[k
])):
3087 def encode_compat_str(string
, encoding
=preferredencoding(), errors
='strict'):
3088 return string
if isinstance(string
, compat_str
) else compat_str(string
, encoding
, errors
)
3100 TV_PARENTAL_GUIDELINES
= {
3110 def parse_age_limit(s
):
3112 return s
if 0 <= s
<= 21 else None
3113 if not isinstance(s
, compat_basestring
):
3115 m
= re
.match(r
'^(?P<age>\d{1,2})\+?$', s
)
3117 return int(m
.group('age'))
3120 return US_RATINGS
[s
]
3121 m
= re
.match(r
'^TV[_-]?(%s)$' % '|'.join(k
[3:] for k
in TV_PARENTAL_GUIDELINES
), s
)
3123 return TV_PARENTAL_GUIDELINES
['TV-' + m
.group(1)]
3127 def strip_jsonp(code
):
3130 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3131 (?:\s*&&\s*(?P=func_name))?
3132 \s*\(\s*(?P<callback_data>.*)\);?
3133 \s*?(?://[^\n]*)*$''',
3134 r
'\g<callback_data>', code
)
3137 def js_to_json(code
, vars={}):
3138 # vars is a dict of var, val pairs to substitute
3139 COMMENT_RE
= r
'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3140 SKIP_RE
= r
'\s*(?:{comment})?\s*'.format(comment
=COMMENT_RE
)
3142 (r
'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip
=SKIP_RE
), 16),
3143 (r
'(?s)^(0+[0-7]+){skip}:?$'.format(skip
=SKIP_RE
), 8),
3148 if v
in ('true', 'false', 'null'):
3150 elif v
in ('undefined', 'void 0'):
3152 elif v
.startswith('/*') or v
.startswith('//') or v
.startswith('!') or v
== ',':
3155 if v
[0] in ("'", '"'):
3156 v
= re
.sub(r
'(?s)\\.|"', lambda m
: {
3161 }.get(m
.group(0), m
.group(0)), v
[1:-1])
3163 for regex
, base
in INTEGER_TABLE
:
3164 im
= re
.match(regex
, v
)
3166 i
= int(im
.group(1), base
)
3167 return '"%d":' % i
if v
.endswith(':') else '%d' % i
3174 code
= re
.sub(r
'new Date\((".+")\)', r
'\g<1>', code
)
3176 return re
.sub(r
'''(?sx)
3177 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3178 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3179 {comment}|,(?={skip}[\]}}])|
3180 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3181 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3184 '''.format(comment
=COMMENT_RE
, skip
=SKIP_RE
), fix_kv
, code
)
3187 def qualities(quality_ids
):
3188 """ Get a numeric quality value out of a list of possible values """
3191 return quality_ids
.index(qid
)
3197 POSTPROCESS_WHEN
= {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
3201 'default': '%(title)s [%(id)s].%(ext)s',
3202 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3208 'description': 'description',
3209 'annotation': 'annotations.xml',
3210 'infojson': 'info.json',
3213 'pl_thumbnail': None,
3214 'pl_description': 'description',
3215 'pl_infojson': 'info.json',
3218 # As of [1] format syntax is:
3219 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3220 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3221 STR_FORMAT_RE_TMPL
= r
'''(?x)
3222 (?<!%)(?P<prefix>(?:%%)*)
3224 (?P<has_key>\((?P<key>{0})\))?
3226 (?P<conversion>[#0\-+ ]+)?
3228 (?P<precision>\.\d+)?
3229 (?P<len_mod>[hlL])? # unused in python
3230 {1} # conversion type
3235 STR_FORMAT_TYPES
= 'diouxXeEfFgGcrs'
3238 def limit_length(s
, length
):
3239 """ Add ellipses to overly long strings """
3244 return s
[:length
- len(ELLIPSES
)] + ELLIPSES
3248 def version_tuple(v
):
3249 return tuple(int(e
) for e
in re
.split(r
'[-.]', v
))
3252 def is_outdated_version(version
, limit
, assume_new
=True):
3254 return not assume_new
3256 return version_tuple(version
) < version_tuple(limit
)
3258 return not assume_new
3261 def ytdl_is_updateable():
3262 """ Returns if yt-dlp can be updated with -U """
3264 from .update
import is_non_updateable
3266 return not is_non_updateable()
3269 def args_to_str(args
):
3270 # Get a short string representation for a subprocess command
3271 return ' '.join(compat_shlex_quote(a
) for a
in args
)
3274 def error_to_compat_str(err
):
3276 # On python 2 error byte string must be decoded with proper
3277 # encoding rather than ascii
3278 if sys
.version_info
[0] < 3:
3279 err_str
= err_str
.decode(preferredencoding())
3283 def mimetype2ext(mt
):
3287 mt
, _
, params
= mt
.partition(';')
3292 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3293 # it's the most popular one
3294 'audio/mpeg': 'mp3',
3295 'audio/x-wav': 'wav',
3297 'audio/wave': 'wav',
3300 ext
= FULL_MAP
.get(mt
)
3306 'smptett+xml': 'tt',
3310 'x-mp4-fragmented': 'mp4',
3311 'x-ms-sami': 'sami',
3314 'x-mpegurl': 'm3u8',
3315 'vnd.apple.mpegurl': 'm3u8',
3319 'vnd.ms-sstr+xml': 'ism',
3323 'filmstrip+json': 'fs',
3327 _
, _
, subtype
= mt
.rpartition('/')
3328 ext
= SUBTYPE_MAP
.get(subtype
.lower())
3339 _
, _
, suffix
= subtype
.partition('+')
3340 ext
= SUFFIX_MAP
.get(suffix
)
3344 return subtype
.replace('+', '.')
3347 def ext2mimetype(ext_or_url
):
3350 if '.' not in ext_or_url
:
3351 ext_or_url
= f
'file.{ext_or_url}'
3352 return mimetypes
.guess_type(ext_or_url
)[0]
3355 def parse_codecs(codecs_str
):
3356 # http://tools.ietf.org/html/rfc6381
3359 split_codecs
= list(filter(None, map(
3360 str.strip
, codecs_str
.strip().strip(',').split(','))))
3361 vcodec
, acodec
, tcodec
, hdr
= None, None, None, None
3362 for full_codec
in split_codecs
:
3363 parts
= full_codec
.split('.')
3364 codec
= parts
[0].replace('0', '')
3365 if codec
in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3366 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3368 vcodec
= '.'.join(parts
[:4]) if codec
in ('vp9', 'av1', 'hvc1') else full_codec
3369 if codec
in ('dvh1', 'dvhe'):
3371 elif codec
== 'av1' and len(parts
) > 3 and parts
[3] == '10':
3373 elif full_codec
.replace('0', '').startswith('vp9.2'):
3375 elif codec
in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3378 elif codec
in ('stpp', 'wvtt',):
3382 write_string('WARNING: Unknown codec %s\n' % full_codec
, sys
.stderr
)
3383 if vcodec
or acodec
or tcodec
:
3385 'vcodec': vcodec
or 'none',
3386 'acodec': acodec
or 'none',
3387 'dynamic_range': hdr
,
3388 **({'tcodec': tcodec}
if tcodec
is not None else {}),
3390 elif len(split_codecs
) == 2:
3392 'vcodec': split_codecs
[0],
3393 'acodec': split_codecs
[1],
3398 def urlhandle_detect_ext(url_handle
):
3399 getheader
= url_handle
.headers
.get
3401 cd
= getheader('Content-Disposition')
3403 m
= re
.match(r
'attachment;\s*filename="(?P<filename>[^"]+)"', cd
)
3405 e
= determine_ext(m
.group('filename'), default_ext
=None)
3409 return mimetype2ext(getheader('Content-Type'))
3412 def encode_data_uri(data
, mime_type
):
3413 return 'data:%s;base64,%s' % (mime_type
, base64
.b64encode(data
).decode('ascii'))
3416 def age_restricted(content_limit
, age_limit
):
3417 """ Returns True iff the content should be blocked """
3419 if age_limit
is None: # No limit set
3421 if content_limit
is None:
3422 return False # Content available for everyone
3423 return age_limit
< content_limit
3426 def is_html(first_bytes
):
3427 """ Detect whether a file contains HTML by examining its first bytes. """
3430 (b
'\xef\xbb\xbf', 'utf-8'),
3431 (b
'\x00\x00\xfe\xff', 'utf-32-be'),
3432 (b
'\xff\xfe\x00\x00', 'utf-32-le'),
3433 (b
'\xff\xfe', 'utf-16-le'),
3434 (b
'\xfe\xff', 'utf-16-be'),
3436 for bom
, enc
in BOMS
:
3437 if first_bytes
.startswith(bom
):
3438 s
= first_bytes
[len(bom
):].decode(enc
, 'replace')
3441 s
= first_bytes
.decode('utf-8', 'replace')
3443 return re
.match(r
'^\s*<', s
)
3446 def determine_protocol(info_dict
):
3447 protocol
= info_dict
.get('protocol')
3448 if protocol
is not None:
3451 url
= sanitize_url(info_dict
['url'])
3452 if url
.startswith('rtmp'):
3454 elif url
.startswith('mms'):
3456 elif url
.startswith('rtsp'):
3459 ext
= determine_ext(url
)
3465 return compat_urllib_parse_urlparse(url
).scheme
3468 def render_table(header_row
, data
, delim
=False, extra_gap
=0, hide_empty
=False):
3469 """ Render a list of rows, each as a list of values.
3470 Text after a \t will be right aligned """
3472 return len(remove_terminal_sequences(string
).replace('\t', ''))
3474 def get_max_lens(table
):
3475 return [max(width(str(v
)) for v
in col
) for col
in zip(*table
)]
3477 def filter_using_list(row
, filterArray
):
3478 return [col
for take
, col
in itertools
.zip_longest(filterArray
, row
, fillvalue
=True) if take
]
3480 max_lens
= get_max_lens(data
) if hide_empty
else []
3481 header_row
= filter_using_list(header_row
, max_lens
)
3482 data
= [filter_using_list(row
, max_lens
) for row
in data
]
3484 table
= [header_row
] + data
3485 max_lens
= get_max_lens(table
)
3488 table
= [header_row
, [delim
* (ml
+ extra_gap
) for ml
in max_lens
]] + data
3489 table
[1][-1] = table
[1][-1][:-extra_gap
* len(delim
)] # Remove extra_gap from end of delimiter
3491 for pos
, text
in enumerate(map(str, row
)):
3493 row
[pos
] = text
.replace('\t', ' ' * (max_lens
[pos
] - width(text
))) + ' ' * extra_gap
3495 row
[pos
] = text
+ ' ' * (max_lens
[pos
] - width(text
) + extra_gap
)
3496 ret
= '\n'.join(''.join(row
).rstrip() for row
in table
)
3500 def _match_one(filter_part
, dct
, incomplete
):
3501 # TODO: Generalize code with YoutubeDL._build_format_filter
3502 STRING_OPERATORS
= {
3503 '*=': operator
.contains
,
3504 '^=': lambda attr
, value
: attr
.startswith(value
),
3505 '$=': lambda attr
, value
: attr
.endswith(value
),
3506 '~=': lambda attr
, value
: re
.search(value
, attr
),
3508 COMPARISON_OPERATORS
= {
3510 '<=': operator
.le
, # "<=" must be defined above "<"
3517 operator_rex
= re
.compile(r
'''(?x)\s*
3519 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3521 (?P<quote>["\'])(?P
<quotedstrval
>.+?
)(?P
=quote
)|
3525 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3526 m = operator_rex.search(filter_part)
3529 unnegated_op = COMPARISON_OPERATORS[m['op']]
3531 op = lambda attr, value: not unnegated_op(attr, value)
3534 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3536 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3537 actual_value = dct.get(m['key'])
3538 numeric_comparison = None
3539 if isinstance(actual_value, compat_numeric_types):
3540 # If the original field is a string and matching comparisonvalue is
3541 # a number we should respect the origin of the original field
3542 # and process comparison value as a string (see
3543 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3545 numeric_comparison = int(comparison_value)
3547 numeric_comparison = parse_filesize(comparison_value)
3548 if numeric_comparison is None:
3549 numeric_comparison = parse_filesize(f'{comparison_value}B')
3550 if numeric_comparison is None:
3551 numeric_comparison = parse_duration(comparison_value)
3552 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3553 raise ValueError('Operator %s only supports string values!' % m['op'])
3554 if actual_value is None:
3555 return incomplete or m['none_inclusive']
3556 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3559 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3560 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3562 operator_rex = re.compile(r'''(?x
)\s
*
3563 (?P
<op
>%s)\s
*(?P
<key
>[a
-z_
]+)
3565 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3566 m = operator_rex.search(filter_part)
3568 op = UNARY_OPERATORS[m.group('op')]
3569 actual_value = dct.get(m.group('key'))
3570 if incomplete and actual_value is None:
3572 return op(actual_value)
3574 raise ValueError('Invalid filter part %r' % filter_part)
3577 def match_str(filter_str, dct, incomplete=False):
3578 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
3579 When incomplete, all conditions passes on missing fields
3582 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3583 for filter_part in re.split(r'(?<!\\)&', filter_str))
3586 def match_filter_func(filter_str):
3587 def _match_func(info_dict, *args, **kwargs):
3588 if match_str(filter_str, info_dict, *args, **kwargs):
3591 video_title = info_dict.get('title', info_dict.get('id', 'video'))
3592 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
3596 def parse_dfxp_time_expr(time_expr):
3600 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3602 return float(mobj.group('time_offset'))
3604 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3606 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3609 def srt_subtitles_timecode(seconds):
3610 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3613 def ass_subtitles_timecode(seconds):
3614 time = timetuple_from_msec(seconds * 1000)
3615 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3618 def dfxp2srt(dfxp_data):
3620 @param dfxp_data A
bytes-like
object containing DFXP data
3621 @returns A
unicode object containing converted SRT data
3623 LEGACY_NAMESPACES = (
3624 (b'http://www.w3.org/ns/ttml', [
3625 b'http://www.w3.org/2004/11/ttaf1',
3626 b'http://www.w3.org/2006/04/ttaf1',
3627 b'http://www.w3.org/2006/10/ttaf1',
3629 (b'http://www.w3.org/ns/ttml#styling', [
3630 b'http://www.w3.org/ns/ttml#style',
3634 SUPPORTED_STYLING = [
3643 _x = functools.partial(xpath_with_ns, ns_map={
3644 'xml': 'http://www.w3.org/XML/1998/namespace',
3645 'ttml': 'http://www.w3.org/ns/ttml',
3646 'tts': 'http://www.w3.org/ns/ttml#styling',
3652 class TTMLPElementParser(object):
3654 _unclosed_elements = []
3655 _applied_styles = []
3657 def start(self, tag, attrib):
3658 if tag in (_x('ttml:br'), 'br'):
3661 unclosed_elements = []
3663 element_style_id = attrib.get('style')
3665 style.update(default_style)
3666 if element_style_id:
3667 style.update(styles.get(element_style_id, {}))
3668 for prop in SUPPORTED_STYLING:
3669 prop_val = attrib.get(_x('tts:' + prop))
3671 style[prop] = prop_val
3674 for k, v in sorted(style.items()):
3675 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3678 font += ' color="%s"' % v
3679 elif k == 'fontSize':
3680 font += ' size="%s"' % v
3681 elif k == 'fontFamily':
3682 font += ' face="%s"' % v
3683 elif k == 'fontWeight' and v == 'bold':
3685 unclosed_elements.append('b')
3686 elif k == 'fontStyle' and v == 'italic':
3688 unclosed_elements.append('i')
3689 elif k == 'textDecoration' and v == 'underline':
3691 unclosed_elements.append('u')
3693 self._out += '<font' + font + '>'
3694 unclosed_elements.append('font')
3696 if self._applied_styles:
3697 applied_style.update(self._applied_styles[-1])
3698 applied_style.update(style)
3699 self._applied_styles.append(applied_style)
3700 self._unclosed_elements.append(unclosed_elements)
3703 if tag not in (_x('ttml:br'), 'br'):
3704 unclosed_elements = self._unclosed_elements.pop()
3705 for element in reversed(unclosed_elements):
3706 self._out += '</%s>' % element
3707 if unclosed_elements and self._applied_styles:
3708 self._applied_styles.pop()
3710 def data(self, data):
3714 return self._out.strip()
3716 def parse_node(node):
3717 target = TTMLPElementParser()
3718 parser = xml.etree.ElementTree.XMLParser(target=target)
3719 parser.feed(xml.etree.ElementTree.tostring(node))
3720 return parser.close()
3722 for k, v in LEGACY_NAMESPACES:
3724 dfxp_data = dfxp_data.replace(ns, k)
3726 dfxp = compat_etree_fromstring(dfxp_data)
3728 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3731 raise ValueError('Invalid dfxp/TTML subtitle')
3735 for style in dfxp.findall(_x('.//ttml:style')):
3736 style_id = style.get('id') or style.get(_x('xml:id'))
3739 parent_style_id = style.get('style')
3741 if parent_style_id not in styles:
3744 styles[style_id] = styles[parent_style_id].copy()
3745 for prop in SUPPORTED_STYLING:
3746 prop_val = style.get(_x('tts:' + prop))
3748 styles.setdefault(style_id, {})[prop] = prop_val
3754 for p in ('body', 'div'):
3755 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3758 style = styles.get(ele.get('style'))
3761 default_style.update(style)
3763 for para, index in zip(paras, itertools.count(1)):
3764 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3765 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3766 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3767 if begin_time is None:
3772 end_time = begin_time + dur
3773 out.append('%d\n%s --> %s\n%s\n\n' % (
3775 srt_subtitles_timecode(begin_time),
3776 srt_subtitles_timecode(end_time),
3782 def cli_option(params, command_option, param):
3783 param = params.get(param)
3785 param = compat_str(param)
3786 return [command_option, param] if param is not None else []
3789 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3790 param = params.get(param)
3793 assert isinstance(param, bool)
3795 return [command_option + separator + (true_value if param else false_value)]
3796 return [command_option, true_value if param else false_value]
3799 def cli_valueless_option(params, command_option, param, expected_value=True):
3800 param = params.get(param)
3801 return [command_option] if param == expected_value else []
3804 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3805 if isinstance(argdict, (list, tuple)): # for backward compatibility
3812 assert isinstance(argdict, dict)
3814 assert isinstance(keys, (list, tuple))
3815 for key_list in keys:
3816 arg_list = list(filter(
3817 lambda x: x is not None,
3818 [argdict.get(key.lower()) for key in variadic(key_list)]))
3820 return [arg for args in arg_list for arg in args]
3824 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3825 main_key, exe = main_key.lower(), exe.lower()
3826 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3827 keys = [f'{root_key}{k}' for k in (keys or [''])]
3828 if root_key in keys:
3830 keys.append((main_key, exe))
3831 keys.append('default')
3834 return cli_configuration_args(argdict, keys, default, use_compat)
3837 class ISO639Utils(object):
3838 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3897 'iw': 'heb', # Replaced by he in 1989 revision
3907 'in': 'ind', # Replaced by id in 1989 revision
4022 'ji': 'yid', # Replaced by yi in 1989 revision
4030 def short2long(cls, code):
4031 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4032 return cls._lang_map.get(code[:2])
4035 def long2short(cls, code):
4036 """Convert language code from ISO 639-2/T to ISO 639-1"""
4037 for short_name, long_name in cls._lang_map.items():
4038 if long_name == code:
4042 class ISO3166Utils(object):
4043 # From http://data.okfn.org/data/core/country-list
4045 'AF': 'Afghanistan',
4046 'AX': 'Åland Islands',
4049 'AS': 'American Samoa',
4054 'AG': 'Antigua and Barbuda',
4071 'BO': 'Bolivia, Plurinational State of',
4072 'BQ': 'Bonaire, Sint Eustatius and Saba',
4073 'BA': 'Bosnia and Herzegovina',
4075 'BV': 'Bouvet Island',
4077 'IO': 'British Indian Ocean Territory',
4078 'BN': 'Brunei Darussalam',
4080 'BF': 'Burkina Faso',
4086 'KY': 'Cayman Islands',
4087 'CF': 'Central African Republic',
4091 'CX': 'Christmas Island',
4092 'CC': 'Cocos (Keeling) Islands',
4096 'CD': 'Congo, the Democratic Republic of the',
4097 'CK': 'Cook Islands',
4099 'CI': 'Côte d\'Ivoire',
4104 'CZ': 'Czech Republic',
4108 'DO': 'Dominican Republic',
4111 'SV': 'El Salvador',
4112 'GQ': 'Equatorial Guinea',
4116 'FK': 'Falkland Islands (Malvinas)',
4117 'FO': 'Faroe Islands',
4121 'GF': 'French Guiana',
4122 'PF': 'French Polynesia',
4123 'TF': 'French Southern Territories',
4138 'GW': 'Guinea-Bissau',
4141 'HM': 'Heard Island and McDonald Islands',
4142 'VA': 'Holy See (Vatican City State)',
4149 'IR': 'Iran, Islamic Republic of',
4152 'IM': 'Isle of Man',
4162 'KP': 'Korea, Democratic People\'s Republic of',
4163 'KR': 'Korea, Republic of',
4166 'LA': 'Lao People\'s Democratic Republic',
4172 'LI': 'Liechtenstein',
4176 'MK': 'Macedonia, the Former Yugoslav Republic of',
4183 'MH': 'Marshall Islands',
4189 'FM': 'Micronesia, Federated States of',
4190 'MD': 'Moldova, Republic of',
4201 'NL': 'Netherlands',
4202 'NC': 'New Caledonia',
4203 'NZ': 'New Zealand',
4208 'NF': 'Norfolk Island',
4209 'MP': 'Northern Mariana Islands',
4214 'PS': 'Palestine, State of',
4216 'PG': 'Papua New Guinea',
4219 'PH': 'Philippines',
4223 'PR': 'Puerto Rico',
4227 'RU': 'Russian Federation',
4229 'BL': 'Saint Barthélemy',
4230 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4231 'KN': 'Saint Kitts and Nevis',
4232 'LC': 'Saint Lucia',
4233 'MF': 'Saint Martin (French part)',
4234 'PM': 'Saint Pierre and Miquelon',
4235 'VC': 'Saint Vincent and the Grenadines',
4238 'ST': 'Sao Tome and Principe',
4239 'SA': 'Saudi Arabia',
4243 'SL': 'Sierra Leone',
4245 'SX': 'Sint Maarten (Dutch part)',
4248 'SB': 'Solomon Islands',
4250 'ZA': 'South Africa',
4251 'GS': 'South Georgia and the South Sandwich Islands',
4252 'SS': 'South Sudan',
4257 'SJ': 'Svalbard and Jan Mayen',
4260 'CH': 'Switzerland',
4261 'SY': 'Syrian Arab Republic',
4262 'TW': 'Taiwan, Province of China',
4264 'TZ': 'Tanzania, United Republic of',
4266 'TL': 'Timor-Leste',
4270 'TT': 'Trinidad and Tobago',
4273 'TM': 'Turkmenistan',
4274 'TC': 'Turks and Caicos Islands',
4278 'AE': 'United Arab Emirates',
4279 'GB': 'United Kingdom',
4280 'US': 'United States',
4281 'UM': 'United States Minor Outlying Islands',
4285 'VE': 'Venezuela, Bolivarian Republic of',
4287 'VG': 'Virgin Islands, British',
4288 'VI': 'Virgin Islands, U.S.',
4289 'WF': 'Wallis and Futuna',
4290 'EH': 'Western Sahara',
4297 def short2full(cls, code):
4298 """Convert an ISO 3166-2 country code to the corresponding full name"""
4299 return cls._country_map.get(code.upper())
4302 class GeoUtils(object):
4303 # Major IPv4 address blocks per country
4305 'AD': '46.172.224.0/19',
4306 'AE': '94.200.0.0/13',
4307 'AF': '149.54.0.0/17',
4308 'AG': '209.59.64.0/18',
4309 'AI': '204.14.248.0/21',
4310 'AL': '46.99.0.0/16',
4311 'AM': '46.70.0.0/15',
4312 'AO': '105.168.0.0/13',
4313 'AP': '182.50.184.0/21',
4314 'AQ': '23.154.160.0/24',
4315 'AR': '181.0.0.0/12',
4316 'AS': '202.70.112.0/20',
4317 'AT': '77.116.0.0/14',
4318 'AU': '1.128.0.0/11',
4319 'AW': '181.41.0.0/18',
4320 'AX': '185.217.4.0/22',
4321 'AZ': '5.197.0.0/16',
4322 'BA': '31.176.128.0/17',
4323 'BB': '65.48.128.0/17',
4324 'BD': '114.130.0.0/16',
4326 'BF': '102.178.0.0/15',
4327 'BG': '95.42.0.0/15',
4328 'BH': '37.131.0.0/17',
4329 'BI': '154.117.192.0/18',
4330 'BJ': '137.255.0.0/16',
4331 'BL': '185.212.72.0/23',
4332 'BM': '196.12.64.0/18',
4333 'BN': '156.31.0.0/16',
4334 'BO': '161.56.0.0/16',
4335 'BQ': '161.0.80.0/20',
4336 'BR': '191.128.0.0/12',
4337 'BS': '24.51.64.0/18',
4338 'BT': '119.2.96.0/19',
4339 'BW': '168.167.0.0/16',
4340 'BY': '178.120.0.0/13',
4341 'BZ': '179.42.192.0/18',
4342 'CA': '99.224.0.0/11',
4343 'CD': '41.243.0.0/16',
4344 'CF': '197.242.176.0/21',
4345 'CG': '160.113.0.0/16',
4346 'CH': '85.0.0.0/13',
4347 'CI': '102.136.0.0/14',
4348 'CK': '202.65.32.0/19',
4349 'CL': '152.172.0.0/14',
4350 'CM': '102.244.0.0/14',
4351 'CN': '36.128.0.0/10',
4352 'CO': '181.240.0.0/12',
4353 'CR': '201.192.0.0/12',
4354 'CU': '152.206.0.0/15',
4355 'CV': '165.90.96.0/19',
4356 'CW': '190.88.128.0/17',
4357 'CY': '31.153.0.0/16',
4358 'CZ': '88.100.0.0/14',
4360 'DJ': '197.241.0.0/17',
4361 'DK': '87.48.0.0/12',
4362 'DM': '192.243.48.0/20',
4363 'DO': '152.166.0.0/15',
4364 'DZ': '41.96.0.0/12',
4365 'EC': '186.68.0.0/15',
4366 'EE': '90.190.0.0/15',
4367 'EG': '156.160.0.0/11',
4368 'ER': '196.200.96.0/20',
4369 'ES': '88.0.0.0/11',
4370 'ET': '196.188.0.0/14',
4371 'EU': '2.16.0.0/13',
4372 'FI': '91.152.0.0/13',
4373 'FJ': '144.120.0.0/16',
4374 'FK': '80.73.208.0/21',
4375 'FM': '119.252.112.0/20',
4376 'FO': '88.85.32.0/19',
4378 'GA': '41.158.0.0/15',
4380 'GD': '74.122.88.0/21',
4381 'GE': '31.146.0.0/16',
4382 'GF': '161.22.64.0/18',
4383 'GG': '62.68.160.0/19',
4384 'GH': '154.160.0.0/12',
4385 'GI': '95.164.0.0/16',
4386 'GL': '88.83.0.0/19',
4387 'GM': '160.182.0.0/15',
4388 'GN': '197.149.192.0/18',
4389 'GP': '104.250.0.0/19',
4390 'GQ': '105.235.224.0/20',
4391 'GR': '94.64.0.0/13',
4392 'GT': '168.234.0.0/16',
4393 'GU': '168.123.0.0/16',
4394 'GW': '197.214.80.0/20',
4395 'GY': '181.41.64.0/18',
4396 'HK': '113.252.0.0/14',
4397 'HN': '181.210.0.0/16',
4398 'HR': '93.136.0.0/13',
4399 'HT': '148.102.128.0/17',
4400 'HU': '84.0.0.0/14',
4401 'ID': '39.192.0.0/10',
4402 'IE': '87.32.0.0/12',
4403 'IL': '79.176.0.0/13',
4404 'IM': '5.62.80.0/20',
4405 'IN': '117.192.0.0/10',
4406 'IO': '203.83.48.0/21',
4407 'IQ': '37.236.0.0/14',
4408 'IR': '2.176.0.0/12',
4409 'IS': '82.221.0.0/16',
4410 'IT': '79.0.0.0/10',
4411 'JE': '87.244.64.0/18',
4412 'JM': '72.27.0.0/17',
4413 'JO': '176.29.0.0/16',
4414 'JP': '133.0.0.0/8',
4415 'KE': '105.48.0.0/12',
4416 'KG': '158.181.128.0/17',
4417 'KH': '36.37.128.0/17',
4418 'KI': '103.25.140.0/22',
4419 'KM': '197.255.224.0/20',
4420 'KN': '198.167.192.0/19',
4421 'KP': '175.45.176.0/22',
4422 'KR': '175.192.0.0/10',
4423 'KW': '37.36.0.0/14',
4424 'KY': '64.96.0.0/15',
4425 'KZ': '2.72.0.0/13',
4426 'LA': '115.84.64.0/18',
4427 'LB': '178.135.0.0/16',
4428 'LC': '24.92.144.0/20',
4429 'LI': '82.117.0.0/19',
4430 'LK': '112.134.0.0/15',
4431 'LR': '102.183.0.0/16',
4432 'LS': '129.232.0.0/17',
4433 'LT': '78.56.0.0/13',
4434 'LU': '188.42.0.0/16',
4435 'LV': '46.109.0.0/16',
4436 'LY': '41.252.0.0/14',
4437 'MA': '105.128.0.0/11',
4438 'MC': '88.209.64.0/18',
4439 'MD': '37.246.0.0/16',
4440 'ME': '178.175.0.0/17',
4441 'MF': '74.112.232.0/21',
4442 'MG': '154.126.0.0/17',
4443 'MH': '117.103.88.0/21',
4444 'MK': '77.28.0.0/15',
4445 'ML': '154.118.128.0/18',
4446 'MM': '37.111.0.0/17',
4447 'MN': '49.0.128.0/17',
4448 'MO': '60.246.0.0/16',
4449 'MP': '202.88.64.0/20',
4450 'MQ': '109.203.224.0/19',
4451 'MR': '41.188.64.0/18',
4452 'MS': '208.90.112.0/22',
4453 'MT': '46.11.0.0/16',
4454 'MU': '105.16.0.0/12',
4455 'MV': '27.114.128.0/18',
4456 'MW': '102.70.0.0/15',
4457 'MX': '187.192.0.0/11',
4458 'MY': '175.136.0.0/13',
4459 'MZ': '197.218.0.0/15',
4460 'NA': '41.182.0.0/16',
4461 'NC': '101.101.0.0/18',
4462 'NE': '197.214.0.0/18',
4463 'NF': '203.17.240.0/22',
4464 'NG': '105.112.0.0/12',
4465 'NI': '186.76.0.0/15',
4466 'NL': '145.96.0.0/11',
4467 'NO': '84.208.0.0/13',
4468 'NP': '36.252.0.0/15',
4469 'NR': '203.98.224.0/19',
4470 'NU': '49.156.48.0/22',
4471 'NZ': '49.224.0.0/14',
4472 'OM': '5.36.0.0/15',
4473 'PA': '186.72.0.0/15',
4474 'PE': '186.160.0.0/14',
4475 'PF': '123.50.64.0/18',
4476 'PG': '124.240.192.0/19',
4477 'PH': '49.144.0.0/13',
4478 'PK': '39.32.0.0/11',
4479 'PL': '83.0.0.0/11',
4480 'PM': '70.36.0.0/20',
4481 'PR': '66.50.0.0/16',
4482 'PS': '188.161.0.0/16',
4483 'PT': '85.240.0.0/13',
4484 'PW': '202.124.224.0/20',
4485 'PY': '181.120.0.0/14',
4486 'QA': '37.210.0.0/15',
4487 'RE': '102.35.0.0/16',
4488 'RO': '79.112.0.0/13',
4489 'RS': '93.86.0.0/15',
4490 'RU': '5.136.0.0/13',
4491 'RW': '41.186.0.0/16',
4492 'SA': '188.48.0.0/13',
4493 'SB': '202.1.160.0/19',
4494 'SC': '154.192.0.0/11',
4495 'SD': '102.120.0.0/13',
4496 'SE': '78.64.0.0/12',
4497 'SG': '8.128.0.0/10',
4498 'SI': '188.196.0.0/14',
4499 'SK': '78.98.0.0/15',
4500 'SL': '102.143.0.0/17',
4501 'SM': '89.186.32.0/19',
4502 'SN': '41.82.0.0/15',
4503 'SO': '154.115.192.0/18',
4504 'SR': '186.179.128.0/17',
4505 'SS': '105.235.208.0/21',
4506 'ST': '197.159.160.0/19',
4507 'SV': '168.243.0.0/16',
4508 'SX': '190.102.0.0/20',
4510 'SZ': '41.84.224.0/19',
4511 'TC': '65.255.48.0/20',
4512 'TD': '154.68.128.0/19',
4513 'TG': '196.168.0.0/14',
4514 'TH': '171.96.0.0/13',
4515 'TJ': '85.9.128.0/18',
4516 'TK': '27.96.24.0/21',
4517 'TL': '180.189.160.0/20',
4518 'TM': '95.85.96.0/19',
4519 'TN': '197.0.0.0/11',
4520 'TO': '175.176.144.0/21',
4521 'TR': '78.160.0.0/11',
4522 'TT': '186.44.0.0/15',
4523 'TV': '202.2.96.0/19',
4524 'TW': '120.96.0.0/11',
4525 'TZ': '156.156.0.0/14',
4526 'UA': '37.52.0.0/14',
4527 'UG': '102.80.0.0/13',
4529 'UY': '167.56.0.0/13',
4530 'UZ': '84.54.64.0/18',
4531 'VA': '212.77.0.0/19',
4532 'VC': '207.191.240.0/21',
4533 'VE': '186.88.0.0/13',
4534 'VG': '66.81.192.0/20',
4535 'VI': '146.226.0.0/16',
4536 'VN': '14.160.0.0/11',
4537 'VU': '202.80.32.0/20',
4538 'WF': '117.20.32.0/21',
4539 'WS': '202.4.32.0/19',
4540 'YE': '134.35.0.0/16',
4541 'YT': '41.242.116.0/22',
4542 'ZA': '41.0.0.0/11',
4543 'ZM': '102.144.0.0/13',
4544 'ZW': '102.177.192.0/18',
4548 def random_ipv4(cls, code_or_block):
4549 if len(code_or_block) == 2:
4550 block = cls._country_ip_map.get(code_or_block.upper())
4554 block = code_or_block
4555 addr, preflen = block.split('/')
4556 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4557 addr_max = addr_min | (0xffffffff >> int(preflen))
4558 return compat_str(socket.inet_ntoa(
4559 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4562 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4563 def __init__(self, proxies=None):
4564 # Set default handlers
4565 for type in ('http', 'https'):
4566 setattr(self, '%s_open' % type,
4567 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4568 meth(r, proxy, type))
4569 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4571 def proxy_open(self, req, proxy, type):
4572 req_proxy = req.headers.get('Ytdl-request-proxy')
4573 if req_proxy is not None:
4575 del req.headers['Ytdl-request-proxy']
4577 if proxy == '__noproxy__':
4578 return None # No Proxy
4579 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4580 req.add_header('Ytdl-socks-proxy', proxy)
4581 # yt-dlp's http/https handlers do wrapping the socket with socks
4583 return compat_urllib_request.ProxyHandler.proxy_open(
4584 self, req, proxy, type)
4587 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4588 # released into Public Domain
4589 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4591 def long_to_bytes(n, blocksize=0):
4592 """long_to_bytes(n:long, blocksize:int) : string
4593 Convert a long integer to a byte string.
4595 If optional blocksize is given and greater than zero, pad the front of the
4596 byte string with binary zeros so that the length is a multiple of
4599 # after much testing, this algorithm was deemed to be the fastest
4603 s = compat_struct_pack('>I', n & 0xffffffff) + s
4605 # strip off leading zeros
4606 for i in range(len(s)):
4607 if s[i] != b'\000'[0]:
4610 # only happens when n == 0
4614 # add back some pad bytes. this could be done more efficiently w.r.t. the
4615 # de-padding being done above, but sigh...
4616 if blocksize > 0 and len(s) % blocksize:
4617 s = (blocksize - len(s) % blocksize) * b'\000' + s


def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    acc = 0
    length = len(s)
    if length % 4:
        extra = (4 - length % 4)
        s = b'\000' * extra + s
        length = length + extra
    for i in range(0, length, 4):
        acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
    return acc
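

# Illustrative sketch (editor's addition): the two helpers above are inverses of
# each other, with `blocksize` only controlling the amount of front zero-padding.
def _example_long_bytes_roundtrip():
    n = 0xdeadbeef
    assert long_to_bytes(n) == b'\xde\xad\xbe\xef'
    assert long_to_bytes(n, blocksize=8) == b'\x00\x00\x00\x00\xde\xad\xbe\xef'
    assert bytes_to_long(long_to_bytes(n)) == n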


def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted


def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data        input data
    @param {int} length        target length
    @returns {int[]}           padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
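

# Illustrative sketch (editor's addition): the PKCS#1 v1.5 block produced above is
# 0x00 0x02, a random filler, a 0x00 separator, then the payload.
def _example_pkcs1pad():
    padded = pkcs1pad([0x41, 0x42, 0x43], 16)
    assert len(padded) == 16
    assert padded[:2] == [0, 2]
    assert padded[-4:] == [0, 0x41, 0x42, 0x43]
    return padded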


def encode_base_n(num, n, table=None):
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret


def decode_packed_codes(code):
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)


def caesar(s, alphabet, shift):
    if shift == 0:
        return s
    l = len(alphabet)
    return ''.join(
        alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
        for c in s)


def rot47(s):
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)


def parse_m3u8_attributes(attrib):
    info = {}
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]
        info[key] = val
    return info
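

# Illustrative sketch (editor's addition): parsing a typical EXT-X-STREAM-INF
# attribute list; quoted values keep their embedded commas and lose the quotes.
def _example_parse_m3u8_attributes():
    attrs = parse_m3u8_attributes('BANDWIDTH=1280000,RESOLUTION=640x360,CODECS="avc1.4d401e,mp4a.40.2"')
    assert attrs['BANDWIDTH'] == '1280000'
    assert attrs['RESOLUTION'] == '640x360'
    assert attrs['CODECS'] == 'avc1.4d401e,mp4a.40.2'
    return attrs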


def urshift(val, n):
    return val >> n if val >= 0 else (val + 0x100000000) >> n
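

# Illustrative sketch (editor's addition): urshift() mimics JavaScript's unsigned
# right shift (>>>) for 32-bit values, which Python's >> does not do for negatives.
def _example_urshift():
    assert urshift(16, 2) == 4
    assert urshift(-1, 28) == 0xf  # -1 is treated as 0xffffffff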


# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels


def write_xattr(path, key, value):
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'yt-dlp requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)]
                       + [encodeArgument(o) for o in opts]
                       + [encodeFilename(path, True)])

                try:
                    p = Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate_or_kill()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")


def random_birthday(year_field, month_field, day_field):
    start_date = datetime.date(1950, 1, 1)
    end_date = datetime.date(1995, 12, 31)
    offset = random.randint(0, (end_date - start_date).days)
    random_date = start_date + datetime.timedelta(offset)
    return {
        year_field: str(random_date.year),
        month_field: str(random_date.month),
        day_field: str(random_date.day),
    }
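

# Illustrative sketch (editor's addition): the field names are chosen by the caller,
# so the result can be fed straight into whatever form an extractor has to fill.
def _example_random_birthday():
    fields = random_birthday('birth_year', 'birth_month', 'birth_day')
    assert set(fields) == {'birth_year', 'birth_month', 'birth_day'}
    assert 1950 <= int(fields['birth_year']) <= 1995
    return fields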


# Templates for internet shortcut files, which are plain text files.
DOT_URL_LINK_TEMPLATE = '''
[InternetShortcut]
URL=%(url)s
'''.lstrip()

DOT_WEBLOC_LINK_TEMPLATE = '''
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''.lstrip()

DOT_DESKTOP_LINK_TEMPLATE = '''
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''.lstrip()

LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}


def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
            net_location,

            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
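

# Illustrative sketch (editor's addition): non-ASCII path/query characters are
# percent-encoded as UTF-8 while plain ASCII parts of the IRI are left intact.
def _example_iri_to_uri():
    uri = iri_to_uri('http://example.com/p\u00e4th?q=\u00fc')
    assert uri == 'http://example.com/p%C3%A4th?q=%C3%BC'
    return uri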


def to_high_limit_path(path):
    if sys.platform in ['win32', 'cygwin']:
        # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
        return r'\\?\ '.rstrip() + os.path.abspath(path)

    return path


def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    val = traverse_obj(obj, *variadic(field))
    if val in ignore:
        return default
    return template % (func(val) if func else val)
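

# Illustrative sketch (editor's addition): values listed in `ignore` fall back to
# `default`; everything else is run through the printf-style template.
def _example_format_field():
    info = {'width': 1920, 'height': None}
    assert format_field(info, 'width', '%dpx') == '1920px'
    assert format_field(info, 'height', '%dpx', default='unknown') == 'unknown'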


def clean_podcast_url(url):
    return re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/''', '', url)


_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')


def make_dir(path, to_screen=None):
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except (OSError, IOError) as err:
        if callable(to_screen):  # was `callable(to_screen) is not None`, which is always true
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False


def get_executable_path():
    from zipimport import zipimporter
    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        path = os.path.dirname(sys.executable)
    elif isinstance(globals().get('__loader__'), zipimporter):  # Running from ZIP
        path = os.path.join(os.path.dirname(__file__), '../..')
    else:
        path = os.path.join(os.path.dirname(__file__), '..')
    return os.path.abspath(path)


def load_plugins(name, suffix, namespace):
    classes = {}
    try:
        plugins_spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        plugins = importlib.util.module_from_spec(plugins_spec)
        sys.modules[plugins_spec.name] = plugins
        plugins_spec.loader.exec_module(plugins)
        for name in dir(plugins):
            if name in namespace:
                continue
            if not name.endswith(suffix):
                continue
            klass = getattr(plugins, name)
            classes[name] = namespace[name] = klass
    except FileNotFoundError:
        pass
    return classes


def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a string,
                            a function, a tuple of strings/None or "...".
                            When a function is given, it takes the key as argument and
                            returns whether the key matches or not. When a tuple is given,
                            all the keys given in the tuple are traversed, and
                            "..." traverses all the keys in the object
                            "None" returns the object without traversal
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    '''
    if not casesense:
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    obj = str(obj)
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
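

# Illustrative sketch (editor's addition): a few representative traverse_obj() paths,
# matching the docstring above (dict keys, list indices, "..." for "all keys").
def _example_traverse_obj():
    data = {'formats': [{'url': 'https://a.example/1.mp4'}, {'height': 720}]}
    assert traverse_obj(data, ('formats', 0, 'url')) == 'https://a.example/1.mp4'
    assert traverse_obj(data, ('formats', ..., 'height')) == [720]
    assert traverse_obj(data, ('missing', 'key'), default='n/a') == 'n/a'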


def traverse_dict(dictn, keys, casesense=True):
    write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)


def variadic(x, allowed_types=(str, bytes, dict)):
    return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)


def decode_base(value, digits):
    # This will convert given base-x string to scalar (long or int)
    table = {char: index for index, char in enumerate(digits)}
    result = 0
    base = len(digits)
    for chr in value:
        result *= base
        result += table[chr]
    return result


def time_seconds(**kwargs):
    t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
    return t.timestamp()


# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
    h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
    return token


# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    header_b64, payload_b64, signature_b64 = jwt.split('.')
    payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
    return payload_data
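

# Illustrative sketch (editor's addition): the encoder emits the three dot-separated
# base64 segments of JWS Compact Serialization; the decoder only reads the payload back.
def _example_jwt_roundtrip():
    token = jwt_encode_hs256({'id': 123}, 'secret')
    assert token.count(b'.') == 2  # header.payload.signature
    assert jwt_decode_hs256(token.decode('utf-8'))['id'] == 123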


def supports_terminal_sequences(stream):
    if compat_os_name == 'nt':
        from .compat import WINDOWS_VT_MODE  # Must be imported locally
        if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
            return False
    elif not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False


_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    return _terminal_sequences_re.sub('', string)


def number_of_digits(number):
    return len('%d' % number)


def join_nonempty(*values, delim='-', from_dict=None):
    if from_dict is not None:
        values = map(from_dict.get, values)
    return delim.join(map(str, filter(None, values)))
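

# Illustrative sketch (editor's addition): falsy values (None, '', 0) are dropped
# before joining, which is what makes this handy for building format ids and labels.
def _example_join_nonempty():
    assert join_nonempty('mp4', None, 720, '', 'dash') == 'mp4-720-dash'
    assert join_nonempty('height', 'width', delim='x', from_dict={'height': 1080, 'width': 1920}) == '1080x1920'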


def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the largest format width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    max_dimensions = max(
        [tuple(format.get(k) or 0 for k in _keys) for format in formats],
        default=(0, 0))
    if not max_dimensions[0]:
        return thumbnails
    return [
        merge_dicts(
            {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
            dict(zip(_keys, max_dimensions)), thumbnail)
        for thumbnail in thumbnails
    ]


def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    if not range:
        return None, None, None
    crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if not crg:
        return None, None, None
    return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
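

# Illustrative sketch (editor's addition): both request ("Range") and response
# ("Content-Range") forms are accepted; missing parts come back as None.
def _example_parse_http_range():
    assert parse_http_range('bytes=500-999') == (500, 999, None)
    assert parse_http_range('bytes 500-999/1234') == (500, 999, 1234)
    assert parse_http_range(None) == (None, None, None)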


class Config:
    own_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self._parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        assert not self.__initialized
        directory = ''
        if filename:
            location = os.path.realpath(filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        self.own_args, self.filename = args, filename
        for location in self._parser.parse_args(args)[0].config_locations or []:
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self._parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        try:
            optionf = open(filename)
        except IOError:
            return default  # silently skip if file is not present
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read()
            if sys.version_info < (3,):
                contents = contents.decode(preferredencoding())
            res = compat_shlex_split(contents, comments=True)
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        config = type(self)(self._parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.own_args or []

    def parse_args(self):
        return self._parser.parse_args(list(self.all_args))


class WebSocketsWrapper():
    """Wraps websockets module to use in non-async scopes"""

    def __init__(self, url, headers=None):
        self.loop = asyncio.events.new_event_loop()
        self.conn = compat_websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            self.loop.close()
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: if any new library that uses asyncio needs to run in non-async scopes, move these functions out of this class
    @staticmethod
    def run_with_loop(main, loop):
        if not asyncio.coroutines.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        to_cancel = asyncio.tasks.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        loop.run_until_complete(
            asyncio.tasks.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })


has_websockets = bool(compat_websockets)


def merge_headers(*dicts):
    """Merge dicts of network headers case insensitively, prioritizing the latter ones"""
    return {k.capitalize(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
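

# Illustrative sketch (editor's addition): keys are normalised via str.capitalize(),
# so differently-cased duplicates collapse and later dicts win.
def _example_merge_headers():
    merged = merge_headers({'User-Agent': 'A', 'accept': '*/*'}, {'user-agent': 'B'})
    assert merged == {'User-agent': 'B', 'Accept': '*/*'}
    return merged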