4 from __future__
import unicode_literals
39 import xml
.etree
.ElementTree
44 compat_HTMLParseError
,
50 compat_ctypes_WINFUNCTYPE
,
51 compat_etree_fromstring
,
54 compat_html_entities_html5
,
68 compat_urllib_parse_urlencode
,
69 compat_urllib_parse_urlparse
,
70 compat_urllib_parse_urlunparse
,
71 compat_urllib_parse_quote
,
72 compat_urllib_parse_quote_plus
,
73 compat_urllib_parse_unquote_plus
,
74 compat_urllib_request
,
def register_socks_protocols():
    """Add the SOCKS URL schemes to urlparse's netloc-aware scheme list.

    Works around https://bugs.python.org/issue7904 (Python < 2.6.5), where
    urlsplit() mishandles URLs whose scheme is absent from
    urlparse.uses_netloc.
    """
    known_schemes = compat_urlparse.uses_netloc
    for socks_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if socks_scheme not in known_schemes:
            known_schemes.append(socks_scheme)
# The type of a compiled regular expression object; the stdlib does not
# clearly expose it otherwise, so derive it from an actual compile.
compiled_regex_type = type(re.compile(''))
98 def random_user_agent():
99 _USER_AGENT_TPL
= 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
140 return _USER_AGENT_TPL
% random
.choice(_CHROME_VERSIONS
)
144 'User-Agent': random_user_agent(),
145 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
146 'Accept-Encoding': 'gzip, deflate',
147 'Accept-Language': 'en-us,en;q=0.5',
152 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# Unique sentinel meaning "no default supplied", so that None remains a
# legitimate default value (see the xpath_* helpers' `default` parameter).
NO_DEFAULT = object()
# English month names, in calendar order.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
163 'en': ENGLISH_MONTH_NAMES
,
165 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
166 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
170 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
171 'flv', 'f4v', 'f4a', 'f4b',
172 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
173 'mkv', 'mka', 'mk3d',
182 'f4f', 'f4m', 'm3u8', 'smil')
# Needed for sanitizing filenames in restricted mode: maps each accented
# character to an ASCII transliteration (single chars from the strings,
# multi-char replacements from the interleaved lists).
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
217 '%Y-%m-%d %H:%M:%S.%f',
218 '%Y-%m-%d %H:%M:%S:%f',
221 '%Y-%m-%dT%H:%M:%SZ',
222 '%Y-%m-%dT%H:%M:%S.%fZ',
223 '%Y-%m-%dT%H:%M:%S.%f0Z',
225 '%Y-%m-%dT%H:%M:%S.%f',
228 '%b %d %Y at %H:%M:%S',
230 '%B %d %Y at %H:%M:%S',
234 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
235 DATE_FORMATS_DAY_FIRST
.extend([
244 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
245 DATE_FORMATS_MONTH_FIRST
.extend([
# Matches the trailer of "packed" JavaScript of the form
# }('payload',radix,count,'symbol|table'.split('|') — groups: payload,
# radix, count, symbol table.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Captures the body of a <script type="application/ld+json"> element into
# the named group 'json_ld'; \1 back-references the (optional) quote style.
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
257 def preferredencoding():
258 """Get preferred encoding.
260 Returns the best encoding scheme for the system, based on
261 locale.getpreferredencoding() and some further tweaks.
264 pref = locale.getpreferredencoding()
272 def write_json_file(obj, fn):
273 """ Encode obj as JSON and write it to fn, atomically if possible """
275 fn = encodeFilename(fn)
276 if sys.version_info < (3, 0) and sys.platform != 'win32
':
277 encoding = get_filesystem_encoding()
278 # os.path.basename returns a bytes object, but NamedTemporaryFile
279 # will fail if the filename contains non ascii characters unless we
280 # use a unicode object
281 path_basename = lambda f: os.path.basename(fn).decode(encoding)
282 # the same for os.path.dirname
283 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
285 path_basename = os.path.basename
286 path_dirname = os.path.dirname
290 'prefix
': path_basename(fn) + '.',
291 'dir': path_dirname(fn),
295 # In Python 2.x, json.dump expects a bytestream.
296 # In Python 3.x, it writes to a character stream
297 if sys.version_info < (3, 0):
305 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
309 json.dump(obj, tf, ensure_ascii=False)
310 if sys.platform == 'win32
':
311 # Need to remove existing file on Windows, else os.rename raises
312 # WindowsError or FileExistsError.
320 os.chmod(tf.name, 0o666 & ~mask)
323 os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """Find the first element matching xpath[@key] (or xpath[@key='val'])."""
        # Keys are spliced verbatim into the XPath expression, so restrict
        # them to a safe character set.
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
339 def find_xpath_attr(node, xpath, key, val=None):
340 for f in node.findall(compat_xpath(xpath)):
341 if key not in f.attrib:
343 if val is None or f.attrib.get(key) == val:
347 # On python2.6 the xml.etree.ElementTree.Element methods don't support
348 # the namespace parameter
351 def xpath_with_ns(path
, ns_map
):
352 components
= [c
.split(':') for c
in path
.split('/')]
356 replaced
.append(c
[0])
359 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
360 return '/'.join(replaced
)
363 def xpath_element(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
364 def _find_xpath(xpath
):
365 return node
.find(compat_xpath(xpath
))
367 if isinstance(xpath
, (str, compat_str
)):
368 n
= _find_xpath(xpath
)
376 if default
is not NO_DEFAULT
:
379 name
= xpath
if name
is None else name
380 raise ExtractorError('Could not find XML element %s' % name
)
386 def xpath_text(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
387 n
= xpath_element(node
, xpath
, name
, fatal
=fatal
, default
=default
)
388 if n
is None or n
== default
:
391 if default
is not NO_DEFAULT
:
394 name
= xpath
if name
is None else name
395 raise ExtractorError('Could not find XML element\'s text %s' % name
)
401 def xpath_attr(node
, xpath
, key
, name
=None, fatal
=False, default
=NO_DEFAULT
):
402 n
= find_xpath_attr(node
, xpath
, key
)
404 if default
is not NO_DEFAULT
:
407 name
= '%s[@%s]' % (xpath
, key
) if name
is None else name
408 raise ExtractorError('Could not find XML attribute %s' % name
)
def get_element_by_id(id, html):
    """Return the content of the tag carrying the given id attribute in *html*."""
    # Delegate to the generic attribute matcher.
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the given class in *html*, or None."""
    hits = get_elements_by_class(class_name, html)
    if not hits:
        return None
    return hits[0]
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag where attribute == value, or None."""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if matches:
        return matches[0]
    return None
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list."""
    # Match the class name as a whole word anywhere inside the attribute value.
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_pattern, html, escape_value=False)
437 def get_elements_by_attribute(attribute, value, html, escape_value=True):
438 """Return the content of the tag with the specified attribute in the passed HTML document"""
440 value = re.escape(value) if escape_value else value
443 for m in re.finditer(r'''(?xs)
445 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^
']*'|
))*?
447 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^
"]*"|
='[^']*'|))*?
451 ''' % (re.escape(attribute), value), html):
452 res = m.group('content
')
454 if res.startswith('"') or res.startswith("'"):
457 retlist.append(unescapeHTML(res))
462 class HTMLAttributeParser(compat_HTMLParser):
463 """Trivial HTML parser to gather the attributes for a single element"""
467 compat_HTMLParser.__init__(self)
469 def handle_starttag(self, tag, attrs):
470 self.attrs = dict(attrs)
473 class HTMLListAttrsParser(compat_HTMLParser):
474 """HTML parser to gather the attributes for the elements of a list"""
477 compat_HTMLParser.__init__(self)
481 def handle_starttag(self, tag, attrs):
482 if tag == 'li
' and self._level == 0:
483 self.items.append(dict(attrs))
486 def handle_endtag(self, tag):
490 def extract_attributes(html_element):
491 """Given a string for an HTML element such as
493 a="foo" B="bar" c="&98;az" d=boz
494 empty= noval entity="&"
497 Decode and return a dictionary of attributes.
499 'a
': 'foo
', 'b
': 'bar
', c: 'baz
', d: 'boz
',
500 'empty
': '', 'noval
': None, 'entity
': '&',
501 'sq
': '"', 'dq': '\''
503 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
504 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
506 parser = HTMLAttributeParser()
508 parser.feed(html_element)
510 # Older Python may throw HTMLParseError in case of malformed HTML
511 except compat_HTMLParseError:
516 def parse_list(webpage):
517 """Given a string for an series of HTML <li> elements,
518 return a dictionary of their attributes"""
519 parser = HTMLListAttrsParser()
525 def clean_html(html):
526 """Clean an HTML snippet into a readable string"""
528 if html is None: # Convenience for sanitizing descriptions etc.
532 html = html.replace('\n', ' ')
533 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
534 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
536 html = re.sub('<.*?>', '', html)
537 # Replace html entities
538 html = unescapeHTML(html)
542 def sanitize_open(filename, open_mode):
543 """Try to open the given filename, and slightly tweak it if this fails.
545 Attempts to open the given filename. If this fails, it tries to change
546 the filename slightly, step by step, until it's either able to open it
547 or it fails and raises a final exception, like the standard open()
550 It returns the tuple (stream, definitive_file_name).
554 if sys.platform == 'win32':
556 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
557 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
558 stream = open(encodeFilename(filename), open_mode)
559 return (stream, filename)
560 except (IOError, OSError) as err:
561 if err.errno in (errno.EACCES,):
564 # In case of error, try to remove win32 forbidden chars
565 alt_filename = sanitize_path(filename)
566 if alt_filename == filename:
569 # An exception here should be caught in the caller
570 stream = open(encodeFilename(alt_filename), open_mode)
571 return (stream, alt_filename)
574 def timeconvert(timestr):
575 """Convert RFC 2822 defined time string into system timestamp"""
577 timetuple = email.utils.parsedate_tz(timestr)
578 if timetuple is not None:
579 timestamp = email.utils.mktime_tz(timetuple)
583 def sanitize_filename(s, restricted=False, is_id=False):
584 """Sanitizes a string so it could be used as part of a filename.
585 If restricted is set, use a stricter subset of allowed characters.
586 Set is_id if this is not an arbitrary string, but an ID that should be kept
589 def replace_insane(char):
590 if restricted and char in ACCENT_CHARS:
591 return ACCENT_CHARS[char]
592 elif not restricted and char == '\n':
594 elif char == '?' or ord(char) < 32 or ord(char) == 127:
597 return '' if restricted else '\''
599 return '_
-' if restricted else ' -'
600 elif char in '\\/|
*<>':
602 if restricted and (char in '!&\'()[]{}$
;`^
,#' or char.isspace()):
604 if restricted
and ord(char
) > 127:
611 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
612 result
= ''.join(map(replace_insane
, s
))
614 while '__' in result
:
615 result
= result
.replace('__', '_')
616 result
= result
.strip('_')
617 # Common case of "Foreign band name - English song title"
618 if restricted
and result
.startswith('-_'):
620 if result
.startswith('-'):
621 result
= '_' + result
[len('-'):]
622 result
= result
.lstrip('.')
628 def sanitize_path(s
, force
=False):
629 """Sanitizes and normalizes path on Windows"""
630 if sys
.platform
== 'win32':
632 drive_or_unc
, _
= os
.path
.splitdrive(s
)
633 if sys
.version_info
< (2, 7) and not drive_or_unc
:
634 drive_or_unc
, _
= os
.path
.splitunc(s
)
640 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
644 path_part
if path_part
in ['.', '..'] else re
.sub(r
'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part
)
645 for path_part
in norm_path
]
647 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
648 elif force
and s
[0] == os
.path
.sep
:
649 sanitized_path
.insert(0, os
.path
.sep
)
650 return os
.path
.join(*sanitized_path
)
653 def sanitize_url(url
):
654 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
655 # the number of unwanted failures due to missing protocol
656 if url
.startswith('//'):
657 return 'http:%s' % url
658 # Fix some common typos seen so far
660 # https://github.com/ytdl-org/youtube-dl/issues/15649
661 (r
'^httpss://', r
'https://'),
662 # https://bx1.be/lives/direct-tv/
663 (r
'^rmtp([es]?)://', r
'rtmp\1://'),
665 for mistake
, fixup
in COMMON_TYPOS
:
666 if re
.match(mistake
, url
):
667 return re
.sub(mistake
, fixup
, url
)
671 def extract_basic_auth(url
):
672 parts
= compat_urlparse
.urlsplit(url
)
673 if parts
.username
is None:
675 url
= compat_urlparse
.urlunsplit(parts
._replace
(netloc
=(
676 parts
.hostname
if parts
.port
is None
677 else '%s:%d' % (parts
.hostname
, parts
.port
))))
678 auth_payload
= base64
.b64encode(
679 ('%s:%s' % (parts
.username
, parts
.password
or '')).encode('utf-8'))
680 return url
, 'Basic ' + auth_payload
.decode('utf-8')
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request for *url* after sanitizing and escaping it.

    Credentials embedded in the URL are stripped out and re-sent as a
    Basic Authorization header instead.
    """
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        # Request(url, data, headers): a positional headers dict sits at args[1].
        if len(args) >= 2:
            headers = args[1]
        else:
            headers = kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)
692 """Expand shell variables and ~"""
693 return os
.path
.expandvars(compat_expanduser(s
))
696 def orderedSet(iterable
):
697 """ Remove all duplicates from the input iterable """
705 def _htmlentity_transform(entity_with_semicolon
):
706 """Transforms an HTML entity to a character."""
707 entity
= entity_with_semicolon
[:-1]
709 # Known non-numeric HTML entity
710 if entity
in compat_html_entities
.name2codepoint
:
711 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
713 # TODO: HTML5 allows entities without a semicolon. For example,
714 # 'Éric' should be decoded as 'Éric'.
715 if entity_with_semicolon
in compat_html_entities_html5
:
716 return compat_html_entities_html5
[entity_with_semicolon
]
718 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
720 numstr
= mobj
.group(1)
721 if numstr
.startswith('x'):
723 numstr
= '0%s' % numstr
726 # See https://github.com/ytdl-org/youtube-dl/issues/7518
728 return compat_chr(int(numstr
, base
))
732 # Unknown entity in name, return its literal representation
733 return '&%s;' % entity
739 assert type(s
) == compat_str
742 r
'&([^&;]+;)', lambda m
: _htmlentity_transform(m
.group(1)), s
)
745 def escapeHTML(text
):
748 .replace('&', '&')
749 .replace('<', '<')
750 .replace('>', '>')
751 .replace('"', '"')
752 .replace("'", ''')
756 def process_communicate_or_kill(p
, *args
, **kwargs
):
758 return p
.communicate(*args
, **kwargs
)
759 except BaseException
: # Including KeyboardInterrupt
765 class Popen(subprocess
.Popen
):
766 if sys
.platform
== 'win32':
767 _startupinfo
= subprocess
.STARTUPINFO()
768 _startupinfo
.dwFlags |
= subprocess
.STARTF_USESHOWWINDOW
772 def __init__(self
, *args
, **kwargs
):
773 super(Popen
, self
).__init
__(*args
, **kwargs
, startupinfo
=self
._startupinfo
)
775 def communicate_or_kill(self
, *args
, **kwargs
):
776 return process_communicate_or_kill(self
, *args
, **kwargs
)
779 def get_subprocess_encoding():
780 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
781 # For subprocess calls, encode with locale encoding
782 # Refer to http://stackoverflow.com/a/9951851/35070
783 encoding
= preferredencoding()
785 encoding
= sys
.getfilesystemencoding()
791 def encodeFilename(s
, for_subprocess
=False):
793 @param s The name of the file
796 assert type(s
) == compat_str
798 # Python 3 has a Unicode API
799 if sys
.version_info
>= (3, 0):
802 # Pass '' directly to use Unicode APIs on Windows 2000 and up
803 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
804 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
805 if not for_subprocess
and sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
808 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
809 if sys
.platform
.startswith('java'):
812 return s
.encode(get_subprocess_encoding(), 'ignore')
815 def decodeFilename(b
, for_subprocess
=False):
817 if sys
.version_info
>= (3, 0):
820 if not isinstance(b
, bytes):
823 return b
.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode *s* for use as a subprocess argument."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, for_subprocess=True)
def decodeArgument(b):
    """Decode a subprocess argument back to text (inverse of encodeArgument)."""
    return decodeFilename(b, for_subprocess=True)
839 def decodeOption(optval
):
842 if isinstance(optval
, bytes):
843 optval
= optval
.decode(preferredencoding())
845 assert isinstance(optval
, compat_str
)
# Value type holding a clock time split into display components.
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a millisecond count into a Time(hours, minutes, seconds, milliseconds)."""
    total_seconds, milliseconds = divmod(msec, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
859 def formatSeconds(secs
, delim
=':', msec
=False):
860 time
= timetuple_from_msec(secs
* 1000)
862 ret
= '%d%s%02d%s%02d' % (time
.hours
, delim
, time
.minutes
, delim
, time
.seconds
)
864 ret
= '%d%s%02d' % (time
.minutes
, delim
, time
.seconds
)
866 ret
= '%d' % time
.seconds
867 return '%s.%03d' % (ret
, time
.milliseconds
) if msec
else ret
870 def _ssl_load_windows_store_certs(ssl_context
, storename
):
871 # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
873 certs
= [cert
for cert
, encoding
, trust
in ssl
.enum_certificates(storename
)
874 if encoding
== 'x509_asn' and (
875 trust
is True or ssl
.Purpose
.SERVER_AUTH
.oid
in trust
)]
876 except PermissionError
:
880 ssl_context
.load_verify_locations(cadata
=cert
)
885 def make_HTTPS_handler(params
, **kwargs
):
886 opts_check_certificate
= not params
.get('nocheckcertificate')
887 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLS_CLIENT
)
888 context
.check_hostname
= opts_check_certificate
889 context
.verify_mode
= ssl
.CERT_REQUIRED
if opts_check_certificate
else ssl
.CERT_NONE
890 if opts_check_certificate
:
892 context
.load_default_certs()
893 # Work around the issue in load_default_certs when there are bad certificates. See:
894 # https://github.com/yt-dlp/yt-dlp/issues/1060,
895 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
897 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
898 if sys
.platform
== 'win32' and hasattr(ssl
, 'enum_certificates'):
899 # Create a new context to discard any certificates that were already loaded
900 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLS_CLIENT
)
901 context
.check_hostname
, context
.verify_mode
= True, ssl
.CERT_REQUIRED
902 for storename
in ('CA', 'ROOT'):
903 _ssl_load_windows_store_certs(context
, storename
)
904 context
.set_default_verify_paths()
905 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
908 def bug_reports_message(before
=';'):
909 if ytdl_is_updateable():
910 update_cmd
= 'type yt-dlp -U to update'
912 update_cmd
= 'see https://github.com/yt-dlp/yt-dlp on how to update'
913 msg
= 'please report this issue on https://github.com/yt-dlp/yt-dlp .'
914 msg
+= ' Make sure you are using the latest version; %s.' % update_cmd
915 msg
+= ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'
917 before
= before
.rstrip()
918 if not before
or before
.endswith(('.', '!', '?')):
919 msg
= msg
[0].title() + msg
[1:]
921 return (before
+ ' ' if before
else '') + msg
924 class YoutubeDLError(Exception):
925 """Base exception for YoutubeDL errors."""
928 def __init__(self
, msg
=None):
931 elif self
.msg
is None:
932 self
.msg
= type(self
).__name
__
933 super().__init
__(self
.msg
)
# Exception types treated as network-level failures. ssl.CertificateError is
# added behind a hasattr guard because it is not present on every build.
network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)
942 class ExtractorError(YoutubeDLError
):
943 """Error during info extraction."""
945 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None, ie
=None):
946 """ tb, if given, is the original traceback (so that it can be printed out).
947 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
949 if sys
.exc_info()[0] in network_exceptions
:
954 self
.expected
= expected
956 self
.video_id
= video_id
958 self
.exc_info
= sys
.exc_info() # preserve original exception
960 super(ExtractorError
, self
).__init
__(''.join((
961 format_field(ie
, template
='[%s] '),
962 format_field(video_id
, template
='%s: '),
964 format_field(cause
, template
=' (caused by %r)'),
965 '' if expected
else bug_reports_message())))
967 def format_traceback(self
):
968 if self
.traceback
is None:
970 return ''.join(traceback
.format_tb(self
.traceback
))
class UnsupportedError(ExtractorError):
    """Raised for URLs that no extractor is able to handle."""

    def __init__(self, url):
        super().__init__('Unsupported URL: %s' % url, expected=True)
980 class RegexNotFoundError(ExtractorError
):
981 """Error when a regex didn't match"""
985 class GeoRestrictedError(ExtractorError
):
986 """Geographic restriction Error exception.
988 This exception may be thrown when a video is not available from your
989 geographic location due to geographic restrictions imposed by a website.
992 def __init__(self
, msg
, countries
=None, **kwargs
):
993 kwargs
['expected'] = True
994 super(GeoRestrictedError
, self
).__init
__(msg
, **kwargs
)
995 self
.countries
= countries
998 class DownloadError(YoutubeDLError
):
999 """Download Error exception.
1001 This exception may be thrown by FileDownloader objects if they are not
1002 configured to continue on errors. They will contain the appropriate
1006 def __init__(self
, msg
, exc_info
=None):
1007 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
1008 super(DownloadError
, self
).__init
__(msg
)
1009 self
.exc_info
= exc_info
1012 class EntryNotInPlaylist(YoutubeDLError
):
1013 """Entry not in playlist exception.
1015 This exception will be thrown by YoutubeDL when a requested entry
1016 is not found in the playlist info_dict
1018 msg
= 'Entry not found in info'
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            # Bug fix: the offending filename was previously dropped from the
            # message (the f-string held literal text instead of interpolating
            # the argument); mirror UnavailableVideoError's f': {err}' pattern.
            self.msg += f': {filename}'
        super().__init__(self.msg)
1035 class PostProcessingError(YoutubeDLError
):
1036 """Post Processing exception.
1038 This exception may be raised by PostProcessor's .run() method to
1039 indicate an error in the postprocessing task.
class DownloadCancelled(YoutubeDLError):
    """Raised when the download queue should be interrupted; base class for
    the --break-*/--max-downloads stop conditions."""
    msg = 'The download was cancelled'
class ExistingVideoReached(DownloadCancelled):
    """Stop condition raised when --break-on-existing triggers."""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
class RejectedVideoReached(DownloadCancelled):
    """Stop condition raised when --break-on-reject triggers."""
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
class MaxDownloadsReached(DownloadCancelled):
    """Stop condition raised when the --max-downloads limit is hit."""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
class ReExtractInfo(YoutubeDLError):
    """Raised when the video info needs to be re-extracted."""

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        # expected=True marks this as a normal condition rather than a bug.
        self.expected = expected
1071 class ThrottledDownload(ReExtractInfo
):
1072 """ Download speed below --throttled-rate. """
1073 msg
= 'The download speed is below throttle limit'
1076 super().__init
__(self
.msg
, expected
=False)
1079 class UnavailableVideoError(YoutubeDLError
):
1080 """Unavailable Format exception.
1082 This exception will be thrown when a video is requested
1083 in a format that is not available for that video.
1085 msg
= 'Unable to download video'
1087 def __init__(self
, err
=None):
1089 self
.msg
+= f
': {err}'
1090 super().__init
__(self
.msg
)
1093 class ContentTooShortError(YoutubeDLError
):
1094 """Content Too Short exception.
1096 This exception may be raised by FileDownloader objects when a file they
1097 download is too small for what the server announced first, indicating
1098 the connection was probably interrupted.
1101 def __init__(self
, downloaded
, expected
):
1102 super(ContentTooShortError
, self
).__init
__(
1103 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded
, expected
)
1106 self
.downloaded
= downloaded
1107 self
.expected
= expected
1110 class XAttrMetadataError(YoutubeDLError
):
1111 def __init__(self
, code
=None, msg
='Unknown error'):
1112 super(XAttrMetadataError
, self
).__init
__(msg
)
1116 # Parsing code and msg
1117 if (self
.code
in (errno
.ENOSPC
, errno
.EDQUOT
)
1118 or 'No space left' in self
.msg
or 'Disk quota exceeded' in self
.msg
):
1119 self
.reason
= 'NO_SPACE'
1120 elif self
.code
== errno
.E2BIG
or 'Argument list too long' in self
.msg
:
1121 self
.reason
= 'VALUE_TOO_LONG'
1123 self
.reason
= 'NOT_SUPPORTED'
1126 class XAttrUnavailableError(YoutubeDLError
):
1130 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
1131 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
1132 # expected HTTP responses to meet HTTP/1.0 or later (see also
1133 # https://github.com/ytdl-org/youtube-dl/issues/6727)
1134 if sys
.version_info
< (3, 0):
1135 kwargs
['strict'] = True
1136 hc
= http_class(*args
, **compat_kwargs(kwargs
))
1137 source_address
= ydl_handler
._params
.get('source_address')
1139 if source_address
is not None:
1140 # This is to workaround _create_connection() from socket where it will try all
1141 # address data from getaddrinfo() including IPv6. This filters the result from
1142 # getaddrinfo() based on the source_address value.
1143 # This is based on the cpython socket.create_connection() function.
1144 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1145 def _create_connection(address
, timeout
=socket
._GLOBAL
_DEFAULT
_TIMEOUT
, source_address
=None):
1146 host
, port
= address
1148 addrs
= socket
.getaddrinfo(host
, port
, 0, socket
.SOCK_STREAM
)
1149 af
= socket
.AF_INET
if '.' in source_address
[0] else socket
.AF_INET6
1150 ip_addrs
= [addr
for addr
in addrs
if addr
[0] == af
]
1151 if addrs
and not ip_addrs
:
1152 ip_version
= 'v4' if af
== socket
.AF_INET
else 'v6'
1154 "No remote IP%s addresses available for connect, can't use '%s' as source address"
1155 % (ip_version
, source_address
[0]))
1156 for res
in ip_addrs
:
1157 af
, socktype
, proto
, canonname
, sa
= res
1160 sock
= socket
.socket(af
, socktype
, proto
)
1161 if timeout
is not socket
._GLOBAL
_DEFAULT
_TIMEOUT
:
1162 sock
.settimeout(timeout
)
1163 sock
.bind(source_address
)
1165 err
= None # Explicitly break reference cycle
1167 except socket
.error
as _
:
1169 if sock
is not None:
1174 raise socket
.error('getaddrinfo returns an empty list')
1175 if hasattr(hc
, '_create_connection'):
1176 hc
._create
_connection
= _create_connection
1177 sa
= (source_address
, 0)
1178 if hasattr(hc
, 'source_address'): # Python 2.7+
1179 hc
.source_address
= sa
1181 def _hc_connect(self
, *args
, **kwargs
):
1182 sock
= _create_connection(
1183 (self
.host
, self
.port
), self
.timeout
, sa
)
1185 self
.sock
= ssl
.wrap_socket(
1186 sock
, self
.key_file
, self
.cert_file
,
1187 ssl_version
=ssl
.PROTOCOL_TLSv1
)
1190 hc
.connect
= functools
.partial(_hc_connect
, hc
)
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, return a new mapping without it and without
    any Accept-Encoding header (so the request is sent uncompressed);
    otherwise return *headers* unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    result = {k: v for k, v in headers.items() if k.lower() != 'accept-encoding'}
    del result['Youtubedl-no-compression']
    return result
1205 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
1206 """Handler for HTTP requests and responses.
1208 This class, when installed with an OpenerDirector, automatically adds
1209 the standard headers to every HTTP request and handles gzipped and
1210 deflated responses from web servers. If compression is to be avoided in
1211 a particular request, the original request in the program code only has
1212 to include the HTTP header "Youtubedl-no-compression", which will be
1213 removed before making the real request.
1215 Part of this code was copied from:
1217 http://techknack.net/python-urllib2-handlers/
1219 Andrew Rowls, the author of that code, agreed to release it to the
    def __init__(self, params, *args, **kwargs):
        """Create the handler; *params* is the yt-dlp parameter dict."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # Kept for per-connection decisions (e.g. the 'source_address'
        # lookup in _create_http_connection).
        self._params = params
1227 def http_open(self
, req
):
1228 conn_class
= compat_http_client
.HTTPConnection
1230 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
1232 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
1233 del req
.headers
['Ytdl-socks-proxy']
1235 return self
.do_open(functools
.partial(
1236 _create_http_connection
, self
, conn_class
, False),
1244 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
1246 return zlib
.decompress(data
)
1248 def http_request(self
, req
):
1249 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1250 # always respected by websites, some tend to give out URLs with non percent-encoded
1251 # non-ASCII characters (see telemb.py, ard.py [#3412])
1252 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1253 # To work around aforementioned issue we will replace request's original URL with
1254 # percent-encoded one
1255 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1256 # the code of this workaround has been moved here from YoutubeDL.urlopen()
1257 url
= req
.get_full_url()
1258 url_escaped
= escape_url(url
)
1260 # Substitute URL if any change after escaping
1261 if url
!= url_escaped
:
1262 req
= update_Request(req
, url
=url_escaped
)
1264 for h
, v
in std_headers
.items():
1265 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1266 # The dict keys are capitalized because of this bug by urllib
1267 if h
.capitalize() not in req
.headers
:
1268 req
.add_header(h
, v
)
1270 req
.headers
= handle_youtubedl_headers(req
.headers
)
1272 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
1273 # Python 2.6 is brain-dead when it comes to fragments
1274 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
1275 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
1279 def http_response(self
, req
, resp
):
1282 if resp
.headers
.get('Content-encoding', '') == 'gzip':
1283 content
= resp
.read()
1284 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
1286 uncompressed
= io
.BytesIO(gz
.read())
1287 except IOError as original_ioerror
:
1288 # There may be junk add the end of the file
1289 # See http://stackoverflow.com/q/4928560/35070 for details
1290 for i
in range(1, 1024):
1292 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
1293 uncompressed
= io
.BytesIO(gz
.read())
1298 raise original_ioerror
1299 resp
= compat_urllib_request
.addinfourl(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
1300 resp
.msg
= old_resp
.msg
1301 del resp
.headers
['Content-encoding']
1303 if resp
.headers
.get('Content-encoding', '') == 'deflate':
1304 gz
= io
.BytesIO(self
.deflate(resp
.read()))
1305 resp
= compat_urllib_request
.addinfourl(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
1306 resp
.msg
= old_resp
.msg
1307 del resp
.headers
['Content-encoding']
1308 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1309 # https://github.com/ytdl-org/youtube-dl/issues/6457).
1310 if 300 <= resp
.code
< 400:
1311 location
= resp
.headers
.get('Location')
1313 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1314 if sys
.version_info
>= (3, 0):
1315 location
= location
.encode('iso-8859-1').decode('utf-8')
1317 location
= location
.decode('utf-8')
1318 location_escaped
= escape_url(location
)
1319 if location
!= location_escaped
:
1320 del resp
.headers
['Location']
1321 if sys
.version_info
< (3, 0):
1322 location_escaped
= location_escaped
.encode('utf-8')
1323 resp
.headers
['Location'] = location_escaped
1326 https_request
= http_request
1327 https_response
= http_response
1330 def make_socks_conn_class(base_class
, socks_proxy
):
1331 assert issubclass(base_class
, (
1332 compat_http_client
.HTTPConnection
, compat_http_client
.HTTPSConnection
))
1334 url_components
= compat_urlparse
.urlparse(socks_proxy
)
1335 if url_components
.scheme
.lower() == 'socks5':
1336 socks_type
= ProxyType
.SOCKS5
1337 elif url_components
.scheme
.lower() in ('socks', 'socks4'):
1338 socks_type
= ProxyType
.SOCKS4
1339 elif url_components
.scheme
.lower() == 'socks4a':
1340 socks_type
= ProxyType
.SOCKS4A
1342 def unquote_if_non_empty(s
):
1345 return compat_urllib_parse_unquote_plus(s
)
1349 url_components
.hostname
, url_components
.port
or 1080,
1351 unquote_if_non_empty(url_components
.username
),
1352 unquote_if_non_empty(url_components
.password
),
1355 class SocksConnection(base_class
):
1357 self
.sock
= sockssocket()
1358 self
.sock
.setproxy(*proxy_args
)
1359 if type(self
.timeout
) in (int, float):
1360 self
.sock
.settimeout(self
.timeout
)
1361 self
.sock
.connect((self
.host
, self
.port
))
1363 if isinstance(self
, compat_http_client
.HTTPSConnection
):
1364 if hasattr(self
, '_context'): # Python > 2.6
1365 self
.sock
= self
._context
.wrap_socket(
1366 self
.sock
, server_hostname
=self
.host
)
1368 self
.sock
= ssl
.wrap_socket(self
.sock
)
1370 return SocksConnection
1373 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
1374 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
1375 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
1376 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
1377 self
._params
= params
1379 def https_open(self
, req
):
1381 conn_class
= self
._https
_conn
_class
1383 if hasattr(self
, '_context'): # python > 2.6
1384 kwargs
['context'] = self
._context
1385 if hasattr(self
, '_check_hostname'): # python 3.x
1386 kwargs
['check_hostname'] = self
._check
_hostname
1388 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
1390 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
1391 del req
.headers
['Ytdl-socks-proxy']
1393 return self
.do_open(functools
.partial(
1394 _create_http_connection
, self
, conn_class
, True),
1398 class YoutubeDLCookieJar(compat_cookiejar
.MozillaCookieJar
):
1400 See [1] for cookie file format.
1402 1. https://curl.haxx.se/docs/http-cookies.html
1404 _HTTPONLY_PREFIX
= '#HttpOnly_'
1406 _HEADER
= '''# Netscape HTTP Cookie File
1407 # This file is generated by yt-dlp. Do not edit.
1410 _CookieFileEntry
= collections
.namedtuple(
1412 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
1414 def save(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
1416 Save cookies to a file.
1418 Most of the code is taken from CPython 3.8 and slightly adapted
1419 to support cookie files with UTF-8 in both python 2 and 3.
1421 if filename
is None:
1422 if self
.filename
is not None:
1423 filename
= self
.filename
1425 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
1427 # Store session cookies with `expires` set to 0 instead of an empty
1430 if cookie
.expires
is None:
1433 with io
.open(filename
, 'w', encoding
='utf-8') as f
:
1434 f
.write(self
._HEADER
)
1437 if not ignore_discard
and cookie
.discard
:
1439 if not ignore_expires
and cookie
.is_expired(now
):
1445 if cookie
.domain
.startswith('.'):
1446 initial_dot
= 'TRUE'
1448 initial_dot
= 'FALSE'
1449 if cookie
.expires
is not None:
1450 expires
= compat_str(cookie
.expires
)
1453 if cookie
.value
is None:
1454 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1455 # with no name, whereas http.cookiejar regards it as a
1456 # cookie with no value.
1461 value
= cookie
.value
1463 '\t'.join([cookie
.domain
, initial_dot
, cookie
.path
,
1464 secure
, expires
, name
, value
]) + '\n')
1466 def load(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
1467 """Load cookies from a file."""
1468 if filename
is None:
1469 if self
.filename
is not None:
1470 filename
= self
.filename
1472 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
1474 def prepare_line(line
):
1475 if line
.startswith(self
._HTTPONLY
_PREFIX
):
1476 line
= line
[len(self
._HTTPONLY
_PREFIX
):]
1477 # comments and empty lines are fine
1478 if line
.startswith('#') or not line
.strip():
1480 cookie_list
= line
.split('\t')
1481 if len(cookie_list
) != self
._ENTRY
_LEN
:
1482 raise compat_cookiejar
.LoadError('invalid length %d' % len(cookie_list
))
1483 cookie
= self
._CookieFileEntry
(*cookie_list
)
1484 if cookie
.expires_at
and not cookie
.expires_at
.isdigit():
1485 raise compat_cookiejar
.LoadError('invalid expires at %s' % cookie
.expires_at
)
1489 with io
.open(filename
, encoding
='utf-8') as f
:
1492 cf
.write(prepare_line(line
))
1493 except compat_cookiejar
.LoadError
as e
:
1495 'WARNING: skipping cookie file entry due to %s: %r\n'
1496 % (e
, line
), sys
.stderr
)
1499 self
._really
_load
(cf
, filename
, ignore_discard
, ignore_expires
)
1500 # Session cookies are denoted by either `expires` field set to
1501 # an empty string or 0. MozillaCookieJar only recognizes the former
1502 # (see [1]). So we need force the latter to be recognized as session
1503 # cookies on our own.
1504 # Session cookies may be important for cookies-based authentication,
1505 # e.g. usually, when user does not check 'Remember me' check box while
1506 # logging in on a site, some important cookies are stored as session
1507 # cookies so that not recognizing them will result in failed login.
1508 # 1. https://bugs.python.org/issue17164
1510 # Treat `expires=0` cookies as session cookies
1511 if cookie
.expires
== 0:
1512 cookie
.expires
= None
1513 cookie
.discard
= True
1516 class YoutubeDLCookieProcessor(compat_urllib_request
.HTTPCookieProcessor
):
1517 def __init__(self
, cookiejar
=None):
1518 compat_urllib_request
.HTTPCookieProcessor
.__init
__(self
, cookiejar
)
1520 def http_response(self
, request
, response
):
1521 # Python 2 will choke on next HTTP request in row if there are non-ASCII
1522 # characters in Set-Cookie HTTP header of last response (see
1523 # https://github.com/ytdl-org/youtube-dl/issues/6769).
1524 # In order to at least prevent crashing we will percent encode Set-Cookie
1525 # header before HTTPCookieProcessor starts processing it.
1526 # if sys.version_info < (3, 0) and response.headers:
1527 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1528 # set_cookie = response.headers.get(set_cookie_header)
1530 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1531 # if set_cookie != set_cookie_escaped:
1532 # del response.headers[set_cookie_header]
1533 # response.headers[set_cookie_header] = set_cookie_escaped
1534 return compat_urllib_request
.HTTPCookieProcessor
.http_response(self
, request
, response
)
1536 https_request
= compat_urllib_request
.HTTPCookieProcessor
.http_request
1537 https_response
= http_response
1540 class YoutubeDLRedirectHandler(compat_urllib_request
.HTTPRedirectHandler
):
1541 """YoutubeDL redirect handler
1543 The code is based on HTTPRedirectHandler implementation from CPython [1].
1545 This redirect handler solves two issues:
1546 - ensures redirect URL is always unicode under python 2
1547 - introduces support for experimental HTTP response status code
1548 308 Permanent Redirect [2] used by some sites [3]
1550 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1551 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1552 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1555 http_error_301
= http_error_303
= http_error_307
= http_error_308
= compat_urllib_request
.HTTPRedirectHandler
.http_error_302
1557 def redirect_request(self
, req
, fp
, code
, msg
, headers
, newurl
):
1558 """Return a Request or None in response to a redirect.
1560 This is called by the http_error_30x methods when a
1561 redirection response is received. If a redirection should
1562 take place, return a new Request to allow http_error_30x to
1563 perform the redirect. Otherwise, raise HTTPError if no-one
1564 else should try to handle this url. Return None if you can't
1565 but another Handler might.
1567 m
= req
.get_method()
1568 if (not (code
in (301, 302, 303, 307, 308) and m
in ("GET", "HEAD")
1569 or code
in (301, 302, 303) and m
== "POST")):
1570 raise compat_HTTPError(req
.full_url
, code
, msg
, headers
, fp
)
1571 # Strictly (according to RFC 2616), 301 or 302 in response to
1572 # a POST MUST NOT cause a redirection without confirmation
1573 # from the user (of urllib.request, in this case). In practice,
1574 # essentially all clients do redirect in this case, so we do
1577 # On python 2 urlh.geturl() may sometimes return redirect URL
1578 # as byte string instead of unicode. This workaround allows
1579 # to force it always return unicode.
1580 if sys
.version_info
[0] < 3:
1581 newurl
= compat_str(newurl
)
1583 # Be conciliant with URIs containing a space. This is mainly
1584 # redundant with the more complete encoding done in http_error_302(),
1585 # but it is kept for compatibility with other callers.
1586 newurl
= newurl
.replace(' ', '%20')
1588 CONTENT_HEADERS
= ("content-length", "content-type")
1589 # NB: don't use dict comprehension for python 2.6 compatibility
1590 newheaders
= dict((k
, v
) for k
, v
in req
.headers
.items()
1591 if k
.lower() not in CONTENT_HEADERS
)
1592 return compat_urllib_request
.Request(
1593 newurl
, headers
=newheaders
, origin_req_host
=req
.origin_req_host
,
1597 def extract_timezone(date_str
):
1600 ^.{8,}? # >=8 char non-TZ prefix, if present
1601 (?P<tz>Z| # just the UTC Z, or
1602 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1603 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1604 [ ]? # optional space
1605 (?P<sign>\+|-) # +/-
1606 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1610 timezone
= datetime
.timedelta()
1612 date_str
= date_str
[:-len(m
.group('tz'))]
1613 if not m
.group('sign'):
1614 timezone
= datetime
.timedelta()
1616 sign
= 1 if m
.group('sign') == '+' else -1
1617 timezone
= datetime
.timedelta(
1618 hours
=sign
* int(m
.group('hours')),
1619 minutes
=sign
* int(m
.group('minutes')))
1620 return timezone
, date_str
1623 def parse_iso8601(date_str
, delimiter
='T', timezone
=None):
1624 """ Return a UNIX timestamp from the given date """
1626 if date_str
is None:
1629 date_str
= re
.sub(r
'\.[0-9]+', '', date_str
)
1631 if timezone
is None:
1632 timezone
, date_str
= extract_timezone(date_str
)
1635 date_format
= '%Y-%m-%d{0}%H:%M:%S'.format(delimiter
)
1636 dt
= datetime
.datetime
.strptime(date_str
, date_format
) - timezone
1637 return calendar
.timegm(dt
.timetuple())
def date_formats(day_first=True):
    """Pick the list of date formats matching the requested day/month ordering."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1646 def unified_strdate(date_str
, day_first
=True):
1647 """Return a string with the date in the format YYYYMMDD"""
1649 if date_str
is None:
1653 date_str
= date_str
.replace(',', ' ')
1654 # Remove AM/PM + timezone
1655 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
1656 _
, date_str
= extract_timezone(date_str
)
1658 for expression
in date_formats(day_first
):
1660 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
1663 if upload_date
is None:
1664 timetuple
= email
.utils
.parsedate_tz(date_str
)
1667 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
1670 if upload_date
is not None:
1671 return compat_str(upload_date
)
1674 def unified_timestamp(date_str
, day_first
=True):
1675 if date_str
is None:
1678 date_str
= re
.sub(r
'[,|]', '', date_str
)
1680 pm_delta
= 12 if re
.search(r
'(?i)PM', date_str
) else 0
1681 timezone
, date_str
= extract_timezone(date_str
)
1683 # Remove AM/PM + timezone
1684 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
1686 # Remove unrecognized timezones from ISO 8601 alike timestamps
1687 m
= re
.search(r
'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str
)
1689 date_str
= date_str
[:-len(m
.group('tz'))]
1691 # Python only supports microseconds, so remove nanoseconds
1692 m
= re
.search(r
'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str
)
1694 date_str
= m
.group(1)
1696 for expression
in date_formats(day_first
):
1698 dt
= datetime
.datetime
.strptime(date_str
, expression
) - timezone
+ datetime
.timedelta(hours
=pm_delta
)
1699 return calendar
.timegm(dt
.timetuple())
1702 timetuple
= email
.utils
.parsedate_tz(date_str
)
1704 return calendar
.timegm(timetuple
) + pm_delta
* 3600
1707 def determine_ext(url
, default_ext
='unknown_video'):
1708 if url
is None or '.' not in url
:
1710 guess
= url
.partition('?')[0].rpartition('.')[2]
1711 if re
.match(r
'^[A-Za-z0-9]+$', guess
):
1713 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1714 elif guess
.rstrip('/') in KNOWN_EXTENSIONS
:
1715 return guess
.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build a subtitle file name by swapping the media extension for '<lang>.<format>'."""
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
1724 def datetime_from_str(date_str
, precision
='auto', format
='%Y%m%d'):
1726 Return a datetime object from a string in the format YYYYMMDD or
1727 (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1729 format: string date format used to return datetime object from
1730 precision: round the time portion of a datetime object.
1731 auto|microsecond|second|minute|hour|day.
1732 auto: round to the unit provided in date_str (if applicable).
1734 auto_precision
= False
1735 if precision
== 'auto':
1736 auto_precision
= True
1737 precision
= 'microsecond'
1738 today
= datetime_round(datetime
.datetime
.now(), precision
)
1739 if date_str
in ('now', 'today'):
1741 if date_str
== 'yesterday':
1742 return today
- datetime
.timedelta(days
=1)
1744 r
'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1746 if match
is not None:
1747 start_time
= datetime_from_str(match
.group('start'), precision
, format
)
1748 time
= int(match
.group('time')) * (-1 if match
.group('sign') == '-' else 1)
1749 unit
= match
.group('unit')
1750 if unit
== 'month' or unit
== 'year':
1751 new_date
= datetime_add_months(start_time
, time
* 12 if unit
== 'year' else time
)
1757 delta
= datetime
.timedelta(**{unit + 's': time}
)
1758 new_date
= start_time
+ delta
1760 return datetime_round(new_date
, unit
)
1763 return datetime_round(datetime
.datetime
.strptime(date_str
, format
), precision
)
def date_from_str(date_str, format='%Y%m%d'):
    """
    Return a datetime.date from a string in the format YYYYMMDD or
    (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: string date format used to parse date_str
    """
    full_datetime = datetime_from_str(date_str, precision='microsecond', format=format)
    return full_datetime.date()
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months.

    Clamps the day to the last valid day of the target month
    (e.g. Jan 31 + 1 month -> Feb 28/29).
    """
    # Work with a zero-based month index so // and % handle year rollover.
    total_months = dt.month - 1 + months
    new_year = dt.year + total_months // 12
    new_month = total_months % 12 + 1
    last_day = calendar.monthrange(new_year, new_month)[1]
    return dt.replace(new_year, new_month, min(dt.day, last_day))
1785 def datetime_round(dt
, precision
='day'):
1787 Round a datetime object's time to a specific precision
1789 if precision
== 'microsecond':
1798 roundto
= lambda x
, n
: ((x
+ n
/ 2) // n
) * n
1799 timestamp
= calendar
.timegm(dt
.timetuple())
1800 return datetime
.datetime
.utcfromtimestamp(roundto(timestamp
, unit_seconds
[precision
]))
1803 def hyphenate_date(date_str
):
1805 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1806 match
= re
.match(r
'^(\d\d\d\d)(\d\d)(\d\d)$', date_str
)
1807 if match
is not None:
1808 return '-'.join(match
.groups())
1813 class DateRange(object):
1814 """Represents a time interval between two dates"""
1816 def __init__(self
, start
=None, end
=None):
1817 """start and end must be strings in the format accepted by date"""
1818 if start
is not None:
1819 self
.start
= date_from_str(start
)
1821 self
.start
= datetime
.datetime
.min.date()
1823 self
.end
= date_from_str(end
)
1825 self
.end
= datetime
.datetime
.max.date()
1826 if self
.start
> self
.end
:
1827 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
1831 """Returns a range that only contains the given day"""
1832 return cls(day
, day
)
1834 def __contains__(self
, date
):
1835 """Check if the date is in the range"""
1836 if not isinstance(date
, datetime
.date
):
1837 date
= date_from_str(date
)
1838 return self
.start
<= date
<= self
.end
1841 return '%s - %s' % (self
.start
.isoformat(), self
.end
.isoformat())
1844 def platform_name():
1845 """ Returns the platform name as a compat_str """
1846 res
= platform
.platform()
1847 if isinstance(res
, bytes):
1848 res
= res
.decode(preferredencoding())
1850 assert isinstance(res
, compat_str
)
1854 def get_windows_version():
1855 ''' Get Windows version. None if it's not running on Windows '''
1856 if compat_os_name
== 'nt':
1857 return version_tuple(platform
.win32_ver()[1])
1862 def _windows_write_string(s
, out
):
1863 """ Returns True if the string was written using special methods,
1864 False if it has yet to be written out."""
1865 # Adapted from http://stackoverflow.com/a/3259271/35070
1867 import ctypes
.wintypes
1875 fileno
= out
.fileno()
1876 except AttributeError:
1877 # If the output stream doesn't have a fileno, it's virtual
1879 except io
.UnsupportedOperation
:
1880 # Some strange Windows pseudo files?
1882 if fileno
not in WIN_OUTPUT_IDS
:
1885 GetStdHandle
= compat_ctypes_WINFUNCTYPE(
1886 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
1887 ('GetStdHandle', ctypes
.windll
.kernel32
))
1888 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
1890 WriteConsoleW
= compat_ctypes_WINFUNCTYPE(
1891 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
1892 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
1893 ctypes
.wintypes
.LPVOID
)(('WriteConsoleW', ctypes
.windll
.kernel32
))
1894 written
= ctypes
.wintypes
.DWORD(0)
1896 GetFileType
= compat_ctypes_WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)(('GetFileType', ctypes
.windll
.kernel32
))
1897 FILE_TYPE_CHAR
= 0x0002
1898 FILE_TYPE_REMOTE
= 0x8000
1899 GetConsoleMode
= compat_ctypes_WINFUNCTYPE(
1900 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
1901 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
1902 ('GetConsoleMode', ctypes
.windll
.kernel32
))
1903 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
1905 def not_a_console(handle
):
1906 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
1908 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
1909 or GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
1911 if not_a_console(h
):
1914 def next_nonbmp_pos(s
):
1916 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
1917 except StopIteration:
1921 count
= min(next_nonbmp_pos(s
), 1024)
1923 ret
= WriteConsoleW(
1924 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
1926 raise OSError('Failed to write string')
1927 if not count
: # We just wrote a non-BMP character
1928 assert written
.value
== 2
1931 assert written
.value
> 0
1932 s
= s
[written
.value
:]
1936 def write_string(s
, out
=None, encoding
=None):
1939 assert type(s
) == compat_str
1941 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
1942 if _windows_write_string(s
, out
):
1945 if ('b' in getattr(out
, 'mode', '')
1946 or sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
1947 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
1949 elif hasattr(out
, 'buffer'):
1950 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
1951 byt
= s
.encode(enc
, 'ignore')
1952 out
.buffer.write(byt
)
1958 def bytes_to_intlist(bs
):
1961 if isinstance(bs
[0], int): # Python 3
1964 return [ord(c
) for c
in bs
]
1967 def intlist_to_bytes(xs
):
1970 return compat_struct_pack('%dB' % len(xs
), *xs
)
1973 # Cross-platform file locking
1974 if sys
.platform
== 'win32':
1975 import ctypes
.wintypes
1978 class OVERLAPPED(ctypes
.Structure
):
1980 ('Internal', ctypes
.wintypes
.LPVOID
),
1981 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
1982 ('Offset', ctypes
.wintypes
.DWORD
),
1983 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
1984 ('hEvent', ctypes
.wintypes
.HANDLE
),
1987 kernel32
= ctypes
.windll
.kernel32
1988 LockFileEx
= kernel32
.LockFileEx
1989 LockFileEx
.argtypes
= [
1990 ctypes
.wintypes
.HANDLE
, # hFile
1991 ctypes
.wintypes
.DWORD
, # dwFlags
1992 ctypes
.wintypes
.DWORD
, # dwReserved
1993 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
1994 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
1995 ctypes
.POINTER(OVERLAPPED
) # Overlapped
1997 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
1998 UnlockFileEx
= kernel32
.UnlockFileEx
1999 UnlockFileEx
.argtypes
= [
2000 ctypes
.wintypes
.HANDLE
, # hFile
2001 ctypes
.wintypes
.DWORD
, # dwReserved
2002 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
2003 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
2004 ctypes
.POINTER(OVERLAPPED
) # Overlapped
2006 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
2007 whole_low
= 0xffffffff
2008 whole_high
= 0x7fffffff
2010 def _lock_file(f
, exclusive
):
2011 overlapped
= OVERLAPPED()
2012 overlapped
.Offset
= 0
2013 overlapped
.OffsetHigh
= 0
2014 overlapped
.hEvent
= 0
2015 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
2016 handle
= msvcrt
.get_osfhandle(f
.fileno())
2017 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
2018 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
2019 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
2021 def _unlock_file(f
):
2022 assert f
._lock
_file
_overlapped
_p
2023 handle
= msvcrt
.get_osfhandle(f
.fileno())
2024 if not UnlockFileEx(handle
, 0,
2025 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
2026 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
2029 # Some platforms, such as Jython, is missing fcntl
2033 def _lock_file(f
, exclusive
):
2034 fcntl
.flock(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
2036 def _unlock_file(f
):
2037 fcntl
.flock(f
, fcntl
.LOCK_UN
)
2039 UNSUPPORTED_MSG
= 'file locking is not supported on this platform'
2041 def _lock_file(f
, exclusive
):
2042 raise IOError(UNSUPPORTED_MSG
)
2044 def _unlock_file(f
):
2045 raise IOError(UNSUPPORTED_MSG
)
2048 class locked_file(object):
2049 def __init__(self
, filename
, mode
, encoding
=None):
2050 assert mode
in ['r', 'a', 'w']
2051 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
2054 def __enter__(self
):
2055 exclusive
= self
.mode
!= 'r'
2057 _lock_file(self
.f
, exclusive
)
2063 def __exit__(self
, etype
, value
, traceback
):
2065 _unlock_file(self
.f
)
    def write(self, *args):
        # Delegate writes to the underlying file object opened in __init__.
        return self.f.write(*args)
    def read(self, *args):
        # Delegate reads to the underlying file object opened in __init__.
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to 'utf-8' when it is unknown."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
2084 def shell_quote(args
):
2086 encoding
= get_filesystem_encoding()
2088 if isinstance(a
, bytes):
2089 # We may get a filename encoded with 'encodeFilename'
2090 a
= a
.decode(encoding
)
2091 quoted_args
.append(compat_shlex_quote(a
))
2092 return ' '.join(quoted_args
)
2095 def smuggle_url(url
, data
):
2096 """ Pass additional data in a URL for internal use. """
2098 url
, idata
= unsmuggle_url(url
, {})
2100 sdata
= compat_urllib_parse_urlencode(
2101 {'__youtubedl_smuggle': json.dumps(data)}
)
2102 return url
+ '#' + sdata
2105 def unsmuggle_url(smug_url
, default
=None):
2106 if '#__youtubedl_smuggle' not in smug_url
:
2107 return smug_url
, default
2108 url
, _
, sdata
= smug_url
.rpartition('#')
2109 jsond
= compat_parse_qs(sdata
)['__youtubedl_smuggle'][0]
2110 data
= json
.loads(jsond
)
2114 def format_decimal_suffix(num
, fmt
='%d%s', *, factor
=1000):
2115 """ Formats numbers with decimal sufixes like K, M, etc """
2116 num
, factor
= float_or_none(num
), float(factor
)
2119 exponent
= 0 if num
== 0 else int(math
.log(num
, factor
))
2120 suffix
= ['', *'kMGTPEZY'][exponent
]
2122 suffix
= {'k': 'Ki', '': ''}
.get(suffix
, f
'{suffix}i')
2123 converted
= num
/ (factor
** exponent
)
2124 return fmt
% (converted
, suffix
)
def format_bytes(bytes):
    """Human-readable byte count (e.g. '1.23MiB'); 'N/A' when input is not numeric."""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2131 def lookup_unit_table(unit_table
, s
):
2132 units_re
= '|'.join(re
.escape(u
) for u
in unit_table
)
2134 r
'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re
, s
)
2137 num_str
= m
.group('num').replace(',', '.')
2138 mult
= unit_table
[m
.group('unit')]
2139 return int(float(num_str
) * mult
)
2142 def parse_filesize(s
):
2146 # The lower-case forms are of course incorrect and unofficial,
2147 # but we support those too
2164 'megabytes': 1000 ** 2,
2165 'mebibytes': 1024 ** 2,
2171 'gigabytes': 1000 ** 3,
2172 'gibibytes': 1024 ** 3,
2178 'terabytes': 1000 ** 4,
2179 'tebibytes': 1024 ** 4,
2185 'petabytes': 1000 ** 5,
2186 'pebibytes': 1024 ** 5,
2192 'exabytes': 1000 ** 6,
2193 'exbibytes': 1024 ** 6,
2199 'zettabytes': 1000 ** 7,
2200 'zebibytes': 1024 ** 7,
2206 'yottabytes': 1000 ** 8,
2207 'yobibytes': 1024 ** 8,
2210 return lookup_unit_table(_UNIT_TABLE
, s
)
2217 s
= re
.sub(r
'^[^\d]+\s', '', s
).strip()
2219 if re
.match(r
'^[\d,.]+$', s
):
2220 return str_to_int(s
)
2233 ret
= lookup_unit_table(_UNIT_TABLE
, s
)
2237 mobj
= re
.match(r
'([\d,.]+)(?:$|\s)', s
)
2239 return str_to_int(mobj
.group(1))
2242 def parse_resolution(s
):
2246 mobj
= re
.search(r
'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s
)
2249 'width': int(mobj
.group('w')),
2250 'height': int(mobj
.group('h')),
2253 mobj
= re
.search(r
'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s
)
2255 return {'height': int(mobj.group(1))}
2257 mobj
= re
.search(r
'\b([48])[kK]\b', s
)
2259 return {'height': int(mobj.group(1)) * 540}
2264 def parse_bitrate(s
):
2265 if not isinstance(s
, compat_str
):
2267 mobj
= re
.search(r
'\b(\d+)\s*kbps', s
)
2269 return int(mobj
.group(1))
2272 def month_by_name(name
, lang
='en'):
2273 """ Return the number of a month by (locale-independently) English name """
2275 month_names
= MONTH_NAMES
.get(lang
, MONTH_NAMES
['en'])
2278 return month_names
.index(name
) + 1
2283 def month_by_abbreviation(abbrev
):
2284 """ Return the number of a month by (locale-independently) English
2288 return [s
[:3] for s
in ENGLISH_MONTH_NAMES
].index(abbrev
) + 1
2293 def fix_xml_ampersands(xml_str
):
2294 """Replace all the '&' by '&' in XML"""
2296 r
'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2301 def setproctitle(title
):
2302 assert isinstance(title
, compat_str
)
2304 # ctypes in Jython is not complete
2305 # http://bugs.jython.org/issue2148
2306 if sys
.platform
.startswith('java'):
2310 libc
= ctypes
.cdll
.LoadLibrary('libc.so.6')
2314 # LoadLibrary in Windows Python 2.7.13 only expects
2315 # a bytestring, but since unicode_literals turns
2316 # every string into a unicode string, it fails.
2318 title_bytes
= title
.encode('utf-8')
2319 buf
= ctypes
.create_string_buffer(len(title_bytes
))
2320 buf
.value
= title_bytes
2322 libc
.prctl(15, buf
, 0, 0, 0)
2323 except AttributeError:
2324 return # Strange libc, just skip this
def remove_start(s, start):
    """Strip a leading `start` from s; None and non-matching strings pass through."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Strip a trailing `end` from s; None and non-matching strings pass through.

    Uses an explicit end index instead of s[:-len(end)] so that an empty
    `end` returns s unchanged (s[:-0] == s[:0] would wrongly yield '').
    """
    if s is not None and s.endswith(end):
        return s[:len(s) - len(end)]
    return s
2335 def remove_quotes(s
):
2336 if s
is None or len(s
) < 2:
2338 for quote
in ('"', "'", ):
2339 if s
[0] == quote
and s
[-1] == quote
:
def get_domain(url):
    """Return the bare domain of url (scheme and leading 'www.' removed), or None."""
    m = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if not m:
        return None
    return m.group('domain')
def url_basename(url):
    """Return the final path component of url ('' when the path is empty or just '/')."""
    parsed_path = compat_urlparse.urlparse(url).path
    return parsed_path.strip('/').split('/')[-1]
2355 return re
.match(r
'https?://[^?#&]+/', url
).group()
2358 def urljoin(base
, path
):
2359 if isinstance(path
, bytes):
2360 path
= path
.decode('utf-8')
2361 if not isinstance(path
, compat_str
) or not path
:
2363 if re
.match(r
'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path
):
2365 if isinstance(base
, bytes):
2366 base
= base
.decode('utf-8')
2367 if not isinstance(base
, compat_str
) or not re
.match(
2368 r
'^(?:https?:)?//', base
):
2370 return compat_urlparse
.urljoin(base
, path
)
2373 class HEADRequest(compat_urllib_request
.Request
):
2374 def get_method(self
):
2378 class PUTRequest(compat_urllib_request
.Request
):
2379 def get_method(self
):
2383 def int_or_none(v
, scale
=1, default
=None, get_attr
=None, invscale
=1):
2386 v
= getattr(v
, get_attr
, None)
2392 return int(v
) * invscale
// scale
2393 except (ValueError, TypeError, OverflowError):
def str_or_none(v, default=None):
    """Coerce `v` to a text string, or return `default` when `v` is None."""
    if v is None:
        return default
    return compat_str(v)
2401 def str_to_int(int_str
):
2402 """ A more relaxed version of int_or_none """
2403 if isinstance(int_str
, compat_integer_types
):
2405 elif isinstance(int_str
, compat_str
):
2406 int_str
= re
.sub(r
'[,\.\+]', '', int_str
)
2407 return int_or_none(int_str
)
2410 def float_or_none(v
, scale
=1, invscale
=1, default
=None):
2414 return float(v
) * invscale
/ scale
2415 except (ValueError, TypeError):
def bool_or_none(v, default=None):
    """Return `v` only when it is an actual bool; anything else yields `default`."""
    if isinstance(v, bool):
        return v
    return default
def strip_or_none(v, default=None):
    """Return `v.strip()` for string inputs; `default` for any other type."""
    if not isinstance(v, compat_str):
        return default
    return v.strip()
2427 def url_or_none(url
):
2428 if not url
or not isinstance(url
, compat_str
):
2431 return url
if re
.match(r
'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url
) else None
2434 def strftime_or_none(timestamp
, date_format
, default
=None):
2435 datetime_object
= None
2437 if isinstance(timestamp
, compat_numeric_types
): # unix timestamp
2438 datetime_object
= datetime
.datetime
.utcfromtimestamp(timestamp
)
2439 elif isinstance(timestamp
, compat_str
): # assume YYYYMMDD
2440 datetime_object
= datetime
.datetime
.strptime(timestamp
, '%Y%m%d')
2441 return datetime_object
.strftime(date_format
)
2442 except (ValueError, TypeError, AttributeError):
2446 def parse_duration(s
):
2447 if not isinstance(s
, compat_basestring
):
2453 days
, hours
, mins
, secs
, ms
= [None] * 5
2454 m
= re
.match(r
'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s
)
2456 days
, hours
, mins
, secs
, ms
= m
.groups()
2461 [0-9]+\s*y(?:ears?)?\s*
2464 [0-9]+\s*m(?:onths?)?\s*
2467 [0-9]+\s*w(?:eeks?)?\s*
2470 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
2474 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
2477 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
2480 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2483 days
, hours
, mins
, secs
, ms
= m
.groups()
2485 m
= re
.match(r
'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s
)
2487 hours
, mins
= m
.groups()
2493 duration
+= float(secs
)
2495 duration
+= float(mins
) * 60
2497 duration
+= float(hours
) * 60 * 60
2499 duration
+= float(days
) * 24 * 60 * 60
2501 duration
+= float(ms
)
2505 def prepend_extension(filename
, ext
, expected_real_ext
=None):
2506 name
, real_ext
= os
.path
.splitext(filename
)
2508 '{0}.{1}{2}'.format(name
, ext
, real_ext
)
2509 if not expected_real_ext
or real_ext
[1:] == expected_real_ext
2510 else '{0}.{1}'.format(filename
, ext
))
2513 def replace_extension(filename
, ext
, expected_real_ext
=None):
2514 name
, real_ext
= os
.path
.splitext(filename
)
2515 return '{0}.{1}'.format(
2516 name
if not expected_real_ext
or real_ext
[1:] == expected_real_ext
else filename
,
2520 def check_executable(exe
, args
=[]):
2521 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2522 args can be a list of arguments for a short output (like -version) """
2524 Popen([exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
).communicate_or_kill()
2530 def _get_exe_version_output(exe
, args
):
2532 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2533 # SIGTTOU if yt-dlp is run in the background.
2534 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2536 [encodeArgument(exe
)] + args
, stdin
=subprocess
.PIPE
,
2537 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
).communicate_or_kill()
2540 if isinstance(out
, bytes): # Python 2.x
2541 out
= out
.decode('ascii', 'ignore')
2545 def detect_exe_version(output
, version_re
=None, unrecognized
='present'):
2546 assert isinstance(output
, compat_str
)
2547 if version_re
is None:
2548 version_re
= r
'version\s+([-0-9._a-zA-Z]+)'
2549 m
= re
.search(version_re
, output
)
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """Return the version of the specified executable,
    or False if the executable is not present.

    NOTE(review): the mutable default for `args` is kept as-is to leave
    the public interface byte-identical; the argument is never mutated.
    """
    output = _get_exe_version_output(exe, args)
    if not output:
        return False
    return detect_exe_version(output, version_re, unrecognized)
2564 class LazyList(collections
.abc
.Sequence
):
2565 ''' Lazy immutable list from an iterable
2566 Note that slices of a LazyList are lists and not LazyList'''
2568 class IndexError(IndexError):
2571 def __init__(self
, iterable
, *, reverse
=False, _cache
=None):
2572 self
.__iterable
= iter(iterable
)
2573 self
.__cache
= [] if _cache
is None else _cache
2574 self
.__reversed
= reverse
2578 # We need to consume the entire iterable to iterate in reverse
2579 yield from self
.exhaust()
2581 yield from self
.__cache
2582 for item
in self
.__iterable
:
2583 self
.__cache
.append(item
)
2586 def __exhaust(self
):
2587 self
.__cache
.extend(self
.__iterable
)
2588 # Discard the emptied iterable to make it pickle-able
2589 self
.__iterable
= []
2593 ''' Evaluate the entire iterable '''
2594 return self
.__exhaust
()[::-1 if self
.__reversed
else 1]
2597 def __reverse_index(x
):
2598 return None if x
is None else -(x
+ 1)
2600 def __getitem__(self
, idx
):
2601 if isinstance(idx
, slice):
2603 idx
= slice(self
.__reverse
_index
(idx
.start
), self
.__reverse
_index
(idx
.stop
), -(idx
.step
or 1))
2604 start
, stop
, step
= idx
.start
, idx
.stop
, idx
.step
or 1
2605 elif isinstance(idx
, int):
2607 idx
= self
.__reverse
_index
(idx
)
2608 start
, stop
, step
= idx
, idx
, 0
2610 raise TypeError('indices must be integers or slices')
2611 if ((start
or 0) < 0 or (stop
or 0) < 0
2612 or (start
is None and step
< 0)
2613 or (stop
is None and step
> 0)):
2614 # We need to consume the entire iterable to be able to slice from the end
2615 # Obviously, never use this with infinite iterables
2618 return self
.__cache
[idx
]
2619 except IndexError as e
:
2620 raise self
.IndexError(e
) from e
2621 n
= max(start
or 0, stop
or 0) - len(self
.__cache
) + 1
2623 self
.__cache
.extend(itertools
.islice(self
.__iterable
, n
))
2625 return self
.__cache
[idx
]
2626 except IndexError as e
:
2627 raise self
.IndexError(e
) from e
2631 self
[-1] if self
.__reversed
else self
[0]
2632 except self
.IndexError:
2638 return len(self
.__cache
)
2640 def __reversed__(self
):
2641 return type(self
)(self
.__iterable
, reverse
=not self
.__reversed
, _cache
=self
.__cache
)
2644 return type(self
)(self
.__iterable
, reverse
=self
.__reversed
, _cache
=self
.__cache
)
2647 # repr and str should mimic a list. So we exhaust the iterable
2648 return repr(self
.exhaust())
2651 return repr(self
.exhaust())
2656 class IndexError(IndexError):
2660 # This is only useful for tests
2661 return len(self
.getslice())
2663 def __init__(self
, pagefunc
, pagesize
, use_cache
=True):
2664 self
._pagefunc
= pagefunc
2665 self
._pagesize
= pagesize
2666 self
._use
_cache
= use_cache
2669 def getpage(self
, pagenum
):
2670 page_results
= self
._cache
.get(pagenum
)
2671 if page_results
is None:
2672 page_results
= list(self
._pagefunc
(pagenum
))
2674 self
._cache
[pagenum
] = page_results
2677 def getslice(self
, start
=0, end
=None):
2678 return list(self
._getslice
(start
, end
))
2680 def _getslice(self
, start
, end
):
2681 raise NotImplementedError('This method must be implemented by subclasses')
2683 def __getitem__(self
, idx
):
2684 # NOTE: cache must be enabled if this is used
2685 if not isinstance(idx
, int) or idx
< 0:
2686 raise TypeError('indices must be non-negative integers')
2687 entries
= self
.getslice(idx
, idx
+ 1)
2689 raise self
.IndexError()
2693 class OnDemandPagedList(PagedList
):
2694 def _getslice(self
, start
, end
):
2695 for pagenum
in itertools
.count(start
// self
._pagesize
):
2696 firstid
= pagenum
* self
._pagesize
2697 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
2698 if start
>= nextfirstid
:
2702 start
% self
._pagesize
2703 if firstid
<= start
< nextfirstid
2706 ((end
- 1) % self
._pagesize
) + 1
2707 if (end
is not None and firstid
<= end
<= nextfirstid
)
2710 page_results
= self
.getpage(pagenum
)
2711 if startv
!= 0 or endv
is not None:
2712 page_results
= page_results
[startv
:endv
]
2713 yield from page_results
2715 # A little optimization - if current page is not "full", ie. does
2716 # not contain page_size videos then we can assume that this page
2717 # is the last one - there are no more ids on further pages -
2718 # i.e. no need to query again.
2719 if len(page_results
) + startv
< self
._pagesize
:
2722 # If we got the whole page, but the next page is not interesting,
2723 # break out early as well
2724 if end
== nextfirstid
:
2728 class InAdvancePagedList(PagedList
):
2729 def __init__(self
, pagefunc
, pagecount
, pagesize
):
2730 self
._pagecount
= pagecount
2731 PagedList
.__init
__(self
, pagefunc
, pagesize
, True)
2733 def _getslice(self
, start
, end
):
2734 start_page
= start
// self
._pagesize
2736 self
._pagecount
if end
is None else (end
// self
._pagesize
+ 1))
2737 skip_elems
= start
- start_page
* self
._pagesize
2738 only_more
= None if end
is None else end
- start
2739 for pagenum
in range(start_page
, end_page
):
2740 page_results
= self
.getpage(pagenum
)
2742 page_results
= page_results
[skip_elems
:]
2744 if only_more
is not None:
2745 if len(page_results
) < only_more
:
2746 only_more
-= len(page_results
)
2748 yield from page_results
[:only_more
]
2750 yield from page_results
2753 def uppercase_escape(s
):
2754 unicode_escape
= codecs
.getdecoder('unicode_escape')
2756 r
'\\U[0-9a-fA-F]{8}',
2757 lambda m
: unicode_escape(m
.group(0))[0],
2761 def lowercase_escape(s
):
2762 unicode_escape
= codecs
.getdecoder('unicode_escape')
2764 r
'\\u[0-9a-fA-F]{4}',
2765 lambda m
: unicode_escape(m
.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986."""
    # On Python 2, percent-quoting needs a byte string input.
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
2776 def escape_url(url
):
2777 """Escape URL as suggested by RFC 3986"""
2778 url_parsed
= compat_urllib_parse_urlparse(url
)
2779 return url_parsed
._replace
(
2780 netloc
=url_parsed
.netloc
.encode('idna').decode('ascii'),
2781 path
=escape_rfc3986(url_parsed
.path
),
2782 params
=escape_rfc3986(url_parsed
.params
),
2783 query
=escape_rfc3986(url_parsed
.query
),
2784 fragment
=escape_rfc3986(url_parsed
.fragment
)
2789 return compat_parse_qs(compat_urllib_parse_urlparse(url
).query
)
2792 def read_batch_urls(batch_fd
):
2794 if not isinstance(url
, compat_str
):
2795 url
= url
.decode('utf-8', 'replace')
2796 BOM_UTF8
= ('\xef\xbb\xbf', '\ufeff')
2797 for bom
in BOM_UTF8
:
2798 if url
.startswith(bom
):
2799 url
= url
[len(bom
):]
2801 if not url
or url
.startswith(('#', ';', ']')):
2803 # "#" cannot be stripped out since it is part of the URI
2804 # However, it can be safely stipped out if follwing a whitespace
2805 return re
.split(r
'\s#', url
, 1)[0].rstrip()
2807 with contextlib
.closing(batch_fd
) as fd
:
2808 return [url
for url
in map(fixup
, fd
) if url
]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given query and return it as ASCII bytes,
    suitable for use as an HTTP POST body."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2815 def update_url_query(url
, query
):
2818 parsed_url
= compat_urlparse
.urlparse(url
)
2819 qs
= compat_parse_qs(parsed_url
.query
)
2821 return compat_urlparse
.urlunparse(parsed_url
._replace
(
2822 query
=compat_urllib_parse_urlencode(qs
, True)))
2825 def update_Request(req
, url
=None, data
=None, headers
={}, query={}
):
2826 req_headers
= req
.headers
.copy()
2827 req_headers
.update(headers
)
2828 req_data
= data
or req
.data
2829 req_url
= update_url_query(url
or req
.get_full_url(), query
)
2830 req_get_method
= req
.get_method()
2831 if req_get_method
== 'HEAD':
2832 req_type
= HEADRequest
2833 elif req_get_method
== 'PUT':
2834 req_type
= PUTRequest
2836 req_type
= compat_urllib_request
.Request
2838 req_url
, data
=req_data
, headers
=req_headers
,
2839 origin_req_host
=req
.origin_req_host
, unverifiable
=req
.unverifiable
)
2840 if hasattr(req
, 'timeout'):
2841 new_req
.timeout
= req
.timeout
2845 def _multipart_encode_impl(data
, boundary
):
2846 content_type
= 'multipart/form-data; boundary=%s' % boundary
2849 for k
, v
in data
.items():
2850 out
+= b
'--' + boundary
.encode('ascii') + b
'\r\n'
2851 if isinstance(k
, compat_str
):
2852 k
= k
.encode('utf-8')
2853 if isinstance(v
, compat_str
):
2854 v
= v
.encode('utf-8')
2855 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2856 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2857 content
= b
'Content-Disposition: form-data; name="' + k
+ b
'"\r\n\r\n' + v
+ b
'\r\n'
2858 if boundary
.encode('ascii') in content
:
2859 raise ValueError('Boundary overlaps with data')
2862 out
+= b
'--' + boundary
.encode('ascii') + b
'--\r\n'
2864 return out
, content_type
2867 def multipart_encode(data
, boundary
=None):
2869 Encode a dict to RFC 7578-compliant form-data
2872 A dict where keys and values can be either Unicode or bytes-like
2875 If specified a Unicode object, it's used as the boundary. Otherwise
2876 a random boundary is generated.
2878 Reference: https://tools.ietf.org/html/rfc7578
2880 has_specified_boundary
= boundary
is not None
2883 if boundary
is None:
2884 boundary
= '---------------' + str(random
.randrange(0x0fffffff, 0xffffffff))
2887 out
, content_type
= _multipart_encode_impl(data
, boundary
)
2890 if has_specified_boundary
:
2894 return out
, content_type
2897 def dict_get(d
, key_or_keys
, default
=None, skip_false_values
=True):
2898 if isinstance(key_or_keys
, (list, tuple)):
2899 for key
in key_or_keys
:
2900 if key
not in d
or d
[key
] is None or skip_false_values
and not d
[key
]:
2904 return d
.get(key_or_keys
, default
)
2907 def try_get(src
, getter
, expected_type
=None):
2908 for get
in variadic(getter
):
2911 except (AttributeError, KeyError, TypeError, IndexError):
2914 if expected_type
is None or isinstance(v
, expected_type
):
2918 def merge_dicts(*dicts
):
2920 for a_dict
in dicts
:
2921 for k
, v
in a_dict
.items():
2925 or (isinstance(v
, compat_str
) and v
2926 and isinstance(merged
[k
], compat_str
)
2927 and not merged
[k
])):
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as a text string, decoding it if it is not one already.

    NOTE(review): the `encoding` default is evaluated once, at function
    definition time — this matches the original behavior and is kept intact.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2945 TV_PARENTAL_GUIDELINES
= {
2955 def parse_age_limit(s
):
2957 return s
if 0 <= s
<= 21 else None
2958 if not isinstance(s
, compat_basestring
):
2960 m
= re
.match(r
'^(?P<age>\d{1,2})\+?$', s
)
2962 return int(m
.group('age'))
2965 return US_RATINGS
[s
]
2966 m
= re
.match(r
'^TV[_-]?(%s)$' % '|'.join(k
[3:] for k
in TV_PARENTAL_GUIDELINES
), s
)
2968 return TV_PARENTAL_GUIDELINES
['TV-' + m
.group(1)]
2972 def strip_jsonp(code
):
2975 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
2976 (?:\s*&&\s*(?P=func_name))?
2977 \s*\(\s*(?P<callback_data>.*)\);?
2978 \s*?(?://[^\n]*)*$''',
2979 r
'\g<callback_data>', code
)
2982 def js_to_json(code
, vars={}):
2983 # vars is a dict of var, val pairs to substitute
2984 COMMENT_RE
= r
'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
2985 SKIP_RE
= r
'\s*(?:{comment})?\s*'.format(comment
=COMMENT_RE
)
2987 (r
'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip
=SKIP_RE
), 16),
2988 (r
'(?s)^(0+[0-7]+){skip}:?$'.format(skip
=SKIP_RE
), 8),
2993 if v
in ('true', 'false', 'null'):
2995 elif v
in ('undefined', 'void 0'):
2997 elif v
.startswith('/*') or v
.startswith('//') or v
.startswith('!') or v
== ',':
3000 if v
[0] in ("'", '"'):
3001 v
= re
.sub(r
'(?s)\\.|"', lambda m
: {
3006 }.get(m
.group(0), m
.group(0)), v
[1:-1])
3008 for regex
, base
in INTEGER_TABLE
:
3009 im
= re
.match(regex
, v
)
3011 i
= int(im
.group(1), base
)
3012 return '"%d":' % i
if v
.endswith(':') else '%d' % i
3019 return re
.sub(r
'''(?sx)
3020 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3021 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3022 {comment}|,(?={skip}[\]}}])|
3023 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3024 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3027 '''.format(comment
=COMMENT_RE
, skip
=SKIP_RE
), fix_kv
, code
)
3030 def qualities(quality_ids
):
3031 """ Get a numeric quality value out of a list of possible values """
3034 return quality_ids
.index(qid
)
3040 POSTPROCESS_WHEN
= {'pre_process', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
3044 'default': '%(title)s [%(id)s].%(ext)s',
3045 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3051 'description': 'description',
3052 'annotation': 'annotations.xml',
3053 'infojson': 'info.json',
3055 'pl_thumbnail': None,
3056 'pl_description': 'description',
3057 'pl_infojson': 'info.json',
3060 # As of [1] format syntax is:
3061 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3062 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3063 STR_FORMAT_RE_TMPL
= r
'''(?x)
3064 (?<!%)(?P<prefix>(?:%%)*)
3066 (?P<has_key>\((?P<key>{0})\))?
3068 (?P<conversion>[#0\-+ ]+)?
3070 (?P<precision>\.\d+)?
3071 (?P<len_mod>[hlL])? # unused in python
3072 {1} # conversion type
3077 STR_FORMAT_TYPES
= 'diouxXeEfFgGcrs'
3080 def limit_length(s
, length
):
3081 """ Add ellipses to overly long strings """
3086 return s
[:length
- len(ELLIPSES
)] + ELLIPSES
def version_tuple(v):
    """Split a version string like '2021.10.10-1' on '.'/'-' into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
3094 def is_outdated_version(version
, limit
, assume_new
=True):
3096 return not assume_new
3098 return version_tuple(version
) < version_tuple(limit
)
3100 return not assume_new
3103 def ytdl_is_updateable():
3104 """ Returns if yt-dlp can be updated with -U """
3106 from .update
import is_non_updateable
3108 return not is_non_updateable()
def args_to_str(args):
    """Get a short string representation for a subprocess command."""
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
3116 def error_to_compat_str(err
):
3118 # On python 2 error byte string must be decoded with proper
3119 # encoding rather than ascii
3120 if sys
.version_info
[0] < 3:
3121 err_str
= err_str
.decode(preferredencoding())
3125 def mimetype2ext(mt
):
3129 mt
, _
, params
= mt
.partition(';')
3134 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3135 # it's the most popular one
3136 'audio/mpeg': 'mp3',
3137 'audio/x-wav': 'wav',
3139 'audio/wave': 'wav',
3142 ext
= FULL_MAP
.get(mt
)
3148 'smptett+xml': 'tt',
3152 'x-mp4-fragmented': 'mp4',
3153 'x-ms-sami': 'sami',
3156 'x-mpegurl': 'm3u8',
3157 'vnd.apple.mpegurl': 'm3u8',
3161 'vnd.ms-sstr+xml': 'ism',
3165 'filmstrip+json': 'fs',
3169 _
, _
, subtype
= mt
.rpartition('/')
3170 ext
= SUBTYPE_MAP
.get(subtype
.lower())
3181 _
, _
, suffix
= subtype
.partition('+')
3182 ext
= SUFFIX_MAP
.get(suffix
)
3186 return subtype
.replace('+', '.')
3189 def ext2mimetype(ext_or_url
):
3192 if '.' not in ext_or_url
:
3193 ext_or_url
= f
'file.{ext_or_url}'
3194 return mimetypes
.guess_type(ext_or_url
)[0]
3197 def parse_codecs(codecs_str
):
3198 # http://tools.ietf.org/html/rfc6381
3201 split_codecs
= list(filter(None, map(
3202 str.strip
, codecs_str
.strip().strip(',').split(','))))
3203 vcodec
, acodec
, tcodec
, hdr
= None, None, None, None
3204 for full_codec
in split_codecs
:
3205 parts
= full_codec
.split('.')
3206 codec
= parts
[0].replace('0', '')
3207 if codec
in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3208 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3210 vcodec
= '.'.join(parts
[:4]) if codec
in ('vp9', 'av1', 'hvc1') else full_codec
3211 if codec
in ('dvh1', 'dvhe'):
3213 elif codec
== 'av1' and len(parts
) > 3 and parts
[3] == '10':
3215 elif full_codec
.replace('0', '').startswith('vp9.2'):
3217 elif codec
in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3220 elif codec
in ('stpp', 'wvtt',):
3224 write_string('WARNING: Unknown codec %s\n' % full_codec
, sys
.stderr
)
3225 if vcodec
or acodec
or tcodec
:
3227 'vcodec': vcodec
or 'none',
3228 'acodec': acodec
or 'none',
3229 'dynamic_range': hdr
,
3230 **({'tcodec': tcodec}
if tcodec
is not None else {}),
3232 elif len(split_codecs
) == 2:
3234 'vcodec': split_codecs
[0],
3235 'acodec': split_codecs
[1],
3240 def urlhandle_detect_ext(url_handle
):
3241 getheader
= url_handle
.headers
.get
3243 cd
= getheader('Content-Disposition')
3245 m
= re
.match(r
'attachment;\s*filename="(?P<filename>[^"]+)"', cd
)
3247 e
= determine_ext(m
.group('filename'), default_ext
=None)
3251 return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Encode bytes `data` as a base64 'data:' URI with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3258 def age_restricted(content_limit
, age_limit
):
3259 """ Returns True iff the content should be blocked """
3261 if age_limit
is None: # No limit set
3263 if content_limit
is None:
3264 return False # Content available for everyone
3265 return age_limit
< content_limit
3268 def is_html(first_bytes
):
3269 """ Detect whether a file contains HTML by examining its first bytes. """
3272 (b
'\xef\xbb\xbf', 'utf-8'),
3273 (b
'\x00\x00\xfe\xff', 'utf-32-be'),
3274 (b
'\xff\xfe\x00\x00', 'utf-32-le'),
3275 (b
'\xff\xfe', 'utf-16-le'),
3276 (b
'\xfe\xff', 'utf-16-be'),
3278 for bom
, enc
in BOMS
:
3279 if first_bytes
.startswith(bom
):
3280 s
= first_bytes
[len(bom
):].decode(enc
, 'replace')
3283 s
= first_bytes
.decode('utf-8', 'replace')
3285 return re
.match(r
'^\s*<', s
)
3288 def determine_protocol(info_dict
):
3289 protocol
= info_dict
.get('protocol')
3290 if protocol
is not None:
3293 url
= sanitize_url(info_dict
['url'])
3294 if url
.startswith('rtmp'):
3296 elif url
.startswith('mms'):
3298 elif url
.startswith('rtsp'):
3301 ext
= determine_ext(url
)
3307 return compat_urllib_parse_urlparse(url
).scheme
3310 def render_table(header_row
, data
, delim
=False, extra_gap
=0, hide_empty
=False):
3311 """ Render a list of rows, each as a list of values.
3312 Text after a \t will be right aligned """
3314 return len(remove_terminal_sequences(string
).replace('\t', ''))
3316 def get_max_lens(table
):
3317 return [max(width(str(v
)) for v
in col
) for col
in zip(*table
)]
3319 def filter_using_list(row
, filterArray
):
3320 return [col
for (take
, col
) in zip(filterArray
, row
) if take
]
3323 max_lens
= get_max_lens(data
)
3324 header_row
= filter_using_list(header_row
, max_lens
)
3325 data
= [filter_using_list(row
, max_lens
) for row
in data
]
3327 table
= [header_row
] + data
3328 max_lens
= get_max_lens(table
)
3331 table
= [header_row
, [delim
* (ml
+ extra_gap
) for ml
in max_lens
]] + data
3332 table
[1][-1] = table
[1][-1][:-extra_gap
] # Remove extra_gap from end of delimiter
3334 for pos
, text
in enumerate(map(str, row
)):
3336 row
[pos
] = text
.replace('\t', ' ' * (max_lens
[pos
] - width(text
))) + ' ' * extra_gap
3338 row
[pos
] = text
+ ' ' * (max_lens
[pos
] - width(text
) + extra_gap
)
3339 ret
= '\n'.join(''.join(row
).rstrip() for row
in table
)
3343 def _match_one(filter_part
, dct
, incomplete
):
3344 # TODO: Generalize code with YoutubeDL._build_format_filter
3345 STRING_OPERATORS
= {
3346 '*=': operator
.contains
,
3347 '^=': lambda attr
, value
: attr
.startswith(value
),
3348 '$=': lambda attr
, value
: attr
.endswith(value
),
3349 '~=': lambda attr
, value
: re
.search(value
, attr
),
3351 COMPARISON_OPERATORS
= {
3353 '<=': operator
.le
, # "<=" must be defined above "<"
3360 operator_rex
= re
.compile(r
'''(?x)\s*
3362 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3364 (?P<quote>["\'])(?P
<quotedstrval
>.+?
)(?P
=quote
)|
3368 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3369 m = operator_rex.search(filter_part)
3372 unnegated_op = COMPARISON_OPERATORS[m['op']]
3374 op = lambda attr, value: not unnegated_op(attr, value)
3377 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3379 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3380 actual_value = dct.get(m['key'])
3381 numeric_comparison = None
3382 if isinstance(actual_value, compat_numeric_types):
3383 # If the original field is a string and matching comparisonvalue is
3384 # a number we should respect the origin of the original field
3385 # and process comparison value as a string (see
3386 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3388 numeric_comparison = int(comparison_value)
3390 numeric_comparison = parse_filesize(comparison_value)
3391 if numeric_comparison is None:
3392 numeric_comparison = parse_filesize(f'{comparison_value}B')
3393 if numeric_comparison is None:
3394 numeric_comparison = parse_duration(comparison_value)
3395 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3396 raise ValueError('Operator %s only supports string values!' % m['op'])
3397 if actual_value is None:
3398 return incomplete or m['none_inclusive']
3399 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3402 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3403 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3405 operator_rex = re.compile(r'''(?x
)\s
*
3406 (?P
<op
>%s)\s
*(?P
<key
>[a
-z_
]+)
3408 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3409 m = operator_rex.search(filter_part)
3411 op = UNARY_OPERATORS[m.group('op')]
3412 actual_value = dct.get(m.group('key'))
3413 if incomplete and actual_value is None:
3415 return op(actual_value)
3417 raise ValueError('Invalid filter part %r' % filter_part)
3420 def match_str(filter_str, dct, incomplete=False):
3421 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
3422 When incomplete, all conditions passes on missing fields
3425 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3426 for filter_part in re.split(r'(?<!\\)&', filter_str))
3429 def match_filter_func(filter_str):
3430 def _match_func(info_dict, *args, **kwargs):
3431 if match_str(filter_str, info_dict, *args, **kwargs):
3434 video_title = info_dict.get('title', info_dict.get('id', 'video'))
3435 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
3439 def parse_dfxp_time_expr(time_expr):
3443 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3445 return float(mobj.group('time_offset'))
3447 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3449 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format `seconds` as an SRT timecode (HH:MM:SS,mmm)."""
    timetuple = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % timetuple
def ass_subtitles_timecode(seconds):
    """Format `seconds` as an ASS/SSA timecode (H:MM:SS.cc)."""
    tt = timetuple_from_msec(seconds * 1000)
    # ASS uses centiseconds in the last field.
    centiseconds = tt.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*tt[:-1], centiseconds)
3461 def dfxp2srt(dfxp_data):
3463 @param dfxp_data A
bytes-like
object containing DFXP data
3464 @returns A
unicode object containing converted SRT data
3466 LEGACY_NAMESPACES = (
3467 (b'http://www.w3.org/ns/ttml', [
3468 b'http://www.w3.org/2004/11/ttaf1',
3469 b'http://www.w3.org/2006/04/ttaf1',
3470 b'http://www.w3.org/2006/10/ttaf1',
3472 (b'http://www.w3.org/ns/ttml#styling', [
3473 b'http://www.w3.org/ns/ttml#style',
3477 SUPPORTED_STYLING = [
3486 _x = functools.partial(xpath_with_ns, ns_map={
3487 'xml': 'http://www.w3.org/XML/1998/namespace',
3488 'ttml': 'http://www.w3.org/ns/ttml',
3489 'tts': 'http://www.w3.org/ns/ttml#styling',
3495 class TTMLPElementParser(object):
3497 _unclosed_elements = []
3498 _applied_styles = []
3500 def start(self, tag, attrib):
3501 if tag in (_x('ttml:br'), 'br'):
3504 unclosed_elements = []
3506 element_style_id = attrib.get('style')
3508 style.update(default_style)
3509 if element_style_id:
3510 style.update(styles.get(element_style_id, {}))
3511 for prop in SUPPORTED_STYLING:
3512 prop_val = attrib.get(_x('tts:' + prop))
3514 style[prop] = prop_val
3517 for k, v in sorted(style.items()):
3518 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3521 font += ' color="%s"' % v
3522 elif k == 'fontSize':
3523 font += ' size="%s"' % v
3524 elif k == 'fontFamily':
3525 font += ' face="%s"' % v
3526 elif k == 'fontWeight' and v == 'bold':
3528 unclosed_elements.append('b')
3529 elif k == 'fontStyle' and v == 'italic':
3531 unclosed_elements.append('i')
3532 elif k == 'textDecoration' and v == 'underline':
3534 unclosed_elements.append('u')
3536 self._out += '<font' + font + '>'
3537 unclosed_elements.append('font')
3539 if self._applied_styles:
3540 applied_style.update(self._applied_styles[-1])
3541 applied_style.update(style)
3542 self._applied_styles.append(applied_style)
3543 self._unclosed_elements.append(unclosed_elements)
3546 if tag not in (_x('ttml:br'), 'br'):
3547 unclosed_elements = self._unclosed_elements.pop()
3548 for element in reversed(unclosed_elements):
3549 self._out += '</%s>' % element
3550 if unclosed_elements and self._applied_styles:
3551 self._applied_styles.pop()
3553 def data(self, data):
3557 return self._out.strip()
3559 def parse_node(node):
3560 target = TTMLPElementParser()
3561 parser = xml.etree.ElementTree.XMLParser(target=target)
3562 parser.feed(xml.etree.ElementTree.tostring(node))
3563 return parser.close()
3565 for k, v in LEGACY_NAMESPACES:
3567 dfxp_data = dfxp_data.replace(ns, k)
3569 dfxp = compat_etree_fromstring(dfxp_data)
3571 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3574 raise ValueError('Invalid dfxp/TTML subtitle')
3578 for style in dfxp.findall(_x('.//ttml:style')):
3579 style_id = style.get('id') or style.get(_x('xml:id'))
3582 parent_style_id = style.get('style')
3584 if parent_style_id not in styles:
3587 styles[style_id] = styles[parent_style_id].copy()
3588 for prop in SUPPORTED_STYLING:
3589 prop_val = style.get(_x('tts:' + prop))
3591 styles.setdefault(style_id, {})[prop] = prop_val
3597 for p in ('body', 'div'):
3598 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3601 style = styles.get(ele.get('style'))
3604 default_style.update(style)
3606 for para, index in zip(paras, itertools.count(1)):
3607 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3608 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3609 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3610 if begin_time is None:
3615 end_time = begin_time + dur
3616 out.append('%d\n%s --> %s\n%s\n\n' % (
3618 srt_subtitles_timecode(begin_time),
3619 srt_subtitles_timecode(end_time),
3625 def cli_option(params, command_option, param):
3626 param = params.get(param)
3628 param = compat_str(param)
3629 return [command_option, param] if param is not None else []
3632 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3633 param = params.get(param)
3636 assert isinstance(param, bool)
3638 return [command_option + separator + (true_value if param else false_value)]
3639 return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value,
    otherwise an empty list."""
    value = params.get(param)
    if value == expected_value:
        return [command_option]
    return []
3647 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3648 if isinstance(argdict, (list, tuple)): # for backward compatibility
3655 assert isinstance(argdict, dict)
3657 assert isinstance(keys, (list, tuple))
3658 for key_list in keys:
3659 arg_list = list(filter(
3660 lambda x: x is not None,
3661 [argdict.get(key.lower()) for key in variadic(key_list)]))
3663 return [arg for args in arg_list for arg in args]
3667 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3668 main_key, exe = main_key.lower(), exe.lower()
3669 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3670 keys = [f'{root_key}{k}' for k in (keys or [''])]
3671 if root_key in keys:
3673 keys.append((main_key, exe))
3674 keys.append('default')
3677 return cli_configuration_args(argdict, keys, default, use_compat)
3680 class ISO639Utils(object):
3681 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3740 'iw': 'heb', # Replaced by he in 1989 revision
3750 'in': 'ind', # Replaced by id in 1989 revision
3865 'ji': 'yid', # Replaced by yi in 1989 revision
3873 def short2long(cls, code):
3874 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3875 return cls._lang_map.get(code[:2])
3878 def long2short(cls, code):
3879 """Convert language code from ISO 639-2/T to ISO 639-1"""
3880 for short_name, long_name in cls._lang_map.items():
3881 if long_name == code:
3885 class ISO3166Utils(object):
3886 # From http://data.okfn.org/data/core/country-list
3888 'AF': 'Afghanistan',
3889 'AX': 'Åland Islands',
3892 'AS': 'American Samoa',
3897 'AG': 'Antigua and Barbuda',
3914 'BO': 'Bolivia, Plurinational State of',
3915 'BQ': 'Bonaire, Sint Eustatius and Saba',
3916 'BA': 'Bosnia and Herzegovina',
3918 'BV': 'Bouvet Island',
3920 'IO': 'British Indian Ocean Territory',
3921 'BN': 'Brunei Darussalam',
3923 'BF': 'Burkina Faso',
3929 'KY': 'Cayman Islands',
3930 'CF': 'Central African Republic',
3934 'CX': 'Christmas Island',
3935 'CC': 'Cocos (Keeling) Islands',
3939 'CD': 'Congo, the Democratic Republic of the',
3940 'CK': 'Cook Islands',
3942 'CI': 'Côte d\'Ivoire',
3947 'CZ': 'Czech Republic',
3951 'DO': 'Dominican Republic',
3954 'SV': 'El Salvador',
3955 'GQ': 'Equatorial Guinea',
3959 'FK': 'Falkland Islands (Malvinas)',
3960 'FO': 'Faroe Islands',
3964 'GF': 'French Guiana',
3965 'PF': 'French Polynesia',
3966 'TF': 'French Southern Territories',
3981 'GW': 'Guinea-Bissau',
3984 'HM': 'Heard Island and McDonald Islands',
3985 'VA': 'Holy See (Vatican City State)',
3992 'IR': 'Iran, Islamic Republic of',
3995 'IM': 'Isle of Man',
4005 'KP': 'Korea, Democratic People\'s Republic of',
4006 'KR': 'Korea, Republic of',
4009 'LA': 'Lao People\'s Democratic Republic',
4015 'LI': 'Liechtenstein',
4019 'MK': 'Macedonia, the Former Yugoslav Republic of',
4026 'MH': 'Marshall Islands',
4032 'FM': 'Micronesia, Federated States of',
4033 'MD': 'Moldova, Republic of',
4044 'NL': 'Netherlands',
4045 'NC': 'New Caledonia',
4046 'NZ': 'New Zealand',
4051 'NF': 'Norfolk Island',
4052 'MP': 'Northern Mariana Islands',
4057 'PS': 'Palestine, State of',
4059 'PG': 'Papua New Guinea',
4062 'PH': 'Philippines',
4066 'PR': 'Puerto Rico',
4070 'RU': 'Russian Federation',
4072 'BL': 'Saint Barthélemy',
4073 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4074 'KN': 'Saint Kitts and Nevis',
4075 'LC': 'Saint Lucia',
4076 'MF': 'Saint Martin (French part)',
4077 'PM': 'Saint Pierre and Miquelon',
4078 'VC': 'Saint Vincent and the Grenadines',
4081 'ST': 'Sao Tome and Principe',
4082 'SA': 'Saudi Arabia',
4086 'SL': 'Sierra Leone',
4088 'SX': 'Sint Maarten (Dutch part)',
4091 'SB': 'Solomon Islands',
4093 'ZA': 'South Africa',
4094 'GS': 'South Georgia and the South Sandwich Islands',
4095 'SS': 'South Sudan',
4100 'SJ': 'Svalbard and Jan Mayen',
4103 'CH': 'Switzerland',
4104 'SY': 'Syrian Arab Republic',
4105 'TW': 'Taiwan, Province of China',
4107 'TZ': 'Tanzania, United Republic of',
4109 'TL': 'Timor-Leste',
4113 'TT': 'Trinidad and Tobago',
4116 'TM': 'Turkmenistan',
4117 'TC': 'Turks and Caicos Islands',
4121 'AE': 'United Arab Emirates',
4122 'GB': 'United Kingdom',
4123 'US': 'United States',
4124 'UM': 'United States Minor Outlying Islands',
4128 'VE': 'Venezuela, Bolivarian Republic of',
4130 'VG': 'Virgin Islands, British',
4131 'VI': 'Virgin Islands, U.S.',
4132 'WF': 'Wallis and Futuna',
4133 'EH': 'Western Sahara',
4140 def short2full(cls, code):
4141 """Convert an ISO 3166-2 country code to the corresponding full name"""
4142 return cls._country_map.get(code.upper())
4145 class GeoUtils(object):
4146 # Major IPv4 address blocks per country
4148 'AD': '46.172.224.0/19',
4149 'AE': '94.200.0.0/13',
4150 'AF': '149.54.0.0/17',
4151 'AG': '209.59.64.0/18',
4152 'AI': '204.14.248.0/21',
4153 'AL': '46.99.0.0/16',
4154 'AM': '46.70.0.0/15',
4155 'AO': '105.168.0.0/13',
4156 'AP': '182.50.184.0/21',
4157 'AQ': '23.154.160.0/24',
4158 'AR': '181.0.0.0/12',
4159 'AS': '202.70.112.0/20',
4160 'AT': '77.116.0.0/14',
4161 'AU': '1.128.0.0/11',
4162 'AW': '181.41.0.0/18',
4163 'AX': '185.217.4.0/22',
4164 'AZ': '5.197.0.0/16',
4165 'BA': '31.176.128.0/17',
4166 'BB': '65.48.128.0/17',
4167 'BD': '114.130.0.0/16',
4169 'BF': '102.178.0.0/15',
4170 'BG': '95.42.0.0/15',
4171 'BH': '37.131.0.0/17',
4172 'BI': '154.117.192.0/18',
4173 'BJ': '137.255.0.0/16',
4174 'BL': '185.212.72.0/23',
4175 'BM': '196.12.64.0/18',
4176 'BN': '156.31.0.0/16',
4177 'BO': '161.56.0.0/16',
4178 'BQ': '161.0.80.0/20',
4179 'BR': '191.128.0.0/12',
4180 'BS': '24.51.64.0/18',
4181 'BT': '119.2.96.0/19',
4182 'BW': '168.167.0.0/16',
4183 'BY': '178.120.0.0/13',
4184 'BZ': '179.42.192.0/18',
4185 'CA': '99.224.0.0/11',
4186 'CD': '41.243.0.0/16',
4187 'CF': '197.242.176.0/21',
4188 'CG': '160.113.0.0/16',
4189 'CH': '85.0.0.0/13',
4190 'CI': '102.136.0.0/14',
4191 'CK': '202.65.32.0/19',
4192 'CL': '152.172.0.0/14',
4193 'CM': '102.244.0.0/14',
4194 'CN': '36.128.0.0/10',
4195 'CO': '181.240.0.0/12',
4196 'CR': '201.192.0.0/12',
4197 'CU': '152.206.0.0/15',
4198 'CV': '165.90.96.0/19',
4199 'CW': '190.88.128.0/17',
4200 'CY': '31.153.0.0/16',
4201 'CZ': '88.100.0.0/14',
4203 'DJ': '197.241.0.0/17',
4204 'DK': '87.48.0.0/12',
4205 'DM': '192.243.48.0/20',
4206 'DO': '152.166.0.0/15',
4207 'DZ': '41.96.0.0/12',
4208 'EC': '186.68.0.0/15',
4209 'EE': '90.190.0.0/15',
4210 'EG': '156.160.0.0/11',
4211 'ER': '196.200.96.0/20',
4212 'ES': '88.0.0.0/11',
4213 'ET': '196.188.0.0/14',
4214 'EU': '2.16.0.0/13',
4215 'FI': '91.152.0.0/13',
4216 'FJ': '144.120.0.0/16',
4217 'FK': '80.73.208.0/21',
4218 'FM': '119.252.112.0/20',
4219 'FO': '88.85.32.0/19',
4221 'GA': '41.158.0.0/15',
4223 'GD': '74.122.88.0/21',
4224 'GE': '31.146.0.0/16',
4225 'GF': '161.22.64.0/18',
4226 'GG': '62.68.160.0/19',
4227 'GH': '154.160.0.0/12',
4228 'GI': '95.164.0.0/16',
4229 'GL': '88.83.0.0/19',
4230 'GM': '160.182.0.0/15',
4231 'GN': '197.149.192.0/18',
4232 'GP': '104.250.0.0/19',
4233 'GQ': '105.235.224.0/20',
4234 'GR': '94.64.0.0/13',
4235 'GT': '168.234.0.0/16',
4236 'GU': '168.123.0.0/16',
4237 'GW': '197.214.80.0/20',
4238 'GY': '181.41.64.0/18',
4239 'HK': '113.252.0.0/14',
4240 'HN': '181.210.0.0/16',
4241 'HR': '93.136.0.0/13',
4242 'HT': '148.102.128.0/17',
4243 'HU': '84.0.0.0/14',
4244 'ID': '39.192.0.0/10',
4245 'IE': '87.32.0.0/12',
4246 'IL': '79.176.0.0/13',
4247 'IM': '5.62.80.0/20',
4248 'IN': '117.192.0.0/10',
4249 'IO': '203.83.48.0/21',
4250 'IQ': '37.236.0.0/14',
4251 'IR': '2.176.0.0/12',
4252 'IS': '82.221.0.0/16',
4253 'IT': '79.0.0.0/10',
4254 'JE': '87.244.64.0/18',
4255 'JM': '72.27.0.0/17',
4256 'JO': '176.29.0.0/16',
4257 'JP': '133.0.0.0/8',
4258 'KE': '105.48.0.0/12',
4259 'KG': '158.181.128.0/17',
4260 'KH': '36.37.128.0/17',
4261 'KI': '103.25.140.0/22',
4262 'KM': '197.255.224.0/20',
4263 'KN': '198.167.192.0/19',
4264 'KP': '175.45.176.0/22',
4265 'KR': '175.192.0.0/10',
4266 'KW': '37.36.0.0/14',
4267 'KY': '64.96.0.0/15',
4268 'KZ': '2.72.0.0/13',
4269 'LA': '115.84.64.0/18',
4270 'LB': '178.135.0.0/16',
4271 'LC': '24.92.144.0/20',
4272 'LI': '82.117.0.0/19',
4273 'LK': '112.134.0.0/15',
4274 'LR': '102.183.0.0/16',
4275 'LS': '129.232.0.0/17',
4276 'LT': '78.56.0.0/13',
4277 'LU': '188.42.0.0/16',
4278 'LV': '46.109.0.0/16',
4279 'LY': '41.252.0.0/14',
4280 'MA': '105.128.0.0/11',
4281 'MC': '88.209.64.0/18',
4282 'MD': '37.246.0.0/16',
4283 'ME': '178.175.0.0/17',
4284 'MF': '74.112.232.0/21',
4285 'MG': '154.126.0.0/17',
4286 'MH': '117.103.88.0/21',
4287 'MK': '77.28.0.0/15',
4288 'ML': '154.118.128.0/18',
4289 'MM': '37.111.0.0/17',
4290 'MN': '49.0.128.0/17',
4291 'MO': '60.246.0.0/16',
4292 'MP': '202.88.64.0/20',
4293 'MQ': '109.203.224.0/19',
4294 'MR': '41.188.64.0/18',
4295 'MS': '208.90.112.0/22',
4296 'MT': '46.11.0.0/16',
4297 'MU': '105.16.0.0/12',
4298 'MV': '27.114.128.0/18',
4299 'MW': '102.70.0.0/15',
4300 'MX': '187.192.0.0/11',
4301 'MY': '175.136.0.0/13',
4302 'MZ': '197.218.0.0/15',
4303 'NA': '41.182.0.0/16',
4304 'NC': '101.101.0.0/18',
4305 'NE': '197.214.0.0/18',
4306 'NF': '203.17.240.0/22',
4307 'NG': '105.112.0.0/12',
4308 'NI': '186.76.0.0/15',
4309 'NL': '145.96.0.0/11',
4310 'NO': '84.208.0.0/13',
4311 'NP': '36.252.0.0/15',
4312 'NR': '203.98.224.0/19',
4313 'NU': '49.156.48.0/22',
4314 'NZ': '49.224.0.0/14',
4315 'OM': '5.36.0.0/15',
4316 'PA': '186.72.0.0/15',
4317 'PE': '186.160.0.0/14',
4318 'PF': '123.50.64.0/18',
4319 'PG': '124.240.192.0/19',
4320 'PH': '49.144.0.0/13',
4321 'PK': '39.32.0.0/11',
4322 'PL': '83.0.0.0/11',
4323 'PM': '70.36.0.0/20',
4324 'PR': '66.50.0.0/16',
4325 'PS': '188.161.0.0/16',
4326 'PT': '85.240.0.0/13',
4327 'PW': '202.124.224.0/20',
4328 'PY': '181.120.0.0/14',
4329 'QA': '37.210.0.0/15',
4330 'RE': '102.35.0.0/16',
4331 'RO': '79.112.0.0/13',
4332 'RS': '93.86.0.0/15',
4333 'RU': '5.136.0.0/13',
4334 'RW': '41.186.0.0/16',
4335 'SA': '188.48.0.0/13',
4336 'SB': '202.1.160.0/19',
4337 'SC': '154.192.0.0/11',
4338 'SD': '102.120.0.0/13',
4339 'SE': '78.64.0.0/12',
4340 'SG': '8.128.0.0/10',
4341 'SI': '188.196.0.0/14',
4342 'SK': '78.98.0.0/15',
4343 'SL': '102.143.0.0/17',
4344 'SM': '89.186.32.0/19',
4345 'SN': '41.82.0.0/15',
4346 'SO': '154.115.192.0/18',
4347 'SR': '186.179.128.0/17',
4348 'SS': '105.235.208.0/21',
4349 'ST': '197.159.160.0/19',
4350 'SV': '168.243.0.0/16',
4351 'SX': '190.102.0.0/20',
4353 'SZ': '41.84.224.0/19',
4354 'TC': '65.255.48.0/20',
4355 'TD': '154.68.128.0/19',
4356 'TG': '196.168.0.0/14',
4357 'TH': '171.96.0.0/13',
4358 'TJ': '85.9.128.0/18',
4359 'TK': '27.96.24.0/21',
4360 'TL': '180.189.160.0/20',
4361 'TM': '95.85.96.0/19',
4362 'TN': '197.0.0.0/11',
4363 'TO': '175.176.144.0/21',
4364 'TR': '78.160.0.0/11',
4365 'TT': '186.44.0.0/15',
4366 'TV': '202.2.96.0/19',
4367 'TW': '120.96.0.0/11',
4368 'TZ': '156.156.0.0/14',
4369 'UA': '37.52.0.0/14',
4370 'UG': '102.80.0.0/13',
4372 'UY': '167.56.0.0/13',
4373 'UZ': '84.54.64.0/18',
4374 'VA': '212.77.0.0/19',
4375 'VC': '207.191.240.0/21',
4376 'VE': '186.88.0.0/13',
4377 'VG': '66.81.192.0/20',
4378 'VI': '146.226.0.0/16',
4379 'VN': '14.160.0.0/11',
4380 'VU': '202.80.32.0/20',
4381 'WF': '117.20.32.0/21',
4382 'WS': '202.4.32.0/19',
4383 'YE': '134.35.0.0/16',
4384 'YT': '41.242.116.0/22',
4385 'ZA': '41.0.0.0/11',
4386 'ZM': '102.144.0.0/13',
4387 'ZW': '102.177.192.0/18',
4391 def random_ipv4(cls, code_or_block):
4392 if len(code_or_block) == 2:
4393 block = cls._country_ip_map.get(code_or_block.upper())
4397 block = code_or_block
4398 addr, preflen = block.split('/')
4399 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4400 addr_max = addr_min | (0xffffffff >> int(preflen))
4401 return compat_str(socket.inet_ntoa(
4402 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4405 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4406 def __init__(self, proxies=None):
4407 # Set default handlers
4408 for type in ('http', 'https'):
4409 setattr(self, '%s_open' % type,
4410 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4411 meth(r, proxy, type))
4412 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4414 def proxy_open(self, req, proxy, type):
4415 req_proxy = req.headers.get('Ytdl-request-proxy')
4416 if req_proxy is not None:
4418 del req.headers['Ytdl-request-proxy']
4420 if proxy == '__noproxy__':
4421 return None # No Proxy
4422 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4423 req.add_header('Ytdl-socks-proxy', proxy)
4424 # yt-dlp's http/https handlers do wrapping the socket with socks
4426 return compat_urllib_request.ProxyHandler.proxy_open(
4427 self, req, proxy, type)
4430 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4431 # released into Public Domain
4432 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    # int.to_bytes does in one C-level call what the PyCrypto-derived code
    # did with 32-bit struct packing plus manual leading-zero stripping.
    # max(1, ...) preserves the original behavior of returning b'\000'
    # for n == 0 instead of an empty string.
    length = max(1, (n.bit_length() + 7) // 8)
    if blocksize > 0 and length % blocksize:
        # Round the length up to the next multiple of blocksize (zero-pad front)
        length += blocksize - length % blocksize
    return n.to_bytes(length, 'big')
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # int.from_bytes replaces the PyCrypto-derived loop that front-padded the
    # string to a multiple of 4 and accumulated 32-bit big-endian words.
    # It also returns 0 for b'' exactly as the original loop did.
    return int.from_bytes(s, 'big')
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The payload is read little-endian, hence the byte reversal before
    # interpreting the hex digits as an integer.
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data        input data
    @param {int} length        target length
    @returns {int[]}           padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # EME-PKCS1-v1_5 (RFC 8017 §7.2.1): the padding string PS must consist of
    # NON-zero octets, since the first zero byte after the 0x00 0x02 prefix
    # marks the start of the message.  randint(0, 254) could emit zero bytes,
    # silently truncating the padding on decryption; use 1..255 instead.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer *num* in base *n*.

    Digits are drawn from *table*; when no table is given, the standard
    0-9, a-z, A-Z alphabet (up to base 62) is used.
    Raises ValueError when the table has fewer than *n* symbols.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4531 def decode_packed_codes(code
):
4532 mobj
= re
.search(PACKED_CODES_RE
, code
)
4533 obfuscated_code
, base
, count
, symbols
= mobj
.groups()
4536 symbols
= symbols
.split('|')
4541 base_n_count
= encode_base_n(count
, base
)
4542 symbol_table
[base_n_count
] = symbols
[count
] or base_n_count
4545 r
'\b(\w+)\b', lambda mobj
: symbol_table
[mobj
.group(0)],
4549 def caesar(s
, alphabet
, shift
):
4554 alphabet
[(alphabet
.index(c
) + shift
) % l
] if c
in alphabet
else c
def rot47(s):
    """Apply the ROT47 cipher: shift the 94 printable ASCII characters by 47."""
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=value,KEY2="quoted"') into a dict.

    Quoted values have their surrounding double quotes stripped; quoting also
    protects embedded commas.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {
        key: val[1:-1] if val.startswith('"') else val
        for key, val in pairs
    }
def urshift(val, n):
    """Unsigned 32-bit right shift, like JavaScript's `>>>` operator."""
    if val >= 0:
        return val >> n
    # Map the negative value onto its 32-bit two's-complement representation
    return (val + 0x100000000) >> n
4575 # Based on png2str() written by @gdkchan and improved by @yokrysty
4576 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4577 def decode_png(png_data
):
4578 # Reference: https://www.w3.org/TR/PNG/
4579 header
= png_data
[8:]
4581 if png_data
[:8] != b
'\x89PNG\x0d\x0a\x1a\x0a' or header
[4:8] != b
'IHDR':
4582 raise IOError('Not a valid PNG file.')
4584 int_map
= {1: '>B', 2: '>H', 4: '>I'}
4585 unpack_integer
= lambda x
: compat_struct_unpack(int_map
[len(x
)], x
)[0]
4590 length
= unpack_integer(header
[:4])
4593 chunk_type
= header
[:4]
4596 chunk_data
= header
[:length
]
4597 header
= header
[length
:]
4599 header
= header
[4:] # Skip CRC
4607 ihdr
= chunks
[0]['data']
4609 width
= unpack_integer(ihdr
[:4])
4610 height
= unpack_integer(ihdr
[4:8])
4614 for chunk
in chunks
:
4615 if chunk
['type'] == b
'IDAT':
4616 idat
+= chunk
['data']
4619 raise IOError('Unable to read PNG data.')
4621 decompressed_data
= bytearray(zlib
.decompress(idat
))
4626 def _get_pixel(idx
):
4631 for y
in range(height
):
4632 basePos
= y
* (1 + stride
)
4633 filter_type
= decompressed_data
[basePos
]
4637 pixels
.append(current_row
)
4639 for x
in range(stride
):
4640 color
= decompressed_data
[1 + basePos
+ x
]
4641 basex
= y
* stride
+ x
4646 left
= _get_pixel(basex
- 3)
4648 up
= _get_pixel(basex
- stride
)
4650 if filter_type
== 1: # Sub
4651 color
= (color
+ left
) & 0xff
4652 elif filter_type
== 2: # Up
4653 color
= (color
+ up
) & 0xff
4654 elif filter_type
== 3: # Average
4655 color
= (color
+ ((left
+ up
) >> 1)) & 0xff
4656 elif filter_type
== 4: # Paeth
4662 c
= _get_pixel(basex
- stride
- 3)
4670 if pa
<= pb
and pa
<= pc
:
4671 color
= (color
+ a
) & 0xff
4673 color
= (color
+ b
) & 0xff
4675 color
= (color
+ c
) & 0xff
4677 current_row
.append(color
)
4679 return width
, height
, pixels
4682 def write_xattr(path
, key
, value
):
4683 # This mess below finds the best xattr tool for the job
4685 # try the pyxattr module...
4688 if hasattr(xattr
, 'set'): # pyxattr
4689 # Unicode arguments are not supported in python-pyxattr until
4691 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4692 pyxattr_required_version
= '0.5.0'
4693 if version_tuple(xattr
.__version
__) < version_tuple(pyxattr_required_version
):
4694 # TODO: fallback to CLI tools
4695 raise XAttrUnavailableError(
4696 'python-pyxattr is detected but is too old. '
4697 'yt-dlp requires %s or above while your version is %s. '
4698 'Falling back to other xattr implementations' % (
4699 pyxattr_required_version
, xattr
.__version
__))
4701 setxattr
= xattr
.set
4703 setxattr
= xattr
.setxattr
4706 setxattr(path
, key
, value
)
4707 except EnvironmentError as e
:
4708 raise XAttrMetadataError(e
.errno
, e
.strerror
)
4711 if compat_os_name
== 'nt':
4712 # Write xattrs to NTFS Alternate Data Streams:
4713 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4714 assert ':' not in key
4715 assert os
.path
.exists(path
)
4717 ads_fn
= path
+ ':' + key
4719 with open(ads_fn
, 'wb') as f
:
4721 except EnvironmentError as e
:
4722 raise XAttrMetadataError(e
.errno
, e
.strerror
)
4724 user_has_setfattr
= check_executable('setfattr', ['--version'])
4725 user_has_xattr
= check_executable('xattr', ['-h'])
4727 if user_has_setfattr
or user_has_xattr
:
4729 value
= value
.decode('utf-8')
4730 if user_has_setfattr
:
4731 executable
= 'setfattr'
4732 opts
= ['-n', key
, '-v', value
]
4733 elif user_has_xattr
:
4734 executable
= 'xattr'
4735 opts
= ['-w', key
, value
]
4737 cmd
= ([encodeFilename(executable
, True)]
4738 + [encodeArgument(o
) for o
in opts
]
4739 + [encodeFilename(path
, True)])
4743 cmd
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
, stdin
=subprocess
.PIPE
)
4744 except EnvironmentError as e
:
4745 raise XAttrMetadataError(e
.errno
, e
.strerror
)
4746 stdout
, stderr
= p
.communicate_or_kill()
4747 stderr
= stderr
.decode('utf-8', 'replace')
4748 if p
.returncode
!= 0:
4749 raise XAttrMetadataError(p
.returncode
, stderr
)
4752 # On Unix, and can't find pyxattr, setfattr, or xattr.
4753 if sys
.platform
.startswith('linux'):
4754 raise XAttrUnavailableError(
4755 "Couldn't find a tool to set the xattrs. "
4756 "Install either the python 'pyxattr' or 'xattr' "
4757 "modules, or the GNU 'attr' package "
4758 "(which contains the 'setfattr' tool).")
4760 raise XAttrUnavailableError(
4761 "Couldn't find a tool to set the xattrs. "
4762 "Install either the python 'xattr' module, "
4763 "or the 'xattr' binary.")
def random_birthday(year_field, month_field, day_field):
    """Generate a random date between 1950-01-01 and 1995-12-31 (inclusive).

    Returns a dict mapping the three given field names to the year, month
    and day as decimal strings.
    """
    first, last = datetime.date(1950, 1, 1), datetime.date(1995, 12, 31)
    span_days = (last - first).days
    birthday = first + datetime.timedelta(days=random.randint(0, span_days))
    return {
        year_field: str(birthday.year),
        month_field: str(birthday.month),
        day_field: str(birthday.day),
    }
4778 # Templates for internet shortcut files, which are plain text files.
4779 DOT_URL_LINK_TEMPLATE
= '''
4784 DOT_WEBLOC_LINK_TEMPLATE
= '''
4785 <?xml version="1.0" encoding="UTF-8"?>
4786 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4787 <plist version="1.0">
4790 \t<string>%(url)s</string>
4795 DOT_DESKTOP_LINK_TEMPLATE
= '''
4805 'url': DOT_URL_LINK_TEMPLATE
,
4806 'desktop': DOT_DESKTOP_LINK_TEMPLATE
,
4807 'webloc': DOT_WEBLOC_LINK_TEMPLATE
,
4811 def iri_to_uri(iri
):
4813 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4815 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4818 iri_parts
= compat_urllib_parse_urlparse(iri
)
4820 if '[' in iri_parts
.netloc
:
4821 raise ValueError('IPv6 URIs are not, yet, supported.')
4822 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4824 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4827 if iri_parts
.username
:
4828 net_location
+= compat_urllib_parse_quote(iri_parts
.username
, safe
=r
"!$%&'()*+,~")
4829 if iri_parts
.password
is not None:
4830 net_location
+= ':' + compat_urllib_parse_quote(iri_parts
.password
, safe
=r
"!$%&'()*+,~")
4833 net_location
+= iri_parts
.hostname
.encode('idna').decode('utf-8') # Punycode for Unicode hostnames.
4834 # The 'idna' encoding produces ASCII text.
4835 if iri_parts
.port
is not None and iri_parts
.port
!= 80:
4836 net_location
+= ':' + str(iri_parts
.port
)
4838 return compat_urllib_parse_urlunparse(
4842 compat_urllib_parse_quote_plus(iri_parts
.path
, safe
=r
"!$%&'()*+,/:;=@|~"),
4844 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4845 compat_urllib_parse_quote_plus(iri_parts
.params
, safe
=r
"!$%&'()*+,/:;=@|~"),
4847 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4848 compat_urllib_parse_quote_plus(iri_parts
.query
, safe
=r
"!$%&'()*+,/:;=?@{|}~"),
4850 compat_urllib_parse_quote_plus(iri_parts
.fragment
, safe
=r
"!#$%&'()*+,/:;=?@{|}~")))
4852 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
def to_high_limit_path(path):
    """On Windows/Cygwin, prefix the absolute path with \\\\?\\ so Win32 APIs
    accept paths longer than MAX_PATH; elsewhere return *path* unchanged.

    Note the length of individual path segments may still be limited.
    """
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # r'\\?\ '.rstrip() is just a way to spell the '\\?\' prefix literally
    return r'\\?\ '.rstrip() + os.path.abspath(path)
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    """Format a value (or a field of *obj*) with *template*.

    Without *field*, *obj* itself is the value (falling back to *default*
    when it is None); with *field*, the value is obj.get(field, default).
    *func* is applied first (skipped for ignored values); values in *ignore*
    yield *default* instead of being formatted.
    """
    if field is None:
        val = obj if obj is not None else default
    else:
        val = obj.get(field, default)
    if func and val not in ignore:
        val = func(val)
    if val in ignore:
        return default
    return template % val
4873 def clean_podcast_url(url
):
4874 return re
.sub(r
'''(?x)
4878 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4881 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4884 cn\.co| # https://podcorn.com/analytics-prefix/
4885 st\.fm # https://podsights.com/docs/
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random RFC 4122 version-4 UUID as a lowercase hyphenated string.

    The previous regex-based implementation filled the variant nibble (the
    'y' position) with any hex digit, but RFC 4122 §4.4 requires it to be
    one of 8, 9, a, b.  uuid.uuid4() produces a fully compliant UUID.
    """
    import uuid  # local import keeps the module's import block unchanged
    return str(uuid.uuid4())
def make_dir(path, to_screen=None):
    """Create the parent directory of *path* (like mkdir -p on its dirname).

    Returns True on success (or when nothing needed creating), False on
    OS errors; on failure the error is reported through *to_screen* when
    a callable is given.
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except (OSError, IOError) as err:
        # BUG FIX: the original tested `callable(to_screen) is not None`,
        # which is always True (callable() returns a bool), so passing
        # to_screen=None crashed with a TypeError on the error path.
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
def get_executable_path():
    """Return the absolute base directory yt-dlp is running from.

    Handles three launch modes: a PyInstaller-frozen binary, a zipped
    distribution, and a normal package checkout.
    """
    from zipimport import zipimporter
    if hasattr(sys, 'frozen'):
        # Running from PyInstaller: the executable's own directory
        base = os.path.dirname(sys.executable)
    elif isinstance(globals().get('__loader__'), zipimporter):
        # Running from a ZIP archive: two levels above this module
        base = os.path.join(os.path.dirname(__file__), '../..')
    else:
        # Regular install/checkout: the package's parent directory
        base = os.path.join(os.path.dirname(__file__), '..')
    return os.path.abspath(base)
4920 def load_plugins(name
, suffix
, namespace
):
4923 plugins_spec
= importlib
.util
.spec_from_file_location(
4924 name
, os
.path
.join(get_executable_path(), 'ytdlp_plugins', name
, '__init__.py'))
4925 plugins
= importlib
.util
.module_from_spec(plugins_spec
)
4926 sys
.modules
[plugins_spec
.name
] = plugins
4927 plugins_spec
.loader
.exec_module(plugins
)
4928 for name
in dir(plugins
):
4929 if name
in namespace
:
4931 if not name
.endswith(suffix
):
4933 klass
= getattr(plugins
, name
)
4934 classes
[name
] = namespace
[name
] = klass
4935 except FileNotFoundError
:
4941 obj
, *path_list
, default
=None, expected_type
=None, get_all
=True,
4942 casesense
=True, is_user_input
=False, traverse_string
=False):
4943 ''' Traverse nested list/dict/tuple
4944 @param path_list A list of paths which are checked one by one.
4945 Each path is a list of keys where each key is a string,
4946 a function, a tuple of strings/None or "...".
4947 When a fuction is given, it takes the key as argument and
4948 returns whether the key matches or not. When a tuple is given,
4949 all the keys given in the tuple are traversed, and
4950 "..." traverses all the keys in the object
4951 "None" returns the object without traversal
4952 @param default Default value to return
4953 @param expected_type Only accept final value of this type (Can also be any callable)
4954 @param get_all Return all the values obtained from a path or only the first one
4955 @param casesense Whether to consider dictionary keys as case sensitive
4956 @param is_user_input Whether the keys are generated from user input. If True,
4957 strings are converted to int/slice if necessary
4958 @param traverse_string Whether to traverse inside strings. If True, any
4959 non-compatible object will also be converted into a string
4963 _lower
= lambda k
: (k
.lower() if isinstance(k
, str) else k
)
4964 path_list
= (map(_lower
, variadic(path
)) for path
in path_list
)
4966 def _traverse_obj(obj
, path
, _current_depth
=0):
4968 path
= tuple(variadic(path
))
4969 for i
, key
in enumerate(path
):
4970 if None in (key
, obj
):
4972 if isinstance(key
, (list, tuple)):
4973 obj
= [_traverse_obj(obj
, sub_key
, _current_depth
) for sub_key
in key
]
4976 obj
= (obj
.values() if isinstance(obj
, dict)
4977 else obj
if isinstance(obj
, (list, tuple, LazyList
))
4978 else str(obj
) if traverse_string
else [])
4980 depth
= max(depth
, _current_depth
)
4981 return [_traverse_obj(inner_obj
, path
[i
+ 1:], _current_depth
) for inner_obj
in obj
]
4983 if isinstance(obj
, (list, tuple, LazyList
)):
4984 obj
= enumerate(obj
)
4985 elif isinstance(obj
, dict):
4988 if not traverse_string
:
4992 depth
= max(depth
, _current_depth
)
4993 return [_traverse_obj(v
, path
[i
+ 1:], _current_depth
) for k
, v
in obj
if key(k
)]
4994 elif isinstance(obj
, dict) and not (is_user_input
and key
== ':'):
4995 obj
= (obj
.get(key
) if casesense
or (key
in obj
)
4996 else next((v
for k
, v
in obj
.items() if _lower(k
) == key
), None))
4999 key
= (int_or_none(key
) if ':' not in key
5000 else slice(*map(int_or_none
, key
.split(':'))))
5001 if key
== slice(None):
5002 return _traverse_obj(obj
, (..., *path
[i
+ 1:]), _current_depth
)
5003 if not isinstance(key
, (int, slice)):
5005 if not isinstance(obj
, (list, tuple, LazyList
)):
5006 if not traverse_string
:
5015 if isinstance(expected_type
, type):
5016 type_test
= lambda val
: val
if isinstance(val
, expected_type
) else None
5017 elif expected_type
is not None:
5018 type_test
= expected_type
5020 type_test
= lambda val
: val
5022 for path
in path_list
:
5024 val
= _traverse_obj(obj
, path
)
5027 for _
in range(depth
- 1):
5028 val
= itertools
.chain
.from_iterable(v
for v
in val
if v
is not None)
5029 val
= [v
for v
in map(type_test
, val
) if v
is not None]
5031 return val
if get_all
else val
[0]
5033 val
= type_test(val
)
def traverse_dict(dictn, keys, casesense=True):
    # Deprecated thin wrapper kept for backward compatibility: forwards to
    # traverse_obj() with user-input semantics (string keys may be converted
    # to int/slice, and traversal descends into strings).
    write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return *x* unchanged if it is a non-atomic iterable, else wrap it in a tuple.

    str, bytes and dict (the *allowed_types*) are treated as atomic values
    even though they are iterable.
    """
    is_sequence = (
        isinstance(x, collections.abc.Iterable)
        and not isinstance(x, allowed_types))
    return x if is_sequence else (x,)
5050 # create a JSON Web Signature (jws) with HS256 algorithm
5051 # the resulting format is in JWS Compact Serialization
5052 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5053 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5054 def jwt_encode_hs256(payload_data
, key
, headers
={}):
5060 header_data
.update(headers
)
5061 header_b64
= base64
.b64encode(json
.dumps(header_data
).encode('utf-8'))
5062 payload_b64
= base64
.b64encode(json
.dumps(payload_data
).encode('utf-8'))
5063 h
= hmac
.new(key
.encode('utf-8'), header_b64
+ b
'.' + payload_b64
, hashlib
.sha256
)
5064 signature_b64
= base64
.b64encode(h
.digest())
5065 token
= header_b64
+ b
'.' + payload_b64
+ b
'.' + signature_b64
5069 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Decode the payload of an HS256 JWT *without* verifying the signature.

    Returns the payload as a Python object (normally a dict).
    """
    header_b64, payload_b64, signature_b64 = jwt.split('.')
    # JWT segments use *unpadded* base64url (RFC 7515 §2); b64decode raises
    # binascii.Error without the padding, so restore it first.
    payload_b64 += '=' * (-len(payload_b64) % 4)
    payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
    return payload_data
def supports_terminal_sequences(stream):
    # Whether ANSI/VT escape sequences can safely be written to *stream*.
    # NOTE(review): several lines are elided in this view; the `return False`
    # branches and the try/except are reconstructed from context — confirm.
    if compat_os_name == 'nt':
        from .compat import WINDOWS_VT_MODE  # Must be imported locally
        # Windows consoles only render VT sequences from build 10586 onwards,
        # and only once VT mode has been enabled
        if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
            return False
    elif not os.getenv('TERM'):
        # No terminal type advertised => assume a non-terminal stream
        return False
    try:
        return stream.isatty()
    except BaseException:
        # isatty() may be missing or raise on exotic stream objects
        return False
# Pre-compiled pattern matching ANSI SGR escape sequences such as '\033[0;31m'
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return *string* with every ANSI SGR (terminal styling) sequence stripped."""
    cleaned = _terminal_sequences_re.sub('', string)
    return cleaned
def number_of_digits(number):
    """Return the character count of *number* rendered with '%d' (sign included)."""
    rendered = '%d' % number
    return len(rendered)
def join_nonempty(*values, delim='-', from_dict=None):
    """Concatenate the truthy values with *delim*.

    When *from_dict* is given, each value is first looked up as a key in it
    (missing keys become None and are therefore dropped).
    """
    if from_dict is not None:
        values = (from_dict.get(value) for value in values)
    return delim.join(str(value) for value in values if value)
    # Guard flag flipped by init(); forbids initializing the same instance twice
    # (double leading underscore => name-mangled to the enclosing class)
    __initialized = False
5111 def __init__(self
, parser
, label
=None):
5112 self
._parser
, self
.label
= parser
, label
5113 self
._loaded
_paths
, self
.configs
= set(), []
    def init(self, args=None, filename=None):
        """Load *args* as this config's own args and follow any --config-locations.

        Returns False when *filename* was already loaded (duplicate/cycle guard).
        May only be called once per instance.
        """
        assert not self.__initialized
        # NOTE(review): some lines are elided in this view; the `if filename:`
        # guard and the return values are reconstructed from context — confirm.
        if filename:
            location = os.path.realpath(filename)
            if location in self._loaded_paths:
                # Already read this file => refuse, preventing include loops
                return False
            self._loaded_paths.add(location)
        self.__initialized = True
        self.own_args, self.filename = args, filename
        # Recurse into every config file referenced by our own args
        for location in self._parser.parse_args(args)[0].config_locations or []:
            location = compat_expanduser(location)
            if os.path.isdir(location):
                # A directory means "use the default config file inside it"
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self._parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True
    def __str__(self):
        """Render this config and its children as an indented tree for debug output."""
        # NOTE(review): the def line and both delim= arguments are elided in this
        # view and reconstructed from context — confirm against the full file.
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            # Children are indented with '| ' to show nesting
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')
    @staticmethod
    def read_file(filename, default=[]):
        """Read a config file and shlex-split it into an argument list.

        Returns *default* when the file cannot be opened.
        NOTE(review): mutable default is shared across calls; callers must not
        mutate the returned default. try/except/finally scaffolding is elided in
        this view and reconstructed from context — confirm.
        """
        try:
            optionf = open(filename)
        except IOError:
            return default  # silently skip if file is not present
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read()
            if sys.version_info < (3,):
                # Python 2 reads bytes; decode with the locale's preferred encoding
                contents = contents.decode(preferredencoding())
            res = compat_shlex_split(contents, comments=True)
        finally:
            optionf.close()
        return res
5160 def hide_login_info(opts
):
5161 PRIVATE_OPTS
= set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
5162 eqre
= re
.compile('^(?P<key>' + ('|'.join(re
.escape(po
) for po
in PRIVATE_OPTS
)) + ')=.+$')
5167 return m
.group('key') + '=PRIVATE'
5171 opts
= list(map(_scrub_eq
, opts
))
5172 for idx
, opt
in enumerate(opts
):
5173 if opt
in PRIVATE_OPTS
and idx
+ 1 < len(opts
):
5174 opts
[idx
+ 1] = 'PRIVATE'
5177 def append_config(self
, *args
, label
=None):
5178 config
= type(self
)(self
._parser
, label
)
5179 config
._loaded
_paths
= self
._loaded
_paths
5180 if config
.init(*args
):
5181 self
.configs
.append(config
)
    @property
    def all_args(self):
        """Yield the args of every child config (in reverse append order), then our own.

        NOTE(review): the decorator/def lines are elided in this view and
        reconstructed from context — confirm against the full file.
        """
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.own_args or []
5189 def parse_args(self
):
5190 return self
._parser
.parse_args(list(self
.all_args
))