4 from __future__
import unicode_literals
39 import xml
.etree
.ElementTree
43 compat_HTMLParseError
,
49 compat_ctypes_WINFUNCTYPE
,
50 compat_etree_fromstring
,
53 compat_html_entities_html5
,
66 compat_urllib_parse_urlencode
,
67 compat_urllib_parse_urlparse
,
68 compat_urllib_parse_urlunparse
,
69 compat_urllib_parse_quote
,
70 compat_urllib_parse_quote_plus
,
71 compat_urllib_parse_unquote_plus
,
72 compat_urllib_request
,
def register_socks_protocols():
    """Add the SOCKS URL schemes to ``compat_urlparse.uses_netloc``.

    Schemes that are already listed are left alone, so calling this more
    than once does not create duplicates.
    """
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904:
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    netloc_schemes = compat_urlparse.uses_netloc
    missing_schemes = [
        scheme for scheme in ('socks', 'socks4', 'socks4a', 'socks5')
        if scheme not in netloc_schemes]
    netloc_schemes.extend(missing_schemes)
# This is not clearly defined otherwise: the type of a compiled regular
# expression object, obtained by compiling an empty pattern.
compiled_regex_type = type(re.compile(''))
96 def random_user_agent():
97 _USER_AGENT_TPL
= 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
1676 return _USER_AGENT_TPL
% random
.choice(_CHROME_VERSIONS
)
1680 'User-Agent': random_user_agent(),
1681 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
1682 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
1683 'Accept-Encoding': 'gzip, deflate',
1684 'Accept-Language': 'en-us,en;q=0.5',
1689 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# Unique sentinel used as the default of `default=` parameters, so that
# callers can still pass None as a real default value.
NO_DEFAULT = object()

# English month names in calendar order (index 0 is January).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
1700 'en': ENGLISH_MONTH_NAMES
,
1702 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
1703 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
1706 KNOWN_EXTENSIONS
= (
1707 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
1708 'flv', 'f4v', 'f4a', 'f4b',
1709 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
1710 'mkv', 'mka', 'mk3d',
1713 'asf', 'wmv', 'wma',
1719 'f4f', 'f4m', 'm3u8', 'smil')
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. Replacements that are
# longer than one character ('AE', 'OE', 'TH', 'ss', ...) are wrapped in
# single-element lists so that itertools.chain yields them as one item
# instead of splitting them into separate letters.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
1748 '%Y/%m/%d %H:%M:%S',
1752 '%Y-%m-%d %H:%M:%S',
1753 '%Y-%m-%d %H:%M:%S.%f',
1754 '%Y-%m-%d %H:%M:%S:%f',
1757 '%Y-%m-%dT%H:%M:%SZ',
1758 '%Y-%m-%dT%H:%M:%S.%fZ',
1759 '%Y-%m-%dT%H:%M:%S.%f0Z',
1760 '%Y-%m-%dT%H:%M:%S',
1761 '%Y-%m-%dT%H:%M:%S.%f',
1763 '%b %d %Y at %H:%M',
1764 '%b %d %Y at %H:%M:%S',
1765 '%B %d %Y at %H:%M',
1766 '%B %d %Y at %H:%M:%S',
1770 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
1771 DATE_FORMATS_DAY_FIRST
.extend([
1777 '%d/%m/%Y %H:%M:%S',
1780 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
1781 DATE_FORMATS_MONTH_FIRST
.extend([
1786 '%m/%d/%Y %H:%M:%S',
# Matches the argument tail `}('<payload>',<int>,<int>,'<a|b|c>'.split('|')`
# and captures the payload, the two integers and the '|'-joined word list
# (presumably the call signature of packed/obfuscated JavaScript -- confirm
# against the callers).
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script> element whose type attribute is application/ld+json,
# optionally quoted (the closing quote must equal the opening one via \1).
# The element's content is captured lazily in the named group `json_ld`;
# (?is) makes the match case-insensitive and lets `.` span newlines.
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
1793 def preferredencoding():
1794 """Get preferred encoding.
1796 Returns the best encoding scheme for the system, based on
1797 locale.getpreferredencoding() and some further tweaks.
1800 pref = locale.getpreferredencoding()
1808 def write_json_file(obj, fn):
1809 """ Encode obj as JSON and write it to fn, atomically if possible """
1811 fn = encodeFilename(fn)
1812 if sys.version_info < (3, 0) and sys.platform != 'win32
':
1813 encoding = get_filesystem_encoding()
1814 # os.path.basename returns a bytes object, but NamedTemporaryFile
1815 # will fail if the filename contains non ascii characters unless we
1816 # use a unicode object
1817 path_basename = lambda f: os.path.basename(fn).decode(encoding)
1818 # the same for os.path.dirname
1819 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
1821 path_basename = os.path.basename
1822 path_dirname = os.path.dirname
1826 'prefix
': path_basename(fn) + '.',
1827 'dir': path_dirname(fn),
1831 # In Python 2.x, json.dump expects a bytestream.
1832 # In Python 3.x, it writes to a character stream
1833 if sys.version_info < (3, 0):
1838 'encoding
': 'utf
-8',
1841 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
1846 if sys.platform == 'win32
':
1847 # Need to remove existing file on Windows, else os.rename raises
1848 # WindowsError or FileExistsError.
1856 os.chmod(tf.name, 0o666 & ~mask)
1859 os.rename(tf.name, fn)
1868 if sys.version_info >= (2, 7):
1869 def find_xpath_attr(node, xpath, key, val=None):
1870 """ Find the xpath xpath[@key=val] """
1871 assert re.match(r'^
[a
-zA
-Z_
-]+$
', key)
1872 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
1873 return node.find(expr)
1875 def find_xpath_attr(node, xpath, key, val=None):
1876 for f in node.findall(compat_xpath(xpath)):
1877 if key not in f.attrib:
1879 if val is None or f.attrib.get(key) == val:
1883 # On python2.6 the xml.etree.ElementTree.Element methods don't support
1884 # the namespace parameter
1887 def xpath_with_ns(path
, ns_map
):
1888 components
= [c
.split(':') for c
in path
.split('/')]
1890 for c
in components
:
1892 replaced
.append(c
[0])
1895 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
1896 return '/'.join(replaced
)
1899 def xpath_element(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1900 def _find_xpath(xpath
):
1901 return node
.find(compat_xpath(xpath
))
1903 if isinstance(xpath
, (str, compat_str
)):
1904 n
= _find_xpath(xpath
)
1912 if default
is not NO_DEFAULT
:
1915 name
= xpath
if name
is None else name
1916 raise ExtractorError('Could not find XML element %s' % name
)
1922 def xpath_text(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1923 n
= xpath_element(node
, xpath
, name
, fatal
=fatal
, default
=default
)
1924 if n
is None or n
== default
:
1927 if default
is not NO_DEFAULT
:
1930 name
= xpath
if name
is None else name
1931 raise ExtractorError('Could not find XML element\'s text %s' % name
)
1937 def xpath_attr(node
, xpath
, key
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1938 n
= find_xpath_attr(node
, xpath
, key
)
1940 if default
is not NO_DEFAULT
:
1943 name
= '%s[@%s]' % (xpath
, key
) if name
is None else name
1944 raise ExtractorError('Could not find XML attribute %s' % name
)
1947 return n
.attrib
[key
]
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An ID lookup is an attribute lookup on the "id" attribute
    return get_element_by_attribute(attribute='id', value=id, html=html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        # No tag carries the class (or the lookup returned nothing)
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the first result of get_elements_by_attribute, or None if there is none.

    All arguments are forwarded unchanged to get_elements_by_attribute.
    """
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if not matches:
        return None
    return matches[0]
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # The class attribute may list several names, so match class_name as a
    # whole word anywhere inside the (quote-free) attribute value. The
    # pattern is already a regex, hence escape_value=False.
    class_value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
1973 def get_elements_by_attribute(attribute, value, html, escape_value=True):
1974 """Return the content of the tag with the specified attribute in the passed HTML document"""
1976 value = re.escape(value) if escape_value else value
1979 for m in re.finditer(r'''(?xs)
1981 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^
']*'|
))*?
1983 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^
"]*"|
='[^']*'|))*?
1987 ''' % (re.escape(attribute), value), html):
1988 res = m.group('content
')
1990 if res.startswith('"') or res.startswith("'"):
1993 retlist.append(unescapeHTML(res))
1998 class HTMLAttributeParser(compat_HTMLParser):
1999 """Trivial HTML parser to gather the attributes for a single element"""
2003 compat_HTMLParser.__init__(self)
2005 def handle_starttag(self, tag, attrs):
2006 self.attrs = dict(attrs)
2009 class HTMLListAttrsParser(compat_HTMLParser):
2010 """HTML parser to gather the attributes for the elements of a list"""
2013 compat_HTMLParser.__init__(self)
2017 def handle_starttag(self, tag, attrs):
2018 if tag == 'li
' and self._level == 0:
2019 self.items.append(dict(attrs))
2022 def handle_endtag(self, tag):
2026 def extract_attributes(html_element):
2027 """Given a string for an HTML element such as
2029 a="foo" B="bar" c="&98;az" d=boz
2030 empty= noval entity="&"
2033 Decode and return a dictionary of attributes.
2035 'a
': 'foo
', 'b
': 'bar
', c: 'baz
', d: 'boz
',
2036 'empty
': '', 'noval
': None, 'entity
': '&',
2037 'sq
': '"', 'dq': '\''
2039 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
2040 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
2042 parser = HTMLAttributeParser()
2044 parser.feed(html_element)
2046 # Older Python may throw HTMLParseError in case of malformed HTML
2047 except compat_HTMLParseError:
2052 def parse_list(webpage):
2053 """Given a string for an series of HTML <li> elements,
2054 return a dictionary of their attributes"""
2055 parser = HTMLListAttrsParser()
2056 parser.feed(webpage)
2061 def clean_html(html):
2062 """Clean an HTML snippet into a readable string"""
2064 if html is None: # Convenience for sanitizing descriptions etc.
2068 html = html.replace('\n', ' ')
2069 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
2070 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
2072 html = re.sub('<.*?>', '', html)
2073 # Replace html entities
2074 html = unescapeHTML(html)
2078 def sanitize_open(filename, open_mode):
2079 """Try to open the given filename, and slightly tweak it if this fails.
2081 Attempts to open the given filename. If this fails, it tries to change
2082 the filename slightly, step by step, until it's either able to open it
2083 or it fails and raises a final exception, like the standard open()
2086 It returns the tuple (stream, definitive_file_name).
2090 if sys.platform == 'win32':
2092 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
2093 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
2094 stream = open(encodeFilename(filename), open_mode)
2095 return (stream, filename)
2096 except (IOError, OSError) as err:
2097 if err.errno in (errno.EACCES,):
2100 # In case of error, try to remove win32 forbidden chars
2101 alt_filename = sanitize_path(filename)
2102 if alt_filename == filename:
2105 # An exception here should be caught in the caller
2106 stream = open(encodeFilename(alt_filename), open_mode)
2107 return (stream, alt_filename)
2110 def timeconvert(timestr):
2111 """Convert RFC 2822 defined time string into system timestamp"""
2113 timetuple = email.utils.parsedate_tz(timestr)
2114 if timetuple is not None:
2115 timestamp = email.utils.mktime_tz(timetuple)
2119 def sanitize_filename(s, restricted=False, is_id=False):
2120 """Sanitizes a string so it could be used as part of a filename.
2121 If restricted is set, use a stricter subset of allowed characters.
2122 Set is_id if this is not an arbitrary string, but an ID that should be kept
2125 def replace_insane(char):
2126 if restricted and char in ACCENT_CHARS:
2127 return ACCENT_CHARS[char]
2128 elif not restricted and char == '\n':
2130 elif char == '?' or ord(char) < 32 or ord(char) == 127:
2133 return '' if restricted else '\''
2135 return '_
-' if restricted else ' -'
2136 elif char in '\\/|
*<>':
2138 if restricted and (char in '!&\'()[]{}$
;`^
,#' or char.isspace()):
2140 if restricted
and ord(char
) > 127:
2147 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
2148 result
= ''.join(map(replace_insane
, s
))
2150 while '__' in result
:
2151 result
= result
.replace('__', '_')
2152 result
= result
.strip('_')
2153 # Common case of "Foreign band name - English song title"
2154 if restricted
and result
.startswith('-_'):
2156 if result
.startswith('-'):
2157 result
= '_' + result
[len('-'):]
2158 result
= result
.lstrip('.')
2164 def sanitize_path(s
, force
=False):
2165 """Sanitizes and normalizes path on Windows"""
2166 if sys
.platform
== 'win32':
2168 drive_or_unc
, _
= os
.path
.splitdrive(s
)
2169 if sys
.version_info
< (2, 7) and not drive_or_unc
:
2170 drive_or_unc
, _
= os
.path
.splitunc(s
)
2176 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
2180 path_part
if path_part
in ['.', '..'] else re
.sub(r
'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part
)
2181 for path_part
in norm_path
]
2183 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
2184 elif force
and s
[0] == os
.path
.sep
:
2185 sanitized_path
.insert(0, os
.path
.sep
)
2186 return os
.path
.join(*sanitized_path
)
2189 def sanitize_url(url
):
2190 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
2191 # the number of unwanted failures due to missing protocol
2192 if url
.startswith('//'):
2193 return 'http:%s' % url
2194 # Fix some common typos seen so far
2196 # https://github.com/ytdl-org/youtube-dl/issues/15649
2197 (r
'^httpss://', r
'https://'),
2198 # https://bx1.be/lives/direct-tv/
2199 (r
'^rmtp([es]?)://', r
'rtmp\1://'),
2201 for mistake
, fixup
in COMMON_TYPOS
:
2202 if re
.match(mistake
, url
):
2203 return re
.sub(mistake
, fixup
, url
)
2207 def extract_basic_auth(url
):
2208 parts
= compat_urlparse
.urlsplit(url
)
2209 if parts
.username
is None:
2211 url
= compat_urlparse
.urlunsplit(parts
._replace
(netloc
=(
2212 parts
.hostname
if parts
.port
is None
2213 else '%s:%d' % (parts
.hostname
, parts
.port
))))
2214 auth_payload
= base64
.b64encode(
2215 ('%s:%s' % (parts
.username
, parts
.password
or '')).encode('utf-8'))
2216 return url
, 'Basic ' + auth_payload
.decode('utf-8')
def sanitized_Request(url, *args, **kwargs):
    """Build a compat_urllib_request.Request from a cleaned-up URL.

    The URL is passed through sanitize_url and escape_url, and
    extract_basic_auth then splits it into a URL and an optional auth
    header value. When that value is not None it is stored as the
    'Authorization' header. Remaining arguments are forwarded to Request.
    """
    cleaned_url = escape_url(sanitize_url(url))
    url, auth_header = extract_basic_auth(cleaned_url)
    if auth_header is not None:
        if len(args) >= 2:
            # headers were passed positionally (second extra argument)
            headers = args[1]
        else:
            headers = kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)
2228 """Expand shell variables and ~"""
2229 return os
.path
.expandvars(compat_expanduser(s
))
2232 def orderedSet(iterable
):
2233 """ Remove all duplicates from the input iterable """
2241 def _htmlentity_transform(entity_with_semicolon
):
2242 """Transforms an HTML entity to a character."""
2243 entity
= entity_with_semicolon
[:-1]
2245 # Known non-numeric HTML entity
2246 if entity
in compat_html_entities
.name2codepoint
:
2247 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
# TODO: HTML5 allows entities without a semicolon. For example,
# '&Eacute;ric' should be decoded as 'Éric'.
2251 if entity_with_semicolon
in compat_html_entities_html5
:
2252 return compat_html_entities_html5
[entity_with_semicolon
]
2254 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
2255 if mobj
is not None:
2256 numstr
= mobj
.group(1)
2257 if numstr
.startswith('x'):
2259 numstr
= '0%s' % numstr
2262 # See https://github.com/ytdl-org/youtube-dl/issues/7518
2264 return compat_chr(int(numstr
, base
))
2268 # Unknown entity in name, return its literal representation
2269 return '&%s;' % entity
2272 def unescapeHTML(s
):
2275 assert type(s
) == compat_str
2278 r
'&([^&;]+;)', lambda m
: _htmlentity_transform(m
.group(1)), s
)
def escapeHTML(text):
    """Return `text` with the HTML-special characters replaced by entities.

    Escapes ``&``, ``<``, ``>``, ``"`` and ``'`` (the latter as ``&#39;``).

    @param text  The string to escape
    @returns     The escaped string
    """
    return (
        text
        # '&' must be replaced first, otherwise the ampersands introduced
        # by the following replacements would be escaped a second time
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )
2292 def process_communicate_or_kill(p
, *args
, **kwargs
):
2294 return p
.communicate(*args
, **kwargs
)
2295 except BaseException
: # Including KeyboardInterrupt
2301 class Popen(subprocess
.Popen
):
2302 if sys
.platform
== 'win32':
2303 _startupinfo
= subprocess
.STARTUPINFO()
2304 _startupinfo
.dwFlags |
= subprocess
.STARTF_USESHOWWINDOW
2308 def __init__(self
, *args
, **kwargs
):
2309 super(Popen
, self
).__init
__(*args
, **kwargs
, startupinfo
=self
._startupinfo
)
2311 def communicate_or_kill(self
, *args
, **kwargs
):
2312 return process_communicate_or_kill(self
, *args
, **kwargs
)
2315 def get_subprocess_encoding():
2316 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2317 # For subprocess calls, encode with locale encoding
2318 # Refer to http://stackoverflow.com/a/9951851/35070
2319 encoding
= preferredencoding()
2321 encoding
= sys
.getfilesystemencoding()
2322 if encoding
is None:
2327 def encodeFilename(s
, for_subprocess
=False):
2329 @param s The name of the file
2332 assert type(s
) == compat_str
2334 # Python 3 has a Unicode API
2335 if sys
.version_info
>= (3, 0):
2338 # Pass '' directly to use Unicode APIs on Windows 2000 and up
2339 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
2340 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
2341 if not for_subprocess
and sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2344 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
2345 if sys
.platform
.startswith('java'):
2348 return s
.encode(get_subprocess_encoding(), 'ignore')
2351 def decodeFilename(b
, for_subprocess
=False):
2353 if sys
.version_info
>= (3, 0):
2356 if not isinstance(b
, bytes):
2359 return b
.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument via encodeFilename(..., for_subprocess=True).

    Values that are not compat_str are first decoded as ASCII.
    """
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        text = s.decode('ascii')
    return encodeFilename(text, for_subprocess=True)
def decodeArgument(b):
    """Decode a command-line argument via decodeFilename(..., for_subprocess=True)."""
    return decodeFilename(b, for_subprocess=True)
2375 def decodeOption(optval
):
2378 if isinstance(optval
, bytes):
2379 optval
= optval
.decode(preferredencoding())
2381 assert isinstance(optval
, compat_str
)
# Result record of timetuple_from_msec
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a duration given in milliseconds into a Time tuple.

    @param msec  Duration in milliseconds
    @returns     Time(hours, minutes, seconds, milliseconds); hours are not
                 capped, so durations over a day give hours >= 24
    """
    whole_seconds, milliseconds = divmod(msec, 1000)
    whole_minutes, seconds = divmod(whole_seconds, 60)
    hours, minutes = divmod(whole_minutes, 60)
    return _timetuple(
        hours=hours, minutes=minutes, seconds=seconds, milliseconds=milliseconds)
2395 def formatSeconds(secs
, delim
=':', msec
=False):
2396 time
= timetuple_from_msec(secs
* 1000)
2398 ret
= '%d%s%02d%s%02d' % (time
.hours
, delim
, time
.minutes
, delim
, time
.seconds
)
2400 ret
= '%d%s%02d' % (time
.minutes
, delim
, time
.seconds
)
2402 ret
= '%d' % time
.seconds
2403 return '%s.%03d' % (ret
, time
.milliseconds
) if msec
else ret
2406 def _ssl_load_windows_store_certs(ssl_context
, storename
):
2407 # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
2409 certs
= [cert
for cert
, encoding
, trust
in ssl
.enum_certificates(storename
)
2410 if encoding
== 'x509_asn' and (
2411 trust
is True or ssl
.Purpose
.SERVER_AUTH
.oid
in trust
)]
2412 except PermissionError
:
2416 ssl_context
.load_verify_locations(cadata
=cert
)
2417 except ssl
.SSLError
:
2421 def make_HTTPS_handler(params
, **kwargs
):
2422 opts_check_certificate
= not params
.get('nocheckcertificate')
2423 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLS_CLIENT
)
2424 context
.check_hostname
= opts_check_certificate
2425 context
.verify_mode
= ssl
.CERT_REQUIRED
if opts_check_certificate
else ssl
.CERT_NONE
2426 if opts_check_certificate
:
2428 context
.load_default_certs()
2429 # Work around the issue in load_default_certs when there are bad certificates. See:
2430 # https://github.com/yt-dlp/yt-dlp/issues/1060,
2431 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
2432 except ssl
.SSLError
:
2433 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
2434 if sys
.platform
== 'win32' and hasattr(ssl
, 'enum_certificates'):
2435 # Create a new context to discard any certificates that were already loaded
2436 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLS_CLIENT
)
2437 context
.check_hostname
, context
.verify_mode
= True, ssl
.CERT_REQUIRED
2438 for storename
in ('CA', 'ROOT'):
2439 _ssl_load_windows_store_certs(context
, storename
)
2440 context
.set_default_verify_paths()
2441 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2444 def bug_reports_message(before
=';'):
2445 if ytdl_is_updateable():
2446 update_cmd
= 'type yt-dlp -U to update'
2448 update_cmd
= 'see https://github.com/yt-dlp/yt-dlp on how to update'
2449 msg
= 'please report this issue on https://github.com/yt-dlp/yt-dlp .'
2450 msg
+= ' Make sure you are using the latest version; %s.' % update_cmd
2451 msg
+= ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'
2453 before
= before
.rstrip()
2454 if not before
or before
.endswith(('.', '!', '?')):
2455 msg
= msg
[0].title() + msg
[1:]
2457 return (before
+ ' ' if before
else '') + msg
2460 class YoutubeDLError(Exception):
2461 """Base exception for YoutubeDL errors."""
2464 def __init__(self
, msg
=None):
2467 elif self
.msg
is None:
2468 self
.msg
= type(self
).__name
__
2469 super().__init
__(self
.msg
)
# Exception types treated as network failures; the result is a tuple
network_exceptions = (
    compat_urllib_error.URLError,
    compat_http_client.HTTPException,
    socket.error,
)
# ssl.CertificateError is only added when the ssl module provides it
if hasattr(ssl, 'CertificateError'):
    network_exceptions += (ssl.CertificateError,)
2478 class ExtractorError(YoutubeDLError
):
2479 """Error during info extraction."""
2481 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None, ie
=None):
2482 """ tb, if given, is the original traceback (so that it can be printed out).
2483 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
2485 if sys
.exc_info()[0] in network_exceptions
:
2490 self
.expected
= expected
2492 self
.video_id
= video_id
2494 self
.exc_info
= sys
.exc_info() # preserve original exception
2496 super(ExtractorError
, self
).__init
__(''.join((
2497 format_field(ie
, template
='[%s] '),
2498 format_field(video_id
, template
='%s: '),
2500 format_field(cause
, template
=' (caused by %r)'),
2501 '' if expected
else bug_reports_message())))
2503 def format_traceback(self
):
2504 if self
.traceback
is None:
2506 return ''.join(traceback
.format_tb(self
.traceback
))
2509 class UnsupportedError(ExtractorError
):
2510 def __init__(self
, url
):
2511 super(UnsupportedError
, self
).__init
__(
2512 'Unsupported URL: %s' % url
, expected
=True)
2516 class RegexNotFoundError(ExtractorError
):
2517 """Error when a regex didn't match"""
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        """Create the error.

        `countries` is stored on the instance as given; every other keyword
        argument is forwarded to ExtractorError.
        """
        # Always marked as expected, overriding whatever the caller passed
        kwargs.update(expected=True)
        super().__init__(msg, **kwargs)
        self.countries = countries
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        # Kept so that handlers can inspect the original exception
        self.exc_info = exc_info
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict.
    """
    # Class-level message text (presumably used by the base class when no
    # explicit message is given -- confirm against YoutubeDLError.__init__)
    msg = 'Entry not found in info'
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """Create the error; `filename`, if given, is appended to the message."""
        if filename is not None:
            # Stored on the instance; the class-level text stays untouched
            self.msg = self.msg + f': {filename}'
        super().__init__(self.msg)
2571 class PostProcessingError(YoutubeDLError
):
2572 """Post Processing exception.
2574 This exception may be raised by PostProcessor's .run() method to
2575 indicate an error in the postprocessing task.
2578 def __init__(self
, msg
):
2579 super(PostProcessingError
, self
).__init
__(msg
)
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Class-level message text; the subclasses below override it
    msg = 'The download was cancelled'
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    # Overrides the generic DownloadCancelled message text
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    # Overrides the generic DownloadCancelled message text
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Overrides the generic DownloadCancelled message text
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
class ThrottledDownload(YoutubeDLError):
    """ Download speed below --throttled-rate. """
    # Class-level message text. Note this derives from YoutubeDLError
    # directly, not from DownloadCancelled.
    msg = 'The download speed is below throttle limit'
2608 class UnavailableVideoError(YoutubeDLError
):
2609 """Unavailable Format exception.
2611 This exception will be thrown when a video is requested
2612 in a format that is not available for that video.
2614 msg
= 'Unable to download video'
2616 def __init__(self
, err
=None):
2618 self
.msg
+= f
': {err}'
2619 super().__init
__(self
.msg
)
2622 class ContentTooShortError(YoutubeDLError
):
2623 """Content Too Short exception.
2625 This exception may be raised by FileDownloader objects when a file they
2626 download is too small for what the server announced first, indicating
2627 the connection was probably interrupted.
2630 def __init__(self
, downloaded
, expected
):
2631 super(ContentTooShortError
, self
).__init
__(
2632 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded
, expected
)
2635 self
.downloaded
= downloaded
2636 self
.expected
= expected
2639 class XAttrMetadataError(YoutubeDLError
):
2640 def __init__(self
, code
=None, msg
='Unknown error'):
2641 super(XAttrMetadataError
, self
).__init
__(msg
)
2645 # Parsing code and msg
2646 if (self
.code
in (errno
.ENOSPC
, errno
.EDQUOT
)
2647 or 'No space left' in self
.msg
or 'Disk quota exceeded' in self
.msg
):
2648 self
.reason
= 'NO_SPACE'
2649 elif self
.code
== errno
.E2BIG
or 'Argument list too long' in self
.msg
:
2650 self
.reason
= 'VALUE_TOO_LONG'
2652 self
.reason
= 'NOT_SUPPORTED'
2655 class XAttrUnavailableError(YoutubeDLError
):
2659 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
2660 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
2661 # expected HTTP responses to meet HTTP/1.0 or later (see also
2662 # https://github.com/ytdl-org/youtube-dl/issues/6727)
2663 if sys
.version_info
< (3, 0):
2664 kwargs
['strict'] = True
2665 hc
= http_class(*args
, **compat_kwargs(kwargs
))
2666 source_address
= ydl_handler
._params
.get('source_address')
2668 if source_address
is not None:
2669 # This is to workaround _create_connection() from socket where it will try all
2670 # address data from getaddrinfo() including IPv6. This filters the result from
2671 # getaddrinfo() based on the source_address value.
2672 # This is based on the cpython socket.create_connection() function.
2673 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
2674 def _create_connection(address
, timeout
=socket
._GLOBAL
_DEFAULT
_TIMEOUT
, source_address
=None):
2675 host
, port
= address
2677 addrs
= socket
.getaddrinfo(host
, port
, 0, socket
.SOCK_STREAM
)
2678 af
= socket
.AF_INET
if '.' in source_address
[0] else socket
.AF_INET6
2679 ip_addrs
= [addr
for addr
in addrs
if addr
[0] == af
]
2680 if addrs
and not ip_addrs
:
2681 ip_version
= 'v4' if af
== socket
.AF_INET
else 'v6'
2683 "No remote IP%s addresses available for connect, can't use '%s' as source address"
2684 % (ip_version
, source_address
[0]))
2685 for res
in ip_addrs
:
2686 af
, socktype
, proto
, canonname
, sa
= res
2689 sock
= socket
.socket(af
, socktype
, proto
)
2690 if timeout
is not socket
._GLOBAL
_DEFAULT
_TIMEOUT
:
2691 sock
.settimeout(timeout
)
2692 sock
.bind(source_address
)
2694 err
= None # Explicitly break reference cycle
2696 except socket
.error
as _
:
2698 if sock
is not None:
2703 raise socket
.error('getaddrinfo returns an empty list')
2704 if hasattr(hc
, '_create_connection'):
2705 hc
._create
_connection
= _create_connection
2706 sa
= (source_address
, 0)
2707 if hasattr(hc
, 'source_address'): # Python 2.7+
2708 hc
.source_address
= sa
2710 def _hc_connect(self
, *args
, **kwargs
):
2711 sock
= _create_connection(
2712 (self
.host
, self
.port
), self
.timeout
, sa
)
2714 self
.sock
= ssl
.wrap_socket(
2715 sock
, self
.key_file
, self
.cert_file
,
2716 ssl_version
=ssl
.PROTOCOL_TLSv1
)
2719 hc
.connect
= functools
.partial(_hc_connect
, hc
)
def handle_youtubedl_headers(headers):
    """Apply the internal "Youtubedl-no-compression" marker header.

    Without the marker, `headers` is returned as is (the same object).
    With it, a new dict is returned that lacks the marker itself and any
    Accept-Encoding header (matched case-insensitively); the passed
    mapping is not modified.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    return {
        name: value for name, value in headers.items()
        if name != 'Youtubedl-no-compression' and name.lower() != 'accept-encoding'
    }
2734 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
2735 """Handler for HTTP requests and responses.
2737 This class, when installed with an OpenerDirector, automatically adds
2738 the standard headers to every HTTP request and handles gzipped and
2739 deflated responses from web servers. If compression is to be avoided in
2740 a particular request, the original request in the program code only has
2741 to include the HTTP header "Youtubedl-no-compression", which will be
2742 removed before making the real request.
2744 Part of this code was copied from:
2746 http://techknack.net/python-urllib2-handlers/
2748 Andrew Rowls, the author of that code, agreed to release it to the
2752 def __init__(self
, params
, *args
, **kwargs
):
2753 compat_urllib_request
.HTTPHandler
.__init
__(self
, *args
, **kwargs
)
2754 self
._params
= params
2756 def http_open(self
, req
):
2757 conn_class
= compat_http_client
.HTTPConnection
2759 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2761 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2762 del req
.headers
['Ytdl-socks-proxy']
2764 return self
.do_open(functools
.partial(
2765 _create_http_connection
, self
, conn_class
, False),
2773 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
2775 return zlib
.decompress(data
)
2777 def http_request(self
, req
):
2778 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
2779 # always respected by websites, some tend to give out URLs with non percent-encoded
2780 # non-ASCII characters (see telemb.py, ard.py [#3412])
2781 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
2782 # To work around aforementioned issue we will replace request's original URL with
2783 # percent-encoded one
2784 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
2785 # the code of this workaround has been moved here from YoutubeDL.urlopen()
2786 url
= req
.get_full_url()
2787 url_escaped
= escape_url(url
)
2789 # Substitute URL if any change after escaping
2790 if url
!= url_escaped
:
2791 req
= update_Request(req
, url
=url_escaped
)
2793 for h
, v
in std_headers
.items():
2794 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
2795 # The dict keys are capitalized because of this bug by urllib
2796 if h
.capitalize() not in req
.headers
:
2797 req
.add_header(h
, v
)
2799 req
.headers
= handle_youtubedl_headers(req
.headers
)
2801 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
2802 # Python 2.6 is brain-dead when it comes to fragments
2803 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
2804 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
2808 def http_response(self
, req
, resp
):
2811 if resp
.headers
.get('Content-encoding', '') == 'gzip':
2812 content
= resp
.read()
2813 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
2815 uncompressed
= io
.BytesIO(gz
.read())
2816 except IOError as original_ioerror
:
2817 # There may be junk at the end of the file
2818 # See http://stackoverflow.com/q/4928560/35070 for details
2819 for i
in range(1, 1024):
2821 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
2822 uncompressed
= io
.BytesIO(gz
.read())
2827 raise original_ioerror
2828 resp
= compat_urllib_request
.addinfourl(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2829 resp
.msg
= old_resp
.msg
2830 del resp
.headers
['Content-encoding']
2832 if resp
.headers
.get('Content-encoding', '') == 'deflate':
2833 gz
= io
.BytesIO(self
.deflate(resp
.read()))
2834 resp
= compat_urllib_request
.addinfourl(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2835 resp
.msg
= old_resp
.msg
2836 del resp
.headers
['Content-encoding']
2837 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
2838 # https://github.com/ytdl-org/youtube-dl/issues/6457).
2839 if 300 <= resp
.code
< 400:
2840 location
= resp
.headers
.get('Location')
2842 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
2843 if sys
.version_info
>= (3, 0):
2844 location
= location
.encode('iso-8859-1').decode('utf-8')
2846 location
= location
.decode('utf-8')
2847 location_escaped
= escape_url(location
)
2848 if location
!= location_escaped
:
2849 del resp
.headers
['Location']
2850 if sys
.version_info
< (3, 0):
2851 location_escaped
= location_escaped
.encode('utf-8')
2852 resp
.headers
['Location'] = location_escaped
2855 https_request
= http_request
2856 https_response
= http_response
2859 def make_socks_conn_class(base_class
, socks_proxy
):
2860 assert issubclass(base_class
, (
2861 compat_http_client
.HTTPConnection
, compat_http_client
.HTTPSConnection
))
2863 url_components
= compat_urlparse
.urlparse(socks_proxy
)
2864 if url_components
.scheme
.lower() == 'socks5':
2865 socks_type
= ProxyType
.SOCKS5
2866 elif url_components
.scheme
.lower() in ('socks', 'socks4'):
2867 socks_type
= ProxyType
.SOCKS4
2868 elif url_components
.scheme
.lower() == 'socks4a':
2869 socks_type
= ProxyType
.SOCKS4A
2871 def unquote_if_non_empty(s
):
2874 return compat_urllib_parse_unquote_plus(s
)
2878 url_components
.hostname
, url_components
.port
or 1080,
2880 unquote_if_non_empty(url_components
.username
),
2881 unquote_if_non_empty(url_components
.password
),
2884 class SocksConnection(base_class
):
2886 self
.sock
= sockssocket()
2887 self
.sock
.setproxy(*proxy_args
)
2888 if type(self
.timeout
) in (int, float):
2889 self
.sock
.settimeout(self
.timeout
)
2890 self
.sock
.connect((self
.host
, self
.port
))
2892 if isinstance(self
, compat_http_client
.HTTPSConnection
):
2893 if hasattr(self
, '_context'): # Python > 2.6
2894 self
.sock
= self
._context
.wrap_socket(
2895 self
.sock
, server_hostname
=self
.host
)
2897 self
.sock
= ssl
.wrap_socket(self
.sock
)
2899 return SocksConnection
2902 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
2903 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
2904 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
2905 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
2906 self
._params
= params
2908 def https_open(self
, req
):
2910 conn_class
= self
._https
_conn
_class
2912 if hasattr(self
, '_context'): # python > 2.6
2913 kwargs
['context'] = self
._context
2914 if hasattr(self
, '_check_hostname'): # python 3.x
2915 kwargs
['check_hostname'] = self
._check
_hostname
2917 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2919 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2920 del req
.headers
['Ytdl-socks-proxy']
2922 return self
.do_open(functools
.partial(
2923 _create_http_connection
, self
, conn_class
, True),
2927 class YoutubeDLCookieJar(compat_cookiejar
.MozillaCookieJar
):
2929 See [1] for cookie file format.
2931 1. https://curl.haxx.se/docs/http-cookies.html
2933 _HTTPONLY_PREFIX
= '#HttpOnly_'
2935 _HEADER
= '''# Netscape HTTP Cookie File
2936 # This file is generated by yt-dlp. Do not edit.
2939 _CookieFileEntry
= collections
.namedtuple(
2941 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
2943 def save(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2945 Save cookies to a file.
2947 Most of the code is taken from CPython 3.8 and slightly adapted
2948 to support cookie files with UTF-8 in both python 2 and 3.
2950 if filename
is None:
2951 if self
.filename
is not None:
2952 filename
= self
.filename
2954 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2956 # Store session cookies with `expires` set to 0 instead of an empty
2959 if cookie
.expires
is None:
2962 with io
.open(filename
, 'w', encoding
='utf-8') as f
:
2963 f
.write(self
._HEADER
)
2966 if not ignore_discard
and cookie
.discard
:
2968 if not ignore_expires
and cookie
.is_expired(now
):
2974 if cookie
.domain
.startswith('.'):
2975 initial_dot
= 'TRUE'
2977 initial_dot
= 'FALSE'
2978 if cookie
.expires
is not None:
2979 expires
= compat_str(cookie
.expires
)
2982 if cookie
.value
is None:
2983 # cookies.txt regards 'Set-Cookie: foo' as a cookie
2984 # with no name, whereas http.cookiejar regards it as a
2985 # cookie with no value.
2990 value
= cookie
.value
2992 '\t'.join([cookie
.domain
, initial_dot
, cookie
.path
,
2993 secure
, expires
, name
, value
]) + '\n')
2995 def load(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2996 """Load cookies from a file."""
2997 if filename
is None:
2998 if self
.filename
is not None:
2999 filename
= self
.filename
3001 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
3003 def prepare_line(line
):
3004 if line
.startswith(self
._HTTPONLY
_PREFIX
):
3005 line
= line
[len(self
._HTTPONLY
_PREFIX
):]
3006 # comments and empty lines are fine
3007 if line
.startswith('#') or not line
.strip():
3009 cookie_list
= line
.split('\t')
3010 if len(cookie_list
) != self
._ENTRY
_LEN
:
3011 raise compat_cookiejar
.LoadError('invalid length %d' % len(cookie_list
))
3012 cookie
= self
._CookieFileEntry
(*cookie_list
)
3013 if cookie
.expires_at
and not cookie
.expires_at
.isdigit():
3014 raise compat_cookiejar
.LoadError('invalid expires at %s' % cookie
.expires_at
)
3018 with io
.open(filename
, encoding
='utf-8') as f
:
3021 cf
.write(prepare_line(line
))
3022 except compat_cookiejar
.LoadError
as e
:
3024 'WARNING: skipping cookie file entry due to %s: %r\n'
3025 % (e
, line
), sys
.stderr
)
3028 self
._really
_load
(cf
, filename
, ignore_discard
, ignore_expires
)
3029 # Session cookies are denoted by either `expires` field set to
3030 # an empty string or 0. MozillaCookieJar only recognizes the former
3031 # (see [1]). So we need to force the latter to be recognized as session
3032 # cookies on our own.
3033 # Session cookies may be important for cookies-based authentication,
3034 # e.g. usually, when user does not check 'Remember me' check box while
3035 # logging in on a site, some important cookies are stored as session
3036 # cookies so that not recognizing them will result in failed login.
3037 # 1. https://bugs.python.org/issue17164
3039 # Treat `expires=0` cookies as session cookies
3040 if cookie
.expires
== 0:
3041 cookie
.expires
= None
3042 cookie
.discard
= True
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor whose HTTPS hooks are bound explicitly.

    ``https_request`` reuses the base class's ``http_request`` and
    ``https_response`` reuses the ``http_response`` defined here, so both
    schemes go through the same cookie handling.
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on the next HTTP request in a row if there are
        # non-ASCII characters in the Set-Cookie HTTP header of the last
        # response (see https://github.com/ytdl-org/youtube-dl/issues/6769).
        # A workaround that percent-encoded the Set-Cookie / Set-Cookie2
        # headers before processing used to run at this point; it is
        # disabled, so the response is handed to the base class unchanged.
        base_processor = compat_urllib_request.HTTPCookieProcessor
        return base_processor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
3069 class YoutubeDLRedirectHandler(compat_urllib_request
.HTTPRedirectHandler
):
3070 """YoutubeDL redirect handler
3072 The code is based on HTTPRedirectHandler implementation from CPython [1].
3074 This redirect handler solves two issues:
3075 - ensures redirect URL is always unicode under python 2
3076 - introduces support for experimental HTTP response status code
3077 308 Permanent Redirect [2] used by some sites [3]
3079 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
3080 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
3081 3. https://github.com/ytdl-org/youtube-dl/issues/28768
3084 http_error_301
= http_error_303
= http_error_307
= http_error_308
= compat_urllib_request
.HTTPRedirectHandler
.http_error_302
3086 def redirect_request(self
, req
, fp
, code
, msg
, headers
, newurl
):
3087 """Return a Request or None in response to a redirect.
3089 This is called by the http_error_30x methods when a
3090 redirection response is received. If a redirection should
3091 take place, return a new Request to allow http_error_30x to
3092 perform the redirect. Otherwise, raise HTTPError if no-one
3093 else should try to handle this url. Return None if you can't
3094 but another Handler might.
3096 m
= req
.get_method()
3097 if (not (code
in (301, 302, 303, 307, 308) and m
in ("GET", "HEAD")
3098 or code
in (301, 302, 303) and m
== "POST")):
3099 raise compat_HTTPError(req
.full_url
, code
, msg
, headers
, fp
)
3100 # Strictly (according to RFC 2616), 301 or 302 in response to
3101 # a POST MUST NOT cause a redirection without confirmation
3102 # from the user (of urllib.request, in this case). In practice,
3103 # essentially all clients do redirect in this case, so we do
3106 # On python 2 urlh.geturl() may sometimes return redirect URL
3107 # as byte string instead of unicode. This workaround allows
3108 # to force it always return unicode.
3109 if sys
.version_info
[0] < 3:
3110 newurl
= compat_str(newurl
)
3112 # Be conciliant with URIs containing a space. This is mainly
3113 # redundant with the more complete encoding done in http_error_302(),
3114 # but it is kept for compatibility with other callers.
3115 newurl
= newurl
.replace(' ', '%20')
3117 CONTENT_HEADERS
= ("content-length", "content-type")
3118 # NB: don't use dict comprehension for python 2.6 compatibility
3119 newheaders
= dict((k
, v
) for k
, v
in req
.headers
.items()
3120 if k
.lower() not in CONTENT_HEADERS
)
3121 return compat_urllib_request
.Request(
3122 newurl
, headers
=newheaders
, origin_req_host
=req
.origin_req_host
,
3126 def extract_timezone(date_str
):
3129 ^.{8,}? # >=8 char non-TZ prefix, if present
3130 (?P<tz>Z| # just the UTC Z, or
3131 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
3132 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
3133 [ ]? # optional space
3134 (?P<sign>\+|-) # +/-
3135 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
3139 timezone
= datetime
.timedelta()
3141 date_str
= date_str
[:-len(m
.group('tz'))]
3142 if not m
.group('sign'):
3143 timezone
= datetime
.timedelta()
3145 sign
= 1 if m
.group('sign') == '+' else -1
3146 timezone
= datetime
.timedelta(
3147 hours
=sign
* int(m
.group('hours')),
3148 minutes
=sign
* int(m
.group('minutes')))
3149 return timezone
, date_str
3152 def parse_iso8601(date_str
, delimiter
='T', timezone
=None):
3153 """ Return a UNIX timestamp from the given date """
3155 if date_str
is None:
3158 date_str
= re
.sub(r
'\.[0-9]+', '', date_str
)
3160 if timezone
is None:
3161 timezone
, date_str
= extract_timezone(date_str
)
3164 date_format
= '%Y-%m-%d{0}%H:%M:%S'.format(delimiter
)
3165 dt
= datetime
.datetime
.strptime(date_str
, date_format
) - timezone
3166 return calendar
.timegm(dt
.timetuple())
def date_formats(day_first=True):
    """Select the date format collection to try when parsing dates.

    Returns DATE_FORMATS_DAY_FIRST when day_first is truthy and
    DATE_FORMATS_MONTH_FIRST otherwise.
    """
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
3175 def unified_strdate(date_str
, day_first
=True):
3176 """Return a string with the date in the format YYYYMMDD"""
3178 if date_str
is None:
3182 date_str
= date_str
.replace(',', ' ')
3183 # Remove AM/PM + timezone
3184 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
3185 _
, date_str
= extract_timezone(date_str
)
3187 for expression
in date_formats(day_first
):
3189 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
3192 if upload_date
is None:
3193 timetuple
= email
.utils
.parsedate_tz(date_str
)
3196 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
3199 if upload_date
is not None:
3200 return compat_str(upload_date
)
3203 def unified_timestamp(date_str
, day_first
=True):
3204 if date_str
is None:
3207 date_str
= re
.sub(r
'[,|]', '', date_str
)
3209 pm_delta
= 12 if re
.search(r
'(?i)PM', date_str
) else 0
3210 timezone
, date_str
= extract_timezone(date_str
)
3212 # Remove AM/PM + timezone
3213 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
3215 # Remove unrecognized timezones from ISO 8601 alike timestamps
3216 m
= re
.search(r
'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str
)
3218 date_str
= date_str
[:-len(m
.group('tz'))]
3220 # Python only supports microseconds, so remove nanoseconds
3221 m
= re
.search(r
'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str
)
3223 date_str
= m
.group(1)
3225 for expression
in date_formats(day_first
):
3227 dt
= datetime
.datetime
.strptime(date_str
, expression
) - timezone
+ datetime
.timedelta(hours
=pm_delta
)
3228 return calendar
.timegm(dt
.timetuple())
3231 timetuple
= email
.utils
.parsedate_tz(date_str
)
3233 return calendar
.timegm(timetuple
) + pm_delta
* 3600
3236 def determine_ext(url
, default_ext
='unknown_video'):
3237 if url
is None or '.' not in url
:
3239 guess
= url
.partition('?')[0].rpartition('.')[2]
3240 if re
.match(r
'^[A-Za-z0-9]+$', guess
):
3242 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
3243 elif guess
.rstrip('/') in KNOWN_EXTENSIONS
:
3244 return guess
.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Derive a subtitle file name from filename.

    The new extension is '<sub_lang>.<sub_format>'; the actual replacement
    (including the handling of expected_real_ext) is delegated to
    replace_extension.
    """
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
3253 def datetime_from_str(date_str
, precision
='auto', format
='%Y%m%d'):
3255 Return a datetime object from a string in the format YYYYMMDD or
3256 (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
3258 format: string date format used to return datetime object from
3259 precision: round the time portion of a datetime object.
3260 auto|microsecond|second|minute|hour|day.
3261 auto: round to the unit provided in date_str (if applicable).
3263 auto_precision
= False
3264 if precision
== 'auto':
3265 auto_precision
= True
3266 precision
= 'microsecond'
3267 today
= datetime_round(datetime
.datetime
.now(), precision
)
3268 if date_str
in ('now', 'today'):
3270 if date_str
== 'yesterday':
3271 return today
- datetime
.timedelta(days
=1)
3273 r
'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
3275 if match
is not None:
3276 start_time
= datetime_from_str(match
.group('start'), precision
, format
)
3277 time
= int(match
.group('time')) * (-1 if match
.group('sign') == '-' else 1)
3278 unit
= match
.group('unit')
3279 if unit
== 'month' or unit
== 'year':
3280 new_date
= datetime_add_months(start_time
, time
* 12 if unit
== 'year' else time
)
3286 delta
= datetime
.timedelta(**{unit + 's': time}
)
3287 new_date
= start_time
+ delta
3289 return datetime_round(new_date
, unit
)
3292 return datetime_round(datetime
.datetime
.strptime(date_str
, format
), precision
)
def date_from_str(date_str, format='%Y%m%d'):
    """
    Return a date object parsed from a string in the format YYYYMMDD or
    (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: strptime format used to parse an absolute date string

    Parsing is delegated to datetime_from_str with microsecond precision;
    only the date part of its result is returned.
    """
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
def datetime_add_months(dt, months):
    """Shift dt forwards or backwards by a whole number of months.

    months may be negative. When the target month is shorter than the
    source month, the day is clamped to the last day of the target month
    (e.g. 31 January + 1 month gives the last day of February). All other
    fields of dt are kept as they are.
    """
    # Work with a zero-based month index so that divmod yields the year
    # carry and the month within the year in one step.
    year_carry, month_index = divmod(dt.month - 1 + months, 12)
    target_year = dt.year + year_carry
    target_month = month_index + 1
    days_in_target = calendar.monthrange(target_year, target_month)[1]
    return dt.replace(target_year, target_month, min(dt.day, days_in_target))
3314 def datetime_round(dt
, precision
='day'):
3316 Round a datetime object's time to a specific precision
3318 if precision
== 'microsecond':
3327 roundto
= lambda x
, n
: ((x
+ n
/ 2) // n
) * n
3328 timestamp
= calendar
.timegm(dt
.timetuple())
3329 return datetime
.datetime
.utcfromtimestamp(roundto(timestamp
, unit_seconds
[precision
]))
3332 def hyphenate_date(date_str
):
3334 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
3335 match
= re
.match(r
'^(\d\d\d\d)(\d\d)(\d\d)$', date_str
)
3336 if match
is not None:
3337 return '-'.join(match
.groups())
3342 class DateRange(object):
3343 """Represents a time interval between two dates"""
3345 def __init__(self
, start
=None, end
=None):
3346 """start and end must be strings in the format accepted by date"""
3347 if start
is not None:
3348 self
.start
= date_from_str(start
)
3350 self
.start
= datetime
.datetime
.min.date()
3352 self
.end
= date_from_str(end
)
3354 self
.end
= datetime
.datetime
.max.date()
3355 if self
.start
> self
.end
:
3356 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
3360 """Returns a range that only contains the given day"""
3361 return cls(day
, day
)
3363 def __contains__(self
, date
):
3364 """Check if the date is in the range"""
3365 if not isinstance(date
, datetime
.date
):
3366 date
= date_from_str(date
)
3367 return self
.start
<= date
<= self
.end
3370 return '%s - %s' % (self
.start
.isoformat(), self
.end
.isoformat())
3373 def platform_name():
3374 """ Returns the platform name as a compat_str """
3375 res
= platform
.platform()
3376 if isinstance(res
, bytes):
3377 res
= res
.decode(preferredencoding())
3379 assert isinstance(res
, compat_str
)
3383 def get_windows_version():
3384 ''' Get Windows version. None if it's not running on Windows '''
3385 if compat_os_name
== 'nt':
3386 return version_tuple(platform
.win32_ver()[1])
3391 def _windows_write_string(s
, out
):
3392 """ Returns True if the string was written using special methods,
3393 False if it has yet to be written out."""
3394 # Adapted from http://stackoverflow.com/a/3259271/35070
3397 import ctypes
.wintypes
3405 fileno
= out
.fileno()
3406 except AttributeError:
3407 # If the output stream doesn't have a fileno, it's virtual
3409 except io
.UnsupportedOperation
:
3410 # Some strange Windows pseudo files?
3412 if fileno
not in WIN_OUTPUT_IDS
:
3415 GetStdHandle
= compat_ctypes_WINFUNCTYPE(
3416 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
3417 ('GetStdHandle', ctypes
.windll
.kernel32
))
3418 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
3420 WriteConsoleW
= compat_ctypes_WINFUNCTYPE(
3421 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
3422 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
3423 ctypes
.wintypes
.LPVOID
)(('WriteConsoleW', ctypes
.windll
.kernel32
))
3424 written
= ctypes
.wintypes
.DWORD(0)
3426 GetFileType
= compat_ctypes_WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)(('GetFileType', ctypes
.windll
.kernel32
))
3427 FILE_TYPE_CHAR
= 0x0002
3428 FILE_TYPE_REMOTE
= 0x8000
3429 GetConsoleMode
= compat_ctypes_WINFUNCTYPE(
3430 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
3431 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
3432 ('GetConsoleMode', ctypes
.windll
.kernel32
))
3433 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
3435 def not_a_console(handle
):
3436 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
3438 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
3439 or GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
3441 if not_a_console(h
):
3444 def next_nonbmp_pos(s
):
3446 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
3447 except StopIteration:
3451 count
= min(next_nonbmp_pos(s
), 1024)
3453 ret
= WriteConsoleW(
3454 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
3456 raise OSError('Failed to write string')
3457 if not count
: # We just wrote a non-BMP character
3458 assert written
.value
== 2
3461 assert written
.value
> 0
3462 s
= s
[written
.value
:]
3466 def write_string(s
, out
=None, encoding
=None):
3469 assert type(s
) == compat_str
3471 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
3472 if _windows_write_string(s
, out
):
3475 if ('b' in getattr(out
, 'mode', '')
3476 or sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
3477 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
3479 elif hasattr(out
, 'buffer'):
3480 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
3481 byt
= s
.encode(enc
, 'ignore')
3482 out
.buffer.write(byt
)
3488 def bytes_to_intlist(bs
):
3491 if isinstance(bs
[0], int): # Python 3
3494 return [ord(c
) for c
in bs
]
3497 def intlist_to_bytes(xs
):
3500 return compat_struct_pack('%dB' % len(xs
), *xs
)
3503 # Cross-platform file locking
3504 if sys
.platform
== 'win32':
3505 import ctypes
.wintypes
3508 class OVERLAPPED(ctypes
.Structure
):
3510 ('Internal', ctypes
.wintypes
.LPVOID
),
3511 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
3512 ('Offset', ctypes
.wintypes
.DWORD
),
3513 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
3514 ('hEvent', ctypes
.wintypes
.HANDLE
),
3517 kernel32
= ctypes
.windll
.kernel32
3518 LockFileEx
= kernel32
.LockFileEx
3519 LockFileEx
.argtypes
= [
3520 ctypes
.wintypes
.HANDLE
, # hFile
3521 ctypes
.wintypes
.DWORD
, # dwFlags
3522 ctypes
.wintypes
.DWORD
, # dwReserved
3523 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3524 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3525 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3527 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
3528 UnlockFileEx
= kernel32
.UnlockFileEx
3529 UnlockFileEx
.argtypes
= [
3530 ctypes
.wintypes
.HANDLE
, # hFile
3531 ctypes
.wintypes
.DWORD
, # dwReserved
3532 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3533 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3534 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3536 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
3537 whole_low
= 0xffffffff
3538 whole_high
= 0x7fffffff
3540 def _lock_file(f
, exclusive
):
3541 overlapped
= OVERLAPPED()
3542 overlapped
.Offset
= 0
3543 overlapped
.OffsetHigh
= 0
3544 overlapped
.hEvent
= 0
3545 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
3546 handle
= msvcrt
.get_osfhandle(f
.fileno())
3547 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
3548 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3549 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
3551 def _unlock_file(f
):
3552 assert f
._lock
_file
_overlapped
_p
3553 handle
= msvcrt
.get_osfhandle(f
.fileno())
3554 if not UnlockFileEx(handle
, 0,
3555 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3556 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
3559 # Some platforms, such as Jython, are missing fcntl
3563 def _lock_file(f
, exclusive
):
3564 fcntl
.flock(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
3566 def _unlock_file(f
):
3567 fcntl
.flock(f
, fcntl
.LOCK_UN
)
3569 UNSUPPORTED_MSG
= 'file locking is not supported on this platform'
3571 def _lock_file(f
, exclusive
):
3572 raise IOError(UNSUPPORTED_MSG
)
3574 def _unlock_file(f
):
3575 raise IOError(UNSUPPORTED_MSG
)
3578 class locked_file(object):
3579 def __init__(self
, filename
, mode
, encoding
=None):
3580 assert mode
in ['r', 'a', 'w']
3581 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
3584 def __enter__(self
):
3585 exclusive
= self
.mode
!= 'r'
3587 _lock_file(self
.f
, exclusive
)
3593 def __exit__(self
, etype
, value
, traceback
):
3595 _unlock_file(self
.f
)
3602 def write(self
, *args
):
3603 return self
.f
.write(*args
)
3605 def read(self
, *args
):
3606 return self
.f
.read(*args
)
def get_filesystem_encoding():
    """Return the filesystem encoding reported by sys, or 'utf-8'.

    'utf-8' is used only when sys.getfilesystemencoding() returns None.
    """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
3614 def shell_quote(args
):
3616 encoding
= get_filesystem_encoding()
3618 if isinstance(a
, bytes):
3619 # We may get a filename encoded with 'encodeFilename'
3620 a
= a
.decode(encoding
)
3621 quoted_args
.append(compat_shlex_quote(a
))
3622 return ' '.join(quoted_args
)
3625 def smuggle_url(url
, data
):
3626 """ Pass additional data in a URL for internal use. """
3628 url
, idata
= unsmuggle_url(url
, {})
3630 sdata
= compat_urllib_parse_urlencode(
3631 {'__youtubedl_smuggle': json.dumps(data)}
)
3632 return url
+ '#' + sdata
3635 def unsmuggle_url(smug_url
, default
=None):
3636 if '#__youtubedl_smuggle' not in smug_url
:
3637 return smug_url
, default
3638 url
, _
, sdata
= smug_url
.rpartition('#')
3639 jsond
= compat_parse_qs(sdata
)['__youtubedl_smuggle'][0]
3640 data
= json
.loads(jsond
)
3644 def format_bytes(bytes):
3647 if type(bytes) is str:
3648 bytes = float(bytes)
3652 exponent
= int(math
.log(bytes, 1024.0))
3653 suffix
= ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent
]
3654 converted
= float(bytes) / float(1024 ** exponent
)
3655 return '%.2f%s' % (converted
, suffix
)
3658 def lookup_unit_table(unit_table
, s
):
3659 units_re
= '|'.join(re
.escape(u
) for u
in unit_table
)
3661 r
'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re
, s
)
3664 num_str
= m
.group('num').replace(',', '.')
3665 mult
= unit_table
[m
.group('unit')]
3666 return int(float(num_str
) * mult
)
3669 def parse_filesize(s
):
3673 # The lower-case forms are of course incorrect and unofficial,
3674 # but we support those too
3691 'megabytes': 1000 ** 2,
3692 'mebibytes': 1024 ** 2,
3698 'gigabytes': 1000 ** 3,
3699 'gibibytes': 1024 ** 3,
3705 'terabytes': 1000 ** 4,
3706 'tebibytes': 1024 ** 4,
3712 'petabytes': 1000 ** 5,
3713 'pebibytes': 1024 ** 5,
3719 'exabytes': 1000 ** 6,
3720 'exbibytes': 1024 ** 6,
3726 'zettabytes': 1000 ** 7,
3727 'zebibytes': 1024 ** 7,
3733 'yottabytes': 1000 ** 8,
3734 'yobibytes': 1024 ** 8,
3737 return lookup_unit_table(_UNIT_TABLE
, s
)
3746 if re
.match(r
'^[\d,.]+$', s
):
3747 return str_to_int(s
)
3758 return lookup_unit_table(_UNIT_TABLE
, s
)
3761 def parse_resolution(s
):
3765 mobj
= re
.search(r
'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s
)
3768 'width': int(mobj
.group('w')),
3769 'height': int(mobj
.group('h')),
3772 mobj
= re
.search(r
'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s
)
3774 return {'height': int(mobj.group(1))}
3776 mobj
= re
.search(r
'\b([48])[kK]\b', s
)
3778 return {'height': int(mobj.group(1)) * 540}
3783 def parse_bitrate(s
):
3784 if not isinstance(s
, compat_str
):
3786 mobj
= re
.search(r
'\b(\d+)\s*kbps', s
)
3788 return int(mobj
.group(1))
3791 def month_by_name(name
, lang
='en'):
3792 """ Return the number of a month by (locale-independently) English name """
3794 month_names
= MONTH_NAMES
.get(lang
, MONTH_NAMES
['en'])
3797 return month_names
.index(name
) + 1
3802 def month_by_abbreviation(abbrev
):
3803 """ Return the number of a month by (locale-independently) English
3807 return [s
[:3] for s
in ENGLISH_MONTH_NAMES
].index(abbrev
) + 1
3812 def fix_xml_ampersands(xml_str
):
3813 """Replace all the '&' by '&amp;' in XML"""
3815 r
'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
3820 def setproctitle(title
):
3821 assert isinstance(title
, compat_str
)
3823 # ctypes in Jython is not complete
3824 # http://bugs.jython.org/issue2148
3825 if sys
.platform
.startswith('java'):
3829 libc
= ctypes
.cdll
.LoadLibrary('libc.so.6')
3833 # LoadLibrary in Windows Python 2.7.13 only expects
3834 # a bytestring, but since unicode_literals turns
3835 # every string into a unicode string, it fails.
3837 title_bytes
= title
.encode('utf-8')
3838 buf
= ctypes
.create_string_buffer(len(title_bytes
))
3839 buf
.value
= title_bytes
3841 libc
.prctl(15, buf
, 0, 0, 0)
3842 except AttributeError:
3843 return # Strange libc, just skip this
def remove_start(s, start):
    """Return s without the leading start, if s begins with it.

    s is returned unchanged when it is None or does not start with start.
    """
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Return s without the trailing end, if s finishes with it.

    s is returned unchanged when it is None or does not end with end.
    An empty end leaves s untouched.
    """
    if s is None or not s.endswith(end):
        return s
    # Slice by an explicit stop index: the shorter s[:-len(end)] turns into
    # s[:0] for an empty suffix and would wrongly discard the whole string.
    return s[:len(s) - len(end)]
3854 def remove_quotes(s
):
3855 if s
is None or len(s
) < 2:
3857 for quote
in ('"', "'", ):
3858 if s
[0] == quote
and s
[-1] == quote
:
def get_domain(url):
    """Extract the domain part of url, or return None if there is none.

    An optional http(s):// scheme and an optional leading 'www.' are
    skipped; the domain is the following run of characters up to the first
    '/' and must contain at least one dot.
    """
    found = re.match(
        r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if not found:
        return None
    return found.group('domain')
def url_basename(url):
    """Return the last segment of the path component of url.

    Leading and trailing slashes of the path are ignored, so a URL whose
    path ends in '/' still yields its final directory name.
    """
    url_path = compat_urlparse.urlparse(url).path
    segments = url_path.strip('/').split('/')
    return segments[-1]
3874 return re
.match(r
'https?://[^?#&]+/', url
).group()
3877 def urljoin(base
, path
):
3878 if isinstance(path
, bytes):
3879 path
= path
.decode('utf-8')
3880 if not isinstance(path
, compat_str
) or not path
:
3882 if re
.match(r
'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path
):
3884 if isinstance(base
, bytes):
3885 base
= base
.decode('utf-8')
3886 if not isinstance(base
, compat_str
) or not re
.match(
3887 r
'^(?:https?:)?//', base
):
3889 return compat_urlparse
.urljoin(base
, path
)
3892 class HEADRequest(compat_urllib_request
.Request
):
3893 def get_method(self
):
3897 class PUTRequest(compat_urllib_request
.Request
):
3898 def get_method(self
):
3902 def int_or_none(v
, scale
=1, default
=None, get_attr
=None, invscale
=1):
3905 v
= getattr(v
, get_attr
, None)
3911 return int(v
) * invscale
// scale
3912 except (ValueError, TypeError, OverflowError):
def str_or_none(v, default=None):
    """Convert `v` to a text string, or return `default` when `v` is None."""
    if v is None:
        return default
    return compat_str(v)
3920 def str_to_int(int_str
):
3921 """ A more relaxed version of int_or_none """
3922 if isinstance(int_str
, compat_integer_types
):
3924 elif isinstance(int_str
, compat_str
):
3925 int_str
= re
.sub(r
'[,\.\+]', '', int_str
)
3926 return int_or_none(int_str
)
3929 def float_or_none(v
, scale
=1, invscale
=1, default
=None):
3933 return float(v
) * invscale
/ scale
3934 except (ValueError, TypeError):
def bool_or_none(v, default=None):
    """Return `v` if it is a genuine bool, otherwise `default`."""
    if isinstance(v, bool):
        return v
    return default
def strip_or_none(v, default=None):
    """Return `v` stripped of surrounding whitespace if it is a text string,
    otherwise `default`."""
    if not isinstance(v, compat_str):
        return default
    return v.strip()
3946 def url_or_none(url
):
3947 if not url
or not isinstance(url
, compat_str
):
3950 return url
if re
.match(r
'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url
) else None
3953 def strftime_or_none(timestamp
, date_format
, default
=None):
3954 datetime_object
= None
3956 if isinstance(timestamp
, compat_numeric_types
): # unix timestamp
3957 datetime_object
= datetime
.datetime
.utcfromtimestamp(timestamp
)
3958 elif isinstance(timestamp
, compat_str
): # assume YYYYMMDD
3959 datetime_object
= datetime
.datetime
.strptime(timestamp
, '%Y%m%d')
3960 return datetime_object
.strftime(date_format
)
3961 except (ValueError, TypeError, AttributeError):
3965 def parse_duration(s
):
3966 if not isinstance(s
, compat_basestring
):
3971 days
, hours
, mins
, secs
, ms
= [None] * 5
3972 m
= re
.match(r
'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s
)
3974 days
, hours
, mins
, secs
, ms
= m
.groups()
3979 [0-9]+\s*y(?:ears?)?\s*
3982 [0-9]+\s*m(?:onths?)?\s*
3985 [0-9]+\s*w(?:eeks?)?\s*
3988 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
3992 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
3995 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
3998 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
4001 days
, hours
, mins
, secs
, ms
= m
.groups()
4003 m
= re
.match(r
'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s
)
4005 hours
, mins
= m
.groups()
4011 duration
+= float(secs
)
4013 duration
+= float(mins
) * 60
4015 duration
+= float(hours
) * 60 * 60
4017 duration
+= float(days
) * 24 * 60 * 60
4019 duration
+= float(ms
)
4023 def prepend_extension(filename
, ext
, expected_real_ext
=None):
4024 name
, real_ext
= os
.path
.splitext(filename
)
4026 '{0}.{1}{2}'.format(name
, ext
, real_ext
)
4027 if not expected_real_ext
or real_ext
[1:] == expected_real_ext
4028 else '{0}.{1}'.format(filename
, ext
))
4031 def replace_extension(filename
, ext
, expected_real_ext
=None):
4032 name
, real_ext
= os
.path
.splitext(filename
)
4033 return '{0}.{1}'.format(
4034 name
if not expected_real_ext
or real_ext
[1:] == expected_real_ext
else filename
,
4038 def check_executable(exe
, args
=[]):
4039 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
4040 args can be a list of arguments for a short output (like -version) """
4042 Popen([exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
).communicate_or_kill()
4048 def _get_exe_version_output(exe
, args
):
4050 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
4051 # SIGTTOU if yt-dlp is run in the background.
4052 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
4054 [encodeArgument(exe
)] + args
, stdin
=subprocess
.PIPE
,
4055 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
).communicate_or_kill()
4058 if isinstance(out
, bytes): # Python 2.x
4059 out
= out
.decode('ascii', 'ignore')
4063 def detect_exe_version(output
, version_re
=None, unrecognized
='present'):
4064 assert isinstance(output
, compat_str
)
4065 if version_re
is None:
4066 version_re
= r
'version\s+([-0-9._a-zA-Z]+)'
4067 m
= re
.search(version_re
, output
)
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # NOTE: the default `args` list is shared between calls; it is only read
    # here and passed on, never modified in this function.
    output = _get_exe_version_output(exe, args)
    if not output:
        # No usable output from the helper
        return False
    return detect_exe_version(output, version_re, unrecognized)
4082 class LazyList(collections
.abc
.Sequence
):
4083 ''' Lazy immutable list from an iterable
4084 Note that slices of a LazyList are lists and not LazyList'''
4086 class IndexError(IndexError):
4089 def __init__(self
, iterable
):
4090 self
.__iterable
= iter(iterable
)
4092 self
.__reversed
= False
4096 # We need to consume the entire iterable to iterate in reverse
4097 yield from self
.exhaust()
4099 yield from self
.__cache
4100 for item
in self
.__iterable
:
4101 self
.__cache
.append(item
)
4104 def __exhaust(self
):
4105 self
.__cache
.extend(self
.__iterable
)
4106 # Discard the emptied iterable to make it pickle-able
4107 self
.__iterable
= []
4111 ''' Evaluate the entire iterable '''
4112 return self
.__exhaust
()[::-1 if self
.__reversed
else 1]
4115 def __reverse_index(x
):
4116 return None if x
is None else -(x
+ 1)
4118 def __getitem__(self
, idx
):
4119 if isinstance(idx
, slice):
4121 idx
= slice(self
.__reverse
_index
(idx
.start
), self
.__reverse
_index
(idx
.stop
), -(idx
.step
or 1))
4122 start
, stop
, step
= idx
.start
, idx
.stop
, idx
.step
or 1
4123 elif isinstance(idx
, int):
4125 idx
= self
.__reverse
_index
(idx
)
4126 start
, stop
, step
= idx
, idx
, 0
4128 raise TypeError('indices must be integers or slices')
4129 if ((start
or 0) < 0 or (stop
or 0) < 0
4130 or (start
is None and step
< 0)
4131 or (stop
is None and step
> 0)):
4132 # We need to consume the entire iterable to be able to slice from the end
4133 # Obviously, never use this with infinite iterables
4136 return self
.__cache
[idx
]
4137 except IndexError as e
:
4138 raise self
.IndexError(e
) from e
4139 n
= max(start
or 0, stop
or 0) - len(self
.__cache
) + 1
4141 self
.__cache
.extend(itertools
.islice(self
.__iterable
, n
))
4143 return self
.__cache
[idx
]
4144 except IndexError as e
:
4145 raise self
.IndexError(e
) from e
4149 self
[-1] if self
.__reversed
else self
[0]
4150 except self
.IndexError:
4156 return len(self
.__cache
)
4159 self
.__reversed
= not self
.__reversed
4163 # repr and str should mimic a list. So we exhaust the iterable
4164 return repr(self
.exhaust())
4167 return repr(self
.exhaust())
4172 # This is only useful for tests
4173 return len(self
.getslice())
4175 def __init__(self
, pagefunc
, pagesize
, use_cache
=True):
4176 self
._pagefunc
= pagefunc
4177 self
._pagesize
= pagesize
4178 self
._use
_cache
= use_cache
4181 def getpage(self
, pagenum
):
4182 page_results
= self
._cache
.get(pagenum
)
4183 if page_results
is None:
4184 page_results
= list(self
._pagefunc
(pagenum
))
4186 self
._cache
[pagenum
] = page_results
4189 def getslice(self
, start
=0, end
=None):
4190 return list(self
._getslice
(start
, end
))
4192 def _getslice(self
, start
, end
):
4193 raise NotImplementedError('This method must be implemented by subclasses')
4195 def __getitem__(self
, idx
):
4196 # NOTE: cache must be enabled if this is used
4197 if not isinstance(idx
, int) or idx
< 0:
4198 raise TypeError('indices must be non-negative integers')
4199 entries
= self
.getslice(idx
, idx
+ 1)
4205 class OnDemandPagedList(PagedList
):
4206 def _getslice(self
, start
, end
):
4207 for pagenum
in itertools
.count(start
// self
._pagesize
):
4208 firstid
= pagenum
* self
._pagesize
4209 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
4210 if start
>= nextfirstid
:
4214 start
% self
._pagesize
4215 if firstid
<= start
< nextfirstid
4218 ((end
- 1) % self
._pagesize
) + 1
4219 if (end
is not None and firstid
<= end
<= nextfirstid
)
4222 page_results
= self
.getpage(pagenum
)
4223 if startv
!= 0 or endv
is not None:
4224 page_results
= page_results
[startv
:endv
]
4225 yield from page_results
4227 # A little optimization - if current page is not "full", ie. does
4228 # not contain page_size videos then we can assume that this page
4229 # is the last one - there are no more ids on further pages -
4230 # i.e. no need to query again.
4231 if len(page_results
) + startv
< self
._pagesize
:
4234 # If we got the whole page, but the next page is not interesting,
4235 # break out early as well
4236 if end
== nextfirstid
:
4240 class InAdvancePagedList(PagedList
):
4241 def __init__(self
, pagefunc
, pagecount
, pagesize
):
4242 self
._pagecount
= pagecount
4243 PagedList
.__init
__(self
, pagefunc
, pagesize
, True)
4245 def _getslice(self
, start
, end
):
4246 start_page
= start
// self
._pagesize
4248 self
._pagecount
if end
is None else (end
// self
._pagesize
+ 1))
4249 skip_elems
= start
- start_page
* self
._pagesize
4250 only_more
= None if end
is None else end
- start
4251 for pagenum
in range(start_page
, end_page
):
4252 page_results
= self
.getpage(pagenum
)
4254 page_results
= page_results
[skip_elems
:]
4256 if only_more
is not None:
4257 if len(page_results
) < only_more
:
4258 only_more
-= len(page_results
)
4260 yield from page_results
[:only_more
]
4262 yield from page_results
4265 def uppercase_escape(s
):
4266 unicode_escape
= codecs
.getdecoder('unicode_escape')
4268 r
'\\U[0-9a-fA-F]{8}',
4269 lambda m
: unicode_escape(m
.group(0))[0],
4273 def lowercase_escape(s
):
4274 unicode_escape
= codecs
.getdecoder('unicode_escape')
4276 r
'\\u[0-9a-fA-F]{4}',
4277 lambda m
: unicode_escape(m
.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, compat_str):
        # Only on Python 2: text is converted to UTF-8 bytes before quoting
        s = s.encode('utf-8')
    # Characters in the safe set are left as they are
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
4288 def escape_url(url
):
4289 """Escape URL as suggested by RFC 3986"""
4290 url_parsed
= compat_urllib_parse_urlparse(url
)
4291 return url_parsed
._replace
(
4292 netloc
=url_parsed
.netloc
.encode('idna').decode('ascii'),
4293 path
=escape_rfc3986(url_parsed
.path
),
4294 params
=escape_rfc3986(url_parsed
.params
),
4295 query
=escape_rfc3986(url_parsed
.query
),
4296 fragment
=escape_rfc3986(url_parsed
.fragment
)
4301 return compat_parse_qs(compat_urllib_parse_urlparse(url
).query
)
4304 def read_batch_urls(batch_fd
):
4306 if not isinstance(url
, compat_str
):
4307 url
= url
.decode('utf-8', 'replace')
4308 BOM_UTF8
= ('\xef\xbb\xbf', '\ufeff')
4309 for bom
in BOM_UTF8
:
4310 if url
.startswith(bom
):
4311 url
= url
[len(bom
):]
4313 if not url
or url
.startswith(('#', ';', ']')):
4315 # "#" cannot be stripped out since it is part of the URI
4316 # However, it can be safely stripped out if following a whitespace
4317 return re
.split(r
'\s#', url
, 1)[0].rstrip()
4319 with contextlib
.closing(batch_fd
) as fd
:
4320 return [url
for url
in map(fixup
, fd
) if url
]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return it as ASCII bytes.

    All arguments are forwarded unchanged to the urlencode function.
    """
    encoded_text = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded_text.encode('ascii')
4327 def update_url_query(url
, query
):
4330 parsed_url
= compat_urlparse
.urlparse(url
)
4331 qs
= compat_parse_qs(parsed_url
.query
)
4333 return compat_urlparse
.urlunparse(parsed_url
._replace
(
4334 query
=compat_urllib_parse_urlencode(qs
, True)))
4337 def update_Request(req
, url
=None, data
=None, headers
={}, query={}
):
4338 req_headers
= req
.headers
.copy()
4339 req_headers
.update(headers
)
4340 req_data
= data
or req
.data
4341 req_url
= update_url_query(url
or req
.get_full_url(), query
)
4342 req_get_method
= req
.get_method()
4343 if req_get_method
== 'HEAD':
4344 req_type
= HEADRequest
4345 elif req_get_method
== 'PUT':
4346 req_type
= PUTRequest
4348 req_type
= compat_urllib_request
.Request
4350 req_url
, data
=req_data
, headers
=req_headers
,
4351 origin_req_host
=req
.origin_req_host
, unverifiable
=req
.unverifiable
)
4352 if hasattr(req
, 'timeout'):
4353 new_req
.timeout
= req
.timeout
4357 def _multipart_encode_impl(data
, boundary
):
4358 content_type
= 'multipart/form-data; boundary=%s' % boundary
4361 for k
, v
in data
.items():
4362 out
+= b
'--' + boundary
.encode('ascii') + b
'\r\n'
4363 if isinstance(k
, compat_str
):
4364 k
= k
.encode('utf-8')
4365 if isinstance(v
, compat_str
):
4366 v
= v
.encode('utf-8')
4367 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
4368 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
4369 content
= b
'Content-Disposition: form-data; name="' + k
+ b
'"\r\n\r\n' + v
+ b
'\r\n'
4370 if boundary
.encode('ascii') in content
:
4371 raise ValueError('Boundary overlaps with data')
4374 out
+= b
'--' + boundary
.encode('ascii') + b
'--\r\n'
4376 return out
, content_type
4379 def multipart_encode(data
, boundary
=None):
4381 Encode a dict to RFC 7578-compliant form-data
4384 A dict where keys and values can be either Unicode or bytes-like
4387 If specified a Unicode object, it's used as the boundary. Otherwise
4388 a random boundary is generated.
4390 Reference: https://tools.ietf.org/html/rfc7578
4392 has_specified_boundary
= boundary
is not None
4395 if boundary
is None:
4396 boundary
= '---------------' + str(random
.randrange(0x0fffffff, 0xffffffff))
4399 out
, content_type
= _multipart_encode_impl(data
, boundary
)
4402 if has_specified_boundary
:
4406 return out
, content_type
4409 def dict_get(d
, key_or_keys
, default
=None, skip_false_values
=True):
4410 if isinstance(key_or_keys
, (list, tuple)):
4411 for key
in key_or_keys
:
4412 if key
not in d
or d
[key
] is None or skip_false_values
and not d
[key
]:
4416 return d
.get(key_or_keys
, default
)
4419 def try_get(src
, getter
, expected_type
=None):
4420 for get
in variadic(getter
):
4423 except (AttributeError, KeyError, TypeError, IndexError):
4426 if expected_type
is None or isinstance(v
, expected_type
):
4430 def merge_dicts(*dicts
):
4432 for a_dict
in dicts
:
4433 for k
, v
in a_dict
.items():
4437 or (isinstance(v
, compat_str
) and v
4438 and isinstance(merged
[k
], compat_str
)
4439 and not merged
[k
])):
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as a text string.

    Text strings pass through untouched; anything else is converted with
    `compat_str(string, encoding, errors)`.  Note that the default
    `encoding` is evaluated once, when this function is defined.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
4457 TV_PARENTAL_GUIDELINES
= {
4467 def parse_age_limit(s
):
4469 return s
if 0 <= s
<= 21 else None
4470 if not isinstance(s
, compat_basestring
):
4472 m
= re
.match(r
'^(?P<age>\d{1,2})\+?$', s
)
4474 return int(m
.group('age'))
4477 return US_RATINGS
[s
]
4478 m
= re
.match(r
'^TV[_-]?(%s)$' % '|'.join(k
[3:] for k
in TV_PARENTAL_GUIDELINES
), s
)
4480 return TV_PARENTAL_GUIDELINES
['TV-' + m
.group(1)]
4484 def strip_jsonp(code
):
4487 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
4488 (?:\s*&&\s*(?P=func_name))?
4489 \s*\(\s*(?P<callback_data>.*)\);?
4490 \s*?(?://[^\n]*)*$''',
4491 r
'\g<callback_data>', code
)
4494 def js_to_json(code
, vars={}):
4495 # vars is a dict of var, val pairs to substitute
4496 COMMENT_RE
= r
'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
4497 SKIP_RE
= r
'\s*(?:{comment})?\s*'.format(comment
=COMMENT_RE
)
4499 (r
'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip
=SKIP_RE
), 16),
4500 (r
'(?s)^(0+[0-7]+){skip}:?$'.format(skip
=SKIP_RE
), 8),
4505 if v
in ('true', 'false', 'null'):
4507 elif v
in ('undefined', 'void 0'):
4509 elif v
.startswith('/*') or v
.startswith('//') or v
.startswith('!') or v
== ',':
4512 if v
[0] in ("'", '"'):
4513 v
= re
.sub(r
'(?s)\\.|"', lambda m
: {
4518 }.get(m
.group(0), m
.group(0)), v
[1:-1])
4520 for regex
, base
in INTEGER_TABLE
:
4521 im
= re
.match(regex
, v
)
4523 i
= int(im
.group(1), base
)
4524 return '"%d":' % i
if v
.endswith(':') else '%d' % i
4531 return re
.sub(r
'''(?sx)
4532 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
4533 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4534 {comment}|,(?={skip}[\]}}])|
4535 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
4536 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
4539 '''.format(comment
=COMMENT_RE
, skip
=SKIP_RE
), fix_kv
, code
)
4542 def qualities(quality_ids
):
4543 """ Get a numeric quality value out of a list of possible values """
4546 return quality_ids
.index(qid
)
4553 'default': '%(title)s [%(id)s].%(ext)s',
4554 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
4560 'description': 'description',
4561 'annotation': 'annotations.xml',
4562 'infojson': 'info.json',
4564 'pl_thumbnail': None,
4565 'pl_description': 'description',
4566 'pl_infojson': 'info.json',
4569 # As of [1] format syntax is:
4570 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
4571 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
4572 STR_FORMAT_RE_TMPL
= r
'''(?x)
4573 (?<!%)(?P<prefix>(?:%%)*)
4575 (?P<has_key>\((?P<key>{0})\))?
4577 (?P<conversion>[#0\-+ ]+)?
4579 (?P<precision>\.\d+)?
4580 (?P<len_mod>[hlL])? # unused in python
4581 {1} # conversion type
4586 STR_FORMAT_TYPES
= 'diouxXeEfFgGcrs'
4589 def limit_length(s
, length
):
4590 """ Add ellipses to overly long strings """
4595 return s
[:length
- len(ELLIPSES
)] + ELLIPSES
def version_tuple(v):
    """Split a version string on '-' and '.' and return its parts as a tuple of ints.

    Raises ValueError if any part is not an integer literal.
    """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
4603 def is_outdated_version(version
, limit
, assume_new
=True):
4605 return not assume_new
4607 return version_tuple(version
) < version_tuple(limit
)
4609 return not assume_new
4612 def ytdl_is_updateable():
4613 """ Returns if yt-dlp can be updated with -U """
4615 from .update
import is_non_updateable
4617 return not is_non_updateable()
def args_to_str(args):
    """Get a short string representation for a subprocess command.

    Each argument is shell-quoted and the results are joined with single
    spaces.
    """
    return ' '.join(map(compat_shlex_quote, args))
4625 def error_to_compat_str(err
):
4627 # On python 2 error byte string must be decoded with proper
4628 # encoding rather than ascii
4629 if sys
.version_info
[0] < 3:
4630 err_str
= err_str
.decode(preferredencoding())
4634 def mimetype2ext(mt
):
4638 mt
, _
, params
= mt
.partition(';')
4643 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
4644 # it's the most popular one
4645 'audio/mpeg': 'mp3',
4646 'audio/x-wav': 'wav',
4648 'audio/wave': 'wav',
4651 ext
= FULL_MAP
.get(mt
)
4657 'smptett+xml': 'tt',
4661 'x-mp4-fragmented': 'mp4',
4662 'x-ms-sami': 'sami',
4665 'x-mpegurl': 'm3u8',
4666 'vnd.apple.mpegurl': 'm3u8',
4670 'vnd.ms-sstr+xml': 'ism',
4674 'filmstrip+json': 'fs',
4678 _
, _
, subtype
= mt
.rpartition('/')
4679 ext
= SUBTYPE_MAP
.get(subtype
.lower())
4690 _
, _
, suffix
= subtype
.partition('+')
4691 ext
= SUFFIX_MAP
.get(suffix
)
4695 return subtype
.replace('+', '.')
4698 def parse_codecs(codecs_str
):
4699 # http://tools.ietf.org/html/rfc6381
4702 split_codecs
= list(filter(None, map(
4703 str.strip
, codecs_str
.strip().strip(',').split(','))))
4704 vcodec
, acodec
, hdr
= None, None, None
4705 for full_codec
in split_codecs
:
4706 parts
= full_codec
.split('.')
4707 codec
= parts
[0].replace('0', '')
4708 if codec
in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
4709 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
4711 vcodec
= '.'.join(parts
[:4]) if codec
in ('vp9', 'av1') else full_codec
4712 if codec
in ('dvh1', 'dvhe'):
4714 elif codec
== 'av1' and len(parts
) > 3 and parts
[3] == '10':
4716 elif full_codec
.replace('0', '').startswith('vp9.2'):
4718 elif codec
in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
4722 write_string('WARNING: Unknown codec %s\n' % full_codec
, sys
.stderr
)
4723 if not vcodec
and not acodec
:
4724 if len(split_codecs
) == 2:
4726 'vcodec': split_codecs
[0],
4727 'acodec': split_codecs
[1],
4731 'vcodec': vcodec
or 'none',
4732 'acodec': acodec
or 'none',
4733 'dynamic_range': hdr
,
4738 def urlhandle_detect_ext(url_handle
):
4739 getheader
= url_handle
.headers
.get
4741 cd
= getheader('Content-Disposition')
4743 m
= re
.match(r
'attachment;\s*filename="(?P<filename>[^"]+)"', cd
)
4745 e
= determine_ext(m
.group('filename'), default_ext
=None)
4749 return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build a base64 `data:` URI for `data` (bytes) labelled with `mime_type`."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
4756 def age_restricted(content_limit
, age_limit
):
4757 """ Returns True iff the content should be blocked """
4759 if age_limit
is None: # No limit set
4761 if content_limit
is None:
4762 return False # Content available for everyone
4763 return age_limit
< content_limit
4766 def is_html(first_bytes
):
4767 """ Detect whether a file contains HTML by examining its first bytes. """
4770 (b
'\xef\xbb\xbf', 'utf-8'),
4771 (b
'\x00\x00\xfe\xff', 'utf-32-be'),
4772 (b
'\xff\xfe\x00\x00', 'utf-32-le'),
4773 (b
'\xff\xfe', 'utf-16-le'),
4774 (b
'\xfe\xff', 'utf-16-be'),
4776 for bom
, enc
in BOMS
:
4777 if first_bytes
.startswith(bom
):
4778 s
= first_bytes
[len(bom
):].decode(enc
, 'replace')
4781 s
= first_bytes
.decode('utf-8', 'replace')
4783 return re
.match(r
'^\s*<', s
)
4786 def determine_protocol(info_dict
):
4787 protocol
= info_dict
.get('protocol')
4788 if protocol
is not None:
4791 url
= sanitize_url(info_dict
['url'])
4792 if url
.startswith('rtmp'):
4794 elif url
.startswith('mms'):
4796 elif url
.startswith('rtsp'):
4799 ext
= determine_ext(url
)
4805 return compat_urllib_parse_urlparse(url
).scheme
4808 def render_table(header_row
, data
, delim
=False, extra_gap
=0, hide_empty
=False):
4809 """ Render a list of rows, each as a list of values.
4810 Text after a \t will be right aligned """
4812 return len(remove_terminal_sequences(string
).replace('\t', ''))
4814 def get_max_lens(table
):
4815 return [max(width(str(v
)) for v
in col
) for col
in zip(*table
)]
4817 def filter_using_list(row
, filterArray
):
4818 return [col
for (take
, col
) in zip(filterArray
, row
) if take
]
4821 max_lens
= get_max_lens(data
)
4822 header_row
= filter_using_list(header_row
, max_lens
)
4823 data
= [filter_using_list(row
, max_lens
) for row
in data
]
4825 table
= [header_row
] + data
4826 max_lens
= get_max_lens(table
)
4829 table
= [header_row
, [delim
* (ml
+ extra_gap
) for ml
in max_lens
]] + data
4830 table
[1][-1] = table
[1][-1][:-extra_gap
] # Remove extra_gap from end of delimiter
4832 for pos
, text
in enumerate(map(str, row
)):
4834 row
[pos
] = text
.replace('\t', ' ' * (max_lens
[pos
] - width(text
))) + ' ' * extra_gap
4836 row
[pos
] = text
+ ' ' * (max_lens
[pos
] - width(text
) + extra_gap
)
4837 ret
= '\n'.join(''.join(row
).rstrip() for row
in table
)
4841 def _match_one(filter_part
, dct
, incomplete
):
4842 # TODO: Generalize code with YoutubeDL._build_format_filter
4843 STRING_OPERATORS
= {
4844 '*=': operator
.contains
,
4845 '^=': lambda attr
, value
: attr
.startswith(value
),
4846 '$=': lambda attr
, value
: attr
.endswith(value
),
4847 '~=': lambda attr
, value
: re
.search(value
, attr
),
4849 COMPARISON_OPERATORS
= {
4851 '<=': operator
.le
, # "<=" must be defined above "<"
4858 operator_rex
= re
.compile(r
'''(?x)\s*
4860 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
4862 (?P<quote>["\'])(?P
<quotedstrval
>.+?
)(?P
=quote
)|
4866 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
4867 m = operator_rex.search(filter_part)
4870 unnegated_op = COMPARISON_OPERATORS[m['op']]
4872 op = lambda attr, value: not unnegated_op(attr, value)
4875 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
4877 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
4878 actual_value = dct.get(m['key'])
4879 numeric_comparison = None
4880 if isinstance(actual_value, compat_numeric_types):
4881 # If the original field is a string and matching comparisonvalue is
4882 # a number we should respect the origin of the original field
4883 # and process comparison value as a string (see
4884 # https://github.com/ytdl-org/youtube-dl/issues/11082)
4886 numeric_comparison = int(comparison_value)
4888 numeric_comparison = parse_filesize(comparison_value)
4889 if numeric_comparison is None:
4890 numeric_comparison = parse_filesize(f'{comparison_value}B')
4891 if numeric_comparison is None:
4892 numeric_comparison = parse_duration(comparison_value)
4893 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
4894 raise ValueError('Operator %s only supports string values!' % m['op'])
4895 if actual_value is None:
4896 return incomplete or m['none_inclusive']
4897 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
4900 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
4901 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
4903 operator_rex = re.compile(r'''(?x
)\s
*
4904 (?P
<op
>%s)\s
*(?P
<key
>[a
-z_
]+)
4906 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
4907 m = operator_rex.search(filter_part)
4909 op = UNARY_OPERATORS[m.group('op')]
4910 actual_value = dct.get(m.group('key'))
4911 if incomplete and actual_value is None:
4913 return op(actual_value)
4915 raise ValueError('Invalid filter part %r' % filter_part)
4918 def match_str(filter_str, dct, incomplete=False):
4919 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
4920 When incomplete, all conditions passes on missing fields
4923 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
4924 for filter_part in re.split(r'(?<!\\)&', filter_str))
4927 def match_filter_func(filter_str):
4928 def _match_func(info_dict, *args, **kwargs):
4929 if match_str(filter_str, info_dict, *args, **kwargs):
4932 video_title = info_dict.get('title', info_dict.get('id', 'video'))
4933 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
4937 def parse_dfxp_time_expr(time_expr):
4941 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
4943 return float(mobj.group('time_offset'))
4945 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
4947 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a time given in seconds as an SRT timecode.

    The value is converted to milliseconds and split by
    `timetuple_from_msec`; the four resulting fields are rendered as
    `NN:NN:NN,NNN` (presumably hours, minutes, seconds, milliseconds --
    confirm against `timetuple_from_msec`).
    """
    time_fields = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % time_fields
def ass_subtitles_timecode(seconds):
    """Format a time given in seconds as an ASS timecode.

    All fields but the last returned by `timetuple_from_msec` are kept as
    is; the `milliseconds` field is divided by 10 so that the final
    component is expressed in hundredths of a second.
    """
    parts = timetuple_from_msec(seconds * 1000)
    leading_fields = parts[:-1]
    centiseconds = parts.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*leading_fields, centiseconds)
4959 def dfxp2srt(dfxp_data):
4961 @param dfxp_data A
bytes-like
object containing DFXP data
4962 @returns A
unicode object containing converted SRT data
4964 LEGACY_NAMESPACES = (
4965 (b'http://www.w3.org/ns/ttml', [
4966 b'http://www.w3.org/2004/11/ttaf1',
4967 b'http://www.w3.org/2006/04/ttaf1',
4968 b'http://www.w3.org/2006/10/ttaf1',
4970 (b'http://www.w3.org/ns/ttml#styling', [
4971 b'http://www.w3.org/ns/ttml#style',
4975 SUPPORTED_STYLING = [
4984 _x = functools.partial(xpath_with_ns, ns_map={
4985 'xml': 'http://www.w3.org/XML/1998/namespace',
4986 'ttml': 'http://www.w3.org/ns/ttml',
4987 'tts': 'http://www.w3.org/ns/ttml#styling',
4993 class TTMLPElementParser(object):
4995 _unclosed_elements = []
4996 _applied_styles = []
4998 def start(self, tag, attrib):
4999 if tag in (_x('ttml:br'), 'br'):
5002 unclosed_elements = []
5004 element_style_id = attrib.get('style')
5006 style.update(default_style)
5007 if element_style_id:
5008 style.update(styles.get(element_style_id, {}))
5009 for prop in SUPPORTED_STYLING:
5010 prop_val = attrib.get(_x('tts:' + prop))
5012 style[prop] = prop_val
5015 for k, v in sorted(style.items()):
5016 if self._applied_styles and self._applied_styles[-1].get(k) == v:
5019 font += ' color="%s"' % v
5020 elif k == 'fontSize':
5021 font += ' size="%s"' % v
5022 elif k == 'fontFamily':
5023 font += ' face="%s"' % v
5024 elif k == 'fontWeight' and v == 'bold':
5026 unclosed_elements.append('b')
5027 elif k == 'fontStyle' and v == 'italic':
5029 unclosed_elements.append('i')
5030 elif k == 'textDecoration' and v == 'underline':
5032 unclosed_elements.append('u')
5034 self._out += '<font' + font + '>'
5035 unclosed_elements.append('font')
5037 if self._applied_styles:
5038 applied_style.update(self._applied_styles[-1])
5039 applied_style.update(style)
5040 self._applied_styles.append(applied_style)
5041 self._unclosed_elements.append(unclosed_elements)
5044 if tag not in (_x('ttml:br'), 'br'):
5045 unclosed_elements = self._unclosed_elements.pop()
5046 for element in reversed(unclosed_elements):
5047 self._out += '</%s>' % element
5048 if unclosed_elements and self._applied_styles:
5049 self._applied_styles.pop()
5051 def data(self, data):
5055 return self._out.strip()
5057 def parse_node(node):
5058 target = TTMLPElementParser()
5059 parser = xml.etree.ElementTree.XMLParser(target=target)
5060 parser.feed(xml.etree.ElementTree.tostring(node))
5061 return parser.close()
5063 for k, v in LEGACY_NAMESPACES:
5065 dfxp_data = dfxp_data.replace(ns, k)
5067 dfxp = compat_etree_fromstring(dfxp_data)
5069 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
5072 raise ValueError('Invalid dfxp/TTML subtitle')
5076 for style in dfxp.findall(_x('.//ttml:style')):
5077 style_id = style.get('id') or style.get(_x('xml:id'))
5080 parent_style_id = style.get('style')
5082 if parent_style_id not in styles:
5085 styles[style_id] = styles[parent_style_id].copy()
5086 for prop in SUPPORTED_STYLING:
5087 prop_val = style.get(_x('tts:' + prop))
5089 styles.setdefault(style_id, {})[prop] = prop_val
5095 for p in ('body', 'div'):
5096 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
5099 style = styles.get(ele.get('style'))
5102 default_style.update(style)
5104 for para, index in zip(paras, itertools.count(1)):
5105 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
5106 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
5107 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
5108 if begin_time is None:
5113 end_time = begin_time + dur
5114 out.append('%d\n%s --> %s\n%s\n\n' % (
5116 srt_subtitles_timecode(begin_time),
5117 srt_subtitles_timecode(end_time),
5123 def cli_option(params, command_option, param):
5124 param = params.get(param)
5126 param = compat_str(param)
5127 return [command_option, param] if param is not None else []
5130 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
5131 param = params.get(param)
5134 assert isinstance(param, bool)
5136 return [command_option + separator + (true_value if param else false_value)]
5137 return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return `[command_option]` if `params[param]` equals `expected_value`,
    otherwise an empty list.

    A key missing from `params` is treated as None.
    """
    if params.get(param) == expected_value:
        return [command_option]
    return []
5145 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
5146 if isinstance(argdict, (list, tuple)): # for backward compatibility
5153 assert isinstance(argdict, dict)
5155 assert isinstance(keys, (list, tuple))
5156 for key_list in keys:
5157 arg_list = list(filter(
5158 lambda x: x is not None,
5159 [argdict.get(key.lower()) for key in variadic(key_list)]))
5161 return [arg for args in arg_list for arg in args]
5165 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
5166 main_key, exe = main_key.lower(), exe.lower()
5167 root_key = exe if main_key == exe else f'{main_key}+{exe}'
5168 keys = [f'{root_key}{k}' for k in (keys or [''])]
5169 if root_key in keys:
5171 keys.append((main_key, exe))
5172 keys.append('default')
5175 return cli_configuration_args(argdict, keys, default, use_compat)
5178 class ISO639Utils(object):
5179 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
5238 'iw': 'heb', # Replaced by he in 1989 revision
5248 'in': 'ind', # Replaced by id in 1989 revision
5363 'ji': 'yid', # Replaced by yi in 1989 revision
5371 def short2long(cls, code):
5372 """Convert language code from ISO 639-1 to ISO 639-2/T"""
5373 return cls._lang_map.get(code[:2])
5376 def long2short(cls, code):
5377 """Convert language code from ISO 639-2/T to ISO 639-1"""
5378 for short_name, long_name in cls._lang_map.items():
5379 if long_name == code:
5383 class ISO3166Utils(object):
5384 # From http://data.okfn.org/data/core/country-list
5386 'AF': 'Afghanistan',
5387 'AX': 'Åland Islands',
5390 'AS': 'American Samoa',
5395 'AG': 'Antigua and Barbuda',
5412 'BO': 'Bolivia, Plurinational State of',
5413 'BQ': 'Bonaire, Sint Eustatius and Saba',
5414 'BA': 'Bosnia and Herzegovina',
5416 'BV': 'Bouvet Island',
5418 'IO': 'British Indian Ocean Territory',
5419 'BN': 'Brunei Darussalam',
5421 'BF': 'Burkina Faso',
5427 'KY': 'Cayman Islands',
5428 'CF': 'Central African Republic',
5432 'CX': 'Christmas Island',
5433 'CC': 'Cocos (Keeling) Islands',
5437 'CD': 'Congo, the Democratic Republic of the',
5438 'CK': 'Cook Islands',
5440 'CI': 'Côte d\'Ivoire',
5445 'CZ': 'Czech Republic',
5449 'DO': 'Dominican Republic',
5452 'SV': 'El Salvador',
5453 'GQ': 'Equatorial Guinea',
5457 'FK': 'Falkland Islands (Malvinas)',
5458 'FO': 'Faroe Islands',
5462 'GF': 'French Guiana',
5463 'PF': 'French Polynesia',
5464 'TF': 'French Southern Territories',
5479 'GW': 'Guinea-Bissau',
5482 'HM': 'Heard Island and McDonald Islands',
5483 'VA': 'Holy See (Vatican City State)',
5490 'IR': 'Iran, Islamic Republic of',
5493 'IM': 'Isle of Man',
5503 'KP': 'Korea, Democratic People\'s Republic of',
5504 'KR': 'Korea, Republic of',
5507 'LA': 'Lao People\'s Democratic Republic',
5513 'LI': 'Liechtenstein',
5517 'MK': 'Macedonia, the Former Yugoslav Republic of',
5524 'MH': 'Marshall Islands',
5530 'FM': 'Micronesia, Federated States of',
5531 'MD': 'Moldova, Republic of',
5542 'NL': 'Netherlands',
5543 'NC': 'New Caledonia',
5544 'NZ': 'New Zealand',
5549 'NF': 'Norfolk Island',
5550 'MP': 'Northern Mariana Islands',
5555 'PS': 'Palestine, State of',
5557 'PG': 'Papua New Guinea',
5560 'PH': 'Philippines',
5564 'PR': 'Puerto Rico',
5568 'RU': 'Russian Federation',
5570 'BL': 'Saint Barthélemy',
5571 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
5572 'KN': 'Saint Kitts and Nevis',
5573 'LC': 'Saint Lucia',
5574 'MF': 'Saint Martin (French part)',
5575 'PM': 'Saint Pierre and Miquelon',
5576 'VC': 'Saint Vincent and the Grenadines',
5579 'ST': 'Sao Tome and Principe',
5580 'SA': 'Saudi Arabia',
5584 'SL': 'Sierra Leone',
5586 'SX': 'Sint Maarten (Dutch part)',
5589 'SB': 'Solomon Islands',
5591 'ZA': 'South Africa',
5592 'GS': 'South Georgia and the South Sandwich Islands',
5593 'SS': 'South Sudan',
5598 'SJ': 'Svalbard and Jan Mayen',
5601 'CH': 'Switzerland',
5602 'SY': 'Syrian Arab Republic',
5603 'TW': 'Taiwan, Province of China',
5605 'TZ': 'Tanzania, United Republic of',
5607 'TL': 'Timor-Leste',
5611 'TT': 'Trinidad and Tobago',
5614 'TM': 'Turkmenistan',
5615 'TC': 'Turks and Caicos Islands',
5619 'AE': 'United Arab Emirates',
5620 'GB': 'United Kingdom',
5621 'US': 'United States',
5622 'UM': 'United States Minor Outlying Islands',
5626 'VE': 'Venezuela, Bolivarian Republic of',
5628 'VG': 'Virgin Islands, British',
5629 'VI': 'Virgin Islands, U.S.',
5630 'WF': 'Wallis and Futuna',
5631 'EH': 'Western Sahara',
5638 def short2full(cls, code):
5639 """Convert an ISO 3166-2 country code to the corresponding full name"""
5640 return cls._country_map.get(code.upper())
5643 class GeoUtils(object):
5644 # Major IPv4 address blocks per country
5646 'AD': '46.172.224.0/19',
5647 'AE': '94.200.0.0/13',
5648 'AF': '149.54.0.0/17',
5649 'AG': '209.59.64.0/18',
5650 'AI': '204.14.248.0/21',
5651 'AL': '46.99.0.0/16',
5652 'AM': '46.70.0.0/15',
5653 'AO': '105.168.0.0/13',
5654 'AP': '182.50.184.0/21',
5655 'AQ': '23.154.160.0/24',
5656 'AR': '181.0.0.0/12',
5657 'AS': '202.70.112.0/20',
5658 'AT': '77.116.0.0/14',
5659 'AU': '1.128.0.0/11',
5660 'AW': '181.41.0.0/18',
5661 'AX': '185.217.4.0/22',
5662 'AZ': '5.197.0.0/16',
5663 'BA': '31.176.128.0/17',
5664 'BB': '65.48.128.0/17',
5665 'BD': '114.130.0.0/16',
5667 'BF': '102.178.0.0/15',
5668 'BG': '95.42.0.0/15',
5669 'BH': '37.131.0.0/17',
5670 'BI': '154.117.192.0/18',
5671 'BJ': '137.255.0.0/16',
5672 'BL': '185.212.72.0/23',
5673 'BM': '196.12.64.0/18',
5674 'BN': '156.31.0.0/16',
5675 'BO': '161.56.0.0/16',
5676 'BQ': '161.0.80.0/20',
5677 'BR': '191.128.0.0/12',
5678 'BS': '24.51.64.0/18',
5679 'BT': '119.2.96.0/19',
5680 'BW': '168.167.0.0/16',
5681 'BY': '178.120.0.0/13',
5682 'BZ': '179.42.192.0/18',
5683 'CA': '99.224.0.0/11',
5684 'CD': '41.243.0.0/16',
5685 'CF': '197.242.176.0/21',
5686 'CG': '160.113.0.0/16',
5687 'CH': '85.0.0.0/13',
5688 'CI': '102.136.0.0/14',
5689 'CK': '202.65.32.0/19',
5690 'CL': '152.172.0.0/14',
5691 'CM': '102.244.0.0/14',
5692 'CN': '36.128.0.0/10',
5693 'CO': '181.240.0.0/12',
5694 'CR': '201.192.0.0/12',
5695 'CU': '152.206.0.0/15',
5696 'CV': '165.90.96.0/19',
5697 'CW': '190.88.128.0/17',
5698 'CY': '31.153.0.0/16',
5699 'CZ': '88.100.0.0/14',
5701 'DJ': '197.241.0.0/17',
5702 'DK': '87.48.0.0/12',
5703 'DM': '192.243.48.0/20',
5704 'DO': '152.166.0.0/15',
5705 'DZ': '41.96.0.0/12',
5706 'EC': '186.68.0.0/15',
5707 'EE': '90.190.0.0/15',
5708 'EG': '156.160.0.0/11',
5709 'ER': '196.200.96.0/20',
5710 'ES': '88.0.0.0/11',
5711 'ET': '196.188.0.0/14',
5712 'EU': '2.16.0.0/13',
5713 'FI': '91.152.0.0/13',
5714 'FJ': '144.120.0.0/16',
5715 'FK': '80.73.208.0/21',
5716 'FM': '119.252.112.0/20',
5717 'FO': '88.85.32.0/19',
5719 'GA': '41.158.0.0/15',
5721 'GD': '74.122.88.0/21',
5722 'GE': '31.146.0.0/16',
5723 'GF': '161.22.64.0/18',
5724 'GG': '62.68.160.0/19',
5725 'GH': '154.160.0.0/12',
5726 'GI': '95.164.0.0/16',
5727 'GL': '88.83.0.0/19',
5728 'GM': '160.182.0.0/15',
5729 'GN': '197.149.192.0/18',
5730 'GP': '104.250.0.0/19',
5731 'GQ': '105.235.224.0/20',
5732 'GR': '94.64.0.0/13',
5733 'GT': '168.234.0.0/16',
5734 'GU': '168.123.0.0/16',
5735 'GW': '197.214.80.0/20',
5736 'GY': '181.41.64.0/18',
5737 'HK': '113.252.0.0/14',
5738 'HN': '181.210.0.0/16',
5739 'HR': '93.136.0.0/13',
5740 'HT': '148.102.128.0/17',
5741 'HU': '84.0.0.0/14',
5742 'ID': '39.192.0.0/10',
5743 'IE': '87.32.0.0/12',
5744 'IL': '79.176.0.0/13',
5745 'IM': '5.62.80.0/20',
5746 'IN': '117.192.0.0/10',
5747 'IO': '203.83.48.0/21',
5748 'IQ': '37.236.0.0/14',
5749 'IR': '2.176.0.0/12',
5750 'IS': '82.221.0.0/16',
5751 'IT': '79.0.0.0/10',
5752 'JE': '87.244.64.0/18',
5753 'JM': '72.27.0.0/17',
5754 'JO': '176.29.0.0/16',
5755 'JP': '133.0.0.0/8',
5756 'KE': '105.48.0.0/12',
5757 'KG': '158.181.128.0/17',
5758 'KH': '36.37.128.0/17',
5759 'KI': '103.25.140.0/22',
5760 'KM': '197.255.224.0/20',
5761 'KN': '198.167.192.0/19',
5762 'KP': '175.45.176.0/22',
5763 'KR': '175.192.0.0/10',
5764 'KW': '37.36.0.0/14',
5765 'KY': '64.96.0.0/15',
5766 'KZ': '2.72.0.0/13',
5767 'LA': '115.84.64.0/18',
5768 'LB': '178.135.0.0/16',
5769 'LC': '24.92.144.0/20',
5770 'LI': '82.117.0.0/19',
5771 'LK': '112.134.0.0/15',
5772 'LR': '102.183.0.0/16',
5773 'LS': '129.232.0.0/17',
5774 'LT': '78.56.0.0/13',
5775 'LU': '188.42.0.0/16',
5776 'LV': '46.109.0.0/16',
5777 'LY': '41.252.0.0/14',
5778 'MA': '105.128.0.0/11',
5779 'MC': '88.209.64.0/18',
5780 'MD': '37.246.0.0/16',
5781 'ME': '178.175.0.0/17',
5782 'MF': '74.112.232.0/21',
5783 'MG': '154.126.0.0/17',
5784 'MH': '117.103.88.0/21',
5785 'MK': '77.28.0.0/15',
5786 'ML': '154.118.128.0/18',
5787 'MM': '37.111.0.0/17',
5788 'MN': '49.0.128.0/17',
5789 'MO': '60.246.0.0/16',
5790 'MP': '202.88.64.0/20',
5791 'MQ': '109.203.224.0/19',
5792 'MR': '41.188.64.0/18',
5793 'MS': '208.90.112.0/22',
5794 'MT': '46.11.0.0/16',
5795 'MU': '105.16.0.0/12',
5796 'MV': '27.114.128.0/18',
5797 'MW': '102.70.0.0/15',
5798 'MX': '187.192.0.0/11',
5799 'MY': '175.136.0.0/13',
5800 'MZ': '197.218.0.0/15',
5801 'NA': '41.182.0.0/16',
5802 'NC': '101.101.0.0/18',
5803 'NE': '197.214.0.0/18',
5804 'NF': '203.17.240.0/22',
5805 'NG': '105.112.0.0/12',
5806 'NI': '186.76.0.0/15',
5807 'NL': '145.96.0.0/11',
5808 'NO': '84.208.0.0/13',
5809 'NP': '36.252.0.0/15',
5810 'NR': '203.98.224.0/19',
5811 'NU': '49.156.48.0/22',
5812 'NZ': '49.224.0.0/14',
5813 'OM': '5.36.0.0/15',
5814 'PA': '186.72.0.0/15',
5815 'PE': '186.160.0.0/14',
5816 'PF': '123.50.64.0/18',
5817 'PG': '124.240.192.0/19',
5818 'PH': '49.144.0.0/13',
5819 'PK': '39.32.0.0/11',
5820 'PL': '83.0.0.0/11',
5821 'PM': '70.36.0.0/20',
5822 'PR': '66.50.0.0/16',
5823 'PS': '188.161.0.0/16',
5824 'PT': '85.240.0.0/13',
5825 'PW': '202.124.224.0/20',
5826 'PY': '181.120.0.0/14',
5827 'QA': '37.210.0.0/15',
5828 'RE': '102.35.0.0/16',
5829 'RO': '79.112.0.0/13',
5830 'RS': '93.86.0.0/15',
5831 'RU': '5.136.0.0/13',
5832 'RW': '41.186.0.0/16',
5833 'SA': '188.48.0.0/13',
5834 'SB': '202.1.160.0/19',
5835 'SC': '154.192.0.0/11',
5836 'SD': '102.120.0.0/13',
5837 'SE': '78.64.0.0/12',
5838 'SG': '8.128.0.0/10',
5839 'SI': '188.196.0.0/14',
5840 'SK': '78.98.0.0/15',
5841 'SL': '102.143.0.0/17',
5842 'SM': '89.186.32.0/19',
5843 'SN': '41.82.0.0/15',
5844 'SO': '154.115.192.0/18',
5845 'SR': '186.179.128.0/17',
5846 'SS': '105.235.208.0/21',
5847 'ST': '197.159.160.0/19',
5848 'SV': '168.243.0.0/16',
5849 'SX': '190.102.0.0/20',
5851 'SZ': '41.84.224.0/19',
5852 'TC': '65.255.48.0/20',
5853 'TD': '154.68.128.0/19',
5854 'TG': '196.168.0.0/14',
5855 'TH': '171.96.0.0/13',
5856 'TJ': '85.9.128.0/18',
5857 'TK': '27.96.24.0/21',
5858 'TL': '180.189.160.0/20',
5859 'TM': '95.85.96.0/19',
5860 'TN': '197.0.0.0/11',
5861 'TO': '175.176.144.0/21',
5862 'TR': '78.160.0.0/11',
5863 'TT': '186.44.0.0/15',
5864 'TV': '202.2.96.0/19',
5865 'TW': '120.96.0.0/11',
5866 'TZ': '156.156.0.0/14',
5867 'UA': '37.52.0.0/14',
5868 'UG': '102.80.0.0/13',
5870 'UY': '167.56.0.0/13',
5871 'UZ': '84.54.64.0/18',
5872 'VA': '212.77.0.0/19',
5873 'VC': '207.191.240.0/21',
5874 'VE': '186.88.0.0/13',
5875 'VG': '66.81.192.0/20',
5876 'VI': '146.226.0.0/16',
5877 'VN': '14.160.0.0/11',
5878 'VU': '202.80.32.0/20',
5879 'WF': '117.20.32.0/21',
5880 'WS': '202.4.32.0/19',
5881 'YE': '134.35.0.0/16',
5882 'YT': '41.242.116.0/22',
5883 'ZA': '41.0.0.0/11',
5884 'ZM': '102.144.0.0/13',
5885 'ZW': '102.177.192.0/18',
5889 def random_ipv4(cls, code_or_block):
5890 if len(code_or_block) == 2:
5891 block = cls._country_ip_map.get(code_or_block.upper())
5895 block = code_or_block
5896 addr, preflen = block.split('/')
5897 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
5898 addr_max = addr_min | (0xffffffff >> int(preflen))
5899 return compat_str(socket.inet_ntoa(
5900 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets an individual request choose its own proxy.

    A request selects a proxy through the 'Ytdl-request-proxy' header; the
    header is removed again before the request is passed on.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        # `type` and `meth` are bound as lambda defaults so each handler keeps
        # its own scheme instead of the loop's final value
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        # NOTE(review): presumably the base class replaces these defaults for
        # schemes present in `proxies` - confirm against urllib's ProxyHandler
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Resolve the proxy for `req`; returns None when no HTTP proxying is done here."""
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # A per-request proxy overrides the configured one
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            # SOCKS proxies are only recorded on the request here
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5928 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
5929 # released into Public Domain
5930 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.

    Zero, and any negative number, is encoded as a single NUL byte.
    """
    n = int(n)
    if n > 0:
        # Minimal big-endian representation, without leading zero bytes
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    # add back some pad bytes
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    An empty byte string yields 0.
    """
    return int.from_bytes(s, 'big')
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt a single block with OHDave's RSA scheme (http://www.ohdave.com/rsa/).

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The input bytes are reversed before being read as one hexadecimal number
    reversed_hex = binascii.hexlify(data[::-1])
    ciphertext = pow(int(reversed_hex, 16), exponent, modulus)
    return '%x' % ciphertext
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout [0, 2, <nonzero random bytes>, 0, *data].

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data leaves no room for at least 8 padding bytes
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The padding bytes must be nonzero: the first zero byte after the
    # [0, 2] prefix marks the end of the padding when the block is decoded
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
def encode_base_n(num, n, table=None):
    """Convert a non-negative integer to its base-n string representation.

    @param num    number to encode
    @param n      the base
    @param table  digit characters to use; defaults to the first n characters
                  of 0-9, a-z, A-Z
    @raises ValueError  if n exceeds the table length, or if a nonzero num
                        cannot be encoded (negative num, or n below 2)
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Without these checks the division loop below would never reach zero
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError('base %d is too small' % n)

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    # Digits were produced least-significant first
    return ''.join(reversed(digits))
def decode_packed_codes(code):
    """Expand code matched by PACKED_CODES_RE back into its unpacked form.

    Every word of the obfuscated code is a base-n index into the '|'-separated
    symbol list; an empty symbol means the word stands for itself.
    """
    packed = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = packed.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for index in reversed(range(count)):
        word = encode_base_n(index, base)
        symbol_table[word] = symbols[index] or word

    return re.sub(
        r'\b(\w+)\b', lambda word_match: symbol_table[word_match.group(0)],
        obfuscated_code)
def caesar(s, alphabet, shift):
    """Shift every character of `s` found in `alphabet` by `shift` positions.

    The shift wraps around the alphabet; characters outside the alphabet are
    kept unchanged. A shift of 0 returns `s` itself.
    """
    if shift == 0:
        return s
    size = len(alphabet)

    def _rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(_rotate, s))
6057 return caesar(s
, r
'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
def parse_m3u8_attributes(attrib):
    """Parse an m3u8 attribute list (KEY=value,KEY="quoted value",...) into a dict.

    Surrounding double quotes are stripped from quoted values.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {
        name: (raw[1:-1] if raw.startswith('"') else raw)
        for name, raw in pairs
    }
def urshift(val, n):
    """Right-shift `val` by `n` bits, treating negative values as unsigned 32-bit."""
    if val < 0:
        val += 0x100000000
    return val >> n
6073 # Based on png2str() written by @gdkchan and improved by @yokrysty
6074 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG file data into rows of unfiltered byte values.

    Every scanline is read as width * 3 data bytes, i.e. 3 bytes per pixel
    are assumed; the header fields after width and height are not inspected.

    @param png_data  the complete PNG file content as bytes
    @returns         (width, height, pixels), where pixels is a list of rows
                     and each row is a flat list of width * 3 values
    @raises IOError  if the signature/IHDR is missing or there is no IDAT data
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    # 8-byte PNG signature; the first chunk (after its 4-byte length) must be IHDR
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    # Big-endian unsigned integer of 1, 2 or 4 bytes
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Each chunk is: 4-byte length, 4-byte type, data, 4-byte CRC
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    # The compressed image may be split across several IDAT chunks
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        # Look up an already reconstructed value by its flat index
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Every scanline is prefixed with one filter-type byte
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        # Appended before it is filled so _get_pixel can read the current row
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Neighbours outside the image count as 0
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Use whichever of left/up/upper-left is closest to p
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    """Set the extended attribute `key` of the file `path` to `value`.

    Tries, in order: the Python xattr/pyxattr module, NTFS Alternate Data
    Streams (on Windows), and the `setfattr` or `xattr` command-line tools.

    @param value  the attribute value as bytes (it is UTF-8 decoded before
                  being handed to a command-line tool)
    @raises XAttrMetadataError     if writing the attribute fails
    @raises XAttrUnavailableError  if no usable implementation is found, or
                                   the installed pyxattr is too old
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'yt-dlp requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        # No Python xattr module is installed
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                value = value.decode('utf-8')
                # setfattr is preferred when both tools are available
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)]
                       + [encodeArgument(o) for o in opts]
                       + [encodeFilename(path, True)])

                try:
                    p = Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate_or_kill()
                stderr = stderr.decode('utf-8', 'replace')
                # A non-zero exit status is reported together with the tool's stderr
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
def random_birthday(year_field, month_field, day_field):
    """Return a random date between 1950-01-01 and 1995-12-31 as form fields.

    The result maps the three given field names to the year, month and day
    of the date, each as a string without zero padding.
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    day_span = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, day_span))
    fields = (
        (year_field, birthday.year),
        (month_field, birthday.month),
        (day_field, birthday.day),
    )
    return {name: str(number) for name, number in fields}
6276 # Templates for internet shortcut files, which are plain text files.
6277 DOT_URL_LINK_TEMPLATE
= '''
6282 DOT_WEBLOC_LINK_TEMPLATE
= '''
6283 <?xml version="1.0" encoding="UTF-8"?>
6284 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
6285 <plist version="1.0">
6288 \t<string>%(url)s</string>
6293 DOT_DESKTOP_LINK_TEMPLATE
= '''
6303 'url': DOT_URL_LINK_TEMPLATE
,
6304 'desktop': DOT_DESKTOP_LINK_TEMPLATE
,
6305 'webloc': DOT_WEBLOC_LINK_TEMPLATE
,
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    Port 80 is omitted from the result for the `http` scheme only. An IRI without a hostname is converted with an empty host part.

    Raises ValueError for IPv6 hosts, which are not supported.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # `hostname` is None for IRIs without a network location (e.g. relative references)
    if iri_parts.hostname is not None:
        net_location += iri_parts.hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
        # The 'idna' encoding produces ASCII text.

    # 80 is only the default port of plain http; dropping it for any other
    # scheme would silently change which port the URI points to
    is_default_port = iri_parts.scheme == 'http' and iri_parts.port == 80
    if iri_parts.port is not None and not is_default_port:
        net_location += ':' + str(iri_parts.port)

    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
            net_location,

            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
def to_high_limit_path(path):
    """Return `path` in a form that is not subject to Windows' MAX_PATH limit.

    On other platforms the path is returned unchanged.
    """
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    # (a raw string cannot end in a backslash, hence the stripped trailing space)
    extended_prefix = r'\\?\ '.rstrip()
    return extended_prefix + os.path.abspath(path)
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    """Format a value with `template`, falling back to `default` for ignored values.

    @param obj       the value itself, or (when `field` is given) a mapping
    @param field     key to read from `obj`; None means `obj` is the value
    @param template  %-style template the value is inserted into
    @param ignore    values for which `default` is returned instead
    @param default   fallback, also used for a missing key or a None obj
    @param func      optional conversion applied to non-ignored values
    """
    if field is not None:
        val = obj.get(field, default)
    elif obj is None:
        val = default
    else:
        val = obj
    if func and val not in ignore:
        val = func(val)
    if val in ignore:
        return default
    return template % val
6371 def clean_podcast_url(url
):
6372 return re
.sub(r
'''(?x)
6376 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
6379 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
6382 cn\.co| # https://podcorn.com/analytics-prefix/
6383 st\.fm # https://podsights.com/docs/
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random version 4 UUID as a lowercase hyphenated string."""
    def _random_digit(mobj):
        # 'y' marks the variant nibble, which must be one of 8, 9, a, b
        return random.choice('89ab' if mobj.group(0) == 'y' else _HEX_TABLE)

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
def make_dir(path, to_screen=None):
    """Create the parent directory of `path` if it does not exist yet.

    @param path       path of a file whose containing directory is needed
    @param to_screen  optional callable that receives the error message
    @returns          True on success (or if nothing had to be created),
                      False if the directory could not be created
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            # exist_ok: the directory may get created by someone else
            # between the check above and this call
            os.makedirs(dn, exist_ok=True)
        return True
    except (OSError, IOError) as err:
        # Only report when a reporter was actually supplied
        if callable(to_screen):
            to_screen('unable to create directory ' + str(err))
        return False
def get_executable_path():
    """Return the absolute path of the directory the program is running from.

    This is the directory of the executable for a frozen build, two levels
    above this file when loaded through zipimport, and one level above this
    file otherwise.
    """
    from zipimport import zipimporter
    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        return os.path.abspath(os.path.dirname(sys.executable))
    module_dir = os.path.dirname(__file__)
    # Running from ZIP adds one more directory level
    in_zip = isinstance(globals().get('__loader__'), zipimporter)
    return os.path.abspath(os.path.join(module_dir, '../..' if in_zip else '..'))
def load_plugins(name, suffix, namespace):
    """Load plugin classes from ytdlp_plugins/<name>/__init__.py next to the program.

    Every attribute of the plugin module whose name ends with `suffix` and is
    not already present in `namespace` is added to `namespace`.

    @returns  dict of the newly added names and objects; empty when the
              plugin file does not exist
    """
    classes = {}
    try:
        plugins_spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        plugins = importlib.util.module_from_spec(plugins_spec)
        # Registered before execution, as the module is run right after
        sys.modules[plugins_spec.name] = plugins
        plugins_spec.loader.exec_module(plugins)
        for attr in dir(plugins):
            if attr in namespace or not attr.endswith(suffix):
                continue
            classes[attr] = namespace[attr] = getattr(plugins, attr)
    except FileNotFoundError:
        pass
    return classes
6439 obj
, *path_list
, default
=None, expected_type
=None, get_all
=True,
6440 casesense
=True, is_user_input
=False, traverse_string
=False):
6441 ''' Traverse nested list/dict/tuple
6442 @param path_list A list of paths which are checked one by one.
6443 Each path is a list of keys where each key is a string,
6444 a function, a tuple of strings or "...".
6445 When a fuction is given, it takes the key as argument and
6446 returns whether the key matches or not. When a tuple is given,
6447 all the keys given in the tuple are traversed, and
6448 "..." traverses all the keys in the object
6449 @param default Default value to return
6450 @param expected_type Only accept final value of this type (Can also be any callable)
6451 @param get_all Return all the values obtained from a path or only the first one
6452 @param casesense Whether to consider dictionary keys as case sensitive
6453 @param is_user_input Whether the keys are generated from user input. If True,
6454 strings are converted to int/slice if necessary
6455 @param traverse_string Whether to traverse inside strings. If True, any
6456 non-compatible object will also be converted into a string
6460 _lower
= lambda k
: (k
.lower() if isinstance(k
, str) else k
)
6461 path_list
= (map(_lower
, variadic(path
)) for path
in path_list
)
6463 def _traverse_obj(obj
, path
, _current_depth
=0):
6465 path
= tuple(variadic(path
))
6466 for i
, key
in enumerate(path
):
6469 if isinstance(key
, (list, tuple)):
6470 obj
= [_traverse_obj(obj
, sub_key
, _current_depth
) for sub_key
in key
]
6473 obj
= (obj
.values() if isinstance(obj
, dict)
6474 else obj
if isinstance(obj
, (list, tuple, LazyList
))
6475 else str(obj
) if traverse_string
else [])
6477 depth
= max(depth
, _current_depth
)
6478 return [_traverse_obj(inner_obj
, path
[i
+ 1:], _current_depth
) for inner_obj
in obj
]
6480 if isinstance(obj
, (list, tuple, LazyList
)):
6481 obj
= enumerate(obj
)
6482 elif isinstance(obj
, dict):
6485 if not traverse_string
:
6489 depth
= max(depth
, _current_depth
)
6490 return [_traverse_obj(v
, path
[i
+ 1:], _current_depth
) for k
, v
in obj
if key(k
)]
6491 elif isinstance(obj
, dict) and not (is_user_input
and key
== ':'):
6492 obj
= (obj
.get(key
) if casesense
or (key
in obj
)
6493 else next((v
for k
, v
in obj
.items() if _lower(k
) == key
), None))
6496 key
= (int_or_none(key
) if ':' not in key
6497 else slice(*map(int_or_none
, key
.split(':'))))
6498 if key
== slice(None):
6499 return _traverse_obj(obj
, (..., *path
[i
+ 1:]), _current_depth
)
6500 if not isinstance(key
, (int, slice)):
6502 if not isinstance(obj
, (list, tuple, LazyList
)):
6503 if not traverse_string
:
6512 if isinstance(expected_type
, type):
6513 type_test
= lambda val
: val
if isinstance(val
, expected_type
) else None
6514 elif expected_type
is not None:
6515 type_test
= expected_type
6517 type_test
= lambda val
: val
6519 for path
in path_list
:
6521 val
= _traverse_obj(obj
, path
)
6524 for _
in range(depth
- 1):
6525 val
= itertools
.chain
.from_iterable(v
for v
in val
if v
is not None)
6526 val
= [v
for v
in map(type_test
, val
) if v
is not None]
6528 return val
if get_all
else val
[0]
6530 val
= type_test(val
)
def traverse_dict(dictn, keys, casesense=True):
    ''' For backward compatibility. Do not use '''
    # Same as traverse_obj with user-input key conversion and string traversal enabled
    legacy_options = {
        'casesense': casesense,
        'is_user_input': True,
        'traverse_string': True,
    }
    return traverse_obj(dictn, keys, **legacy_options)
def variadic(x, allowed_types=(str, bytes)):
    """Return `x` unchanged if it is an iterable, otherwise wrap it in a 1-tuple.

    Instances of `allowed_types` are treated as single values even though
    they may be iterable.
    """
    if isinstance(x, allowed_types) or not isinstance(x, collections.abc.Iterable):
        return (x,)
    return x
6546 # create a JSON Web Signature (jws) with HS256 algorithm
6547 # the resulting format is in JWS Compact Serialization
6548 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
6549 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """Create an HS256-signed token of the form header.payload.signature.

    @param payload_data  JSON-serializable payload
    @param key           signing key as str (encoded as UTF-8)
    @param headers       extra header fields, merged over alg/typ
    @returns             the token as bytes
    """
    header_data = {'alg': 'HS256', 'typ': 'JWT'}
    if headers:
        header_data.update(headers)

    def _b64_json(data):
        # NOTE(review): this is standard, padded base64, whereas RFC 7515
        # specifies unpadded base64url - confirm what the consumers expect
        return base64.b64encode(json.dumps(data).encode('utf-8'))

    signing_input = b'.'.join((_b64_json(header_data), _b64_json(payload_data)))
    mac = hmac.new(key.encode('utf-8'), signing_input, hashlib.sha256)
    return signing_input + b'.' + base64.b64encode(mac.digest())
6565 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded JSON payload of a JWT. The signature is NOT verified.

    @param jwt  the token as str, in the form header.payload.signature
    @raises ValueError  if the token does not consist of exactly three parts
    """
    header_b64, payload_b64, signature_b64 = jwt.split('.')
    # JWTs normally strip the base64 padding; add trailing ='s back
    # (superfluous ='s are ignored by the decoder)
    payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
    return payload_data
def supports_terminal_sequences(stream):
    """Tell whether terminal escape sequences may be written to `stream`.

    Requires Windows 10 build 10586 or later on Windows and a non-empty TERM
    environment variable elsewhere; in addition the stream must be a tty.
    """
    if compat_os_name != 'nt':
        if not os.getenv('TERM'):
            return False
    elif get_windows_version() < (10, 0, 10586):
        return False
    try:
        return stream.isatty()
    except BaseException:
        # Any failure to query the stream counts as "not supported"
        return False
# Matches an escape sequence: ESC [ ... up to and including the next 'm'
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with all terminal escape sequences removed."""
    return re.sub(_terminal_sequences_re, '', string)
def number_of_digits(number):
    """Return the length of `number` formatted as a decimal integer (a minus sign counts)."""
    as_decimal = '%d' % number
    return len(as_decimal)
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the string forms of all truthy values with `delim`.

    When `from_dict` is given, `values` are keys that are looked up in it
    first; missing keys are skipped like any other falsy value.
    """
    if from_dict is not None:
        values = (from_dict.get(key) for key in values)
    return delim.join(str(value) for value in values if value)