4 from __future__
import unicode_literals
39 import xml
.etree
.ElementTree
43 compat_HTMLParseError
,
49 compat_ctypes_WINFUNCTYPE
,
50 compat_etree_fromstring
,
53 compat_html_entities_html5
,
66 compat_urllib_parse_urlencode
,
67 compat_urllib_parse_urlparse
,
68 compat_urllib_parse_urlunparse
,
69 compat_urllib_parse_quote
,
70 compat_urllib_parse_quote_plus
,
71 compat_urllib_parse_unquote_plus
,
72 compat_urllib_request
,
def register_socks_protocols():
    """Add the SOCKS schemes to urlparse's netloc-aware scheme list.

    Workaround for https://bugs.python.org/issue7904 (Python < 2.6.5):
    urlsplit() mishandles URLs whose scheme is missing from
    urlparse.uses_netloc, so "register" the SOCKS proxy schemes here.
    """
    known_schemes = compat_urlparse.uses_netloc
    for socks_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if socks_scheme not in known_schemes:
            known_schemes.append(socks_scheme)
# Type object of a compiled regular expression, for isinstance() checks
# against values that may be either a pattern string or a compiled pattern.
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
96 def random_user_agent():
97 _USER_AGENT_TPL
= 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
1676 return _USER_AGENT_TPL
% random
.choice(_CHROME_VERSIONS
)
1680 'User-Agent': random_user_agent(),
1681 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
1682 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
1683 'Accept-Encoding': 'gzip, deflate',
1684 'Accept-Language': 'en-us,en;q=0.5',
1689 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# Unique sentinel: lets callers distinguish "no default was supplied" from an
# explicit default of None (used by the xpath_* helpers below).
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
1700 'en': ENGLISH_MONTH_NAMES
,
1702 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
1703 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
1706 KNOWN_EXTENSIONS
= (
1707 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
1708 'flv', 'f4v', 'f4a', 'f4b',
1709 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
1710 'mkv', 'mka', 'mk3d',
1713 'asf', 'wmv', 'wma',
1719 'f4f', 'f4m', 'm3u8', 'smil')
# needed for sanitizing filenames in restricted mode
# Maps each accented/special Latin character to an ASCII transliteration;
# most map one-to-one, while ligatures and special letters expand to multiple
# characters ('AE', 'OE', 'TH', 'ss', ...) via the list items in the chain.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
1748 '%Y/%m/%d %H:%M:%S',
1752 '%Y-%m-%d %H:%M:%S',
1753 '%Y-%m-%d %H:%M:%S.%f',
1754 '%Y-%m-%d %H:%M:%S:%f',
1757 '%Y-%m-%dT%H:%M:%SZ',
1758 '%Y-%m-%dT%H:%M:%S.%fZ',
1759 '%Y-%m-%dT%H:%M:%S.%f0Z',
1760 '%Y-%m-%dT%H:%M:%S',
1761 '%Y-%m-%dT%H:%M:%S.%f',
1763 '%b %d %Y at %H:%M',
1764 '%b %d %Y at %H:%M:%S',
1765 '%B %d %Y at %H:%M',
1766 '%B %d %Y at %H:%M:%S',
1770 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
1771 DATE_FORMATS_DAY_FIRST
.extend([
1777 '%d/%m/%Y %H:%M:%S',
1780 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
1781 DATE_FORMATS_MONTH_FIRST
.extend([
1786 '%m/%d/%Y %H:%M:%S',
# Matches the tail of P.A.C.K.E.R.-style packed JavaScript:
# ...}('payload', radix, count, 'word1|word2|...'.split('|')
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element (quote style around
# the type value may vary); the JSON payload is captured as group 'json_ld'.
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
1793 def preferredencoding():
1794 """Get preferred encoding.
1796 Returns the best encoding scheme for the system, based on
1797 locale.getpreferredencoding() and some further tweaks.
1800 pref = locale.getpreferredencoding()
1808 def write_json_file(obj, fn):
1809 """ Encode obj as JSON and write it to fn, atomically if possible """
1811 fn = encodeFilename(fn)
1812 if sys.version_info < (3, 0) and sys.platform != 'win32
':
1813 encoding = get_filesystem_encoding()
1814 # os.path.basename returns a bytes object, but NamedTemporaryFile
1815 # will fail if the filename contains non ascii characters unless we
1816 # use a unicode object
1817 path_basename = lambda f: os.path.basename(fn).decode(encoding)
1818 # the same for os.path.dirname
1819 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
1821 path_basename = os.path.basename
1822 path_dirname = os.path.dirname
1826 'prefix
': path_basename(fn) + '.',
1827 'dir': path_dirname(fn),
1831 # In Python 2.x, json.dump expects a bytestream.
1832 # In Python 3.x, it writes to a character stream
1833 if sys.version_info < (3, 0):
1838 'encoding
': 'utf
-8',
1841 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
1846 if sys.platform == 'win32
':
1847 # Need to remove existing file on Windows, else os.rename raises
1848 # WindowsError or FileExistsError.
1856 os.chmod(tf.name, 0o666 & ~mask)
1859 os.rename(tf.name, fn)
1868 if sys.version_info >= (2, 7):
1869 def find_xpath_attr(node, xpath, key, val=None):
1870 """ Find the xpath xpath[@key=val] """
1871 assert re.match(r'^
[a
-zA
-Z_
-]+$
', key)
1872 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
1873 return node.find(expr)
1875 def find_xpath_attr(node, xpath, key, val=None):
1876 for f in node.findall(compat_xpath(xpath)):
1877 if key not in f.attrib:
1879 if val is None or f.attrib.get(key) == val:
1883 # On python2.6 the xml.etree.ElementTree.Element methods don't support
1884 # the namespace parameter
1887 def xpath_with_ns(path
, ns_map
):
1888 components
= [c
.split(':') for c
in path
.split('/')]
1890 for c
in components
:
1892 replaced
.append(c
[0])
1895 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
1896 return '/'.join(replaced
)
1899 def xpath_element(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1900 def _find_xpath(xpath
):
1901 return node
.find(compat_xpath(xpath
))
1903 if isinstance(xpath
, (str, compat_str
)):
1904 n
= _find_xpath(xpath
)
1912 if default
is not NO_DEFAULT
:
1915 name
= xpath
if name
is None else name
1916 raise ExtractorError('Could not find XML element %s' % name
)
1922 def xpath_text(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1923 n
= xpath_element(node
, xpath
, name
, fatal
=fatal
, default
=default
)
1924 if n
is None or n
== default
:
1927 if default
is not NO_DEFAULT
:
1930 name
= xpath
if name
is None else name
1931 raise ExtractorError('Could not find XML element\'s text %s' % name
)
1937 def xpath_attr(node
, xpath
, key
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1938 n
= find_xpath_attr(node
, xpath
, key
)
1940 if default
is not NO_DEFAULT
:
1943 name
= '%s[@%s]' % (xpath
, key
) if name
is None else name
1944 raise ExtractorError('Could not find XML attribute %s' % name
)
1947 return n
.attrib
[key
]
def get_element_by_id(id, html):
    """Extract the inner content of the tag whose id attribute equals *id*.

    Thin wrapper around get_element_by_attribute() with the attribute
    fixed to 'id'.
    """
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag carrying *class_name*, or None."""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag whose *attribute* matches *value*.

    Returns None when no matching tag is found. *value* is treated as a
    literal unless escape_value is False (then it is a regex fragment).
    """
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if not matches:
        return None
    return matches[0]
def get_elements_by_class(class_name, html):
    """List the contents of every tag whose class attribute contains *class_name*."""
    # Match the class name as a whole word anywhere inside the attribute value.
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_pattern, html, escape_value=False)
1973 def get_elements_by_attribute(attribute, value, html, escape_value=True):
1974 """Return the content of the tag with the specified attribute in the passed HTML document"""
1976 value = re.escape(value) if escape_value else value
1979 for m in re.finditer(r'''(?xs)
1981 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^
']*'|
))*?
1983 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^
"]*"|
='[^']*'|))*?
1987 ''' % (re.escape(attribute), value), html):
1988 res = m.group('content
')
1990 if res.startswith('"') or res.startswith("'"):
1993 retlist.append(unescapeHTML(res))
1998 class HTMLAttributeParser(compat_HTMLParser):
1999 """Trivial HTML parser to gather the attributes for a single element"""
2003 compat_HTMLParser.__init__(self)
2005 def handle_starttag(self, tag, attrs):
2006 self.attrs = dict(attrs)
2009 def extract_attributes(html_element):
2010 """Given a string for an HTML element such as
a="foo" B="bar" c="&#98;az" d=boz
empty= noval entity="&amp;"
2016 Decode and return a dictionary of attributes.
2018 'a
': 'foo
', 'b
': 'bar
', c: 'baz
', d: 'boz
',
2019 'empty
': '', 'noval
': None, 'entity
': '&',
2020 'sq
': '"', 'dq': '\''
2022 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
2023 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
2025 parser = HTMLAttributeParser()
2027 parser.feed(html_element)
2029 # Older Python may throw HTMLParseError in case of malformed HTML
2030 except compat_HTMLParseError:
2035 def clean_html(html):
2036 """Clean an HTML snippet into a readable string"""
2038 if html is None: # Convenience for sanitizing descriptions etc.
2042 html = html.replace('\n', ' ')
2043 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
2044 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
2046 html = re.sub('<.*?>', '', html)
2047 # Replace html entities
2048 html = unescapeHTML(html)
2052 def sanitize_open(filename, open_mode):
2053 """Try to open the given filename, and slightly tweak it if this fails.
2055 Attempts to open the given filename. If this fails, it tries to change
2056 the filename slightly, step by step, until it's either able to open it
2057 or it fails and raises a final exception, like the standard open()
2060 It returns the tuple (stream, definitive_file_name).
2064 if sys.platform == 'win32':
2066 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
2067 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
2068 stream = open(encodeFilename(filename), open_mode)
2069 return (stream, filename)
2070 except (IOError, OSError) as err:
2071 if err.errno in (errno.EACCES,):
2074 # In case of error, try to remove win32 forbidden chars
2075 alt_filename = sanitize_path(filename)
2076 if alt_filename == filename:
2079 # An exception here should be caught in the caller
2080 stream = open(encodeFilename(alt_filename), open_mode)
2081 return (stream, alt_filename)
2084 def timeconvert(timestr):
2085 """Convert RFC 2822 defined time string into system timestamp"""
2087 timetuple = email.utils.parsedate_tz(timestr)
2088 if timetuple is not None:
2089 timestamp = email.utils.mktime_tz(timetuple)
2093 def sanitize_filename(s, restricted=False, is_id=False):
2094 """Sanitizes a string so it could be used as part of a filename.
2095 If restricted is set, use a stricter subset of allowed characters.
2096 Set is_id if this is not an arbitrary string, but an ID that should be kept
2099 def replace_insane(char):
2100 if restricted and char in ACCENT_CHARS:
2101 return ACCENT_CHARS[char]
2102 if char == '?' or ord(char) < 32 or ord(char) == 127:
2105 return '' if restricted else '\''
2107 return '_
-' if restricted else ' -'
2108 elif char in '\\/|
*<>':
2110 if restricted and (char in '!&\'()[]{}$
;`^
,#' or char.isspace()):
2112 if restricted
and ord(char
) > 127:
2119 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
2120 result
= ''.join(map(replace_insane
, s
))
2122 while '__' in result
:
2123 result
= result
.replace('__', '_')
2124 result
= result
.strip('_')
2125 # Common case of "Foreign band name - English song title"
2126 if restricted
and result
.startswith('-_'):
2128 if result
.startswith('-'):
2129 result
= '_' + result
[len('-'):]
2130 result
= result
.lstrip('.')
2136 def sanitize_path(s
, force
=False):
2137 """Sanitizes and normalizes path on Windows"""
2138 if sys
.platform
== 'win32':
2140 drive_or_unc
, _
= os
.path
.splitdrive(s
)
2141 if sys
.version_info
< (2, 7) and not drive_or_unc
:
2142 drive_or_unc
, _
= os
.path
.splitunc(s
)
2148 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
2152 path_part
if path_part
in ['.', '..'] else re
.sub(r
'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part
)
2153 for path_part
in norm_path
]
2155 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
2156 elif force
and s
[0] == os
.path
.sep
:
2157 sanitized_path
.insert(0, os
.path
.sep
)
2158 return os
.path
.join(*sanitized_path
)
2161 def sanitize_url(url
):
2162 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
2163 # the number of unwanted failures due to missing protocol
2164 if url
.startswith('//'):
2165 return 'http:%s' % url
2166 # Fix some common typos seen so far
2168 # https://github.com/ytdl-org/youtube-dl/issues/15649
2169 (r
'^httpss://', r
'https://'),
2170 # https://bx1.be/lives/direct-tv/
2171 (r
'^rmtp([es]?)://', r
'rtmp\1://'),
2173 for mistake
, fixup
in COMMON_TYPOS
:
2174 if re
.match(mistake
, url
):
2175 return re
.sub(mistake
, fixup
, url
)
2179 def extract_basic_auth(url
):
2180 parts
= compat_urlparse
.urlsplit(url
)
2181 if parts
.username
is None:
2183 url
= compat_urlparse
.urlunsplit(parts
._replace
(netloc
=(
2184 parts
.hostname
if parts
.port
is None
2185 else '%s:%d' % (parts
.hostname
, parts
.port
))))
2186 auth_payload
= base64
.b64encode(
2187 ('%s:%s' % (parts
.username
, parts
.password
or '')).encode('utf-8'))
2188 return url
, 'Basic ' + auth_payload
.decode('utf-8')
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request for *url* after cleaning it.

    The URL is sanitized and escaped, and any userinfo (user:pass@host)
    embedded in it is moved into a Basic Authorization header.
    """
    cleaned_url = escape_url(sanitize_url(url))
    url, auth_header = extract_basic_auth(cleaned_url)
    if auth_header is not None:
        # Write into whichever headers dict the caller supplied —
        # positionally or by keyword — creating one if necessary.
        if len(args) >= 2:
            headers = args[1]
        else:
            headers = kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)
2200 """Expand shell variables and ~"""
2201 return os
.path
.expandvars(compat_expanduser(s
))
2204 def orderedSet(iterable
):
2205 """ Remove all duplicates from the input iterable """
2213 def _htmlentity_transform(entity_with_semicolon
):
2214 """Transforms an HTML entity to a character."""
2215 entity
= entity_with_semicolon
[:-1]
2217 # Known non-numeric HTML entity
2218 if entity
in compat_html_entities
.name2codepoint
:
2219 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
2221 # TODO: HTML5 allows entities without a semicolon. For example,
# '&amp;Eacute;ric' should be decoded as 'Éric'.
2223 if entity_with_semicolon
in compat_html_entities_html5
:
2224 return compat_html_entities_html5
[entity_with_semicolon
]
2226 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
2227 if mobj
is not None:
2228 numstr
= mobj
.group(1)
2229 if numstr
.startswith('x'):
2231 numstr
= '0%s' % numstr
2234 # See https://github.com/ytdl-org/youtube-dl/issues/7518
2236 return compat_chr(int(numstr
, base
))
2240 # Unknown entity in name, return its literal representation
2241 return '&%s;' % entity
2244 def unescapeHTML(s
):
2247 assert type(s
) == compat_str
2250 r
'&([^&;]+;)', lambda m
: _htmlentity_transform(m
.group(1)), s
)
def escapeHTML(text):
    """Escape the HTML special characters in *text*.

    Replaces the five characters with markup meaning (&, <, >, " and ')
    with their character references. The source's replacement targets had
    been garbled into no-op identity replacements (e.g. '&' -> '&');
    restored to the standard entity forms. '&' is replaced first so already
    produced entities are not double-escaped.
    """
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )
2264 def process_communicate_or_kill(p
, *args
, **kwargs
):
2266 return p
.communicate(*args
, **kwargs
)
2267 except BaseException
: # Including KeyboardInterrupt
2273 def get_subprocess_encoding():
2274 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2275 # For subprocess calls, encode with locale encoding
2276 # Refer to http://stackoverflow.com/a/9951851/35070
2277 encoding
= preferredencoding()
2279 encoding
= sys
.getfilesystemencoding()
2280 if encoding
is None:
2285 def encodeFilename(s
, for_subprocess
=False):
2287 @param s The name of the file
2290 assert type(s
) == compat_str
2292 # Python 3 has a Unicode API
2293 if sys
.version_info
>= (3, 0):
2296 # Pass '' directly to use Unicode APIs on Windows 2000 and up
2297 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
2298 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
2299 if not for_subprocess
and sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2302 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
2303 if sys
.platform
.startswith('java'):
2306 return s
.encode(get_subprocess_encoding(), 'ignore')
2309 def decodeFilename(b
, for_subprocess
=False):
2311 if sys
.version_info
>= (3, 0):
2314 if not isinstance(b
, bytes):
2317 return b
.decode(get_subprocess_encoding(), 'ignore')
2320 def encodeArgument(s
):
2321 if not isinstance(s
, compat_str
):
2322 # Legacy code that uses byte strings
2323 # Uncomment the following line after fixing all post processors
2324 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
2325 s
= s
.decode('ascii')
2326 return encodeFilename(s
, True)
2329 def decodeArgument(b
):
2330 return decodeFilename(b
, True)
2333 def decodeOption(optval
):
2336 if isinstance(optval
, bytes):
2337 optval
= optval
.decode(preferredencoding())
2339 assert isinstance(optval
, compat_str
)
2343 def formatSeconds(secs
, delim
=':', msec
=False):
2345 ret
= '%d%s%02d%s%02d' % (secs
// 3600, delim
, (secs
% 3600) // 60, delim
, secs
% 60)
2347 ret
= '%d%s%02d' % (secs
// 60, delim
, secs
% 60)
2350 return '%s.%03d' % (ret
, secs
% 1) if msec
else ret
2353 def make_HTTPS_handler(params
, **kwargs
):
2354 opts_no_check_certificate
= params
.get('nocheckcertificate', False)
2355 if hasattr(ssl
, 'create_default_context'): # Python >= 3.4 or 2.7.9
2356 context
= ssl
.create_default_context(ssl
.Purpose
.SERVER_AUTH
)
2357 if opts_no_check_certificate
:
2358 context
.check_hostname
= False
2359 context
.verify_mode
= ssl
.CERT_NONE
2361 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2364 # (create_default_context present but HTTPSHandler has no context=)
2367 if sys
.version_info
< (3, 2):
2368 return YoutubeDLHTTPSHandler(params
, **kwargs
)
2369 else: # Python < 3.4
2370 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLSv1
)
2371 context
.verify_mode
= (ssl
.CERT_NONE
2372 if opts_no_check_certificate
2373 else ssl
.CERT_REQUIRED
)
2374 context
.set_default_verify_paths()
2375 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2378 def bug_reports_message(before
=';'):
2379 if ytdl_is_updateable():
2380 update_cmd
= 'type yt-dlp -U to update'
2382 update_cmd
= 'see https://github.com/yt-dlp/yt-dlp on how to update'
2383 msg
= 'please report this issue on https://github.com/yt-dlp/yt-dlp .'
2384 msg
+= ' Make sure you are using the latest version; %s.' % update_cmd
2385 msg
+= ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'
2387 before
= before
.rstrip()
2388 if not before
or before
.endswith(('.', '!', '?')):
2389 msg
= msg
[0].title() + msg
[1:]
2391 return (before
+ ' ' if before
else '') + msg
2394 class YoutubeDLError(Exception):
2395 """Base exception for YoutubeDL errors."""
# Exception types that signal a network problem (rather than a programming
# error); collected as a list, then frozen into a tuple so it can be used
# directly in except clauses and `in` checks.
network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    # ssl.CertificateError is not present on every supported Python version,
    # hence the hasattr() guard.
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)
2405 class ExtractorError(YoutubeDLError
):
2406 """Error during info extraction."""
2408 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None, ie
=None):
2409 """ tb, if given, is the original traceback (so that it can be printed out).
2410 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
2412 if sys
.exc_info()[0] in network_exceptions
:
2417 self
.expected
= expected
2419 self
.video_id
= video_id
2421 self
.exc_info
= sys
.exc_info() # preserve original exception
2423 super(ExtractorError
, self
).__init
__(''.join((
2424 format_field(ie
, template
='[%s] '),
2425 format_field(video_id
, template
='%s: '),
2427 format_field(cause
, template
=' (caused by %r)'),
2428 '' if expected
else bug_reports_message())))
2430 def format_traceback(self
):
2431 if self
.traceback
is None:
2433 return ''.join(traceback
.format_tb(self
.traceback
))
2436 class UnsupportedError(ExtractorError
):
2437 def __init__(self
, url
):
2438 super(UnsupportedError
, self
).__init
__(
2439 'Unsupported URL: %s' % url
, expected
=True)
2443 class RegexNotFoundError(ExtractorError
):
2444 """Error when a regex didn't match"""
2448 class GeoRestrictedError(ExtractorError
):
2449 """Geographic restriction Error exception.
2451 This exception may be thrown when a video is not available from your
2452 geographic location due to geographic restrictions imposed by a website.
2455 def __init__(self
, msg
, countries
=None):
2456 super(GeoRestrictedError
, self
).__init
__(msg
, expected
=True)
2458 self
.countries
= countries
2461 class DownloadError(YoutubeDLError
):
2462 """Download Error exception.
2464 This exception may be thrown by FileDownloader objects if they are not
2465 configured to continue on errors. They will contain the appropriate
2469 def __init__(self
, msg
, exc_info
=None):
2470 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
2471 super(DownloadError
, self
).__init
__(msg
)
2472 self
.exc_info
= exc_info
2475 class EntryNotInPlaylist(YoutubeDLError
):
2476 """Entry not in playlist exception.
2478 This exception will be thrown by YoutubeDL when a requested entry
2479 is not found in the playlist info_dict
2484 class SameFileError(YoutubeDLError
):
2485 """Same File exception.
2487 This exception will be thrown by FileDownloader objects if they detect
2488 multiple files would have to be downloaded to the same file on disk.
2493 class PostProcessingError(YoutubeDLError
):
2494 """Post Processing exception.
2496 This exception may be raised by PostProcessor's .run() method to
2497 indicate an error in the postprocessing task.
2500 def __init__(self
, msg
):
2501 super(PostProcessingError
, self
).__init
__(msg
)
class ExistingVideoReached(YoutubeDLError):
    """ Raised when an already existing (previously downloaded) video is reached.
    NOTE(review): the original docstring ("--max-downloads limit has been
    reached") was a copy-paste from MaxDownloadsReached and did not match
    this class's name. """
class RejectedVideoReached(YoutubeDLError):
    """ Raised when a video rejected by the configured filters is reached.
    NOTE(review): the original docstring ("--max-downloads limit has been
    reached") was a copy-paste from MaxDownloadsReached and did not match
    this class's name. """
2515 class ThrottledDownload(YoutubeDLError
):
2516 """ Download speed below --throttled-rate. """
2520 class MaxDownloadsReached(YoutubeDLError
):
2521 """ --max-downloads limit has been reached. """
2525 class UnavailableVideoError(YoutubeDLError
):
2526 """Unavailable Format exception.
2528 This exception will be thrown when a video is requested
2529 in a format that is not available for that video.
2534 class ContentTooShortError(YoutubeDLError
):
2535 """Content Too Short exception.
2537 This exception may be raised by FileDownloader objects when a file they
2538 download is too small for what the server announced first, indicating
2539 the connection was probably interrupted.
2542 def __init__(self
, downloaded
, expected
):
2543 super(ContentTooShortError
, self
).__init
__(
2544 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded
, expected
)
2547 self
.downloaded
= downloaded
2548 self
.expected
= expected
2551 class XAttrMetadataError(YoutubeDLError
):
2552 def __init__(self
, code
=None, msg
='Unknown error'):
2553 super(XAttrMetadataError
, self
).__init
__(msg
)
2557 # Parsing code and msg
2558 if (self
.code
in (errno
.ENOSPC
, errno
.EDQUOT
)
2559 or 'No space left' in self
.msg
or 'Disk quota exceeded' in self
.msg
):
2560 self
.reason
= 'NO_SPACE'
2561 elif self
.code
== errno
.E2BIG
or 'Argument list too long' in self
.msg
:
2562 self
.reason
= 'VALUE_TOO_LONG'
2564 self
.reason
= 'NOT_SUPPORTED'
2567 class XAttrUnavailableError(YoutubeDLError
):
2571 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
2572 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
2573 # expected HTTP responses to meet HTTP/1.0 or later (see also
2574 # https://github.com/ytdl-org/youtube-dl/issues/6727)
2575 if sys
.version_info
< (3, 0):
2576 kwargs
['strict'] = True
2577 hc
= http_class(*args
, **compat_kwargs(kwargs
))
2578 source_address
= ydl_handler
._params
.get('source_address')
2580 if source_address
is not None:
2581 # This is to workaround _create_connection() from socket where it will try all
2582 # address data from getaddrinfo() including IPv6. This filters the result from
2583 # getaddrinfo() based on the source_address value.
2584 # This is based on the cpython socket.create_connection() function.
2585 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
2586 def _create_connection(address
, timeout
=socket
._GLOBAL
_DEFAULT
_TIMEOUT
, source_address
=None):
2587 host
, port
= address
2589 addrs
= socket
.getaddrinfo(host
, port
, 0, socket
.SOCK_STREAM
)
2590 af
= socket
.AF_INET
if '.' in source_address
[0] else socket
.AF_INET6
2591 ip_addrs
= [addr
for addr
in addrs
if addr
[0] == af
]
2592 if addrs
and not ip_addrs
:
2593 ip_version
= 'v4' if af
== socket
.AF_INET
else 'v6'
2595 "No remote IP%s addresses available for connect, can't use '%s' as source address"
2596 % (ip_version
, source_address
[0]))
2597 for res
in ip_addrs
:
2598 af
, socktype
, proto
, canonname
, sa
= res
2601 sock
= socket
.socket(af
, socktype
, proto
)
2602 if timeout
is not socket
._GLOBAL
_DEFAULT
_TIMEOUT
:
2603 sock
.settimeout(timeout
)
2604 sock
.bind(source_address
)
2606 err
= None # Explicitly break reference cycle
2608 except socket
.error
as _
:
2610 if sock
is not None:
2615 raise socket
.error('getaddrinfo returns an empty list')
2616 if hasattr(hc
, '_create_connection'):
2617 hc
._create
_connection
= _create_connection
2618 sa
= (source_address
, 0)
2619 if hasattr(hc
, 'source_address'): # Python 2.7+
2620 hc
.source_address
= sa
2622 def _hc_connect(self
, *args
, **kwargs
):
2623 sock
= _create_connection(
2624 (self
.host
, self
.port
), self
.timeout
, sa
)
2626 self
.sock
= ssl
.wrap_socket(
2627 sock
, self
.key_file
, self
.cert_file
,
2628 ssl_version
=ssl
.PROTOCOL_TLSv1
)
2631 hc
.connect
= functools
.partial(_hc_connect
, hc
)
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker from *headers*.

    When the marker is present, a new dict is returned with the marker and
    any Accept-Encoding header (case-insensitive) removed; otherwise the
    original mapping is returned unchanged (same object).
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    cleaned = dict((name, value) for name, value in headers.items()
                   if name.lower() != 'accept-encoding')
    del cleaned['Youtubedl-no-compression']
    return cleaned
2646 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
2647 """Handler for HTTP requests and responses.
2649 This class, when installed with an OpenerDirector, automatically adds
2650 the standard headers to every HTTP request and handles gzipped and
2651 deflated responses from web servers. If compression is to be avoided in
2652 a particular request, the original request in the program code only has
2653 to include the HTTP header "Youtubedl-no-compression", which will be
2654 removed before making the real request.
2656 Part of this code was copied from:
2658 http://techknack.net/python-urllib2-handlers/
2660 Andrew Rowls, the author of that code, agreed to release it to the
    def __init__(self, params, *args, **kwargs):
        # Delegate standard handler setup to the stdlib HTTPHandler.
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # params: the YoutubeDL options dict; stored so the connection
        # factory (_create_http_connection) can read e.g. 'source_address'.
        self._params = params
2668 def http_open(self
, req
):
2669 conn_class
= compat_http_client
.HTTPConnection
2671 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2673 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2674 del req
.headers
['Ytdl-socks-proxy']
2676 return self
.do_open(functools
.partial(
2677 _create_http_connection
, self
, conn_class
, False),
2685 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
2687 return zlib
.decompress(data
)
2689 def http_request(self
, req
):
2690 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
2691 # always respected by websites, some tend to give out URLs with non percent-encoded
2692 # non-ASCII characters (see telemb.py, ard.py [#3412])
2693 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
2694 # To work around aforementioned issue we will replace request's original URL with
2695 # percent-encoded one
2696 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
2697 # the code of this workaround has been moved here from YoutubeDL.urlopen()
2698 url
= req
.get_full_url()
2699 url_escaped
= escape_url(url
)
2701 # Substitute URL if any change after escaping
2702 if url
!= url_escaped
:
2703 req
= update_Request(req
, url
=url_escaped
)
2705 for h
, v
in std_headers
.items():
2706 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
2707 # The dict keys are capitalized because of this bug by urllib
2708 if h
.capitalize() not in req
.headers
:
2709 req
.add_header(h
, v
)
2711 req
.headers
= handle_youtubedl_headers(req
.headers
)
2713 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
2714 # Python 2.6 is brain-dead when it comes to fragments
2715 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
2716 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
2720 def http_response(self
, req
, resp
):
2723 if resp
.headers
.get('Content-encoding', '') == 'gzip':
2724 content
= resp
.read()
2725 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
2727 uncompressed
= io
.BytesIO(gz
.read())
2728 except IOError as original_ioerror
:
# There may be junk at the end of the file
2730 # See http://stackoverflow.com/q/4928560/35070 for details
2731 for i
in range(1, 1024):
2733 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
2734 uncompressed
= io
.BytesIO(gz
.read())
2739 raise original_ioerror
2740 resp
= compat_urllib_request
.addinfourl(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2741 resp
.msg
= old_resp
.msg
2742 del resp
.headers
['Content-encoding']
2744 if resp
.headers
.get('Content-encoding', '') == 'deflate':
2745 gz
= io
.BytesIO(self
.deflate(resp
.read()))
2746 resp
= compat_urllib_request
.addinfourl(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2747 resp
.msg
= old_resp
.msg
2748 del resp
.headers
['Content-encoding']
2749 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
2750 # https://github.com/ytdl-org/youtube-dl/issues/6457).
2751 if 300 <= resp
.code
< 400:
2752 location
= resp
.headers
.get('Location')
2754 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
2755 if sys
.version_info
>= (3, 0):
2756 location
= location
.encode('iso-8859-1').decode('utf-8')
2758 location
= location
.decode('utf-8')
2759 location_escaped
= escape_url(location
)
2760 if location
!= location_escaped
:
2761 del resp
.headers
['Location']
2762 if sys
.version_info
< (3, 0):
2763 location_escaped
= location_escaped
.encode('utf-8')
2764 resp
.headers
['Location'] = location_escaped
2767 https_request
= http_request
2768 https_response
= http_response
def make_socks_conn_class(base_class, socks_proxy):
    """Build a subclass of *base_class* whose connect() tunnels through the
    SOCKS proxy given as a URL (socks/socks4/socks4a/socks5 scheme)."""
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        # Credentials in the proxy URL are percent-encoded
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            # Establish the raw SOCKS connection first ...
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # ... then wrap it in TLS when the base class is HTTPS
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that uses a configurable connection class and honours
    the internal 'Ytdl-socks-proxy' request header."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        # Forward TLS context / hostname checking when the base handler has them
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # Route through SOCKS when the request carries the internal header
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                secure = 'TRUE' if cookie.secure else 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Strip the curl-style HttpOnly marker so the stock parser accepts it
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also applies cookie handling to HTTPS traffic."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Apply the same processing to HTTPS as to HTTP
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    # Treat every redirect status (incl. 308) like the stock 302 handler
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # On python 2 urlh.geturl() may sometimes return redirect URL
        # as byte string instead of unicode. This workaround allows
        # to force it always return unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns (timezone_as_timedelta, date_str_without_tz); the delta is zero
    when no timezone (or a bare 'Z') is present.
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        return datetime.timedelta(), date_str

    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # Bare 'Z' designator: UTC
        return datetime.timedelta(), date_str

    sign = 1 if m.group('sign') == '+' else -1
    timezone = datetime.timedelta(
        hours=sign * int(m.group('hours')),
        minutes=sign * int(m.group('minutes')))
    return timezone, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # Fractional seconds are not representable by strptime here; drop them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass
def date_formats(day_first=True):
    """Return the strptime format list matching day-first or month-first order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Fall back to the RFC 2822 parser from the email package
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
def unified_timestamp(date_str, day_first=True):
    """Parse a free-form date/time string to a UNIX timestamp (or None)."""
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass

    # Fall back to the RFC 2822 parser
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
def determine_ext(url, default_ext='unknown_video'):
    """Guess a media file extension from *url*; *default_ext* when undecidable."""
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = guess.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Derive a subtitle file name by swapping the media extension for
    '<lang>.<format>'."""
    new_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, new_ext, expected_real_ext)
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: string date format used to return datetime object from
    precision: round the time portion of a datetime object.
                auto|microsecond|second|minute|hour|day.
                auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.now(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
        date_str)
    if match is not None:
        # Recursively resolve the base date, then apply the signed offset
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            # Calendar-aware arithmetic; round months/years at day granularity
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
def date_from_str(date_str, format='%Y%m%d'):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: string date format used to return datetime object from
    """
    dt = datetime_from_str(date_str, precision='microsecond', format=format)
    return dt.date()
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months."""
    # Work with a 0-based month index so // and % handle negative offsets
    month_index = dt.month + months - 1
    year = dt.year + month_index // 12
    month = month_index % 12 + 1
    # Clamp the day to the target month's length (e.g. Jan 31 + 1 month)
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }

    def _round_to(value, step):
        # Round half up to the nearest multiple of step
        return ((value + step / 2) // step) * step

    timestamp = calendar.timegm(dt.timetuple())
    return datetime.datetime.utcfromtimestamp(_round_to(timestamp, unit_seconds[precision]))
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        # Not an 8-digit date: return unchanged
        return date_str
    return '-'.join(match.groups())
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        # Some environments may hand back bytes; decode with the locale encoding
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res
def get_windows_version():
    ''' Get Windows version. None if it's not running on Windows '''
    if compat_os_name != 'nt':
        return None
    return version_tuple(platform.win32_ver()[1])
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Map C-level file descriptors to the Win32 standard handle constants
    WIN_OUTPUT_IDS = {
        1: -11,  # STD_OUTPUT_HANDLE
        2: -12,  # STD_ERROR_HANDLE
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A handle only counts as a console when it is a character device
        # that answers GetConsoleMode
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)
        # Non-BMP characters occupy two UTF-16 code units
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
3378 def write_string(s
, out
=None, encoding
=None):
3381 assert type(s
) == compat_str
3383 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
3384 if _windows_write_string(s
, out
):
3387 if ('b' in getattr(out
, 'mode', '')
3388 or sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
3389 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
3391 elif hasattr(out
, 'buffer'):
3392 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
3393 byt
= s
.encode(enc
, 'ignore')
3394 out
.buffer.write(byt
)
def bytes_to_intlist(bs):
    """Return the byte string *bs* as a list of integer byte values."""
    if not bs:
        return []
    # Python 3 byte strings already iterate as ints; Python 2 yields chars
    return list(bs) if isinstance(bs[0], int) else [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a list of integer byte values back into a byte string."""
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: low/high halves of the byte range
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED pointer alive on the file object for unlock
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    """File wrapper that takes an advisory lock for the lifetime of a
    `with` block (shared for 'r', exclusive for 'a'/'w')."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            # Don't leak the handle when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Name of the filesystem encoding, defaulting to UTF-8 when unknown."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
def shell_quote(args):
    """Join *args* into a single shell-safe command-line string."""
    enc = get_filesystem_encoding()
    # We may get a filename encoded with 'encodeFilename'
    return ' '.join(
        compat_shlex_quote(a.decode(enc) if isinstance(a, bytes) else a)
        for a in args)
3537 def smuggle_url(url
, data
):
3538 """ Pass additional data in a URL for internal use. """
3540 url
, idata
= unsmuggle_url(url
, {})
3542 sdata
= compat_urllib_parse_urlencode(
3543 {'__youtubedl_smuggle': json.dumps(data)}
)
3544 return url
+ '#' + sdata
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url(): returns (clean_url, data_or_default)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
def format_bytes(bytes):
    """Human-readable binary size, e.g. 1024 -> '1.00KiB'; 'N/A' for None."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' from the start of *s* using the multiplier map
    *unit_table*; returns an int, or None when nothing matches."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    # Accept ',' as a decimal separator too
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
3581 def parse_filesize(s
):
3585 # The lower-case forms are of course incorrect and unofficial,
3586 # but we support those too
3603 'megabytes': 1000 ** 2,
3604 'mebibytes': 1024 ** 2,
3610 'gigabytes': 1000 ** 3,
3611 'gibibytes': 1024 ** 3,
3617 'terabytes': 1000 ** 4,
3618 'tebibytes': 1024 ** 4,
3624 'petabytes': 1000 ** 5,
3625 'pebibytes': 1024 ** 5,
3631 'exabytes': 1000 ** 6,
3632 'exbibytes': 1024 ** 6,
3638 'zettabytes': 1000 ** 7,
3639 'zebibytes': 1024 ** 7,
3645 'yottabytes': 1000 ** 8,
3646 'yobibytes': 1024 ** 8,
3649 return lookup_unit_table(_UNIT_TABLE
, s
)
3658 if re
.match(r
'^[\d,.]+$', s
):
3659 return str_to_int(s
)
3670 return lookup_unit_table(_UNIT_TABLE
, s
)
def parse_resolution(s):
    """Extract {'width', 'height'} from strings like '1920x1080', '720p' or '4k'."""
    if s is None:
        return {}

    # WIDTHxHEIGHT
    m = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
    if m:
        return {
            'width': int(m.group('w')),
            'height': int(m.group('h')),
        }

    # e.g. 1080p / 1080i
    m = re.search(r'\b(\d+)[pPiI]\b', s)
    if m:
        return {'height': int(m.group(1))}

    # 4k / 8k marketing labels
    m = re.search(r'\b([48])[kK]\b', s)
    if m:
        return {'height': int(m.group(1)) * 540}

    return {}
3695 def parse_bitrate(s
):
3696 if not isinstance(s
, compat_str
):
3698 mobj
= re
.search(r
'\b(\d+)\s*kbps', s
)
3700 return int(mobj
.group(1))
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    # Fall back to English when the requested language is unknown
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        return month_names.index(name) + 1
    except ValueError:
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    abbreviations = [s[:3] for s in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
def fix_xml_ampersands(xml_str):
    """Escape raw '&' characters in XML, leaving existing entities intact."""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    """Best effort: rename the process via glibc prctl(PR_SET_NAME)."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
3758 def remove_start(s
, start
):
3759 return s
[len(start
):] if s
is not None and s
.startswith(start
) else s
3762 def remove_end(s
, end
):
3763 return s
[:-len(end
)] if s
is not None and s
.endswith(end
) else s
3766 def remove_quotes(s
):
3767 if s
is None or len(s
) < 2:
3769 for quote
in ('"', "'", ):
3770 if s
[0] == quote
and s
[-1] == quote
:
3775 def get_domain(url
):
3776 domain
= re
.match(r
'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url
)
3777 return domain
.group('domain') if domain
else None
def url_basename(url):
    """Last path component of *url* (empty string when the path is empty)."""
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').split('/')[-1]
3786 return re
.match(r
'https?://[^?#&]+/', url
).group()
3789 def urljoin(base
, path
):
3790 if isinstance(path
, bytes):
3791 path
= path
.decode('utf-8')
3792 if not isinstance(path
, compat_str
) or not path
:
3794 if re
.match(r
'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path
):
3796 if isinstance(base
, bytes):
3797 base
= base
.decode('utf-8')
3798 if not isinstance(base
, compat_str
) or not re
.match(
3799 r
'^(?:https?:)?//', base
):
3801 return compat_urlparse
.urljoin(base
, path
)
class HEADRequest(compat_urllib_request.Request):
    """Request subclass that always issues HEAD."""

    def get_method(self):
        return 'HEAD'
class PUTRequest(compat_urllib_request.Request):
    """Request subclass that always issues PUT."""

    def get_method(self):
        return 'PUT'
3814 def int_or_none(v
, scale
=1, default
=None, get_attr
=None, invscale
=1):
3817 v
= getattr(v
, get_attr
, None)
3823 return int(v
) * invscale
// scale
3824 except (ValueError, TypeError):
def str_or_none(v, default=None):
    """Convert *v* to a string, returning *default* when it is None."""
    if v is None:
        return default
    return compat_str(v)
3832 def str_to_int(int_str
):
3833 """ A more relaxed version of int_or_none """
3834 if isinstance(int_str
, compat_integer_types
):
3836 elif isinstance(int_str
, compat_str
):
3837 int_str
= re
.sub(r
'[,\.\+]', '', int_str
)
3838 return int_or_none(int_str
)
3841 def float_or_none(v
, scale
=1, invscale
=1, default
=None):
3845 return float(v
) * invscale
/ scale
3846 except (ValueError, TypeError):
def bool_or_none(v, default=None):
    """Return *v* only when it is an actual bool; anything else yields *default*."""
    if isinstance(v, bool):
        return v
    return default
def strip_or_none(v, default=None):
    """Return *v* with surrounding whitespace removed; non-strings yield *default*."""
    if isinstance(v, compat_str):
        return v.strip()
    return default
3858 def url_or_none(url
):
3859 if not url
or not isinstance(url
, compat_str
):
3862 return url
if re
.match(r
'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url
) else None
3865 def strftime_or_none(timestamp
, date_format
, default
=None):
3866 datetime_object
= None
3868 if isinstance(timestamp
, compat_numeric_types
): # unix timestamp
3869 datetime_object
= datetime
.datetime
.utcfromtimestamp(timestamp
)
3870 elif isinstance(timestamp
, compat_str
): # assume YYYYMMDD
3871 datetime_object
= datetime
.datetime
.strptime(timestamp
, '%Y%m%d')
3872 return datetime_object
.strftime(date_format
)
3873 except (ValueError, TypeError, AttributeError):
3877 def parse_duration(s
):
3878 if not isinstance(s
, compat_basestring
):
3883 days
, hours
, mins
, secs
, ms
= [None] * 5
3884 m
= re
.match(r
'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s
)
3886 days
, hours
, mins
, secs
, ms
= m
.groups()
3891 [0-9]+\s*y(?:ears?)?\s*
3894 [0-9]+\s*m(?:onths?)?\s*
3897 [0-9]+\s*w(?:eeks?)?\s*
3900 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
3904 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
3907 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
3910 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
3913 days
, hours
, mins
, secs
, ms
= m
.groups()
3915 m
= re
.match(r
'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s
)
3917 hours
, mins
= m
.groups()
3923 duration
+= float(secs
)
3925 duration
+= float(mins
) * 60
3927 duration
+= float(hours
) * 60 * 60
3929 duration
+= float(days
) * 24 * 60 * 60
3931 duration
+= float(ms
)
3935 def prepend_extension(filename
, ext
, expected_real_ext
=None):
3936 name
, real_ext
= os
.path
.splitext(filename
)
3938 '{0}.{1}{2}'.format(name
, ext
, real_ext
)
3939 if not expected_real_ext
or real_ext
[1:] == expected_real_ext
3940 else '{0}.{1}'.format(filename
, ext
))
3943 def replace_extension(filename
, ext
, expected_real_ext
=None):
3944 name
, real_ext
= os
.path
.splitext(filename
)
3945 return '{0}.{1}'.format(
3946 name
if not expected_real_ext
or real_ext
[1:] == expected_real_ext
else filename
,
3950 def check_executable(exe
, args
=[]):
3951 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
3952 args can be a list of arguments for a short output (like -version) """
3954 process_communicate_or_kill(subprocess
.Popen(
3955 [exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
))
3961 def get_exe_version(exe
, args
=['--version'],
3962 version_re
=None, unrecognized
='present'):
3963 """ Returns the version of the specified executable,
3964 or False if the executable is not present """
3966 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
3967 # SIGTTOU if yt-dlp is run in the background.
3968 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
3969 out
, _
= process_communicate_or_kill(subprocess
.Popen(
3970 [encodeArgument(exe
)] + args
,
3971 stdin
=subprocess
.PIPE
,
3972 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
))
3975 if isinstance(out
, bytes): # Python 2.x
3976 out
= out
.decode('ascii', 'ignore')
3977 return detect_exe_version(out
, version_re
, unrecognized
)
3980 def detect_exe_version(output
, version_re
=None, unrecognized
='present'):
3981 assert isinstance(output
, compat_str
)
3982 if version_re
is None:
3983 version_re
= r
'version\s+([-0-9._a-zA-Z]+)'
3984 m
= re
.search(version_re
, output
)
3991 class LazyList(collections
.abc
.Sequence
):
3992 ''' Lazy immutable list from an iterable
3993 Note that slices of a LazyList are lists and not LazyList'''
3995 class IndexError(IndexError):
3998 def __init__(self
, iterable
):
3999 self
.__iterable
= iter(iterable
)
4001 self
.__reversed
= False
4005 # We need to consume the entire iterable to iterate in reverse
4006 yield from self
.exhaust()
4008 yield from self
.__cache
4009 for item
in self
.__iterable
:
4010 self
.__cache
.append(item
)
4013 def __exhaust(self
):
4014 self
.__cache
.extend(self
.__iterable
)
4018 ''' Evaluate the entire iterable '''
4019 return self
.__exhaust
()[::-1 if self
.__reversed
else 1]
4022 def __reverse_index(x
):
4023 return None if x
is None else -(x
+ 1)
4025 def __getitem__(self
, idx
):
4026 if isinstance(idx
, slice):
4028 idx
= slice(self
.__reverse
_index
(idx
.start
), self
.__reverse
_index
(idx
.stop
), -(idx
.step
or 1))
4029 start
, stop
, step
= idx
.start
, idx
.stop
, idx
.step
or 1
4030 elif isinstance(idx
, int):
4032 idx
= self
.__reverse
_index
(idx
)
4033 start
, stop
, step
= idx
, idx
, 0
4035 raise TypeError('indices must be integers or slices')
4036 if ((start
or 0) < 0 or (stop
or 0) < 0
4037 or (start
is None and step
< 0)
4038 or (stop
is None and step
> 0)):
4039 # We need to consume the entire iterable to be able to slice from the end
4040 # Obviously, never use this with infinite iterables
4043 return self
.__cache
[idx
]
4044 except IndexError as e
:
4045 raise self
.IndexError(e
) from e
4046 n
= max(start
or 0, stop
or 0) - len(self
.__cache
) + 1
4048 self
.__cache
.extend(itertools
.islice(self
.__iterable
, n
))
4050 return self
.__cache
[idx
]
4051 except IndexError as e
:
4052 raise self
.IndexError(e
) from e
4056 self
[-1] if self
.__reversed
else self
[0]
4057 except self
.IndexError:
4063 return len(self
.__cache
)
4066 self
.__reversed
= not self
.__reversed
4070 # repr and str should mimic a list. So we exhaust the iterable
4071 return repr(self
.exhaust())
4074 return repr(self
.exhaust())
4079 # This is only useful for tests
4080 return len(self
.getslice())
4082 def __init__(self
, pagefunc
, pagesize
, use_cache
=True):
4083 self
._pagefunc
= pagefunc
4084 self
._pagesize
= pagesize
4085 self
._use
_cache
= use_cache
4088 def getpage(self
, pagenum
):
4089 page_results
= self
._cache
.get(pagenum
) or list(self
._pagefunc
(pagenum
))
4091 self
._cache
[pagenum
] = page_results
4094 def getslice(self
, start
=0, end
=None):
4095 return list(self
._getslice
(start
, end
))
4097 def _getslice(self
, start
, end
):
4098 raise NotImplementedError('This method must be implemented by subclasses')
4100 def __getitem__(self
, idx
):
4101 # NOTE: cache must be enabled if this is used
4102 if not isinstance(idx
, int) or idx
< 0:
4103 raise TypeError('indices must be non-negative integers')
4104 entries
= self
.getslice(idx
, idx
+ 1)
4105 return entries
[0] if entries
else None
4108 class OnDemandPagedList(PagedList
):
4109 def _getslice(self
, start
, end
):
4110 for pagenum
in itertools
.count(start
// self
._pagesize
):
4111 firstid
= pagenum
* self
._pagesize
4112 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
4113 if start
>= nextfirstid
:
4117 start
% self
._pagesize
4118 if firstid
<= start
< nextfirstid
4121 ((end
- 1) % self
._pagesize
) + 1
4122 if (end
is not None and firstid
<= end
<= nextfirstid
)
4125 page_results
= self
.getpage(pagenum
)
4126 if startv
!= 0 or endv
is not None:
4127 page_results
= page_results
[startv
:endv
]
4128 yield from page_results
4130 # A little optimization - if current page is not "full", ie. does
4131 # not contain page_size videos then we can assume that this page
4132 # is the last one - there are no more ids on further pages -
4133 # i.e. no need to query again.
4134 if len(page_results
) + startv
< self
._pagesize
:
4137 # If we got the whole page, but the next page is not interesting,
4138 # break out early as well
4139 if end
== nextfirstid
:
4143 class InAdvancePagedList(PagedList
):
4144 def __init__(self
, pagefunc
, pagecount
, pagesize
):
4145 self
._pagecount
= pagecount
4146 PagedList
.__init
__(self
, pagefunc
, pagesize
, True)
4148 def _getslice(self
, start
, end
):
4149 start_page
= start
// self
._pagesize
4151 self
._pagecount
if end
is None else (end
// self
._pagesize
+ 1))
4152 skip_elems
= start
- start_page
* self
._pagesize
4153 only_more
= None if end
is None else end
- start
4154 for pagenum
in range(start_page
, end_page
):
4155 page_results
= self
.getpage(pagenum
)
4157 page_results
= page_results
[skip_elems
:]
4159 if only_more
is not None:
4160 if len(page_results
) < only_more
:
4161 only_more
-= len(page_results
)
4163 yield from page_results
[:only_more
]
4165 yield from page_results
4168 def uppercase_escape(s
):
4169 unicode_escape
= codecs
.getdecoder('unicode_escape')
4171 r
'\\U[0-9a-fA-F]{8}',
4172 lambda m
: unicode_escape(m
.group(0))[0],
4176 def lowercase_escape(s
):
4177 unicode_escape
= codecs
.getdecoder('unicode_escape')
4179 r
'\\u[0-9a-fA-F]{4}',
4180 lambda m
: unicode_escape(m
.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 the text must be UTF-8 bytes before percent-quoting
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # Characters that must survive unescaped (RFC 3986 reserved + unreserved marks)
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
4191 def escape_url(url
):
4192 """Escape URL as suggested by RFC 3986"""
4193 url_parsed
= compat_urllib_parse_urlparse(url
)
4194 return url_parsed
._replace
(
4195 netloc
=url_parsed
.netloc
.encode('idna').decode('ascii'),
4196 path
=escape_rfc3986(url_parsed
.path
),
4197 params
=escape_rfc3986(url_parsed
.params
),
4198 query
=escape_rfc3986(url_parsed
.query
),
4199 fragment
=escape_rfc3986(url_parsed
.fragment
)
4204 return compat_parse_qs(compat_urllib_parse_urlparse(url
).query
)
4207 def read_batch_urls(batch_fd
):
4209 if not isinstance(url
, compat_str
):
4210 url
= url
.decode('utf-8', 'replace')
4211 BOM_UTF8
= ('\xef\xbb\xbf', '\ufeff')
4212 for bom
in BOM_UTF8
:
4213 if url
.startswith(bom
):
4214 url
= url
[len(bom
):]
4216 if not url
or url
.startswith(('#', ';', ']')):
4218 # "#" cannot be stripped out since it is part of the URI
4219 # However, it can be safely stipped out if follwing a whitespace
4220 return re
.split(r
'\s#', url
, 1)[0].rstrip()
4222 with contextlib
.closing(batch_fd
) as fd
:
4223 return [url
for url
in map(fixup
, fd
) if url
]
def urlencode_postdata(*args, **kargs):
    """URL-encode form data and return it as ASCII bytes suitable for a POST body."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
4230 def update_url_query(url
, query
):
4233 parsed_url
= compat_urlparse
.urlparse(url
)
4234 qs
= compat_parse_qs(parsed_url
.query
)
4236 return compat_urlparse
.urlunparse(parsed_url
._replace
(
4237 query
=compat_urllib_parse_urlencode(qs
, True)))
4240 def update_Request(req
, url
=None, data
=None, headers
={}, query={}
):
4241 req_headers
= req
.headers
.copy()
4242 req_headers
.update(headers
)
4243 req_data
= data
or req
.data
4244 req_url
= update_url_query(url
or req
.get_full_url(), query
)
4245 req_get_method
= req
.get_method()
4246 if req_get_method
== 'HEAD':
4247 req_type
= HEADRequest
4248 elif req_get_method
== 'PUT':
4249 req_type
= PUTRequest
4251 req_type
= compat_urllib_request
.Request
4253 req_url
, data
=req_data
, headers
=req_headers
,
4254 origin_req_host
=req
.origin_req_host
, unverifiable
=req
.unverifiable
)
4255 if hasattr(req
, 'timeout'):
4256 new_req
.timeout
= req
.timeout
4260 def _multipart_encode_impl(data
, boundary
):
4261 content_type
= 'multipart/form-data; boundary=%s' % boundary
4264 for k
, v
in data
.items():
4265 out
+= b
'--' + boundary
.encode('ascii') + b
'\r\n'
4266 if isinstance(k
, compat_str
):
4267 k
= k
.encode('utf-8')
4268 if isinstance(v
, compat_str
):
4269 v
= v
.encode('utf-8')
4270 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
4271 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
4272 content
= b
'Content-Disposition: form-data; name="' + k
+ b
'"\r\n\r\n' + v
+ b
'\r\n'
4273 if boundary
.encode('ascii') in content
:
4274 raise ValueError('Boundary overlaps with data')
4277 out
+= b
'--' + boundary
.encode('ascii') + b
'--\r\n'
4279 return out
, content_type
4282 def multipart_encode(data
, boundary
=None):
4284 Encode a dict to RFC 7578-compliant form-data
4287 A dict where keys and values can be either Unicode or bytes-like
4290 If specified a Unicode object, it's used as the boundary. Otherwise
4291 a random boundary is generated.
4293 Reference: https://tools.ietf.org/html/rfc7578
4295 has_specified_boundary
= boundary
is not None
4298 if boundary
is None:
4299 boundary
= '---------------' + str(random
.randrange(0x0fffffff, 0xffffffff))
4302 out
, content_type
= _multipart_encode_impl(data
, boundary
)
4305 if has_specified_boundary
:
4309 return out
, content_type
4312 def dict_get(d
, key_or_keys
, default
=None, skip_false_values
=True):
4313 if isinstance(key_or_keys
, (list, tuple)):
4314 for key
in key_or_keys
:
4315 if key
not in d
or d
[key
] is None or skip_false_values
and not d
[key
]:
4319 return d
.get(key_or_keys
, default
)
4322 def try_get(src
, getter
, expected_type
=None):
4323 for get
in variadic(getter
):
4326 except (AttributeError, KeyError, TypeError, IndexError):
4329 if expected_type
is None or isinstance(v
, expected_type
):
4333 def merge_dicts(*dicts
):
4335 for a_dict
in dicts
:
4336 for k
, v
in a_dict
.items():
4340 or (isinstance(v
, compat_str
) and v
4341 and isinstance(merged
[k
], compat_str
)
4342 and not merged
[k
])):
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Decode *string* into compat_str unless it already is one.

    NOTE: the *encoding* default is evaluated once at definition time
    (original behavior, deliberately preserved).
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
4360 TV_PARENTAL_GUIDELINES
= {
4370 def parse_age_limit(s
):
4372 return s
if 0 <= s
<= 21 else None
4373 if not isinstance(s
, compat_basestring
):
4375 m
= re
.match(r
'^(?P<age>\d{1,2})\+?$', s
)
4377 return int(m
.group('age'))
4380 return US_RATINGS
[s
]
4381 m
= re
.match(r
'^TV[_-]?(%s)$' % '|'.join(k
[3:] for k
in TV_PARENTAL_GUIDELINES
), s
)
4383 return TV_PARENTAL_GUIDELINES
['TV-' + m
.group(1)]
4387 def strip_jsonp(code
):
4390 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
4391 (?:\s*&&\s*(?P=func_name))?
4392 \s*\(\s*(?P<callback_data>.*)\);?
4393 \s*?(?://[^\n]*)*$''',
4394 r
'\g<callback_data>', code
)
4397 def js_to_json(code
, vars={}):
4398 # vars is a dict of var, val pairs to substitute
4399 COMMENT_RE
= r
'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
4400 SKIP_RE
= r
'\s*(?:{comment})?\s*'.format(comment
=COMMENT_RE
)
4402 (r
'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip
=SKIP_RE
), 16),
4403 (r
'(?s)^(0+[0-7]+){skip}:?$'.format(skip
=SKIP_RE
), 8),
4408 if v
in ('true', 'false', 'null'):
4410 elif v
in ('undefined', 'void 0'):
4412 elif v
.startswith('/*') or v
.startswith('//') or v
.startswith('!') or v
== ',':
4415 if v
[0] in ("'", '"'):
4416 v
= re
.sub(r
'(?s)\\.|"', lambda m
: {
4421 }.get(m
.group(0), m
.group(0)), v
[1:-1])
4423 for regex
, base
in INTEGER_TABLE
:
4424 im
= re
.match(regex
, v
)
4426 i
= int(im
.group(1), base
)
4427 return '"%d":' % i
if v
.endswith(':') else '%d' % i
4434 return re
.sub(r
'''(?sx)
4435 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
4436 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4437 {comment}|,(?={skip}[\]}}])|
4438 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
4439 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
4442 '''.format(comment
=COMMENT_RE
, skip
=SKIP_RE
), fix_kv
, code
)
4445 def qualities(quality_ids
):
4446 """ Get a numeric quality value out of a list of possible values """
4449 return quality_ids
.index(qid
)
4456 'default': '%(title)s [%(id)s].%(ext)s',
4457 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
4463 'description': 'description',
4464 'annotation': 'annotations.xml',
4465 'infojson': 'info.json',
4466 'pl_thumbnail': None,
4467 'pl_description': 'description',
4468 'pl_infojson': 'info.json',
4471 # As of [1] format syntax is:
4472 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
4473 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
4474 STR_FORMAT_RE_TMPL
= r
'''(?x)
4475 (?<!%)(?P<prefix>(?:%%)*)
4477 (?P<has_key>\((?P<key>{0})\))?
4479 (?P<conversion>[#0\-+ ]+)?
4481 (?P<precision>\.\d+)?
4482 (?P<len_mod>[hlL])? # unused in python
4483 {1} # conversion type
4488 STR_FORMAT_TYPES
= 'diouxXeEfFgGcrs'
4491 def limit_length(s
, length
):
4492 """ Add ellipses to overly long strings """
4497 return s
[:length
- len(ELLIPSES
)] + ELLIPSES
def version_tuple(v):
    """Split a version string on '.' or '-' and return its parts as a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
4505 def is_outdated_version(version
, limit
, assume_new
=True):
4507 return not assume_new
4509 return version_tuple(version
) < version_tuple(limit
)
4511 return not assume_new
4514 def ytdl_is_updateable():
4515 """ Returns if yt-dlp can be updated with -U """
4518 from zipimport
import zipimporter
4520 return isinstance(globals().get('__loader__'), zipimporter
) or hasattr(sys
, 'frozen')
def args_to_str(args):
    """Build a short, shell-quoted single-line representation of a subprocess command."""
    quoted = (compat_shlex_quote(a) for a in args)
    return ' '.join(quoted)
4528 def error_to_compat_str(err
):
4530 # On python 2 error byte string must be decoded with proper
4531 # encoding rather than ascii
4532 if sys
.version_info
[0] < 3:
4533 err_str
= err_str
.decode(preferredencoding())
4537 def mimetype2ext(mt
):
4543 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
4544 # it's the most popular one
4545 'audio/mpeg': 'mp3',
4546 'audio/x-wav': 'wav',
4551 _
, _
, res
= mt
.rpartition('/')
4552 res
= res
.split(';')[0].strip().lower()
4556 'smptett+xml': 'tt',
4560 'x-mp4-fragmented': 'mp4',
4561 'x-ms-sami': 'sami',
4564 'x-mpegurl': 'm3u8',
4565 'vnd.apple.mpegurl': 'm3u8',
4569 'vnd.ms-sstr+xml': 'ism',
4576 def parse_codecs(codecs_str
):
4577 # http://tools.ietf.org/html/rfc6381
4580 split_codecs
= list(filter(None, map(
4581 str.strip
, codecs_str
.strip().strip(',').split(','))))
4582 vcodec
, acodec
= None, None
4583 for full_codec
in split_codecs
:
4584 codec
= full_codec
.split('.')[0]
4585 if codec
in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'):
4588 elif codec
in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
4592 write_string('WARNING: Unknown codec %s\n' % full_codec
, sys
.stderr
)
4593 if not vcodec
and not acodec
:
4594 if len(split_codecs
) == 2:
4596 'vcodec': split_codecs
[0],
4597 'acodec': split_codecs
[1],
4601 'vcodec': vcodec
or 'none',
4602 'acodec': acodec
or 'none',
4607 def urlhandle_detect_ext(url_handle
):
4608 getheader
= url_handle
.headers
.get
4610 cd
= getheader('Content-Disposition')
4612 m
= re
.match(r
'attachment;\s*filename="(?P<filename>[^"]+)"', cd
)
4614 e
= determine_ext(m
.group('filename'), default_ext
=None)
4618 return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Wrap *data* (bytes) into a base64 'data:' URI carrying *mime_type*."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
4625 def age_restricted(content_limit
, age_limit
):
4626 """ Returns True iff the content should be blocked """
4628 if age_limit
is None: # No limit set
4630 if content_limit
is None:
4631 return False # Content available for everyone
4632 return age_limit
< content_limit
4635 def is_html(first_bytes
):
4636 """ Detect whether a file contains HTML by examining its first bytes. """
4639 (b
'\xef\xbb\xbf', 'utf-8'),
4640 (b
'\x00\x00\xfe\xff', 'utf-32-be'),
4641 (b
'\xff\xfe\x00\x00', 'utf-32-le'),
4642 (b
'\xff\xfe', 'utf-16-le'),
4643 (b
'\xfe\xff', 'utf-16-be'),
4645 for bom
, enc
in BOMS
:
4646 if first_bytes
.startswith(bom
):
4647 s
= first_bytes
[len(bom
):].decode(enc
, 'replace')
4650 s
= first_bytes
.decode('utf-8', 'replace')
4652 return re
.match(r
'^\s*<', s
)
4655 def determine_protocol(info_dict
):
4656 protocol
= info_dict
.get('protocol')
4657 if protocol
is not None:
4660 url
= info_dict
['url']
4661 if url
.startswith('rtmp'):
4663 elif url
.startswith('mms'):
4665 elif url
.startswith('rtsp'):
4668 ext
= determine_ext(url
)
4674 return compat_urllib_parse_urlparse(url
).scheme
4677 def render_table(header_row
, data
, delim
=False, extraGap
=0, hideEmpty
=False):
4678 """ Render a list of rows, each as a list of values """
4680 def get_max_lens(table
):
4681 return [max(len(compat_str(v
)) for v
in col
) for col
in zip(*table
)]
4683 def filter_using_list(row
, filterArray
):
4684 return [col
for (take
, col
) in zip(filterArray
, row
) if take
]
4687 max_lens
= get_max_lens(data
)
4688 header_row
= filter_using_list(header_row
, max_lens
)
4689 data
= [filter_using_list(row
, max_lens
) for row
in data
]
4691 table
= [header_row
] + data
4692 max_lens
= get_max_lens(table
)
4694 table
= [header_row
] + [['-' * ml
for ml
in max_lens
]] + data
4695 format_str
= ' '.join('%-' + compat_str(ml
+ extraGap
) + 's' for ml
in max_lens
[:-1]) + ' %s'
4696 return '\n'.join(format_str
% tuple(row
) for row
in table
)
4699 def _match_one(filter_part
, dct
, incomplete
):
4700 # TODO: Generalize code with YoutubeDL._build_format_filter
4701 STRING_OPERATORS
= {
4702 '*=': operator
.contains
,
4703 '^=': lambda attr
, value
: attr
.startswith(value
),
4704 '$=': lambda attr
, value
: attr
.endswith(value
),
4705 '~=': lambda attr
, value
: re
.search(value
, attr
),
4707 COMPARISON_OPERATORS
= {
4709 '<=': operator
.le
, # "<=" must be defined above "<"
4716 operator_rex
= re
.compile(r
'''(?x)\s*
4718 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
4720 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
4721 (?P<quote>["\'])(?P
<quotedstrval
>.+?
)(?P
=quote
)|
4725 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
4726 m = operator_rex.search(filter_part)
4728 unnegated_op = COMPARISON_OPERATORS[m.group('op')]
4729 if m.group('negation'):
4730 op = lambda attr, value: not unnegated_op(attr, value)
4733 actual_value = dct.get(m.group('key'))
4734 if (m.group('quotedstrval') is not None
4735 or m.group('strval') is not None
4736 # If the original field is a string and matching comparisonvalue is
4737 # a number we should respect the origin of the original field
4738 # and process comparison value as a string (see
4739 # https://github.com/ytdl-org/youtube-dl/issues/11082).
4740 or actual_value is not None and m.group('intval') is not None
4741 and isinstance(actual_value, compat_str)):
4742 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
4743 quote = m.group('quote')
4744 if quote is not None:
4745 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
4747 if m.group('op') in STRING_OPERATORS:
4748 raise ValueError('Operator %s only supports string values!' % m.group('op'))
4750 comparison_value = int(m.group('intval'))
4752 comparison_value = parse_filesize(m.group('intval'))
4753 if comparison_value is None:
4754 comparison_value = parse_filesize(m.group('intval') + 'B')
4755 if comparison_value is None:
4757 'Invalid integer value %r in filter part %r' % (
4758 m.group('intval'), filter_part))
4759 if actual_value is None:
4760 return incomplete or m.group('none_inclusive')
4761 return op(actual_value, comparison_value)
4764 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
4765 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
4767 operator_rex = re.compile(r'''(?x
)\s
*
4768 (?P
<op
>%s)\s
*(?P
<key
>[a
-z_
]+)
4770 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
4771 m = operator_rex.search(filter_part)
4773 op = UNARY_OPERATORS[m.group('op')]
4774 actual_value = dct.get(m.group('key'))
4775 if incomplete and actual_value is None:
4777 return op(actual_value)
4779 raise ValueError('Invalid filter part %r' % filter_part)
4782 def match_str(filter_str, dct, incomplete=False):
4783 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
4784 When incomplete, all conditions passes on missing fields
4787 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
4788 for filter_part in re.split(r'(?<!\\)&', filter_str))
4791 def match_filter_func(filter_str):
4792 def _match_func(info_dict, *args, **kwargs):
4793 if match_str(filter_str, info_dict, *args, **kwargs):
4796 video_title = info_dict.get('title', info_dict.get('id', 'video'))
4797 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
4801 def parse_dfxp_time_expr(time_expr):
4805 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
4807 return float(mobj.group('time_offset'))
4809 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
4811 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT 'HH:MM:SS,mmm' timecode."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d truncates each float component toward zero
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
4818 def dfxp2srt(dfxp_data):
4820 @param dfxp_data A
bytes-like
object containing DFXP data
4821 @returns A
unicode object containing converted SRT data
4823 LEGACY_NAMESPACES = (
4824 (b'http://www.w3.org/ns/ttml', [
4825 b'http://www.w3.org/2004/11/ttaf1',
4826 b'http://www.w3.org/2006/04/ttaf1',
4827 b'http://www.w3.org/2006/10/ttaf1',
4829 (b'http://www.w3.org/ns/ttml#styling', [
4830 b'http://www.w3.org/ns/ttml#style',
4834 SUPPORTED_STYLING = [
4843 _x = functools.partial(xpath_with_ns, ns_map={
4844 'xml': 'http://www.w3.org/XML/1998/namespace',
4845 'ttml': 'http://www.w3.org/ns/ttml',
4846 'tts': 'http://www.w3.org/ns/ttml#styling',
4852 class TTMLPElementParser(object):
4854 _unclosed_elements = []
4855 _applied_styles = []
4857 def start(self, tag, attrib):
4858 if tag in (_x('ttml:br'), 'br'):
4861 unclosed_elements = []
4863 element_style_id = attrib.get('style')
4865 style.update(default_style)
4866 if element_style_id:
4867 style.update(styles.get(element_style_id, {}))
4868 for prop in SUPPORTED_STYLING:
4869 prop_val = attrib.get(_x('tts:' + prop))
4871 style[prop] = prop_val
4874 for k, v in sorted(style.items()):
4875 if self._applied_styles and self._applied_styles[-1].get(k) == v:
4878 font += ' color="%s"' % v
4879 elif k == 'fontSize':
4880 font += ' size="%s"' % v
4881 elif k == 'fontFamily':
4882 font += ' face="%s"' % v
4883 elif k == 'fontWeight' and v == 'bold':
4885 unclosed_elements.append('b')
4886 elif k == 'fontStyle' and v == 'italic':
4888 unclosed_elements.append('i')
4889 elif k == 'textDecoration' and v == 'underline':
4891 unclosed_elements.append('u')
4893 self._out += '<font' + font + '>'
4894 unclosed_elements.append('font')
4896 if self._applied_styles:
4897 applied_style.update(self._applied_styles[-1])
4898 applied_style.update(style)
4899 self._applied_styles.append(applied_style)
4900 self._unclosed_elements.append(unclosed_elements)
4903 if tag not in (_x('ttml:br'), 'br'):
4904 unclosed_elements = self._unclosed_elements.pop()
4905 for element in reversed(unclosed_elements):
4906 self._out += '</%s>' % element
4907 if unclosed_elements and self._applied_styles:
4908 self._applied_styles.pop()
4910 def data(self, data):
4914 return self._out.strip()
4916 def parse_node(node):
4917 target = TTMLPElementParser()
4918 parser = xml.etree.ElementTree.XMLParser(target=target)
4919 parser.feed(xml.etree.ElementTree.tostring(node))
4920 return parser.close()
4922 for k, v in LEGACY_NAMESPACES:
4924 dfxp_data = dfxp_data.replace(ns, k)
4926 dfxp = compat_etree_fromstring(dfxp_data)
4928 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
4931 raise ValueError('Invalid dfxp/TTML subtitle')
4935 for style in dfxp.findall(_x('.//ttml:style')):
4936 style_id = style.get('id') or style.get(_x('xml:id'))
4939 parent_style_id = style.get('style')
4941 if parent_style_id not in styles:
4944 styles[style_id] = styles[parent_style_id].copy()
4945 for prop in SUPPORTED_STYLING:
4946 prop_val = style.get(_x('tts:' + prop))
4948 styles.setdefault(style_id, {})[prop] = prop_val
4954 for p in ('body', 'div'):
4955 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4958 style = styles.get(ele.get('style'))
4961 default_style.update(style)
4963 for para, index in zip(paras, itertools.count(1)):
4964 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
4965 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
4966 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4967 if begin_time is None:
4972 end_time = begin_time + dur
4973 out.append('%d\n%s --> %s\n%s\n\n' % (
4975 srt_subtitles_timecode(begin_time),
4976 srt_subtitles_timecode(end_time),
4982 def cli_option(params, command_option, param):
4983 param = params.get(param)
4985 param = compat_str(param)
4986 return [command_option, param] if param is not None else []
4989 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4990 param = params.get(param)
4993 assert isinstance(param, bool)
4995 return [command_option + separator + (true_value if param else false_value)]
4996 return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] when params[param] equals *expected_value*, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
5004 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
5005 if isinstance(argdict, (list, tuple)): # for backward compatibility
5012 assert isinstance(argdict, dict)
5014 assert isinstance(keys, (list, tuple))
5015 for key_list in keys:
5016 arg_list = list(filter(
5017 lambda x: x is not None,
5018 [argdict.get(key.lower()) for key in variadic(key_list)]))
5020 return [arg for args in arg_list for arg in args]
5024 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
5025 main_key, exe = main_key.lower(), exe.lower()
5026 root_key = exe if main_key == exe else f'{main_key}+{exe}'
5027 keys = [f'{root_key}{k}' for k in (keys or [''])]
5028 if root_key in keys:
5030 keys.append((main_key, exe))
5031 keys.append('default')
5034 return cli_configuration_args(argdict, keys, default, use_compat)
5037 class ISO639Utils(object):
5038 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
5097 'iw': 'heb', # Replaced by he in 1989 revision
5107 'in': 'ind', # Replaced by id in 1989 revision
5222 'ji': 'yid', # Replaced by yi in 1989 revision
5230 def short2long(cls, code):
5231 """Convert language code from ISO 639-1 to ISO 639-2/T"""
5232 return cls._lang_map.get(code[:2])
5235 def long2short(cls, code):
5236 """Convert language code from ISO 639-2/T to ISO 639-1"""
5237 for short_name, long_name in cls._lang_map.items():
5238 if long_name == code:
5242 class ISO3166Utils(object):
5243 # From http://data.okfn.org/data/core/country-list
5245 'AF': 'Afghanistan',
5246 'AX': 'Åland Islands',
5249 'AS': 'American Samoa',
5254 'AG': 'Antigua and Barbuda',
5271 'BO': 'Bolivia, Plurinational State of',
5272 'BQ': 'Bonaire, Sint Eustatius and Saba',
5273 'BA': 'Bosnia and Herzegovina',
5275 'BV': 'Bouvet Island',
5277 'IO': 'British Indian Ocean Territory',
5278 'BN': 'Brunei Darussalam',
5280 'BF': 'Burkina Faso',
5286 'KY': 'Cayman Islands',
5287 'CF': 'Central African Republic',
5291 'CX': 'Christmas Island',
5292 'CC': 'Cocos (Keeling) Islands',
5296 'CD': 'Congo, the Democratic Republic of the',
5297 'CK': 'Cook Islands',
5299 'CI': 'Côte d\'Ivoire',
5304 'CZ': 'Czech Republic',
5308 'DO': 'Dominican Republic',
5311 'SV': 'El Salvador',
5312 'GQ': 'Equatorial Guinea',
5316 'FK': 'Falkland Islands (Malvinas)',
5317 'FO': 'Faroe Islands',
5321 'GF': 'French Guiana',
5322 'PF': 'French Polynesia',
5323 'TF': 'French Southern Territories',
5338 'GW': 'Guinea-Bissau',
5341 'HM': 'Heard Island and McDonald Islands',
5342 'VA': 'Holy See (Vatican City State)',
5349 'IR': 'Iran, Islamic Republic of',
5352 'IM': 'Isle of Man',
5362 'KP': 'Korea, Democratic People\'s Republic of',
5363 'KR': 'Korea, Republic of',
5366 'LA': 'Lao People\'s Democratic Republic',
5372 'LI': 'Liechtenstein',
5376 'MK': 'Macedonia, the Former Yugoslav Republic of',
5383 'MH': 'Marshall Islands',
5389 'FM': 'Micronesia, Federated States of',
5390 'MD': 'Moldova, Republic of',
5401 'NL': 'Netherlands',
5402 'NC': 'New Caledonia',
5403 'NZ': 'New Zealand',
5408 'NF': 'Norfolk Island',
5409 'MP': 'Northern Mariana Islands',
5414 'PS': 'Palestine, State of',
5416 'PG': 'Papua New Guinea',
5419 'PH': 'Philippines',
5423 'PR': 'Puerto Rico',
5427 'RU': 'Russian Federation',
5429 'BL': 'Saint Barthélemy',
5430 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
5431 'KN': 'Saint Kitts and Nevis',
5432 'LC': 'Saint Lucia',
5433 'MF': 'Saint Martin (French part)',
5434 'PM': 'Saint Pierre and Miquelon',
5435 'VC': 'Saint Vincent and the Grenadines',
5438 'ST': 'Sao Tome and Principe',
5439 'SA': 'Saudi Arabia',
5443 'SL': 'Sierra Leone',
5445 'SX': 'Sint Maarten (Dutch part)',
5448 'SB': 'Solomon Islands',
5450 'ZA': 'South Africa',
5451 'GS': 'South Georgia and the South Sandwich Islands',
5452 'SS': 'South Sudan',
5457 'SJ': 'Svalbard and Jan Mayen',
5460 'CH': 'Switzerland',
5461 'SY': 'Syrian Arab Republic',
5462 'TW': 'Taiwan, Province of China',
5464 'TZ': 'Tanzania, United Republic of',
5466 'TL': 'Timor-Leste',
5470 'TT': 'Trinidad and Tobago',
5473 'TM': 'Turkmenistan',
5474 'TC': 'Turks and Caicos Islands',
5478 'AE': 'United Arab Emirates',
5479 'GB': 'United Kingdom',
5480 'US': 'United States',
5481 'UM': 'United States Minor Outlying Islands',
5485 'VE': 'Venezuela, Bolivarian Republic of',
5487 'VG': 'Virgin Islands, British',
5488 'VI': 'Virgin Islands, U.S.',
5489 'WF': 'Wallis and Futuna',
5490 'EH': 'Western Sahara',
5497 def short2full(cls, code):
5498 """Convert an ISO 3166-2 country code to the corresponding full name"""
5499 return cls._country_map.get(code.upper())
5502 class GeoUtils(object):
5503 # Major IPv4 address blocks per country
5505 'AD': '46.172.224.0/19',
5506 'AE': '94.200.0.0/13',
5507 'AF': '149.54.0.0/17',
5508 'AG': '209.59.64.0/18',
5509 'AI': '204.14.248.0/21',
5510 'AL': '46.99.0.0/16',
5511 'AM': '46.70.0.0/15',
5512 'AO': '105.168.0.0/13',
5513 'AP': '182.50.184.0/21',
5514 'AQ': '23.154.160.0/24',
5515 'AR': '181.0.0.0/12',
5516 'AS': '202.70.112.0/20',
5517 'AT': '77.116.0.0/14',
5518 'AU': '1.128.0.0/11',
5519 'AW': '181.41.0.0/18',
5520 'AX': '185.217.4.0/22',
5521 'AZ': '5.197.0.0/16',
5522 'BA': '31.176.128.0/17',
5523 'BB': '65.48.128.0/17',
5524 'BD': '114.130.0.0/16',
5526 'BF': '102.178.0.0/15',
5527 'BG': '95.42.0.0/15',
5528 'BH': '37.131.0.0/17',
5529 'BI': '154.117.192.0/18',
5530 'BJ': '137.255.0.0/16',
5531 'BL': '185.212.72.0/23',
5532 'BM': '196.12.64.0/18',
5533 'BN': '156.31.0.0/16',
5534 'BO': '161.56.0.0/16',
5535 'BQ': '161.0.80.0/20',
5536 'BR': '191.128.0.0/12',
5537 'BS': '24.51.64.0/18',
5538 'BT': '119.2.96.0/19',
5539 'BW': '168.167.0.0/16',
5540 'BY': '178.120.0.0/13',
5541 'BZ': '179.42.192.0/18',
5542 'CA': '99.224.0.0/11',
5543 'CD': '41.243.0.0/16',
5544 'CF': '197.242.176.0/21',
5545 'CG': '160.113.0.0/16',
5546 'CH': '85.0.0.0/13',
5547 'CI': '102.136.0.0/14',
5548 'CK': '202.65.32.0/19',
5549 'CL': '152.172.0.0/14',
5550 'CM': '102.244.0.0/14',
5551 'CN': '36.128.0.0/10',
5552 'CO': '181.240.0.0/12',
5553 'CR': '201.192.0.0/12',
5554 'CU': '152.206.0.0/15',
5555 'CV': '165.90.96.0/19',
5556 'CW': '190.88.128.0/17',
5557 'CY': '31.153.0.0/16',
5558 'CZ': '88.100.0.0/14',
5560 'DJ': '197.241.0.0/17',
5561 'DK': '87.48.0.0/12',
5562 'DM': '192.243.48.0/20',
5563 'DO': '152.166.0.0/15',
5564 'DZ': '41.96.0.0/12',
5565 'EC': '186.68.0.0/15',
5566 'EE': '90.190.0.0/15',
5567 'EG': '156.160.0.0/11',
5568 'ER': '196.200.96.0/20',
5569 'ES': '88.0.0.0/11',
5570 'ET': '196.188.0.0/14',
5571 'EU': '2.16.0.0/13',
5572 'FI': '91.152.0.0/13',
5573 'FJ': '144.120.0.0/16',
5574 'FK': '80.73.208.0/21',
5575 'FM': '119.252.112.0/20',
5576 'FO': '88.85.32.0/19',
5578 'GA': '41.158.0.0/15',
5580 'GD': '74.122.88.0/21',
5581 'GE': '31.146.0.0/16',
5582 'GF': '161.22.64.0/18',
5583 'GG': '62.68.160.0/19',
5584 'GH': '154.160.0.0/12',
5585 'GI': '95.164.0.0/16',
5586 'GL': '88.83.0.0/19',
5587 'GM': '160.182.0.0/15',
5588 'GN': '197.149.192.0/18',
5589 'GP': '104.250.0.0/19',
5590 'GQ': '105.235.224.0/20',
5591 'GR': '94.64.0.0/13',
5592 'GT': '168.234.0.0/16',
5593 'GU': '168.123.0.0/16',
5594 'GW': '197.214.80.0/20',
5595 'GY': '181.41.64.0/18',
5596 'HK': '113.252.0.0/14',
5597 'HN': '181.210.0.0/16',
5598 'HR': '93.136.0.0/13',
5599 'HT': '148.102.128.0/17',
5600 'HU': '84.0.0.0/14',
5601 'ID': '39.192.0.0/10',
5602 'IE': '87.32.0.0/12',
5603 'IL': '79.176.0.0/13',
5604 'IM': '5.62.80.0/20',
5605 'IN': '117.192.0.0/10',
5606 'IO': '203.83.48.0/21',
5607 'IQ': '37.236.0.0/14',
5608 'IR': '2.176.0.0/12',
5609 'IS': '82.221.0.0/16',
5610 'IT': '79.0.0.0/10',
5611 'JE': '87.244.64.0/18',
5612 'JM': '72.27.0.0/17',
5613 'JO': '176.29.0.0/16',
5614 'JP': '133.0.0.0/8',
5615 'KE': '105.48.0.0/12',
5616 'KG': '158.181.128.0/17',
5617 'KH': '36.37.128.0/17',
5618 'KI': '103.25.140.0/22',
5619 'KM': '197.255.224.0/20',
5620 'KN': '198.167.192.0/19',
5621 'KP': '175.45.176.0/22',
5622 'KR': '175.192.0.0/10',
5623 'KW': '37.36.0.0/14',
5624 'KY': '64.96.0.0/15',
5625 'KZ': '2.72.0.0/13',
5626 'LA': '115.84.64.0/18',
5627 'LB': '178.135.0.0/16',
5628 'LC': '24.92.144.0/20',
5629 'LI': '82.117.0.0/19',
5630 'LK': '112.134.0.0/15',
5631 'LR': '102.183.0.0/16',
5632 'LS': '129.232.0.0/17',
5633 'LT': '78.56.0.0/13',
5634 'LU': '188.42.0.0/16',
5635 'LV': '46.109.0.0/16',
5636 'LY': '41.252.0.0/14',
5637 'MA': '105.128.0.0/11',
5638 'MC': '88.209.64.0/18',
5639 'MD': '37.246.0.0/16',
5640 'ME': '178.175.0.0/17',
5641 'MF': '74.112.232.0/21',
5642 'MG': '154.126.0.0/17',
5643 'MH': '117.103.88.0/21',
5644 'MK': '77.28.0.0/15',
5645 'ML': '154.118.128.0/18',
5646 'MM': '37.111.0.0/17',
5647 'MN': '49.0.128.0/17',
5648 'MO': '60.246.0.0/16',
5649 'MP': '202.88.64.0/20',
5650 'MQ': '109.203.224.0/19',
5651 'MR': '41.188.64.0/18',
5652 'MS': '208.90.112.0/22',
5653 'MT': '46.11.0.0/16',
5654 'MU': '105.16.0.0/12',
5655 'MV': '27.114.128.0/18',
5656 'MW': '102.70.0.0/15',
5657 'MX': '187.192.0.0/11',
5658 'MY': '175.136.0.0/13',
5659 'MZ': '197.218.0.0/15',
5660 'NA': '41.182.0.0/16',
5661 'NC': '101.101.0.0/18',
5662 'NE': '197.214.0.0/18',
5663 'NF': '203.17.240.0/22',
5664 'NG': '105.112.0.0/12',
5665 'NI': '186.76.0.0/15',
5666 'NL': '145.96.0.0/11',
5667 'NO': '84.208.0.0/13',
5668 'NP': '36.252.0.0/15',
5669 'NR': '203.98.224.0/19',
5670 'NU': '49.156.48.0/22',
5671 'NZ': '49.224.0.0/14',
5672 'OM': '5.36.0.0/15',
5673 'PA': '186.72.0.0/15',
5674 'PE': '186.160.0.0/14',
5675 'PF': '123.50.64.0/18',
5676 'PG': '124.240.192.0/19',
5677 'PH': '49.144.0.0/13',
5678 'PK': '39.32.0.0/11',
5679 'PL': '83.0.0.0/11',
5680 'PM': '70.36.0.0/20',
5681 'PR': '66.50.0.0/16',
5682 'PS': '188.161.0.0/16',
5683 'PT': '85.240.0.0/13',
5684 'PW': '202.124.224.0/20',
5685 'PY': '181.120.0.0/14',
5686 'QA': '37.210.0.0/15',
5687 'RE': '102.35.0.0/16',
5688 'RO': '79.112.0.0/13',
5689 'RS': '93.86.0.0/15',
5690 'RU': '5.136.0.0/13',
5691 'RW': '41.186.0.0/16',
5692 'SA': '188.48.0.0/13',
5693 'SB': '202.1.160.0/19',
5694 'SC': '154.192.0.0/11',
5695 'SD': '102.120.0.0/13',
5696 'SE': '78.64.0.0/12',
5697 'SG': '8.128.0.0/10',
5698 'SI': '188.196.0.0/14',
5699 'SK': '78.98.0.0/15',
5700 'SL': '102.143.0.0/17',
5701 'SM': '89.186.32.0/19',
5702 'SN': '41.82.0.0/15',
5703 'SO': '154.115.192.0/18',
5704 'SR': '186.179.128.0/17',
5705 'SS': '105.235.208.0/21',
5706 'ST': '197.159.160.0/19',
5707 'SV': '168.243.0.0/16',
5708 'SX': '190.102.0.0/20',
5710 'SZ': '41.84.224.0/19',
5711 'TC': '65.255.48.0/20',
5712 'TD': '154.68.128.0/19',
5713 'TG': '196.168.0.0/14',
5714 'TH': '171.96.0.0/13',
5715 'TJ': '85.9.128.0/18',
5716 'TK': '27.96.24.0/21',
5717 'TL': '180.189.160.0/20',
5718 'TM': '95.85.96.0/19',
5719 'TN': '197.0.0.0/11',
5720 'TO': '175.176.144.0/21',
5721 'TR': '78.160.0.0/11',
5722 'TT': '186.44.0.0/15',
5723 'TV': '202.2.96.0/19',
5724 'TW': '120.96.0.0/11',
5725 'TZ': '156.156.0.0/14',
5726 'UA': '37.52.0.0/14',
5727 'UG': '102.80.0.0/13',
5729 'UY': '167.56.0.0/13',
5730 'UZ': '84.54.64.0/18',
5731 'VA': '212.77.0.0/19',
5732 'VC': '207.191.240.0/21',
5733 'VE': '186.88.0.0/13',
5734 'VG': '66.81.192.0/20',
5735 'VI': '146.226.0.0/16',
5736 'VN': '14.160.0.0/11',
5737 'VU': '202.80.32.0/20',
5738 'WF': '117.20.32.0/21',
5739 'WS': '202.4.32.0/19',
5740 'YE': '134.35.0.0/16',
5741 'YT': '41.242.116.0/22',
5742 'ZA': '41.0.0.0/11',
5743 'ZM': '102.144.0.0/13',
5744 'ZW': '102.177.192.0/18',
5748 def random_ipv4(cls, code_or_block):
5749 if len(code_or_block) == 2:
5750 block = cls._country_ip_map.get(code_or_block.upper())
5754 block = code_or_block
5755 addr, preflen = block.split('/')
5756 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
5757 addr_max = addr_min | (0xffffffff >> int(preflen))
5758 return compat_str(socket.inet_ntoa(
5759 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy' header.

    The special value '__noproxy__' disables proxying for that request;
    SOCKS proxies are signalled to the http(s) handlers via the
    'Ytdl-socks-proxy' header instead of being opened here.
    """

    def __init__(self, proxies=None):
        # Set default handlers routing through proxy_open.
        # FIX: loop variable renamed from `type` to avoid shadowing the builtin
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy overrides the handler default
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5787 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
5788 # released into Public Domain
5789 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n <= 0:
        # matches the old struct-pack loop, which yielded one NUL byte for n == 0
        s = b'\000'
    else:
        # int.to_bytes replaces the old quadratic pack-and-prepend loop
        # plus the manual leading-zero strip
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # int.from_bytes replaces the manual zero-padding and
    # 4-byte-chunk accumulation loop; b'' yields 0 as before
    return int.from_bytes(s, 'big')
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # int.from_bytes(..., 'little') == int(binascii.hexlify(data[::-1]), 16),
    # but also handles empty input (-> 0) instead of raising ValueError
    payload = int.from_bytes(bytes(data), 'little')
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # FIX: PKCS#1 v1.5 (RFC 8017, EME-PKCS1-v1_5) requires the padding
    # string PS to consist of NON-zero octets; randint(0, 254) could emit
    # a 0, which would prematurely terminate the padding on decode.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer *num* in base *n*.

    A custom digit *table* may be supplied; otherwise the first *n*
    characters of 0-9a-zA-Z are used. Raises ValueError when the base
    exceeds the table length.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    digits = []
    while num:
        num, rem = divmod(num, n)
        digits.append(table[rem])
    return ''.join(reversed(digits))
def decode_packed_codes(code):
    """Decode JavaScript obfuscated with Dean Edwards' p.a.c.k.e.r."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    # Map each base-n token back to its original symbol
    symbol_table = {}
    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
def caesar(s, alphabet, shift):
    """Apply a Caesar shift to *s* over *alphabet*.

    Characters outside *alphabet* are left untouched; shift 0 is a no-op.
    """
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(c):
        pos = alphabet.find(c)
        return c if pos < 0 else alphabet[(pos + shift) % size]

    return ''.join(map(rotate, s))
5916 return caesar(s
, r
'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list string into a dict.

    Values may be double-quoted (quotes are stripped) or bare.
    """
    info = {}
    for key, val in re.findall(
            r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
def urshift(val, n):
    """Unsigned 32-bit right shift (JavaScript's `>>>` for integers)."""
    if val < 0:
        val += 0x100000000
    return val >> n
5932 # Based on png2str() written by @gdkchan and improved by @yokrysty
5933 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
5934 def decode_png(png_data
):
5935 # Reference: https://www.w3.org/TR/PNG/
5936 header
= png_data
[8:]
5938 if png_data
[:8] != b
'\x89PNG\x0d\x0a\x1a\x0a' or header
[4:8] != b
'IHDR':
5939 raise IOError('Not a valid PNG file.')
5941 int_map
= {1: '>B', 2: '>H', 4: '>I'}
5942 unpack_integer
= lambda x
: compat_struct_unpack(int_map
[len(x
)], x
)[0]
5947 length
= unpack_integer(header
[:4])
5950 chunk_type
= header
[:4]
5953 chunk_data
= header
[:length
]
5954 header
= header
[length
:]
5956 header
= header
[4:] # Skip CRC
5964 ihdr
= chunks
[0]['data']
5966 width
= unpack_integer(ihdr
[:4])
5967 height
= unpack_integer(ihdr
[4:8])
5971 for chunk
in chunks
:
5972 if chunk
['type'] == b
'IDAT':
5973 idat
+= chunk
['data']
5976 raise IOError('Unable to read PNG data.')
5978 decompressed_data
= bytearray(zlib
.decompress(idat
))
5983 def _get_pixel(idx
):
5988 for y
in range(height
):
5989 basePos
= y
* (1 + stride
)
5990 filter_type
= decompressed_data
[basePos
]
5994 pixels
.append(current_row
)
5996 for x
in range(stride
):
5997 color
= decompressed_data
[1 + basePos
+ x
]
5998 basex
= y
* stride
+ x
6003 left
= _get_pixel(basex
- 3)
6005 up
= _get_pixel(basex
- stride
)
6007 if filter_type
== 1: # Sub
6008 color
= (color
+ left
) & 0xff
6009 elif filter_type
== 2: # Up
6010 color
= (color
+ up
) & 0xff
6011 elif filter_type
== 3: # Average
6012 color
= (color
+ ((left
+ up
) >> 1)) & 0xff
6013 elif filter_type
== 4: # Paeth
6019 c
= _get_pixel(basex
- stride
- 3)
6027 if pa
<= pb
and pa
<= pc
:
6028 color
= (color
+ a
) & 0xff
6030 color
= (color
+ b
) & 0xff
6032 color
= (color
+ c
) & 0xff
6034 current_row
.append(color
)
6036 return width
, height
, pixels
6039 def write_xattr(path
, key
, value
):
6040 # This mess below finds the best xattr tool for the job
6042 # try the pyxattr module...
6045 if hasattr(xattr
, 'set'): # pyxattr
6046 # Unicode arguments are not supported in python-pyxattr until
6048 # See https://github.com/ytdl-org/youtube-dl/issues/5498
6049 pyxattr_required_version
= '0.5.0'
6050 if version_tuple(xattr
.__version
__) < version_tuple(pyxattr_required_version
):
6051 # TODO: fallback to CLI tools
6052 raise XAttrUnavailableError(
6053 'python-pyxattr is detected but is too old. '
6054 'yt-dlp requires %s or above while your version is %s. '
6055 'Falling back to other xattr implementations' % (
6056 pyxattr_required_version
, xattr
.__version
__))
6058 setxattr
= xattr
.set
6060 setxattr
= xattr
.setxattr
6063 setxattr(path
, key
, value
)
6064 except EnvironmentError as e
:
6065 raise XAttrMetadataError(e
.errno
, e
.strerror
)
6068 if compat_os_name
== 'nt':
6069 # Write xattrs to NTFS Alternate Data Streams:
6070 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
6071 assert ':' not in key
6072 assert os
.path
.exists(path
)
6074 ads_fn
= path
+ ':' + key
6076 with open(ads_fn
, 'wb') as f
:
6078 except EnvironmentError as e
:
6079 raise XAttrMetadataError(e
.errno
, e
.strerror
)
6081 user_has_setfattr
= check_executable('setfattr', ['--version'])
6082 user_has_xattr
= check_executable('xattr', ['-h'])
6084 if user_has_setfattr
or user_has_xattr
:
6086 value
= value
.decode('utf-8')
6087 if user_has_setfattr
:
6088 executable
= 'setfattr'
6089 opts
= ['-n', key
, '-v', value
]
6090 elif user_has_xattr
:
6091 executable
= 'xattr'
6092 opts
= ['-w', key
, value
]
6094 cmd
= ([encodeFilename(executable
, True)]
6095 + [encodeArgument(o
) for o
in opts
]
6096 + [encodeFilename(path
, True)])
6099 p
= subprocess
.Popen(
6100 cmd
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
, stdin
=subprocess
.PIPE
)
6101 except EnvironmentError as e
:
6102 raise XAttrMetadataError(e
.errno
, e
.strerror
)
6103 stdout
, stderr
= process_communicate_or_kill(p
)
6104 stderr
= stderr
.decode('utf-8', 'replace')
6105 if p
.returncode
!= 0:
6106 raise XAttrMetadataError(p
.returncode
, stderr
)
6109 # On Unix, and can't find pyxattr, setfattr, or xattr.
6110 if sys
.platform
.startswith('linux'):
6111 raise XAttrUnavailableError(
6112 "Couldn't find a tool to set the xattrs. "
6113 "Install either the python 'pyxattr' or 'xattr' "
6114 "modules, or the GNU 'attr' package "
6115 "(which contains the 'setfattr' tool).")
6117 raise XAttrUnavailableError(
6118 "Couldn't find a tool to set the xattrs. "
6119 "Install either the python 'xattr' module, "
6120 "or the 'xattr' binary.")
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to the components of a
    random date between 1950-01-01 and 1995-12-31 (as decimal strings)."""
    first = datetime.date(1950, 1, 1)
    last = datetime.date(1995, 12, 31)
    chosen = first + datetime.timedelta(random.randint(0, (last - first).days))
    return {
        year_field: str(chosen.year),
        month_field: str(chosen.month),
        day_field: str(chosen.day),
    }
6135 # Templates for internet shortcut files, which are plain text files.
6136 DOT_URL_LINK_TEMPLATE
= '''
6141 DOT_WEBLOC_LINK_TEMPLATE
= '''
6142 <?xml version="1.0" encoding="UTF-8"?>
6143 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
6144 <plist version="1.0">
6147 \t<string>%(url)s</string>
6152 DOT_DESKTOP_LINK_TEMPLATE
= '''
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """
    iri_parts = compat_urllib_parse_urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
            net_location,

            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
def to_high_limit_path(path):
    """Prefix *path* with \\\\?\\ on Windows to lift the MAX_PATH limit.

    On other platforms the path is returned unchanged.
    """
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # Work around MAX_PATH limitation on Windows. The maximum allowed length
    # for individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    """Fetch a value (*obj* itself, or *obj*[*field*]), optionally transform
    it with *func*, and interpolate it into *template*.

    Values contained in *ignore* (before or after *func*) yield *default*.
    """
    if field is None:
        val = default if obj is None else obj
    else:
        val = obj.get(field, default)
    if func and val not in ignore:
        val = func(val)
    # Re-check after func: a transformed value may itself be ignorable
    return default if val in ignore else template % val
6224 def clean_podcast_url(url
):
6225 return re
.sub(r
'''(?x)
6229 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
6232 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
6235 cn\.co| # https://podcorn.com/analytics-prefix/
6236 st\.fm # https://podsights.com/docs/
6241 _HEX_TABLE
= '0123456789abcdef'
def random_uuidv4():
    """Return a random RFC 4122 version-4 UUID string.

    FIX: the previous implementation filled the variant nibble (the 'y'
    position, which must be one of 8/9/a/b) with an arbitrary hex digit;
    uuid.uuid4() sets the version and variant bits correctly.
    """
    import uuid
    return str(uuid.uuid4())
def make_dir(path, to_screen=None):
    """Ensure the parent directory of *path* exists.

    Returns True on success, False on failure; failures are optionally
    reported through the *to_screen* callable.
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except (OSError, IOError) as err:
        # FIX: was `if callable(to_screen) is not None:`, which is always
        # true (callable() returns a bool) and crashed when to_screen=None
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
def get_executable_path():
    """Return the absolute base directory the program is being run from."""
    from zipimport import zipimporter
    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        base = os.path.dirname(sys.executable)
    elif isinstance(globals().get('__loader__'), zipimporter):  # Running from ZIP
        base = os.path.join(os.path.dirname(__file__), '../..')
    else:
        base = os.path.join(os.path.dirname(__file__), '..')
    return os.path.abspath(base)
6271 def load_plugins(name
, suffix
, namespace
):
6272 plugin_info
= [None]
6275 plugin_info
= imp
.find_module(
6276 name
, [os
.path
.join(get_executable_path(), 'ytdlp_plugins')])
6277 plugins
= imp
.load_module(name
, *plugin_info
)
6278 for name
in dir(plugins
):
6279 if name
in namespace
:
6281 if not name
.endswith(suffix
):
6283 klass
= getattr(plugins
, name
)
6284 classes
.append(klass
)
6285 namespace
[name
] = klass
6289 if plugin_info
[0] is not None:
6290 plugin_info
[0].close()
6295 obj
, *path_list
, default
=None, expected_type
=None, get_all
=True,
6296 casesense
=True, is_user_input
=False, traverse_string
=False):
6297 ''' Traverse nested list/dict/tuple
6298 @param path_list A list of paths which are checked one by one.
6299 Each path is a list of keys where each key is a string,
6300 a tuple of strings or "...". When a tuple is given,
6301 all the keys given in the tuple are traversed, and
6302 "..." traverses all the keys in the object
6303 @param default Default value to return
6304 @param expected_type Only accept final value of this type (Can also be any callable)
6305 @param get_all Return all the values obtained from a path or only the first one
6306 @param casesense Whether to consider dictionary keys as case sensitive
6307 @param is_user_input Whether the keys are generated from user input. If True,
6308 strings are converted to int/slice if necessary
6309 @param traverse_string Whether to traverse inside strings. If True, any
6310 non-compatible object will also be converted into a string
6314 _lower
= lambda k
: (k
.lower() if isinstance(k
, str) else k
)
6315 path_list
= (map(_lower
, variadic(path
)) for path
in path_list
)
6317 def _traverse_obj(obj
, path
, _current_depth
=0):
6321 path
= tuple(variadic(path
))
6322 for i
, key
in enumerate(path
):
6323 if isinstance(key
, (list, tuple)):
6324 obj
= [_traverse_obj(obj
, sub_key
, _current_depth
) for sub_key
in key
]
6327 obj
= (obj
.values() if isinstance(obj
, dict)
6328 else obj
if isinstance(obj
, (list, tuple, LazyList
))
6329 else str(obj
) if traverse_string
else [])
6331 depth
= max(depth
, _current_depth
)
6332 return [_traverse_obj(inner_obj
, path
[i
+ 1:], _current_depth
) for inner_obj
in obj
]
6333 elif isinstance(obj
, dict) and not (is_user_input
and key
== ':'):
6334 obj
= (obj
.get(key
) if casesense
or (key
in obj
)
6335 else next((v
for k
, v
in obj
.items() if _lower(k
) == key
), None))
6338 key
= (int_or_none(key
) if ':' not in key
6339 else slice(*map(int_or_none
, key
.split(':'))))
6340 if key
== slice(None):
6341 return _traverse_obj(obj
, (..., *path
[i
+ 1:]), _current_depth
)
6342 if not isinstance(key
, (int, slice)):
6344 if not isinstance(obj
, (list, tuple, LazyList
)):
6345 if not traverse_string
:
6354 if isinstance(expected_type
, type):
6355 type_test
= lambda val
: val
if isinstance(val
, expected_type
) else None
6356 elif expected_type
is not None:
6357 type_test
= expected_type
6359 type_test
= lambda val
: val
6361 for path
in path_list
:
6363 val
= _traverse_obj(obj
, path
)
6366 for _
in range(depth
- 1):
6367 val
= itertools
.chain
.from_iterable(v
for v
in val
if v
is not None)
6368 val
= [v
for v
in map(type_test
, val
) if v
is not None]
6370 return val
if get_all
else val
[0]
6372 val
= type_test(val
)
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated wrapper around traverse_obj, kept for backward
    compatibility. Do not use."""
    return traverse_obj(
        dictn, keys,
        casesense=casesense, is_user_input=True, traverse_string=True)
def variadic(x, allowed_types=(str, bytes)):
    """Return *x* itself when it is a non-atomic iterable, else the 1-tuple (x,).

    Strings/bytes (or any of *allowed_types*) are treated as atomic values.
    """
    is_sequence = (
        isinstance(x, collections.abc.Iterable)
        and not isinstance(x, allowed_types))
    return x if is_sequence else (x,)
6388 # create a JSON Web Signature (jws) with HS256 algorithm
6389 # the resulting format is in JWS Compact Serialization
6390 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
6391 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
6392 def jwt_encode_hs256(payload_data
, key
, headers
={}):
6398 header_data
.update(headers
)
6399 header_b64
= base64
.b64encode(json
.dumps(header_data
).encode('utf-8'))
6400 payload_b64
= base64
.b64encode(json
.dumps(payload_data
).encode('utf-8'))
6401 h
= hmac
.new(key
.encode('utf-8'), header_b64
+ b
'.' + payload_b64
, hashlib
.sha256
)
6402 signature_b64
= base64
.b64encode(h
.digest())
6403 token
= header_b64
+ b
'.' + payload_b64
+ b
'.' + signature_b64