4 from __future__
import unicode_literals
37 import xml
.etree
.ElementTree
41 compat_HTMLParseError
,
47 compat_ctypes_WINFUNCTYPE
,
48 compat_etree_fromstring
,
51 compat_html_entities_html5
,
64 compat_urllib_parse_urlencode
,
65 compat_urllib_parse_urlparse
,
66 compat_urllib_parse_urlunparse
,
67 compat_urllib_parse_quote
,
68 compat_urllib_parse_quote_plus
,
69 compat_urllib_parse_unquote_plus
,
70 compat_urllib_request
,
def register_socks_protocols():
    """Teach urlsplit() that SOCKS schemes carry a netloc.

    Workaround for https://bugs.python.org/issue7904 (Python < 2.6.5):
    URLs whose scheme is missing from urlparse.uses_netloc are parsed
    incorrectly, so the SOCKS schemes are appended on demand.
    """
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme in compat_urlparse.uses_netloc:
            continue
        compat_urlparse.uses_netloc.append(scheme)
# Type object of a compiled regular expression pattern, used for isinstance()
# checks elsewhere; the stdlib does not clearly define/export this otherwise.
compiled_regex_type = type(re.compile(''))
94 def random_user_agent():
95 _USER_AGENT_TPL
= 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
1674 return _USER_AGENT_TPL
% random
.choice(_CHROME_VERSIONS
)
1678 'User-Agent': random_user_agent(),
1679 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
1680 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
1681 'Accept-Encoding': 'gzip, deflate',
1682 'Accept-Language': 'en-us,en;q=0.5',
1687 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
1691 NO_DEFAULT
= object()
1693 ENGLISH_MONTH_NAMES
= [
1694 'January', 'February', 'March', 'April', 'May', 'June',
1695 'July', 'August', 'September', 'October', 'November', 'December']
1698 'en': ENGLISH_MONTH_NAMES
,
1700 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
1701 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
1704 KNOWN_EXTENSIONS
= (
1705 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
1706 'flv', 'f4v', 'f4a', 'f4b',
1707 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
1708 'mkv', 'mka', 'mk3d',
1711 'asf', 'wmv', 'wma',
1717 'f4f', 'f4m', 'm3u8', 'smil')
1719 # needed for sanitizing filenames in restricted mode
1720 ACCENT_CHARS
= dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
1721 itertools
.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
1722 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
1746 '%Y/%m/%d %H:%M:%S',
1750 '%Y-%m-%d %H:%M:%S',
1751 '%Y-%m-%d %H:%M:%S.%f',
1752 '%Y-%m-%d %H:%M:%S:%f',
1755 '%Y-%m-%dT%H:%M:%SZ',
1756 '%Y-%m-%dT%H:%M:%S.%fZ',
1757 '%Y-%m-%dT%H:%M:%S.%f0Z',
1758 '%Y-%m-%dT%H:%M:%S',
1759 '%Y-%m-%dT%H:%M:%S.%f',
1761 '%b %d %Y at %H:%M',
1762 '%b %d %Y at %H:%M:%S',
1763 '%B %d %Y at %H:%M',
1764 '%B %d %Y at %H:%M:%S',
1768 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
1769 DATE_FORMATS_DAY_FIRST
.extend([
1775 '%d/%m/%Y %H:%M:%S',
1778 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
1779 DATE_FORMATS_MONTH_FIRST
.extend([
1784 '%m/%d/%Y %H:%M:%S',
# Matches the argument list of a P.A.C.K.E.R. eval-packed JavaScript payload:
# }('payload', radix, count, 'word|list'.split('|')
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Captures the body of a <script type="application/ld+json"> block; the
# quote style of the type attribute is matched via the \1 backreference.
# (Restored: the pattern text was garbled in this copy of the file.)
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    pref = locale.getpreferredencoding()
    # NOTE(review): the remainder of this function (validation of `pref` and
    # the return statement) is elided from this chunk — confirm upstream.
1806 def write_json_file(obj, fn):
1807 """ Encode obj as JSON and write it to fn, atomically if possible """
1809 fn = encodeFilename(fn)
1810 if sys.version_info < (3, 0) and sys.platform != 'win32
':
1811 encoding = get_filesystem_encoding()
1812 # os.path.basename returns a bytes object, but NamedTemporaryFile
1813 # will fail if the filename contains non ascii characters unless we
1814 # use a unicode object
1815 path_basename = lambda f: os.path.basename(fn).decode(encoding)
1816 # the same for os.path.dirname
1817 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
1819 path_basename = os.path.basename
1820 path_dirname = os.path.dirname
1824 'prefix
': path_basename(fn) + '.',
1825 'dir': path_dirname(fn),
1829 # In Python 2.x, json.dump expects a bytestream.
1830 # In Python 3.x, it writes to a character stream
1831 if sys.version_info < (3, 0):
1836 'encoding
': 'utf
-8',
1839 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
1844 if sys.platform == 'win32
':
1845 # Need to remove existing file on Windows, else os.rename raises
1846 # WindowsError or FileExistsError.
1854 os.chmod(tf.name, 0o666 & ~mask)
1857 os.rename(tf.name, fn)
1866 if sys.version_info >= (2, 7):
1867 def find_xpath_attr(node, xpath, key, val=None):
1868 """ Find the xpath xpath[@key=val] """
1869 assert re.match(r'^
[a
-zA
-Z_
-]+$
', key)
1870 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
1871 return node.find(expr)
1873 def find_xpath_attr(node, xpath, key, val=None):
1874 for f in node.findall(compat_xpath(xpath)):
1875 if key not in f.attrib:
1877 if val is None or f.attrib.get(key) == val:
1881 # On python2.6 the xml.etree.ElementTree.Element methods don't support
1882 # the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of *path* into '{uri}tag' form using *ns_map*."""
    expanded = []
    for step in path.split('/'):
        pieces = step.split(':')
        if len(pieces) == 1:
            # No namespace prefix on this step; keep it as-is.
            expanded.append(pieces[0])
        else:
            prefix, tag = pieces
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching *xpath* (a string, or an iterable of
    candidate xpaths tried in order).

    Returns *default* when given and nothing matches; raises ExtractorError
    when fatal=True and no default was supplied; otherwise returns None.
    """
    def _find(xp):
        return node.find(compat_xpath(xp))

    if isinstance(xpath, (str, compat_str)):
        n = _find(xpath)
    else:
        # Try each candidate xpath, keeping the first hit.
        n = next((m for m in map(_find, xpath) if m is not None), None)

    if n is None:
        if default is not NO_DEFAULT:
            return default
        if fatal:
            raise ExtractorError(
                'Could not find XML element %s' % (xpath if name is None else name))
        return None
    return n
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's text content."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        # Element lookup already resolved to the default / no match.
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        if fatal:
            raise ExtractorError(
                'Could not find XML element\'s text %s'
                % (xpath if name is None else name))
        return None
    return n.text
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute *key* of the element matching xpath[@key].

    Falls back to *default* when given; raises ExtractorError when fatal=True
    and there is no default; otherwise returns None on a miss.
    """
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        if fatal:
            raise ExtractorError(
                'Could not find XML attribute %s'
                % ('%s[@%s]' % (xpath, key) if name is None else name))
        return None
    return n.attrib[key]
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An ID lookup is just an attribute lookup with exact matching.
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag carrying attribute=value, or None."""
    return next(
        iter(get_elements_by_attribute(attribute, value, html, escape_value)),
        None)
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # A class attribute is a space-separated token list, so match the class
    # name as a whole word anywhere inside the (already-regex) attribute value.
    class_value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
1971 def get_elements_by_attribute(attribute, value, html, escape_value=True):
1972 """Return the content of the tag with the specified attribute in the passed HTML document"""
1974 value = re.escape(value) if escape_value else value
1977 for m in re.finditer(r'''(?xs)
1979 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^
']*'|
))*?
1981 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^
"]*"|
='[^']*'|))*?
1985 ''' % (re.escape(attribute), value), html):
1986 res = m.group('content
')
1988 if res.startswith('"') or res.startswith("'"):
1991 retlist.append(unescapeHTML(res))
1996 class HTMLAttributeParser(compat_HTMLParser):
1997 """Trivial HTML parser to gather the attributes for a single element"""
2001 compat_HTMLParser.__init__(self)
2003 def handle_starttag(self, tag, attrs):
2004 self.attrs = dict(attrs)
2007 def extract_attributes(html_element):
2008 """Given a string for an HTML element such as
2010 a="foo" B="bar" c="&98;az" d=boz
2011 empty= noval entity="&"
2014 Decode and return a dictionary of attributes.
2016 'a
': 'foo
', 'b
': 'bar
', c: 'baz
', d: 'boz
',
2017 'empty
': '', 'noval
': None, 'entity
': '&',
2018 'sq
': '"', 'dq': '\''
2020 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
2021 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
2023 parser = HTMLAttributeParser()
2025 parser.feed(html_element)
2027 # Older Python may throw HTMLParseError in case of malformed HTML
2028 except compat_HTMLParseError:
2033 def clean_html(html):
2034 """Clean an HTML snippet into a readable string"""
2036 if html is None: # Convenience for sanitizing descriptions etc.
2040 html = html.replace('\n', ' ')
2041 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
2042 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
2044 html = re.sub('<.*?>', '', html)
2045 # Replace html entities
2046 html = unescapeHTML(html)
2050 def sanitize_open(filename, open_mode):
2051 """Try to open the given filename, and slightly tweak it if this fails.
2053 Attempts to open the given filename. If this fails, it tries to change
2054 the filename slightly, step by step, until it's either able to open it
2055 or it fails and raises a final exception, like the standard open()
2058 It returns the tuple (stream, definitive_file_name).
2062 if sys.platform == 'win32':
2064 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
2065 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
2066 stream = open(encodeFilename(filename), open_mode)
2067 return (stream, filename)
2068 except (IOError, OSError) as err:
2069 if err.errno in (errno.EACCES,):
2072 # In case of error, try to remove win32 forbidden chars
2073 alt_filename = sanitize_path(filename)
2074 if alt_filename == filename:
2077 # An exception here should be caught in the caller
2078 stream = open(encodeFilename(alt_filename), open_mode)
2079 return (stream, alt_filename)
2082 def timeconvert(timestr):
2083 """Convert RFC 2822 defined time string into system timestamp"""
2085 timetuple = email.utils.parsedate_tz(timestr)
2086 if timetuple is not None:
2087 timestamp = email.utils.mktime_tz(timetuple)
2091 def sanitize_filename(s, restricted=False, is_id=False):
2092 """Sanitizes a string so it could be used as part of a filename.
2093 If restricted is set, use a stricter subset of allowed characters.
2094 Set is_id if this is not an arbitrary string, but an ID that should be kept
2097 def replace_insane(char):
2098 if restricted and char in ACCENT_CHARS:
2099 return ACCENT_CHARS[char]
2100 if char == '?' or ord(char) < 32 or ord(char) == 127:
2103 return '' if restricted else '\''
2105 return '_
-' if restricted else ' -'
2106 elif char in '\\/|
*<>':
2108 if restricted and (char in '!&\'()[]{}$
;`^
,#' or char.isspace()):
2110 if restricted
and ord(char
) > 127:
2117 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
2118 result
= ''.join(map(replace_insane
, s
))
2120 while '__' in result
:
2121 result
= result
.replace('__', '_')
2122 result
= result
.strip('_')
2123 # Common case of "Foreign band name - English song title"
2124 if restricted
and result
.startswith('-_'):
2126 if result
.startswith('-'):
2127 result
= '_' + result
[len('-'):]
2128 result
= result
.lstrip('.')
2134 def sanitize_path(s
, force
=False):
2135 """Sanitizes and normalizes path on Windows"""
2136 if sys
.platform
== 'win32':
2138 drive_or_unc
, _
= os
.path
.splitdrive(s
)
2139 if sys
.version_info
< (2, 7) and not drive_or_unc
:
2140 drive_or_unc
, _
= os
.path
.splitunc(s
)
2146 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
2150 path_part
if path_part
in ['.', '..'] else re
.sub(r
'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part
)
2151 for path_part
in norm_path
]
2153 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
2154 elif force
and s
[0] == os
.path
.sep
:
2155 sanitized_path
.insert(0, os
.path
.sep
)
2156 return os
.path
.join(*sanitized_path
)
def sanitize_url(url):
    """Return *url* with a scheme prepended if missing and common typos fixed."""
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
def extract_basic_auth(url):
    """Split inline credentials out of *url*.

    Returns (url_without_credentials, authorization_header_value) where the
    header value is None when the URL carried no username.
    """
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    # Rebuild the netloc without the user:pass@ prefix.
    netloc = (parts.hostname if parts.port is None
              else '%s:%d' % (parts.hostname, parts.port))
    stripped_url = compat_urlparse.urlunsplit(parts._replace(netloc=netloc))
    credentials = '%s:%s' % (parts.username, parts.password or '')
    token = base64.b64encode(credentials.encode('utf-8'))
    return stripped_url, 'Basic ' + token.decode('utf-8')
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request from *url* after sanitizing/escaping it and
    moving any inline basic-auth credentials into an Authorization header."""
    clean_url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        # The Request headers may arrive positionally (argument #2) or as a kwarg.
        if len(args) >= 2:
            headers = args[1]
        else:
            headers = kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(clean_url, *args, **kwargs)
2198 """Expand shell variables and ~"""
2199 return os
.path
.expandvars(compat_expanduser(s
))
2202 def orderedSet(iterable
):
2203 """ Remove all duplicates from the input iterable """
2211 def _htmlentity_transform(entity_with_semicolon
):
2212 """Transforms an HTML entity to a character."""
2213 entity
= entity_with_semicolon
[:-1]
2215 # Known non-numeric HTML entity
2216 if entity
in compat_html_entities
.name2codepoint
:
2217 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
2219 # TODO: HTML5 allows entities without a semicolon. For example,
2220 # 'Éric' should be decoded as 'Éric'.
2221 if entity_with_semicolon
in compat_html_entities_html5
:
2222 return compat_html_entities_html5
[entity_with_semicolon
]
2224 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
2225 if mobj
is not None:
2226 numstr
= mobj
.group(1)
2227 if numstr
.startswith('x'):
2229 numstr
= '0%s' % numstr
2232 # See https://github.com/ytdl-org/youtube-dl/issues/7518
2234 return compat_chr(int(numstr
, base
))
2238 # Unknown entity in name, return its literal representation
2239 return '&%s;' % entity
def unescapeHTML(s):
    """Replace HTML entities (named, decimal, hex) in *s* with their characters."""
    if s is None:
        return None
    assert type(s) == compat_str

    # Each '&...;' run is handed to _htmlentity_transform for decoding.
    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
def escapeHTML(text):
    """Escape the five HTML-special characters in *text* for safe embedding.

    '&' is replaced first so that the entities produced by the later
    replacements are not themselves re-escaped.

    Bug fix: in this copy of the file the replacement strings had been
    HTML-unescaped into identity no-ops (e.g. .replace('&', '&')); the
    proper entity strings are restored here.
    """
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )
def process_communicate_or_kill(p, *args, **kwargs):
    """Run p.communicate(); if anything interrupts it, kill the process first.

    Catching BaseException (including KeyboardInterrupt) ensures the child
    does not outlive an aborted communicate() call.
    """
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise
2271 def get_subprocess_encoding():
2272 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2273 # For subprocess calls, encode with locale encoding
2274 # Refer to http://stackoverflow.com/a/9951851/35070
2275 encoding
= preferredencoding()
2277 encoding
= sys
.getfilesystemencoding()
2278 if encoding
is None:
2283 def encodeFilename(s
, for_subprocess
=False):
2285 @param s The name of the file
2288 assert type(s
) == compat_str
2290 # Python 3 has a Unicode API
2291 if sys
.version_info
>= (3, 0):
2294 # Pass '' directly to use Unicode APIs on Windows 2000 and up
2295 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
2296 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
2297 if not for_subprocess
and sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2300 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
2301 if sys
.platform
.startswith('java'):
2304 return s
.encode(get_subprocess_encoding(), 'ignore')
2307 def decodeFilename(b
, for_subprocess
=False):
2309 if sys
.version_info
>= (3, 0):
2312 if not isinstance(b
, bytes):
2315 return b
.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for handing to a subprocess."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings; decode so the rest of the
        # pipeline only ever sees text.
        # TODO: assert on non-text input once all post processors are fixed:
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
def decodeArgument(b):
    """Decode a command-line argument coming back from a subprocess."""
    return decodeFilename(b, for_subprocess=True)
def decodeOption(optval):
    """Decode a CLI option value to text using the locale's preferred encoding."""
    if optval is None:
        return optval
    decoded = (optval.decode(preferredencoding())
               if isinstance(optval, bytes) else optval)
    assert isinstance(decoded, compat_str)
    return decoded
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as H:MM:SS / M:SS / S.

    @param secs   duration in seconds (int or float)
    @param delim  separator between hour/minute/second components
    @param msec   if True, append '.mmm' milliseconds
    """
    if secs > 3600:
        ret = '%d%s%02d%s%02d' % (secs // 3600, delim, (secs % 3600) // 60, delim, secs % 60)
    elif secs > 60:
        ret = '%d%s%02d' % (secs // 60, delim, secs % 60)
    else:
        ret = '%d' % secs
    # Bug fix: 'secs % 1' is a fraction < 1, which '%03d' truncates to 0,
    # so milliseconds always rendered as '.000'. Scale to ms before formatting.
    return '%s.%03d' % (ret, secs % 1 * 1000) if msec else ret
2351 def make_HTTPS_handler(params
, **kwargs
):
2352 opts_no_check_certificate
= params
.get('nocheckcertificate', False)
2353 if hasattr(ssl
, 'create_default_context'): # Python >= 3.4 or 2.7.9
2354 context
= ssl
.create_default_context(ssl
.Purpose
.SERVER_AUTH
)
2355 if opts_no_check_certificate
:
2356 context
.check_hostname
= False
2357 context
.verify_mode
= ssl
.CERT_NONE
2359 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2362 # (create_default_context present but HTTPSHandler has no context=)
2365 if sys
.version_info
< (3, 2):
2366 return YoutubeDLHTTPSHandler(params
, **kwargs
)
2367 else: # Python < 3.4
2368 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLSv1
)
2369 context
.verify_mode
= (ssl
.CERT_NONE
2370 if opts_no_check_certificate
2371 else ssl
.CERT_REQUIRED
)
2372 context
.set_default_verify_paths()
2373 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
def bug_reports_message(before=';'):
    """Return the standard 'please report this issue' boilerplate, to be
    appended after the text *before* (capitalized if it starts a sentence)."""
    if ytdl_is_updateable():
        update_cmd = 'type yt-dlp -U to update'
    else:
        update_cmd = 'see https://github.com/yt-dlp/yt-dlp on how to update'
    msg = ''.join((
        'please report this issue on https://github.com/yt-dlp/yt-dlp .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call yt-dlp with the --verbose flag and include its complete output.',
    ))

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        # The message starts a new sentence, so capitalize its first word.
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg
class YoutubeDLError(Exception):
    """Root of the yt-dlp exception hierarchy; all custom errors derive from it."""
    pass
2397 network_exceptions
= [compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
]
2398 if hasattr(ssl
, 'CertificateError'):
2399 network_exceptions
.append(ssl
.CertificateError
)
2400 network_exceptions
= tuple(network_exceptions
)
2403 class ExtractorError(YoutubeDLError
):
2404 """Error during info extraction."""
2406 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None, ie
=None):
2407 """ tb, if given, is the original traceback (so that it can be printed out).
2408 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
2410 if sys
.exc_info()[0] in network_exceptions
:
2415 self
.expected
= expected
2417 self
.video_id
= video_id
2419 self
.exc_info
= sys
.exc_info() # preserve original exception
2421 super(ExtractorError
, self
).__init
__(''.join((
2422 format_field(ie
, template
='[%s] '),
2423 format_field(video_id
, template
='%s: '),
2425 format_field(cause
, template
=' (caused by %r)'),
2426 '' if expected
else bug_reports_message())))
2428 def format_traceback(self
):
2429 if self
.traceback
is None:
2431 return ''.join(traceback
.format_tb(self
.traceback
))
class UnsupportedError(ExtractorError):
    """Raised when no extractor claims the given URL."""

    def __init__(self, url):
        # expected=True: this is a normal user-facing condition, so the
        # base class skips the bug-report boilerplate.
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        # NOTE(review): upstream also stores the URL on the instance
        # (self.url = url) on a line elided from this chunk — confirm.
2441 class RegexNotFoundError(ExtractorError
):
2442 """Error when a regex didn't match"""
2446 class GeoRestrictedError(ExtractorError
):
2447 """Geographic restriction Error exception.
2449 This exception may be thrown when a video is not available from your
2450 geographic location due to geographic restrictions imposed by a website.
2453 def __init__(self
, msg
, countries
=None):
2454 super(GeoRestrictedError
, self
).__init
__(msg
, expected
=True)
2456 self
.countries
= countries
2459 class DownloadError(YoutubeDLError
):
2460 """Download Error exception.
2462 This exception may be thrown by FileDownloader objects if they are not
2463 configured to continue on errors. They will contain the appropriate
2467 def __init__(self
, msg
, exc_info
=None):
2468 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
2469 super(DownloadError
, self
).__init
__(msg
)
2470 self
.exc_info
= exc_info
2473 class EntryNotInPlaylist(YoutubeDLError
):
2474 """Entry not in playlist exception.
2476 This exception will be thrown by YoutubeDL when a requested entry
2477 is not found in the playlist info_dict
2482 class SameFileError(YoutubeDLError
):
2483 """Same File exception.
2485 This exception will be thrown by FileDownloader objects if they detect
2486 multiple files would have to be downloaded to the same file on disk.
2491 class PostProcessingError(YoutubeDLError
):
2492 """Post Processing exception.
2494 This exception may be raised by PostProcessor's .run() method to
2495 indicate an error in the postprocessing task.
2498 def __init__(self
, msg
):
2499 super(PostProcessingError
, self
).__init
__(msg
)
2503 class ExistingVideoReached(YoutubeDLError
):
2504 """ --max-downloads limit has been reached. """
2508 class RejectedVideoReached(YoutubeDLError
):
2509 """ --max-downloads limit has been reached. """
2513 class ThrottledDownload(YoutubeDLError
):
2514 """ Download speed below --throttled-rate. """
2518 class MaxDownloadsReached(YoutubeDLError
):
2519 """ --max-downloads limit has been reached. """
2523 class UnavailableVideoError(YoutubeDLError
):
2524 """Unavailable Format exception.
2526 This exception will be thrown when a video is requested
2527 in a format that is not available for that video.
2532 class ContentTooShortError(YoutubeDLError
):
2533 """Content Too Short exception.
2535 This exception may be raised by FileDownloader objects when a file they
2536 download is too small for what the server announced first, indicating
2537 the connection was probably interrupted.
2540 def __init__(self
, downloaded
, expected
):
2541 super(ContentTooShortError
, self
).__init
__(
2542 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded
, expected
)
2545 self
.downloaded
= downloaded
2546 self
.expected
= expected
2549 class XAttrMetadataError(YoutubeDLError
):
2550 def __init__(self
, code
=None, msg
='Unknown error'):
2551 super(XAttrMetadataError
, self
).__init
__(msg
)
2555 # Parsing code and msg
2556 if (self
.code
in (errno
.ENOSPC
, errno
.EDQUOT
)
2557 or 'No space left' in self
.msg
or 'Disk quota exceeded' in self
.msg
):
2558 self
.reason
= 'NO_SPACE'
2559 elif self
.code
== errno
.E2BIG
or 'Argument list too long' in self
.msg
:
2560 self
.reason
= 'VALUE_TOO_LONG'
2562 self
.reason
= 'NOT_SUPPORTED'
2565 class XAttrUnavailableError(YoutubeDLError
):
2569 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
2570 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
2571 # expected HTTP responses to meet HTTP/1.0 or later (see also
2572 # https://github.com/ytdl-org/youtube-dl/issues/6727)
2573 if sys
.version_info
< (3, 0):
2574 kwargs
['strict'] = True
2575 hc
= http_class(*args
, **compat_kwargs(kwargs
))
2576 source_address
= ydl_handler
._params
.get('source_address')
2578 if source_address
is not None:
2579 # This is to workaround _create_connection() from socket where it will try all
2580 # address data from getaddrinfo() including IPv6. This filters the result from
2581 # getaddrinfo() based on the source_address value.
2582 # This is based on the cpython socket.create_connection() function.
2583 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
2584 def _create_connection(address
, timeout
=socket
._GLOBAL
_DEFAULT
_TIMEOUT
, source_address
=None):
2585 host
, port
= address
2587 addrs
= socket
.getaddrinfo(host
, port
, 0, socket
.SOCK_STREAM
)
2588 af
= socket
.AF_INET
if '.' in source_address
[0] else socket
.AF_INET6
2589 ip_addrs
= [addr
for addr
in addrs
if addr
[0] == af
]
2590 if addrs
and not ip_addrs
:
2591 ip_version
= 'v4' if af
== socket
.AF_INET
else 'v6'
2593 "No remote IP%s addresses available for connect, can't use '%s' as source address"
2594 % (ip_version
, source_address
[0]))
2595 for res
in ip_addrs
:
2596 af
, socktype
, proto
, canonname
, sa
= res
2599 sock
= socket
.socket(af
, socktype
, proto
)
2600 if timeout
is not socket
._GLOBAL
_DEFAULT
_TIMEOUT
:
2601 sock
.settimeout(timeout
)
2602 sock
.bind(source_address
)
2604 err
= None # Explicitly break reference cycle
2606 except socket
.error
as _
:
2608 if sock
is not None:
2613 raise socket
.error('getaddrinfo returns an empty list')
2614 if hasattr(hc
, '_create_connection'):
2615 hc
._create
_connection
= _create_connection
2616 sa
= (source_address
, 0)
2617 if hasattr(hc
, 'source_address'): # Python 2.7+
2618 hc
.source_address
= sa
2620 def _hc_connect(self
, *args
, **kwargs
):
2621 sock
= _create_connection(
2622 (self
.host
, self
.port
), self
.timeout
, sa
)
2624 self
.sock
= ssl
.wrap_socket(
2625 sock
, self
.key_file
, self
.cert_file
,
2626 ssl_version
=ssl
.PROTOCOL_TLSv1
)
2629 hc
.connect
= functools
.partial(_hc_connect
, hc
)
2634 def handle_youtubedl_headers(headers
):
2635 filtered_headers
= headers
2637 if 'Youtubedl-no-compression' in filtered_headers
:
2638 filtered_headers
= dict((k
, v
) for k
, v
in filtered_headers
.items() if k
.lower() != 'accept-encoding')
2639 del filtered_headers
['Youtubedl-no-compression']
2641 return filtered_headers
2644 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
2645 """Handler for HTTP requests and responses.
2647 This class, when installed with an OpenerDirector, automatically adds
2648 the standard headers to every HTTP request and handles gzipped and
2649 deflated responses from web servers. If compression is to be avoided in
2650 a particular request, the original request in the program code only has
2651 to include the HTTP header "Youtubedl-no-compression", which will be
2652 removed before making the real request.
2654 Part of this code was copied from:
2656 http://techknack.net/python-urllib2-handlers/
2658 Andrew Rowls, the author of that code, agreed to release it to the
2662 def __init__(self
, params
, *args
, **kwargs
):
2663 compat_urllib_request
.HTTPHandler
.__init
__(self
, *args
, **kwargs
)
2664 self
._params
= params
2666 def http_open(self
, req
):
2667 conn_class
= compat_http_client
.HTTPConnection
2669 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2671 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2672 del req
.headers
['Ytdl-socks-proxy']
2674 return self
.do_open(functools
.partial(
2675 _create_http_connection
, self
, conn_class
, False),
2683 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
2685 return zlib
.decompress(data
)
2687 def http_request(self
, req
):
2688 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
2689 # always respected by websites, some tend to give out URLs with non percent-encoded
2690 # non-ASCII characters (see telemb.py, ard.py [#3412])
2691 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
2692 # To work around aforementioned issue we will replace request's original URL with
2693 # percent-encoded one
2694 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
2695 # the code of this workaround has been moved here from YoutubeDL.urlopen()
2696 url
= req
.get_full_url()
2697 url_escaped
= escape_url(url
)
2699 # Substitute URL if any change after escaping
2700 if url
!= url_escaped
:
2701 req
= update_Request(req
, url
=url_escaped
)
2703 for h
, v
in std_headers
.items():
2704 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
2705 # The dict keys are capitalized because of this bug by urllib
2706 if h
.capitalize() not in req
.headers
:
2707 req
.add_header(h
, v
)
2709 req
.headers
= handle_youtubedl_headers(req
.headers
)
2711 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
2712 # Python 2.6 is brain-dead when it comes to fragments
2713 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
2714 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
2718 def http_response(self
, req
, resp
):
2721 if resp
.headers
.get('Content-encoding', '') == 'gzip':
2722 content
= resp
.read()
2723 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
2725 uncompressed
= io
.BytesIO(gz
.read())
2726 except IOError as original_ioerror
:
2727 # There may be junk add the end of the file
2728 # See http://stackoverflow.com/q/4928560/35070 for details
2729 for i
in range(1, 1024):
2731 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
2732 uncompressed
= io
.BytesIO(gz
.read())
2737 raise original_ioerror
2738 resp
= compat_urllib_request
.addinfourl(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2739 resp
.msg
= old_resp
.msg
2740 del resp
.headers
['Content-encoding']
2742 if resp
.headers
.get('Content-encoding', '') == 'deflate':
2743 gz
= io
.BytesIO(self
.deflate(resp
.read()))
2744 resp
= compat_urllib_request
.addinfourl(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2745 resp
.msg
= old_resp
.msg
2746 del resp
.headers
['Content-encoding']
2747 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
2748 # https://github.com/ytdl-org/youtube-dl/issues/6457).
2749 if 300 <= resp
.code
< 400:
2750 location
= resp
.headers
.get('Location')
2752 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
2753 if sys
.version_info
>= (3, 0):
2754 location
= location
.encode('iso-8859-1').decode('utf-8')
2756 location
= location
.decode('utf-8')
2757 location_escaped
= escape_url(location
)
2758 if location
!= location_escaped
:
2759 del resp
.headers
['Location']
2760 if sys
.version_info
< (3, 0):
2761 location_escaped
= location_escaped
.encode('utf-8')
2762 resp
.headers
['Location'] = location_escaped
2765 https_request
= http_request
2766 https_response
= http_response
2769 def make_socks_conn_class(base_class
, socks_proxy
):
2770 assert issubclass(base_class
, (
2771 compat_http_client
.HTTPConnection
, compat_http_client
.HTTPSConnection
))
2773 url_components
= compat_urlparse
.urlparse(socks_proxy
)
2774 if url_components
.scheme
.lower() == 'socks5':
2775 socks_type
= ProxyType
.SOCKS5
2776 elif url_components
.scheme
.lower() in ('socks', 'socks4'):
2777 socks_type
= ProxyType
.SOCKS4
2778 elif url_components
.scheme
.lower() == 'socks4a':
2779 socks_type
= ProxyType
.SOCKS4A
2781 def unquote_if_non_empty(s
):
2784 return compat_urllib_parse_unquote_plus(s
)
2788 url_components
.hostname
, url_components
.port
or 1080,
2790 unquote_if_non_empty(url_components
.username
),
2791 unquote_if_non_empty(url_components
.password
),
2794 class SocksConnection(base_class
):
2796 self
.sock
= sockssocket()
2797 self
.sock
.setproxy(*proxy_args
)
2798 if type(self
.timeout
) in (int, float):
2799 self
.sock
.settimeout(self
.timeout
)
2800 self
.sock
.connect((self
.host
, self
.port
))
2802 if isinstance(self
, compat_http_client
.HTTPSConnection
):
2803 if hasattr(self
, '_context'): # Python > 2.6
2804 self
.sock
= self
._context
.wrap_socket(
2805 self
.sock
, server_hostname
=self
.host
)
2807 self
.sock
= ssl
.wrap_socket(self
.sock
)
2809 return SocksConnection
2812 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
2813 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
2814 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
2815 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
2816 self
._params
= params
2818 def https_open(self
, req
):
2820 conn_class
= self
._https
_conn
_class
2822 if hasattr(self
, '_context'): # python > 2.6
2823 kwargs
['context'] = self
._context
2824 if hasattr(self
, '_check_hostname'): # python 3.x
2825 kwargs
['check_hostname'] = self
._check
_hostname
2827 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2829 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2830 del req
.headers
['Ytdl-socks-proxy']
2832 return self
.do_open(functools
.partial(
2833 _create_http_connection
, self
, conn_class
, True),
2837 class YoutubeDLCookieJar(compat_cookiejar
.MozillaCookieJar
):
2839 See [1] for cookie file format.
2841 1. https://curl.haxx.se/docs/http-cookies.html
2843 _HTTPONLY_PREFIX
= '#HttpOnly_'
2845 _HEADER
= '''# Netscape HTTP Cookie File
2846 # This file is generated by yt-dlp. Do not edit.
2849 _CookieFileEntry
= collections
.namedtuple(
2851 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
2853 def save(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2855 Save cookies to a file.
2857 Most of the code is taken from CPython 3.8 and slightly adapted
2858 to support cookie files with UTF-8 in both python 2 and 3.
2860 if filename
is None:
2861 if self
.filename
is not None:
2862 filename
= self
.filename
2864 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2866 # Store session cookies with `expires` set to 0 instead of an empty
2869 if cookie
.expires
is None:
2872 with io
.open(filename
, 'w', encoding
='utf-8') as f
:
2873 f
.write(self
._HEADER
)
2876 if not ignore_discard
and cookie
.discard
:
2878 if not ignore_expires
and cookie
.is_expired(now
):
2884 if cookie
.domain
.startswith('.'):
2885 initial_dot
= 'TRUE'
2887 initial_dot
= 'FALSE'
2888 if cookie
.expires
is not None:
2889 expires
= compat_str(cookie
.expires
)
2892 if cookie
.value
is None:
2893 # cookies.txt regards 'Set-Cookie: foo' as a cookie
2894 # with no name, whereas http.cookiejar regards it as a
2895 # cookie with no value.
2900 value
= cookie
.value
2902 '\t'.join([cookie
.domain
, initial_dot
, cookie
.path
,
2903 secure
, expires
, name
, value
]) + '\n')
2905 def load(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2906 """Load cookies from a file."""
2907 if filename
is None:
2908 if self
.filename
is not None:
2909 filename
= self
.filename
2911 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2913 def prepare_line(line
):
2914 if line
.startswith(self
._HTTPONLY
_PREFIX
):
2915 line
= line
[len(self
._HTTPONLY
_PREFIX
):]
2916 # comments and empty lines are fine
2917 if line
.startswith('#') or not line
.strip():
2919 cookie_list
= line
.split('\t')
2920 if len(cookie_list
) != self
._ENTRY
_LEN
:
2921 raise compat_cookiejar
.LoadError('invalid length %d' % len(cookie_list
))
2922 cookie
= self
._CookieFileEntry
(*cookie_list
)
2923 if cookie
.expires_at
and not cookie
.expires_at
.isdigit():
2924 raise compat_cookiejar
.LoadError('invalid expires at %s' % cookie
.expires_at
)
2928 with io
.open(filename
, encoding
='utf-8') as f
:
2931 cf
.write(prepare_line(line
))
2932 except compat_cookiejar
.LoadError
as e
:
2934 'WARNING: skipping cookie file entry due to %s: %r\n'
2935 % (e
, line
), sys
.stderr
)
2938 self
._really
_load
(cf
, filename
, ignore_discard
, ignore_expires
)
2939 # Session cookies are denoted by either `expires` field set to
2940 # an empty string or 0. MozillaCookieJar only recognizes the former
2941 # (see [1]). So we need force the latter to be recognized as session
2942 # cookies on our own.
2943 # Session cookies may be important for cookies-based authentication,
2944 # e.g. usually, when user does not check 'Remember me' check box while
2945 # logging in on a site, some important cookies are stored as session
2946 # cookies so that not recognizing them will result in failed login.
2947 # 1. https://bugs.python.org/issue17164
2949 # Treat `expires=0` cookies as session cookies
2950 if cookie
.expires
== 0:
2951 cookie
.expires
= None
2952 cookie
.discard
= True
2955 class YoutubeDLCookieProcessor(compat_urllib_request
.HTTPCookieProcessor
):
2956 def __init__(self
, cookiejar
=None):
2957 compat_urllib_request
.HTTPCookieProcessor
.__init
__(self
, cookiejar
)
2959 def http_response(self
, request
, response
):
2960 # Python 2 will choke on next HTTP request in row if there are non-ASCII
2961 # characters in Set-Cookie HTTP header of last response (see
2962 # https://github.com/ytdl-org/youtube-dl/issues/6769).
2963 # In order to at least prevent crashing we will percent encode Set-Cookie
2964 # header before HTTPCookieProcessor starts processing it.
2965 # if sys.version_info < (3, 0) and response.headers:
2966 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
2967 # set_cookie = response.headers.get(set_cookie_header)
2969 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
2970 # if set_cookie != set_cookie_escaped:
2971 # del response.headers[set_cookie_header]
2972 # response.headers[set_cookie_header] = set_cookie_escaped
2973 return compat_urllib_request
.HTTPCookieProcessor
.http_response(self
, request
, response
)
2975 https_request
= compat_urllib_request
.HTTPCookieProcessor
.http_request
2976 https_response
= http_response
2979 class YoutubeDLRedirectHandler(compat_urllib_request
.HTTPRedirectHandler
):
2980 """YoutubeDL redirect handler
2982 The code is based on HTTPRedirectHandler implementation from CPython [1].
2984 This redirect handler solves two issues:
2985 - ensures redirect URL is always unicode under python 2
2986 - introduces support for experimental HTTP response status code
2987 308 Permanent Redirect [2] used by some sites [3]
2989 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
2990 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
2991 3. https://github.com/ytdl-org/youtube-dl/issues/28768
2994 http_error_301
= http_error_303
= http_error_307
= http_error_308
= compat_urllib_request
.HTTPRedirectHandler
.http_error_302
2996 def redirect_request(self
, req
, fp
, code
, msg
, headers
, newurl
):
2997 """Return a Request or None in response to a redirect.
2999 This is called by the http_error_30x methods when a
3000 redirection response is received. If a redirection should
3001 take place, return a new Request to allow http_error_30x to
3002 perform the redirect. Otherwise, raise HTTPError if no-one
3003 else should try to handle this url. Return None if you can't
3004 but another Handler might.
3006 m
= req
.get_method()
3007 if (not (code
in (301, 302, 303, 307, 308) and m
in ("GET", "HEAD")
3008 or code
in (301, 302, 303) and m
== "POST")):
3009 raise compat_HTTPError(req
.full_url
, code
, msg
, headers
, fp
)
3010 # Strictly (according to RFC 2616), 301 or 302 in response to
3011 # a POST MUST NOT cause a redirection without confirmation
3012 # from the user (of urllib.request, in this case). In practice,
3013 # essentially all clients do redirect in this case, so we do
3016 # On python 2 urlh.geturl() may sometimes return redirect URL
3017 # as byte string instead of unicode. This workaround allows
3018 # to force it always return unicode.
3019 if sys
.version_info
[0] < 3:
3020 newurl
= compat_str(newurl
)
3022 # Be conciliant with URIs containing a space. This is mainly
3023 # redundant with the more complete encoding done in http_error_302(),
3024 # but it is kept for compatibility with other callers.
3025 newurl
= newurl
.replace(' ', '%20')
3027 CONTENT_HEADERS
= ("content-length", "content-type")
3028 # NB: don't use dict comprehension for python 2.6 compatibility
3029 newheaders
= dict((k
, v
) for k
, v
in req
.headers
.items()
3030 if k
.lower() not in CONTENT_HEADERS
)
3031 return compat_urllib_request
.Request(
3032 newurl
, headers
=newheaders
, origin_req_host
=req
.origin_req_host
,
3036 def extract_timezone(date_str
):
3039 ^.{8,}? # >=8 char non-TZ prefix, if present
3040 (?P<tz>Z| # just the UTC Z, or
3041 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
3042 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
3043 [ ]? # optional space
3044 (?P<sign>\+|-) # +/-
3045 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
3049 timezone
= datetime
.timedelta()
3051 date_str
= date_str
[:-len(m
.group('tz'))]
3052 if not m
.group('sign'):
3053 timezone
= datetime
.timedelta()
3055 sign
= 1 if m
.group('sign') == '+' else -1
3056 timezone
= datetime
.timedelta(
3057 hours
=sign
* int(m
.group('hours')),
3058 minutes
=sign
* int(m
.group('minutes')))
3059 return timezone
, date_str
3062 def parse_iso8601(date_str
, delimiter
='T', timezone
=None):
3063 """ Return a UNIX timestamp from the given date """
3065 if date_str
is None:
3068 date_str
= re
.sub(r
'\.[0-9]+', '', date_str
)
3070 if timezone
is None:
3071 timezone
, date_str
= extract_timezone(date_str
)
3074 date_format
= '%Y-%m-%d{0}%H:%M:%S'.format(delimiter
)
3075 dt
= datetime
.datetime
.strptime(date_str
, date_format
) - timezone
3076 return calendar
.timegm(dt
.timetuple())
3081 def date_formats(day_first
=True):
3082 return DATE_FORMATS_DAY_FIRST
if day_first
else DATE_FORMATS_MONTH_FIRST
3085 def unified_strdate(date_str
, day_first
=True):
3086 """Return a string with the date in the format YYYYMMDD"""
3088 if date_str
is None:
3092 date_str
= date_str
.replace(',', ' ')
3093 # Remove AM/PM + timezone
3094 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
3095 _
, date_str
= extract_timezone(date_str
)
3097 for expression
in date_formats(day_first
):
3099 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
3102 if upload_date
is None:
3103 timetuple
= email
.utils
.parsedate_tz(date_str
)
3106 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
3109 if upload_date
is not None:
3110 return compat_str(upload_date
)
3113 def unified_timestamp(date_str
, day_first
=True):
3114 if date_str
is None:
3117 date_str
= re
.sub(r
'[,|]', '', date_str
)
3119 pm_delta
= 12 if re
.search(r
'(?i)PM', date_str
) else 0
3120 timezone
, date_str
= extract_timezone(date_str
)
3122 # Remove AM/PM + timezone
3123 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
3125 # Remove unrecognized timezones from ISO 8601 alike timestamps
3126 m
= re
.search(r
'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str
)
3128 date_str
= date_str
[:-len(m
.group('tz'))]
3130 # Python only supports microseconds, so remove nanoseconds
3131 m
= re
.search(r
'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str
)
3133 date_str
= m
.group(1)
3135 for expression
in date_formats(day_first
):
3137 dt
= datetime
.datetime
.strptime(date_str
, expression
) - timezone
+ datetime
.timedelta(hours
=pm_delta
)
3138 return calendar
.timegm(dt
.timetuple())
3141 timetuple
= email
.utils
.parsedate_tz(date_str
)
3143 return calendar
.timegm(timetuple
) + pm_delta
* 3600
3146 def determine_ext(url
, default_ext
='unknown_video'):
3147 if url
is None or '.' not in url
:
3149 guess
= url
.partition('?')[0].rpartition('.')[2]
3150 if re
.match(r
'^[A-Za-z0-9]+$', guess
):
3152 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
3153 elif guess
.rstrip('/') in KNOWN_EXTENSIONS
:
3154 return guess
.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Derive a subtitle file name by swapping the extension for '<lang>.<format>'."""
    sub_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, sub_ext, expected_real_ext)
3163 def datetime_from_str(date_str
, precision
='auto', format
='%Y%m%d'):
3165 Return a datetime object from a string in the format YYYYMMDD or
3166 (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
3168 format: string date format used to return datetime object from
3169 precision: round the time portion of a datetime object.
3170 auto|microsecond|second|minute|hour|day.
3171 auto: round to the unit provided in date_str (if applicable).
3173 auto_precision
= False
3174 if precision
== 'auto':
3175 auto_precision
= True
3176 precision
= 'microsecond'
3177 today
= datetime_round(datetime
.datetime
.now(), precision
)
3178 if date_str
in ('now', 'today'):
3180 if date_str
== 'yesterday':
3181 return today
- datetime
.timedelta(days
=1)
3183 r
'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
3185 if match
is not None:
3186 start_time
= datetime_from_str(match
.group('start'), precision
, format
)
3187 time
= int(match
.group('time')) * (-1 if match
.group('sign') == '-' else 1)
3188 unit
= match
.group('unit')
3189 if unit
== 'month' or unit
== 'year':
3190 new_date
= datetime_add_months(start_time
, time
* 12 if unit
== 'year' else time
)
3196 delta
= datetime
.timedelta(**{unit + 's': time}
)
3197 new_date
= start_time
+ delta
3199 return datetime_round(new_date
, unit
)
3202 return datetime_round(datetime
.datetime
.strptime(date_str
, format
), precision
)
def date_from_str(date_str, format='%Y%m%d'):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: string date format used to return datetime object from
    """
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
3215 def datetime_add_months(dt
, months
):
3216 """Increment/Decrement a datetime object by months."""
3217 month
= dt
.month
+ months
- 1
3218 year
= dt
.year
+ month
// 12
3219 month
= month
% 12 + 1
3220 day
= min(dt
.day
, calendar
.monthrange(year
, month
)[1])
3221 return dt
.replace(year
, month
, day
)
3224 def datetime_round(dt
, precision
='day'):
3226 Round a datetime object's time to a specific precision
3228 if precision
== 'microsecond':
3237 roundto
= lambda x
, n
: ((x
+ n
/ 2) // n
) * n
3238 timestamp
= calendar
.timegm(dt
.timetuple())
3239 return datetime
.datetime
.utcfromtimestamp(roundto(timestamp
, unit_seconds
[precision
]))
3242 def hyphenate_date(date_str
):
3244 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
3245 match
= re
.match(r
'^(\d\d\d\d)(\d\d)(\d\d)$', date_str
)
3246 if match
is not None:
3247 return '-'.join(match
.groups())
3252 class DateRange(object):
3253 """Represents a time interval between two dates"""
3255 def __init__(self
, start
=None, end
=None):
3256 """start and end must be strings in the format accepted by date"""
3257 if start
is not None:
3258 self
.start
= date_from_str(start
)
3260 self
.start
= datetime
.datetime
.min.date()
3262 self
.end
= date_from_str(end
)
3264 self
.end
= datetime
.datetime
.max.date()
3265 if self
.start
> self
.end
:
3266 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
3270 """Returns a range that only contains the given day"""
3271 return cls(day
, day
)
3273 def __contains__(self
, date
):
3274 """Check if the date is in the range"""
3275 if not isinstance(date
, datetime
.date
):
3276 date
= date_from_str(date
)
3277 return self
.start
<= date
<= self
.end
3280 return '%s - %s' % (self
.start
.isoformat(), self
.end
.isoformat())
3283 def platform_name():
3284 """ Returns the platform name as a compat_str """
3285 res
= platform
.platform()
3286 if isinstance(res
, bytes):
3287 res
= res
.decode(preferredencoding())
3289 assert isinstance(res
, compat_str
)
3293 def _windows_write_string(s
, out
):
3294 """ Returns True if the string was written using special methods,
3295 False if it has yet to be written out."""
3296 # Adapted from http://stackoverflow.com/a/3259271/35070
3299 import ctypes
.wintypes
3307 fileno
= out
.fileno()
3308 except AttributeError:
3309 # If the output stream doesn't have a fileno, it's virtual
3311 except io
.UnsupportedOperation
:
3312 # Some strange Windows pseudo files?
3314 if fileno
not in WIN_OUTPUT_IDS
:
3317 GetStdHandle
= compat_ctypes_WINFUNCTYPE(
3318 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
3319 ('GetStdHandle', ctypes
.windll
.kernel32
))
3320 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
3322 WriteConsoleW
= compat_ctypes_WINFUNCTYPE(
3323 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
3324 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
3325 ctypes
.wintypes
.LPVOID
)(('WriteConsoleW', ctypes
.windll
.kernel32
))
3326 written
= ctypes
.wintypes
.DWORD(0)
3328 GetFileType
= compat_ctypes_WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)(('GetFileType', ctypes
.windll
.kernel32
))
3329 FILE_TYPE_CHAR
= 0x0002
3330 FILE_TYPE_REMOTE
= 0x8000
3331 GetConsoleMode
= compat_ctypes_WINFUNCTYPE(
3332 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
3333 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
3334 ('GetConsoleMode', ctypes
.windll
.kernel32
))
3335 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
3337 def not_a_console(handle
):
3338 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
3340 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
3341 or GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
3343 if not_a_console(h
):
3346 def next_nonbmp_pos(s
):
3348 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
3349 except StopIteration:
3353 count
= min(next_nonbmp_pos(s
), 1024)
3355 ret
= WriteConsoleW(
3356 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
3358 raise OSError('Failed to write string')
3359 if not count
: # We just wrote a non-BMP character
3360 assert written
.value
== 2
3363 assert written
.value
> 0
3364 s
= s
[written
.value
:]
3368 def write_string(s
, out
=None, encoding
=None):
3371 assert type(s
) == compat_str
3373 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
3374 if _windows_write_string(s
, out
):
3377 if ('b' in getattr(out
, 'mode', '')
3378 or sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
3379 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
3381 elif hasattr(out
, 'buffer'):
3382 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
3383 byt
= s
.encode(enc
, 'ignore')
3384 out
.buffer.write(byt
)
3390 def bytes_to_intlist(bs
):
3393 if isinstance(bs
[0], int): # Python 3
3396 return [ord(c
) for c
in bs
]
3399 def intlist_to_bytes(xs
):
3402 return compat_struct_pack('%dB' % len(xs
), *xs
)
3405 # Cross-platform file locking
3406 if sys
.platform
== 'win32':
3407 import ctypes
.wintypes
3410 class OVERLAPPED(ctypes
.Structure
):
3412 ('Internal', ctypes
.wintypes
.LPVOID
),
3413 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
3414 ('Offset', ctypes
.wintypes
.DWORD
),
3415 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
3416 ('hEvent', ctypes
.wintypes
.HANDLE
),
3419 kernel32
= ctypes
.windll
.kernel32
3420 LockFileEx
= kernel32
.LockFileEx
3421 LockFileEx
.argtypes
= [
3422 ctypes
.wintypes
.HANDLE
, # hFile
3423 ctypes
.wintypes
.DWORD
, # dwFlags
3424 ctypes
.wintypes
.DWORD
, # dwReserved
3425 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3426 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3427 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3429 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
3430 UnlockFileEx
= kernel32
.UnlockFileEx
3431 UnlockFileEx
.argtypes
= [
3432 ctypes
.wintypes
.HANDLE
, # hFile
3433 ctypes
.wintypes
.DWORD
, # dwReserved
3434 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3435 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3436 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3438 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
3439 whole_low
= 0xffffffff
3440 whole_high
= 0x7fffffff
3442 def _lock_file(f
, exclusive
):
3443 overlapped
= OVERLAPPED()
3444 overlapped
.Offset
= 0
3445 overlapped
.OffsetHigh
= 0
3446 overlapped
.hEvent
= 0
3447 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
3448 handle
= msvcrt
.get_osfhandle(f
.fileno())
3449 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
3450 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3451 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
3453 def _unlock_file(f
):
3454 assert f
._lock
_file
_overlapped
_p
3455 handle
= msvcrt
.get_osfhandle(f
.fileno())
3456 if not UnlockFileEx(handle
, 0,
3457 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3458 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
3461 # Some platforms, such as Jython, is missing fcntl
3465 def _lock_file(f
, exclusive
):
3466 fcntl
.flock(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
3468 def _unlock_file(f
):
3469 fcntl
.flock(f
, fcntl
.LOCK_UN
)
3471 UNSUPPORTED_MSG
= 'file locking is not supported on this platform'
3473 def _lock_file(f
, exclusive
):
3474 raise IOError(UNSUPPORTED_MSG
)
3476 def _unlock_file(f
):
3477 raise IOError(UNSUPPORTED_MSG
)
3480 class locked_file(object):
3481 def __init__(self
, filename
, mode
, encoding
=None):
3482 assert mode
in ['r', 'a', 'w']
3483 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
3486 def __enter__(self
):
3487 exclusive
= self
.mode
!= 'r'
3489 _lock_file(self
.f
, exclusive
)
3495 def __exit__(self
, etype
, value
, traceback
):
3497 _unlock_file(self
.f
)
3504 def write(self
, *args
):
3505 return self
.f
.write(*args
)
3507 def read(self
, *args
):
3508 return self
.f
.read(*args
)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8' when Python reports none."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
3516 def shell_quote(args
):
3518 encoding
= get_filesystem_encoding()
3520 if isinstance(a
, bytes):
3521 # We may get a filename encoded with 'encodeFilename'
3522 a
= a
.decode(encoding
)
3523 quoted_args
.append(compat_shlex_quote(a
))
3524 return ' '.join(quoted_args
)
3527 def smuggle_url(url
, data
):
3528 """ Pass additional data in a URL for internal use. """
3530 url
, idata
= unsmuggle_url(url
, {})
3532 sdata
= compat_urllib_parse_urlencode(
3533 {'__youtubedl_smuggle': json.dumps(data)}
)
3534 return url
+ '#' + sdata
3537 def unsmuggle_url(smug_url
, default
=None):
3538 if '#__youtubedl_smuggle' not in smug_url
:
3539 return smug_url
, default
3540 url
, _
, sdata
= smug_url
.rpartition('#')
3541 jsond
= compat_parse_qs(sdata
)['__youtubedl_smuggle'][0]
3542 data
= json
.loads(jsond
)
3546 def format_bytes(bytes):
3549 if type(bytes) is str:
3550 bytes = float(bytes)
3554 exponent
= int(math
.log(bytes, 1024.0))
3555 suffix
= ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent
]
3556 converted
= float(bytes) / float(1024 ** exponent
)
3557 return '%.2f%s' % (converted
, suffix
)
3560 def lookup_unit_table(unit_table
, s
):
3561 units_re
= '|'.join(re
.escape(u
) for u
in unit_table
)
3563 r
'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re
, s
)
3566 num_str
= m
.group('num').replace(',', '.')
3567 mult
= unit_table
[m
.group('unit')]
3568 return int(float(num_str
) * mult
)
3571 def parse_filesize(s
):
3575 # The lower-case forms are of course incorrect and unofficial,
3576 # but we support those too
3593 'megabytes': 1000 ** 2,
3594 'mebibytes': 1024 ** 2,
3600 'gigabytes': 1000 ** 3,
3601 'gibibytes': 1024 ** 3,
3607 'terabytes': 1000 ** 4,
3608 'tebibytes': 1024 ** 4,
3614 'petabytes': 1000 ** 5,
3615 'pebibytes': 1024 ** 5,
3621 'exabytes': 1000 ** 6,
3622 'exbibytes': 1024 ** 6,
3628 'zettabytes': 1000 ** 7,
3629 'zebibytes': 1024 ** 7,
3635 'yottabytes': 1000 ** 8,
3636 'yobibytes': 1024 ** 8,
3639 return lookup_unit_table(_UNIT_TABLE
, s
)
3648 if re
.match(r
'^[\d,.]+$', s
):
3649 return str_to_int(s
)
3660 return lookup_unit_table(_UNIT_TABLE
, s
)
3663 def parse_resolution(s
):
3667 mobj
= re
.search(r
'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s
)
3670 'width': int(mobj
.group('w')),
3671 'height': int(mobj
.group('h')),
3674 mobj
= re
.search(r
'\b(\d+)[pPiI]\b', s
)
3676 return {'height': int(mobj.group(1))}
3678 mobj
= re
.search(r
'\b([48])[kK]\b', s
)
3680 return {'height': int(mobj.group(1)) * 540}
3685 def parse_bitrate(s
):
3686 if not isinstance(s
, compat_str
):
3688 mobj
= re
.search(r
'\b(\d+)\s*kbps', s
)
3690 return int(mobj
.group(1))
3693 def month_by_name(name
, lang
='en'):
3694 """ Return the number of a month by (locale-independently) English name """
3696 month_names
= MONTH_NAMES
.get(lang
, MONTH_NAMES
['en'])
3699 return month_names
.index(name
) + 1
3704 def month_by_abbreviation(abbrev
):
3705 """ Return the number of a month by (locale-independently) English
3709 return [s
[:3] for s
in ENGLISH_MONTH_NAMES
].index(abbrev
) + 1
3714 def fix_xml_ampersands(xml_str
):
3715 """Replace all the '&' by '&' in XML"""
3717 r
'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
3722 def setproctitle(title
):
3723 assert isinstance(title
, compat_str
)
3725 # ctypes in Jython is not complete
3726 # http://bugs.jython.org/issue2148
3727 if sys
.platform
.startswith('java'):
3731 libc
= ctypes
.cdll
.LoadLibrary('libc.so.6')
3735 # LoadLibrary in Windows Python 2.7.13 only expects
3736 # a bytestring, but since unicode_literals turns
3737 # every string into a unicode string, it fails.
3739 title_bytes
= title
.encode('utf-8')
3740 buf
= ctypes
.create_string_buffer(len(title_bytes
))
3741 buf
.value
= title_bytes
3743 libc
.prctl(15, buf
, 0, 0, 0)
3744 except AttributeError:
3745 return # Strange libc, just skip this
def remove_start(s, start):
    """Return *s* with a leading *start* stripped when present; None passes through."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Return *s* with a trailing *end* stripped when present; None passes through.

    Guards against an empty *end*: without the `end and` check,
    `s.endswith('')` is True and `s[:-0]` evaluates to `s[:0]` (''),
    silently discarding the whole string.
    """
    return s[:-len(end)] if s is not None and end and s.endswith(end) else s
3756 def remove_quotes(s
):
3757 if s
is None or len(s
) < 2:
3759 for quote
in ('"', "'", ):
3760 if s
[0] == quote
and s
[-1] == quote
:
def get_domain(url):
    """Extract the bare domain from *url* (scheme and leading 'www.' ignored), or None."""
    mobj = re.match(
        r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if mobj:
        return mobj.group('domain')
    return None
def url_basename(url):
    """Return the last path segment of *url* ('' when the path is empty)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
.match(r
'https?://[^?#&]+/', url
).group()
3779 def urljoin(base
, path
):
3780 if isinstance(path
, bytes):
3781 path
= path
.decode('utf-8')
3782 if not isinstance(path
, compat_str
) or not path
:
3784 if re
.match(r
'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path
):
3786 if isinstance(base
, bytes):
3787 base
= base
.decode('utf-8')
3788 if not isinstance(base
, compat_str
) or not re
.match(
3789 r
'^(?:https?:)?//', base
):
3791 return compat_urlparse
.urljoin(base
, path
)
3794 class HEADRequest(compat_urllib_request
.Request
):
3795 def get_method(self
):
3799 class PUTRequest(compat_urllib_request
.Request
):
3800 def get_method(self
):
3804 def int_or_none(v
, scale
=1, default
=None, get_attr
=None, invscale
=1):
3807 v
= getattr(v
, get_attr
, None)
3813 return int(v
) * invscale
// scale
3814 except (ValueError, TypeError):
3818 def str_or_none(v
, default
=None):
3819 return default
if v
is None else compat_str(v
)
3822 def str_to_int(int_str
):
3823 """ A more relaxed version of int_or_none """
3824 if isinstance(int_str
, compat_integer_types
):
3826 elif isinstance(int_str
, compat_str
):
3827 int_str
= re
.sub(r
'[,\.\+]', '', int_str
)
3828 return int_or_none(int_str
)
3831 def float_or_none(v
, scale
=1, invscale
=1, default
=None):
3835 return float(v
) * invscale
/ scale
3836 except (ValueError, TypeError):
def bool_or_none(v, default=None):
    """Return *v* only when it is an actual bool; everything else yields *default*."""
    if isinstance(v, bool):
        return v
    return default
def strip_or_none(v, default=None):
    """Return v.strip() for string inputs; non-strings (incl. None) yield *default*."""
    if isinstance(v, compat_str):
        return v.strip()
    return default
3848 def url_or_none(url
):
3849 if not url
or not isinstance(url
, compat_str
):
3852 return url
if re
.match(r
'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url
) else None
3855 def strftime_or_none(timestamp
, date_format
, default
=None):
3856 datetime_object
= None
3858 if isinstance(timestamp
, compat_numeric_types
): # unix timestamp
3859 datetime_object
= datetime
.datetime
.utcfromtimestamp(timestamp
)
3860 elif isinstance(timestamp
, compat_str
): # assume YYYYMMDD
3861 datetime_object
= datetime
.datetime
.strptime(timestamp
, '%Y%m%d')
3862 return datetime_object
.strftime(date_format
)
3863 except (ValueError, TypeError, AttributeError):
3867 def parse_duration(s
):
3868 if not isinstance(s
, compat_basestring
):
3873 days
, hours
, mins
, secs
, ms
= [None] * 5
3874 m
= re
.match(r
'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s
)
3876 days
, hours
, mins
, secs
, ms
= m
.groups()
3881 [0-9]+\s*y(?:ears?)?\s*
3884 [0-9]+\s*m(?:onths?)?\s*
3887 [0-9]+\s*w(?:eeks?)?\s*
3890 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
3894 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
3897 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
3900 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
3903 days
, hours
, mins
, secs
, ms
= m
.groups()
3905 m
= re
.match(r
'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s
)
3907 hours
, mins
= m
.groups()
3913 duration
+= float(secs
)
3915 duration
+= float(mins
) * 60
3917 duration
+= float(hours
) * 60 * 60
3919 duration
+= float(days
) * 24 * 60 * 60
3921 duration
+= float(ms
)
3925 def prepend_extension(filename
, ext
, expected_real_ext
=None):
3926 name
, real_ext
= os
.path
.splitext(filename
)
3928 '{0}.{1}{2}'.format(name
, ext
, real_ext
)
3929 if not expected_real_ext
or real_ext
[1:] == expected_real_ext
3930 else '{0}.{1}'.format(filename
, ext
))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace *filename*'s extension with *ext*.

    When *expected_real_ext* is given and the current extension does not
    match it, *ext* is appended to the full original name instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return '{0}.{1}'.format(name, ext)
3940 def check_executable(exe
, args
=[]):
3941 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
3942 args can be a list of arguments for a short output (like -version) """
3944 process_communicate_or_kill(subprocess
.Popen(
3945 [exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
))
3951 def get_exe_version(exe
, args
=['--version'],
3952 version_re
=None, unrecognized
='present'):
3953 """ Returns the version of the specified executable,
3954 or False if the executable is not present """
3956 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
3957 # SIGTTOU if yt-dlp is run in the background.
3958 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
3959 out
, _
= process_communicate_or_kill(subprocess
.Popen(
3960 [encodeArgument(exe
)] + args
,
3961 stdin
=subprocess
.PIPE
,
3962 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
))
3965 if isinstance(out
, bytes): # Python 2.x
3966 out
= out
.decode('ascii', 'ignore')
3967 return detect_exe_version(out
, version_re
, unrecognized
)
3970 def detect_exe_version(output
, version_re
=None, unrecognized
='present'):
3971 assert isinstance(output
, compat_str
)
3972 if version_re
is None:
3973 version_re
= r
'version\s+([-0-9._a-zA-Z]+)'
3974 m
= re
.search(version_re
, output
)
3981 class LazyList(collections
.abc
.Sequence
):
3982 ''' Lazy immutable list from an iterable
3983 Note that slices of a LazyList are lists and not LazyList'''
3985 class IndexError(IndexError):
3988 def __init__(self
, iterable
):
3989 self
.__iterable
= iter(iterable
)
3991 self
.__reversed
= False
3995 # We need to consume the entire iterable to iterate in reverse
3996 yield from self
.exhaust()
3998 yield from self
.__cache
3999 for item
in self
.__iterable
:
4000 self
.__cache
.append(item
)
4003 def __exhaust(self
):
4004 self
.__cache
.extend(self
.__iterable
)
4008 ''' Evaluate the entire iterable '''
4009 return self
.__exhaust
()[::-1 if self
.__reversed
else 1]
4012 def __reverse_index(x
):
4013 return None if x
is None else -(x
+ 1)
4015 def __getitem__(self
, idx
):
4016 if isinstance(idx
, slice):
4018 idx
= slice(self
.__reverse
_index
(idx
.start
), self
.__reverse
_index
(idx
.stop
), -(idx
.step
or 1))
4019 start
, stop
, step
= idx
.start
, idx
.stop
, idx
.step
or 1
4020 elif isinstance(idx
, int):
4022 idx
= self
.__reverse
_index
(idx
)
4023 start
, stop
, step
= idx
, idx
, 0
4025 raise TypeError('indices must be integers or slices')
4026 if ((start
or 0) < 0 or (stop
or 0) < 0
4027 or (start
is None and step
< 0)
4028 or (stop
is None and step
> 0)):
4029 # We need to consume the entire iterable to be able to slice from the end
4030 # Obviously, never use this with infinite iterables
4033 return self
.__cache
[idx
]
4034 except IndexError as e
:
4035 raise self
.IndexError(e
) from e
4036 n
= max(start
or 0, stop
or 0) - len(self
.__cache
) + 1
4038 self
.__cache
.extend(itertools
.islice(self
.__iterable
, n
))
4040 return self
.__cache
[idx
]
4041 except IndexError as e
:
4042 raise self
.IndexError(e
) from e
4046 self
[-1] if self
.__reversed
else self
[0]
4047 except self
.IndexError:
4053 return len(self
.__cache
)
4056 self
.__reversed
= not self
.__reversed
4060 # repr and str should mimic a list. So we exhaust the iterable
4061 return repr(self
.exhaust())
4064 return repr(self
.exhaust())
4069 # This is only useful for tests
4070 return len(self
.getslice())
4072 def __init__(self
, pagefunc
, pagesize
, use_cache
=True):
4073 self
._pagefunc
= pagefunc
4074 self
._pagesize
= pagesize
4075 self
._use
_cache
= use_cache
4078 def getpage(self
, pagenum
):
4079 page_results
= self
._cache
.get(pagenum
) or list(self
._pagefunc
(pagenum
))
4081 self
._cache
[pagenum
] = page_results
4084 def getslice(self
, start
=0, end
=None):
4085 return list(self
._getslice
(start
, end
))
4087 def _getslice(self
, start
, end
):
4088 raise NotImplementedError('This method must be implemented by subclasses')
4090 def __getitem__(self
, idx
):
4091 # NOTE: cache must be enabled if this is used
4092 if not isinstance(idx
, int) or idx
< 0:
4093 raise TypeError('indices must be non-negative integers')
4094 entries
= self
.getslice(idx
, idx
+ 1)
4095 return entries
[0] if entries
else None
4098 class OnDemandPagedList(PagedList
):
4099 def _getslice(self
, start
, end
):
4100 for pagenum
in itertools
.count(start
// self
._pagesize
):
4101 firstid
= pagenum
* self
._pagesize
4102 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
4103 if start
>= nextfirstid
:
4107 start
% self
._pagesize
4108 if firstid
<= start
< nextfirstid
4111 ((end
- 1) % self
._pagesize
) + 1
4112 if (end
is not None and firstid
<= end
<= nextfirstid
)
4115 page_results
= self
.getpage(pagenum
)
4116 if startv
!= 0 or endv
is not None:
4117 page_results
= page_results
[startv
:endv
]
4118 yield from page_results
4120 # A little optimization - if current page is not "full", ie. does
4121 # not contain page_size videos then we can assume that this page
4122 # is the last one - there are no more ids on further pages -
4123 # i.e. no need to query again.
4124 if len(page_results
) + startv
< self
._pagesize
:
4127 # If we got the whole page, but the next page is not interesting,
4128 # break out early as well
4129 if end
== nextfirstid
:
4133 class InAdvancePagedList(PagedList
):
4134 def __init__(self
, pagefunc
, pagecount
, pagesize
):
4135 self
._pagecount
= pagecount
4136 PagedList
.__init
__(self
, pagefunc
, pagesize
, True)
4138 def _getslice(self
, start
, end
):
4139 start_page
= start
// self
._pagesize
4141 self
._pagecount
if end
is None else (end
// self
._pagesize
+ 1))
4142 skip_elems
= start
- start_page
* self
._pagesize
4143 only_more
= None if end
is None else end
- start
4144 for pagenum
in range(start_page
, end_page
):
4145 page_results
= self
.getpage(pagenum
)
4147 page_results
= page_results
[skip_elems
:]
4149 if only_more
is not None:
4150 if len(page_results
) < only_more
:
4151 only_more
-= len(page_results
)
4153 yield from page_results
[:only_more
]
4155 yield from page_results
def uppercase_escape(s):
    """Expand literal '\\UXXXXXXXX' escape sequences embedded in s."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Expand literal '\\uXXXX' escape sequences embedded in s."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() cannot handle unicode input, so pre-encode there
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The netloc is IDNA-encoded; every other component is percent-escaped
    return parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    ).geturl()
4194 return compat_parse_qs(compat_urllib_parse_urlparse(url
).query
)
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, one per line.

    BOM markers are stripped, and blank lines or lines starting with
    '#', ';' or ']' are treated as comments and dropped.  The file
    object is closed when done.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', url, 1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given query and return it as ASCII bytes, ready to POST."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
def update_url_query(url, query):
    """Return url with the key/value pairs from query merged into its query string."""
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Clone a urllib request, optionally overriding URL, body, headers and query.

    The request method is preserved by picking the matching request class
    (HEAD/PUT/plain Request), and a `timeout` attribute, if the original
    request carries one, is copied over as well.
    """
    # Fix: avoid mutable default arguments ({}), which are shared between
    # calls; treat None as "no extra values" instead (backward compatible).
    headers = headers or {}
    query = query or {}
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
def _multipart_encode_impl(data, boundary):
    """Serialize data (a dict) as multipart/form-data with the given boundary.

    Returns (payload_bytes, content_type).  Raises ValueError when the
    boundary string occurs inside one of the encoded parts.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary

    out = b''
    boundary_bytes = boundary.encode('ascii')
    for k, v in data.items():
        out += b'--' + boundary_bytes + b'\r\n'
        if isinstance(k, compat_str):
            k = k.encode('utf-8')
        if isinstance(v, compat_str):
            v = v.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        out += content

    out += b'--' + boundary_bytes + b'--\r\n'

    return out, content_type
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

        try:
            out, content_type = _multipart_encode_impl(data, boundary)
            break
        except ValueError:
            # A randomly generated boundary collided with the payload;
            # try another one.  A caller-supplied boundary is a hard error.
            if has_specified_boundary:
                raise
            boundary = None

    return out, content_type
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key from a list/tuple of keys.

    A candidate is skipped when missing, None, or (with skip_false_values)
    falsy; `default` is returned when nothing matches.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
def try_get(src, getter, expected_type=None):
    """Apply each getter callable to src and return the first result that
    neither raises a common lookup error nor fails the expected_type check.

    Returns None implicitly when no getter succeeds.
    """
    for get in variadic(getter):
        try:
            v = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            pass
        else:
            if expected_type is None or isinstance(v, expected_type):
                return v
def merge_dicts(*dicts):
    """Merge dicts left-to-right.

    None values never overwrite anything; a non-empty string does replace a
    previously merged empty string for the same key.
    """
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if v is None:
                continue
            if (k not in merged
                    or (isinstance(v, compat_str) and v
                        and isinstance(merged[k], compat_str)
                        and not merged[k])):
                merged[k] = v
    return merged
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string unchanged when it is already text; otherwise decode it
    with the given encoding/errors policy."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
4350 TV_PARENTAL_GUIDELINES
= {
def parse_age_limit(s):
    """Parse an age limit from an int, an 'NN+' string, a US movie rating
    or a TV parental guideline; return an int age or None."""
    # NOTE(review): the exact-type check (not isinstance) presumably rejects
    # bools on purpose -- confirm before changing
    if type(s) == int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if m:
        return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
    return None
def strip_jsonp(code):
    """Strip a JSONP callback wrapper from code, leaving the JSON payload.

    Input without a recognizable wrapper is returned unchanged.
    """
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
4387 def js_to_json(code
, vars={}):
4388 # vars is a dict of var, val pairs to substitute
4389 COMMENT_RE
= r
'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
4390 SKIP_RE
= r
'\s*(?:{comment})?\s*'.format(comment
=COMMENT_RE
)
4392 (r
'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip
=SKIP_RE
), 16),
4393 (r
'(?s)^(0+[0-7]+){skip}:?$'.format(skip
=SKIP_RE
), 8),
4398 if v
in ('true', 'false', 'null'):
4400 elif v
in ('undefined', 'void 0'):
4402 elif v
.startswith('/*') or v
.startswith('//') or v
.startswith('!') or v
== ',':
4405 if v
[0] in ("'", '"'):
4406 v
= re
.sub(r
'(?s)\\.|"', lambda m
: {
4411 }.get(m
.group(0), m
.group(0)), v
[1:-1])
4413 for regex
, base
in INTEGER_TABLE
:
4414 im
= re
.match(regex
, v
)
4416 i
= int(im
.group(1), base
)
4417 return '"%d":' % i
if v
.endswith(':') else '%d' % i
4424 return re
.sub(r
'''(?sx)
4425 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
4426 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4427 {comment}|,(?={skip}[\]}}])|
4428 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
4429 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
4432 '''.format(comment
=COMMENT_RE
, skip
=SKIP_RE
), fix_kv
, code
)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    # The returned callable ranks a quality id by its position in
    # quality_ids; unknown ids rank -1 (below every known quality).
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
4446 'default': '%(title)s [%(id)s].%(ext)s',
4447 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
4453 'description': 'description',
4454 'annotation': 'annotations.xml',
4455 'infojson': 'info.json',
4456 'pl_thumbnail': None,
4457 'pl_description': 'description',
4458 'pl_infojson': 'info.json',
4461 # As of [1] format syntax is:
4462 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
4463 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
4464 STR_FORMAT_RE_TMPL
= r
'''(?x)
4465 (?<!%)(?P<prefix>(?:%%)*)
4467 (?P<has_key>\((?P<key>{0})\))? # mapping key
4469 (?:[#0\-+ ]+)? # conversion flags (optional)
4470 (?:\d+)? # minimum field width (optional)
4471 (?:\.\d+)? # precision (optional)
4472 [hlL]? # length modifier (optional)
4473 {1} # conversion type
4478 STR_FORMAT_TYPES
= 'diouxXeEfFgGcrs'
def limit_length(s, length):
    """Truncate s to at most `length` characters, ending in '...' when cut;
    None passes through unchanged."""
    ellipses = '...'
    if s is None:
        return None
    if len(s) <= length:
        return s
    return s[:length - len(ellipses)] + ellipses
def version_tuple(v):
    """Split a version string on '.' or '-' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
def is_outdated_version(version, limit, assume_new=True):
    """Compare two version strings; when the input is empty or unparsable,
    fall back on assume_new (True => treat as up to date)."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
4504 def ytdl_is_updateable():
4505 """ Returns if yt-dlp can be updated with -U """
4508 from zipimport
import zipimporter
4510 return isinstance(globals().get('__loader__'), zipimporter
) or hasattr(sys
, 'frozen')
def args_to_str(args):
    """Build a short, shell-quoted string representation of a subprocess
    command line (for display only)."""
    quoted = (compat_shlex_quote(a) for a in args)
    return ' '.join(quoted)
def error_to_compat_str(err):
    """Return the message of an exception as a native text string."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
4527 def mimetype2ext(mt
):
4533 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
4534 # it's the most popular one
4535 'audio/mpeg': 'mp3',
4536 'audio/x-wav': 'wav',
4541 _
, _
, res
= mt
.rpartition('/')
4542 res
= res
.split(';')[0].strip().lower()
4546 'smptett+xml': 'tt',
4550 'x-mp4-fragmented': 'mp4',
4551 'x-ms-sami': 'sami',
4554 'x-mpegurl': 'm3u8',
4555 'vnd.apple.mpegurl': 'm3u8',
4559 'vnd.ms-sstr+xml': 'ism',
4566 def parse_codecs(codecs_str
):
4567 # http://tools.ietf.org/html/rfc6381
4570 split_codecs
= list(filter(None, map(
4571 str.strip
, codecs_str
.strip().strip(',').split(','))))
4572 vcodec
, acodec
= None, None
4573 for full_codec
in split_codecs
:
4574 codec
= full_codec
.split('.')[0]
4575 if codec
in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'):
4578 elif codec
in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
4582 write_string('WARNING: Unknown codec %s\n' % full_codec
, sys
.stderr
)
4583 if not vcodec
and not acodec
:
4584 if len(split_codecs
) == 2:
4586 'vcodec': split_codecs
[0],
4587 'acodec': split_codecs
[1],
4591 'vcodec': vcodec
or 'none',
4592 'acodec': acodec
or 'none',
4597 def urlhandle_detect_ext(url_handle
):
4598 getheader
= url_handle
.headers
.get
4600 cd
= getheader('Content-Disposition')
4602 m
= re
.match(r
'attachment;\s*filename="(?P<filename>[^"]+)"', cd
)
4604 e
= determine_ext(m
.group('filename'), default_ext
=None)
4608 return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Pack binary data into a base64 'data:' URI with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No viewer limit set, or no restriction on the content: never block
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Decode according to a leading BOM when present, else assume UTF-8
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
4645 def determine_protocol(info_dict
):
4646 protocol
= info_dict
.get('protocol')
4647 if protocol
is not None:
4650 url
= info_dict
['url']
4651 if url
.startswith('rtmp'):
4653 elif url
.startswith('mms'):
4655 elif url
.startswith('rtsp'):
4658 ext
= determine_ext(url
)
4664 return compat_urllib_parse_urlparse(url
).scheme
4667 def render_table(header_row
, data
, delim
=False, extraGap
=0, hideEmpty
=False):
4668 """ Render a list of rows, each as a list of values """
4670 def get_max_lens(table
):
4671 return [max(len(compat_str(v
)) for v
in col
) for col
in zip(*table
)]
4673 def filter_using_list(row
, filterArray
):
4674 return [col
for (take
, col
) in zip(filterArray
, row
) if take
]
4677 max_lens
= get_max_lens(data
)
4678 header_row
= filter_using_list(header_row
, max_lens
)
4679 data
= [filter_using_list(row
, max_lens
) for row
in data
]
4681 table
= [header_row
] + data
4682 max_lens
= get_max_lens(table
)
4684 table
= [header_row
] + [['-' * ml
for ml
in max_lens
]] + data
4685 format_str
= ' '.join('%-' + compat_str(ml
+ extraGap
) + 's' for ml
in max_lens
[:-1]) + ' %s'
4686 return '\n'.join(format_str
% tuple(row
) for row
in table
)
4689 def _match_one(filter_part
, dct
, incomplete
):
4690 # TODO: Generalize code with YoutubeDL._build_format_filter
4691 STRING_OPERATORS
= {
4692 '*=': operator
.contains
,
4693 '^=': lambda attr
, value
: attr
.startswith(value
),
4694 '$=': lambda attr
, value
: attr
.endswith(value
),
4695 '~=': lambda attr
, value
: re
.search(value
, attr
),
4697 COMPARISON_OPERATORS
= {
4699 '<=': operator
.le
, # "<=" must be defined above "<"
4706 operator_rex
= re
.compile(r
'''(?x)\s*
4708 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
4710 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
4711 (?P<quote>["\'])(?P
<quotedstrval
>.+?
)(?P
=quote
)|
4715 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
4716 m = operator_rex.search(filter_part)
4718 unnegated_op = COMPARISON_OPERATORS[m.group('op')]
4719 if m.group('negation'):
4720 op = lambda attr, value: not unnegated_op(attr, value)
4723 actual_value = dct.get(m.group('key'))
4724 if (m.group('quotedstrval') is not None
4725 or m.group('strval') is not None
4726 # If the original field is a string and matching comparisonvalue is
4727 # a number we should respect the origin of the original field
4728 # and process comparison value as a string (see
4729 # https://github.com/ytdl-org/youtube-dl/issues/11082).
4730 or actual_value is not None and m.group('intval') is not None
4731 and isinstance(actual_value, compat_str)):
4732 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
4733 quote = m.group('quote')
4734 if quote is not None:
4735 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
4737 if m.group('op') in STRING_OPERATORS:
4738 raise ValueError('Operator %s only supports string values!' % m.group('op'))
4740 comparison_value = int(m.group('intval'))
4742 comparison_value = parse_filesize(m.group('intval'))
4743 if comparison_value is None:
4744 comparison_value = parse_filesize(m.group('intval') + 'B')
4745 if comparison_value is None:
4747 'Invalid integer value %r in filter part %r' % (
4748 m.group('intval'), filter_part))
4749 if actual_value is None:
4750 return incomplete or m.group('none_inclusive')
4751 return op(actual_value, comparison_value)
4754 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
4755 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
4757 operator_rex = re.compile(r'''(?x
)\s
*
4758 (?P
<op
>%s)\s
*(?P
<key
>[a
-z_
]+)
4760 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
4761 m = operator_rex.search(filter_part)
4763 op = UNARY_OPERATORS[m.group('op')]
4764 actual_value = dct.get(m.group('key'))
4765 if incomplete and actual_value is None:
4767 return op(actual_value)
4769 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
        When incomplete, all conditions passes on missing fields
    """
    # Conditions are '&'-separated; a literal '&' is written as '\&'
    parts = re.split(r'(?<!\\)&', filter_str)
    return all(
        _match_one(part.replace(r'\&', '&'), dct, incomplete)
        for part in parts)
def match_filter_func(filter_str):
    """Build a match-filter callable: it returns None when the video passes
    the filter, or a human-readable skip message otherwise."""
    def _match_func(info_dict, *args, **kwargs):
        if match_str(filter_str, info_dict, *args, **kwargs):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float); returns None
    for empty or unrecognized input."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
4808 def dfxp2srt(dfxp_data):
4810 @param dfxp_data A
bytes-like
object containing DFXP data
4811 @returns A
unicode object containing converted SRT data
4813 LEGACY_NAMESPACES = (
4814 (b'http://www.w3.org/ns/ttml', [
4815 b'http://www.w3.org/2004/11/ttaf1',
4816 b'http://www.w3.org/2006/04/ttaf1',
4817 b'http://www.w3.org/2006/10/ttaf1',
4819 (b'http://www.w3.org/ns/ttml#styling', [
4820 b'http://www.w3.org/ns/ttml#style',
4824 SUPPORTED_STYLING = [
4833 _x = functools.partial(xpath_with_ns, ns_map={
4834 'xml': 'http://www.w3.org/XML/1998/namespace',
4835 'ttml': 'http://www.w3.org/ns/ttml',
4836 'tts': 'http://www.w3.org/ns/ttml#styling',
4842 class TTMLPElementParser(object):
4844 _unclosed_elements = []
4845 _applied_styles = []
4847 def start(self, tag, attrib):
4848 if tag in (_x('ttml:br'), 'br'):
4851 unclosed_elements = []
4853 element_style_id = attrib.get('style')
4855 style.update(default_style)
4856 if element_style_id:
4857 style.update(styles.get(element_style_id, {}))
4858 for prop in SUPPORTED_STYLING:
4859 prop_val = attrib.get(_x('tts:' + prop))
4861 style[prop] = prop_val
4864 for k, v in sorted(style.items()):
4865 if self._applied_styles and self._applied_styles[-1].get(k) == v:
4868 font += ' color="%s"' % v
4869 elif k == 'fontSize':
4870 font += ' size="%s"' % v
4871 elif k == 'fontFamily':
4872 font += ' face="%s"' % v
4873 elif k == 'fontWeight' and v == 'bold':
4875 unclosed_elements.append('b')
4876 elif k == 'fontStyle' and v == 'italic':
4878 unclosed_elements.append('i')
4879 elif k == 'textDecoration' and v == 'underline':
4881 unclosed_elements.append('u')
4883 self._out += '<font' + font + '>'
4884 unclosed_elements.append('font')
4886 if self._applied_styles:
4887 applied_style.update(self._applied_styles[-1])
4888 applied_style.update(style)
4889 self._applied_styles.append(applied_style)
4890 self._unclosed_elements.append(unclosed_elements)
4893 if tag not in (_x('ttml:br'), 'br'):
4894 unclosed_elements = self._unclosed_elements.pop()
4895 for element in reversed(unclosed_elements):
4896 self._out += '</%s>' % element
4897 if unclosed_elements and self._applied_styles:
4898 self._applied_styles.pop()
4900 def data(self, data):
4904 return self._out.strip()
4906 def parse_node(node):
4907 target = TTMLPElementParser()
4908 parser = xml.etree.ElementTree.XMLParser(target=target)
4909 parser.feed(xml.etree.ElementTree.tostring(node))
4910 return parser.close()
4912 for k, v in LEGACY_NAMESPACES:
4914 dfxp_data = dfxp_data.replace(ns, k)
4916 dfxp = compat_etree_fromstring(dfxp_data)
4918 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
4921 raise ValueError('Invalid dfxp/TTML subtitle')
4925 for style in dfxp.findall(_x('.//ttml:style')):
4926 style_id = style.get('id') or style.get(_x('xml:id'))
4929 parent_style_id = style.get('style')
4931 if parent_style_id not in styles:
4934 styles[style_id] = styles[parent_style_id].copy()
4935 for prop in SUPPORTED_STYLING:
4936 prop_val = style.get(_x('tts:' + prop))
4938 styles.setdefault(style_id, {})[prop] = prop_val
4944 for p in ('body', 'div'):
4945 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4948 style = styles.get(ele.get('style'))
4951 default_style.update(style)
4953 for para, index in zip(paras, itertools.count(1)):
4954 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
4955 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
4956 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4957 if begin_time is None:
4962 end_time = begin_time + dur
4963 out.append('%d\n%s --> %s\n%s\n\n' % (
4965 srt_subtitles_timecode(begin_time),
4966 srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Render `command_option value` for the named param, or [] when unset."""
    param = params.get(param)
    if param:
        param = compat_str(param)
    return [command_option, param] if param is not None else []
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as CLI arguments; with a separator the option
    and value are fused into one 'option<sep>value' token."""
    param = params.get(param)
    if param is None:
        return []
    assert isinstance(param, bool)
    value = true_value if param else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit a bare flag when the named param equals expected_value."""
    return [command_option] if params.get(param) == expected_value else []
4994 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
4995 if isinstance(argdict, (list, tuple)): # for backward compatibility
5002 assert isinstance(argdict, dict)
5004 assert isinstance(keys, (list, tuple))
5005 for key_list in keys:
5006 arg_list = list(filter(
5007 lambda x: x is not None,
5008 [argdict.get(key.lower()) for key in variadic(key_list)]))
5010 return [arg for args in arg_list for arg in args]
5014 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
5015 main_key, exe = main_key.lower(), exe.lower()
5016 root_key = exe if main_key == exe else f'{main_key}+{exe}'
5017 keys = [f'{root_key}{k}' for k in (keys or [''])]
5018 if root_key in keys:
5020 keys.append((main_key, exe))
5021 keys.append('default')
5024 return cli_configuration_args(argdict, keys, default, use_compat)
5027 class ISO639Utils(object):
5028 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
5087 'iw': 'heb', # Replaced by he in 1989 revision
5097 'in': 'ind', # Replaced by id in 1989 revision
5212 'ji': 'yid', # Replaced by yi in 1989 revision
5220 def short2long(cls, code):
5221 """Convert language code from ISO 639-1 to ISO 639-2/T"""
5222 return cls._lang_map.get(code[:2])
5225 def long2short(cls, code):
5226 """Convert language code from ISO 639-2/T to ISO 639-1"""
5227 for short_name, long_name in cls._lang_map.items():
5228 if long_name == code:
5232 class ISO3166Utils(object):
5233 # From http://data.okfn.org/data/core/country-list
5235 'AF': 'Afghanistan',
5236 'AX': 'Åland Islands',
5239 'AS': 'American Samoa',
5244 'AG': 'Antigua and Barbuda',
5261 'BO': 'Bolivia, Plurinational State of',
5262 'BQ': 'Bonaire, Sint Eustatius and Saba',
5263 'BA': 'Bosnia and Herzegovina',
5265 'BV': 'Bouvet Island',
5267 'IO': 'British Indian Ocean Territory',
5268 'BN': 'Brunei Darussalam',
5270 'BF': 'Burkina Faso',
5276 'KY': 'Cayman Islands',
5277 'CF': 'Central African Republic',
5281 'CX': 'Christmas Island',
5282 'CC': 'Cocos (Keeling) Islands',
5286 'CD': 'Congo, the Democratic Republic of the',
5287 'CK': 'Cook Islands',
5289 'CI': 'Côte d\'Ivoire',
5294 'CZ': 'Czech Republic',
5298 'DO': 'Dominican Republic',
5301 'SV': 'El Salvador',
5302 'GQ': 'Equatorial Guinea',
5306 'FK': 'Falkland Islands (Malvinas)',
5307 'FO': 'Faroe Islands',
5311 'GF': 'French Guiana',
5312 'PF': 'French Polynesia',
5313 'TF': 'French Southern Territories',
5328 'GW': 'Guinea-Bissau',
5331 'HM': 'Heard Island and McDonald Islands',
5332 'VA': 'Holy See (Vatican City State)',
5339 'IR': 'Iran, Islamic Republic of',
5342 'IM': 'Isle of Man',
5352 'KP': 'Korea, Democratic People\'s Republic of',
5353 'KR': 'Korea, Republic of',
5356 'LA': 'Lao People\'s Democratic Republic',
5362 'LI': 'Liechtenstein',
5366 'MK': 'Macedonia, the Former Yugoslav Republic of',
5373 'MH': 'Marshall Islands',
5379 'FM': 'Micronesia, Federated States of',
5380 'MD': 'Moldova, Republic of',
5391 'NL': 'Netherlands',
5392 'NC': 'New Caledonia',
5393 'NZ': 'New Zealand',
5398 'NF': 'Norfolk Island',
5399 'MP': 'Northern Mariana Islands',
5404 'PS': 'Palestine, State of',
5406 'PG': 'Papua New Guinea',
5409 'PH': 'Philippines',
5413 'PR': 'Puerto Rico',
5417 'RU': 'Russian Federation',
5419 'BL': 'Saint Barthélemy',
5420 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
5421 'KN': 'Saint Kitts and Nevis',
5422 'LC': 'Saint Lucia',
5423 'MF': 'Saint Martin (French part)',
5424 'PM': 'Saint Pierre and Miquelon',
5425 'VC': 'Saint Vincent and the Grenadines',
5428 'ST': 'Sao Tome and Principe',
5429 'SA': 'Saudi Arabia',
5433 'SL': 'Sierra Leone',
5435 'SX': 'Sint Maarten (Dutch part)',
5438 'SB': 'Solomon Islands',
5440 'ZA': 'South Africa',
5441 'GS': 'South Georgia and the South Sandwich Islands',
5442 'SS': 'South Sudan',
5447 'SJ': 'Svalbard and Jan Mayen',
5450 'CH': 'Switzerland',
5451 'SY': 'Syrian Arab Republic',
5452 'TW': 'Taiwan, Province of China',
5454 'TZ': 'Tanzania, United Republic of',
5456 'TL': 'Timor-Leste',
5460 'TT': 'Trinidad and Tobago',
5463 'TM': 'Turkmenistan',
5464 'TC': 'Turks and Caicos Islands',
5468 'AE': 'United Arab Emirates',
5469 'GB': 'United Kingdom',
5470 'US': 'United States',
5471 'UM': 'United States Minor Outlying Islands',
5475 'VE': 'Venezuela, Bolivarian Republic of',
5477 'VG': 'Virgin Islands, British',
5478 'VI': 'Virgin Islands, U.S.',
5479 'WF': 'Wallis and Futuna',
5480 'EH': 'Western Sahara',
5487 def short2full(cls, code):
5488 """Convert an ISO 3166-2 country code to the corresponding full name"""
5489 return cls._country_map.get(code.upper())
5492 class GeoUtils(object):
5493 # Major IPv4 address blocks per country
5495 'AD': '46.172.224.0/19',
5496 'AE': '94.200.0.0/13',
5497 'AF': '149.54.0.0/17',
5498 'AG': '209.59.64.0/18',
5499 'AI': '204.14.248.0/21',
5500 'AL': '46.99.0.0/16',
5501 'AM': '46.70.0.0/15',
5502 'AO': '105.168.0.0/13',
5503 'AP': '182.50.184.0/21',
5504 'AQ': '23.154.160.0/24',
5505 'AR': '181.0.0.0/12',
5506 'AS': '202.70.112.0/20',
5507 'AT': '77.116.0.0/14',
5508 'AU': '1.128.0.0/11',
5509 'AW': '181.41.0.0/18',
5510 'AX': '185.217.4.0/22',
5511 'AZ': '5.197.0.0/16',
5512 'BA': '31.176.128.0/17',
5513 'BB': '65.48.128.0/17',
5514 'BD': '114.130.0.0/16',
5516 'BF': '102.178.0.0/15',
5517 'BG': '95.42.0.0/15',
5518 'BH': '37.131.0.0/17',
5519 'BI': '154.117.192.0/18',
5520 'BJ': '137.255.0.0/16',
5521 'BL': '185.212.72.0/23',
5522 'BM': '196.12.64.0/18',
5523 'BN': '156.31.0.0/16',
5524 'BO': '161.56.0.0/16',
5525 'BQ': '161.0.80.0/20',
5526 'BR': '191.128.0.0/12',
5527 'BS': '24.51.64.0/18',
5528 'BT': '119.2.96.0/19',
5529 'BW': '168.167.0.0/16',
5530 'BY': '178.120.0.0/13',
5531 'BZ': '179.42.192.0/18',
5532 'CA': '99.224.0.0/11',
5533 'CD': '41.243.0.0/16',
5534 'CF': '197.242.176.0/21',
5535 'CG': '160.113.0.0/16',
5536 'CH': '85.0.0.0/13',
5537 'CI': '102.136.0.0/14',
5538 'CK': '202.65.32.0/19',
5539 'CL': '152.172.0.0/14',
5540 'CM': '102.244.0.0/14',
5541 'CN': '36.128.0.0/10',
5542 'CO': '181.240.0.0/12',
5543 'CR': '201.192.0.0/12',
5544 'CU': '152.206.0.0/15',
5545 'CV': '165.90.96.0/19',
5546 'CW': '190.88.128.0/17',
5547 'CY': '31.153.0.0/16',
5548 'CZ': '88.100.0.0/14',
5550 'DJ': '197.241.0.0/17',
5551 'DK': '87.48.0.0/12',
5552 'DM': '192.243.48.0/20',
5553 'DO': '152.166.0.0/15',
5554 'DZ': '41.96.0.0/12',
5555 'EC': '186.68.0.0/15',
5556 'EE': '90.190.0.0/15',
5557 'EG': '156.160.0.0/11',
5558 'ER': '196.200.96.0/20',
5559 'ES': '88.0.0.0/11',
5560 'ET': '196.188.0.0/14',
5561 'EU': '2.16.0.0/13',
5562 'FI': '91.152.0.0/13',
5563 'FJ': '144.120.0.0/16',
5564 'FK': '80.73.208.0/21',
5565 'FM': '119.252.112.0/20',
5566 'FO': '88.85.32.0/19',
5568 'GA': '41.158.0.0/15',
5570 'GD': '74.122.88.0/21',
5571 'GE': '31.146.0.0/16',
5572 'GF': '161.22.64.0/18',
5573 'GG': '62.68.160.0/19',
5574 'GH': '154.160.0.0/12',
5575 'GI': '95.164.0.0/16',
5576 'GL': '88.83.0.0/19',
5577 'GM': '160.182.0.0/15',
5578 'GN': '197.149.192.0/18',
5579 'GP': '104.250.0.0/19',
5580 'GQ': '105.235.224.0/20',
5581 'GR': '94.64.0.0/13',
5582 'GT': '168.234.0.0/16',
5583 'GU': '168.123.0.0/16',
5584 'GW': '197.214.80.0/20',
5585 'GY': '181.41.64.0/18',
5586 'HK': '113.252.0.0/14',
5587 'HN': '181.210.0.0/16',
5588 'HR': '93.136.0.0/13',
5589 'HT': '148.102.128.0/17',
5590 'HU': '84.0.0.0/14',
5591 'ID': '39.192.0.0/10',
5592 'IE': '87.32.0.0/12',
5593 'IL': '79.176.0.0/13',
5594 'IM': '5.62.80.0/20',
5595 'IN': '117.192.0.0/10',
5596 'IO': '203.83.48.0/21',
5597 'IQ': '37.236.0.0/14',
5598 'IR': '2.176.0.0/12',
5599 'IS': '82.221.0.0/16',
5600 'IT': '79.0.0.0/10',
5601 'JE': '87.244.64.0/18',
5602 'JM': '72.27.0.0/17',
5603 'JO': '176.29.0.0/16',
5604 'JP': '133.0.0.0/8',
5605 'KE': '105.48.0.0/12',
5606 'KG': '158.181.128.0/17',
5607 'KH': '36.37.128.0/17',
5608 'KI': '103.25.140.0/22',
5609 'KM': '197.255.224.0/20',
5610 'KN': '198.167.192.0/19',
5611 'KP': '175.45.176.0/22',
5612 'KR': '175.192.0.0/10',
5613 'KW': '37.36.0.0/14',
5614 'KY': '64.96.0.0/15',
5615 'KZ': '2.72.0.0/13',
5616 'LA': '115.84.64.0/18',
5617 'LB': '178.135.0.0/16',
5618 'LC': '24.92.144.0/20',
5619 'LI': '82.117.0.0/19',
5620 'LK': '112.134.0.0/15',
5621 'LR': '102.183.0.0/16',
5622 'LS': '129.232.0.0/17',
5623 'LT': '78.56.0.0/13',
5624 'LU': '188.42.0.0/16',
5625 'LV': '46.109.0.0/16',
5626 'LY': '41.252.0.0/14',
5627 'MA': '105.128.0.0/11',
5628 'MC': '88.209.64.0/18',
5629 'MD': '37.246.0.0/16',
5630 'ME': '178.175.0.0/17',
5631 'MF': '74.112.232.0/21',
5632 'MG': '154.126.0.0/17',
5633 'MH': '117.103.88.0/21',
5634 'MK': '77.28.0.0/15',
5635 'ML': '154.118.128.0/18',
5636 'MM': '37.111.0.0/17',
5637 'MN': '49.0.128.0/17',
5638 'MO': '60.246.0.0/16',
5639 'MP': '202.88.64.0/20',
5640 'MQ': '109.203.224.0/19',
5641 'MR': '41.188.64.0/18',
5642 'MS': '208.90.112.0/22',
5643 'MT': '46.11.0.0/16',
5644 'MU': '105.16.0.0/12',
5645 'MV': '27.114.128.0/18',
5646 'MW': '102.70.0.0/15',
5647 'MX': '187.192.0.0/11',
5648 'MY': '175.136.0.0/13',
5649 'MZ': '197.218.0.0/15',
5650 'NA': '41.182.0.0/16',
5651 'NC': '101.101.0.0/18',
5652 'NE': '197.214.0.0/18',
5653 'NF': '203.17.240.0/22',
5654 'NG': '105.112.0.0/12',
5655 'NI': '186.76.0.0/15',
5656 'NL': '145.96.0.0/11',
5657 'NO': '84.208.0.0/13',
5658 'NP': '36.252.0.0/15',
5659 'NR': '203.98.224.0/19',
5660 'NU': '49.156.48.0/22',
5661 'NZ': '49.224.0.0/14',
5662 'OM': '5.36.0.0/15',
5663 'PA': '186.72.0.0/15',
5664 'PE': '186.160.0.0/14',
5665 'PF': '123.50.64.0/18',
5666 'PG': '124.240.192.0/19',
5667 'PH': '49.144.0.0/13',
5668 'PK': '39.32.0.0/11',
5669 'PL': '83.0.0.0/11',
5670 'PM': '70.36.0.0/20',
5671 'PR': '66.50.0.0/16',
5672 'PS': '188.161.0.0/16',
5673 'PT': '85.240.0.0/13',
5674 'PW': '202.124.224.0/20',
5675 'PY': '181.120.0.0/14',
5676 'QA': '37.210.0.0/15',
5677 'RE': '102.35.0.0/16',
5678 'RO': '79.112.0.0/13',
5679 'RS': '93.86.0.0/15',
5680 'RU': '5.136.0.0/13',
5681 'RW': '41.186.0.0/16',
5682 'SA': '188.48.0.0/13',
5683 'SB': '202.1.160.0/19',
5684 'SC': '154.192.0.0/11',
5685 'SD': '102.120.0.0/13',
5686 'SE': '78.64.0.0/12',
5687 'SG': '8.128.0.0/10',
5688 'SI': '188.196.0.0/14',
5689 'SK': '78.98.0.0/15',
5690 'SL': '102.143.0.0/17',
5691 'SM': '89.186.32.0/19',
5692 'SN': '41.82.0.0/15',
5693 'SO': '154.115.192.0/18',
5694 'SR': '186.179.128.0/17',
5695 'SS': '105.235.208.0/21',
5696 'ST': '197.159.160.0/19',
5697 'SV': '168.243.0.0/16',
5698 'SX': '190.102.0.0/20',
5700 'SZ': '41.84.224.0/19',
5701 'TC': '65.255.48.0/20',
5702 'TD': '154.68.128.0/19',
5703 'TG': '196.168.0.0/14',
5704 'TH': '171.96.0.0/13',
5705 'TJ': '85.9.128.0/18',
5706 'TK': '27.96.24.0/21',
5707 'TL': '180.189.160.0/20',
5708 'TM': '95.85.96.0/19',
5709 'TN': '197.0.0.0/11',
5710 'TO': '175.176.144.0/21',
5711 'TR': '78.160.0.0/11',
5712 'TT': '186.44.0.0/15',
5713 'TV': '202.2.96.0/19',
5714 'TW': '120.96.0.0/11',
5715 'TZ': '156.156.0.0/14',
5716 'UA': '37.52.0.0/14',
5717 'UG': '102.80.0.0/13',
5719 'UY': '167.56.0.0/13',
5720 'UZ': '84.54.64.0/18',
5721 'VA': '212.77.0.0/19',
5722 'VC': '207.191.240.0/21',
5723 'VE': '186.88.0.0/13',
5724 'VG': '66.81.192.0/20',
5725 'VI': '146.226.0.0/16',
5726 'VN': '14.160.0.0/11',
5727 'VU': '202.80.32.0/20',
5728 'WF': '117.20.32.0/21',
5729 'WS': '202.4.32.0/19',
5730 'YE': '134.35.0.0/16',
5731 'YT': '41.242.116.0/22',
5732 'ZA': '41.0.0.0/11',
5733 'ZM': '102.144.0.0/13',
5734 'ZW': '102.177.192.0/18',
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address (as str) drawn from a CIDR block.

        code_or_block: a two-letter country code (looked up in
        cls._country_ip_map) or an explicit 'a.b.c.d/prefixlen' string.
        Returns None when a two-letter code has no known block.
        """
        if len(code_or_block) == 2:
            # Two characters: treat as an ISO country code.
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        addr, preflen = block.split('/')
        # Base address as a 32-bit big-endian integer ...
        addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
        # ... and the top of the block: set every host bit.
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return compat_str(socket.inet_ntoa(
            compat_struct_pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request proxy override passed via
    the internal 'Ytdl-request-proxy' header (stripped before sending)."""

    def __init__(self, proxies=None):
        # Set default handlers: route http/https through proxy_open with a
        # '__noproxy__' sentinel so unconfigured schemes pass through.
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy, if present, overrides the handler default.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do the actual socket wrapping
            # for SOCKS; nothing more to do here.
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
# released into Public Domain
# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387


def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a (non-negative) integer to a big-endian byte string.

    If optional blocksize is given and greater than zero, pad the front of
    the byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    # Minimal big-endian length; n <= 0 degenerates to a single zero byte,
    # matching the historical struct-pack/strip-zeros implementation.
    if n <= 0:
        n, length = 0, 1
    else:
        length = (n.bit_length() + 7) // 8
    if blocksize > 0 and length % blocksize:
        length += blocksize - length % blocksize
    # int.to_bytes replaces the original 32-bit struct.pack loop.
    return n.to_bytes(length, 'big')
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to an integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # int.from_bytes replaces the original zero-pad + 4-byte unpack loop;
    # an empty input still yields 0.
    return int.from_bytes(s, 'big')
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # Reversing the bytes before hexlify interprets `data` little-endian.
    payload = int(binascii.hexlify(data[::-1]), 16)
    return format(pow(payload, exponent, modulus), 'x')
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data     input data
    @param {int} length     target length
    @returns {int[]}        padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # PKCS#1 v1.5 (RFC 8017): EM = 0x00 || 0x02 || PS || 0x00 || M, where the
    # padding string PS must consist of NON-zero random octets. The previous
    # randint(0, 254) could emit zero bytes, prematurely terminating the
    # padding on decryption; randint(1, 255) matches the spec.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
def encode_base_n(num, n, table=None):
    """Render the non-negative integer *num* in base *n*.

    Digits are taken from *table* (default: the first *n* characters of
    0-9a-zA-Z); raises ValueError when the table is too short for the base.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    digits = []
    while num:
        num, rem = divmod(num, n)
        digits.append(table[rem])
    return ''.join(reversed(digits))
def decode_packed_codes(code):
    """Decode Dean Edwards' p.a.c.k.e.r.-style obfuscated JavaScript.

    The packed source (matched by module-level PACKED_CODES_RE) contains an
    obfuscated body plus a '|'-separated symbol list; every word token in
    the body is an index (encoded in the packer's base) into that list.
    """
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        # Key is `count` rendered in the packer's base; an empty symbol
        # entry means the token stands for itself.
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

    # Replace every word token in the body with its symbol-table entry.
    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
def caesar(s, alphabet, shift):
    """Rotate every character of *s* that occurs in *alphabet* by *shift*
    positions (wrapping around); characters outside *alphabet* pass
    through unchanged."""
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(ch):
        pos = alphabet.find(ch)
        return ch if pos < 0 else alphabet[(pos + shift) % size]

    return ''.join(rotate(ch) for ch in s)


def rot47(s):
    """Apply the ROT47 cipher (caesar shift of 47 over the 94 printable
    ASCII characters '!'..'~')."""
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=value,KEY="quoted,value",...')
    into a dict, stripping the surrounding quotes from quoted values."""
    return {
        key: val[1:-1] if val.startswith('"') else val
        for key, val in re.findall(
            r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    }
def urshift(val, n):
    """Unsigned 32-bit right shift (JavaScript's '>>>'): negative values
    are first mapped to their two's-complement 32-bit representation."""
    if val < 0:
        val += 0x100000000
    return val >> n
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG bytes into (width, height, pixels), where pixels is a
    list of rows, each row a flat list of channel byte values (stride is
    width * 3, so the image is assumed 8-bit RGB, non-interlaced —
    TODO confirm with callers).
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    # Validate the fixed 8-byte PNG signature and that the first chunk is IHDR.
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Walk the chunk stream: each chunk is length(4) + type(4) + data + CRC(4).
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # IHDR is guaranteed (checked above) to be the first chunk.
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Image data may be split across multiple IDAT chunks; concatenate them.
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3  # bytes per scanline (3 bytes per pixel)
    pixels = []

    def _get_pixel(idx):
        # Look up an already-reconstructed byte by flat index.
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Every scanline is prefixed with a 1-byte filter type.
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Filter neighbours: 'left' is the same channel of the previous
            # pixel (3 bytes back), 'up' the same channel one scanline above.
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Paeth predictor: pick the neighbour closest to p.
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    """Set extended attribute *key* (with bytes *value*) on file *path*.

    Tries, in order: the pyxattr/xattr Python modules, NTFS Alternate Data
    Streams on Windows, then the setfattr/xattr command-line tools.
    Raises XAttrMetadataError when setting fails and XAttrUnavailableError
    when no usable implementation is found.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'yt-dlp requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr module
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:
                # The CLI tools take the value as a text argument.
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)]
                       + [encodeArgument(o) for o in opts]
                       + [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = process_communicate_or_kill(p)
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
def random_birthday(year_field, month_field, day_field):
    """Return a dict holding a random birth date between 1950-01-01 and
    1995-12-31, with stringified year/month/day stored under the given
    key names."""
    first = datetime.date(1950, 1, 1)
    last = datetime.date(1995, 12, 31)
    span_days = (last - first).days
    birthday = first + datetime.timedelta(random.randint(0, span_days))
    return {
        year_field: str(birthday.year),
        month_field: str(birthday.month),
        day_field: str(birthday.day),
    }
6125 # Templates for internet shortcut files, which are plain text files.
6126 DOT_URL_LINK_TEMPLATE
= '''
6131 DOT_WEBLOC_LINK_TEMPLATE
= '''
6132 <?xml version="1.0" encoding="UTF-8"?>
6133 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
6134 <plist version="1.0">
6137 \t<string>%(url)s</string>
6142 DOT_DESKTOP_LINK_TEMPLATE
= '''
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
            net_location,

            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
def to_high_limit_path(path):
    """On Windows, return the absolute path prefixed with '\\\\?\\' to lift
    the MAX_PATH limitation; on other platforms return *path* unchanged."""
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # Work around MAX_PATH limitation on Windows. The maximum allowed
    # length for individual path segments may still be quite limited.
    # (A raw literal cannot end in a backslash, hence the rstrip trick.)
    return r'\\?\ '.rstrip() + os.path.abspath(path)
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    """Fetch a value (obj itself when *field* is None, else obj[field]) and
    render it through *template*; values listed in *ignore* yield *default*
    instead. An optional *func* transforms the value before formatting."""
    if field is None:
        val = default if obj is None else obj
    else:
        val = obj.get(field, default)
    if func and val not in ignore:
        val = func(val)
    if val in ignore:
        return default
    return template % val
def clean_podcast_url(url):
    """Strip well-known podcast measurement/tracking redirect prefixes
    (chartable, blubrry, podtrac, acast, podcorn, podsights) from *url*."""
    return re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/''', '', url)
6231 _HEX_TABLE
= '0123456789abcdef'
6234 def random_uuidv4():
6235 return re
.sub(r
'[xy]', lambda x
: _HEX_TABLE
[random
.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
def make_dir(path, to_screen=None):
    """Ensure the parent directory of *path* exists.

    Returns True on success (or when there is nothing to create), False on
    failure. On failure, the error is reported through *to_screen* when it
    is callable.
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except (OSError, IOError) as err:
        # BUG FIX: this was `if callable(to_screen) is not None:`, which is
        # always true (callable() returns a bool), so passing to_screen=None
        # crashed with TypeError on the error path.
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
def get_executable_path():
    """Return the absolute path of the directory the program runs from,
    accounting for PyInstaller bundles and zip imports."""
    from zipimport import zipimporter
    if hasattr(sys, 'frozen'):
        # PyInstaller bundle: everything lives next to the executable.
        base = os.path.dirname(sys.executable)
    elif isinstance(globals().get('__loader__'), zipimporter):
        # Imported from a zip archive: step out of the package and the zip.
        base = os.path.join(os.path.dirname(__file__), '../..')
    else:
        base = os.path.join(os.path.dirname(__file__), '..')
    return os.path.abspath(base)
def load_plugins(name, suffix, namespace):
    """Load plugin classes from a 'ytdlp_plugins' package next to the
    executable.

    Imports module *name* from the plugin directory, copies every attribute
    whose name ends with *suffix* (and does not already exist) into
    *namespace*, and returns the list of classes added. A missing plugin
    module is silently ignored.
    """
    plugin_info = [None]
    classes = []
    try:
        # NOTE: `imp` is deprecated in favour of importlib; kept as-is here.
        plugin_info = imp.find_module(
            name, [os.path.join(get_executable_path(), 'ytdlp_plugins')])
        plugins = imp.load_module(name, *plugin_info)
        for name in dir(plugins):
            if name in namespace:
                # Never overwrite an existing (built-in) name.
                continue
            if not name.endswith(suffix):
                continue
            klass = getattr(plugins, name)
            classes.append(klass)
            namespace[name] = klass
    except ImportError:
        pass
    finally:
        # imp.find_module returns an open file object as its first element;
        # make sure it is closed even on failure.
        if plugin_info[0] is not None:
            plugin_info[0].close()
    return classes
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a string,
                            a tuple of strings or "...". When a tuple is given,
                            all the keys given in the tuple are traversed, and
                            "..." traverses all the keys in the object
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    '''
    if not casesense:
        # Normalize both the dict keys (below) and the path keys to lowercase.
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # Walk one path through obj; _current_depth counts '...' expansions
        # so the caller knows how many nesting levels to flatten.
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            if obj is None:
                return None
            if isinstance(key, (list, tuple)):
                # Alternative keys: evaluate each and continue over the results.
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                # Wildcard: descend into every value of the current object.
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                # Dict lookup; fall back to a linear case-insensitive scan.
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    # User-supplied keys may be stringified ints or 'a:b:c' slices.
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    # Build the final-value filter from expected_type.
    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Wildcards produce nested lists: flatten down to one level,
                # dropping Nones, then apply the type filter.
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
def traverse_dict(dictn, keys, casesense=True):
    '''Deprecated alias of traverse_obj, kept for backward compatibility.
    Do not use.'''
    return traverse_obj(
        dictn, keys, casesense=casesense,
        is_user_input=True, traverse_string=True)
def variadic(x, allowed_types=(str, bytes)):
    """Return *x* itself when it is an iterable outside *allowed_types*;
    otherwise wrap it in a 1-tuple (so callers can always iterate)."""
    if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types):
        return x
    return (x,)
6378 def get_windows_version():
6379 ''' Get Windows version. None if it's not running on Windows '''
6380 if compat_os_name
== 'nt':
6381 return version_tuple(platform
.win32_ver()[1])