4 from __future__
import unicode_literals
37 import xml
.etree
.ElementTree
41 compat_HTMLParseError
,
47 compat_ctypes_WINFUNCTYPE
,
48 compat_etree_fromstring
,
51 compat_html_entities_html5
,
64 compat_urllib_parse_urlencode
,
65 compat_urllib_parse_urlparse
,
66 compat_urllib_parse_urlunparse
,
67 compat_urllib_parse_quote
,
68 compat_urllib_parse_quote_plus
,
69 compat_urllib_parse_unquote_plus
,
70 compat_urllib_request
,
def register_socks_protocols():
    """Teach urlparse that SOCKS schemes carry a network location.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose scheme is not listed in
    urlparse.uses_netloc are not handled correctly, so the SOCKS schemes
    are registered here before any proxy URL gets parsed.
    """
    for proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if proxy_scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(proxy_scheme)
# Type of a compiled regular expression object.  The stdlib does not expose
# it portably across all supported Python versions, so derive it from an
# instance.
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
94 def random_user_agent():
95 _USER_AGENT_TPL
= 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
1674 return _USER_AGENT_TPL
% random
.choice(_CHROME_VERSIONS
)
1678 'User-Agent': random_user_agent(),
1679 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
1680 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
1681 'Accept-Encoding': 'gzip, deflate',
1682 'Accept-Language': 'en-us,en;q=0.5',
1687 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# Unique sentinel meaning "caller supplied no default".  A bare object() is
# used so that any real value, including None, can be distinguished from it
# (see the xpath_* helpers, which compare with `default is not NO_DEFAULT`).
NO_DEFAULT = object()
# English month names in calendar order (index 0 = January); used for
# locale-aware date parsing.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
1698 'en': ENGLISH_MONTH_NAMES
,
1700 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
1701 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
1704 KNOWN_EXTENSIONS
= (
1705 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
1706 'flv', 'f4v', 'f4a', 'f4b',
1707 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
1708 'mkv', 'mka', 'mk3d',
1711 'asf', 'wmv', 'wma',
1717 'f4f', 'f4m', 'm3u8', 'smil')
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII transliteration (single char or
# multi-char string such as 'AE'/'ss'); zip pairs the key string with the
# flattened chain of replacements one-to-one.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
1745 '%Y/%m/%d %H:%M:%S',
1747 '%Y-%m-%d %H:%M:%S',
1748 '%Y-%m-%d %H:%M:%S.%f',
1749 '%Y-%m-%d %H:%M:%S:%f',
1752 '%Y-%m-%dT%H:%M:%SZ',
1753 '%Y-%m-%dT%H:%M:%S.%fZ',
1754 '%Y-%m-%dT%H:%M:%S.%f0Z',
1755 '%Y-%m-%dT%H:%M:%S',
1756 '%Y-%m-%dT%H:%M:%S.%f',
1758 '%b %d %Y at %H:%M',
1759 '%b %d %Y at %H:%M:%S',
1760 '%B %d %Y at %H:%M',
1761 '%B %d %Y at %H:%M:%S',
1764 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
1765 DATE_FORMATS_DAY_FIRST
.extend([
1771 '%d/%m/%Y %H:%M:%S',
1774 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
1775 DATE_FORMATS_MONTH_FIRST
.extend([
1780 '%m/%d/%Y %H:%M:%S',
1783 PACKED_CODES_RE
= r
"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
1784 JSON_LD_RE
= r
'(?is)<script[^>]+type=(["\']?
)application
/ld\
+json\
1[^
>]*>(?P
<json_ld
>.+?
)</script
>'
1787 def preferredencoding():
1788 """Get preferred encoding.
1790 Returns the best encoding scheme for the system, based on
1791 locale.getpreferredencoding() and some further tweaks.
1794 pref = locale.getpreferredencoding()
1802 def write_json_file(obj, fn):
1803 """ Encode obj as JSON and write it to fn, atomically if possible """
1805 fn = encodeFilename(fn)
1806 if sys.version_info < (3, 0) and sys.platform != 'win32
':
1807 encoding = get_filesystem_encoding()
1808 # os.path.basename returns a bytes object, but NamedTemporaryFile
1809 # will fail if the filename contains non ascii characters unless we
1810 # use a unicode object
1811 path_basename = lambda f: os.path.basename(fn).decode(encoding)
1812 # the same for os.path.dirname
1813 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
1815 path_basename = os.path.basename
1816 path_dirname = os.path.dirname
1820 'prefix
': path_basename(fn) + '.',
1821 'dir': path_dirname(fn),
1825 # In Python 2.x, json.dump expects a bytestream.
1826 # In Python 3.x, it writes to a character stream
1827 if sys.version_info < (3, 0):
1832 'encoding
': 'utf
-8',
1835 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
1839 json.dump(obj, tf, default=repr)
1840 if sys.platform == 'win32
':
1841 # Need to remove existing file on Windows, else os.rename raises
1842 # WindowsError or FileExistsError.
1850 os.chmod(tf.name, 0o666 & ~mask)
1853 os.rename(tf.name, fn)
1862 if sys.version_info >= (2, 7):
1863 def find_xpath_attr(node, xpath, key, val=None):
1864 """ Find the xpath xpath[@key=val] """
1865 assert re.match(r'^
[a
-zA
-Z_
-]+$
', key)
1866 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
1867 return node.find(expr)
1869 def find_xpath_attr(node, xpath, key, val=None):
1870 for f in node.findall(compat_xpath(xpath)):
1871 if key not in f.attrib:
1873 if val is None or f.attrib.get(key) == val:
1877 # On python2.6 the xml.etree.ElementTree.Element methods don't support
1878 # the namespace parameter
1881 def xpath_with_ns(path
, ns_map
):
1882 components
= [c
.split(':') for c
in path
.split('/')]
1884 for c
in components
:
1886 replaced
.append(c
[0])
1889 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
1890 return '/'.join(replaced
)
1893 def xpath_element(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1894 def _find_xpath(xpath
):
1895 return node
.find(compat_xpath(xpath
))
1897 if isinstance(xpath
, (str, compat_str
)):
1898 n
= _find_xpath(xpath
)
1906 if default
is not NO_DEFAULT
:
1909 name
= xpath
if name
is None else name
1910 raise ExtractorError('Could not find XML element %s' % name
)
1916 def xpath_text(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1917 n
= xpath_element(node
, xpath
, name
, fatal
=fatal
, default
=default
)
1918 if n
is None or n
== default
:
1921 if default
is not NO_DEFAULT
:
1924 name
= xpath
if name
is None else name
1925 raise ExtractorError('Could not find XML element\'s text %s' % name
)
1931 def xpath_attr(node
, xpath
, key
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1932 n
= find_xpath_attr(node
, xpath
, key
)
1934 if default
is not NO_DEFAULT
:
1937 name
= '%s[@%s]' % (xpath
, key
) if name
is None else name
1938 raise ExtractorError('Could not find XML attribute %s' % name
)
1941 return n
.attrib
[key
]
def get_element_by_id(id, html):
    """Return the inner content of the tag carrying the given ID attribute.

    Thin wrapper over the generic attribute-based lookup; returns None when
    no such tag exists in the HTML document.
    """
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class.

    Returns None when no tag in the HTML document matches.
    """
    matches = get_elements_by_class(class_name, html)
    if matches:
        return matches[0]
    return None
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag whose attribute equals value.

    When escape_value is True the value is regex-escaped before matching;
    returns None when nothing matches.
    """
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if matches:
        return matches[0]
    return None
def get_elements_by_class(class_name, html):
    """Return the contents of all tags with the specified class as a list."""
    # Match class_name as a whole word anywhere inside the (possibly
    # multi-valued) class attribute.  The pattern is already a regex, so
    # tell the attribute matcher not to escape it again.
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_pattern, html, escape_value=False)
1967 def get_elements_by_attribute(attribute, value, html, escape_value=True):
1968 """Return the content of the tag with the specified attribute in the passed HTML document"""
1970 value = re.escape(value) if escape_value else value
1973 for m in re.finditer(r'''(?xs)
1975 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^
']*'|
))*?
1977 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^
"]*"|
='[^']*'|))*?
1981 ''' % (re.escape(attribute), value), html):
1982 res = m.group('content
')
1984 if res.startswith('"') or res.startswith("'"):
1987 retlist.append(unescapeHTML(res))
1992 class HTMLAttributeParser(compat_HTMLParser):
1993 """Trivial HTML parser to gather the attributes for a single element"""
1997 compat_HTMLParser.__init__(self)
1999 def handle_starttag(self, tag, attrs):
2000 self.attrs = dict(attrs)
2003 def extract_attributes(html_element):
2004 """Given a string for an HTML element such as
2006 a="foo" B="bar" c="&98;az" d=boz
2007 empty= noval entity="&"
2010 Decode and return a dictionary of attributes.
2012 'a
': 'foo
', 'b
': 'bar
', c: 'baz
', d: 'boz
',
2013 'empty
': '', 'noval
': None, 'entity
': '&',
2014 'sq
': '"', 'dq': '\''
2016 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
2017 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
2019 parser = HTMLAttributeParser()
2021 parser.feed(html_element)
2023 # Older Python may throw HTMLParseError in case of malformed HTML
2024 except compat_HTMLParseError:
2029 def clean_html(html):
2030 """Clean an HTML snippet into a readable string"""
2032 if html is None: # Convenience for sanitizing descriptions etc.
2036 html = html.replace('\n', ' ')
2037 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
2038 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
2040 html = re.sub('<.*?>', '', html)
2041 # Replace html entities
2042 html = unescapeHTML(html)
2046 def sanitize_open(filename, open_mode):
2047 """Try to open the given filename, and slightly tweak it if this fails.
2049 Attempts to open the given filename. If this fails, it tries to change
2050 the filename slightly, step by step, until it's either able to open it
2051 or it fails and raises a final exception, like the standard open()
2054 It returns the tuple (stream, definitive_file_name).
2058 if sys.platform == 'win32':
2060 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
2061 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
2062 stream = open(encodeFilename(filename), open_mode)
2063 return (stream, filename)
2064 except (IOError, OSError) as err:
2065 if err.errno in (errno.EACCES,):
2068 # In case of error, try to remove win32 forbidden chars
2069 alt_filename = sanitize_path(filename)
2070 if alt_filename == filename:
2073 # An exception here should be caught in the caller
2074 stream = open(encodeFilename(alt_filename), open_mode)
2075 return (stream, alt_filename)
2078 def timeconvert(timestr):
2079 """Convert RFC 2822 defined time string into system timestamp"""
2081 timetuple = email.utils.parsedate_tz(timestr)
2082 if timetuple is not None:
2083 timestamp = email.utils.mktime_tz(timetuple)
2087 def sanitize_filename(s, restricted=False, is_id=False):
2088 """Sanitizes a string so it could be used as part of a filename.
2089 If restricted is set, use a stricter subset of allowed characters.
2090 Set is_id if this is not an arbitrary string, but an ID that should be kept
2093 def replace_insane(char):
2094 if restricted and char in ACCENT_CHARS:
2095 return ACCENT_CHARS[char]
2096 if char == '?' or ord(char) < 32 or ord(char) == 127:
2099 return '' if restricted else '\''
2101 return '_
-' if restricted else ' -'
2102 elif char in '\\/|
*<>':
2104 if restricted and (char in '!&\'()[]{}$
;`^
,#' or char.isspace()):
2106 if restricted
and ord(char
) > 127:
2113 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
2114 result
= ''.join(map(replace_insane
, s
))
2116 while '__' in result
:
2117 result
= result
.replace('__', '_')
2118 result
= result
.strip('_')
2119 # Common case of "Foreign band name - English song title"
2120 if restricted
and result
.startswith('-_'):
2122 if result
.startswith('-'):
2123 result
= '_' + result
[len('-'):]
2124 result
= result
.lstrip('.')
2130 def sanitize_path(s
, force
=False):
2131 """Sanitizes and normalizes path on Windows"""
2132 if sys
.platform
== 'win32':
2134 drive_or_unc
, _
= os
.path
.splitdrive(s
)
2135 if sys
.version_info
< (2, 7) and not drive_or_unc
:
2136 drive_or_unc
, _
= os
.path
.splitunc(s
)
2142 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
2146 path_part
if path_part
in ['.', '..'] else re
.sub(r
'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part
)
2147 for path_part
in norm_path
]
2149 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
2150 elif force
and s
[0] == os
.path
.sep
:
2151 sanitized_path
.insert(0, os
.path
.sep
)
2152 return os
.path
.join(*sanitized_path
)
2155 def sanitize_url(url
):
2156 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
2157 # the number of unwanted failures due to missing protocol
2158 if url
.startswith('//'):
2159 return 'http:%s' % url
2160 # Fix some common typos seen so far
2162 # https://github.com/ytdl-org/youtube-dl/issues/15649
2163 (r
'^httpss://', r
'https://'),
2164 # https://bx1.be/lives/direct-tv/
2165 (r
'^rmtp([es]?)://', r
'rtmp\1://'),
2167 for mistake
, fixup
in COMMON_TYPOS
:
2168 if re
.match(mistake
, url
):
2169 return re
.sub(mistake
, fixup
, url
)
2173 def extract_basic_auth(url
):
2174 parts
= compat_urlparse
.urlsplit(url
)
2175 if parts
.username
is None:
2177 url
= compat_urlparse
.urlunsplit(parts
._replace
(netloc
=(
2178 parts
.hostname
if parts
.port
is None
2179 else '%s:%d' % (parts
.hostname
, parts
.port
))))
2180 auth_payload
= base64
.b64encode(
2181 ('%s:%s' % (parts
.username
, parts
.password
or '')).encode('utf-8'))
2182 return url
, 'Basic ' + auth_payload
.decode('utf-8')
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request for url after sanitizing and escaping it.

    Credentials embedded in the URL (user:pass@host) are stripped out and
    re-injected as a Basic Authorization header, placed into the positional
    headers argument when one was given, otherwise into kwargs['headers'].
    """
    clean_url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        if len(args) >= 2:
            headers = args[1]
        else:
            headers = kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(clean_url, *args, **kwargs)
2194 """Expand shell variables and ~"""
2195 return os
.path
.expandvars(compat_expanduser(s
))
2198 def orderedSet(iterable
):
2199 """ Remove all duplicates from the input iterable """
2207 def _htmlentity_transform(entity_with_semicolon
):
2208 """Transforms an HTML entity to a character."""
2209 entity
= entity_with_semicolon
[:-1]
2211 # Known non-numeric HTML entity
2212 if entity
in compat_html_entities
.name2codepoint
:
2213 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
2215 # TODO: HTML5 allows entities without a semicolon. For example,
2216 # 'Éric' should be decoded as 'Éric'.
2217 if entity_with_semicolon
in compat_html_entities_html5
:
2218 return compat_html_entities_html5
[entity_with_semicolon
]
2220 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
2221 if mobj
is not None:
2222 numstr
= mobj
.group(1)
2223 if numstr
.startswith('x'):
2225 numstr
= '0%s' % numstr
2228 # See https://github.com/ytdl-org/youtube-dl/issues/7518
2230 return compat_chr(int(numstr
, base
))
2234 # Unknown entity in name, return its literal representation
2235 return '&%s;' % entity
2238 def unescapeHTML(s
):
2241 assert type(s
) == compat_str
2244 r
'&([^&;]+;)', lambda m
: _htmlentity_transform(m
.group(1)), s
)
2247 def escapeHTML(text
):
2250 .replace('&', '&')
2251 .replace('<', '<')
2252 .replace('>', '>')
2253 .replace('"', '"')
2254 .replace("'", ''')
2258 def process_communicate_or_kill(p
, *args
, **kwargs
):
2260 return p
.communicate(*args
, **kwargs
)
2261 except BaseException
: # Including KeyboardInterrupt
2267 def get_subprocess_encoding():
2268 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2269 # For subprocess calls, encode with locale encoding
2270 # Refer to http://stackoverflow.com/a/9951851/35070
2271 encoding
= preferredencoding()
2273 encoding
= sys
.getfilesystemencoding()
2274 if encoding
is None:
2279 def encodeFilename(s
, for_subprocess
=False):
2281 @param s The name of the file
2284 assert type(s
) == compat_str
2286 # Python 3 has a Unicode API
2287 if sys
.version_info
>= (3, 0):
2290 # Pass '' directly to use Unicode APIs on Windows 2000 and up
2291 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
2292 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
2293 if not for_subprocess
and sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2296 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
2297 if sys
.platform
.startswith('java'):
2300 return s
.encode(get_subprocess_encoding(), 'ignore')
2303 def decodeFilename(b
, for_subprocess
=False):
2305 if sys
.version_info
>= (3, 0):
2308 if not isinstance(b
, bytes):
2311 return b
.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for subprocess use.

    Legacy callers may still pass byte strings; those are decoded as ASCII
    first, then the text is run through encodeFilename() in subprocess mode.
    """
    # TODO: assert isinstance(s, compat_str) once all post processors are fixed
    arg = s if isinstance(s, compat_str) else s.decode('ascii')
    return encodeFilename(arg, True)
def decodeArgument(b):
    """Decode a subprocess command-line value back into text (see decodeFilename)."""
    return decodeFilename(b, True)
2327 def decodeOption(optval
):
2330 if isinstance(optval
, bytes):
2331 optval
= optval
.decode(preferredencoding())
2333 assert isinstance(optval
, compat_str
)
2337 def formatSeconds(secs
, delim
=':', msec
=False):
2339 ret
= '%d%s%02d%s%02d' % (secs
// 3600, delim
, (secs
% 3600) // 60, delim
, secs
% 60)
2341 ret
= '%d%s%02d' % (secs
// 60, delim
, secs
% 60)
2344 return '%s.%03d' % (ret
, secs
% 1) if msec
else ret
2347 def make_HTTPS_handler(params
, **kwargs
):
2348 opts_no_check_certificate
= params
.get('nocheckcertificate', False)
2349 if hasattr(ssl
, 'create_default_context'): # Python >= 3.4 or 2.7.9
2350 context
= ssl
.create_default_context(ssl
.Purpose
.SERVER_AUTH
)
2351 if opts_no_check_certificate
:
2352 context
.check_hostname
= False
2353 context
.verify_mode
= ssl
.CERT_NONE
2355 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2358 # (create_default_context present but HTTPSHandler has no context=)
2361 if sys
.version_info
< (3, 2):
2362 return YoutubeDLHTTPSHandler(params
, **kwargs
)
2363 else: # Python < 3.4
2364 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLSv1
)
2365 context
.verify_mode
= (ssl
.CERT_NONE
2366 if opts_no_check_certificate
2367 else ssl
.CERT_REQUIRED
)
2368 context
.set_default_verify_paths()
2369 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2372 def bug_reports_message(before
=';'):
2373 if ytdl_is_updateable():
2374 update_cmd
= 'type yt-dlp -U to update'
2376 update_cmd
= 'see https://github.com/yt-dlp/yt-dlp on how to update'
2377 msg
= 'please report this issue on https://github.com/yt-dlp/yt-dlp .'
2378 msg
+= ' Make sure you are using the latest version; %s.' % update_cmd
2379 msg
+= ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'
2381 before
= before
.rstrip()
2382 if not before
or before
.endswith(('.', '!', '?')):
2383 msg
= msg
[0].title() + msg
[1:]
2385 return (before
+ ' ' if before
else '') + msg
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors.

    The error classes defined in this module derive from it, so callers can
    catch the whole family with a single except clause.
    """
# Exception classes that represent network-level failures, collected into a
# tuple so they can be used directly in except clauses and membership tests.
network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
# ssl.CertificateError is not available in every supported Python build
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)
2399 class ExtractorError(YoutubeDLError
):
2400 """Error during info extraction."""
2402 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None):
2403 """ tb, if given, is the original traceback (so that it can be printed out).
2404 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
2407 if sys
.exc_info()[0] in network_exceptions
:
2409 if video_id
is not None:
2410 msg
= video_id
+ ': ' + msg
2412 msg
+= ' (caused by %r)' % cause
2414 msg
+= bug_reports_message()
2415 super(ExtractorError
, self
).__init
__(msg
)
2418 self
.exc_info
= sys
.exc_info() # preserve original exception
2420 self
.video_id
= video_id
2422 def format_traceback(self
):
2423 if self
.traceback
is None:
2425 return ''.join(traceback
.format_tb(self
.traceback
))
2428 class UnsupportedError(ExtractorError
):
2429 def __init__(self
, url
):
2430 super(UnsupportedError
, self
).__init
__(
2431 'Unsupported URL: %s' % url
, expected
=True)
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression expected to match page data did not match."""
2440 class GeoRestrictedError(ExtractorError
):
2441 """Geographic restriction Error exception.
2443 This exception may be thrown when a video is not available from your
2444 geographic location due to geographic restrictions imposed by a website.
2447 def __init__(self
, msg
, countries
=None):
2448 super(GeoRestrictedError
, self
).__init
__(msg
, expected
=True)
2450 self
.countries
= countries
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Preserve the underlying sys.exc_info() tuple, if any, for callers
        # that want to report or re-raise the original error.
        self.exc_info = exc_info
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict.
    """
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
2485 class PostProcessingError(YoutubeDLError
):
2486 """Post Processing exception.
2488 This exception may be raised by PostProcessor's .run() method to
2489 indicate an error in the postprocessing task.
2492 def __init__(self
, msg
):
2493 super(PostProcessingError
, self
).__init
__(msg
)
class ExistingVideoReached(YoutubeDLError):
    # NOTE(review): the original docstring read '--max-downloads limit has
    # been reached', apparently copy-pasted from MaxDownloadsReached; per the
    # class name this exception signals that an already downloaded video was
    # encountered — confirm against the raising call site.
    """ An already downloaded video was reached (--break-on-existing). """
class RejectedVideoReached(YoutubeDLError):
    # NOTE(review): the original docstring read '--max-downloads limit has
    # been reached', apparently copy-pasted from MaxDownloadsReached; per the
    # class name this exception signals that a rejected video was reached
    # (--break-on-reject) — confirm against the raising call site.
    """ A rejected video was reached (--break-on-reject). """
class ThrottledDownload(YoutubeDLError):
    """ Download speed below --throttled-rate. """
    # Raised when the measured transfer rate drops under the user-supplied
    # --throttled-rate threshold.
class MaxDownloadsReached(YoutubeDLError):
    """ --max-downloads limit has been reached. """
    # Raised once the number of completed downloads hits the --max-downloads
    # limit, to stop further processing.
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
2526 class ContentTooShortError(YoutubeDLError
):
2527 """Content Too Short exception.
2529 This exception may be raised by FileDownloader objects when a file they
2530 download is too small for what the server announced first, indicating
2531 the connection was probably interrupted.
2534 def __init__(self
, downloaded
, expected
):
2535 super(ContentTooShortError
, self
).__init
__(
2536 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded
, expected
)
2539 self
.downloaded
= downloaded
2540 self
.expected
= expected
2543 class XAttrMetadataError(YoutubeDLError
):
2544 def __init__(self
, code
=None, msg
='Unknown error'):
2545 super(XAttrMetadataError
, self
).__init
__(msg
)
2549 # Parsing code and msg
2550 if (self
.code
in (errno
.ENOSPC
, errno
.EDQUOT
)
2551 or 'No space left' in self
.msg
or 'Disk quota exceeded' in self
.msg
):
2552 self
.reason
= 'NO_SPACE'
2553 elif self
.code
== errno
.E2BIG
or 'Argument list too long' in self
.msg
:
2554 self
.reason
= 'VALUE_TOO_LONG'
2556 self
.reason
= 'NOT_SUPPORTED'
2559 class XAttrUnavailableError(YoutubeDLError
):
2563 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
2564 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
2565 # expected HTTP responses to meet HTTP/1.0 or later (see also
2566 # https://github.com/ytdl-org/youtube-dl/issues/6727)
2567 if sys
.version_info
< (3, 0):
2568 kwargs
['strict'] = True
2569 hc
= http_class(*args
, **compat_kwargs(kwargs
))
2570 source_address
= ydl_handler
._params
.get('source_address')
2572 if source_address
is not None:
2573 # This is to workaround _create_connection() from socket where it will try all
2574 # address data from getaddrinfo() including IPv6. This filters the result from
2575 # getaddrinfo() based on the source_address value.
2576 # This is based on the cpython socket.create_connection() function.
2577 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
2578 def _create_connection(address
, timeout
=socket
._GLOBAL
_DEFAULT
_TIMEOUT
, source_address
=None):
2579 host
, port
= address
2581 addrs
= socket
.getaddrinfo(host
, port
, 0, socket
.SOCK_STREAM
)
2582 af
= socket
.AF_INET
if '.' in source_address
[0] else socket
.AF_INET6
2583 ip_addrs
= [addr
for addr
in addrs
if addr
[0] == af
]
2584 if addrs
and not ip_addrs
:
2585 ip_version
= 'v4' if af
== socket
.AF_INET
else 'v6'
2587 "No remote IP%s addresses available for connect, can't use '%s' as source address"
2588 % (ip_version
, source_address
[0]))
2589 for res
in ip_addrs
:
2590 af
, socktype
, proto
, canonname
, sa
= res
2593 sock
= socket
.socket(af
, socktype
, proto
)
2594 if timeout
is not socket
._GLOBAL
_DEFAULT
_TIMEOUT
:
2595 sock
.settimeout(timeout
)
2596 sock
.bind(source_address
)
2598 err
= None # Explicitly break reference cycle
2600 except socket
.error
as _
:
2602 if sock
is not None:
2607 raise socket
.error('getaddrinfo returns an empty list')
2608 if hasattr(hc
, '_create_connection'):
2609 hc
._create
_connection
= _create_connection
2610 sa
= (source_address
, 0)
2611 if hasattr(hc
, 'source_address'): # Python 2.7+
2612 hc
.source_address
= sa
2614 def _hc_connect(self
, *args
, **kwargs
):
2615 sock
= _create_connection(
2616 (self
.host
, self
.port
), self
.timeout
, sa
)
2618 self
.sock
= ssl
.wrap_socket(
2619 sock
, self
.key_file
, self
.cert_file
,
2620 ssl_version
=ssl
.PROTOCOL_TLSv1
)
2623 hc
.connect
= functools
.partial(_hc_connect
, hc
)
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, return a copy of the mapping with the marker
    removed along with any 'Accept-Encoding' header (matched
    case-insensitively), so the request is sent without compression
    negotiation.  Otherwise the mapping is returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    result = dict(
        (key, value) for key, value in headers.items()
        if key.lower() != 'accept-encoding')
    del result['Youtubedl-no-compression']
    return result
2638 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
2639 """Handler for HTTP requests and responses.
2641 This class, when installed with an OpenerDirector, automatically adds
2642 the standard headers to every HTTP request and handles gzipped and
2643 deflated responses from web servers. If compression is to be avoided in
2644 a particular request, the original request in the program code only has
2645 to include the HTTP header "Youtubedl-no-compression", which will be
2646 removed before making the real request.
2648 Part of this code was copied from:
2650 http://techknack.net/python-urllib2-handlers/
2652 Andrew Rowls, the author of that code, agreed to release it to the
2656 def __init__(self
, params
, *args
, **kwargs
):
2657 compat_urllib_request
.HTTPHandler
.__init
__(self
, *args
, **kwargs
)
2658 self
._params
= params
2660 def http_open(self
, req
):
2661 conn_class
= compat_http_client
.HTTPConnection
2663 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2665 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2666 del req
.headers
['Ytdl-socks-proxy']
2668 return self
.do_open(functools
.partial(
2669 _create_http_connection
, self
, conn_class
, False),
2677 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
2679 return zlib
.decompress(data
)
2681 def http_request(self
, req
):
2682 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
2683 # always respected by websites, some tend to give out URLs with non percent-encoded
2684 # non-ASCII characters (see telemb.py, ard.py [#3412])
2685 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
2686 # To work around aforementioned issue we will replace request's original URL with
2687 # percent-encoded one
2688 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
2689 # the code of this workaround has been moved here from YoutubeDL.urlopen()
2690 url
= req
.get_full_url()
2691 url_escaped
= escape_url(url
)
2693 # Substitute URL if any change after escaping
2694 if url
!= url_escaped
:
2695 req
= update_Request(req
, url
=url_escaped
)
2697 for h
, v
in std_headers
.items():
2698 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
2699 # The dict keys are capitalized because of this bug by urllib
2700 if h
.capitalize() not in req
.headers
:
2701 req
.add_header(h
, v
)
2703 req
.headers
= handle_youtubedl_headers(req
.headers
)
2705 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
2706 # Python 2.6 is brain-dead when it comes to fragments
2707 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
2708 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
2712 def http_response(self
, req
, resp
):
2715 if resp
.headers
.get('Content-encoding', '') == 'gzip':
2716 content
= resp
.read()
2717 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
2719 uncompressed
= io
.BytesIO(gz
.read())
2720 except IOError as original_ioerror
:
2721 # There may be junk add the end of the file
2722 # See http://stackoverflow.com/q/4928560/35070 for details
2723 for i
in range(1, 1024):
2725 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
2726 uncompressed
= io
.BytesIO(gz
.read())
2731 raise original_ioerror
2732 resp
= compat_urllib_request
.addinfourl(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2733 resp
.msg
= old_resp
.msg
2734 del resp
.headers
['Content-encoding']
2736 if resp
.headers
.get('Content-encoding', '') == 'deflate':
2737 gz
= io
.BytesIO(self
.deflate(resp
.read()))
2738 resp
= compat_urllib_request
.addinfourl(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2739 resp
.msg
= old_resp
.msg
2740 del resp
.headers
['Content-encoding']
2741 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
2742 # https://github.com/ytdl-org/youtube-dl/issues/6457).
2743 if 300 <= resp
.code
< 400:
2744 location
= resp
.headers
.get('Location')
2746 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
2747 if sys
.version_info
>= (3, 0):
2748 location
= location
.encode('iso-8859-1').decode('utf-8')
2750 location
= location
.decode('utf-8')
2751 location_escaped
= escape_url(location
)
2752 if location
!= location_escaped
:
2753 del resp
.headers
['Location']
2754 if sys
.version_info
< (3, 0):
2755 location_escaped
= location_escaped
.encode('utf-8')
2756 resp
.headers
['Location'] = location_escaped
2759 https_request
= http_request
2760 https_response
= http_response
2763 def make_socks_conn_class(base_class
, socks_proxy
):
2764 assert issubclass(base_class
, (
2765 compat_http_client
.HTTPConnection
, compat_http_client
.HTTPSConnection
))
2767 url_components
= compat_urlparse
.urlparse(socks_proxy
)
2768 if url_components
.scheme
.lower() == 'socks5':
2769 socks_type
= ProxyType
.SOCKS5
2770 elif url_components
.scheme
.lower() in ('socks', 'socks4'):
2771 socks_type
= ProxyType
.SOCKS4
2772 elif url_components
.scheme
.lower() == 'socks4a':
2773 socks_type
= ProxyType
.SOCKS4A
2775 def unquote_if_non_empty(s
):
2778 return compat_urllib_parse_unquote_plus(s
)
2782 url_components
.hostname
, url_components
.port
or 1080,
2784 unquote_if_non_empty(url_components
.username
),
2785 unquote_if_non_empty(url_components
.password
),
2788 class SocksConnection(base_class
):
2790 self
.sock
= sockssocket()
2791 self
.sock
.setproxy(*proxy_args
)
2792 if type(self
.timeout
) in (int, float):
2793 self
.sock
.settimeout(self
.timeout
)
2794 self
.sock
.connect((self
.host
, self
.port
))
2796 if isinstance(self
, compat_http_client
.HTTPSConnection
):
2797 if hasattr(self
, '_context'): # Python > 2.6
2798 self
.sock
= self
._context
.wrap_socket(
2799 self
.sock
, server_hostname
=self
.host
)
2801 self
.sock
= ssl
.wrap_socket(self
.sock
)
2803 return SocksConnection
2806 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
2807 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
2808 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
2809 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
2810 self
._params
= params
2812 def https_open(self
, req
):
2814 conn_class
= self
._https
_conn
_class
2816 if hasattr(self
, '_context'): # python > 2.6
2817 kwargs
['context'] = self
._context
2818 if hasattr(self
, '_check_hostname'): # python 3.x
2819 kwargs
['check_hostname'] = self
._check
_hostname
2821 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2823 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2824 del req
.headers
['Ytdl-socks-proxy']
2826 return self
.do_open(functools
.partial(
2827 _create_http_connection
, self
, conn_class
, True),
2831 class YoutubeDLCookieJar(compat_cookiejar
.MozillaCookieJar
):
2833 See [1] for cookie file format.
2835 1. https://curl.haxx.se/docs/http-cookies.html
2837 _HTTPONLY_PREFIX
= '#HttpOnly_'
2839 _HEADER
= '''# Netscape HTTP Cookie File
2840 # This file is generated by yt-dlp. Do not edit.
2843 _CookieFileEntry
= collections
.namedtuple(
2845 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
2847 def save(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2849 Save cookies to a file.
2851 Most of the code is taken from CPython 3.8 and slightly adapted
2852 to support cookie files with UTF-8 in both python 2 and 3.
2854 if filename
is None:
2855 if self
.filename
is not None:
2856 filename
= self
.filename
2858 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2860 # Store session cookies with `expires` set to 0 instead of an empty
2863 if cookie
.expires
is None:
2866 with io
.open(filename
, 'w', encoding
='utf-8') as f
:
2867 f
.write(self
._HEADER
)
2870 if not ignore_discard
and cookie
.discard
:
2872 if not ignore_expires
and cookie
.is_expired(now
):
2878 if cookie
.domain
.startswith('.'):
2879 initial_dot
= 'TRUE'
2881 initial_dot
= 'FALSE'
2882 if cookie
.expires
is not None:
2883 expires
= compat_str(cookie
.expires
)
2886 if cookie
.value
is None:
2887 # cookies.txt regards 'Set-Cookie: foo' as a cookie
2888 # with no name, whereas http.cookiejar regards it as a
2889 # cookie with no value.
2894 value
= cookie
.value
2896 '\t'.join([cookie
.domain
, initial_dot
, cookie
.path
,
2897 secure
, expires
, name
, value
]) + '\n')
2899 def load(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2900 """Load cookies from a file."""
2901 if filename
is None:
2902 if self
.filename
is not None:
2903 filename
= self
.filename
2905 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2907 def prepare_line(line
):
2908 if line
.startswith(self
._HTTPONLY
_PREFIX
):
2909 line
= line
[len(self
._HTTPONLY
_PREFIX
):]
2910 # comments and empty lines are fine
2911 if line
.startswith('#') or not line
.strip():
2913 cookie_list
= line
.split('\t')
2914 if len(cookie_list
) != self
._ENTRY
_LEN
:
2915 raise compat_cookiejar
.LoadError('invalid length %d' % len(cookie_list
))
2916 cookie
= self
._CookieFileEntry
(*cookie_list
)
2917 if cookie
.expires_at
and not cookie
.expires_at
.isdigit():
2918 raise compat_cookiejar
.LoadError('invalid expires at %s' % cookie
.expires_at
)
2922 with io
.open(filename
, encoding
='utf-8') as f
:
2925 cf
.write(prepare_line(line
))
2926 except compat_cookiejar
.LoadError
as e
:
2928 'WARNING: skipping cookie file entry due to %s: %r\n'
2929 % (e
, line
), sys
.stderr
)
2932 self
._really
_load
(cf
, filename
, ignore_discard
, ignore_expires
)
2933 # Session cookies are denoted by either `expires` field set to
2934 # an empty string or 0. MozillaCookieJar only recognizes the former
2935 # (see [1]). So we need force the latter to be recognized as session
2936 # cookies on our own.
2937 # Session cookies may be important for cookies-based authentication,
2938 # e.g. usually, when user does not check 'Remember me' check box while
2939 # logging in on a site, some important cookies are stored as session
2940 # cookies so that not recognizing them will result in failed login.
2941 # 1. https://bugs.python.org/issue17164
2943 # Treat `expires=0` cookies as session cookies
2944 if cookie
.expires
== 0:
2945 cookie
.expires
= None
2946 cookie
.discard
= True
2949 class YoutubeDLCookieProcessor(compat_urllib_request
.HTTPCookieProcessor
):
2950 def __init__(self
, cookiejar
=None):
2951 compat_urllib_request
.HTTPCookieProcessor
.__init
__(self
, cookiejar
)
2953 def http_response(self
, request
, response
):
2954 # Python 2 will choke on next HTTP request in row if there are non-ASCII
2955 # characters in Set-Cookie HTTP header of last response (see
2956 # https://github.com/ytdl-org/youtube-dl/issues/6769).
2957 # In order to at least prevent crashing we will percent encode Set-Cookie
2958 # header before HTTPCookieProcessor starts processing it.
2959 # if sys.version_info < (3, 0) and response.headers:
2960 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
2961 # set_cookie = response.headers.get(set_cookie_header)
2963 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
2964 # if set_cookie != set_cookie_escaped:
2965 # del response.headers[set_cookie_header]
2966 # response.headers[set_cookie_header] = set_cookie_escaped
2967 return compat_urllib_request
.HTTPCookieProcessor
.http_response(self
, request
, response
)
2969 https_request
= compat_urllib_request
.HTTPCookieProcessor
.http_request
2970 https_response
= http_response
2973 class YoutubeDLRedirectHandler(compat_urllib_request
.HTTPRedirectHandler
):
2974 """YoutubeDL redirect handler
2976 The code is based on HTTPRedirectHandler implementation from CPython [1].
2978 This redirect handler solves two issues:
2979 - ensures redirect URL is always unicode under python 2
2980 - introduces support for experimental HTTP response status code
2981 308 Permanent Redirect [2] used by some sites [3]
2983 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
2984 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
2985 3. https://github.com/ytdl-org/youtube-dl/issues/28768
2988 http_error_301
= http_error_303
= http_error_307
= http_error_308
= compat_urllib_request
.HTTPRedirectHandler
.http_error_302
2990 def redirect_request(self
, req
, fp
, code
, msg
, headers
, newurl
):
2991 """Return a Request or None in response to a redirect.
2993 This is called by the http_error_30x methods when a
2994 redirection response is received. If a redirection should
2995 take place, return a new Request to allow http_error_30x to
2996 perform the redirect. Otherwise, raise HTTPError if no-one
2997 else should try to handle this url. Return None if you can't
2998 but another Handler might.
3000 m
= req
.get_method()
3001 if (not (code
in (301, 302, 303, 307, 308) and m
in ("GET", "HEAD")
3002 or code
in (301, 302, 303) and m
== "POST")):
3003 raise compat_HTTPError(req
.full_url
, code
, msg
, headers
, fp
)
3004 # Strictly (according to RFC 2616), 301 or 302 in response to
3005 # a POST MUST NOT cause a redirection without confirmation
3006 # from the user (of urllib.request, in this case). In practice,
3007 # essentially all clients do redirect in this case, so we do
3010 # On python 2 urlh.geturl() may sometimes return redirect URL
3011 # as byte string instead of unicode. This workaround allows
3012 # to force it always return unicode.
3013 if sys
.version_info
[0] < 3:
3014 newurl
= compat_str(newurl
)
3016 # Be conciliant with URIs containing a space. This is mainly
3017 # redundant with the more complete encoding done in http_error_302(),
3018 # but it is kept for compatibility with other callers.
3019 newurl
= newurl
.replace(' ', '%20')
3021 CONTENT_HEADERS
= ("content-length", "content-type")
3022 # NB: don't use dict comprehension for python 2.6 compatibility
3023 newheaders
= dict((k
, v
) for k
, v
in req
.headers
.items()
3024 if k
.lower() not in CONTENT_HEADERS
)
3025 return compat_urllib_request
.Request(
3026 newurl
, headers
=newheaders
, origin_req_host
=req
.origin_req_host
,
3030 def extract_timezone(date_str
):
3032 r
'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
3035 timezone
= datetime
.timedelta()
3037 date_str
= date_str
[:-len(m
.group('tz'))]
3038 if not m
.group('sign'):
3039 timezone
= datetime
.timedelta()
3041 sign
= 1 if m
.group('sign') == '+' else -1
3042 timezone
= datetime
.timedelta(
3043 hours
=sign
* int(m
.group('hours')),
3044 minutes
=sign
* int(m
.group('minutes')))
3045 return timezone
, date_str
3048 def parse_iso8601(date_str
, delimiter
='T', timezone
=None):
3049 """ Return a UNIX timestamp from the given date """
3051 if date_str
is None:
3054 date_str
= re
.sub(r
'\.[0-9]+', '', date_str
)
3056 if timezone
is None:
3057 timezone
, date_str
= extract_timezone(date_str
)
3060 date_format
= '%Y-%m-%d{0}%H:%M:%S'.format(delimiter
)
3061 dt
= datetime
.datetime
.strptime(date_str
, date_format
) - timezone
3062 return calendar
.timegm(dt
.timetuple())
def date_formats(day_first=True):
    """Return the list of candidate date format strings.

    day_first selects between the day-first (e.g. European) and the
    month-first (e.g. US) format tables.
    """
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
3071 def unified_strdate(date_str
, day_first
=True):
3072 """Return a string with the date in the format YYYYMMDD"""
3074 if date_str
is None:
3078 date_str
= date_str
.replace(',', ' ')
3079 # Remove AM/PM + timezone
3080 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
3081 _
, date_str
= extract_timezone(date_str
)
3083 for expression
in date_formats(day_first
):
3085 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
3088 if upload_date
is None:
3089 timetuple
= email
.utils
.parsedate_tz(date_str
)
3092 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
3095 if upload_date
is not None:
3096 return compat_str(upload_date
)
3099 def unified_timestamp(date_str
, day_first
=True):
3100 if date_str
is None:
3103 date_str
= re
.sub(r
'[,|]', '', date_str
)
3105 pm_delta
= 12 if re
.search(r
'(?i)PM', date_str
) else 0
3106 timezone
, date_str
= extract_timezone(date_str
)
3108 # Remove AM/PM + timezone
3109 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
3111 # Remove unrecognized timezones from ISO 8601 alike timestamps
3112 m
= re
.search(r
'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str
)
3114 date_str
= date_str
[:-len(m
.group('tz'))]
3116 # Python only supports microseconds, so remove nanoseconds
3117 m
= re
.search(r
'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str
)
3119 date_str
= m
.group(1)
3121 for expression
in date_formats(day_first
):
3123 dt
= datetime
.datetime
.strptime(date_str
, expression
) - timezone
+ datetime
.timedelta(hours
=pm_delta
)
3124 return calendar
.timegm(dt
.timetuple())
3127 timetuple
= email
.utils
.parsedate_tz(date_str
)
3129 return calendar
.timegm(timetuple
) + pm_delta
* 3600
3132 def determine_ext(url
, default_ext
='unknown_video'):
3133 if url
is None or '.' not in url
:
3135 guess
= url
.partition('?')[0].rpartition('.')[2]
3136 if re
.match(r
'^[A-Za-z0-9]+$', guess
):
3138 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
3139 elif guess
.rstrip('/') in KNOWN_EXTENSIONS
:
3140 return guess
.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build a subtitle filename by replacing the video extension with
    '<sub_lang>.<sub_format>' (e.g. 'clip.mp4' -> 'clip.en.vtt')."""
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
3149 def datetime_from_str(date_str
, precision
='auto', format
='%Y%m%d'):
3151 Return a datetime object from a string in the format YYYYMMDD or
3152 (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
3154 format: string date format used to return datetime object from
3155 precision: round the time portion of a datetime object.
3156 auto|microsecond|second|minute|hour|day.
3157 auto: round to the unit provided in date_str (if applicable).
3159 auto_precision
= False
3160 if precision
== 'auto':
3161 auto_precision
= True
3162 precision
= 'microsecond'
3163 today
= datetime_round(datetime
.datetime
.now(), precision
)
3164 if date_str
in ('now', 'today'):
3166 if date_str
== 'yesterday':
3167 return today
- datetime
.timedelta(days
=1)
3169 r
'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
3171 if match
is not None:
3172 start_time
= datetime_from_str(match
.group('start'), precision
, format
)
3173 time
= int(match
.group('time')) * (-1 if match
.group('sign') == '-' else 1)
3174 unit
= match
.group('unit')
3175 if unit
== 'month' or unit
== 'year':
3176 new_date
= datetime_add_months(start_time
, time
* 12 if unit
== 'year' else time
)
3182 delta
= datetime
.timedelta(**{unit + 's': time}
)
3183 new_date
= start_time
+ delta
3185 return datetime_round(new_date
, unit
)
3188 return datetime_round(datetime
.datetime
.strptime(date_str
, format
), precision
)
def date_from_str(date_str, format='%Y%m%d'):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: string date format used to return datetime object from
    """
    # Delegate to datetime_from_str at full precision, then drop the time part.
    dt = datetime_from_str(date_str, precision='microsecond', format=format)
    return dt.date()
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months.

    The day is clamped to the last valid day of the target month
    (e.g. Jan 31 + 1 month -> Feb 28/29).
    """
    # Work with a zero-based month so year carry/borrow is plain floor division.
    zero_based_month = dt.month + months - 1
    new_year = dt.year + zero_based_month // 12
    new_month = zero_based_month % 12 + 1
    last_day_of_month = calendar.monthrange(new_year, new_month)[1]
    new_day = min(dt.day, last_day_of_month)
    return dt.replace(new_year, new_month, new_day)
3210 def datetime_round(dt
, precision
='day'):
3212 Round a datetime object's time to a specific precision
3214 if precision
== 'microsecond':
3223 roundto
= lambda x
, n
: ((x
+ n
/ 2) // n
) * n
3224 timestamp
= calendar
.timegm(dt
.timetuple())
3225 return datetime
.datetime
.utcfromtimestamp(roundto(timestamp
, unit_seconds
[precision
]))
3228 def hyphenate_date(date_str
):
3230 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
3231 match
= re
.match(r
'^(\d\d\d\d)(\d\d)(\d\d)$', date_str
)
3232 if match
is not None:
3233 return '-'.join(match
.groups())
3238 class DateRange(object):
3239 """Represents a time interval between two dates"""
3241 def __init__(self
, start
=None, end
=None):
3242 """start and end must be strings in the format accepted by date"""
3243 if start
is not None:
3244 self
.start
= date_from_str(start
)
3246 self
.start
= datetime
.datetime
.min.date()
3248 self
.end
= date_from_str(end
)
3250 self
.end
= datetime
.datetime
.max.date()
3251 if self
.start
> self
.end
:
3252 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
3256 """Returns a range that only contains the given day"""
3257 return cls(day
, day
)
3259 def __contains__(self
, date
):
3260 """Check if the date is in the range"""
3261 if not isinstance(date
, datetime
.date
):
3262 date
= date_from_str(date
)
3263 return self
.start
<= date
<= self
.end
3266 return '%s - %s' % (self
.start
.isoformat(), self
.end
.isoformat())
3269 def platform_name():
3270 """ Returns the platform name as a compat_str """
3271 res
= platform
.platform()
3272 if isinstance(res
, bytes):
3273 res
= res
.decode(preferredencoding())
3275 assert isinstance(res
, compat_str
)
3279 def _windows_write_string(s
, out
):
3280 """ Returns True if the string was written using special methods,
3281 False if it has yet to be written out."""
3282 # Adapted from http://stackoverflow.com/a/3259271/35070
3285 import ctypes
.wintypes
3293 fileno
= out
.fileno()
3294 except AttributeError:
3295 # If the output stream doesn't have a fileno, it's virtual
3297 except io
.UnsupportedOperation
:
3298 # Some strange Windows pseudo files?
3300 if fileno
not in WIN_OUTPUT_IDS
:
3303 GetStdHandle
= compat_ctypes_WINFUNCTYPE(
3304 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
3305 ('GetStdHandle', ctypes
.windll
.kernel32
))
3306 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
3308 WriteConsoleW
= compat_ctypes_WINFUNCTYPE(
3309 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
3310 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
3311 ctypes
.wintypes
.LPVOID
)(('WriteConsoleW', ctypes
.windll
.kernel32
))
3312 written
= ctypes
.wintypes
.DWORD(0)
3314 GetFileType
= compat_ctypes_WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)(('GetFileType', ctypes
.windll
.kernel32
))
3315 FILE_TYPE_CHAR
= 0x0002
3316 FILE_TYPE_REMOTE
= 0x8000
3317 GetConsoleMode
= compat_ctypes_WINFUNCTYPE(
3318 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
3319 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
3320 ('GetConsoleMode', ctypes
.windll
.kernel32
))
3321 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
3323 def not_a_console(handle
):
3324 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
3326 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
3327 or GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
3329 if not_a_console(h
):
3332 def next_nonbmp_pos(s
):
3334 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
3335 except StopIteration:
3339 count
= min(next_nonbmp_pos(s
), 1024)
3341 ret
= WriteConsoleW(
3342 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
3344 raise OSError('Failed to write string')
3345 if not count
: # We just wrote a non-BMP character
3346 assert written
.value
== 2
3349 assert written
.value
> 0
3350 s
= s
[written
.value
:]
3354 def write_string(s
, out
=None, encoding
=None):
3357 assert type(s
) == compat_str
3359 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
3360 if _windows_write_string(s
, out
):
3363 if ('b' in getattr(out
, 'mode', '')
3364 or sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
3365 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
3367 elif hasattr(out
, 'buffer'):
3368 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
3369 byt
= s
.encode(enc
, 'ignore')
3370 out
.buffer.write(byt
)
3376 def bytes_to_intlist(bs
):
3379 if isinstance(bs
[0], int): # Python 3
3382 return [ord(c
) for c
in bs
]
3385 def intlist_to_bytes(xs
):
3388 return compat_struct_pack('%dB' % len(xs
), *xs
)
3391 # Cross-platform file locking
3392 if sys
.platform
== 'win32':
3393 import ctypes
.wintypes
3396 class OVERLAPPED(ctypes
.Structure
):
3398 ('Internal', ctypes
.wintypes
.LPVOID
),
3399 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
3400 ('Offset', ctypes
.wintypes
.DWORD
),
3401 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
3402 ('hEvent', ctypes
.wintypes
.HANDLE
),
3405 kernel32
= ctypes
.windll
.kernel32
3406 LockFileEx
= kernel32
.LockFileEx
3407 LockFileEx
.argtypes
= [
3408 ctypes
.wintypes
.HANDLE
, # hFile
3409 ctypes
.wintypes
.DWORD
, # dwFlags
3410 ctypes
.wintypes
.DWORD
, # dwReserved
3411 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3412 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3413 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3415 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
3416 UnlockFileEx
= kernel32
.UnlockFileEx
3417 UnlockFileEx
.argtypes
= [
3418 ctypes
.wintypes
.HANDLE
, # hFile
3419 ctypes
.wintypes
.DWORD
, # dwReserved
3420 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3421 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3422 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3424 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
3425 whole_low
= 0xffffffff
3426 whole_high
= 0x7fffffff
3428 def _lock_file(f
, exclusive
):
3429 overlapped
= OVERLAPPED()
3430 overlapped
.Offset
= 0
3431 overlapped
.OffsetHigh
= 0
3432 overlapped
.hEvent
= 0
3433 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
3434 handle
= msvcrt
.get_osfhandle(f
.fileno())
3435 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
3436 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3437 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
3439 def _unlock_file(f
):
3440 assert f
._lock
_file
_overlapped
_p
3441 handle
= msvcrt
.get_osfhandle(f
.fileno())
3442 if not UnlockFileEx(handle
, 0,
3443 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3444 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
3447 # Some platforms, such as Jython, is missing fcntl
3451 def _lock_file(f
, exclusive
):
3452 fcntl
.flock(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
3454 def _unlock_file(f
):
3455 fcntl
.flock(f
, fcntl
.LOCK_UN
)
3457 UNSUPPORTED_MSG
= 'file locking is not supported on this platform'
3459 def _lock_file(f
, exclusive
):
3460 raise IOError(UNSUPPORTED_MSG
)
3462 def _unlock_file(f
):
3463 raise IOError(UNSUPPORTED_MSG
)
3466 class locked_file(object):
3467 def __init__(self
, filename
, mode
, encoding
=None):
3468 assert mode
in ['r', 'a', 'w']
3469 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
3472 def __enter__(self
):
3473 exclusive
= self
.mode
!= 'r'
3475 _lock_file(self
.f
, exclusive
)
3481 def __exit__(self
, etype
, value
, traceback
):
3483 _unlock_file(self
.f
)
3490 def write(self
, *args
):
3491 return self
.f
.write(*args
)
3493 def read(self
, *args
):
3494 return self
.f
.read(*args
)
def get_filesystem_encoding():
    """Return the filesystem encoding name, falling back to 'utf-8'.

    sys.getfilesystemencoding() may return None on some old Python 2
    setups; never propagate that to callers.
    """
    encoding = sys.getfilesystemencoding()
    if encoding is not None:
        return encoding
    return 'utf-8'
3502 def shell_quote(args
):
3504 encoding
= get_filesystem_encoding()
3506 if isinstance(a
, bytes):
3507 # We may get a filename encoded with 'encodeFilename'
3508 a
= a
.decode(encoding
)
3509 quoted_args
.append(compat_shlex_quote(a
))
3510 return ' '.join(quoted_args
)
3513 def smuggle_url(url
, data
):
3514 """ Pass additional data in a URL for internal use. """
3516 url
, idata
= unsmuggle_url(url
, {})
3518 sdata
= compat_urllib_parse_urlencode(
3519 {'__youtubedl_smuggle': json.dumps(data)}
)
3520 return url
+ '#' + sdata
3523 def unsmuggle_url(smug_url
, default
=None):
3524 if '#__youtubedl_smuggle' not in smug_url
:
3525 return smug_url
, default
3526 url
, _
, sdata
= smug_url
.rpartition('#')
3527 jsond
= compat_parse_qs(sdata
)['__youtubedl_smuggle'][0]
3528 data
= json
.loads(jsond
)
3532 def format_bytes(bytes):
3535 if type(bytes) is str:
3536 bytes = float(bytes)
3540 exponent
= int(math
.log(bytes, 1024.0))
3541 suffix
= ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent
]
3542 converted
= float(bytes) / float(1024 ** exponent
)
3543 return '%.2f%s' % (converted
, suffix
)
3546 def lookup_unit_table(unit_table
, s
):
3547 units_re
= '|'.join(re
.escape(u
) for u
in unit_table
)
3549 r
'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re
, s
)
3552 num_str
= m
.group('num').replace(',', '.')
3553 mult
= unit_table
[m
.group('unit')]
3554 return int(float(num_str
) * mult
)
3557 def parse_filesize(s
):
3561 # The lower-case forms are of course incorrect and unofficial,
3562 # but we support those too
3579 'megabytes': 1000 ** 2,
3580 'mebibytes': 1024 ** 2,
3586 'gigabytes': 1000 ** 3,
3587 'gibibytes': 1024 ** 3,
3593 'terabytes': 1000 ** 4,
3594 'tebibytes': 1024 ** 4,
3600 'petabytes': 1000 ** 5,
3601 'pebibytes': 1024 ** 5,
3607 'exabytes': 1000 ** 6,
3608 'exbibytes': 1024 ** 6,
3614 'zettabytes': 1000 ** 7,
3615 'zebibytes': 1024 ** 7,
3621 'yottabytes': 1000 ** 8,
3622 'yobibytes': 1024 ** 8,
3625 return lookup_unit_table(_UNIT_TABLE
, s
)
3634 if re
.match(r
'^[\d,.]+$', s
):
3635 return str_to_int(s
)
3646 return lookup_unit_table(_UNIT_TABLE
, s
)
3649 def parse_resolution(s
):
3653 mobj
= re
.search(r
'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s
)
3656 'width': int(mobj
.group('w')),
3657 'height': int(mobj
.group('h')),
3660 mobj
= re
.search(r
'\b(\d+)[pPiI]\b', s
)
3662 return {'height': int(mobj.group(1))}
3664 mobj
= re
.search(r
'\b([48])[kK]\b', s
)
3666 return {'height': int(mobj.group(1)) * 540}
3671 def parse_bitrate(s
):
3672 if not isinstance(s
, compat_str
):
3674 mobj
= re
.search(r
'\b(\d+)\s*kbps', s
)
3676 return int(mobj
.group(1))
3679 def month_by_name(name
, lang
='en'):
3680 """ Return the number of a month by (locale-independently) English name """
3682 month_names
= MONTH_NAMES
.get(lang
, MONTH_NAMES
['en'])
3685 return month_names
.index(name
) + 1
3690 def month_by_abbreviation(abbrev
):
3691 """ Return the number of a month by (locale-independently) English
3695 return [s
[:3] for s
in ENGLISH_MONTH_NAMES
].index(abbrev
) + 1
3700 def fix_xml_ampersands(xml_str
):
3701 """Replace all the '&' by '&' in XML"""
3703 r
'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
3708 def setproctitle(title
):
3709 assert isinstance(title
, compat_str
)
3711 # ctypes in Jython is not complete
3712 # http://bugs.jython.org/issue2148
3713 if sys
.platform
.startswith('java'):
3717 libc
= ctypes
.cdll
.LoadLibrary('libc.so.6')
3721 # LoadLibrary in Windows Python 2.7.13 only expects
3722 # a bytestring, but since unicode_literals turns
3723 # every string into a unicode string, it fails.
3725 title_bytes
= title
.encode('utf-8')
3726 buf
= ctypes
.create_string_buffer(len(title_bytes
))
3727 buf
.value
= title_bytes
3729 libc
.prctl(15, buf
, 0, 0, 0)
3730 except AttributeError:
3731 return # Strange libc, just skip this
def remove_start(s, start):
    """Return s with the prefix `start` removed, if present.

    None-safe: returns s unchanged when s is None or does not start
    with `start`.
    """
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return s with the suffix `end` removed, if present.

    None-safe: returns s unchanged when s is None or does not end with
    `end`.
    """
    # Guard on a truthy `end`: the original `s[:-len(end)]` slice with an
    # empty suffix evaluates to s[:0] and wrongly returned '' for any s.
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
3742 def remove_quotes(s
):
3743 if s
is None or len(s
) < 2:
3745 for quote
in ('"', "'", ):
3746 if s
[0] == quote
and s
[-1] == quote
:
def get_domain(url):
    """Extract the bare domain from a URL, or None when no domain-looking
    part is found.

    Strips an optional http(s) scheme and a leading 'www.'.
    """
    m = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if m:
        return m.group('domain')
    return None
def url_basename(url):
    """Return the last path component of a URL
    (e.g. 'c.mp4' for 'http://a/b/c.mp4?x=1')."""
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip('/').split('/')
    return components[-1]
3762 return re
.match(r
'https?://[^?#&]+/', url
).group()
3765 def urljoin(base
, path
):
3766 if isinstance(path
, bytes):
3767 path
= path
.decode('utf-8')
3768 if not isinstance(path
, compat_str
) or not path
:
3770 if re
.match(r
'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path
):
3772 if isinstance(base
, bytes):
3773 base
= base
.decode('utf-8')
3774 if not isinstance(base
, compat_str
) or not re
.match(
3775 r
'^(?:https?:)?//', base
):
3777 return compat_urlparse
.urljoin(base
, path
)
3780 class HEADRequest(compat_urllib_request
.Request
):
3781 def get_method(self
):
3785 class PUTRequest(compat_urllib_request
.Request
):
3786 def get_method(self
):
3790 def int_or_none(v
, scale
=1, default
=None, get_attr
=None, invscale
=1):
3793 v
= getattr(v
, get_attr
, None)
3799 return int(v
) * invscale
// scale
3800 except (ValueError, TypeError):
def str_or_none(v, default=None):
    """Coerce v to compat_str, returning `default` when v is None."""
    if v is None:
        return default
    return compat_str(v)
3808 def str_to_int(int_str
):
3809 """ A more relaxed version of int_or_none """
3810 if isinstance(int_str
, compat_integer_types
):
3812 elif isinstance(int_str
, compat_str
):
3813 int_str
= re
.sub(r
'[,\.\+]', '', int_str
)
3814 return int_or_none(int_str
)
3817 def float_or_none(v
, scale
=1, invscale
=1, default
=None):
3821 return float(v
) * invscale
/ scale
3822 except (ValueError, TypeError):
def bool_or_none(v, default=None):
    """Return v only when it is a genuine bool; otherwise `default`.

    Note: non-bool truthy values (e.g. 1, 'yes') are NOT coerced.
    """
    if isinstance(v, bool):
        return v
    return default
def strip_or_none(v, default=None):
    """Return v.strip() when v is a string; otherwise `default`."""
    if isinstance(v, compat_str):
        return v.strip()
    return default
3834 def url_or_none(url
):
3835 if not url
or not isinstance(url
, compat_str
):
3838 return url
if re
.match(r
'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url
) else None
3841 def strftime_or_none(timestamp
, date_format
, default
=None):
3842 datetime_object
= None
3844 if isinstance(timestamp
, compat_numeric_types
): # unix timestamp
3845 datetime_object
= datetime
.datetime
.utcfromtimestamp(timestamp
)
3846 elif isinstance(timestamp
, compat_str
): # assume YYYYMMDD
3847 datetime_object
= datetime
.datetime
.strptime(timestamp
, '%Y%m%d')
3848 return datetime_object
.strftime(date_format
)
3849 except (ValueError, TypeError, AttributeError):
3853 def parse_duration(s
):
3854 if not isinstance(s
, compat_basestring
):
3859 days
, hours
, mins
, secs
, ms
= [None] * 5
3860 m
= re
.match(r
'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s
)
3862 days
, hours
, mins
, secs
, ms
= m
.groups()
3867 [0-9]+\s*y(?:ears?)?\s*
3870 [0-9]+\s*m(?:onths?)?\s*
3873 [0-9]+\s*w(?:eeks?)?\s*
3876 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
3880 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
3883 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
3886 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
3889 days
, hours
, mins
, secs
, ms
= m
.groups()
3891 m
= re
.match(r
'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s
)
3893 hours
, mins
= m
.groups()
3899 duration
+= float(secs
)
3901 duration
+= float(mins
) * 60
3903 duration
+= float(hours
) * 60 * 60
3905 duration
+= float(days
) * 24 * 60 * 60
3907 duration
+= float(ms
)
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` in front of filename's real extension
    ('clip.mp4' + 'temp' -> 'clip.temp.mp4').

    When expected_real_ext is given and does not match the actual
    extension, `ext` is appended after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(name, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace filename's extension with `ext`
    ('clip.mp4' + 'srt' -> 'clip.srt').

    When expected_real_ext is given and does not match the actual
    extension, `ext` is appended after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return '{0}.{1}'.format(name, ext)
3926 def check_executable(exe
, args
=[]):
3927 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
3928 args can be a list of arguments for a short output (like -version) """
3930 process_communicate_or_kill(subprocess
.Popen(
3931 [exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
))
3937 def get_exe_version(exe
, args
=['--version'],
3938 version_re
=None, unrecognized
='present'):
3939 """ Returns the version of the specified executable,
3940 or False if the executable is not present """
3942 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
3943 # SIGTTOU if yt-dlp is run in the background.
3944 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
3945 out
, _
= process_communicate_or_kill(subprocess
.Popen(
3946 [encodeArgument(exe
)] + args
,
3947 stdin
=subprocess
.PIPE
,
3948 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
))
3951 if isinstance(out
, bytes): # Python 2.x
3952 out
= out
.decode('ascii', 'ignore')
3953 return detect_exe_version(out
, version_re
, unrecognized
)
3956 def detect_exe_version(output
, version_re
=None, unrecognized
='present'):
3957 assert isinstance(output
, compat_str
)
3958 if version_re
is None:
3959 version_re
= r
'version\s+([-0-9._a-zA-Z]+)'
3960 m
= re
.search(version_re
, output
)
3967 class LazyList(collections
.abc
.Sequence
):
3968 ''' Lazy immutable list from an iterable
3969 Note that slices of a LazyList are lists and not LazyList'''
3971 def __init__(self
, iterable
):
3972 self
.__iterable
= iter(iterable
)
3974 self
.__reversed
= False
3978 # We need to consume the entire iterable to iterate in reverse
3979 yield from self
.exhaust()
3981 yield from self
.__cache
3982 for item
in self
.__iterable
:
3983 self
.__cache
.append(item
)
3986 def __exhaust(self
):
3987 self
.__cache
.extend(self
.__iterable
)
3991 ''' Evaluate the entire iterable '''
3992 return self
.__exhaust
()[::-1 if self
.__reversed
else 1]
3995 def __reverse_index(x
):
3996 return None if x
is None else -(x
+ 1)
3998 def __getitem__(self
, idx
):
3999 if isinstance(idx
, slice):
4001 idx
= slice(self
.__reverse
_index
(idx
.start
), self
.__reverse
_index
(idx
.stop
), -(idx
.step
or 1))
4002 start
, stop
, step
= idx
.start
, idx
.stop
, idx
.step
or 1
4003 elif isinstance(idx
, int):
4005 idx
= self
.__reverse
_index
(idx
)
4006 start
, stop
, step
= idx
, idx
, 0
4008 raise TypeError('indices must be integers or slices')
4009 if ((start
or 0) < 0 or (stop
or 0) < 0
4010 or (start
is None and step
< 0)
4011 or (stop
is None and step
> 0)):
4012 # We need to consume the entire iterable to be able to slice from the end
4013 # Obviously, never use this with infinite iterables
4014 return self
.__exhaust
()[idx
]
4016 n
= max(start
or 0, stop
or 0) - len(self
.__cache
) + 1
4018 self
.__cache
.extend(itertools
.islice(self
.__iterable
, n
))
4019 return self
.__cache
[idx
]
4023 self
[-1] if self
.__reversed
else self
[0]
4030 return len(self
.__cache
)
4033 self
.__reversed
= not self
.__reversed
4037 # repr and str should mimic a list. So we exhaust the iterable
4038 return repr(self
.exhaust())
4041 return repr(self
.exhaust())
4044 class PagedList(object):
4046 # This is only useful for tests
4047 return len(self
.getslice())
4049 def getslice(self
, start
, end
):
4050 raise NotImplementedError('This method must be implemented by subclasses')
4052 def __getitem__(self
, idx
):
4053 if not isinstance(idx
, int) or idx
< 0:
4054 raise TypeError('indices must be non-negative integers')
4055 entries
= self
.getslice(idx
, idx
+ 1)
4056 return entries
[0] if entries
else None
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages on demand via pagefunc(pagenum)."""

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        self._cache = {}

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """PagedList where the total number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page needs its head trimmed
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escape sequences embedded in s; other text is untouched."""
    decode_unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode_unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode \\uXXXX escape sequences embedded in s; other text is untouched."""
    decode_unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode_unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        # Python 2's quote() cannot handle unicode; hand it UTF-8 bytes
        s = s.encode('utf-8')
    # Keep RFC 3986 reserved/unreserved punctuation intact
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        # Hostnames use IDNA encoding rather than percent-escaping
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
def read_batch_urls(batch_fd):
    """Read a batch file object and return the list of URLs it contains.

    Blank lines and lines starting with '#', ';' or ']' are skipped; a
    leading BOM and any ' #comment' suffix are stripped. The file object
    is closed on return.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', url, 1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
def update_url_query(url, query):
    """Return url with the key/value pairs of `query` merged into its query string."""
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Rebuild a urllib Request with updated url, data, headers and/or query.

    The HTTP verb of the original request (HEAD/PUT/other) is preserved by
    choosing the matching Request subclass. `headers`/`query` previously
    used mutable default arguments ({}); `None` sentinels avoid the
    shared-mutable-default pitfall while staying call-compatible.
    """
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query or {})
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        # Propagate the socket timeout some callers attach to the request
        new_req.timeout = req.timeout
    return new_req
def _multipart_encode_impl(data, boundary):
    """Encode the dict `data` as multipart/form-data with the given boundary.

    Returns (payload_bytes, content_type). Raises ValueError if the boundary
    occurs inside a field, since that would corrupt the framing.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary

    # Accumulate parts in a list and join once — repeated bytes += is O(n^2)
    boundary_bytes = boundary.encode('ascii')
    parts = []
    for k, v in data.items():
        parts.append(b'--' + boundary_bytes + b'\r\n')
        if isinstance(k, compat_str):
            k = k.encode('utf-8')
        if isinstance(v, compat_str):
            v = v.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        parts.append(content)

    parts.append(b'--' + boundary_bytes + b'--\r\n')

    return b''.join(parts), content_type
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

        try:
            out, content_type = _multipart_encode_impl(data, boundary)
            break
        except ValueError:
            if has_specified_boundary:
                raise
            # A random boundary collided with the payload — retry with a new one
            boundary = None

    return out, content_type
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key (or the first usable of several keys) in d.

    None values are always skipped; falsy values are skipped too unless
    skip_false_values is False.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key in d:
            val = d[key]
            if val is not None and (val or not skip_false_values):
                return val
    return default
def try_get(src, getter, expected_type=None):
    """Apply each getter to src and return the first result that doesn't raise.

    AttributeError/KeyError/TypeError/IndexError from a getter are swallowed;
    when expected_type is given, results of other types are skipped as well.
    """
    for get in variadic(getter):
        try:
            value = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            pass
        else:
            if expected_type is None or isinstance(value, expected_type):
                return value
def merge_dicts(*dicts):
    """Merge dicts left-to-right with earlier dicts winning.

    None values are ignored entirely; an earlier *empty* string may be
    overwritten by a later non-empty string.
    """
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if v is None:
                continue
            if (k not in merged
                    or (isinstance(v, compat_str) and v
                        and isinstance(merged[k], compat_str)
                        and not merged[k])):
                merged[k] = v
    return merged
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Coerce `string` to text (compat_str), decoding bytes with `encoding`.

    NOTE(review): the default `encoding` is evaluated once at import time;
    presumably preferredencoding() is stable for the process lifetime.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
4328 TV_PARENTAL_GUIDELINES
= {
def parse_age_limit(s):
    """Parse an age limit ('18', '18+', US/TV ratings, or an int) to an int or None."""
    if type(s) == int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if m:
        return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
    return None
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, leaving only the JSON payload."""
    jsonp_wrapper = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_wrapper, r'\g<callback_data>', code)
def js_to_json(code, vars={}):
    """Convert a JavaScript object literal into valid JSON text.

    vars is a dict of var, val pairs to substitute for bare identifiers.
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, JS negation shorthand and trailing commas vanish
            return ""

        if v[0] in ("'", '"'):
            # Re-escape the string body for JSON
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            # Hex/octal integers become decimal (quoted when used as keys)
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    return '"%d":' % i if v.endswith(':') else '%d' % i

            if v in vars:
                return vars[v]

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown qualities rank below every known one
            return -1
    return q
4422 'default': '%(title)s [%(id)s].%(ext)s',
4423 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
4429 'description': 'description',
4430 'annotation': 'annotations.xml',
4431 'infojson': 'info.json',
4432 'pl_thumbnail': None,
4433 'pl_description': 'description',
4434 'pl_infojson': 'info.json',
4437 # As of [1] format syntax is:
4438 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
4439 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
4440 STR_FORMAT_RE_TMPL
= r
'''(?x)
4441 (?<!%)(?P<prefix>(?:%%)*)
4443 (?P<has_key>\((?P<key>{0})\))? # mapping key
4445 (?:[#0\-+ ]+)? # conversion flags (optional)
4446 (?:\d+)? # minimum field width (optional)
4447 (?:\.\d+)? # precision (optional)
4448 [hlL]? # length modifier (optional)
4449 {1} # conversion type
4454 STR_FORMAT_TYPES
= 'diouxXeEfFgGcrs'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
def is_outdated_version(version, limit, assume_new=True):
    """Compare version strings; empty/unparseable versions fall back to assume_new."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
4480 def ytdl_is_updateable():
4481 """ Returns if yt-dlp can be updated with -U """
4484 from zipimport
import zipimporter
4486 return isinstance(globals().get('__loader__'), zipimporter
) or hasattr(sys
, 'frozen')
def args_to_str(args):
    """Get a short, shell-quoted string representation for a subprocess command."""
    return ' '.join(map(compat_shlex_quote, args))
def error_to_compat_str(err):
    """Return the message of an exception as text, decoding it on Python 2."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
4503 def mimetype2ext(mt
):
4509 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
4510 # it's the most popular one
4511 'audio/mpeg': 'mp3',
4512 'audio/x-wav': 'wav',
4517 _
, _
, res
= mt
.rpartition('/')
4518 res
= res
.split(';')[0].strip().lower()
4522 'smptett+xml': 'tt',
4526 'x-mp4-fragmented': 'mp4',
4527 'x-ms-sami': 'sami',
4530 'x-mpegurl': 'm3u8',
4531 'vnd.apple.mpegurl': 'm3u8',
4535 'vnd.ms-sstr+xml': 'ism',
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs string into {'vcodec': ..., 'acodec': ...}.

    Empty input yields {}; an unrecognized codec triggers a warning on stderr.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in split_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # With exactly two unknown entries, assume video,audio ordering
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a response's Content-Disposition or Content-Type."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build a base64 data: URI for the given bytes and MIME type."""
    encoded = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, encoded)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        # No BOM: assume UTF-8 with replacement for undecodable bytes
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Determine the download protocol for a format dict, falling back to the URL scheme."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data, delim=False, extraGap=0, hideEmpty=False):
    """ Render a list of rows, each as a list of values """

    def get_max_lens(table):
        return [max(len(compat_str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        return [col for (take, col) in zip(filterArray, row) if take]

    if hideEmpty:
        # Drop columns whose data cells are all empty
        max_lens = get_max_lens(data)
        header_row = filter_using_list(header_row, max_lens)
        data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    if delim:
        # Insert a dashed separator line under the header
        table = [header_row] + [['-' * ml for ml in max_lens]] + data
    format_str = ' '.join('%-' + compat_str(ml + extraGap) + 's' for ml in max_lens[:-1]) + ' %s'
    return '\n'.join(format_str % tuple(row) for row in table)
def _match_one(filter_part, dct):
    """Evaluate a single comparison or unary filter expression against dct."""
    # TODO: Generalize code with YoutubeDL._build_format_filter
    STRING_OPERATORS = {
        '*=': operator.contains,
        '^=': lambda attr, value: attr.startswith(value),
        '$=': lambda attr, value: attr.endswith(value),
        '~=': lambda attr, value: re.search(value, attr),
    }
    COMPARISON_OPERATORS = {
        **STRING_OPERATORS,
        '<=': operator.le,  # "<=" must be defined above "<"
        '<': operator.lt,
        '>=': operator.ge,
        '>': operator.gt,
        '=': operator.eq,
    }

    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
            (?P<strval>.+?)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        unnegated_op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('negation'):
            op = lambda attr, value: not unnegated_op(attr, value)
        else:
            op = unnegated_op
        actual_value = dct.get(m.group('key'))
        if (m.group('quotedstrval') is not None
                or m.group('strval') is not None
                # If the original field is a string and matching comparisonvalue is
                # a number we should respect the origin of the original field
                # and process comparison value as a string (see
                # https://github.com/ytdl-org/youtube-dl/issues/11082).
                or actual_value is not None and m.group('intval') is not None
                and isinstance(actual_value, compat_str)):
            comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
            quote = m.group('quote')
            if quote is not None:
                # Unescape quote characters that were escaped inside the value
                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
        else:
            if m.group('op') in STRING_OPERATORS:
                raise ValueError('Operator %s only supports string values!' % m.group('op'))
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Allow suffixed sizes like 500k / 1.2MiB
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        if actual_value is None:
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    return all(
        _match_one(filter_part.replace(r'\&', '&'), dct)
        for filter_part in re.split(r'(?<!\\)&', filter_str))
def match_filter_func(filter_str):
    """Build a match_filter callable from a filter string.

    The returned callable yields None when the video passes, or a
    human-readable skip message otherwise.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float), or None."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a seconds offset as an SRT timecode (HH:MM:SS,mmm)."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
4781 def dfxp2srt(dfxp_data):
4783 @param dfxp_data A
bytes-like
object containing DFXP data
4784 @returns A
unicode object containing converted SRT data
4786 LEGACY_NAMESPACES = (
4787 (b'http://www.w3.org/ns/ttml', [
4788 b'http://www.w3.org/2004/11/ttaf1',
4789 b'http://www.w3.org/2006/04/ttaf1',
4790 b'http://www.w3.org/2006/10/ttaf1',
4792 (b'http://www.w3.org/ns/ttml#styling', [
4793 b'http://www.w3.org/ns/ttml#style',
4797 SUPPORTED_STYLING = [
4806 _x = functools.partial(xpath_with_ns, ns_map={
4807 'xml': 'http://www.w3.org/XML/1998/namespace',
4808 'ttml': 'http://www.w3.org/ns/ttml',
4809 'tts': 'http://www.w3.org/ns/ttml#styling',
4815 class TTMLPElementParser(object):
4817 _unclosed_elements = []
4818 _applied_styles = []
4820 def start(self, tag, attrib):
4821 if tag in (_x('ttml:br'), 'br'):
4824 unclosed_elements = []
4826 element_style_id = attrib.get('style')
4828 style.update(default_style)
4829 if element_style_id:
4830 style.update(styles.get(element_style_id, {}))
4831 for prop in SUPPORTED_STYLING:
4832 prop_val = attrib.get(_x('tts:' + prop))
4834 style[prop] = prop_val
4837 for k, v in sorted(style.items()):
4838 if self._applied_styles and self._applied_styles[-1].get(k) == v:
4841 font += ' color="%s"' % v
4842 elif k == 'fontSize':
4843 font += ' size="%s"' % v
4844 elif k == 'fontFamily':
4845 font += ' face="%s"' % v
4846 elif k == 'fontWeight' and v == 'bold':
4848 unclosed_elements.append('b')
4849 elif k == 'fontStyle' and v == 'italic':
4851 unclosed_elements.append('i')
4852 elif k == 'textDecoration' and v == 'underline':
4854 unclosed_elements.append('u')
4856 self._out += '<font' + font + '>'
4857 unclosed_elements.append('font')
4859 if self._applied_styles:
4860 applied_style.update(self._applied_styles[-1])
4861 applied_style.update(style)
4862 self._applied_styles.append(applied_style)
4863 self._unclosed_elements.append(unclosed_elements)
4866 if tag not in (_x('ttml:br'), 'br'):
4867 unclosed_elements = self._unclosed_elements.pop()
4868 for element in reversed(unclosed_elements):
4869 self._out += '</%s>' % element
4870 if unclosed_elements and self._applied_styles:
4871 self._applied_styles.pop()
4873 def data(self, data):
4877 return self._out.strip()
4879 def parse_node(node):
4880 target = TTMLPElementParser()
4881 parser = xml.etree.ElementTree.XMLParser(target=target)
4882 parser.feed(xml.etree.ElementTree.tostring(node))
4883 return parser.close()
4885 for k, v in LEGACY_NAMESPACES:
4887 dfxp_data = dfxp_data.replace(ns, k)
4889 dfxp = compat_etree_fromstring(dfxp_data)
4891 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
4894 raise ValueError('Invalid dfxp/TTML subtitle')
4898 for style in dfxp.findall(_x('.//ttml:style')):
4899 style_id = style.get('id') or style.get(_x('xml:id'))
4902 parent_style_id = style.get('style')
4904 if parent_style_id not in styles:
4907 styles[style_id] = styles[parent_style_id].copy()
4908 for prop in SUPPORTED_STYLING:
4909 prop_val = style.get(_x('tts:' + prop))
4911 styles.setdefault(style_id, {})[prop] = prop_val
4917 for p in ('body', 'div'):
4918 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4921 style = styles.get(ele.get('style'))
4924 default_style.update(style)
4926 for para, index in zip(paras, itertools.count(1)):
4927 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
4928 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
4929 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4930 if begin_time is None:
4935 end_time = begin_time + dur
4936 out.append('%d\n%s --> %s\n%s\n\n' % (
4938 srt_subtitles_timecode(begin_time),
4939 srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [command_option, value] for a set parameter, else []."""
    param = params.get(param)
    if param:
        param = compat_str(param)
    return [command_option, param] if param is not None else []
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Map a boolean parameter to a CLI flag; with `separator`, emit a single joined token."""
    param = params.get(param)
    if param is None:
        return []
    assert isinstance(param, bool)
    chosen = true_value if param else false_value
    if separator:
        return [command_option + separator + chosen]
    return [command_option, chosen]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when the parameter equals expected_value, else []."""
    return [command_option] if params.get(param) == expected_value else []
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Select extra CLI args from argdict by trying each key list in order."""
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        else:
            argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        arg_list = list(filter(
            lambda x: x is not None,
            [argdict.get(key.lower()) for key in variadic(key_list)]))
        if arg_list:
            return [arg for args in arg_list for arg in args]
    return default
4987 class ISO639Utils(object):
4988 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
5047 'iw': 'heb', # Replaced by he in 1989 revision
5057 'in': 'ind', # Replaced by id in 1989 revision
5172 'ji': 'yid', # Replaced by yi in 1989 revision
5180 def short2long(cls, code):
5181 """Convert language code from ISO 639-1 to ISO 639-2/T"""
5182 return cls._lang_map.get(code[:2])
5185 def long2short(cls, code):
5186 """Convert language code from ISO 639-2/T to ISO 639-1"""
5187 for short_name, long_name in cls._lang_map.items():
5188 if long_name == code:
5192 class ISO3166Utils(object):
5193 # From http://data.okfn.org/data/core/country-list
5195 'AF': 'Afghanistan',
5196 'AX': 'Åland Islands',
5199 'AS': 'American Samoa',
5204 'AG': 'Antigua and Barbuda',
5221 'BO': 'Bolivia, Plurinational State of',
5222 'BQ': 'Bonaire, Sint Eustatius and Saba',
5223 'BA': 'Bosnia and Herzegovina',
5225 'BV': 'Bouvet Island',
5227 'IO': 'British Indian Ocean Territory',
5228 'BN': 'Brunei Darussalam',
5230 'BF': 'Burkina Faso',
5236 'KY': 'Cayman Islands',
5237 'CF': 'Central African Republic',
5241 'CX': 'Christmas Island',
5242 'CC': 'Cocos (Keeling) Islands',
5246 'CD': 'Congo, the Democratic Republic of the',
5247 'CK': 'Cook Islands',
5249 'CI': 'Côte d\'Ivoire',
5254 'CZ': 'Czech Republic',
5258 'DO': 'Dominican Republic',
5261 'SV': 'El Salvador',
5262 'GQ': 'Equatorial Guinea',
5266 'FK': 'Falkland Islands (Malvinas)',
5267 'FO': 'Faroe Islands',
5271 'GF': 'French Guiana',
5272 'PF': 'French Polynesia',
5273 'TF': 'French Southern Territories',
5288 'GW': 'Guinea-Bissau',
5291 'HM': 'Heard Island and McDonald Islands',
5292 'VA': 'Holy See (Vatican City State)',
5299 'IR': 'Iran, Islamic Republic of',
5302 'IM': 'Isle of Man',
5312 'KP': 'Korea, Democratic People\'s Republic of',
5313 'KR': 'Korea, Republic of',
5316 'LA': 'Lao People\'s Democratic Republic',
5322 'LI': 'Liechtenstein',
5326 'MK': 'Macedonia, the Former Yugoslav Republic of',
5333 'MH': 'Marshall Islands',
5339 'FM': 'Micronesia, Federated States of',
5340 'MD': 'Moldova, Republic of',
5351 'NL': 'Netherlands',
5352 'NC': 'New Caledonia',
5353 'NZ': 'New Zealand',
5358 'NF': 'Norfolk Island',
5359 'MP': 'Northern Mariana Islands',
5364 'PS': 'Palestine, State of',
5366 'PG': 'Papua New Guinea',
5369 'PH': 'Philippines',
5373 'PR': 'Puerto Rico',
5377 'RU': 'Russian Federation',
5379 'BL': 'Saint Barthélemy',
5380 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
5381 'KN': 'Saint Kitts and Nevis',
5382 'LC': 'Saint Lucia',
5383 'MF': 'Saint Martin (French part)',
5384 'PM': 'Saint Pierre and Miquelon',
5385 'VC': 'Saint Vincent and the Grenadines',
5388 'ST': 'Sao Tome and Principe',
5389 'SA': 'Saudi Arabia',
5393 'SL': 'Sierra Leone',
5395 'SX': 'Sint Maarten (Dutch part)',
5398 'SB': 'Solomon Islands',
5400 'ZA': 'South Africa',
5401 'GS': 'South Georgia and the South Sandwich Islands',
5402 'SS': 'South Sudan',
5407 'SJ': 'Svalbard and Jan Mayen',
5410 'CH': 'Switzerland',
5411 'SY': 'Syrian Arab Republic',
5412 'TW': 'Taiwan, Province of China',
5414 'TZ': 'Tanzania, United Republic of',
5416 'TL': 'Timor-Leste',
5420 'TT': 'Trinidad and Tobago',
5423 'TM': 'Turkmenistan',
5424 'TC': 'Turks and Caicos Islands',
5428 'AE': 'United Arab Emirates',
5429 'GB': 'United Kingdom',
5430 'US': 'United States',
5431 'UM': 'United States Minor Outlying Islands',
5435 'VE': 'Venezuela, Bolivarian Republic of',
5437 'VG': 'Virgin Islands, British',
5438 'VI': 'Virgin Islands, U.S.',
5439 'WF': 'Wallis and Futuna',
5440 'EH': 'Western Sahara',
5447 def short2full(cls, code):
5448 """Convert an ISO 3166-2 country code to the corresponding full name"""
5449 return cls._country_map.get(code.upper())
5452 class GeoUtils(object):
5453 # Major IPv4 address blocks per country
5455 'AD': '46.172.224.0/19',
5456 'AE': '94.200.0.0/13',
5457 'AF': '149.54.0.0/17',
5458 'AG': '209.59.64.0/18',
5459 'AI': '204.14.248.0/21',
5460 'AL': '46.99.0.0/16',
5461 'AM': '46.70.0.0/15',
5462 'AO': '105.168.0.0/13',
5463 'AP': '182.50.184.0/21',
5464 'AQ': '23.154.160.0/24',
5465 'AR': '181.0.0.0/12',
5466 'AS': '202.70.112.0/20',
5467 'AT': '77.116.0.0/14',
5468 'AU': '1.128.0.0/11',
5469 'AW': '181.41.0.0/18',
5470 'AX': '185.217.4.0/22',
5471 'AZ': '5.197.0.0/16',
5472 'BA': '31.176.128.0/17',
5473 'BB': '65.48.128.0/17',
5474 'BD': '114.130.0.0/16',
5476 'BF': '102.178.0.0/15',
5477 'BG': '95.42.0.0/15',
5478 'BH': '37.131.0.0/17',
5479 'BI': '154.117.192.0/18',
5480 'BJ': '137.255.0.0/16',
5481 'BL': '185.212.72.0/23',
5482 'BM': '196.12.64.0/18',
5483 'BN': '156.31.0.0/16',
5484 'BO': '161.56.0.0/16',
5485 'BQ': '161.0.80.0/20',
5486 'BR': '191.128.0.0/12',
5487 'BS': '24.51.64.0/18',
5488 'BT': '119.2.96.0/19',
5489 'BW': '168.167.0.0/16',
5490 'BY': '178.120.0.0/13',
5491 'BZ': '179.42.192.0/18',
5492 'CA': '99.224.0.0/11',
5493 'CD': '41.243.0.0/16',
5494 'CF': '197.242.176.0/21',
5495 'CG': '160.113.0.0/16',
5496 'CH': '85.0.0.0/13',
5497 'CI': '102.136.0.0/14',
5498 'CK': '202.65.32.0/19',
5499 'CL': '152.172.0.0/14',
5500 'CM': '102.244.0.0/14',
5501 'CN': '36.128.0.0/10',
5502 'CO': '181.240.0.0/12',
5503 'CR': '201.192.0.0/12',
5504 'CU': '152.206.0.0/15',
5505 'CV': '165.90.96.0/19',
5506 'CW': '190.88.128.0/17',
5507 'CY': '31.153.0.0/16',
5508 'CZ': '88.100.0.0/14',
5510 'DJ': '197.241.0.0/17',
5511 'DK': '87.48.0.0/12',
5512 'DM': '192.243.48.0/20',
5513 'DO': '152.166.0.0/15',
5514 'DZ': '41.96.0.0/12',
5515 'EC': '186.68.0.0/15',
5516 'EE': '90.190.0.0/15',
5517 'EG': '156.160.0.0/11',
5518 'ER': '196.200.96.0/20',
5519 'ES': '88.0.0.0/11',
5520 'ET': '196.188.0.0/14',
5521 'EU': '2.16.0.0/13',
5522 'FI': '91.152.0.0/13',
5523 'FJ': '144.120.0.0/16',
5524 'FK': '80.73.208.0/21',
5525 'FM': '119.252.112.0/20',
5526 'FO': '88.85.32.0/19',
5528 'GA': '41.158.0.0/15',
5530 'GD': '74.122.88.0/21',
5531 'GE': '31.146.0.0/16',
5532 'GF': '161.22.64.0/18',
5533 'GG': '62.68.160.0/19',
5534 'GH': '154.160.0.0/12',
5535 'GI': '95.164.0.0/16',
5536 'GL': '88.83.0.0/19',
5537 'GM': '160.182.0.0/15',
5538 'GN': '197.149.192.0/18',
5539 'GP': '104.250.0.0/19',
5540 'GQ': '105.235.224.0/20',
5541 'GR': '94.64.0.0/13',
5542 'GT': '168.234.0.0/16',
5543 'GU': '168.123.0.0/16',
5544 'GW': '197.214.80.0/20',
5545 'GY': '181.41.64.0/18',
5546 'HK': '113.252.0.0/14',
5547 'HN': '181.210.0.0/16',
5548 'HR': '93.136.0.0/13',
5549 'HT': '148.102.128.0/17',
5550 'HU': '84.0.0.0/14',
5551 'ID': '39.192.0.0/10',
5552 'IE': '87.32.0.0/12',
5553 'IL': '79.176.0.0/13',
5554 'IM': '5.62.80.0/20',
5555 'IN': '117.192.0.0/10',
5556 'IO': '203.83.48.0/21',
5557 'IQ': '37.236.0.0/14',
5558 'IR': '2.176.0.0/12',
5559 'IS': '82.221.0.0/16',
5560 'IT': '79.0.0.0/10',
5561 'JE': '87.244.64.0/18',
5562 'JM': '72.27.0.0/17',
5563 'JO': '176.29.0.0/16',
5564 'JP': '133.0.0.0/8',
5565 'KE': '105.48.0.0/12',
5566 'KG': '158.181.128.0/17',
5567 'KH': '36.37.128.0/17',
5568 'KI': '103.25.140.0/22',
5569 'KM': '197.255.224.0/20',
5570 'KN': '198.167.192.0/19',
5571 'KP': '175.45.176.0/22',
5572 'KR': '175.192.0.0/10',
5573 'KW': '37.36.0.0/14',
5574 'KY': '64.96.0.0/15',
5575 'KZ': '2.72.0.0/13',
5576 'LA': '115.84.64.0/18',
5577 'LB': '178.135.0.0/16',
5578 'LC': '24.92.144.0/20',
5579 'LI': '82.117.0.0/19',
5580 'LK': '112.134.0.0/15',
5581 'LR': '102.183.0.0/16',
5582 'LS': '129.232.0.0/17',
5583 'LT': '78.56.0.0/13',
5584 'LU': '188.42.0.0/16',
5585 'LV': '46.109.0.0/16',
5586 'LY': '41.252.0.0/14',
5587 'MA': '105.128.0.0/11',
5588 'MC': '88.209.64.0/18',
5589 'MD': '37.246.0.0/16',
5590 'ME': '178.175.0.0/17',
5591 'MF': '74.112.232.0/21',
5592 'MG': '154.126.0.0/17',
5593 'MH': '117.103.88.0/21',
5594 'MK': '77.28.0.0/15',
5595 'ML': '154.118.128.0/18',
5596 'MM': '37.111.0.0/17',
5597 'MN': '49.0.128.0/17',
5598 'MO': '60.246.0.0/16',
5599 'MP': '202.88.64.0/20',
5600 'MQ': '109.203.224.0/19',
5601 'MR': '41.188.64.0/18',
5602 'MS': '208.90.112.0/22',
5603 'MT': '46.11.0.0/16',
5604 'MU': '105.16.0.0/12',
5605 'MV': '27.114.128.0/18',
5606 'MW': '102.70.0.0/15',
5607 'MX': '187.192.0.0/11',
5608 'MY': '175.136.0.0/13',
5609 'MZ': '197.218.0.0/15',
5610 'NA': '41.182.0.0/16',
5611 'NC': '101.101.0.0/18',
5612 'NE': '197.214.0.0/18',
5613 'NF': '203.17.240.0/22',
5614 'NG': '105.112.0.0/12',
5615 'NI': '186.76.0.0/15',
5616 'NL': '145.96.0.0/11',
5617 'NO': '84.208.0.0/13',
5618 'NP': '36.252.0.0/15',
5619 'NR': '203.98.224.0/19',
5620 'NU': '49.156.48.0/22',
5621 'NZ': '49.224.0.0/14',
5622 'OM': '5.36.0.0/15',
5623 'PA': '186.72.0.0/15',
5624 'PE': '186.160.0.0/14',
5625 'PF': '123.50.64.0/18',
5626 'PG': '124.240.192.0/19',
5627 'PH': '49.144.0.0/13',
5628 'PK': '39.32.0.0/11',
5629 'PL': '83.0.0.0/11',
5630 'PM': '70.36.0.0/20',
5631 'PR': '66.50.0.0/16',
5632 'PS': '188.161.0.0/16',
5633 'PT': '85.240.0.0/13',
5634 'PW': '202.124.224.0/20',
5635 'PY': '181.120.0.0/14',
5636 'QA': '37.210.0.0/15',
5637 'RE': '102.35.0.0/16',
5638 'RO': '79.112.0.0/13',
5639 'RS': '93.86.0.0/15',
5640 'RU': '5.136.0.0/13',
5641 'RW': '41.186.0.0/16',
5642 'SA': '188.48.0.0/13',
5643 'SB': '202.1.160.0/19',
5644 'SC': '154.192.0.0/11',
5645 'SD': '102.120.0.0/13',
5646 'SE': '78.64.0.0/12',
5647 'SG': '8.128.0.0/10',
5648 'SI': '188.196.0.0/14',
5649 'SK': '78.98.0.0/15',
5650 'SL': '102.143.0.0/17',
5651 'SM': '89.186.32.0/19',
5652 'SN': '41.82.0.0/15',
5653 'SO': '154.115.192.0/18',
5654 'SR': '186.179.128.0/17',
5655 'SS': '105.235.208.0/21',
5656 'ST': '197.159.160.0/19',
5657 'SV': '168.243.0.0/16',
5658 'SX': '190.102.0.0/20',
5660 'SZ': '41.84.224.0/19',
5661 'TC': '65.255.48.0/20',
5662 'TD': '154.68.128.0/19',
5663 'TG': '196.168.0.0/14',
5664 'TH': '171.96.0.0/13',
5665 'TJ': '85.9.128.0/18',
5666 'TK': '27.96.24.0/21',
5667 'TL': '180.189.160.0/20',
5668 'TM': '95.85.96.0/19',
5669 'TN': '197.0.0.0/11',
5670 'TO': '175.176.144.0/21',
5671 'TR': '78.160.0.0/11',
5672 'TT': '186.44.0.0/15',
5673 'TV': '202.2.96.0/19',
5674 'TW': '120.96.0.0/11',
5675 'TZ': '156.156.0.0/14',
5676 'UA': '37.52.0.0/14',
5677 'UG': '102.80.0.0/13',
5679 'UY': '167.56.0.0/13',
5680 'UZ': '84.54.64.0/18',
5681 'VA': '212.77.0.0/19',
5682 'VC': '207.191.240.0/21',
5683 'VE': '186.88.0.0/13',
5684 'VG': '66.81.192.0/20',
5685 'VI': '146.226.0.0/16',
5686 'VN': '14.160.0.0/11',
5687 'VU': '202.80.32.0/20',
5688 'WF': '117.20.32.0/21',
5689 'WS': '202.4.32.0/19',
5690 'YE': '134.35.0.0/16',
5691 'YT': '41.242.116.0/22',
5692 'ZA': '41.0.0.0/11',
5693 'ZM': '102.144.0.0/13',
5694 'ZW': '102.177.192.0/18',
5698 def random_ipv4(cls, code_or_block):
5699 if len(code_or_block) == 2:
5700 block = cls._country_ip_map.get(code_or_block.upper())
5704 block = code_or_block
5705 addr, preflen = block.split('/')
5706 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
5707 addr_max = addr_min | (0xffffffff >> int(preflen))
5708 return compat_str(socket.inet_ntoa(
5709 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler honouring a per-request 'Ytdl-request-proxy' header.

    The header value overrides the handler-level proxy; the special value
    '__noproxy__' disables proxying for that request, and socks*:// proxies
    are forwarded via the 'Ytdl-socks-proxy' header instead of being opened
    here.
    """

    def __init__(self, proxies=None):
        # Install default http/https open methods routed through proxy_open
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers perform the actual socks wrapping
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
# released into Public Domain
# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of
    the byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # Emit the integer 32 bits at a time, most significant word first.
    packed = b''
    n = int(n)
    while n > 0:
        packed = compat_struct_pack('>I', n & 0xffffffff) + packed
        n >>= 32
    # Drop leading NUL bytes; keep a single NUL when n was zero.
    stripped = packed.lstrip(b'\000')
    if not stripped:
        stripped = b'\000'
    # Front-pad so the length is a multiple of blocksize, if requested.
    if blocksize > 0 and len(stripped) % blocksize:
        stripped = (blocksize - len(stripped) % blocksize) * b'\000' + stripped
    return stripped
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    result = 0
    length = len(s)
    remainder = length % 4
    if remainder:
        # Front-pad with NULs so the length is a whole number of 32-bit words.
        pad = 4 - remainder
        s = b'\000' * pad + s
        length += pad
    for offset in range(0, length, 4):
        result = (result << 32) + compat_struct_unpack('>I', s[offset:offset + 4])[0]
    return result
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # OHDave's JS implementation treats the data as a little-endian
    # integer, hence the [::-1] reversal before hex conversion.
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data        input data
    @param {int} length        target length
    @returns {int[]}           padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # EME-PKCS1-v1_5 encoding: 0x00 || 0x02 || PS || 0x00 || message.
    # RFC 8017 section 7.2.1 requires the pseudo-random padding string PS
    # to consist of *non-zero* octets — randint(1, 254) guarantees that
    # (the original randint(0, 254) could emit a stray zero, which would
    # truncate the message at decryption time).
    pseudo_random = [random.randint(1, 254) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer num in base n.

    Digits are taken from table (defaulting to the first n characters of
    0-9a-zA-Z); raises ValueError when the table is too short for the base.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    digits = []
    while num:
        num, rem = divmod(num, n)
        digits.append(table[rem])
    return ''.join(reversed(digits))
def decode_packed_codes(code):
    """Decode JavaScript obfuscated with the P.A.C.K.E.R. packer."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Build the mapping from base-n token back to its original symbol;
    # an empty symbol means the token stands for itself.
    symbol_table = {}
    for idx in range(count):
        token = encode_base_n(idx, base)
        symbol_table[token] = symbols[idx] or token

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
def caesar(s, alphabet, shift):
    """Shift every character of s that appears in alphabet by shift places
    (wrapping around); characters outside the alphabet pass through."""
    if shift == 0:
        return s
    size = len(alphabet)
    out = []
    for ch in s:
        if ch in alphabet:
            out.append(alphabet[(alphabet.index(ch) + shift) % size])
        else:
            out.append(ch)
    return ''.join(out)
def rot47(s):
    # ROT47: rotate the 94 printable ASCII characters by 47 positions.
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=val,KEY="quoted",...') into a dict,
    stripping surrounding double quotes from quoted values."""
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {
        key: (val[1:-1] if val.startswith('"') else val)
        for key, val in pairs}
def urshift(val, n):
    """Unsigned 32-bit right shift, like JavaScript's '>>>' operator."""
    if val >= 0:
        return val >> n
    # Map a negative value to its unsigned 32-bit representation first.
    return (val + 0x100000000) >> n
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    # Reference: https://www.w3.org/TR/PNG/
    # Returns (width, height, pixels) where pixels is a list of rows, each
    # row being a flat list of byte values (3 bytes per pixel).
    header = png_data[8:]

    # Validate the 8-byte PNG signature and that the first chunk is IHDR.
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Big-endian unsigned integer of 1, 2 or 4 bytes.
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Split the stream into chunks: 4-byte length, 4-byte type, data, CRC.
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # IHDR is always the first chunk; it carries the image dimensions.
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Concatenate all IDAT chunks into one zlib stream.
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # 3 bytes per pixel (truecolour, no alpha assumed).
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        # Look up an already-reconstructed byte by flat index.
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Each scanline is prefixed by a one-byte filter type.
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Neighbours used by the filters; 0 outside the image.
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            # Undo the per-scanline filter (PNG spec, 'Filtering').
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Pick the predictor closest to p (ties favour a, then b).
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    # Write the extended attribute `key` = `value` (bytes) on file `path`.
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'yt-dlp requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            # No pyxattr: fall back to the setfattr/xattr CLI tools.
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:
                # The CLI tools take the value as text, not bytes.
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)]
                       + [encodeArgument(o) for o in opts]
                       + [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = process_communicate_or_kill(p)
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to the year, month and
    day (as strings) of a random date between 1950-01-01 and 1995-12-31."""
    first = datetime.date(1950, 1, 1)
    last = datetime.date(1995, 12, 31)
    picked = first + datetime.timedelta(days=random.randint(0, (last - first).days))
    return {
        year_field: str(picked.year),
        month_field: str(picked.month),
        day_field: str(picked.day),
    }
# Templates for internet shortcut files, which are plain text files.
# Windows .url shortcut.
DOT_URL_LINK_TEMPLATE = '''
[InternetShortcut]
URL=%(url)s
'''.lstrip()

# macOS .webloc shortcut (an XML property list).
DOT_WEBLOC_LINK_TEMPLATE = '''
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''.lstrip()

# freedesktop.org .desktop link entry (Linux).
DOT_DESKTOP_LINK_TEMPLATE = '''
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''.lstrip()
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
            net_location,

            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
def to_high_limit_path(path):
    """On Windows, return the absolute path with an extended-length prefix
    to work around the MAX_PATH limitation; elsewhere return path as-is.
    The maximum allowed length for individual path segments may still be
    quite limited."""
    if sys.platform in ('win32', 'cygwin'):
        return r'\\?\ '.rstrip() + os.path.abspath(path)
    return path
def format_field(obj, field, template='%s', ignore=(None, ''), default='', func=None):
    """Fetch obj[field] and render it with template.

    Values in `ignore` (before or after applying func) yield `default`
    instead; func, when given, transforms the value before formatting.
    """
    val = obj.get(field, default)
    if val not in ignore and func:
        val = func(val)
    if val in ignore:
        return default
    return template % val
def clean_podcast_url(url):
    """Strip known podcast tracking/analytics redirect prefixes from url."""
    return re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/''', '', url)
# Hex digits used when generating random UUID nibbles.
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Generate a random RFC 4122 version-4 UUID string.

    Each 'x' position gets a uniformly random hex digit; the 'y' position
    carries the variant bits and must be one of 8, 9, a or b (RFC 4122
    section 4.4) — the original drew it from all 16 digits, producing
    invalid variants three quarters of the time.
    """
    return re.sub(
        r'[xy]',
        lambda m: random.choice('89ab') if m.group(0) == 'y' else _HEX_TABLE[random.randint(0, 15)],
        'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
def make_dir(path, to_screen=None):
    """Create the parent directory of path if it does not already exist.

    Returns True on success (or when there is nothing to do), False on
    failure; on failure the error is reported through to_screen when a
    callable is supplied.
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except (OSError, IOError) as err:
        # Bug fix: the original tested `callable(to_screen) is not None`,
        # which is always True (callable() returns a bool) and crashed
        # with a TypeError when to_screen was None.
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
def get_executable_path():
    """Best-effort guess at the directory yt-dlp is being run from."""
    from zipimport import zipimporter
    if hasattr(sys, 'frozen'):
        # Running from PyInstaller: next to the frozen executable.
        base = os.path.dirname(sys.executable)
    elif isinstance(globals().get('__loader__'), zipimporter):
        # Running from a ZIP bundle: two levels above this module.
        base = os.path.join(os.path.dirname(__file__), '../..')
    else:
        # Plain checkout/install: the package's parent directory.
        base = os.path.join(os.path.dirname(__file__), '..')
    return os.path.abspath(base)
def load_plugins(name, suffix, namespace):
    """Load classes whose names end with suffix from the ytdlp_plugins
    module called name, injecting them into namespace.

    Returns the list of newly loaded classes; a missing plugin module is
    silently ignored. Names already present in namespace are skipped.
    """
    plugin_info = [None]
    classes = []
    try:
        plugin_info = imp.find_module(
            name, [os.path.join(get_executable_path(), 'ytdlp_plugins')])
        plugins = imp.load_module(name, *plugin_info)
        for attr in dir(plugins):
            if attr in namespace or not attr.endswith(suffix):
                continue
            plugin_class = getattr(plugins, attr)
            classes.append(plugin_class)
            namespace[attr] = plugin_class
    except ImportError:
        pass
    finally:
        # imp.find_module returns an open file object as the first element.
        if plugin_info[0] is not None:
            plugin_info[0].close()
    return classes
6242 obj
, *path_list
, default
=None, expected_type
=None, get_all
=True,
6243 casesense
=True, is_user_input
=False, traverse_string
=False):
6244 ''' Traverse nested list/dict/tuple
6245 @param path_list A list of paths which are checked one by one.
6246 Each path is a list of keys where each key is a string,
6247 a tuple of strings or "...". When a tuple is given,
6248 all the keys given in the tuple are traversed, and
6249 "..." traverses all the keys in the object
6250 @param default Default value to return
6251 @param expected_type Only accept final value of this type (Can also be any callable)
6252 @param get_all Return all the values obtained from a path or only the first one
6253 @param casesense Whether to consider dictionary keys as case sensitive
6254 @param is_user_input Whether the keys are generated from user input. If True,
6255 strings are converted to int/slice if necessary
6256 @param traverse_string Whether to traverse inside strings. If True, any
6257 non-compatible object will also be converted into a string
6261 _lower
= lambda k
: (k
.lower() if isinstance(k
, str) else k
)
6262 path_list
= (map(_lower
, variadic(path
)) for path
in path_list
)
6264 def _traverse_obj(obj
, path
, _current_depth
=0):
6268 path
= tuple(variadic(path
))
6269 for i
, key
in enumerate(path
):
6270 if isinstance(key
, (list, tuple)):
6271 obj
= [_traverse_obj(obj
, sub_key
, _current_depth
) for sub_key
in key
]
6274 obj
= (obj
.values() if isinstance(obj
, dict)
6275 else obj
if isinstance(obj
, (list, tuple, LazyList
))
6276 else str(obj
) if traverse_string
else [])
6278 depth
= max(depth
, _current_depth
)
6279 return [_traverse_obj(inner_obj
, path
[i
+ 1:], _current_depth
) for inner_obj
in obj
]
6280 elif isinstance(obj
, dict) and not (is_user_input
and key
== ':'):
6281 obj
= (obj
.get(key
) if casesense
or (key
in obj
)
6282 else next((v
for k
, v
in obj
.items() if _lower(k
) == key
), None))
6285 key
= (int_or_none(key
) if ':' not in key
6286 else slice(*map(int_or_none
, key
.split(':'))))
6287 if key
== slice(None):
6288 return _traverse_obj(obj
, (..., *path
[i
+ 1:]), _current_depth
)
6289 if not isinstance(key
, (int, slice)):
6291 if not isinstance(obj
, (list, tuple, LazyList
)):
6292 if not traverse_string
:
6301 if isinstance(expected_type
, type):
6302 type_test
= lambda val
: val
if isinstance(val
, expected_type
) else None
6303 elif expected_type
is not None:
6304 type_test
= expected_type
6306 type_test
= lambda val
: val
6308 for path
in path_list
:
6310 val
= _traverse_obj(obj
, path
)
6313 for _
in range(depth
- 1):
6314 val
= itertools
.chain
.from_iterable(v
for v
in val
if v
is not None)
6315 val
= [v
for v
in map(type_test
, val
) if v
is not None]
6317 return val
if get_all
else val
[0]
6319 val
= type_test(val
)
def traverse_dict(dictn, keys, casesense=True):
    '''Deprecated wrapper around traverse_obj, kept for backward
    compatibility only. Do not use in new code.'''
    return traverse_obj(
        dictn, keys, casesense=casesense,
        is_user_input=True, traverse_string=True)
def variadic(x, allowed_types=(str, bytes)):
    """Return x unchanged when it is a non-atomic iterable, otherwise wrap
    it in a 1-tuple. Types in allowed_types (str/bytes by default) are
    treated as atomic even though they are iterable."""
    if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types):
        return x
    return (x,)