4 from __future__
import unicode_literals
37 import xml
.etree
.ElementTree
41 compat_HTMLParseError
,
47 compat_ctypes_WINFUNCTYPE
,
48 compat_etree_fromstring
,
51 compat_html_entities_html5
,
64 compat_urllib_parse_urlencode
,
65 compat_urllib_parse_urlparse
,
66 compat_urllib_parse_urlunparse
,
67 compat_urllib_parse_quote
,
68 compat_urllib_parse_quote_plus
,
69 compat_urllib_parse_unquote_plus
,
70 compat_urllib_request
,
def register_socks_protocols():
    """Make urlparse treat SOCKS schemes as netloc-carrying protocols.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose scheme is missing from
    urlparse.uses_netloc are not handled correctly. Append each SOCKS
    scheme to that registry exactly once.
    """
    registry = compat_urlparse.uses_netloc
    for proto in 'socks', 'socks4', 'socks4a', 'socks5':
        if proto not in registry:
            registry.append(proto)
# This is not clearly defined otherwise
# Runtime type of a compiled regular-expression object, captured once so it
# can be used in isinstance() checks (a stable public name for it is not
# available on all supported Python versions).
compiled_regex_type = type(re.compile(''))
94 def random_user_agent():
95 _USER_AGENT_TPL
= 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
1674 return _USER_AGENT_TPL
% random
.choice(_CHROME_VERSIONS
)
1678 'User-Agent': random_user_agent(),
1679 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
1680 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
1681 'Accept-Encoding': 'gzip, deflate',
1682 'Accept-Language': 'en-us,en;q=0.5',
1687 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# Unique sentinel used as a default argument value, so callers can
# distinguish "no default supplied" from an explicit None/falsy default.
NO_DEFAULT = object()
# Full English month names in calendar order (index 0 == January).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
1698 'en': ENGLISH_MONTH_NAMES
,
1700 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
1701 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
1704 KNOWN_EXTENSIONS
= (
1705 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
1706 'flv', 'f4v', 'f4a', 'f4b',
1707 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
1708 'mkv', 'mka', 'mk3d',
1711 'asf', 'wmv', 'wma',
1717 'f4f', 'f4m', 'm3u8', 'smil')
# needed for sanitizing filenames in restricted mode
# Maps each accented/special Latin character to an ASCII replacement.
# Single-character replacements are given as plain strings (chain yields
# them char by char); multi-character replacements such as 'AE' or 'ss'
# are wrapped in lists so itertools.chain yields them as one item.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
1746 '%Y/%m/%d %H:%M:%S',
1750 '%Y-%m-%d %H:%M:%S',
1751 '%Y-%m-%d %H:%M:%S.%f',
1752 '%Y-%m-%d %H:%M:%S:%f',
1755 '%Y-%m-%dT%H:%M:%SZ',
1756 '%Y-%m-%dT%H:%M:%S.%fZ',
1757 '%Y-%m-%dT%H:%M:%S.%f0Z',
1758 '%Y-%m-%dT%H:%M:%S',
1759 '%Y-%m-%dT%H:%M:%S.%f',
1761 '%b %d %Y at %H:%M',
1762 '%b %d %Y at %H:%M:%S',
1763 '%B %d %Y at %H:%M',
1764 '%B %d %Y at %H:%M:%S',
1767 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
1768 DATE_FORMATS_DAY_FIRST
.extend([
1774 '%d/%m/%Y %H:%M:%S',
1777 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
1778 DATE_FORMATS_MONTH_FIRST
.extend([
1783 '%m/%d/%Y %H:%M:%S',
# Matches P.A.C.K.E.R.-style packed JavaScript: captures the payload, the
# radix, the word count, and the '|'-separated keyword list.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Extracts the body of a <script type="application/ld+json"> element into
# the named group "json_ld"; \1 backreferences the (optional) quote
# character around the type attribute value.
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
1790 def preferredencoding():
1791 """Get preferred encoding.
1793 Returns the best encoding scheme for the system, based on
1794 locale.getpreferredencoding() and some further tweaks.
1797 pref = locale.getpreferredencoding()
1805 def write_json_file(obj, fn):
1806 """ Encode obj as JSON and write it to fn, atomically if possible """
1808 fn = encodeFilename(fn)
1809 if sys.version_info < (3, 0) and sys.platform != 'win32
':
1810 encoding = get_filesystem_encoding()
1811 # os.path.basename returns a bytes object, but NamedTemporaryFile
1812 # will fail if the filename contains non ascii characters unless we
1813 # use a unicode object
1814 path_basename = lambda f: os.path.basename(fn).decode(encoding)
1815 # the same for os.path.dirname
1816 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
1818 path_basename = os.path.basename
1819 path_dirname = os.path.dirname
1823 'prefix
': path_basename(fn) + '.',
1824 'dir': path_dirname(fn),
1828 # In Python 2.x, json.dump expects a bytestream.
1829 # In Python 3.x, it writes to a character stream
1830 if sys.version_info < (3, 0):
1835 'encoding
': 'utf
-8',
1838 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
1843 if sys.platform == 'win32
':
1844 # Need to remove existing file on Windows, else os.rename raises
1845 # WindowsError or FileExistsError.
1853 os.chmod(tf.name, 0o666 & ~mask)
1856 os.rename(tf.name, fn)
1865 if sys.version_info >= (2, 7):
1866 def find_xpath_attr(node, xpath, key, val=None):
1867 """ Find the xpath xpath[@key=val] """
1868 assert re.match(r'^
[a
-zA
-Z_
-]+$
', key)
1869 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
1870 return node.find(expr)
1872 def find_xpath_attr(node, xpath, key, val=None):
1873 for f in node.findall(compat_xpath(xpath)):
1874 if key not in f.attrib:
1876 if val is None or f.attrib.get(key) == val:
1880 # On python2.6 the xml.etree.ElementTree.Element methods don't support
1881 # the namespace parameter
1884 def xpath_with_ns(path
, ns_map
):
1885 components
= [c
.split(':') for c
in path
.split('/')]
1887 for c
in components
:
1889 replaced
.append(c
[0])
1892 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
1893 return '/'.join(replaced
)
1896 def xpath_element(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1897 def _find_xpath(xpath
):
1898 return node
.find(compat_xpath(xpath
))
1900 if isinstance(xpath
, (str, compat_str
)):
1901 n
= _find_xpath(xpath
)
1909 if default
is not NO_DEFAULT
:
1912 name
= xpath
if name
is None else name
1913 raise ExtractorError('Could not find XML element %s' % name
)
1919 def xpath_text(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1920 n
= xpath_element(node
, xpath
, name
, fatal
=fatal
, default
=default
)
1921 if n
is None or n
== default
:
1924 if default
is not NO_DEFAULT
:
1927 name
= xpath
if name
is None else name
1928 raise ExtractorError('Could not find XML element\'s text %s' % name
)
1934 def xpath_attr(node
, xpath
, key
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1935 n
= find_xpath_attr(node
, xpath
, key
)
1937 if default
is not NO_DEFAULT
:
1940 name
= '%s[@%s]' % (xpath
, key
) if name
is None else name
1941 raise ExtractorError('Could not find XML attribute %s' % name
)
1944 return n
.attrib
[key
]
def get_element_by_id(id, html):
    """Return the inner content of the element whose id attribute equals
    `id` in the given HTML document, or None when no such element exists."""
    content = get_element_by_attribute('id', id, html)
    return content
def get_element_by_class(class_name, html):
    """Return the content of the first tag carrying the given class in the
    supplied HTML document, or None when no tag matches."""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag whose `attribute` equals `value`
    in the given HTML document, or None when nothing matches.

    `escape_value` is forwarded to get_elements_by_attribute and controls
    whether `value` is regex-escaped before matching.
    """
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if not matches:
        return None
    return matches[0]
def get_elements_by_class(class_name, html):
    """Return the contents of every tag whose class attribute contains the
    given class name, as a list."""
    # Match the class name as a whole word anywhere inside the (possibly
    # multi-valued) class attribute; escape_value=False keeps it a regex.
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_pattern, html, escape_value=False)
1970 def get_elements_by_attribute(attribute, value, html, escape_value=True):
1971 """Return the content of the tag with the specified attribute in the passed HTML document"""
1973 value = re.escape(value) if escape_value else value
1976 for m in re.finditer(r'''(?xs)
1978 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^
']*'|
))*?
1980 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^
"]*"|
='[^']*'|))*?
1984 ''' % (re.escape(attribute), value), html):
1985 res = m.group('content
')
1987 if res.startswith('"') or res.startswith("'"):
1990 retlist.append(unescapeHTML(res))
1995 class HTMLAttributeParser(compat_HTMLParser):
1996 """Trivial HTML parser to gather the attributes for a single element"""
2000 compat_HTMLParser.__init__(self)
2002 def handle_starttag(self, tag, attrs):
2003 self.attrs = dict(attrs)
2006 def extract_attributes(html_element):
2007 """Given a string for an HTML element such as
2009 a="foo" B="bar" c="&98;az" d=boz
2010 empty= noval entity="&"
2013 Decode and return a dictionary of attributes.
2015 'a
': 'foo
', 'b
': 'bar
', c: 'baz
', d: 'boz
',
2016 'empty
': '', 'noval
': None, 'entity
': '&',
2017 'sq
': '"', 'dq': '\''
2019 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
2020 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
2022 parser = HTMLAttributeParser()
2024 parser.feed(html_element)
2026 # Older Python may throw HTMLParseError in case of malformed HTML
2027 except compat_HTMLParseError:
2032 def clean_html(html):
2033 """Clean an HTML snippet into a readable string"""
2035 if html is None: # Convenience for sanitizing descriptions etc.
2039 html = html.replace('\n', ' ')
2040 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
2041 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
2043 html = re.sub('<.*?>', '', html)
2044 # Replace html entities
2045 html = unescapeHTML(html)
2049 def sanitize_open(filename, open_mode):
2050 """Try to open the given filename, and slightly tweak it if this fails.
2052 Attempts to open the given filename. If this fails, it tries to change
2053 the filename slightly, step by step, until it's either able to open it
2054 or it fails and raises a final exception, like the standard open()
2057 It returns the tuple (stream, definitive_file_name).
2061 if sys.platform == 'win32':
2063 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
2064 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
2065 stream = open(encodeFilename(filename), open_mode)
2066 return (stream, filename)
2067 except (IOError, OSError) as err:
2068 if err.errno in (errno.EACCES,):
2071 # In case of error, try to remove win32 forbidden chars
2072 alt_filename = sanitize_path(filename)
2073 if alt_filename == filename:
2076 # An exception here should be caught in the caller
2077 stream = open(encodeFilename(alt_filename), open_mode)
2078 return (stream, alt_filename)
2081 def timeconvert(timestr):
2082 """Convert RFC 2822 defined time string into system timestamp"""
2084 timetuple = email.utils.parsedate_tz(timestr)
2085 if timetuple is not None:
2086 timestamp = email.utils.mktime_tz(timetuple)
2090 def sanitize_filename(s, restricted=False, is_id=False):
2091 """Sanitizes a string so it could be used as part of a filename.
2092 If restricted is set, use a stricter subset of allowed characters.
2093 Set is_id if this is not an arbitrary string, but an ID that should be kept
2096 def replace_insane(char):
2097 if restricted and char in ACCENT_CHARS:
2098 return ACCENT_CHARS[char]
2099 if char == '?' or ord(char) < 32 or ord(char) == 127:
2102 return '' if restricted else '\''
2104 return '_
-' if restricted else ' -'
2105 elif char in '\\/|
*<>':
2107 if restricted and (char in '!&\'()[]{}$
;`^
,#' or char.isspace()):
2109 if restricted
and ord(char
) > 127:
2116 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
2117 result
= ''.join(map(replace_insane
, s
))
2119 while '__' in result
:
2120 result
= result
.replace('__', '_')
2121 result
= result
.strip('_')
2122 # Common case of "Foreign band name - English song title"
2123 if restricted
and result
.startswith('-_'):
2125 if result
.startswith('-'):
2126 result
= '_' + result
[len('-'):]
2127 result
= result
.lstrip('.')
2133 def sanitize_path(s
, force
=False):
2134 """Sanitizes and normalizes path on Windows"""
2135 if sys
.platform
== 'win32':
2137 drive_or_unc
, _
= os
.path
.splitdrive(s
)
2138 if sys
.version_info
< (2, 7) and not drive_or_unc
:
2139 drive_or_unc
, _
= os
.path
.splitunc(s
)
2145 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
2149 path_part
if path_part
in ['.', '..'] else re
.sub(r
'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part
)
2150 for path_part
in norm_path
]
2152 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
2153 elif force
and s
[0] == os
.path
.sep
:
2154 sanitized_path
.insert(0, os
.path
.sep
)
2155 return os
.path
.join(*sanitized_path
)
2158 def sanitize_url(url
):
2159 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
2160 # the number of unwanted failures due to missing protocol
2161 if url
.startswith('//'):
2162 return 'http:%s' % url
2163 # Fix some common typos seen so far
2165 # https://github.com/ytdl-org/youtube-dl/issues/15649
2166 (r
'^httpss://', r
'https://'),
2167 # https://bx1.be/lives/direct-tv/
2168 (r
'^rmtp([es]?)://', r
'rtmp\1://'),
2170 for mistake
, fixup
in COMMON_TYPOS
:
2171 if re
.match(mistake
, url
):
2172 return re
.sub(mistake
, fixup
, url
)
2176 def extract_basic_auth(url
):
2177 parts
= compat_urlparse
.urlsplit(url
)
2178 if parts
.username
is None:
2180 url
= compat_urlparse
.urlunsplit(parts
._replace
(netloc
=(
2181 parts
.hostname
if parts
.port
is None
2182 else '%s:%d' % (parts
.hostname
, parts
.port
))))
2183 auth_payload
= base64
.b64encode(
2184 ('%s:%s' % (parts
.username
, parts
.password
or '')).encode('utf-8'))
2185 return url
, 'Basic ' + auth_payload
.decode('utf-8')
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request after sanitizing and escaping the URL, moving
    any userinfo (user:password@host) into a Basic Authorization header."""
    clean_url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        # Request(url, data, headers, ...): headers is the second positional
        # argument, so prefer args[1] when the caller supplied it that way.
        if len(args) >= 2:
            headers = args[1]
        else:
            headers = kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(clean_url, *args, **kwargs)
2197 """Expand shell variables and ~"""
2198 return os
.path
.expandvars(compat_expanduser(s
))
2201 def orderedSet(iterable
):
2202 """ Remove all duplicates from the input iterable """
2210 def _htmlentity_transform(entity_with_semicolon
):
2211 """Transforms an HTML entity to a character."""
2212 entity
= entity_with_semicolon
[:-1]
2214 # Known non-numeric HTML entity
2215 if entity
in compat_html_entities
.name2codepoint
:
2216 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
2218 # TODO: HTML5 allows entities without a semicolon. For example,
2219 # 'Éric' should be decoded as 'Éric'.
2220 if entity_with_semicolon
in compat_html_entities_html5
:
2221 return compat_html_entities_html5
[entity_with_semicolon
]
2223 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
2224 if mobj
is not None:
2225 numstr
= mobj
.group(1)
2226 if numstr
.startswith('x'):
2228 numstr
= '0%s' % numstr
2231 # See https://github.com/ytdl-org/youtube-dl/issues/7518
2233 return compat_chr(int(numstr
, base
))
2237 # Unknown entity in name, return its literal representation
2238 return '&%s;' % entity
2241 def unescapeHTML(s
):
2244 assert type(s
) == compat_str
2247 r
'&([^&;]+;)', lambda m
: _htmlentity_transform(m
.group(1)), s
)
2250 def escapeHTML(text
):
2253 .replace('&', '&')
2254 .replace('<', '<')
2255 .replace('>', '>')
2256 .replace('"', '"')
2257 .replace("'", ''')
2261 def process_communicate_or_kill(p
, *args
, **kwargs
):
2263 return p
.communicate(*args
, **kwargs
)
2264 except BaseException
: # Including KeyboardInterrupt
2270 def get_subprocess_encoding():
2271 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2272 # For subprocess calls, encode with locale encoding
2273 # Refer to http://stackoverflow.com/a/9951851/35070
2274 encoding
= preferredencoding()
2276 encoding
= sys
.getfilesystemencoding()
2277 if encoding
is None:
2282 def encodeFilename(s
, for_subprocess
=False):
2284 @param s The name of the file
2287 assert type(s
) == compat_str
2289 # Python 3 has a Unicode API
2290 if sys
.version_info
>= (3, 0):
2293 # Pass '' directly to use Unicode APIs on Windows 2000 and up
2294 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
2295 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
2296 if not for_subprocess
and sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2299 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
2300 if sys
.platform
.startswith('java'):
2303 return s
.encode(get_subprocess_encoding(), 'ignore')
2306 def decodeFilename(b
, for_subprocess
=False):
2308 if sys
.version_info
>= (3, 0):
2311 if not isinstance(b
, bytes):
2314 return b
.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument the same way a filename is encoded
    (encodeFilename with for_subprocess=True).

    Byte-string input from legacy callers is first decoded as ASCII.
    """
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        text = s.decode('ascii')
    return encodeFilename(text, True)
def decodeArgument(b):
    """Decode a subprocess argument via decodeFilename(for_subprocess=True)."""
    return decodeFilename(b, True)
2330 def decodeOption(optval
):
2333 if isinstance(optval
, bytes):
2334 optval
= optval
.decode(preferredencoding())
2336 assert isinstance(optval
, compat_str
)
2340 def formatSeconds(secs
, delim
=':', msec
=False):
2342 ret
= '%d%s%02d%s%02d' % (secs
// 3600, delim
, (secs
% 3600) // 60, delim
, secs
% 60)
2344 ret
= '%d%s%02d' % (secs
// 60, delim
, secs
% 60)
2347 return '%s.%03d' % (ret
, secs
% 1) if msec
else ret
2350 def make_HTTPS_handler(params
, **kwargs
):
2351 opts_no_check_certificate
= params
.get('nocheckcertificate', False)
2352 if hasattr(ssl
, 'create_default_context'): # Python >= 3.4 or 2.7.9
2353 context
= ssl
.create_default_context(ssl
.Purpose
.SERVER_AUTH
)
2354 if opts_no_check_certificate
:
2355 context
.check_hostname
= False
2356 context
.verify_mode
= ssl
.CERT_NONE
2358 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2361 # (create_default_context present but HTTPSHandler has no context=)
2364 if sys
.version_info
< (3, 2):
2365 return YoutubeDLHTTPSHandler(params
, **kwargs
)
2366 else: # Python < 3.4
2367 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLSv1
)
2368 context
.verify_mode
= (ssl
.CERT_NONE
2369 if opts_no_check_certificate
2370 else ssl
.CERT_REQUIRED
)
2371 context
.set_default_verify_paths()
2372 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2375 def bug_reports_message(before
=';'):
2376 if ytdl_is_updateable():
2377 update_cmd
= 'type yt-dlp -U to update'
2379 update_cmd
= 'see https://github.com/yt-dlp/yt-dlp on how to update'
2380 msg
= 'please report this issue on https://github.com/yt-dlp/yt-dlp .'
2381 msg
+= ' Make sure you are using the latest version; %s.' % update_cmd
2382 msg
+= ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'
2384 before
= before
.rstrip()
2385 if not before
or before
.endswith(('.', '!', '?')):
2386 msg
= msg
[0].title() + msg
[1:]
2388 return (before
+ ' ' if before
else '') + msg
# Root of the library's exception hierarchy: the error classes defined below
# derive from this, so callers can catch YoutubeDLError to handle any of them.
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
# Exception classes that indicate a network-level failure, collected into a
# tuple so they can be used directly in an `except network_exceptions:` clause.
network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
# guarded because ssl.CertificateError is not present on every supported build
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)
2402 class ExtractorError(YoutubeDLError
):
2403 """Error during info extraction."""
2405 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None, ie
=None):
2406 """ tb, if given, is the original traceback (so that it can be printed out).
2407 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
2409 if sys
.exc_info()[0] in network_exceptions
:
2414 self
.expected
= expected
2416 self
.video_id
= video_id
2418 self
.exc_info
= sys
.exc_info() # preserve original exception
2420 super(ExtractorError
, self
).__init
__(''.join((
2421 format_field(ie
, template
='[%s] '),
2422 format_field(video_id
, template
='%s: '),
2424 format_field(cause
, template
=' (caused by %r)'),
2425 '' if expected
else bug_reports_message())))
2427 def format_traceback(self
):
2428 if self
.traceback
is None:
2430 return ''.join(traceback
.format_tb(self
.traceback
))
2433 class UnsupportedError(ExtractorError
):
2434 def __init__(self
, url
):
2435 super(UnsupportedError
, self
).__init
__(
2436 'Unsupported URL: %s' % url
, expected
=True)
2440 class RegexNotFoundError(ExtractorError
):
2441 """Error when a regex didn't match"""
2445 class GeoRestrictedError(ExtractorError
):
2446 """Geographic restriction Error exception.
2448 This exception may be thrown when a video is not available from your
2449 geographic location due to geographic restrictions imposed by a website.
2452 def __init__(self
, msg
, countries
=None):
2453 super(GeoRestrictedError
, self
).__init
__(msg
, expected
=True)
2455 self
.countries
= countries
2458 class DownloadError(YoutubeDLError
):
2459 """Download Error exception.
2461 This exception may be thrown by FileDownloader objects if they are not
2462 configured to continue on errors. They will contain the appropriate
2466 def __init__(self
, msg
, exc_info
=None):
2467 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
2468 super(DownloadError
, self
).__init
__(msg
)
2469 self
.exc_info
= exc_info
2472 class EntryNotInPlaylist(YoutubeDLError
):
2473 """Entry not in playlist exception.
2475 This exception will be thrown by YoutubeDL when a requested entry
2476 is not found in the playlist info_dict
2481 class SameFileError(YoutubeDLError
):
2482 """Same File exception.
2484 This exception will be thrown by FileDownloader objects if they detect
2485 multiple files would have to be downloaded to the same file on disk.
2490 class PostProcessingError(YoutubeDLError
):
2491 """Post Processing exception.
2493 This exception may be raised by PostProcessor's .run() method to
2494 indicate an error in the postprocessing task.
2497 def __init__(self
, msg
):
2498 super(PostProcessingError
, self
).__init
__(msg
)
2502 class ExistingVideoReached(YoutubeDLError
):
2503 """ --max-downloads limit has been reached. """
2507 class RejectedVideoReached(YoutubeDLError
):
2508 """ --max-downloads limit has been reached. """
2512 class ThrottledDownload(YoutubeDLError
):
2513 """ Download speed below --throttled-rate. """
2517 class MaxDownloadsReached(YoutubeDLError
):
2518 """ --max-downloads limit has been reached. """
2522 class UnavailableVideoError(YoutubeDLError
):
2523 """Unavailable Format exception.
2525 This exception will be thrown when a video is requested
2526 in a format that is not available for that video.
2531 class ContentTooShortError(YoutubeDLError
):
2532 """Content Too Short exception.
2534 This exception may be raised by FileDownloader objects when a file they
2535 download is too small for what the server announced first, indicating
2536 the connection was probably interrupted.
2539 def __init__(self
, downloaded
, expected
):
2540 super(ContentTooShortError
, self
).__init
__(
2541 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded
, expected
)
2544 self
.downloaded
= downloaded
2545 self
.expected
= expected
2548 class XAttrMetadataError(YoutubeDLError
):
2549 def __init__(self
, code
=None, msg
='Unknown error'):
2550 super(XAttrMetadataError
, self
).__init
__(msg
)
2554 # Parsing code and msg
2555 if (self
.code
in (errno
.ENOSPC
, errno
.EDQUOT
)
2556 or 'No space left' in self
.msg
or 'Disk quota exceeded' in self
.msg
):
2557 self
.reason
= 'NO_SPACE'
2558 elif self
.code
== errno
.E2BIG
or 'Argument list too long' in self
.msg
:
2559 self
.reason
= 'VALUE_TOO_LONG'
2561 self
.reason
= 'NOT_SUPPORTED'
2564 class XAttrUnavailableError(YoutubeDLError
):
2568 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
2569 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
2570 # expected HTTP responses to meet HTTP/1.0 or later (see also
2571 # https://github.com/ytdl-org/youtube-dl/issues/6727)
2572 if sys
.version_info
< (3, 0):
2573 kwargs
['strict'] = True
2574 hc
= http_class(*args
, **compat_kwargs(kwargs
))
2575 source_address
= ydl_handler
._params
.get('source_address')
2577 if source_address
is not None:
2578 # This is to workaround _create_connection() from socket where it will try all
2579 # address data from getaddrinfo() including IPv6. This filters the result from
2580 # getaddrinfo() based on the source_address value.
2581 # This is based on the cpython socket.create_connection() function.
2582 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
2583 def _create_connection(address
, timeout
=socket
._GLOBAL
_DEFAULT
_TIMEOUT
, source_address
=None):
2584 host
, port
= address
2586 addrs
= socket
.getaddrinfo(host
, port
, 0, socket
.SOCK_STREAM
)
2587 af
= socket
.AF_INET
if '.' in source_address
[0] else socket
.AF_INET6
2588 ip_addrs
= [addr
for addr
in addrs
if addr
[0] == af
]
2589 if addrs
and not ip_addrs
:
2590 ip_version
= 'v4' if af
== socket
.AF_INET
else 'v6'
2592 "No remote IP%s addresses available for connect, can't use '%s' as source address"
2593 % (ip_version
, source_address
[0]))
2594 for res
in ip_addrs
:
2595 af
, socktype
, proto
, canonname
, sa
= res
2598 sock
= socket
.socket(af
, socktype
, proto
)
2599 if timeout
is not socket
._GLOBAL
_DEFAULT
_TIMEOUT
:
2600 sock
.settimeout(timeout
)
2601 sock
.bind(source_address
)
2603 err
= None # Explicitly break reference cycle
2605 except socket
.error
as _
:
2607 if sock
is not None:
2612 raise socket
.error('getaddrinfo returns an empty list')
2613 if hasattr(hc
, '_create_connection'):
2614 hc
._create
_connection
= _create_connection
2615 sa
= (source_address
, 0)
2616 if hasattr(hc
, 'source_address'): # Python 2.7+
2617 hc
.source_address
= sa
2619 def _hc_connect(self
, *args
, **kwargs
):
2620 sock
= _create_connection(
2621 (self
.host
, self
.port
), self
.timeout
, sa
)
2623 self
.sock
= ssl
.wrap_socket(
2624 sock
, self
.key_file
, self
.cert_file
,
2625 ssl_version
=ssl
.PROTOCOL_TLSv1
)
2628 hc
.connect
= functools
.partial(_hc_connect
, hc
)
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker from a header dict.

    When the marker is present, return a new dict without the marker and
    without any Accept-Encoding header (matched case-insensitively).
    When it is absent, the original mapping is returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = {k: v for k, v in headers.items() if k.lower() != 'accept-encoding'}
    # The marker key itself survives the Accept-Encoding filter; drop it now.
    del filtered['Youtubedl-no-compression']
    return filtered
2643 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
2644 """Handler for HTTP requests and responses.
2646 This class, when installed with an OpenerDirector, automatically adds
2647 the standard headers to every HTTP request and handles gzipped and
2648 deflated responses from web servers. If compression is to be avoided in
2649 a particular request, the original request in the program code only has
2650 to include the HTTP header "Youtubedl-no-compression", which will be
2651 removed before making the real request.
2653 Part of this code was copied from:
2655 http://techknack.net/python-urllib2-handlers/
2657 Andrew Rowls, the author of that code, agreed to release it to the
2661 def __init__(self
, params
, *args
, **kwargs
):
2662 compat_urllib_request
.HTTPHandler
.__init
__(self
, *args
, **kwargs
)
2663 self
._params
= params
2665 def http_open(self
, req
):
2666 conn_class
= compat_http_client
.HTTPConnection
2668 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2670 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2671 del req
.headers
['Ytdl-socks-proxy']
2673 return self
.do_open(functools
.partial(
2674 _create_http_connection
, self
, conn_class
, False),
2682 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
2684 return zlib
.decompress(data
)
2686 def http_request(self
, req
):
2687 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
2688 # always respected by websites, some tend to give out URLs with non percent-encoded
2689 # non-ASCII characters (see telemb.py, ard.py [#3412])
2690 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
2691 # To work around aforementioned issue we will replace request's original URL with
2692 # percent-encoded one
2693 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
2694 # the code of this workaround has been moved here from YoutubeDL.urlopen()
2695 url
= req
.get_full_url()
2696 url_escaped
= escape_url(url
)
2698 # Substitute URL if any change after escaping
2699 if url
!= url_escaped
:
2700 req
= update_Request(req
, url
=url_escaped
)
2702 for h
, v
in std_headers
.items():
2703 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
2704 # The dict keys are capitalized because of this bug by urllib
2705 if h
.capitalize() not in req
.headers
:
2706 req
.add_header(h
, v
)
2708 req
.headers
= handle_youtubedl_headers(req
.headers
)
2710 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
2711 # Python 2.6 is brain-dead when it comes to fragments
2712 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
2713 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
2717 def http_response(self
, req
, resp
):
2720 if resp
.headers
.get('Content-encoding', '') == 'gzip':
2721 content
= resp
.read()
2722 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
2724 uncompressed
= io
.BytesIO(gz
.read())
2725 except IOError as original_ioerror
:
2726 # There may be junk add the end of the file
2727 # See http://stackoverflow.com/q/4928560/35070 for details
2728 for i
in range(1, 1024):
2730 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
2731 uncompressed
= io
.BytesIO(gz
.read())
2736 raise original_ioerror
2737 resp
= compat_urllib_request
.addinfourl(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2738 resp
.msg
= old_resp
.msg
2739 del resp
.headers
['Content-encoding']
2741 if resp
.headers
.get('Content-encoding', '') == 'deflate':
2742 gz
= io
.BytesIO(self
.deflate(resp
.read()))
2743 resp
= compat_urllib_request
.addinfourl(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2744 resp
.msg
= old_resp
.msg
2745 del resp
.headers
['Content-encoding']
2746 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
2747 # https://github.com/ytdl-org/youtube-dl/issues/6457).
2748 if 300 <= resp
.code
< 400:
2749 location
= resp
.headers
.get('Location')
2751 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
2752 if sys
.version_info
>= (3, 0):
2753 location
= location
.encode('iso-8859-1').decode('utf-8')
2755 location
= location
.decode('utf-8')
2756 location_escaped
= escape_url(location
)
2757 if location
!= location_escaped
:
2758 del resp
.headers
['Location']
2759 if sys
.version_info
< (3, 0):
2760 location_escaped
= location_escaped
.encode('utf-8')
2761 resp
.headers
['Location'] = location_escaped
2764 https_request
= http_request
2765 https_response
= http_response
2768 def make_socks_conn_class(base_class
, socks_proxy
):
2769 assert issubclass(base_class
, (
2770 compat_http_client
.HTTPConnection
, compat_http_client
.HTTPSConnection
))
2772 url_components
= compat_urlparse
.urlparse(socks_proxy
)
2773 if url_components
.scheme
.lower() == 'socks5':
2774 socks_type
= ProxyType
.SOCKS5
2775 elif url_components
.scheme
.lower() in ('socks', 'socks4'):
2776 socks_type
= ProxyType
.SOCKS4
2777 elif url_components
.scheme
.lower() == 'socks4a':
2778 socks_type
= ProxyType
.SOCKS4A
2780 def unquote_if_non_empty(s
):
2783 return compat_urllib_parse_unquote_plus(s
)
2787 url_components
.hostname
, url_components
.port
or 1080,
2789 unquote_if_non_empty(url_components
.username
),
2790 unquote_if_non_empty(url_components
.password
),
2793 class SocksConnection(base_class
):
2795 self
.sock
= sockssocket()
2796 self
.sock
.setproxy(*proxy_args
)
2797 if type(self
.timeout
) in (int, float):
2798 self
.sock
.settimeout(self
.timeout
)
2799 self
.sock
.connect((self
.host
, self
.port
))
2801 if isinstance(self
, compat_http_client
.HTTPSConnection
):
2802 if hasattr(self
, '_context'): # Python > 2.6
2803 self
.sock
= self
._context
.wrap_socket(
2804 self
.sock
, server_hostname
=self
.host
)
2806 self
.sock
= ssl
.wrap_socket(self
.sock
)
2808 return SocksConnection
2811 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
2812 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
2813 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
2814 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
2815 self
._params
= params
2817 def https_open(self
, req
):
2819 conn_class
= self
._https
_conn
_class
2821 if hasattr(self
, '_context'): # python > 2.6
2822 kwargs
['context'] = self
._context
2823 if hasattr(self
, '_check_hostname'): # python 3.x
2824 kwargs
['check_hostname'] = self
._check
_hostname
2826 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2828 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2829 del req
.headers
['Ytdl-socks-proxy']
2831 return self
.do_open(functools
.partial(
2832 _create_http_connection
, self
, conn_class
, True),
2836 class YoutubeDLCookieJar(compat_cookiejar
.MozillaCookieJar
):
2838 See [1] for cookie file format.
2840 1. https://curl.haxx.se/docs/http-cookies.html
2842 _HTTPONLY_PREFIX
= '#HttpOnly_'
2844 _HEADER
= '''# Netscape HTTP Cookie File
2845 # This file is generated by yt-dlp. Do not edit.
2848 _CookieFileEntry
= collections
.namedtuple(
2850 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
2852 def save(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2854 Save cookies to a file.
2856 Most of the code is taken from CPython 3.8 and slightly adapted
2857 to support cookie files with UTF-8 in both python 2 and 3.
2859 if filename
is None:
2860 if self
.filename
is not None:
2861 filename
= self
.filename
2863 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2865 # Store session cookies with `expires` set to 0 instead of an empty
2868 if cookie
.expires
is None:
2871 with io
.open(filename
, 'w', encoding
='utf-8') as f
:
2872 f
.write(self
._HEADER
)
2875 if not ignore_discard
and cookie
.discard
:
2877 if not ignore_expires
and cookie
.is_expired(now
):
2883 if cookie
.domain
.startswith('.'):
2884 initial_dot
= 'TRUE'
2886 initial_dot
= 'FALSE'
2887 if cookie
.expires
is not None:
2888 expires
= compat_str(cookie
.expires
)
2891 if cookie
.value
is None:
2892 # cookies.txt regards 'Set-Cookie: foo' as a cookie
2893 # with no name, whereas http.cookiejar regards it as a
2894 # cookie with no value.
2899 value
= cookie
.value
2901 '\t'.join([cookie
.domain
, initial_dot
, cookie
.path
,
2902 secure
, expires
, name
, value
]) + '\n')
2904 def load(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2905 """Load cookies from a file."""
2906 if filename
is None:
2907 if self
.filename
is not None:
2908 filename
= self
.filename
2910 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2912 def prepare_line(line
):
2913 if line
.startswith(self
._HTTPONLY
_PREFIX
):
2914 line
= line
[len(self
._HTTPONLY
_PREFIX
):]
2915 # comments and empty lines are fine
2916 if line
.startswith('#') or not line
.strip():
2918 cookie_list
= line
.split('\t')
2919 if len(cookie_list
) != self
._ENTRY
_LEN
:
2920 raise compat_cookiejar
.LoadError('invalid length %d' % len(cookie_list
))
2921 cookie
= self
._CookieFileEntry
(*cookie_list
)
2922 if cookie
.expires_at
and not cookie
.expires_at
.isdigit():
2923 raise compat_cookiejar
.LoadError('invalid expires at %s' % cookie
.expires_at
)
2927 with io
.open(filename
, encoding
='utf-8') as f
:
2930 cf
.write(prepare_line(line
))
2931 except compat_cookiejar
.LoadError
as e
:
2933 'WARNING: skipping cookie file entry due to %s: %r\n'
2934 % (e
, line
), sys
.stderr
)
2937 self
._really
_load
(cf
, filename
, ignore_discard
, ignore_expires
)
2938 # Session cookies are denoted by either `expires` field set to
2939 # an empty string or 0. MozillaCookieJar only recognizes the former
2940 # (see [1]). So we need force the latter to be recognized as session
2941 # cookies on our own.
2942 # Session cookies may be important for cookies-based authentication,
2943 # e.g. usually, when user does not check 'Remember me' check box while
2944 # logging in on a site, some important cookies are stored as session
2945 # cookies so that not recognizing them will result in failed login.
2946 # 1. https://bugs.python.org/issue17164
2948 # Treat `expires=0` cookies as session cookies
2949 if cookie
.expires
== 0:
2950 cookie
.expires
= None
2951 cookie
.discard
= True
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that applies the same cookie handling to HTTPS
    requests/responses as to plain HTTP ones."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 used to choke on the next HTTP request in a row when the
        # Set-Cookie header of the previous response contained non-ASCII
        # characters (see https://github.com/ytdl-org/youtube-dl/issues/6769).
        # The percent-encoding workaround that used to live here is kept
        # disabled; cookie processing is delegated to the parent class as-is.
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
2978 class YoutubeDLRedirectHandler(compat_urllib_request
.HTTPRedirectHandler
):
2979 """YoutubeDL redirect handler
2981 The code is based on HTTPRedirectHandler implementation from CPython [1].
2983 This redirect handler solves two issues:
2984 - ensures redirect URL is always unicode under python 2
2985 - introduces support for experimental HTTP response status code
2986 308 Permanent Redirect [2] used by some sites [3]
2988 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
2989 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
2990 3. https://github.com/ytdl-org/youtube-dl/issues/28768
2993 http_error_301
= http_error_303
= http_error_307
= http_error_308
= compat_urllib_request
.HTTPRedirectHandler
.http_error_302
2995 def redirect_request(self
, req
, fp
, code
, msg
, headers
, newurl
):
2996 """Return a Request or None in response to a redirect.
2998 This is called by the http_error_30x methods when a
2999 redirection response is received. If a redirection should
3000 take place, return a new Request to allow http_error_30x to
3001 perform the redirect. Otherwise, raise HTTPError if no-one
3002 else should try to handle this url. Return None if you can't
3003 but another Handler might.
3005 m
= req
.get_method()
3006 if (not (code
in (301, 302, 303, 307, 308) and m
in ("GET", "HEAD")
3007 or code
in (301, 302, 303) and m
== "POST")):
3008 raise compat_HTTPError(req
.full_url
, code
, msg
, headers
, fp
)
3009 # Strictly (according to RFC 2616), 301 or 302 in response to
3010 # a POST MUST NOT cause a redirection without confirmation
3011 # from the user (of urllib.request, in this case). In practice,
3012 # essentially all clients do redirect in this case, so we do
3015 # On python 2 urlh.geturl() may sometimes return redirect URL
3016 # as byte string instead of unicode. This workaround allows
3017 # to force it always return unicode.
3018 if sys
.version_info
[0] < 3:
3019 newurl
= compat_str(newurl
)
3021 # Be conciliant with URIs containing a space. This is mainly
3022 # redundant with the more complete encoding done in http_error_302(),
3023 # but it is kept for compatibility with other callers.
3024 newurl
= newurl
.replace(' ', '%20')
3026 CONTENT_HEADERS
= ("content-length", "content-type")
3027 # NB: don't use dict comprehension for python 2.6 compatibility
3028 newheaders
= dict((k
, v
) for k
, v
in req
.headers
.items()
3029 if k
.lower() not in CONTENT_HEADERS
)
3030 return compat_urllib_request
.Request(
3031 newurl
, headers
=newheaders
, origin_req_host
=req
.origin_req_host
,
3035 def extract_timezone(date_str
):
3038 ^.{8,}? # >=8 char non-TZ prefix, if present
3039 (?P<tz>Z| # just the UTC Z, or
3040 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
3041 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
3042 [ ]? # optional space
3043 (?P<sign>\+|-) # +/-
3044 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
3048 timezone
= datetime
.timedelta()
3050 date_str
= date_str
[:-len(m
.group('tz'))]
3051 if not m
.group('sign'):
3052 timezone
= datetime
.timedelta()
3054 sign
= 1 if m
.group('sign') == '+' else -1
3055 timezone
= datetime
.timedelta(
3056 hours
=sign
* int(m
.group('hours')),
3057 minutes
=sign
* int(m
.group('minutes')))
3058 return timezone
, date_str
3061 def parse_iso8601(date_str
, delimiter
='T', timezone
=None):
3062 """ Return a UNIX timestamp from the given date """
3064 if date_str
is None:
3067 date_str
= re
.sub(r
'\.[0-9]+', '', date_str
)
3069 if timezone
is None:
3070 timezone
, date_str
= extract_timezone(date_str
)
3073 date_format
= '%Y-%m-%d{0}%H:%M:%S'.format(delimiter
)
3074 dt
= datetime
.datetime
.strptime(date_str
, date_format
) - timezone
3075 return calendar
.timegm(dt
.timetuple())
def date_formats(day_first=True):
    """Return the table of date format strings, preferring day-first or
    month-first interpretation according to *day_first*."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
3084 def unified_strdate(date_str
, day_first
=True):
3085 """Return a string with the date in the format YYYYMMDD"""
3087 if date_str
is None:
3091 date_str
= date_str
.replace(',', ' ')
3092 # Remove AM/PM + timezone
3093 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
3094 _
, date_str
= extract_timezone(date_str
)
3096 for expression
in date_formats(day_first
):
3098 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
3101 if upload_date
is None:
3102 timetuple
= email
.utils
.parsedate_tz(date_str
)
3105 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
3108 if upload_date
is not None:
3109 return compat_str(upload_date
)
3112 def unified_timestamp(date_str
, day_first
=True):
3113 if date_str
is None:
3116 date_str
= re
.sub(r
'[,|]', '', date_str
)
3118 pm_delta
= 12 if re
.search(r
'(?i)PM', date_str
) else 0
3119 timezone
, date_str
= extract_timezone(date_str
)
3121 # Remove AM/PM + timezone
3122 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
3124 # Remove unrecognized timezones from ISO 8601 alike timestamps
3125 m
= re
.search(r
'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str
)
3127 date_str
= date_str
[:-len(m
.group('tz'))]
3129 # Python only supports microseconds, so remove nanoseconds
3130 m
= re
.search(r
'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str
)
3132 date_str
= m
.group(1)
3134 for expression
in date_formats(day_first
):
3136 dt
= datetime
.datetime
.strptime(date_str
, expression
) - timezone
+ datetime
.timedelta(hours
=pm_delta
)
3137 return calendar
.timegm(dt
.timetuple())
3140 timetuple
= email
.utils
.parsedate_tz(date_str
)
3142 return calendar
.timegm(timetuple
) + pm_delta
* 3600
3145 def determine_ext(url
, default_ext
='unknown_video'):
3146 if url
is None or '.' not in url
:
3148 guess
= url
.partition('?')[0].rpartition('.')[2]
3149 if re
.match(r
'^[A-Za-z0-9]+$', guess
):
3151 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
3152 elif guess
.rstrip('/') in KNOWN_EXTENSIONS
:
3153 return guess
.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Derive a subtitle filename from a media filename by replacing its
    extension with '<sub_lang>.<sub_format>'."""
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
3162 def datetime_from_str(date_str
, precision
='auto', format
='%Y%m%d'):
3164 Return a datetime object from a string in the format YYYYMMDD or
3165 (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
3167 format: string date format used to return datetime object from
3168 precision: round the time portion of a datetime object.
3169 auto|microsecond|second|minute|hour|day.
3170 auto: round to the unit provided in date_str (if applicable).
3172 auto_precision
= False
3173 if precision
== 'auto':
3174 auto_precision
= True
3175 precision
= 'microsecond'
3176 today
= datetime_round(datetime
.datetime
.now(), precision
)
3177 if date_str
in ('now', 'today'):
3179 if date_str
== 'yesterday':
3180 return today
- datetime
.timedelta(days
=1)
3182 r
'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
3184 if match
is not None:
3185 start_time
= datetime_from_str(match
.group('start'), precision
, format
)
3186 time
= int(match
.group('time')) * (-1 if match
.group('sign') == '-' else 1)
3187 unit
= match
.group('unit')
3188 if unit
== 'month' or unit
== 'year':
3189 new_date
= datetime_add_months(start_time
, time
* 12 if unit
== 'year' else time
)
3195 delta
= datetime
.timedelta(**{unit + 's': time}
)
3196 new_date
= start_time
+ delta
3198 return datetime_round(new_date
, unit
)
3201 return datetime_round(datetime
.datetime
.strptime(date_str
, format
), precision
)
def date_from_str(date_str, format='%Y%m%d'):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: string date format used to return datetime object from
    """
    # Parse at full (microsecond) precision, then drop the time-of-day part.
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by a number of months.

    The day-of-month is clamped to the last valid day of the target month,
    so e.g. Jan 31 + 1 month yields Feb 28/29.
    """
    # Work with a zero-based month total so year rollover is plain
    # floor division / modulo arithmetic (handles negative months too).
    total_months = dt.month + months - 1
    new_year = dt.year + total_months // 12
    new_month = total_months % 12 + 1
    # Clamp the day to the length of the target month.
    new_day = min(dt.day, calendar.monthrange(new_year, new_month)[1])
    return dt.replace(new_year, new_month, new_day)
3223 def datetime_round(dt
, precision
='day'):
3225 Round a datetime object's time to a specific precision
3227 if precision
== 'microsecond':
3236 roundto
= lambda x
, n
: ((x
+ n
/ 2) // n
) * n
3237 timestamp
= calendar
.timegm(dt
.timetuple())
3238 return datetime
.datetime
.utcfromtimestamp(roundto(timestamp
, unit_seconds
[precision
]))
3241 def hyphenate_date(date_str
):
3243 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
3244 match
= re
.match(r
'^(\d\d\d\d)(\d\d)(\d\d)$', date_str
)
3245 if match
is not None:
3246 return '-'.join(match
.groups())
3251 class DateRange(object):
3252 """Represents a time interval between two dates"""
3254 def __init__(self
, start
=None, end
=None):
3255 """start and end must be strings in the format accepted by date"""
3256 if start
is not None:
3257 self
.start
= date_from_str(start
)
3259 self
.start
= datetime
.datetime
.min.date()
3261 self
.end
= date_from_str(end
)
3263 self
.end
= datetime
.datetime
.max.date()
3264 if self
.start
> self
.end
:
3265 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
3269 """Returns a range that only contains the given day"""
3270 return cls(day
, day
)
3272 def __contains__(self
, date
):
3273 """Check if the date is in the range"""
3274 if not isinstance(date
, datetime
.date
):
3275 date
= date_from_str(date
)
3276 return self
.start
<= date
<= self
.end
3279 return '%s - %s' % (self
.start
.isoformat(), self
.end
.isoformat())
3282 def platform_name():
3283 """ Returns the platform name as a compat_str """
3284 res
= platform
.platform()
3285 if isinstance(res
, bytes):
3286 res
= res
.decode(preferredencoding())
3288 assert isinstance(res
, compat_str
)
3292 def _windows_write_string(s
, out
):
3293 """ Returns True if the string was written using special methods,
3294 False if it has yet to be written out."""
3295 # Adapted from http://stackoverflow.com/a/3259271/35070
3298 import ctypes
.wintypes
3306 fileno
= out
.fileno()
3307 except AttributeError:
3308 # If the output stream doesn't have a fileno, it's virtual
3310 except io
.UnsupportedOperation
:
3311 # Some strange Windows pseudo files?
3313 if fileno
not in WIN_OUTPUT_IDS
:
3316 GetStdHandle
= compat_ctypes_WINFUNCTYPE(
3317 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
3318 ('GetStdHandle', ctypes
.windll
.kernel32
))
3319 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
3321 WriteConsoleW
= compat_ctypes_WINFUNCTYPE(
3322 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
3323 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
3324 ctypes
.wintypes
.LPVOID
)(('WriteConsoleW', ctypes
.windll
.kernel32
))
3325 written
= ctypes
.wintypes
.DWORD(0)
3327 GetFileType
= compat_ctypes_WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)(('GetFileType', ctypes
.windll
.kernel32
))
3328 FILE_TYPE_CHAR
= 0x0002
3329 FILE_TYPE_REMOTE
= 0x8000
3330 GetConsoleMode
= compat_ctypes_WINFUNCTYPE(
3331 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
3332 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
3333 ('GetConsoleMode', ctypes
.windll
.kernel32
))
3334 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
3336 def not_a_console(handle
):
3337 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
3339 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
3340 or GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
3342 if not_a_console(h
):
3345 def next_nonbmp_pos(s
):
3347 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
3348 except StopIteration:
3352 count
= min(next_nonbmp_pos(s
), 1024)
3354 ret
= WriteConsoleW(
3355 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
3357 raise OSError('Failed to write string')
3358 if not count
: # We just wrote a non-BMP character
3359 assert written
.value
== 2
3362 assert written
.value
> 0
3363 s
= s
[written
.value
:]
3367 def write_string(s
, out
=None, encoding
=None):
3370 assert type(s
) == compat_str
3372 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
3373 if _windows_write_string(s
, out
):
3376 if ('b' in getattr(out
, 'mode', '')
3377 or sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
3378 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
3380 elif hasattr(out
, 'buffer'):
3381 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
3382 byt
= s
.encode(enc
, 'ignore')
3383 out
.buffer.write(byt
)
3389 def bytes_to_intlist(bs
):
3392 if isinstance(bs
[0], int): # Python 3
3395 return [ord(c
) for c
in bs
]
3398 def intlist_to_bytes(xs
):
3401 return compat_struct_pack('%dB' % len(xs
), *xs
)
3404 # Cross-platform file locking
3405 if sys
.platform
== 'win32':
3406 import ctypes
.wintypes
3409 class OVERLAPPED(ctypes
.Structure
):
3411 ('Internal', ctypes
.wintypes
.LPVOID
),
3412 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
3413 ('Offset', ctypes
.wintypes
.DWORD
),
3414 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
3415 ('hEvent', ctypes
.wintypes
.HANDLE
),
3418 kernel32
= ctypes
.windll
.kernel32
3419 LockFileEx
= kernel32
.LockFileEx
3420 LockFileEx
.argtypes
= [
3421 ctypes
.wintypes
.HANDLE
, # hFile
3422 ctypes
.wintypes
.DWORD
, # dwFlags
3423 ctypes
.wintypes
.DWORD
, # dwReserved
3424 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3425 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3426 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3428 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
3429 UnlockFileEx
= kernel32
.UnlockFileEx
3430 UnlockFileEx
.argtypes
= [
3431 ctypes
.wintypes
.HANDLE
, # hFile
3432 ctypes
.wintypes
.DWORD
, # dwReserved
3433 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3434 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3435 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3437 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
3438 whole_low
= 0xffffffff
3439 whole_high
= 0x7fffffff
3441 def _lock_file(f
, exclusive
):
3442 overlapped
= OVERLAPPED()
3443 overlapped
.Offset
= 0
3444 overlapped
.OffsetHigh
= 0
3445 overlapped
.hEvent
= 0
3446 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
3447 handle
= msvcrt
.get_osfhandle(f
.fileno())
3448 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
3449 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3450 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
3452 def _unlock_file(f
):
3453 assert f
._lock
_file
_overlapped
_p
3454 handle
= msvcrt
.get_osfhandle(f
.fileno())
3455 if not UnlockFileEx(handle
, 0,
3456 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3457 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
# Some platforms, such as Jython, lack the fcntl module
3464 def _lock_file(f
, exclusive
):
3465 fcntl
.flock(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
3467 def _unlock_file(f
):
3468 fcntl
.flock(f
, fcntl
.LOCK_UN
)
3470 UNSUPPORTED_MSG
= 'file locking is not supported on this platform'
3472 def _lock_file(f
, exclusive
):
3473 raise IOError(UNSUPPORTED_MSG
)
3475 def _unlock_file(f
):
3476 raise IOError(UNSUPPORTED_MSG
)
3479 class locked_file(object):
3480 def __init__(self
, filename
, mode
, encoding
=None):
3481 assert mode
in ['r', 'a', 'w']
3482 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
3485 def __enter__(self
):
3486 exclusive
= self
.mode
!= 'r'
3488 _lock_file(self
.f
, exclusive
)
3494 def __exit__(self
, etype
, value
, traceback
):
3496 _unlock_file(self
.f
)
    def write(self, *args):
        # Proxy straight through to the wrapped file object's write().
        return self.f.write(*args)
    def read(self, *args):
        # Proxy straight through to the wrapped file object's read().
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8' when the
    interpreter cannot determine one."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
3515 def shell_quote(args
):
3517 encoding
= get_filesystem_encoding()
3519 if isinstance(a
, bytes):
3520 # We may get a filename encoded with 'encodeFilename'
3521 a
= a
.decode(encoding
)
3522 quoted_args
.append(compat_shlex_quote(a
))
3523 return ' '.join(quoted_args
)
3526 def smuggle_url(url
, data
):
3527 """ Pass additional data in a URL for internal use. """
3529 url
, idata
= unsmuggle_url(url
, {})
3531 sdata
= compat_urllib_parse_urlencode(
3532 {'__youtubedl_smuggle': json.dumps(data)}
)
3533 return url
+ '#' + sdata
3536 def unsmuggle_url(smug_url
, default
=None):
3537 if '#__youtubedl_smuggle' not in smug_url
:
3538 return smug_url
, default
3539 url
, _
, sdata
= smug_url
.rpartition('#')
3540 jsond
= compat_parse_qs(sdata
)['__youtubedl_smuggle'][0]
3541 data
= json
.loads(jsond
)
3545 def format_bytes(bytes):
3548 if type(bytes) is str:
3549 bytes = float(bytes)
3553 exponent
= int(math
.log(bytes, 1024.0))
3554 suffix
= ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent
]
3555 converted
= float(bytes) / float(1024 ** exponent
)
3556 return '%.2f%s' % (converted
, suffix
)
3559 def lookup_unit_table(unit_table
, s
):
3560 units_re
= '|'.join(re
.escape(u
) for u
in unit_table
)
3562 r
'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re
, s
)
3565 num_str
= m
.group('num').replace(',', '.')
3566 mult
= unit_table
[m
.group('unit')]
3567 return int(float(num_str
) * mult
)
3570 def parse_filesize(s
):
3574 # The lower-case forms are of course incorrect and unofficial,
3575 # but we support those too
3592 'megabytes': 1000 ** 2,
3593 'mebibytes': 1024 ** 2,
3599 'gigabytes': 1000 ** 3,
3600 'gibibytes': 1024 ** 3,
3606 'terabytes': 1000 ** 4,
3607 'tebibytes': 1024 ** 4,
3613 'petabytes': 1000 ** 5,
3614 'pebibytes': 1024 ** 5,
3620 'exabytes': 1000 ** 6,
3621 'exbibytes': 1024 ** 6,
3627 'zettabytes': 1000 ** 7,
3628 'zebibytes': 1024 ** 7,
3634 'yottabytes': 1000 ** 8,
3635 'yobibytes': 1024 ** 8,
3638 return lookup_unit_table(_UNIT_TABLE
, s
)
3647 if re
.match(r
'^[\d,.]+$', s
):
3648 return str_to_int(s
)
3659 return lookup_unit_table(_UNIT_TABLE
, s
)
3662 def parse_resolution(s
):
3666 mobj
= re
.search(r
'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s
)
3669 'width': int(mobj
.group('w')),
3670 'height': int(mobj
.group('h')),
3673 mobj
= re
.search(r
'\b(\d+)[pPiI]\b', s
)
3675 return {'height': int(mobj.group(1))}
3677 mobj
= re
.search(r
'\b([48])[kK]\b', s
)
3679 return {'height': int(mobj.group(1)) * 540}
3684 def parse_bitrate(s
):
3685 if not isinstance(s
, compat_str
):
3687 mobj
= re
.search(r
'\b(\d+)\s*kbps', s
)
3689 return int(mobj
.group(1))
3692 def month_by_name(name
, lang
='en'):
3693 """ Return the number of a month by (locale-independently) English name """
3695 month_names
= MONTH_NAMES
.get(lang
, MONTH_NAMES
['en'])
3698 return month_names
.index(name
) + 1
3703 def month_by_abbreviation(abbrev
):
3704 """ Return the number of a month by (locale-independently) English
3708 return [s
[:3] for s
in ENGLISH_MONTH_NAMES
].index(abbrev
) + 1
3713 def fix_xml_ampersands(xml_str
):
3714 """Replace all the '&' by '&' in XML"""
3716 r
'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
3721 def setproctitle(title
):
3722 assert isinstance(title
, compat_str
)
3724 # ctypes in Jython is not complete
3725 # http://bugs.jython.org/issue2148
3726 if sys
.platform
.startswith('java'):
3730 libc
= ctypes
.cdll
.LoadLibrary('libc.so.6')
3734 # LoadLibrary in Windows Python 2.7.13 only expects
3735 # a bytestring, but since unicode_literals turns
3736 # every string into a unicode string, it fails.
3738 title_bytes
= title
.encode('utf-8')
3739 buf
= ctypes
.create_string_buffer(len(title_bytes
))
3740 buf
.value
= title_bytes
3742 libc
.prctl(15, buf
, 0, 0, 0)
3743 except AttributeError:
3744 return # Strange libc, just skip this
def remove_start(s, start):
    """Strip the prefix *start* from *s* if present; None passes through."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Strip the suffix *end* from *s* if present; None passes through.

    Guards against an empty *end*: ``s.endswith('')`` is always True and the
    original expression ``s[:-len(end)]`` then evaluates to ``s[:0]`` (''),
    silently discarding the whole string.
    """
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
3755 def remove_quotes(s
):
3756 if s
is None or len(s
) < 2:
3758 for quote
in ('"', "'", ):
3759 if s
[0] == quote
and s
[-1] == quote
:
def get_domain(url):
    """Extract the domain (scheme and leading 'www.' stripped) from a URL
    string, or return None when no dotted host is found."""
    m = re.match(
        r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if not m:
        return None
    return m.group('domain')
def url_basename(url):
    """Return the last path component of *url* (query/fragment excluded)."""
    parsed_path = compat_urlparse.urlparse(url).path
    return parsed_path.strip('/').split('/')[-1]
3775 return re
.match(r
'https?://[^?#&]+/', url
).group()
3778 def urljoin(base
, path
):
3779 if isinstance(path
, bytes):
3780 path
= path
.decode('utf-8')
3781 if not isinstance(path
, compat_str
) or not path
:
3783 if re
.match(r
'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path
):
3785 if isinstance(base
, bytes):
3786 base
= base
.decode('utf-8')
3787 if not isinstance(base
, compat_str
) or not re
.match(
3788 r
'^(?:https?:)?//', base
):
3790 return compat_urlparse
.urljoin(base
, path
)
3793 class HEADRequest(compat_urllib_request
.Request
):
3794 def get_method(self
):
3798 class PUTRequest(compat_urllib_request
.Request
):
3799 def get_method(self
):
3803 def int_or_none(v
, scale
=1, default
=None, get_attr
=None, invscale
=1):
3806 v
= getattr(v
, get_attr
, None)
3812 return int(v
) * invscale
// scale
3813 except (ValueError, TypeError):
def str_or_none(v, default=None):
    """Coerce *v* to a string, returning *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
3821 def str_to_int(int_str
):
3822 """ A more relaxed version of int_or_none """
3823 if isinstance(int_str
, compat_integer_types
):
3825 elif isinstance(int_str
, compat_str
):
3826 int_str
= re
.sub(r
'[,\.\+]', '', int_str
)
3827 return int_or_none(int_str
)
3830 def float_or_none(v
, scale
=1, invscale
=1, default
=None):
3834 return float(v
) * invscale
/ scale
3835 except (ValueError, TypeError):
def bool_or_none(v, default=None):
    """Return *v* only if it is an actual bool (not merely truthy);
    otherwise return *default*."""
    if isinstance(v, bool):
        return v
    return default
def strip_or_none(v, default=None):
    """Return *v* with surrounding whitespace stripped when it is a string;
    otherwise return *default*."""
    if isinstance(v, compat_str):
        return v.strip()
    return default
3847 def url_or_none(url
):
3848 if not url
or not isinstance(url
, compat_str
):
3851 return url
if re
.match(r
'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url
) else None
3854 def strftime_or_none(timestamp
, date_format
, default
=None):
3855 datetime_object
= None
3857 if isinstance(timestamp
, compat_numeric_types
): # unix timestamp
3858 datetime_object
= datetime
.datetime
.utcfromtimestamp(timestamp
)
3859 elif isinstance(timestamp
, compat_str
): # assume YYYYMMDD
3860 datetime_object
= datetime
.datetime
.strptime(timestamp
, '%Y%m%d')
3861 return datetime_object
.strftime(date_format
)
3862 except (ValueError, TypeError, AttributeError):
3866 def parse_duration(s
):
3867 if not isinstance(s
, compat_basestring
):
3872 days
, hours
, mins
, secs
, ms
= [None] * 5
3873 m
= re
.match(r
'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s
)
3875 days
, hours
, mins
, secs
, ms
= m
.groups()
3880 [0-9]+\s*y(?:ears?)?\s*
3883 [0-9]+\s*m(?:onths?)?\s*
3886 [0-9]+\s*w(?:eeks?)?\s*
3889 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
3893 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
3896 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
3899 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
3902 days
, hours
, mins
, secs
, ms
= m
.groups()
3904 m
= re
.match(r
'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s
)
3906 hours
, mins
= m
.groups()
3912 duration
+= float(secs
)
3914 duration
+= float(mins
) * 60
3916 duration
+= float(hours
) * 60 * 60
3918 duration
+= float(days
) * 24 * 60 * 60
3920 duration
+= float(ms
)
3924 def prepend_extension(filename
, ext
, expected_real_ext
=None):
3925 name
, real_ext
= os
.path
.splitext(filename
)
3927 '{0}.{1}{2}'.format(name
, ext
, real_ext
)
3928 if not expected_real_ext
or real_ext
[1:] == expected_real_ext
3929 else '{0}.{1}'.format(filename
, ext
))
3932 def replace_extension(filename
, ext
, expected_real_ext
=None):
3933 name
, real_ext
= os
.path
.splitext(filename
)
3934 return '{0}.{1}'.format(
3935 name
if not expected_real_ext
or real_ext
[1:] == expected_real_ext
else filename
,
3939 def check_executable(exe
, args
=[]):
3940 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
3941 args can be a list of arguments for a short output (like -version) """
3943 process_communicate_or_kill(subprocess
.Popen(
3944 [exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
))
3950 def get_exe_version(exe
, args
=['--version'],
3951 version_re
=None, unrecognized
='present'):
3952 """ Returns the version of the specified executable,
3953 or False if the executable is not present """
3955 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
3956 # SIGTTOU if yt-dlp is run in the background.
3957 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
3958 out
, _
= process_communicate_or_kill(subprocess
.Popen(
3959 [encodeArgument(exe
)] + args
,
3960 stdin
=subprocess
.PIPE
,
3961 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
))
3964 if isinstance(out
, bytes): # Python 2.x
3965 out
= out
.decode('ascii', 'ignore')
3966 return detect_exe_version(out
, version_re
, unrecognized
)
3969 def detect_exe_version(output
, version_re
=None, unrecognized
='present'):
3970 assert isinstance(output
, compat_str
)
3971 if version_re
is None:
3972 version_re
= r
'version\s+([-0-9._a-zA-Z]+)'
3973 m
= re
.search(version_re
, output
)
3980 class LazyList(collections
.abc
.Sequence
):
3981 ''' Lazy immutable list from an iterable
3982 Note that slices of a LazyList are lists and not LazyList'''
3984 class IndexError(IndexError):
3987 def __init__(self
, iterable
):
3988 self
.__iterable
= iter(iterable
)
3990 self
.__reversed
= False
3994 # We need to consume the entire iterable to iterate in reverse
3995 yield from self
.exhaust()
3997 yield from self
.__cache
3998 for item
in self
.__iterable
:
3999 self
.__cache
.append(item
)
4002 def __exhaust(self
):
4003 self
.__cache
.extend(self
.__iterable
)
4007 ''' Evaluate the entire iterable '''
4008 return self
.__exhaust
()[::-1 if self
.__reversed
else 1]
4011 def __reverse_index(x
):
4012 return None if x
is None else -(x
+ 1)
4014 def __getitem__(self
, idx
):
4015 if isinstance(idx
, slice):
4017 idx
= slice(self
.__reverse
_index
(idx
.start
), self
.__reverse
_index
(idx
.stop
), -(idx
.step
or 1))
4018 start
, stop
, step
= idx
.start
, idx
.stop
, idx
.step
or 1
4019 elif isinstance(idx
, int):
4021 idx
= self
.__reverse
_index
(idx
)
4022 start
, stop
, step
= idx
, idx
, 0
4024 raise TypeError('indices must be integers or slices')
4025 if ((start
or 0) < 0 or (stop
or 0) < 0
4026 or (start
is None and step
< 0)
4027 or (stop
is None and step
> 0)):
4028 # We need to consume the entire iterable to be able to slice from the end
4029 # Obviously, never use this with infinite iterables
4032 return self
.__cache
[idx
]
4033 except IndexError as e
:
4034 raise self
.IndexError(e
) from e
4035 n
= max(start
or 0, stop
or 0) - len(self
.__cache
) + 1
4037 self
.__cache
.extend(itertools
.islice(self
.__iterable
, n
))
4039 return self
.__cache
[idx
]
4040 except IndexError as e
:
4041 raise self
.IndexError(e
) from e
4045 self
[-1] if self
.__reversed
else self
[0]
4046 except self
.IndexError:
4052 return len(self
.__cache
)
4055 self
.__reversed
= not self
.__reversed
4059 # repr and str should mimic a list. So we exhaust the iterable
4060 return repr(self
.exhaust())
4063 return repr(self
.exhaust())
4068 # This is only useful for tests
4069 return len(self
.getslice())
4071 def __init__(self
, pagefunc
, pagesize
, use_cache
=True):
4072 self
._pagefunc
= pagefunc
4073 self
._pagesize
= pagesize
4074 self
._use
_cache
= use_cache
4077 def getpage(self
, pagenum
):
4078 page_results
= self
._cache
.get(pagenum
) or list(self
._pagefunc
(pagenum
))
4080 self
._cache
[pagenum
] = page_results
4083 def getslice(self
, start
=0, end
=None):
4084 return list(self
._getslice
(start
, end
))
    def _getslice(self, start, end):
        # Subclass hook: yield the entries in [start, end).
        raise NotImplementedError('This method must be implemented by subclasses')
4089 def __getitem__(self
, idx
):
4090 # NOTE: cache must be enabled if this is used
4091 if not isinstance(idx
, int) or idx
< 0:
4092 raise TypeError('indices must be non-negative integers')
4093 entries
= self
.getslice(idx
, idx
+ 1)
4094 return entries
[0] if entries
else None
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages lazily, one at a time, as they are sliced."""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            page_results = self.getpage(pagenum)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
class InAdvancePagedList(PagedList):
    """PagedList where the total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagecount = pagecount
        PagedList.__init__(self, pagefunc, pagesize, True)

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break
            yield from page_results
def uppercase_escape(s):
    """Expand literal \\UXXXXXXXX escape sequences in *s* to their characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Expand literal \\uXXXX escape sequences in *s* to their characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2 needs bytes input for quote(); on Python 3 this branch is dead
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # The safe-set keeps all RFC 3986 reserved/unreserved delimiters intact
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        # IDNA-encode the host; percent-escape every other component
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
def parse_qs(url):
    """Parse the query string of *url* into a dict of value lists."""
    return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
def read_batch_urls(batch_fd):
    """Read a batch file descriptor and return the list of cleaned-up URLs."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        # Skip blank lines and comment lines
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', url, 1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
def update_url_query(url, query):
    """Return *url* with the key/value pairs of *query* merged into its query string."""
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Clone request *req*, optionally overriding its URL, body, headers or query.

    The clone uses the request class matching the original's HTTP method
    (HEAD/PUT/generic) and preserves origin_req_host, unverifiable and timeout.
    Fix: replaced the mutable default arguments ``headers={}, query={}`` with
    ``None`` sentinels (behaviorally identical for all callers).
    """
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query or {})
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
def _multipart_encode_impl(data, boundary):
    """Serialize *data* (dict) as multipart/form-data using *boundary*.

    Raises ValueError when the boundary occurs inside a part, so the caller
    can retry with a different boundary.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary

    out = b''
    for k, v in data.items():
        out += b'--' + boundary.encode('ascii') + b'\r\n'
        if isinstance(k, compat_str):
            k = k.encode('utf-8')
        if isinstance(v, compat_str):
            v = v.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary.encode('ascii') in content:
            raise ValueError('Boundary overlaps with data')
        out += content

    out += b'--' + boundary.encode('ascii') + b'--\r\n'

    return out, content_type
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

        try:
            out, content_type = _multipart_encode_impl(data, boundary)
            break
        except ValueError:
            # A random boundary collided with the payload: retry with a new one.
            # A caller-supplied boundary cannot be changed, so re-raise.
            if has_specified_boundary:
                raise
            boundary = None

    return out, content_type
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Get *key_or_keys* from *d*; for a list/tuple of keys, return the first
    usable value (not missing, not None, and — unless skip_false_values is
    False — truthy). Falls back to *default*."""
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
def try_get(src, getter, expected_type=None):
    """Apply each getter to *src*, returning the first result that does not
    raise and (when given) matches *expected_type*; otherwise None."""
    for get in variadic(getter):
        try:
            v = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            pass
        else:
            if expected_type is None or isinstance(v, expected_type):
                return v
def merge_dicts(*dicts):
    """Merge dicts left-to-right; a later dict only fills keys that are
    missing, or replaces an empty string with a non-empty one."""
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if v is None:
                continue
            if (k not in merged
                    or (isinstance(v, compat_str) and v
                        and isinstance(merged[k], compat_str)
                        and not merged[k])):
                merged[k] = v
    return merged
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Coerce *string* to compat_str, decoding bytes with *encoding* if needed."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
4349 TV_PARENTAL_GUIDELINES
= {
def parse_age_limit(s):
    """Normalize an age limit (int, '18', '18+', US MPAA or TV rating string)
    to an integer age, or None when unrecognized."""
    if type(s) == int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    # Accept 'TV-MA', 'TV_MA' and 'TVMA' style spellings
    m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if m:
        return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
    return None
def strip_jsonp(code):
    """Strip a JSONP wrapper (``callback({...});``) and return the raw JSON."""
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
def js_to_json(code, vars={}):
    # vars is a dict of var, val pairs to substitute
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        # Rewrite one JS token (string, literal, number, comment, ...) as JSON
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ""

        if v[0] in ("'", '"'):
            # Re-escape the string body so it is valid JSON
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    return '"%d":' % i if v.endswith(':') else '%d' % i

            if v in vars:
                return vars[v]

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality ids rank below every known one
            return -1
    return q
4445 'default': '%(title)s [%(id)s].%(ext)s',
4446 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
4452 'description': 'description',
4453 'annotation': 'annotations.xml',
4454 'infojson': 'info.json',
4455 'pl_thumbnail': None,
4456 'pl_description': 'description',
4457 'pl_infojson': 'info.json',
4460 # As of [1] format syntax is:
4461 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
4462 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
4463 STR_FORMAT_RE_TMPL
= r
'''(?x)
4464 (?<!%)(?P<prefix>(?:%%)*)
4466 (?P<has_key>\((?P<key>{0})\))? # mapping key
4468 (?:[#0\-+ ]+)? # conversion flags (optional)
4469 (?:\d+)? # minimum field width (optional)
4470 (?:\.\d+)? # precision (optional)
4471 [hlL]? # length modifier (optional)
4472 {1} # conversion type
4477 STR_FORMAT_TYPES
= 'diouxXeEfFgGcrs'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Total result length stays exactly *length*
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
def is_outdated_version(version, limit, assume_new=True):
    """Return True when *version* is older than *limit*.

    Missing or unparseable versions yield ``not assume_new``.
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """
    # Self-update is disabled; the original zipimport/frozen check is kept
    # below for reference (unreachable).
    return False

    from zipimport import zipimporter
    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(compat_shlex_quote(a) for a in args)
def error_to_compat_str(err):
    """Return the message of *err* as a text string."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
def mimetype2ext(mt):
    """Map a MIME type to a file extension; unknown subtypes fall through
    as-is (lowercased, parameters stripped)."""
    if mt is None:
        return None

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')
    res = res.split(';')[0].strip().lower()

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
    }.get(res, res)
def parse_codecs(codecs_str):
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in split_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # Two unrecognized codecs: assume video+audio ordering
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
def urlhandle_detect_ext(url_handle):
    """Guess the file extension of a response: filename from
    Content-Disposition first, then the Content-Type mapping."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build a base64 ``data:`` URI from raw *data* bytes."""
    encoded = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, encoded)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Derive the download protocol for *info_dict*: explicit field first,
    then URL prefix, then extension, then URL scheme."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data, delim=False, extraGap=0, hideEmpty=False):
    """ Render a list of rows, each as a list of values """

    def get_max_lens(table):
        # Widest stringified cell per column
        return [max(len(compat_str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        return [col for (take, col) in zip(filterArray, row) if take]

    if hideEmpty:
        # Drop columns whose every data cell is empty
        max_lens = get_max_lens(data)
        header_row = filter_using_list(header_row, max_lens)
        data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    if delim:
        # Insert a dashed separator line under the header
        table = [header_row] + [['-' * ml for ml in max_lens]] + data
    format_str = ' '.join('%-' + compat_str(ml + extraGap) + 's' for ml in max_lens[:-1]) + ' %s'
    return '\n'.join(format_str % tuple(row) for row in table)
def _match_one(filter_part, dct, incomplete):
    # TODO: Generalize code with YoutubeDL._build_format_filter
    STRING_OPERATORS = {
        '*=': operator.contains,
        '^=': lambda attr, value: attr.startswith(value),
        '$=': lambda attr, value: attr.endswith(value),
        '~=': lambda attr, value: re.search(value, attr),
    }
    COMPARISON_OPERATORS = {
        **STRING_OPERATORS,
        '<=': operator.le,  # "<=" must be defined above "<"
        '<': operator.lt,
        '>=': operator.ge,
        '>': operator.gt,
        '=': operator.eq,
    }

    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
            (?P<strval>.+?)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        unnegated_op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('negation'):
            op = lambda attr, value: not unnegated_op(attr, value)
        else:
            op = unnegated_op
        actual_value = dct.get(m.group('key'))
        if (m.group('quotedstrval') is not None
                or m.group('strval') is not None
                # If the original field is a string and matching comparisonvalue is
                # a number we should respect the origin of the original field
                # and process comparison value as a string (see
                # https://github.com/ytdl-org/youtube-dl/issues/11082).
                or actual_value is not None and m.group('intval') is not None
                and isinstance(actual_value, compat_str)):
            comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
            quote = m.group('quote')
            if quote is not None:
                # Unescape quotes of the same kind as the enclosing ones
                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
        else:
            if m.group('op') in STRING_OPERATORS:
                raise ValueError('Operator %s only supports string values!' % m.group('op'))
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        if actual_value is None:
            return incomplete or m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if incomplete and actual_value is None:
            return True
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
        When incomplete, all conditions passes on missing fields
    """
    # '&' joins conditions; a backslash-escaped '\&' is a literal ampersand
    return all(
        _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
        for filter_part in re.split(r'(?<!\\)&', filter_str))
def match_filter_func(filter_str):
    """Build a match-filter callable: returns None when the video passes,
    or a human-readable skip message otherwise."""
    def _match_func(info_dict, *args, **kwargs):
        if match_str(filter_str, info_dict, *args, **kwargs):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression to seconds (float), or None."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    # hh:mm:ss[.frac] — a trailing :frames component is treated as a fraction
    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format *seconds* as an SRT timecode ``HH:MM:SS,mmm``."""
    return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    class TTMLPElementParser(object):
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties the surrounding element already applies
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    # Rewrite legacy TTAF namespaces to the modern TTML ones before parsing
    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    # Parent not resolved yet — retry in the next pass
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
def cli_option(params, command_option, param):
    """Return ['--opt', value] for *param* in *params*, or [] when unset."""
    param = params.get(param)
    if param:
        param = compat_str(param)
    return [command_option, param] if param is not None else []
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as CLI arguments; [] when the param is unset."""
    param = params.get(param)
    if param is None:
        return []
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit a bare flag when the param equals *expected_value*, else []."""
    param = params.get(param)
    return [command_option] if param == expected_value else []
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Pick the first matching key-list's argument lists out of *argdict*."""
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        else:
            argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        arg_list = list(filter(
            lambda x: x is not None,
            [argdict.get(key.lower()) for key in variadic(key_list)]))
        if arg_list:
            # Flatten the collected per-key argument lists
            return [arg for args in arg_list for arg in args]
    return default
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve the argument lookup keys for *main_key*/*exe* and delegate to
    cli_configuration_args."""
    main_key, exe = main_key.lower(), exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    keys = [f'{root_key}{k}' for k in (keys or [''])]
    if root_key in keys:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    else:
        use_compat = False
    return cli_configuration_args(argdict, keys, default, use_compat)
5026 class ISO639Utils(object):
5027 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
5086 'iw': 'heb', # Replaced by he in 1989 revision
5096 'in': 'ind', # Replaced by id in 1989 revision
5211 'ji': 'yid', # Replaced by yi in 1989 revision
5219 def short2long(cls, code):
5220 """Convert language code from ISO 639-1 to ISO 639-2/T"""
5221 return cls._lang_map.get(code[:2])
5224 def long2short(cls, code):
5225 """Convert language code from ISO 639-2/T to ISO 639-1"""
5226 for short_name, long_name in cls._lang_map.items():
5227 if long_name == code:
5231 class ISO3166Utils(object):
5232 # From http://data.okfn.org/data/core/country-list
5234 'AF': 'Afghanistan',
5235 'AX': 'Åland Islands',
5238 'AS': 'American Samoa',
5243 'AG': 'Antigua and Barbuda',
5260 'BO': 'Bolivia, Plurinational State of',
5261 'BQ': 'Bonaire, Sint Eustatius and Saba',
5262 'BA': 'Bosnia and Herzegovina',
5264 'BV': 'Bouvet Island',
5266 'IO': 'British Indian Ocean Territory',
5267 'BN': 'Brunei Darussalam',
5269 'BF': 'Burkina Faso',
5275 'KY': 'Cayman Islands',
5276 'CF': 'Central African Republic',
5280 'CX': 'Christmas Island',
5281 'CC': 'Cocos (Keeling) Islands',
5285 'CD': 'Congo, the Democratic Republic of the',
5286 'CK': 'Cook Islands',
5288 'CI': 'Côte d\'Ivoire',
5293 'CZ': 'Czech Republic',
5297 'DO': 'Dominican Republic',
5300 'SV': 'El Salvador',
5301 'GQ': 'Equatorial Guinea',
5305 'FK': 'Falkland Islands (Malvinas)',
5306 'FO': 'Faroe Islands',
5310 'GF': 'French Guiana',
5311 'PF': 'French Polynesia',
5312 'TF': 'French Southern Territories',
5327 'GW': 'Guinea-Bissau',
5330 'HM': 'Heard Island and McDonald Islands',
5331 'VA': 'Holy See (Vatican City State)',
5338 'IR': 'Iran, Islamic Republic of',
5341 'IM': 'Isle of Man',
5351 'KP': 'Korea, Democratic People\'s Republic of',
5352 'KR': 'Korea, Republic of',
5355 'LA': 'Lao People\'s Democratic Republic',
5361 'LI': 'Liechtenstein',
5365 'MK': 'Macedonia, the Former Yugoslav Republic of',
5372 'MH': 'Marshall Islands',
5378 'FM': 'Micronesia, Federated States of',
5379 'MD': 'Moldova, Republic of',
5390 'NL': 'Netherlands',
5391 'NC': 'New Caledonia',
5392 'NZ': 'New Zealand',
5397 'NF': 'Norfolk Island',
5398 'MP': 'Northern Mariana Islands',
5403 'PS': 'Palestine, State of',
5405 'PG': 'Papua New Guinea',
5408 'PH': 'Philippines',
5412 'PR': 'Puerto Rico',
5416 'RU': 'Russian Federation',
5418 'BL': 'Saint Barthélemy',
5419 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
5420 'KN': 'Saint Kitts and Nevis',
5421 'LC': 'Saint Lucia',
5422 'MF': 'Saint Martin (French part)',
5423 'PM': 'Saint Pierre and Miquelon',
5424 'VC': 'Saint Vincent and the Grenadines',
5427 'ST': 'Sao Tome and Principe',
5428 'SA': 'Saudi Arabia',
5432 'SL': 'Sierra Leone',
5434 'SX': 'Sint Maarten (Dutch part)',
5437 'SB': 'Solomon Islands',
5439 'ZA': 'South Africa',
5440 'GS': 'South Georgia and the South Sandwich Islands',
5441 'SS': 'South Sudan',
5446 'SJ': 'Svalbard and Jan Mayen',
5449 'CH': 'Switzerland',
5450 'SY': 'Syrian Arab Republic',
5451 'TW': 'Taiwan, Province of China',
5453 'TZ': 'Tanzania, United Republic of',
5455 'TL': 'Timor-Leste',
5459 'TT': 'Trinidad and Tobago',
5462 'TM': 'Turkmenistan',
5463 'TC': 'Turks and Caicos Islands',
5467 'AE': 'United Arab Emirates',
5468 'GB': 'United Kingdom',
5469 'US': 'United States',
5470 'UM': 'United States Minor Outlying Islands',
5474 'VE': 'Venezuela, Bolivarian Republic of',
5476 'VG': 'Virgin Islands, British',
5477 'VI': 'Virgin Islands, U.S.',
5478 'WF': 'Wallis and Futuna',
5479 'EH': 'Western Sahara',
5486 def short2full(cls, code):
5487 """Convert an ISO 3166-2 country code to the corresponding full name"""
5488 return cls._country_map.get(code.upper())
5491 class GeoUtils(object):
5492 # Major IPv4 address blocks per country
5494 'AD': '46.172.224.0/19',
5495 'AE': '94.200.0.0/13',
5496 'AF': '149.54.0.0/17',
5497 'AG': '209.59.64.0/18',
5498 'AI': '204.14.248.0/21',
5499 'AL': '46.99.0.0/16',
5500 'AM': '46.70.0.0/15',
5501 'AO': '105.168.0.0/13',
5502 'AP': '182.50.184.0/21',
5503 'AQ': '23.154.160.0/24',
5504 'AR': '181.0.0.0/12',
5505 'AS': '202.70.112.0/20',
5506 'AT': '77.116.0.0/14',
5507 'AU': '1.128.0.0/11',
5508 'AW': '181.41.0.0/18',
5509 'AX': '185.217.4.0/22',
5510 'AZ': '5.197.0.0/16',
5511 'BA': '31.176.128.0/17',
5512 'BB': '65.48.128.0/17',
5513 'BD': '114.130.0.0/16',
5515 'BF': '102.178.0.0/15',
5516 'BG': '95.42.0.0/15',
5517 'BH': '37.131.0.0/17',
5518 'BI': '154.117.192.0/18',
5519 'BJ': '137.255.0.0/16',
5520 'BL': '185.212.72.0/23',
5521 'BM': '196.12.64.0/18',
5522 'BN': '156.31.0.0/16',
5523 'BO': '161.56.0.0/16',
5524 'BQ': '161.0.80.0/20',
5525 'BR': '191.128.0.0/12',
5526 'BS': '24.51.64.0/18',
5527 'BT': '119.2.96.0/19',
5528 'BW': '168.167.0.0/16',
5529 'BY': '178.120.0.0/13',
5530 'BZ': '179.42.192.0/18',
5531 'CA': '99.224.0.0/11',
5532 'CD': '41.243.0.0/16',
5533 'CF': '197.242.176.0/21',
5534 'CG': '160.113.0.0/16',
5535 'CH': '85.0.0.0/13',
5536 'CI': '102.136.0.0/14',
5537 'CK': '202.65.32.0/19',
5538 'CL': '152.172.0.0/14',
5539 'CM': '102.244.0.0/14',
5540 'CN': '36.128.0.0/10',
5541 'CO': '181.240.0.0/12',
5542 'CR': '201.192.0.0/12',
5543 'CU': '152.206.0.0/15',
5544 'CV': '165.90.96.0/19',
5545 'CW': '190.88.128.0/17',
5546 'CY': '31.153.0.0/16',
5547 'CZ': '88.100.0.0/14',
5549 'DJ': '197.241.0.0/17',
5550 'DK': '87.48.0.0/12',
5551 'DM': '192.243.48.0/20',
5552 'DO': '152.166.0.0/15',
5553 'DZ': '41.96.0.0/12',
5554 'EC': '186.68.0.0/15',
5555 'EE': '90.190.0.0/15',
5556 'EG': '156.160.0.0/11',
5557 'ER': '196.200.96.0/20',
5558 'ES': '88.0.0.0/11',
5559 'ET': '196.188.0.0/14',
5560 'EU': '2.16.0.0/13',
5561 'FI': '91.152.0.0/13',
5562 'FJ': '144.120.0.0/16',
5563 'FK': '80.73.208.0/21',
5564 'FM': '119.252.112.0/20',
5565 'FO': '88.85.32.0/19',
5567 'GA': '41.158.0.0/15',
5569 'GD': '74.122.88.0/21',
5570 'GE': '31.146.0.0/16',
5571 'GF': '161.22.64.0/18',
5572 'GG': '62.68.160.0/19',
5573 'GH': '154.160.0.0/12',
5574 'GI': '95.164.0.0/16',
5575 'GL': '88.83.0.0/19',
5576 'GM': '160.182.0.0/15',
5577 'GN': '197.149.192.0/18',
5578 'GP': '104.250.0.0/19',
5579 'GQ': '105.235.224.0/20',
5580 'GR': '94.64.0.0/13',
5581 'GT': '168.234.0.0/16',
5582 'GU': '168.123.0.0/16',
5583 'GW': '197.214.80.0/20',
5584 'GY': '181.41.64.0/18',
5585 'HK': '113.252.0.0/14',
5586 'HN': '181.210.0.0/16',
5587 'HR': '93.136.0.0/13',
5588 'HT': '148.102.128.0/17',
5589 'HU': '84.0.0.0/14',
5590 'ID': '39.192.0.0/10',
5591 'IE': '87.32.0.0/12',
5592 'IL': '79.176.0.0/13',
5593 'IM': '5.62.80.0/20',
5594 'IN': '117.192.0.0/10',
5595 'IO': '203.83.48.0/21',
5596 'IQ': '37.236.0.0/14',
5597 'IR': '2.176.0.0/12',
5598 'IS': '82.221.0.0/16',
5599 'IT': '79.0.0.0/10',
5600 'JE': '87.244.64.0/18',
5601 'JM': '72.27.0.0/17',
5602 'JO': '176.29.0.0/16',
5603 'JP': '133.0.0.0/8',
5604 'KE': '105.48.0.0/12',
5605 'KG': '158.181.128.0/17',
5606 'KH': '36.37.128.0/17',
5607 'KI': '103.25.140.0/22',
5608 'KM': '197.255.224.0/20',
5609 'KN': '198.167.192.0/19',
5610 'KP': '175.45.176.0/22',
5611 'KR': '175.192.0.0/10',
5612 'KW': '37.36.0.0/14',
5613 'KY': '64.96.0.0/15',
5614 'KZ': '2.72.0.0/13',
5615 'LA': '115.84.64.0/18',
5616 'LB': '178.135.0.0/16',
5617 'LC': '24.92.144.0/20',
5618 'LI': '82.117.0.0/19',
5619 'LK': '112.134.0.0/15',
5620 'LR': '102.183.0.0/16',
5621 'LS': '129.232.0.0/17',
5622 'LT': '78.56.0.0/13',
5623 'LU': '188.42.0.0/16',
5624 'LV': '46.109.0.0/16',
5625 'LY': '41.252.0.0/14',
5626 'MA': '105.128.0.0/11',
5627 'MC': '88.209.64.0/18',
5628 'MD': '37.246.0.0/16',
5629 'ME': '178.175.0.0/17',
5630 'MF': '74.112.232.0/21',
5631 'MG': '154.126.0.0/17',
5632 'MH': '117.103.88.0/21',
5633 'MK': '77.28.0.0/15',
5634 'ML': '154.118.128.0/18',
5635 'MM': '37.111.0.0/17',
5636 'MN': '49.0.128.0/17',
5637 'MO': '60.246.0.0/16',
5638 'MP': '202.88.64.0/20',
5639 'MQ': '109.203.224.0/19',
5640 'MR': '41.188.64.0/18',
5641 'MS': '208.90.112.0/22',
5642 'MT': '46.11.0.0/16',
5643 'MU': '105.16.0.0/12',
5644 'MV': '27.114.128.0/18',
5645 'MW': '102.70.0.0/15',
5646 'MX': '187.192.0.0/11',
5647 'MY': '175.136.0.0/13',
5648 'MZ': '197.218.0.0/15',
5649 'NA': '41.182.0.0/16',
5650 'NC': '101.101.0.0/18',
5651 'NE': '197.214.0.0/18',
5652 'NF': '203.17.240.0/22',
5653 'NG': '105.112.0.0/12',
5654 'NI': '186.76.0.0/15',
5655 'NL': '145.96.0.0/11',
5656 'NO': '84.208.0.0/13',
5657 'NP': '36.252.0.0/15',
5658 'NR': '203.98.224.0/19',
5659 'NU': '49.156.48.0/22',
5660 'NZ': '49.224.0.0/14',
5661 'OM': '5.36.0.0/15',
5662 'PA': '186.72.0.0/15',
5663 'PE': '186.160.0.0/14',
5664 'PF': '123.50.64.0/18',
5665 'PG': '124.240.192.0/19',
5666 'PH': '49.144.0.0/13',
5667 'PK': '39.32.0.0/11',
5668 'PL': '83.0.0.0/11',
5669 'PM': '70.36.0.0/20',
5670 'PR': '66.50.0.0/16',
5671 'PS': '188.161.0.0/16',
5672 'PT': '85.240.0.0/13',
5673 'PW': '202.124.224.0/20',
5674 'PY': '181.120.0.0/14',
5675 'QA': '37.210.0.0/15',
5676 'RE': '102.35.0.0/16',
5677 'RO': '79.112.0.0/13',
5678 'RS': '93.86.0.0/15',
5679 'RU': '5.136.0.0/13',
5680 'RW': '41.186.0.0/16',
5681 'SA': '188.48.0.0/13',
5682 'SB': '202.1.160.0/19',
5683 'SC': '154.192.0.0/11',
5684 'SD': '102.120.0.0/13',
5685 'SE': '78.64.0.0/12',
5686 'SG': '8.128.0.0/10',
5687 'SI': '188.196.0.0/14',
5688 'SK': '78.98.0.0/15',
5689 'SL': '102.143.0.0/17',
5690 'SM': '89.186.32.0/19',
5691 'SN': '41.82.0.0/15',
5692 'SO': '154.115.192.0/18',
5693 'SR': '186.179.128.0/17',
5694 'SS': '105.235.208.0/21',
5695 'ST': '197.159.160.0/19',
5696 'SV': '168.243.0.0/16',
5697 'SX': '190.102.0.0/20',
5699 'SZ': '41.84.224.0/19',
5700 'TC': '65.255.48.0/20',
5701 'TD': '154.68.128.0/19',
5702 'TG': '196.168.0.0/14',
5703 'TH': '171.96.0.0/13',
5704 'TJ': '85.9.128.0/18',
5705 'TK': '27.96.24.0/21',
5706 'TL': '180.189.160.0/20',
5707 'TM': '95.85.96.0/19',
5708 'TN': '197.0.0.0/11',
5709 'TO': '175.176.144.0/21',
5710 'TR': '78.160.0.0/11',
5711 'TT': '186.44.0.0/15',
5712 'TV': '202.2.96.0/19',
5713 'TW': '120.96.0.0/11',
5714 'TZ': '156.156.0.0/14',
5715 'UA': '37.52.0.0/14',
5716 'UG': '102.80.0.0/13',
5718 'UY': '167.56.0.0/13',
5719 'UZ': '84.54.64.0/18',
5720 'VA': '212.77.0.0/19',
5721 'VC': '207.191.240.0/21',
5722 'VE': '186.88.0.0/13',
5723 'VG': '66.81.192.0/20',
5724 'VI': '146.226.0.0/16',
5725 'VN': '14.160.0.0/11',
5726 'VU': '202.80.32.0/20',
5727 'WF': '117.20.32.0/21',
5728 'WS': '202.4.32.0/19',
5729 'YE': '134.35.0.0/16',
5730 'YT': '41.242.116.0/22',
5731 'ZA': '41.0.0.0/11',
5732 'ZM': '102.144.0.0/13',
5733 'ZW': '102.177.192.0/18',
5737 def random_ipv4(cls, code_or_block):
5738 if len(code_or_block) == 2:
5739 block = cls._country_ip_map.get(code_or_block.upper())
5743 block = code_or_block
5744 addr, preflen = block.split('/')
5745 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
5746 addr_max = addr_min | (0xffffffff >> int(preflen))
5747 return compat_str(socket.inet_ntoa(
5748 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Install default handlers for both schemes; the lambda defaults
        # bind the current `type` and method at definition time.
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # A per-request proxy overrides the handler default
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
# released into Public Domain
# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387

def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # after much testing, this algorithm was deemed to be the fastest
    s = b''
    n = int(n)
    # Emit 32 bits per iteration, most significant word first
    while n > 0:
        s = compat_struct_pack('>I', n & 0xffffffff) + s
        n = n >> 32
    # strip off leading zeros
    for i in range(len(s)):
        if s[i] != b'\000'[0]:
            break
    else:
        # only happens when n == 0
        s = b'\000'
        i = 0
    s = s[i:]
    # add back some pad bytes. this could be done more efficiently w.r.t. the
    # de-padding being done above, but sigh...
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    acc = 0
    length = len(s)
    if length % 4:
        # Left-pad with zero bytes up to a multiple of four
        extra = 4 - length % 4
        s = b'\000' * extra + s
        length += extra
    # Fold in one big-endian 32-bit word at a time
    for i in range(0, length, 4):
        acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
    return acc
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # OHDave's JS treats the input as a little-endian integer, hence the
    # byte reversal before hexlifying
    plaintext = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(plaintext, exponent, modulus)
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data        input data (list of byte values)
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if the data cannot fit in `length` bytes
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # RFC 8017 §7.2.1 (EME-PKCS1-v1_5): the padding string PS must consist
    # of *nonzero* pseudo-random octets — a zero octet would be mistaken
    # for the 0x00 separator when the padding is stripped. The previous
    # randint(0, 254) could emit zeros (and never 0xff).
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer `num` in base `n`, using `table` as
    the digit alphabet (defaults to 0-9a-zA-Z, so n may be at most 62)."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Collect digits least-significant first, then reverse
    digits = []
    while num:
        num, rem = divmod(num, n)
        digits.append(table[rem])
    return ''.join(reversed(digits))
def decode_packed_codes(code):
    """Unpack JS obfuscated with Dean Edwards' p.a.c.k.e.r.: every word in
    the packed payload is a base-`base` index into a '|'-separated symbol
    list; substitute each back to recover the original source."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    words = symbols.split('|')

    # Map each base-n token to its replacement; an empty slot means the
    # token stands for itself
    symbol_table = {}
    for index in range(count - 1, -1, -1):
        token = encode_base_n(index, base)
        symbol_table[token] = words[index] or token

    return re.sub(
        r'\b(\w+)\b', lambda m: symbol_table[m.group(0)],
        obfuscated_code)
def caesar(s, alphabet, shift):
    """Rotate every character of `s` that occurs in `alphabet` forward by
    `shift` positions (wrapping); other characters pass through unchanged."""
    if shift == 0:
        return s
    size = len(alphabet)
    return ''.join(
        c if c not in alphabet else alphabet[(alphabet.index(c) + shift) % size]
        for c in s)
def rot47(s):
    """Apply the ROT47 cipher: rotate the printable ASCII range (33-126) by 47."""
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=value,KEY="quoted"') into a dict,
    stripping surrounding double quotes from quoted values."""
    return {
        key: val[1:-1] if val.startswith('"') else val
        for key, val in re.findall(
            r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    }
def urshift(val, n):
    """Unsigned 32-bit right shift, i.e. JavaScript's `>>>` operator."""
    if val >= 0:
        return val >> n
    # Negative values: reinterpret as an unsigned 32-bit integer first
    return (val + 0x100000000) >> n
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a (24-bit RGB) PNG into (width, height, pixels), where pixels
    is a list of rows of raw byte values with PNG filters undone."""
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Walk the chunk stream: length, type, data, CRC
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Concatenate all IDAT chunks into one zlib stream
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Each scanline is prefixed with one filter-type byte
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Paeth predictor: pick the neighbour closest to p
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    """Set extended attribute `key` (str) to `value` (bytes) on `path`,
    trying pyxattr/xattr modules, NTFS ADS on Windows, then the setfattr
    or xattr command-line tools.

    Raises XAttrUnavailableError if no tool is usable and XAttrMetadataError
    if the write itself fails.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'yt-dlp requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The CLI tools take the value as text
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)]
                       + [encodeArgument(o) for o in opts]
                       + [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = process_communicate_or_kill(p)
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to the string components
    of a uniformly random date between 1950-01-01 and 1995-12-31."""
    first = datetime.date(1950, 1, 1)
    last = datetime.date(1995, 12, 31)
    chosen = first + datetime.timedelta(days=random.randint(0, (last - first).days))
    return {
        year_field: str(chosen.year),
        month_field: str(chosen.month),
        day_field: str(chosen.day),
    }
6124 # Templates for internet shortcut files, which are plain text files.
6125 DOT_URL_LINK_TEMPLATE
= '''
6130 DOT_WEBLOC_LINK_TEMPLATE
= '''
6131 <?xml version="1.0" encoding="UTF-8"?>
6132 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
6133 <plist version="1.0">
6136 \t<string>%(url)s</string>
6141 DOT_DESKTOP_LINK_TEMPLATE
= '''
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
            net_location,

            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
def to_high_limit_path(path):
    """Return `path` rewritten to bypass the Windows MAX_PATH limit; on
    other platforms the path is returned unchanged."""
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # Work around MAX_PATH limitation on Windows. The maximum allowed length
    # for the individual path segments may still be quite limited.
    # The \\?\ prefix requires an absolute path.
    return '\\\\?\\' + os.path.abspath(path)
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    """Format `obj` (or `obj[field]` when `field` is given) with `template`,
    returning `default` when the value is one of `ignore`; `func`, if set,
    transforms the value before formatting."""
    if field is None:
        val = default if obj is None else obj
    else:
        val = obj.get(field, default)
    if func and val not in ignore:
        val = func(val)
    # Re-check after func: its result may itself be an ignored value
    if val in ignore:
        return default
    return template % val
def clean_podcast_url(url):
    """Strip known podcast analytics/tracking redirect prefixes from `url`."""
    return re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/''', '', url)
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random RFC 4122 version-4 UUID string.

    The fixed '4' in the third group is the version nibble; the first digit
    of the fourth group carries the variant bits and must be one of
    8, 9, a, b (RFC 4122 §4.4). The previous code drew that digit from the
    full hex range, producing invalid variant bits in 3 of 4 UUIDs.
    """
    return re.sub(
        r'[xy]',
        lambda m: _HEX_TABLE[random.randint(8, 11) if m.group(0) == 'y' else random.randint(0, 15)],
        'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
def make_dir(path, to_screen=None):
    """Ensure the parent directory of `path` exists.

    @param path       file path whose directory should be created
    @param to_screen  optional callable used to report failures
    @returns          True on success (or nothing to do), False on error
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except (OSError, IOError) as err:
        # Bug fix: the original tested `callable(to_screen) is not None`,
        # which is always True (callable() returns a bool), so a None
        # to_screen was invoked and raised TypeError on the error path.
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
def get_executable_path():
    """Return the absolute path of the directory the program runs from,
    accounting for PyInstaller bundles and zip imports."""
    from zipimport import zipimporter
    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        base = os.path.dirname(sys.executable)
    elif isinstance(globals().get('__loader__'), zipimporter):  # Running from ZIP
        # utils.py sits two levels inside the archive
        base = os.path.join(os.path.dirname(__file__), '../..')
    else:
        base = os.path.join(os.path.dirname(__file__), '..')
    return os.path.abspath(base)
def load_plugins(name, suffix, namespace):
    """Load classes ending in `suffix` from the ytdlp_plugins/<name> module
    and register any new ones into `namespace`; returns the added classes.
    Missing plugin modules are silently ignored."""
    plugin_info = [None]
    classes = []
    try:
        plugin_info = imp.find_module(
            name, [os.path.join(get_executable_path(), 'ytdlp_plugins')])
        plugins = imp.load_module(name, *plugin_info)
        for name in dir(plugins):
            if name in namespace:
                # Never overwrite an existing name
                continue
            if not name.endswith(suffix):
                continue
            klass = getattr(plugins, name)
            classes.append(klass)
            namespace[name] = klass
    except ImportError:
        pass
    finally:
        # imp.find_module returns an open file object as its first element
        if plugin_info[0] is not None:
            plugin_info[0].close()
    return classes
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a string,
                            a tuple of strings or "...". When a tuple is given,
                            all the keys given in the tuple are traversed, and
                            "..." traverses all the keys in the object
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    '''
    if not casesense:
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        nonlocal depth
        if obj is None:
            return None
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            if isinstance(key, (list, tuple)):
                # Branch: traverse each alternative key, then flatten via ...
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                # Track how many levels of nesting the final result carries
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    # Convert user-supplied strings to int indices or slices
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Flatten all but the outermost level, dropping Nones
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated shim around traverse_obj, kept for backward compatibility.
    Do not use."""
    return traverse_obj(
        dictn, keys,
        casesense=casesense, is_user_input=True, traverse_string=True)
def variadic(x, allowed_types=(str, bytes)):
    """Wrap `x` in a 1-tuple unless it is already a non-atomic iterable;
    strings and bytes (the `allowed_types`) count as atomic values."""
    is_sequence = isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types)
    return x if is_sequence else (x,)