4 from __future__
import unicode_literals
37 import xml
.etree
.ElementTree
41 compat_HTMLParseError
,
47 compat_ctypes_WINFUNCTYPE
,
48 compat_etree_fromstring
,
51 compat_html_entities_html5
,
64 compat_urllib_parse_urlencode
,
65 compat_urllib_parse_urlparse
,
66 compat_urllib_parse_urlunparse
,
67 compat_urllib_parse_quote
,
68 compat_urllib_parse_quote_plus
,
69 compat_urllib_parse_unquote_plus
,
70 compat_urllib_request
,
def register_socks_protocols():
    """Teach urlparse that SOCKS schemes carry a netloc component.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose schemes are not listed in
    urlparse.uses_netloc are not handled correctly.
    """
    known_schemes = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in known_schemes:
            known_schemes.append(scheme)
90 # This is not clearly defined otherwise
91 compiled_regex_type
= type(re
.compile(''))
94 def random_user_agent():
95 _USER_AGENT_TPL
= 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
1674 return _USER_AGENT_TPL
% random
.choice(_CHROME_VERSIONS
)
1678 'User-Agent': random_user_agent(),
1679 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
1680 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
1681 'Accept-Encoding': 'gzip, deflate',
1682 'Accept-Language': 'en-us,en;q=0.5',
1687 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# Sentinel used to detect whether a caller supplied a default value at all
# (lets None itself be a legitimate explicit default; compared with `is`).
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
1698 'en': ENGLISH_MONTH_NAMES
,
1700 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
1701 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
1704 KNOWN_EXTENSIONS
= (
1705 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
1706 'flv', 'f4v', 'f4a', 'f4b',
1707 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
1708 'mkv', 'mka', 'mk3d',
1711 'asf', 'wmv', 'wma',
1717 'f4f', 'f4m', 'm3u8', 'smil')
1719 REMUX_EXTENSIONS
= ('mp4', 'mkv', 'flv', 'webm', 'mov', 'avi', 'mp3', 'mka', 'm4a', 'ogg', 'opus')
1721 # needed for sanitizing filenames in restricted mode
1722 ACCENT_CHARS
= dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
1723 itertools
.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
1724 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
1747 '%Y/%m/%d %H:%M:%S',
1749 '%Y-%m-%d %H:%M:%S',
1750 '%Y-%m-%d %H:%M:%S.%f',
1753 '%Y-%m-%dT%H:%M:%SZ',
1754 '%Y-%m-%dT%H:%M:%S.%fZ',
1755 '%Y-%m-%dT%H:%M:%S.%f0Z',
1756 '%Y-%m-%dT%H:%M:%S',
1757 '%Y-%m-%dT%H:%M:%S.%f',
1759 '%b %d %Y at %H:%M',
1760 '%b %d %Y at %H:%M:%S',
1761 '%B %d %Y at %H:%M',
1762 '%B %d %Y at %H:%M:%S',
1765 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
1766 DATE_FORMATS_DAY_FIRST
.extend([
1772 '%d/%m/%Y %H:%M:%S',
1775 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
1776 DATE_FORMATS_MONTH_FIRST
.extend([
1781 '%m/%d/%Y %H:%M:%S',
# Matches the argument tuple of "packed" JavaScript code: the payload
# string, two integer parameters and the '|'-separated keyword table.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Captures the body of a <script type="application/ld+json"> element into
# the named group 'json_ld'; \1 backreferences whichever quote style (or
# none) surrounded the type attribute value.
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
1788 def preferredencoding():
1789 """Get preferred encoding.
1791 Returns the best encoding scheme for the system, based on
1792 locale.getpreferredencoding() and some further tweaks.
1795 pref = locale.getpreferredencoding()
1803 def write_json_file(obj, fn):
1804 """ Encode obj as JSON and write it to fn, atomically if possible """
1806 fn = encodeFilename(fn)
1807 if sys.version_info < (3, 0) and sys.platform != 'win32
':
1808 encoding = get_filesystem_encoding()
1809 # os.path.basename returns a bytes object, but NamedTemporaryFile
1810 # will fail if the filename contains non ascii characters unless we
1811 # use a unicode object
1812 path_basename = lambda f: os.path.basename(fn).decode(encoding)
1813 # the same for os.path.dirname
1814 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
1816 path_basename = os.path.basename
1817 path_dirname = os.path.dirname
1821 'prefix
': path_basename(fn) + '.',
1822 'dir': path_dirname(fn),
1826 # In Python 2.x, json.dump expects a bytestream.
1827 # In Python 3.x, it writes to a character stream
1828 if sys.version_info < (3, 0):
1833 'encoding
': 'utf
-8',
1836 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
1840 json.dump(obj, tf, default=repr)
1841 if sys.platform == 'win32
':
1842 # Need to remove existing file on Windows, else os.rename raises
1843 # WindowsError or FileExistsError.
1851 os.chmod(tf.name, 0o666 & ~mask)
1854 os.rename(tf.name, fn)
1863 if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    # Key must be a plain attribute name so the predicate below is safe.
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = '[@%s]' % key
    else:
        predicate = "[@%s='%s']" % (key, val)
    return node.find(xpath + predicate)
1870 def find_xpath_attr(node, xpath, key, val=None):
1871 for f in node.findall(compat_xpath(xpath)):
1872 if key not in f.attrib:
1874 if val is None or f.attrib.get(key) == val:
1878 # On python2.6 the xml.etree.ElementTree.Element methods don't support
1879 # the namespace parameter
1882 def xpath_with_ns(path
, ns_map
):
1883 components
= [c
.split(':') for c
in path
.split('/')]
1885 for c
in components
:
1887 replaced
.append(c
[0])
1890 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
1891 return '/'.join(replaced
)
1894 def xpath_element(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1895 def _find_xpath(xpath
):
1896 return node
.find(compat_xpath(xpath
))
1898 if isinstance(xpath
, (str, compat_str
)):
1899 n
= _find_xpath(xpath
)
1907 if default
is not NO_DEFAULT
:
1910 name
= xpath
if name
is None else name
1911 raise ExtractorError('Could not find XML element %s' % name
)
1917 def xpath_text(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1918 n
= xpath_element(node
, xpath
, name
, fatal
=fatal
, default
=default
)
1919 if n
is None or n
== default
:
1922 if default
is not NO_DEFAULT
:
1925 name
= xpath
if name
is None else name
1926 raise ExtractorError('Could not find XML element\'s text %s' % name
)
1932 def xpath_attr(node
, xpath
, key
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1933 n
= find_xpath_attr(node
, xpath
, key
)
1935 if default
is not NO_DEFAULT
:
1938 name
= '%s[@%s]' % (xpath
, key
) if name
is None else name
1939 raise ExtractorError('Could not find XML attribute %s' % name
)
1942 return n
.attrib
[key
]
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # NOTE: 'id' shadows the builtin, kept for interface compatibility.
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    # Take the first match, if any; otherwise report no element found.
    for content in get_elements_by_class(class_name, html):
        return content
    return None
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag carrying the given attribute/value,
    or None when no such tag exists."""
    for content in get_elements_by_attribute(attribute, value, html, escape_value):
        return content
    return None
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # Match class_name as a whole word anywhere inside the class attribute
    # value; the pattern itself must not be escaped again downstream.
    class_value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
1968 def get_elements_by_attribute(attribute, value, html, escape_value=True):
1969 """Return the content of the tag with the specified attribute in the passed HTML document"""
1971 value = re.escape(value) if escape_value else value
1974 for m in re.finditer(r'''(?xs)
1976 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^
']*'|
))*?
1978 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^
"]*"|
='[^']*'|))*?
1982 ''' % (re.escape(attribute), value), html):
1983 res = m.group('content
')
1985 if res.startswith('"') or res.startswith("'"):
1988 retlist.append(unescapeHTML(res))
1993 class HTMLAttributeParser(compat_HTMLParser):
1994 """Trivial HTML parser to gather the attributes for a single element"""
1998 compat_HTMLParser.__init__(self)
2000 def handle_starttag(self, tag, attrs):
2001 self.attrs = dict(attrs)
2004 def extract_attributes(html_element):
2005 """Given a string for an HTML element such as
2007 a="foo" B="bar" c="&98;az" d=boz
2008 empty= noval entity="&"
2011 Decode and return a dictionary of attributes.
2013 'a
': 'foo
', 'b
': 'bar
', c: 'baz
', d: 'boz
',
2014 'empty
': '', 'noval
': None, 'entity
': '&',
2015 'sq
': '"', 'dq': '\''
2017 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
2018 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
2020 parser = HTMLAttributeParser()
2022 parser.feed(html_element)
2024 # Older Python may throw HTMLParseError in case of malformed HTML
2025 except compat_HTMLParseError:
2030 def clean_html(html):
2031 """Clean an HTML snippet into a readable string"""
2033 if html is None: # Convenience for sanitizing descriptions etc.
2037 html = html.replace('\n', ' ')
2038 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
2039 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
2041 html = re.sub('<.*?>', '', html)
2042 # Replace html entities
2043 html = unescapeHTML(html)
2047 def sanitize_open(filename, open_mode):
2048 """Try to open the given filename, and slightly tweak it if this fails.
2050 Attempts to open the given filename. If this fails, it tries to change
2051 the filename slightly, step by step, until it's either able to open it
2052 or it fails and raises a final exception, like the standard open()
2055 It returns the tuple (stream, definitive_file_name).
2059 if sys.platform == 'win32':
2061 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
2062 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
2063 stream = open(encodeFilename(filename), open_mode)
2064 return (stream, filename)
2065 except (IOError, OSError) as err:
2066 if err.errno in (errno.EACCES,):
2069 # In case of error, try to remove win32 forbidden chars
2070 alt_filename = sanitize_path(filename)
2071 if alt_filename == filename:
2074 # An exception here should be caught in the caller
2075 stream = open(encodeFilename(alt_filename), open_mode)
2076 return (stream, alt_filename)
2079 def timeconvert(timestr):
2080 """Convert RFC 2822 defined time string into system timestamp"""
2082 timetuple = email.utils.parsedate_tz(timestr)
2083 if timetuple is not None:
2084 timestamp = email.utils.mktime_tz(timetuple)
2088 def sanitize_filename(s, restricted=False, is_id=False):
2089 """Sanitizes a string so it could be used as part of a filename.
2090 If restricted is set, use a stricter subset of allowed characters.
2091 Set is_id if this is not an arbitrary string, but an ID that should be kept
2094 def replace_insane(char):
2095 if restricted and char in ACCENT_CHARS:
2096 return ACCENT_CHARS[char]
2097 if char == '?' or ord(char) < 32 or ord(char) == 127:
2100 return '' if restricted else '\''
2102 return '_
-' if restricted else ' -'
2103 elif char in '\\/|
*<>':
2105 if restricted and (char in '!&\'()[]{}$
;`^
,#' or char.isspace()):
2107 if restricted
and ord(char
) > 127:
2112 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
2113 result
= ''.join(map(replace_insane
, s
))
2115 while '__' in result
:
2116 result
= result
.replace('__', '_')
2117 result
= result
.strip('_')
2118 # Common case of "Foreign band name - English song title"
2119 if restricted
and result
.startswith('-_'):
2121 if result
.startswith('-'):
2122 result
= '_' + result
[len('-'):]
2123 result
= result
.lstrip('.')
2129 def sanitize_path(s
, force
=False):
2130 """Sanitizes and normalizes path on Windows"""
2131 if sys
.platform
== 'win32':
2133 drive_or_unc
, _
= os
.path
.splitdrive(s
)
2134 if sys
.version_info
< (2, 7) and not drive_or_unc
:
2135 drive_or_unc
, _
= os
.path
.splitunc(s
)
2141 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
2145 path_part
if path_part
in ['.', '..'] else re
.sub(r
'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part
)
2146 for path_part
in norm_path
]
2148 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
2149 elif force
and s
[0] == os
.path
.sep
:
2150 sanitized_path
.insert(0, os
.path
.sep
)
2151 return os
.path
.join(*sanitized_path
)
def sanitize_url(url):
    """Normalize a URL: give protocol-relative URLs an http: scheme, repair
    known scheme typos, and percent-escape the result."""
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        fixed, substitutions = re.subn(mistake, fixup, url)
        if substitutions:
            return fixed
    return escape_url(url)
def sanitized_Request(url, *args, **kwargs):
    # Build a urllib Request after running the URL through sanitize_url()
    # (scheme fix-ups and escaping), forwarding any extra Request arguments.
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
2177 """Expand shell variables and ~"""
2178 return os
.path
.expandvars(compat_expanduser(s
))
2181 def orderedSet(iterable
):
2182 """ Remove all duplicates from the input iterable """
2190 def _htmlentity_transform(entity_with_semicolon
):
2191 """Transforms an HTML entity to a character."""
2192 entity
= entity_with_semicolon
[:-1]
2194 # Known non-numeric HTML entity
2195 if entity
in compat_html_entities
.name2codepoint
:
2196 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
2198 # TODO: HTML5 allows entities without a semicolon. For example,
2199 # 'Éric' should be decoded as 'Éric'.
2200 if entity_with_semicolon
in compat_html_entities_html5
:
2201 return compat_html_entities_html5
[entity_with_semicolon
]
2203 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
2204 if mobj
is not None:
2205 numstr
= mobj
.group(1)
2206 if numstr
.startswith('x'):
2208 numstr
= '0%s' % numstr
2211 # See https://github.com/ytdl-org/youtube-dl/issues/7518
2213 return compat_chr(int(numstr
, base
))
2217 # Unknown entity in name, return its literal representation
2218 return '&%s;' % entity
2221 def unescapeHTML(s
):
2224 assert type(s
) == compat_str
2227 r
'&([^&;]+;)', lambda m
: _htmlentity_transform(m
.group(1)), s
)
2230 def process_communicate_or_kill(p
, *args
, **kwargs
):
2232 return p
.communicate(*args
, **kwargs
)
2233 except BaseException
: # Including KeyboardInterrupt
2239 def get_subprocess_encoding():
2240 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2241 # For subprocess calls, encode with locale encoding
2242 # Refer to http://stackoverflow.com/a/9951851/35070
2243 encoding
= preferredencoding()
2245 encoding
= sys
.getfilesystemencoding()
2246 if encoding
is None:
2251 def encodeFilename(s
, for_subprocess
=False):
2253 @param s The name of the file
2256 assert type(s
) == compat_str
2258 # Python 3 has a Unicode API
2259 if sys
.version_info
>= (3, 0):
2262 # Pass '' directly to use Unicode APIs on Windows 2000 and up
2263 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
2264 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
2265 if not for_subprocess
and sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2268 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
2269 if sys
.platform
.startswith('java'):
2272 return s
.encode(get_subprocess_encoding(), 'ignore')
2275 def decodeFilename(b
, for_subprocess
=False):
2277 if sys
.version_info
>= (3, 0):
2280 if not isinstance(b
, bytes):
2283 return b
.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    # Encode a command-line argument for subprocess use: byte strings from
    # legacy callers are decoded first so encodeFilename() always gets text.
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
def decodeArgument(b):
    # Decode a subprocess argument: delegates to decodeFilename() with
    # for_subprocess=True so the subprocess encoding is used.
    return decodeFilename(b, True)
2299 def decodeOption(optval
):
2302 if isinstance(optval
, bytes):
2303 optval
= optval
.decode(preferredencoding())
2305 assert isinstance(optval
, compat_str
)
2309 def formatSeconds(secs
, delim
=':'):
2311 return '%d%s%02d%s%02d' % (secs
// 3600, delim
, (secs
% 3600) // 60, delim
, secs
% 60)
2313 return '%d%s%02d' % (secs
// 60, delim
, secs
% 60)
2318 def make_HTTPS_handler(params
, **kwargs
):
2319 opts_no_check_certificate
= params
.get('nocheckcertificate', False)
2320 if hasattr(ssl
, 'create_default_context'): # Python >= 3.4 or 2.7.9
2321 context
= ssl
.create_default_context(ssl
.Purpose
.SERVER_AUTH
)
2322 if opts_no_check_certificate
:
2323 context
.check_hostname
= False
2324 context
.verify_mode
= ssl
.CERT_NONE
2326 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2329 # (create_default_context present but HTTPSHandler has no context=)
2332 if sys
.version_info
< (3, 2):
2333 return YoutubeDLHTTPSHandler(params
, **kwargs
)
2334 else: # Python < 3.4
2335 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLSv1
)
2336 context
.verify_mode
= (ssl
.CERT_NONE
2337 if opts_no_check_certificate
2338 else ssl
.CERT_REQUIRED
)
2339 context
.set_default_verify_paths()
2340 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
def bug_reports_message(before=';'):
    """Build the standard 'please report this issue' blurb, joined onto
    `before` (capitalized when `before` ends a sentence or is empty)."""
    if ytdl_is_updateable():
        update_cmd = 'type yt-dlp -U to update'
    else:
        update_cmd = 'see https://github.com/yt-dlp/yt-dlp on how to update'
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp .'
           ' Make sure you are using the latest version; %s.'
           ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'
           % update_cmd)

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg
class YoutubeDLError(Exception):
    """Root of the YoutubeDL exception hierarchy."""
    pass
# Exception classes that signal a network-level failure rather than an
# extractor bug (ExtractorError consults this to skip the bug-report blurb).
network_exceptions = [
    compat_urllib_error.URLError,
    compat_http_client.HTTPException,
    socket.error,
]
if hasattr(ssl, 'CertificateError'):  # not present on some old Pythons
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)
2370 class ExtractorError(YoutubeDLError
):
2371 """Error during info extraction."""
2373 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None):
2374 """ tb, if given, is the original traceback (so that it can be printed out).
2375 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
2378 if sys
.exc_info()[0] in network_exceptions
:
2380 if video_id
is not None:
2381 msg
= video_id
+ ': ' + msg
2383 msg
+= ' (caused by %r)' % cause
2385 msg
+= bug_reports_message()
2386 super(ExtractorError
, self
).__init
__(msg
)
2389 self
.exc_info
= sys
.exc_info() # preserve original exception
2391 self
.video_id
= video_id
2393 def format_traceback(self
):
2394 if self
.traceback
is None:
2396 return ''.join(traceback
.format_tb(self
.traceback
))
class UnsupportedError(ExtractorError):
    """ExtractorError for URLs nothing can handle; always flagged as expected."""
    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
2411 class GeoRestrictedError(ExtractorError
):
2412 """Geographic restriction Error exception.
2414 This exception may be thrown when a video is not available from your
2415 geographic location due to geographic restrictions imposed by a website.
2418 def __init__(self
, msg
, countries
=None):
2419 super(GeoRestrictedError
, self
).__init
__(msg
, expected
=True)
2421 self
.countries
= countries
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """exc_info, if given, is the original exception that caused the
        trouble (as returned by sys.exc_info())."""
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
2456 class PostProcessingError(YoutubeDLError
):
2457 """Post Processing exception.
2459 This exception may be raised by PostProcessor's .run() method to
2460 indicate an error in the postprocessing task.
2463 def __init__(self
, msg
):
2464 super(PostProcessingError
, self
).__init
__(msg
)
class ExistingVideoReached(YoutubeDLError):
    """Raised when an already-downloaded video is encountered (--break-on-existing)."""
    # NOTE: the previous docstring ('--max-downloads limit has been reached')
    # was copy-pasted from MaxDownloadsReached and did not describe this class.
class RejectedVideoReached(YoutubeDLError):
    """Raised when a video matching the reject filters is encountered (--break-on-reject)."""
    # NOTE: the previous docstring ('--max-downloads limit has been reached')
    # was copy-pasted from MaxDownloadsReached and did not describe this class.
class MaxDownloadsReached(YoutubeDLError):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Keep both byte counts so callers can inspect the shortfall.
        self.downloaded = downloaded
        self.expected = expected
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected))
2509 class XAttrMetadataError(YoutubeDLError
):
2510 def __init__(self
, code
=None, msg
='Unknown error'):
2511 super(XAttrMetadataError
, self
).__init
__(msg
)
2515 # Parsing code and msg
2516 if (self
.code
in (errno
.ENOSPC
, errno
.EDQUOT
)
2517 or 'No space left' in self
.msg
or 'Disk quota exceeded' in self
.msg
):
2518 self
.reason
= 'NO_SPACE'
2519 elif self
.code
== errno
.E2BIG
or 'Argument list too long' in self
.msg
:
2520 self
.reason
= 'VALUE_TOO_LONG'
2522 self
.reason
= 'NOT_SUPPORTED'
2525 class XAttrUnavailableError(YoutubeDLError
):
2529 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
2530 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
2531 # expected HTTP responses to meet HTTP/1.0 or later (see also
2532 # https://github.com/ytdl-org/youtube-dl/issues/6727)
2533 if sys
.version_info
< (3, 0):
2534 kwargs
['strict'] = True
2535 hc
= http_class(*args
, **compat_kwargs(kwargs
))
2536 source_address
= ydl_handler
._params
.get('source_address')
2538 if source_address
is not None:
2539 # This is to workaround _create_connection() from socket where it will try all
2540 # address data from getaddrinfo() including IPv6. This filters the result from
2541 # getaddrinfo() based on the source_address value.
2542 # This is based on the cpython socket.create_connection() function.
2543 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
2544 def _create_connection(address
, timeout
=socket
._GLOBAL
_DEFAULT
_TIMEOUT
, source_address
=None):
2545 host
, port
= address
2547 addrs
= socket
.getaddrinfo(host
, port
, 0, socket
.SOCK_STREAM
)
2548 af
= socket
.AF_INET
if '.' in source_address
[0] else socket
.AF_INET6
2549 ip_addrs
= [addr
for addr
in addrs
if addr
[0] == af
]
2550 if addrs
and not ip_addrs
:
2551 ip_version
= 'v4' if af
== socket
.AF_INET
else 'v6'
2553 "No remote IP%s addresses available for connect, can't use '%s' as source address"
2554 % (ip_version
, source_address
[0]))
2555 for res
in ip_addrs
:
2556 af
, socktype
, proto
, canonname
, sa
= res
2559 sock
= socket
.socket(af
, socktype
, proto
)
2560 if timeout
is not socket
._GLOBAL
_DEFAULT
_TIMEOUT
:
2561 sock
.settimeout(timeout
)
2562 sock
.bind(source_address
)
2564 err
= None # Explicitly break reference cycle
2566 except socket
.error
as _
:
2568 if sock
is not None:
2573 raise socket
.error('getaddrinfo returns an empty list')
2574 if hasattr(hc
, '_create_connection'):
2575 hc
._create
_connection
= _create_connection
2576 sa
= (source_address
, 0)
2577 if hasattr(hc
, 'source_address'): # Python 2.7+
2578 hc
.source_address
= sa
2580 def _hc_connect(self
, *args
, **kwargs
):
2581 sock
= _create_connection(
2582 (self
.host
, self
.port
), self
.timeout
, sa
)
2584 self
.sock
= ssl
.wrap_socket(
2585 sock
, self
.key_file
, self
.cert_file
,
2586 ssl_version
=ssl
.PROTOCOL_TLSv1
)
2589 hc
.connect
= functools
.partial(_hc_connect
, hc
)
def handle_youtubedl_headers(headers):
    """Process internal youtube-dl marker headers.

    If the 'Youtubedl-no-compression' marker is present, return a copy of
    `headers` with that marker removed and any 'Accept-Encoding' header
    (matched case-insensitively) dropped, so the server does not compress
    the response.  Otherwise `headers` is returned unchanged.
    """
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        # Dict comprehension instead of dict(generator) — same result,
        # clearer and idiomatic (flake8-comprehensions C402).
        filtered_headers = {
            k: v for k, v in filtered_headers.items()
            if k.lower() != 'accept-encoding'
        }
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
2604 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
2605 """Handler for HTTP requests and responses.
2607 This class, when installed with an OpenerDirector, automatically adds
2608 the standard headers to every HTTP request and handles gzipped and
2609 deflated responses from web servers. If compression is to be avoided in
2610 a particular request, the original request in the program code only has
2611 to include the HTTP header "Youtubedl-no-compression", which will be
2612 removed before making the real request.
2614 Part of this code was copied from:
2616 http://techknack.net/python-urllib2-handlers/
2618 Andrew Rowls, the author of that code, agreed to release it to the
    def __init__(self, params, *args, **kwargs):
        # Standard HTTPHandler setup, plus a reference to the YoutubeDL
        # params dict (read later through self._params, e.g. for the
        # 'source_address' option in _create_http_connection).
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params
2626 def http_open(self
, req
):
2627 conn_class
= compat_http_client
.HTTPConnection
2629 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2631 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2632 del req
.headers
['Ytdl-socks-proxy']
2634 return self
.do_open(functools
.partial(
2635 _create_http_connection
, self
, conn_class
, False),
2643 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
2645 return zlib
.decompress(data
)
2647 def http_request(self
, req
):
2648 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
2649 # always respected by websites, some tend to give out URLs with non percent-encoded
2650 # non-ASCII characters (see telemb.py, ard.py [#3412])
2651 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
2652 # To work around aforementioned issue we will replace request's original URL with
2653 # percent-encoded one
2654 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
2655 # the code of this workaround has been moved here from YoutubeDL.urlopen()
2656 url
= req
.get_full_url()
2657 url_escaped
= escape_url(url
)
2659 # Substitute URL if any change after escaping
2660 if url
!= url_escaped
:
2661 req
= update_Request(req
, url
=url_escaped
)
2663 for h
, v
in std_headers
.items():
2664 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
2665 # The dict keys are capitalized because of this bug by urllib
2666 if h
.capitalize() not in req
.headers
:
2667 req
.add_header(h
, v
)
2669 req
.headers
= handle_youtubedl_headers(req
.headers
)
2671 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
2672 # Python 2.6 is brain-dead when it comes to fragments
2673 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
2674 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
2678 def http_response(self
, req
, resp
):
2681 if resp
.headers
.get('Content-encoding', '') == 'gzip':
2682 content
= resp
.read()
2683 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
2685 uncompressed
= io
.BytesIO(gz
.read())
2686 except IOError as original_ioerror
:
2687 # There may be junk add the end of the file
2688 # See http://stackoverflow.com/q/4928560/35070 for details
2689 for i
in range(1, 1024):
2691 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
2692 uncompressed
= io
.BytesIO(gz
.read())
2697 raise original_ioerror
2698 resp
= compat_urllib_request
.addinfourl(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2699 resp
.msg
= old_resp
.msg
2700 del resp
.headers
['Content-encoding']
2702 if resp
.headers
.get('Content-encoding', '') == 'deflate':
2703 gz
= io
.BytesIO(self
.deflate(resp
.read()))
2704 resp
= compat_urllib_request
.addinfourl(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2705 resp
.msg
= old_resp
.msg
2706 del resp
.headers
['Content-encoding']
2707 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
2708 # https://github.com/ytdl-org/youtube-dl/issues/6457).
2709 if 300 <= resp
.code
< 400:
2710 location
= resp
.headers
.get('Location')
2712 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
2713 if sys
.version_info
>= (3, 0):
2714 location
= location
.encode('iso-8859-1').decode('utf-8')
2716 location
= location
.decode('utf-8')
2717 location_escaped
= escape_url(location
)
2718 if location
!= location_escaped
:
2719 del resp
.headers
['Location']
2720 if sys
.version_info
< (3, 0):
2721 location_escaped
= location_escaped
.encode('utf-8')
2722 resp
.headers
['Location'] = location_escaped
2725 https_request
= http_request
2726 https_response
= http_response
2729 def make_socks_conn_class(base_class
, socks_proxy
):
2730 assert issubclass(base_class
, (
2731 compat_http_client
.HTTPConnection
, compat_http_client
.HTTPSConnection
))
2733 url_components
= compat_urlparse
.urlparse(socks_proxy
)
2734 if url_components
.scheme
.lower() == 'socks5':
2735 socks_type
= ProxyType
.SOCKS5
2736 elif url_components
.scheme
.lower() in ('socks', 'socks4'):
2737 socks_type
= ProxyType
.SOCKS4
2738 elif url_components
.scheme
.lower() == 'socks4a':
2739 socks_type
= ProxyType
.SOCKS4A
2741 def unquote_if_non_empty(s
):
2744 return compat_urllib_parse_unquote_plus(s
)
2748 url_components
.hostname
, url_components
.port
or 1080,
2750 unquote_if_non_empty(url_components
.username
),
2751 unquote_if_non_empty(url_components
.password
),
2754 class SocksConnection(base_class
):
2756 self
.sock
= sockssocket()
2757 self
.sock
.setproxy(*proxy_args
)
2758 if type(self
.timeout
) in (int, float):
2759 self
.sock
.settimeout(self
.timeout
)
2760 self
.sock
.connect((self
.host
, self
.port
))
2762 if isinstance(self
, compat_http_client
.HTTPSConnection
):
2763 if hasattr(self
, '_context'): # Python > 2.6
2764 self
.sock
= self
._context
.wrap_socket(
2765 self
.sock
, server_hostname
=self
.host
)
2767 self
.sock
= ssl
.wrap_socket(self
.sock
)
2769 return SocksConnection
2772 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        # Allow callers to inject a custom HTTPS connection class; fall
        # back to the stock compat_http_client.HTTPSConnection otherwise.
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params
2778 def https_open(self
, req
):
2780 conn_class
= self
._https
_conn
_class
2782 if hasattr(self
, '_context'): # python > 2.6
2783 kwargs
['context'] = self
._context
2784 if hasattr(self
, '_check_hostname'): # python 3.x
2785 kwargs
['check_hostname'] = self
._check
_hostname
2787 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2789 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2790 del req
.headers
['Ytdl-socks-proxy']
2792 return self
.do_open(functools
.partial(
2793 _create_http_connection
, self
, conn_class
, True),
2797 class YoutubeDLCookieJar(compat_cookiejar
.MozillaCookieJar
):
2799 See [1] for cookie file format.
2801 1. https://curl.haxx.se/docs/http-cookies.html
2803 _HTTPONLY_PREFIX
= '#HttpOnly_'
2805 _HEADER
= '''# Netscape HTTP Cookie File
2806 # This file is generated by yt-dlp. Do not edit.
2809 _CookieFileEntry
= collections
.namedtuple(
2811 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
2813 def save(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2815 Save cookies to a file.
2817 Most of the code is taken from CPython 3.8 and slightly adapted
2818 to support cookie files with UTF-8 in both python 2 and 3.
2820 if filename
is None:
2821 if self
.filename
is not None:
2822 filename
= self
.filename
2824 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2826 # Store session cookies with `expires` set to 0 instead of an empty
2829 if cookie
.expires
is None:
2832 with io
.open(filename
, 'w', encoding
='utf-8') as f
:
2833 f
.write(self
._HEADER
)
2836 if not ignore_discard
and cookie
.discard
:
2838 if not ignore_expires
and cookie
.is_expired(now
):
2844 if cookie
.domain
.startswith('.'):
2845 initial_dot
= 'TRUE'
2847 initial_dot
= 'FALSE'
2848 if cookie
.expires
is not None:
2849 expires
= compat_str(cookie
.expires
)
2852 if cookie
.value
is None:
2853 # cookies.txt regards 'Set-Cookie: foo' as a cookie
2854 # with no name, whereas http.cookiejar regards it as a
2855 # cookie with no value.
2860 value
= cookie
.value
2862 '\t'.join([cookie
.domain
, initial_dot
, cookie
.path
,
2863 secure
, expires
, name
, value
]) + '\n')
2865 def load(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2866 """Load cookies from a file."""
2867 if filename
is None:
2868 if self
.filename
is not None:
2869 filename
= self
.filename
2871 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2873 def prepare_line(line
):
2874 if line
.startswith(self
._HTTPONLY
_PREFIX
):
2875 line
= line
[len(self
._HTTPONLY
_PREFIX
):]
2876 # comments and empty lines are fine
2877 if line
.startswith('#') or not line
.strip():
2879 cookie_list
= line
.split('\t')
2880 if len(cookie_list
) != self
._ENTRY
_LEN
:
2881 raise compat_cookiejar
.LoadError('invalid length %d' % len(cookie_list
))
2882 cookie
= self
._CookieFileEntry
(*cookie_list
)
2883 if cookie
.expires_at
and not cookie
.expires_at
.isdigit():
2884 raise compat_cookiejar
.LoadError('invalid expires at %s' % cookie
.expires_at
)
2888 with io
.open(filename
, encoding
='utf-8') as f
:
2891 cf
.write(prepare_line(line
))
2892 except compat_cookiejar
.LoadError
as e
:
2894 'WARNING: skipping cookie file entry due to %s: %r\n'
2895 % (e
, line
), sys
.stderr
)
2898 self
._really
_load
(cf
, filename
, ignore_discard
, ignore_expires
)
2899 # Session cookies are denoted by either `expires` field set to
2900 # an empty string or 0. MozillaCookieJar only recognizes the former
2901 # (see [1]). So we need force the latter to be recognized as session
2902 # cookies on our own.
2903 # Session cookies may be important for cookies-based authentication,
2904 # e.g. usually, when user does not check 'Remember me' check box while
2905 # logging in on a site, some important cookies are stored as session
2906 # cookies so that not recognizing them will result in failed login.
2907 # 1. https://bugs.python.org/issue17164
2909 # Treat `expires=0` cookies as session cookies
2910 if cookie
.expires
== 0:
2911 cookie
.expires
= None
2912 cookie
.discard
= True
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor that also applies cookie handling to HTTPS traffic."""

    def __init__(self, cookiejar=None):
        # Explicit base-class call (not super()) for Python 2 compatibility.
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # NOTE: the workaround below is deliberately kept disabled:
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #         if set_cookie != set_cookie_escaped:
        #             del response.headers[set_cookie_header]
        #             response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Reuse the plain-HTTP hooks for HTTPS as well.
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
2939 class YoutubeDLRedirectHandler(compat_urllib_request
.HTTPRedirectHandler
):
2940 """YoutubeDL redirect handler
2942 The code is based on HTTPRedirectHandler implementation from CPython [1].
2944 This redirect handler solves two issues:
2945 - ensures redirect URL is always unicode under python 2
2946 - introduces support for experimental HTTP response status code
2947 308 Permanent Redirect [2] used by some sites [3]
2949 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
2950 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
2951 3. https://github.com/ytdl-org/youtube-dl/issues/28768
2954 http_error_301
= http_error_303
= http_error_307
= http_error_308
= compat_urllib_request
.HTTPRedirectHandler
.http_error_302
2956 def redirect_request(self
, req
, fp
, code
, msg
, headers
, newurl
):
2957 """Return a Request or None in response to a redirect.
2959 This is called by the http_error_30x methods when a
2960 redirection response is received. If a redirection should
2961 take place, return a new Request to allow http_error_30x to
2962 perform the redirect. Otherwise, raise HTTPError if no-one
2963 else should try to handle this url. Return None if you can't
2964 but another Handler might.
2966 m
= req
.get_method()
2967 if (not (code
in (301, 302, 303, 307, 308) and m
in ("GET", "HEAD")
2968 or code
in (301, 302, 303) and m
== "POST")):
2969 raise compat_HTTPError(req
.full_url
, code
, msg
, headers
, fp
)
2970 # Strictly (according to RFC 2616), 301 or 302 in response to
2971 # a POST MUST NOT cause a redirection without confirmation
2972 # from the user (of urllib.request, in this case). In practice,
2973 # essentially all clients do redirect in this case, so we do
2976 # On python 2 urlh.geturl() may sometimes return redirect URL
2977 # as byte string instead of unicode. This workaround allows
2978 # to force it always return unicode.
2979 if sys
.version_info
[0] < 3:
2980 newurl
= compat_str(newurl
)
2982 # Be conciliant with URIs containing a space. This is mainly
2983 # redundant with the more complete encoding done in http_error_302(),
2984 # but it is kept for compatibility with other callers.
2985 newurl
= newurl
.replace(' ', '%20')
2987 CONTENT_HEADERS
= ("content-length", "content-type")
2988 # NB: don't use dict comprehension for python 2.6 compatibility
2989 newheaders
= dict((k
, v
) for k
, v
in req
.headers
.items()
2990 if k
.lower() not in CONTENT_HEADERS
)
2991 return compat_urllib_request
.Request(
2992 newurl
, headers
=newheaders
, origin_req_host
=req
.origin_req_host
,
2996 def extract_timezone(date_str
):
2998 r
'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
3001 timezone
= datetime
.timedelta()
3003 date_str
= date_str
[:-len(m
.group('tz'))]
3004 if not m
.group('sign'):
3005 timezone
= datetime
.timedelta()
3007 sign
= 1 if m
.group('sign') == '+' else -1
3008 timezone
= datetime
.timedelta(
3009 hours
=sign
* int(m
.group('hours')),
3010 minutes
=sign
* int(m
.group('minutes')))
3011 return timezone
, date_str
3014 def parse_iso8601(date_str
, delimiter
='T', timezone
=None):
3015 """ Return a UNIX timestamp from the given date """
3017 if date_str
is None:
3020 date_str
= re
.sub(r
'\.[0-9]+', '', date_str
)
3022 if timezone
is None:
3023 timezone
, date_str
= extract_timezone(date_str
)
3026 date_format
= '%Y-%m-%d{0}%H:%M:%S'.format(delimiter
)
3027 dt
= datetime
.datetime
.strptime(date_str
, date_format
) - timezone
3028 return calendar
.timegm(dt
.timetuple())
def date_formats(day_first=True):
    """Return the list of date format strings to try, in day-first or month-first order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
3037 def unified_strdate(date_str
, day_first
=True):
3038 """Return a string with the date in the format YYYYMMDD"""
3040 if date_str
is None:
3044 date_str
= date_str
.replace(',', ' ')
3045 # Remove AM/PM + timezone
3046 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
3047 _
, date_str
= extract_timezone(date_str
)
3049 for expression
in date_formats(day_first
):
3051 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
3054 if upload_date
is None:
3055 timetuple
= email
.utils
.parsedate_tz(date_str
)
3058 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
3061 if upload_date
is not None:
3062 return compat_str(upload_date
)
3065 def unified_timestamp(date_str
, day_first
=True):
3066 if date_str
is None:
3069 date_str
= re
.sub(r
'[,|]', '', date_str
)
3071 pm_delta
= 12 if re
.search(r
'(?i)PM', date_str
) else 0
3072 timezone
, date_str
= extract_timezone(date_str
)
3074 # Remove AM/PM + timezone
3075 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
3077 # Remove unrecognized timezones from ISO 8601 alike timestamps
3078 m
= re
.search(r
'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str
)
3080 date_str
= date_str
[:-len(m
.group('tz'))]
3082 # Python only supports microseconds, so remove nanoseconds
3083 m
= re
.search(r
'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str
)
3085 date_str
= m
.group(1)
3087 for expression
in date_formats(day_first
):
3089 dt
= datetime
.datetime
.strptime(date_str
, expression
) - timezone
+ datetime
.timedelta(hours
=pm_delta
)
3090 return calendar
.timegm(dt
.timetuple())
3093 timetuple
= email
.utils
.parsedate_tz(date_str
)
3095 return calendar
.timegm(timetuple
) + pm_delta
* 3600
3098 def determine_ext(url
, default_ext
='unknown_video'):
3099 if url
is None or '.' not in url
:
3101 guess
= url
.partition('?')[0].rpartition('.')[2]
3102 if re
.match(r
'^[A-Za-z0-9]+$', guess
):
3104 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
3105 elif guess
.rstrip('/') in KNOWN_EXTENSIONS
:
3106 return guess
.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build a subtitle file name by swapping the extension for '<lang>.<format>'."""
    subtitle_ext = '%s.%s' % (sub_lang, sub_format)
    return replace_extension(filename, subtitle_ext, expected_real_ext)
3115 def datetime_from_str(date_str
, precision
='auto', format
='%Y%m%d'):
3117 Return a datetime object from a string in the format YYYYMMDD or
3118 (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
3120 format: string date format used to return datetime object from
3121 precision: round the time portion of a datetime object.
3122 auto|microsecond|second|minute|hour|day.
3123 auto: round to the unit provided in date_str (if applicable).
3125 auto_precision
= False
3126 if precision
== 'auto':
3127 auto_precision
= True
3128 precision
= 'microsecond'
3129 today
= datetime_round(datetime
.datetime
.now(), precision
)
3130 if date_str
in ('now', 'today'):
3132 if date_str
== 'yesterday':
3133 return today
- datetime
.timedelta(days
=1)
3135 r
'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
3137 if match
is not None:
3138 start_time
= datetime_from_str(match
.group('start'), precision
, format
)
3139 time
= int(match
.group('time')) * (-1 if match
.group('sign') == '-' else 1)
3140 unit
= match
.group('unit')
3141 if unit
== 'month' or unit
== 'year':
3142 new_date
= datetime_add_months(start_time
, time
* 12 if unit
== 'year' else time
)
3148 delta
= datetime
.timedelta(**{unit + 's': time}
)
3149 new_date
= start_time
+ delta
3151 return datetime_round(new_date
, unit
)
3154 return datetime_round(datetime
.datetime
.strptime(date_str
, format
), precision
)
def date_from_str(date_str, format='%Y%m%d'):
    """
    Return a datetime.date from a string in the format YYYYMMDD or
    (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: string date format used to parse date_str
    """
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
def datetime_add_months(dt, months):
    """Shift *dt* by *months* (may be negative), clamping the day to the target month's length."""
    zero_based_month = dt.month + months - 1
    new_year = dt.year + zero_based_month // 12
    new_month = zero_based_month % 12 + 1
    # Clamp e.g. Jan 31 + 1 month to Feb 28/29 rather than raising.
    last_day = calendar.monthrange(new_year, new_month)[1]
    new_day = min(dt.day, last_day)
    return dt.replace(new_year, new_month, new_day)
3176 def datetime_round(dt
, precision
='day'):
3178 Round a datetime object's time to a specific precision
3180 if precision
== 'microsecond':
3189 roundto
= lambda x
, n
: ((x
+ n
/ 2) // n
) * n
3190 timestamp
= calendar
.timegm(dt
.timetuple())
3191 return datetime
.datetime
.utcfromtimestamp(roundto(timestamp
, unit_seconds
[precision
]))
3194 def hyphenate_date(date_str
):
3196 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
3197 match
= re
.match(r
'^(\d\d\d\d)(\d\d)(\d\d)$', date_str
)
3198 if match
is not None:
3199 return '-'.join(match
.groups())
3204 class DateRange(object):
3205 """Represents a time interval between two dates"""
3207 def __init__(self
, start
=None, end
=None):
3208 """start and end must be strings in the format accepted by date"""
3209 if start
is not None:
3210 self
.start
= date_from_str(start
)
3212 self
.start
= datetime
.datetime
.min.date()
3214 self
.end
= date_from_str(end
)
3216 self
.end
= datetime
.datetime
.max.date()
3217 if self
.start
> self
.end
:
3218 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
3222 """Returns a range that only contains the given day"""
3223 return cls(day
, day
)
3225 def __contains__(self
, date
):
3226 """Check if the date is in the range"""
3227 if not isinstance(date
, datetime
.date
):
3228 date
= date_from_str(date
)
3229 return self
.start
<= date
<= self
.end
3232 return '%s - %s' % (self
.start
.isoformat(), self
.end
.isoformat())
3235 def platform_name():
3236 """ Returns the platform name as a compat_str """
3237 res
= platform
.platform()
3238 if isinstance(res
, bytes):
3239 res
= res
.decode(preferredencoding())
3241 assert isinstance(res
, compat_str
)
3245 def _windows_write_string(s
, out
):
3246 """ Returns True if the string was written using special methods,
3247 False if it has yet to be written out."""
3248 # Adapted from http://stackoverflow.com/a/3259271/35070
3251 import ctypes
.wintypes
3259 fileno
= out
.fileno()
3260 except AttributeError:
3261 # If the output stream doesn't have a fileno, it's virtual
3263 except io
.UnsupportedOperation
:
3264 # Some strange Windows pseudo files?
3266 if fileno
not in WIN_OUTPUT_IDS
:
3269 GetStdHandle
= compat_ctypes_WINFUNCTYPE(
3270 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
3271 ('GetStdHandle', ctypes
.windll
.kernel32
))
3272 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
3274 WriteConsoleW
= compat_ctypes_WINFUNCTYPE(
3275 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
3276 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
3277 ctypes
.wintypes
.LPVOID
)(('WriteConsoleW', ctypes
.windll
.kernel32
))
3278 written
= ctypes
.wintypes
.DWORD(0)
3280 GetFileType
= compat_ctypes_WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)(('GetFileType', ctypes
.windll
.kernel32
))
3281 FILE_TYPE_CHAR
= 0x0002
3282 FILE_TYPE_REMOTE
= 0x8000
3283 GetConsoleMode
= compat_ctypes_WINFUNCTYPE(
3284 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
3285 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
3286 ('GetConsoleMode', ctypes
.windll
.kernel32
))
3287 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
3289 def not_a_console(handle
):
3290 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
3292 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
3293 or GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
3295 if not_a_console(h
):
3298 def next_nonbmp_pos(s
):
3300 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
3301 except StopIteration:
3305 count
= min(next_nonbmp_pos(s
), 1024)
3307 ret
= WriteConsoleW(
3308 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
3310 raise OSError('Failed to write string')
3311 if not count
: # We just wrote a non-BMP character
3312 assert written
.value
== 2
3315 assert written
.value
> 0
3316 s
= s
[written
.value
:]
3320 def write_string(s
, out
=None, encoding
=None):
3323 assert type(s
) == compat_str
3325 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
3326 if _windows_write_string(s
, out
):
3329 if ('b' in getattr(out
, 'mode', '')
3330 or sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
3331 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
3333 elif hasattr(out
, 'buffer'):
3334 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
3335 byt
= s
.encode(enc
, 'ignore')
3336 out
.buffer.write(byt
)
3342 def bytes_to_intlist(bs
):
3345 if isinstance(bs
[0], int): # Python 3
3348 return [ord(c
) for c
in bs
]
3351 def intlist_to_bytes(xs
):
3354 return compat_struct_pack('%dB' % len(xs
), *xs
)
3357 # Cross-platform file locking
3358 if sys
.platform
== 'win32':
3359 import ctypes
.wintypes
3362 class OVERLAPPED(ctypes
.Structure
):
3364 ('Internal', ctypes
.wintypes
.LPVOID
),
3365 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
3366 ('Offset', ctypes
.wintypes
.DWORD
),
3367 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
3368 ('hEvent', ctypes
.wintypes
.HANDLE
),
3371 kernel32
= ctypes
.windll
.kernel32
3372 LockFileEx
= kernel32
.LockFileEx
3373 LockFileEx
.argtypes
= [
3374 ctypes
.wintypes
.HANDLE
, # hFile
3375 ctypes
.wintypes
.DWORD
, # dwFlags
3376 ctypes
.wintypes
.DWORD
, # dwReserved
3377 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3378 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3379 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3381 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
3382 UnlockFileEx
= kernel32
.UnlockFileEx
3383 UnlockFileEx
.argtypes
= [
3384 ctypes
.wintypes
.HANDLE
, # hFile
3385 ctypes
.wintypes
.DWORD
, # dwReserved
3386 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3387 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3388 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3390 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
3391 whole_low
= 0xffffffff
3392 whole_high
= 0x7fffffff
3394 def _lock_file(f
, exclusive
):
3395 overlapped
= OVERLAPPED()
3396 overlapped
.Offset
= 0
3397 overlapped
.OffsetHigh
= 0
3398 overlapped
.hEvent
= 0
3399 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
3400 handle
= msvcrt
.get_osfhandle(f
.fileno())
3401 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
3402 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3403 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
3405 def _unlock_file(f
):
3406 assert f
._lock
_file
_overlapped
_p
3407 handle
= msvcrt
.get_osfhandle(f
.fileno())
3408 if not UnlockFileEx(handle
, 0,
3409 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3410 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
3413 # Some platforms, such as Jython, is missing fcntl
3417 def _lock_file(f
, exclusive
):
3418 fcntl
.flock(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
3420 def _unlock_file(f
):
3421 fcntl
.flock(f
, fcntl
.LOCK_UN
)
3423 UNSUPPORTED_MSG
= 'file locking is not supported on this platform'
3425 def _lock_file(f
, exclusive
):
3426 raise IOError(UNSUPPORTED_MSG
)
3428 def _unlock_file(f
):
3429 raise IOError(UNSUPPORTED_MSG
)
3432 class locked_file(object):
3433 def __init__(self
, filename
, mode
, encoding
=None):
3434 assert mode
in ['r', 'a', 'w']
3435 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
3438 def __enter__(self
):
3439 exclusive
= self
.mode
!= 'r'
3441 _lock_file(self
.f
, exclusive
)
3447 def __exit__(self
, etype
, value
, traceback
):
3449 _unlock_file(self
.f
)
    def write(self, *args):
        # Forward to the wrapped file object; locking is handled by __enter__/__exit__.
        return self.f.write(*args)
    def read(self, *args):
        # Forward to the wrapped file object; locking is handled by __enter__/__exit__.
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' when it is None (Python 2)."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
3468 def shell_quote(args
):
3470 encoding
= get_filesystem_encoding()
3472 if isinstance(a
, bytes):
3473 # We may get a filename encoded with 'encodeFilename'
3474 a
= a
.decode(encoding
)
3475 quoted_args
.append(compat_shlex_quote(a
))
3476 return ' '.join(quoted_args
)
3479 def smuggle_url(url
, data
):
3480 """ Pass additional data in a URL for internal use. """
3482 url
, idata
= unsmuggle_url(url
, {})
3484 sdata
= compat_urllib_parse_urlencode(
3485 {'__youtubedl_smuggle': json.dumps(data)}
)
3486 return url
+ '#' + sdata
3489 def unsmuggle_url(smug_url
, default
=None):
3490 if '#__youtubedl_smuggle' not in smug_url
:
3491 return smug_url
, default
3492 url
, _
, sdata
= smug_url
.rpartition('#')
3493 jsond
= compat_parse_qs(sdata
)['__youtubedl_smuggle'][0]
3494 data
= json
.loads(jsond
)
3498 def format_bytes(bytes):
3501 if type(bytes) is str:
3502 bytes = float(bytes)
3506 exponent
= int(math
.log(bytes, 1024.0))
3507 suffix
= ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent
]
3508 converted
= float(bytes) / float(1024 ** exponent
)
3509 return '%.2f%s' % (converted
, suffix
)
3512 def lookup_unit_table(unit_table
, s
):
3513 units_re
= '|'.join(re
.escape(u
) for u
in unit_table
)
3515 r
'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re
, s
)
3518 num_str
= m
.group('num').replace(',', '.')
3519 mult
= unit_table
[m
.group('unit')]
3520 return int(float(num_str
) * mult
)
3523 def parse_filesize(s
):
3527 # The lower-case forms are of course incorrect and unofficial,
3528 # but we support those too
3545 'megabytes': 1000 ** 2,
3546 'mebibytes': 1024 ** 2,
3552 'gigabytes': 1000 ** 3,
3553 'gibibytes': 1024 ** 3,
3559 'terabytes': 1000 ** 4,
3560 'tebibytes': 1024 ** 4,
3566 'petabytes': 1000 ** 5,
3567 'pebibytes': 1024 ** 5,
3573 'exabytes': 1000 ** 6,
3574 'exbibytes': 1024 ** 6,
3580 'zettabytes': 1000 ** 7,
3581 'zebibytes': 1024 ** 7,
3587 'yottabytes': 1000 ** 8,
3588 'yobibytes': 1024 ** 8,
3591 return lookup_unit_table(_UNIT_TABLE
, s
)
3600 if re
.match(r
'^[\d,.]+$', s
):
3601 return str_to_int(s
)
3612 return lookup_unit_table(_UNIT_TABLE
, s
)
3615 def parse_resolution(s
):
3619 mobj
= re
.search(r
'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s
)
3622 'width': int(mobj
.group('w')),
3623 'height': int(mobj
.group('h')),
3626 mobj
= re
.search(r
'\b(\d+)[pPiI]\b', s
)
3628 return {'height': int(mobj.group(1))}
3630 mobj
= re
.search(r
'\b([48])[kK]\b', s
)
3632 return {'height': int(mobj.group(1)) * 540}
3637 def parse_bitrate(s
):
3638 if not isinstance(s
, compat_str
):
3640 mobj
= re
.search(r
'\b(\d+)\s*kbps', s
)
3642 return int(mobj
.group(1))
3645 def month_by_name(name
, lang
='en'):
3646 """ Return the number of a month by (locale-independently) English name """
3648 month_names
= MONTH_NAMES
.get(lang
, MONTH_NAMES
['en'])
3651 return month_names
.index(name
) + 1
3656 def month_by_abbreviation(abbrev
):
3657 """ Return the number of a month by (locale-independently) English
3661 return [s
[:3] for s
in ENGLISH_MONTH_NAMES
].index(abbrev
) + 1
3666 def fix_xml_ampersands(xml_str
):
3667 """Replace all the '&' by '&' in XML"""
3669 r
'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
3674 def setproctitle(title
):
3675 assert isinstance(title
, compat_str
)
3677 # ctypes in Jython is not complete
3678 # http://bugs.jython.org/issue2148
3679 if sys
.platform
.startswith('java'):
3683 libc
= ctypes
.cdll
.LoadLibrary('libc.so.6')
3687 # LoadLibrary in Windows Python 2.7.13 only expects
3688 # a bytestring, but since unicode_literals turns
3689 # every string into a unicode string, it fails.
3691 title_bytes
= title
.encode('utf-8')
3692 buf
= ctypes
.create_string_buffer(len(title_bytes
))
3693 buf
.value
= title_bytes
3695 libc
.prctl(15, buf
, 0, 0, 0)
3696 except AttributeError:
3697 return # Strange libc, just skip this
def remove_start(s, start):
    """Return *s* with prefix *start* removed when present; None passes through unchanged."""
    if s is None:
        return s
    if not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Return *s* with suffix *end* removed when present; None passes through unchanged.

    Bug fix: the original one-liner returned ``s[:-len(end)]`` whenever
    ``s.endswith(end)`` — but every string ends with '', and ``s[:-0]`` is
    ``s[:0]``, so an empty *end* wrongly truncated the whole string to ''.
    An empty suffix is now an explicit no-op.
    """
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
3708 def remove_quotes(s
):
3709 if s
is None or len(s
) < 2:
3711 for quote
in ('"', "'", ):
3712 if s
[0] == quote
and s
[-1] == quote
:
def get_domain(url):
    """Extract the domain of *url* (scheme and a leading 'www.' ignored), or None on no match."""
    matched = re.match(
        r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if not matched:
        return None
    return matched.group('domain')
def url_basename(url):
    """Return the last path segment of *url* ('' when the path is empty)."""
    url_path = compat_urlparse.urlparse(url).path
    segments = url_path.strip('/').split('/')
    return segments[-1]
3728 return re
.match(r
'https?://[^?#&]+/', url
).group()
3731 def urljoin(base
, path
):
3732 if isinstance(path
, bytes):
3733 path
= path
.decode('utf-8')
3734 if not isinstance(path
, compat_str
) or not path
:
3736 if re
.match(r
'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path
):
3738 if isinstance(base
, bytes):
3739 base
= base
.decode('utf-8')
3740 if not isinstance(base
, compat_str
) or not re
.match(
3741 r
'^(?:https?:)?//', base
):
3743 return compat_urlparse
.urljoin(base
, path
)
3746 class HEADRequest(compat_urllib_request
.Request
):
3747 def get_method(self
):
3751 class PUTRequest(compat_urllib_request
.Request
):
3752 def get_method(self
):
3756 def int_or_none(v
, scale
=1, default
=None, get_attr
=None, invscale
=1):
3759 v
= getattr(v
, get_attr
, None)
3765 return int(v
) * invscale
// scale
3766 except (ValueError, TypeError):
def str_or_none(v, default=None):
    """Coerce *v* to compat_str, or return *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
3774 def str_to_int(int_str
):
3775 """ A more relaxed version of int_or_none """
3776 if isinstance(int_str
, compat_integer_types
):
3778 elif isinstance(int_str
, compat_str
):
3779 int_str
= re
.sub(r
'[,\.\+]', '', int_str
)
3780 return int_or_none(int_str
)
3783 def float_or_none(v
, scale
=1, invscale
=1, default
=None):
3787 return float(v
) * invscale
/ scale
3788 except (ValueError, TypeError):
def bool_or_none(v, default=None):
    """Return *v* only when it is a genuine bool; anything else (including 0/1) yields *default*."""
    if isinstance(v, bool):
        return v
    return default
def strip_or_none(v, default=None):
    """Return v.strip() for string values; *default* for anything else (including None)."""
    if isinstance(v, compat_str):
        return v.strip()
    return default
3800 def url_or_none(url
):
3801 if not url
or not isinstance(url
, compat_str
):
3804 return url
if re
.match(r
'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url
) else None
3807 def strftime_or_none(timestamp
, date_format
, default
=None):
3808 datetime_object
= None
3810 if isinstance(timestamp
, compat_numeric_types
): # unix timestamp
3811 datetime_object
= datetime
.datetime
.utcfromtimestamp(timestamp
)
3812 elif isinstance(timestamp
, compat_str
): # assume YYYYMMDD
3813 datetime_object
= datetime
.datetime
.strptime(timestamp
, '%Y%m%d')
3814 return datetime_object
.strftime(date_format
)
3815 except (ValueError, TypeError, AttributeError):
3819 def parse_duration(s
):
3820 if not isinstance(s
, compat_basestring
):
3825 days
, hours
, mins
, secs
, ms
= [None] * 5
3826 m
= re
.match(r
'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s
)
3828 days
, hours
, mins
, secs
, ms
= m
.groups()
3833 [0-9]+\s*y(?:ears?)?\s*
3836 [0-9]+\s*m(?:onths?)?\s*
3839 [0-9]+\s*w(?:eeks?)?\s*
3842 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
3846 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
3849 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
3852 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
3855 days
, hours
, mins
, secs
, ms
= m
.groups()
3857 m
= re
.match(r
'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s
)
3859 hours
, mins
= m
.groups()
3865 duration
+= float(secs
)
3867 duration
+= float(mins
) * 60
3869 duration
+= float(hours
) * 60 * 60
3871 duration
+= float(days
) * 24 * 60 * 60
3873 duration
+= float(ms
)
3877 def prepend_extension(filename
, ext
, expected_real_ext
=None):
3878 name
, real_ext
= os
.path
.splitext(filename
)
3880 '{0}.{1}{2}'.format(name
, ext
, real_ext
)
3881 if not expected_real_ext
or real_ext
[1:] == expected_real_ext
3882 else '{0}.{1}'.format(filename
, ext
))
3885 def replace_extension(filename
, ext
, expected_real_ext
=None):
3886 name
, real_ext
= os
.path
.splitext(filename
)
3887 return '{0}.{1}'.format(
3888 name
if not expected_real_ext
or real_ext
[1:] == expected_real_ext
else filename
,
3892 def check_executable(exe
, args
=[]):
3893 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
3894 args can be a list of arguments for a short output (like -version) """
3896 process_communicate_or_kill(subprocess
.Popen(
3897 [exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
))
3903 def get_exe_version(exe
, args
=['--version'],
3904 version_re
=None, unrecognized
='present'):
3905 """ Returns the version of the specified executable,
3906 or False if the executable is not present """
3908 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
3909 # SIGTTOU if yt-dlp is run in the background.
3910 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
3911 out
, _
= process_communicate_or_kill(subprocess
.Popen(
3912 [encodeArgument(exe
)] + args
,
3913 stdin
=subprocess
.PIPE
,
3914 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
))
3917 if isinstance(out
, bytes): # Python 2.x
3918 out
= out
.decode('ascii', 'ignore')
3919 return detect_exe_version(out
, version_re
, unrecognized
)
3922 def detect_exe_version(output
, version_re
=None, unrecognized
='present'):
3923 assert isinstance(output
, compat_str
)
3924 if version_re
is None:
3925 version_re
= r
'version\s+([-0-9._a-zA-Z]+)'
3926 m
= re
.search(version_re
, output
)
3933 class PagedList(object):
3935 # This is only useful for tests
3936 return len(self
.getslice())
3939 class OnDemandPagedList(PagedList
):
3940 def __init__(self
, pagefunc
, pagesize
, use_cache
=True):
3941 self
._pagefunc
= pagefunc
3942 self
._pagesize
= pagesize
3943 self
._use
_cache
= use_cache
3947 def getslice(self
, start
=0, end
=None):
3949 for pagenum
in itertools
.count(start
// self
._pagesize
):
3950 firstid
= pagenum
* self
._pagesize
3951 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
3952 if start
>= nextfirstid
:
3957 page_results
= self
._cache
.get(pagenum
)
3958 if page_results
is None:
3959 page_results
= list(self
._pagefunc
(pagenum
))
3961 self
._cache
[pagenum
] = page_results
3964 start
% self
._pagesize
3965 if firstid
<= start
< nextfirstid
3969 ((end
- 1) % self
._pagesize
) + 1
3970 if (end
is not None and firstid
<= end
<= nextfirstid
)
3973 if startv
!= 0 or endv
is not None:
3974 page_results
= page_results
[startv
:endv
]
3975 res
.extend(page_results
)
3977 # A little optimization - if current page is not "full", ie. does
3978 # not contain page_size videos then we can assume that this page
3979 # is the last one - there are no more ids on further pages -
3980 # i.e. no need to query again.
3981 if len(page_results
) + startv
< self
._pagesize
:
3984 # If we got the whole page, but the next page is not interesting,
3985 # break out early as well
3986 if end
== nextfirstid
:
3991 class InAdvancePagedList(PagedList
):
    def __init__(self, pagefunc, pagecount, pagesize):
        # pagefunc: callable(pagenum) -> iterable of entries for that page
        # pagecount: number of pages, known in advance (used as the slice bound)
        # pagesize: number of entries per page
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize
3997 def getslice(self
, start
=0, end
=None):
3999 start_page
= start
// self
._pagesize
4001 self
._pagecount
if end
is None else (end
// self
._pagesize
+ 1))
4002 skip_elems
= start
- start_page
* self
._pagesize
4003 only_more
= None if end
is None else end
- start
4004 for pagenum
in range(start_page
, end_page
):
4005 page
= list(self
._pagefunc
(pagenum
))
4007 page
= page
[skip_elems
:]
4009 if only_more
is not None:
4010 if len(page
) < only_more
:
4011 only_more
-= len(page
)
4013 page
= page
[:only_more
]
4020 def uppercase_escape(s
):
4021 unicode_escape
= codecs
.getdecoder('unicode_escape')
4023 r
'\\U[0-9a-fA-F]{8}',
4024 lambda m
: unicode_escape(m
.group(0))[0],
4028 def lowercase_escape(s
):
4029 unicode_escape
= codecs
.getdecoder('unicode_escape')
4031 r
'\\u[0-9a-fA-F]{4}',
4032 lambda m
: unicode_escape(m
.group(0))[0],
def escape_rfc3986(s):
    """Percent-encode non-ASCII characters as suggested by RFC 3986."""
    # On Python 2, quote() needs a byte string; text input is encoded first.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # Reserved characters and sub-delimiters that must survive unescaped.
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
4043 def escape_url(url
):
4044 """Escape URL as suggested by RFC 3986"""
4045 url_parsed
= compat_urllib_parse_urlparse(url
)
4046 return url_parsed
._replace
(
4047 netloc
=url_parsed
.netloc
.encode('idna').decode('ascii'),
4048 path
=escape_rfc3986(url_parsed
.path
),
4049 params
=escape_rfc3986(url_parsed
.params
),
4050 query
=escape_rfc3986(url_parsed
.query
),
4051 fragment
=escape_rfc3986(url_parsed
.fragment
)
4055 def read_batch_urls(batch_fd
):
4057 if not isinstance(url
, compat_str
):
4058 url
= url
.decode('utf-8', 'replace')
4059 BOM_UTF8
= ('\xef\xbb\xbf', '\ufeff')
4060 for bom
in BOM_UTF8
:
4061 if url
.startswith(bom
):
4062 url
= url
[len(bom
):]
4064 if not url
or url
.startswith(('#', ';', ']')):
4066 # "#" cannot be stripped out since it is part of the URI
4067 # However, it can be safely stipped out if follwing a whitespace
4068 return re
.split(r
'\s#', url
, 1)[0].rstrip()
4070 with contextlib
.closing(batch_fd
) as fd
:
4071 return [url
for url
in map(fixup
, fd
) if url
]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes.

    Thin wrapper around compat_urllib_parse_urlencode; all arguments are
    forwarded unchanged.
    """
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
4078 def update_url_query(url
, query
):
4081 parsed_url
= compat_urlparse
.urlparse(url
)
4082 qs
= compat_parse_qs(parsed_url
.query
)
4084 return compat_urlparse
.urlunparse(parsed_url
._replace
(
4085 query
=compat_urllib_parse_urlencode(qs
, True)))
4088 def update_Request(req
, url
=None, data
=None, headers
={}, query={}
):
4089 req_headers
= req
.headers
.copy()
4090 req_headers
.update(headers
)
4091 req_data
= data
or req
.data
4092 req_url
= update_url_query(url
or req
.get_full_url(), query
)
4093 req_get_method
= req
.get_method()
4094 if req_get_method
== 'HEAD':
4095 req_type
= HEADRequest
4096 elif req_get_method
== 'PUT':
4097 req_type
= PUTRequest
4099 req_type
= compat_urllib_request
.Request
4101 req_url
, data
=req_data
, headers
=req_headers
,
4102 origin_req_host
=req
.origin_req_host
, unverifiable
=req
.unverifiable
)
4103 if hasattr(req
, 'timeout'):
4104 new_req
.timeout
= req
.timeout
4108 def _multipart_encode_impl(data
, boundary
):
4109 content_type
= 'multipart/form-data; boundary=%s' % boundary
4112 for k
, v
in data
.items():
4113 out
+= b
'--' + boundary
.encode('ascii') + b
'\r\n'
4114 if isinstance(k
, compat_str
):
4115 k
= k
.encode('utf-8')
4116 if isinstance(v
, compat_str
):
4117 v
= v
.encode('utf-8')
4118 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
4119 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
4120 content
= b
'Content-Disposition: form-data; name="' + k
+ b
'"\r\n\r\n' + v
+ b
'\r\n'
4121 if boundary
.encode('ascii') in content
:
4122 raise ValueError('Boundary overlaps with data')
4125 out
+= b
'--' + boundary
.encode('ascii') + b
'--\r\n'
4127 return out
, content_type
4130 def multipart_encode(data
, boundary
=None):
4132 Encode a dict to RFC 7578-compliant form-data
4135 A dict where keys and values can be either Unicode or bytes-like
4138 If specified a Unicode object, it's used as the boundary. Otherwise
4139 a random boundary is generated.
4141 Reference: https://tools.ietf.org/html/rfc7578
4143 has_specified_boundary
= boundary
is not None
4146 if boundary
is None:
4147 boundary
= '---------------' + str(random
.randrange(0x0fffffff, 0xffffffff))
4150 out
, content_type
= _multipart_encode_impl(data
, boundary
)
4153 if has_specified_boundary
:
4157 return out
, content_type
4160 def dict_get(d
, key_or_keys
, default
=None, skip_false_values
=True):
4161 if isinstance(key_or_keys
, (list, tuple)):
4162 for key
in key_or_keys
:
4163 if key
not in d
or d
[key
] is None or skip_false_values
and not d
[key
]:
4167 return d
.get(key_or_keys
, default
)
4170 def try_get(src
, getter
, expected_type
=None):
4171 if not isinstance(getter
, (list, tuple)):
4176 except (AttributeError, KeyError, TypeError, IndexError):
4179 if expected_type
is None or isinstance(v
, expected_type
):
4183 def merge_dicts(*dicts
):
4185 for a_dict
in dicts
:
4186 for k
, v
in a_dict
.items():
4190 or (isinstance(v
, compat_str
) and v
4191 and isinstance(merged
[k
], compat_str
)
4192 and not merged
[k
])):
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as text, decoding byte strings with `encoding`.

    NOTE: the `encoding` default is evaluated once, at module import time
    (preferredencoding() is called when the `def` executes).
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
4210 TV_PARENTAL_GUIDELINES
= {
4220 def parse_age_limit(s
):
4222 return s
if 0 <= s
<= 21 else None
4223 if not isinstance(s
, compat_basestring
):
4225 m
= re
.match(r
'^(?P<age>\d{1,2})\+?$', s
)
4227 return int(m
.group('age'))
4230 return US_RATINGS
[s
]
4231 m
= re
.match(r
'^TV[_-]?(%s)$' % '|'.join(k
[3:] for k
in TV_PARENTAL_GUIDELINES
), s
)
4233 return TV_PARENTAL_GUIDELINES
['TV-' + m
.group(1)]
4237 def strip_jsonp(code
):
4240 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
4241 (?:\s*&&\s*(?P=func_name))?
4242 \s*\(\s*(?P<callback_data>.*)\);?
4243 \s*?(?://[^\n]*)*$''',
4244 r
'\g<callback_data>', code
)
4247 def js_to_json(code
, vars={}):
4248 # vars is a dict of var, val pairs to substitute
4249 COMMENT_RE
= r
'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
4250 SKIP_RE
= r
'\s*(?:{comment})?\s*'.format(comment
=COMMENT_RE
)
4252 (r
'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip
=SKIP_RE
), 16),
4253 (r
'(?s)^(0+[0-7]+){skip}:?$'.format(skip
=SKIP_RE
), 8),
4258 if v
in ('true', 'false', 'null'):
4260 elif v
.startswith('/*') or v
.startswith('//') or v
.startswith('!') or v
== ',':
4263 if v
[0] in ("'", '"'):
4264 v
= re
.sub(r
'(?s)\\.|"', lambda m
: {
4269 }.get(m
.group(0), m
.group(0)), v
[1:-1])
4271 for regex
, base
in INTEGER_TABLE
:
4272 im
= re
.match(regex
, v
)
4274 i
= int(im
.group(1), base
)
4275 return '"%d":' % i
if v
.endswith(':') else '%d' % i
4282 return re
.sub(r
'''(?sx)
4283 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
4284 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4285 {comment}|,(?={skip}[\]}}])|
4286 (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
4287 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
4290 '''.format(comment
=COMMENT_RE
, skip
=SKIP_RE
), fix_kv
, code
)
4293 def qualities(quality_ids
):
4294 """ Get a numeric quality value out of a list of possible values """
4297 return quality_ids
.index(qid
)
4304 'default': '%(title)s [%(id)s].%(ext)s',
4305 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
4311 'description': 'description',
4312 'annotation': 'annotations.xml',
4313 'infojson': 'info.json',
4314 'pl_description': 'description',
4315 'pl_infojson': 'info.json',
4318 # As of [1] format syntax is:
4319 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
4320 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
4321 FORMAT_RE
= r
'''(?x)
4324 \({0}\) # mapping key
4325 (?:[#0\-+ ]+)? # conversion flags (optional)
4326 (?:\d+)? # minimum field width (optional)
4327 (?:\.\d+)? # precision (optional)
4328 [hlL]? # length modifier (optional)
4329 (?P<type>[diouxXeEfFgGcrs%]) # conversion type
4333 def limit_length(s
, length
):
4334 """ Add ellipses to overly long strings """
4339 return s
[:length
- len(ELLIPSES
)] + ELLIPSES
def version_tuple(v):
    """Split a version string on '.' or '-' into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
4347 def is_outdated_version(version
, limit
, assume_new
=True):
4349 return not assume_new
4351 return version_tuple(version
) < version_tuple(limit
)
4353 return not assume_new
4356 def ytdl_is_updateable():
4357 """ Returns if yt-dlp can be updated with -U """
4360 from zipimport
import zipimporter
4362 return isinstance(globals().get('__loader__'), zipimporter
) or hasattr(sys
, 'frozen')
def args_to_str(args):
    """Render a subprocess argument list as one shell-quoted string."""
    quoted = (compat_shlex_quote(a) for a in args)
    return ' '.join(quoted)
4370 def error_to_compat_str(err
):
4372 # On python 2 error byte string must be decoded with proper
4373 # encoding rather than ascii
4374 if sys
.version_info
[0] < 3:
4375 err_str
= err_str
.decode(preferredencoding())
4379 def mimetype2ext(mt
):
4385 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
4386 # it's the most popular one
4387 'audio/mpeg': 'mp3',
4388 'audio/x-wav': 'wav',
4393 _
, _
, res
= mt
.rpartition('/')
4394 res
= res
.split(';')[0].strip().lower()
4398 'smptett+xml': 'tt',
4402 'x-mp4-fragmented': 'mp4',
4403 'x-ms-sami': 'sami',
4406 'x-mpegurl': 'm3u8',
4407 'vnd.apple.mpegurl': 'm3u8',
4411 'vnd.ms-sstr+xml': 'ism',
4418 def parse_codecs(codecs_str
):
4419 # http://tools.ietf.org/html/rfc6381
4422 split_codecs
= list(filter(None, map(
4423 lambda str: str.strip(), codecs_str
.strip().strip(',').split(','))))
4424 vcodec
, acodec
= None, None
4425 for full_codec
in split_codecs
:
4426 codec
= full_codec
.split('.')[0]
4427 if codec
in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'):
4430 elif codec
in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
4434 write_string('WARNING: Unknown codec %s\n' % full_codec
, sys
.stderr
)
4435 if not vcodec
and not acodec
:
4436 if len(split_codecs
) == 2:
4438 'vcodec': split_codecs
[0],
4439 'acodec': split_codecs
[1],
4443 'vcodec': vcodec
or 'none',
4444 'acodec': acodec
or 'none',
4449 def urlhandle_detect_ext(url_handle
):
4450 getheader
= url_handle
.headers
.get
4452 cd
= getheader('Content-Disposition')
4454 m
= re
.match(r
'attachment;\s*filename="(?P<filename>[^"]+)"', cd
)
4456 e
= determine_ext(m
.group('filename'), default_ext
=None)
4460 return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 base64 data: URI from bytes and a MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:{0};base64,{1}'.format(mime_type, payload)
4467 def age_restricted(content_limit
, age_limit
):
4468 """ Returns True iff the content should be blocked """
4470 if age_limit
is None: # No limit set
4472 if content_limit
is None:
4473 return False # Content available for everyone
4474 return age_limit
< content_limit
4477 def is_html(first_bytes
):
4478 """ Detect whether a file contains HTML by examining its first bytes. """
4481 (b
'\xef\xbb\xbf', 'utf-8'),
4482 (b
'\x00\x00\xfe\xff', 'utf-32-be'),
4483 (b
'\xff\xfe\x00\x00', 'utf-32-le'),
4484 (b
'\xff\xfe', 'utf-16-le'),
4485 (b
'\xfe\xff', 'utf-16-be'),
4487 for bom
, enc
in BOMS
:
4488 if first_bytes
.startswith(bom
):
4489 s
= first_bytes
[len(bom
):].decode(enc
, 'replace')
4492 s
= first_bytes
.decode('utf-8', 'replace')
4494 return re
.match(r
'^\s*<', s
)
4497 def determine_protocol(info_dict
):
4498 protocol
= info_dict
.get('protocol')
4499 if protocol
is not None:
4502 url
= info_dict
['url']
4503 if url
.startswith('rtmp'):
4505 elif url
.startswith('mms'):
4507 elif url
.startswith('rtsp'):
4510 ext
= determine_ext(url
)
4516 return compat_urllib_parse_urlparse(url
).scheme
4519 def render_table(header_row
, data
, delim
=False, extraGap
=0, hideEmpty
=False):
4520 """ Render a list of rows, each as a list of values """
4522 def get_max_lens(table
):
4523 return [max(len(compat_str(v
)) for v
in col
) for col
in zip(*table
)]
4525 def filter_using_list(row
, filterArray
):
4526 return [col
for (take
, col
) in zip(filterArray
, row
) if take
]
4529 max_lens
= get_max_lens(data
)
4530 header_row
= filter_using_list(header_row
, max_lens
)
4531 data
= [filter_using_list(row
, max_lens
) for row
in data
]
4533 table
= [header_row
] + data
4534 max_lens
= get_max_lens(table
)
4536 table
= [header_row
] + [['-' * ml
for ml
in max_lens
]] + data
4537 format_str
= ' '.join('%-' + compat_str(ml
+ extraGap
) + 's' for ml
in max_lens
[:-1]) + ' %s'
4538 return '\n'.join(format_str
% tuple(row
) for row
in table
)
4541 def _match_one(filter_part
, dct
):
4542 COMPARISON_OPERATORS
= {
4550 operator_rex
= re
.compile(r
'''(?x)\s*
4552 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
4554 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
4555 (?P<quote>["\'])(?P
<quotedstrval
>(?
:\\.|
(?
!(?P
=quote
)|
\\).)+?
)(?P
=quote
)|
4556 (?P
<strval
>(?
![0-9.])[a
-z0
-9A
-Z
]*)
4559 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
4560 m = operator_rex.search(filter_part)
4562 op = COMPARISON_OPERATORS[m.group('op')]
4563 actual_value = dct.get(m.group('key'))
4564 if (m.group('quotedstrval') is not None
4565 or m.group('strval') is not None
4566 # If the original field is a string and matching comparisonvalue is
4567 # a number we should respect the origin of the original field
4568 # and process comparison value as a string (see
4569 # https://github.com/ytdl-org/youtube-dl/issues/11082).
4570 or actual_value is not None and m.group('intval') is not None
4571 and isinstance(actual_value, compat_str)):
4572 if m.group('op') not in ('=', '!='):
4574 'Operator %s does not support string values!' % m.group('op'))
4575 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
4576 quote = m.group('quote')
4577 if quote is not None:
4578 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
4581 comparison_value = int(m.group('intval'))
4583 comparison_value = parse_filesize(m.group('intval'))
4584 if comparison_value is None:
4585 comparison_value = parse_filesize(m.group('intval') + 'B')
4586 if comparison_value is None:
4588 'Invalid integer value %r in filter part %r' % (
4589 m.group('intval'), filter_part))
4590 if actual_value is None:
4591 return m.group('none_inclusive')
4592 return op(actual_value, comparison_value)
4595 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
4596 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
4598 operator_rex = re.compile(r'''(?x
)\s
*
4599 (?P
<op
>%s)\s
*(?P
<key
>[a
-z_
]+)
4601 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
4602 m = operator_rex.search(filter_part)
4604 op = UNARY_OPERATORS[m.group('op')]
4605 actual_value = dct.get(m.group('key'))
4606 return op(actual_value)
4608 raise ValueError('Invalid filter part %r' % filter_part)
4611 def match_str(filter_str, dct):
4612 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
4615 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
4618 def match_filter_func(filter_str):
4619 def _match_func(info_dict):
4620 if match_str(filter_str, info_dict):
4623 video_title = info_dict.get('title', info_dict.get('id', 'video'))
4624 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
4628 def parse_dfxp_time_expr(time_expr):
4632 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
4634 return float(mobj.group('time_offset'))
4636 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
4638 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode: HH:MM:SS,mmm.

    The %d conversions truncate, so fractional parts are floored, not
    rounded.
    """
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
4645 def dfxp2srt(dfxp_data):
4647 @param dfxp_data A
bytes-like
object containing DFXP data
4648 @returns A
unicode object containing converted SRT data
4650 LEGACY_NAMESPACES = (
4651 (b'http://www.w3.org/ns/ttml', [
4652 b'http://www.w3.org/2004/11/ttaf1',
4653 b'http://www.w3.org/2006/04/ttaf1',
4654 b'http://www.w3.org/2006/10/ttaf1',
4656 (b'http://www.w3.org/ns/ttml#styling', [
4657 b'http://www.w3.org/ns/ttml#style',
4661 SUPPORTED_STYLING = [
4670 _x = functools.partial(xpath_with_ns, ns_map={
4671 'xml': 'http://www.w3.org/XML/1998/namespace',
4672 'ttml': 'http://www.w3.org/ns/ttml',
4673 'tts': 'http://www.w3.org/ns/ttml#styling',
4679 class TTMLPElementParser(object):
4681 _unclosed_elements = []
4682 _applied_styles = []
4684 def start(self, tag, attrib):
4685 if tag in (_x('ttml:br'), 'br'):
4688 unclosed_elements = []
4690 element_style_id = attrib.get('style')
4692 style.update(default_style)
4693 if element_style_id:
4694 style.update(styles.get(element_style_id, {}))
4695 for prop in SUPPORTED_STYLING:
4696 prop_val = attrib.get(_x('tts:' + prop))
4698 style[prop] = prop_val
4701 for k, v in sorted(style.items()):
4702 if self._applied_styles and self._applied_styles[-1].get(k) == v:
4705 font += ' color="%s"' % v
4706 elif k == 'fontSize':
4707 font += ' size="%s"' % v
4708 elif k == 'fontFamily':
4709 font += ' face="%s"' % v
4710 elif k == 'fontWeight' and v == 'bold':
4712 unclosed_elements.append('b')
4713 elif k == 'fontStyle' and v == 'italic':
4715 unclosed_elements.append('i')
4716 elif k == 'textDecoration' and v == 'underline':
4718 unclosed_elements.append('u')
4720 self._out += '<font' + font + '>'
4721 unclosed_elements.append('font')
4723 if self._applied_styles:
4724 applied_style.update(self._applied_styles[-1])
4725 applied_style.update(style)
4726 self._applied_styles.append(applied_style)
4727 self._unclosed_elements.append(unclosed_elements)
4730 if tag not in (_x('ttml:br'), 'br'):
4731 unclosed_elements = self._unclosed_elements.pop()
4732 for element in reversed(unclosed_elements):
4733 self._out += '</%s>' % element
4734 if unclosed_elements and self._applied_styles:
4735 self._applied_styles.pop()
4737 def data(self, data):
4741 return self._out.strip()
4743 def parse_node(node):
4744 target = TTMLPElementParser()
4745 parser = xml.etree.ElementTree.XMLParser(target=target)
4746 parser.feed(xml.etree.ElementTree.tostring(node))
4747 return parser.close()
4749 for k, v in LEGACY_NAMESPACES:
4751 dfxp_data = dfxp_data.replace(ns, k)
4753 dfxp = compat_etree_fromstring(dfxp_data)
4755 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
4758 raise ValueError('Invalid dfxp/TTML subtitle')
4762 for style in dfxp.findall(_x('.//ttml:style')):
4763 style_id = style.get('id') or style.get(_x('xml:id'))
4766 parent_style_id = style.get('style')
4768 if parent_style_id not in styles:
4771 styles[style_id] = styles[parent_style_id].copy()
4772 for prop in SUPPORTED_STYLING:
4773 prop_val = style.get(_x('tts:' + prop))
4775 styles.setdefault(style_id, {})[prop] = prop_val
4781 for p in ('body', 'div'):
4782 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4785 style = styles.get(ele.get('style'))
4788 default_style.update(style)
4790 for para, index in zip(paras, itertools.count(1)):
4791 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
4792 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
4793 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4794 if begin_time is None:
4799 end_time = begin_time + dur
4800 out.append('%d\n%s --> %s\n%s\n\n' % (
4802 srt_subtitles_timecode(begin_time),
4803 srt_subtitles_timecode(end_time),
4809 def cli_option(params, command_option, param):
4810 param = params.get(param)
4812 param = compat_str(param)
4813 return [command_option, param] if param is not None else []
4816 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4817 param = params.get(param)
4820 assert isinstance(param, bool)
4822 return [command_option + separator + (true_value if param else false_value)]
4823 return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []."""
    value = params.get(param)
    if value == expected_value:
        return [command_option]
    return []
4831 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
4832 if isinstance(argdict, (list, tuple)): # for backward compatibility
4839 assert isinstance(argdict, dict)
4841 assert isinstance(keys, (list, tuple))
4842 for key_list in keys:
4843 if isinstance(key_list, compat_str):
4844 key_list = (key_list,)
4845 arg_list = list(filter(
4846 lambda x: x is not None,
4847 [argdict.get(key.lower()) for key in key_list]))
4849 return [arg for args in arg_list for arg in args]
4853 class ISO639Utils(object):
4854 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4913 'iw': 'heb', # Replaced by he in 1989 revision
4923 'in': 'ind', # Replaced by id in 1989 revision
5038 'ji': 'yid', # Replaced by yi in 1989 revision
5046 def short2long(cls, code):
5047 """Convert language code from ISO 639-1 to ISO 639-2/T"""
5048 return cls._lang_map.get(code[:2])
5051 def long2short(cls, code):
5052 """Convert language code from ISO 639-2/T to ISO 639-1"""
5053 for short_name, long_name in cls._lang_map.items():
5054 if long_name == code:
5058 class ISO3166Utils(object):
5059 # From http://data.okfn.org/data/core/country-list
5061 'AF': 'Afghanistan',
5062 'AX': 'Åland Islands',
5065 'AS': 'American Samoa',
5070 'AG': 'Antigua and Barbuda',
5087 'BO': 'Bolivia, Plurinational State of',
5088 'BQ': 'Bonaire, Sint Eustatius and Saba',
5089 'BA': 'Bosnia and Herzegovina',
5091 'BV': 'Bouvet Island',
5093 'IO': 'British Indian Ocean Territory',
5094 'BN': 'Brunei Darussalam',
5096 'BF': 'Burkina Faso',
5102 'KY': 'Cayman Islands',
5103 'CF': 'Central African Republic',
5107 'CX': 'Christmas Island',
5108 'CC': 'Cocos (Keeling) Islands',
5112 'CD': 'Congo, the Democratic Republic of the',
5113 'CK': 'Cook Islands',
5115 'CI': 'Côte d\'Ivoire',
5120 'CZ': 'Czech Republic',
5124 'DO': 'Dominican Republic',
5127 'SV': 'El Salvador',
5128 'GQ': 'Equatorial Guinea',
5132 'FK': 'Falkland Islands (Malvinas)',
5133 'FO': 'Faroe Islands',
5137 'GF': 'French Guiana',
5138 'PF': 'French Polynesia',
5139 'TF': 'French Southern Territories',
5154 'GW': 'Guinea-Bissau',
5157 'HM': 'Heard Island and McDonald Islands',
5158 'VA': 'Holy See (Vatican City State)',
5165 'IR': 'Iran, Islamic Republic of',
5168 'IM': 'Isle of Man',
5178 'KP': 'Korea, Democratic People\'s Republic of',
5179 'KR': 'Korea, Republic of',
5182 'LA': 'Lao People\'s Democratic Republic',
5188 'LI': 'Liechtenstein',
5192 'MK': 'Macedonia, the Former Yugoslav Republic of',
5199 'MH': 'Marshall Islands',
5205 'FM': 'Micronesia, Federated States of',
5206 'MD': 'Moldova, Republic of',
5217 'NL': 'Netherlands',
5218 'NC': 'New Caledonia',
5219 'NZ': 'New Zealand',
5224 'NF': 'Norfolk Island',
5225 'MP': 'Northern Mariana Islands',
5230 'PS': 'Palestine, State of',
5232 'PG': 'Papua New Guinea',
5235 'PH': 'Philippines',
5239 'PR': 'Puerto Rico',
5243 'RU': 'Russian Federation',
5245 'BL': 'Saint Barthélemy',
5246 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
5247 'KN': 'Saint Kitts and Nevis',
5248 'LC': 'Saint Lucia',
5249 'MF': 'Saint Martin (French part)',
5250 'PM': 'Saint Pierre and Miquelon',
5251 'VC': 'Saint Vincent and the Grenadines',
5254 'ST': 'Sao Tome and Principe',
5255 'SA': 'Saudi Arabia',
5259 'SL': 'Sierra Leone',
5261 'SX': 'Sint Maarten (Dutch part)',
5264 'SB': 'Solomon Islands',
5266 'ZA': 'South Africa',
5267 'GS': 'South Georgia and the South Sandwich Islands',
5268 'SS': 'South Sudan',
5273 'SJ': 'Svalbard and Jan Mayen',
5276 'CH': 'Switzerland',
5277 'SY': 'Syrian Arab Republic',
5278 'TW': 'Taiwan, Province of China',
5280 'TZ': 'Tanzania, United Republic of',
5282 'TL': 'Timor-Leste',
5286 'TT': 'Trinidad and Tobago',
5289 'TM': 'Turkmenistan',
5290 'TC': 'Turks and Caicos Islands',
5294 'AE': 'United Arab Emirates',
5295 'GB': 'United Kingdom',
5296 'US': 'United States',
5297 'UM': 'United States Minor Outlying Islands',
5301 'VE': 'Venezuela, Bolivarian Republic of',
5303 'VG': 'Virgin Islands, British',
5304 'VI': 'Virgin Islands, U.S.',
5305 'WF': 'Wallis and Futuna',
5306 'EH': 'Western Sahara',
5313 def short2full(cls, code):
5314 """Convert an ISO 3166-2 country code to the corresponding full name"""
5315 return cls._country_map.get(code.upper())
5318 class GeoUtils(object):
5319 # Major IPv4 address blocks per country
5321 'AD': '46.172.224.0/19',
5322 'AE': '94.200.0.0/13',
5323 'AF': '149.54.0.0/17',
5324 'AG': '209.59.64.0/18',
5325 'AI': '204.14.248.0/21',
5326 'AL': '46.99.0.0/16',
5327 'AM': '46.70.0.0/15',
5328 'AO': '105.168.0.0/13',
5329 'AP': '182.50.184.0/21',
5330 'AQ': '23.154.160.0/24',
5331 'AR': '181.0.0.0/12',
5332 'AS': '202.70.112.0/20',
5333 'AT': '77.116.0.0/14',
5334 'AU': '1.128.0.0/11',
5335 'AW': '181.41.0.0/18',
5336 'AX': '185.217.4.0/22',
5337 'AZ': '5.197.0.0/16',
5338 'BA': '31.176.128.0/17',
5339 'BB': '65.48.128.0/17',
5340 'BD': '114.130.0.0/16',
5342 'BF': '102.178.0.0/15',
5343 'BG': '95.42.0.0/15',
5344 'BH': '37.131.0.0/17',
5345 'BI': '154.117.192.0/18',
5346 'BJ': '137.255.0.0/16',
5347 'BL': '185.212.72.0/23',
5348 'BM': '196.12.64.0/18',
5349 'BN': '156.31.0.0/16',
5350 'BO': '161.56.0.0/16',
5351 'BQ': '161.0.80.0/20',
5352 'BR': '191.128.0.0/12',
5353 'BS': '24.51.64.0/18',
5354 'BT': '119.2.96.0/19',
5355 'BW': '168.167.0.0/16',
5356 'BY': '178.120.0.0/13',
5357 'BZ': '179.42.192.0/18',
5358 'CA': '99.224.0.0/11',
5359 'CD': '41.243.0.0/16',
5360 'CF': '197.242.176.0/21',
5361 'CG': '160.113.0.0/16',
5362 'CH': '85.0.0.0/13',
5363 'CI': '102.136.0.0/14',
5364 'CK': '202.65.32.0/19',
5365 'CL': '152.172.0.0/14',
5366 'CM': '102.244.0.0/14',
5367 'CN': '36.128.0.0/10',
5368 'CO': '181.240.0.0/12',
5369 'CR': '201.192.0.0/12',
5370 'CU': '152.206.0.0/15',
5371 'CV': '165.90.96.0/19',
5372 'CW': '190.88.128.0/17',
5373 'CY': '31.153.0.0/16',
5374 'CZ': '88.100.0.0/14',
5376 'DJ': '197.241.0.0/17',
5377 'DK': '87.48.0.0/12',
5378 'DM': '192.243.48.0/20',
5379 'DO': '152.166.0.0/15',
5380 'DZ': '41.96.0.0/12',
5381 'EC': '186.68.0.0/15',
5382 'EE': '90.190.0.0/15',
5383 'EG': '156.160.0.0/11',
5384 'ER': '196.200.96.0/20',
5385 'ES': '88.0.0.0/11',
5386 'ET': '196.188.0.0/14',
5387 'EU': '2.16.0.0/13',
5388 'FI': '91.152.0.0/13',
5389 'FJ': '144.120.0.0/16',
5390 'FK': '80.73.208.0/21',
5391 'FM': '119.252.112.0/20',
5392 'FO': '88.85.32.0/19',
5394 'GA': '41.158.0.0/15',
5396 'GD': '74.122.88.0/21',
5397 'GE': '31.146.0.0/16',
5398 'GF': '161.22.64.0/18',
5399 'GG': '62.68.160.0/19',
5400 'GH': '154.160.0.0/12',
5401 'GI': '95.164.0.0/16',
5402 'GL': '88.83.0.0/19',
5403 'GM': '160.182.0.0/15',
5404 'GN': '197.149.192.0/18',
5405 'GP': '104.250.0.0/19',
5406 'GQ': '105.235.224.0/20',
5407 'GR': '94.64.0.0/13',
5408 'GT': '168.234.0.0/16',
5409 'GU': '168.123.0.0/16',
5410 'GW': '197.214.80.0/20',
5411 'GY': '181.41.64.0/18',
5412 'HK': '113.252.0.0/14',
5413 'HN': '181.210.0.0/16',
5414 'HR': '93.136.0.0/13',
5415 'HT': '148.102.128.0/17',
5416 'HU': '84.0.0.0/14',
5417 'ID': '39.192.0.0/10',
5418 'IE': '87.32.0.0/12',
5419 'IL': '79.176.0.0/13',
5420 'IM': '5.62.80.0/20',
5421 'IN': '117.192.0.0/10',
5422 'IO': '203.83.48.0/21',
5423 'IQ': '37.236.0.0/14',
5424 'IR': '2.176.0.0/12',
5425 'IS': '82.221.0.0/16',
5426 'IT': '79.0.0.0/10',
5427 'JE': '87.244.64.0/18',
5428 'JM': '72.27.0.0/17',
5429 'JO': '176.29.0.0/16',
5430 'JP': '133.0.0.0/8',
5431 'KE': '105.48.0.0/12',
5432 'KG': '158.181.128.0/17',
5433 'KH': '36.37.128.0/17',
5434 'KI': '103.25.140.0/22',
5435 'KM': '197.255.224.0/20',
5436 'KN': '198.167.192.0/19',
5437 'KP': '175.45.176.0/22',
5438 'KR': '175.192.0.0/10',
5439 'KW': '37.36.0.0/14',
5440 'KY': '64.96.0.0/15',
5441 'KZ': '2.72.0.0/13',
5442 'LA': '115.84.64.0/18',
5443 'LB': '178.135.0.0/16',
5444 'LC': '24.92.144.0/20',
5445 'LI': '82.117.0.0/19',
5446 'LK': '112.134.0.0/15',
5447 'LR': '102.183.0.0/16',
5448 'LS': '129.232.0.0/17',
5449 'LT': '78.56.0.0/13',
5450 'LU': '188.42.0.0/16',
5451 'LV': '46.109.0.0/16',
5452 'LY': '41.252.0.0/14',
5453 'MA': '105.128.0.0/11',
5454 'MC': '88.209.64.0/18',
5455 'MD': '37.246.0.0/16',
5456 'ME': '178.175.0.0/17',
5457 'MF': '74.112.232.0/21',
5458 'MG': '154.126.0.0/17',
5459 'MH': '117.103.88.0/21',
5460 'MK': '77.28.0.0/15',
5461 'ML': '154.118.128.0/18',
5462 'MM': '37.111.0.0/17',
5463 'MN': '49.0.128.0/17',
5464 'MO': '60.246.0.0/16',
5465 'MP': '202.88.64.0/20',
5466 'MQ': '109.203.224.0/19',
5467 'MR': '41.188.64.0/18',
5468 'MS': '208.90.112.0/22',
5469 'MT': '46.11.0.0/16',
5470 'MU': '105.16.0.0/12',
5471 'MV': '27.114.128.0/18',
5472 'MW': '102.70.0.0/15',
5473 'MX': '187.192.0.0/11',
5474 'MY': '175.136.0.0/13',
5475 'MZ': '197.218.0.0/15',
5476 'NA': '41.182.0.0/16',
5477 'NC': '101.101.0.0/18',
5478 'NE': '197.214.0.0/18',
5479 'NF': '203.17.240.0/22',
5480 'NG': '105.112.0.0/12',
5481 'NI': '186.76.0.0/15',
5482 'NL': '145.96.0.0/11',
5483 'NO': '84.208.0.0/13',
5484 'NP': '36.252.0.0/15',
5485 'NR': '203.98.224.0/19',
5486 'NU': '49.156.48.0/22',
5487 'NZ': '49.224.0.0/14',
5488 'OM': '5.36.0.0/15',
5489 'PA': '186.72.0.0/15',
5490 'PE': '186.160.0.0/14',
5491 'PF': '123.50.64.0/18',
5492 'PG': '124.240.192.0/19',
5493 'PH': '49.144.0.0/13',
5494 'PK': '39.32.0.0/11',
5495 'PL': '83.0.0.0/11',
5496 'PM': '70.36.0.0/20',
5497 'PR': '66.50.0.0/16',
5498 'PS': '188.161.0.0/16',
5499 'PT': '85.240.0.0/13',
5500 'PW': '202.124.224.0/20',
5501 'PY': '181.120.0.0/14',
5502 'QA': '37.210.0.0/15',
5503 'RE': '102.35.0.0/16',
5504 'RO': '79.112.0.0/13',
5505 'RS': '93.86.0.0/15',
5506 'RU': '5.136.0.0/13',
5507 'RW': '41.186.0.0/16',
5508 'SA': '188.48.0.0/13',
5509 'SB': '202.1.160.0/19',
5510 'SC': '154.192.0.0/11',
5511 'SD': '102.120.0.0/13',
5512 'SE': '78.64.0.0/12',
5513 'SG': '8.128.0.0/10',
5514 'SI': '188.196.0.0/14',
5515 'SK': '78.98.0.0/15',
5516 'SL': '102.143.0.0/17',
5517 'SM': '89.186.32.0/19',
5518 'SN': '41.82.0.0/15',
5519 'SO': '154.115.192.0/18',
5520 'SR': '186.179.128.0/17',
5521 'SS': '105.235.208.0/21',
5522 'ST': '197.159.160.0/19',
5523 'SV': '168.243.0.0/16',
5524 'SX': '190.102.0.0/20',
5526 'SZ': '41.84.224.0/19',
5527 'TC': '65.255.48.0/20',
5528 'TD': '154.68.128.0/19',
5529 'TG': '196.168.0.0/14',
5530 'TH': '171.96.0.0/13',
5531 'TJ': '85.9.128.0/18',
5532 'TK': '27.96.24.0/21',
5533 'TL': '180.189.160.0/20',
5534 'TM': '95.85.96.0/19',
5535 'TN': '197.0.0.0/11',
5536 'TO': '175.176.144.0/21',
5537 'TR': '78.160.0.0/11',
5538 'TT': '186.44.0.0/15',
5539 'TV': '202.2.96.0/19',
5540 'TW': '120.96.0.0/11',
5541 'TZ': '156.156.0.0/14',
5542 'UA': '37.52.0.0/14',
5543 'UG': '102.80.0.0/13',
5545 'UY': '167.56.0.0/13',
5546 'UZ': '84.54.64.0/18',
5547 'VA': '212.77.0.0/19',
5548 'VC': '207.191.240.0/21',
5549 'VE': '186.88.0.0/13',
5550 'VG': '66.81.192.0/20',
5551 'VI': '146.226.0.0/16',
5552 'VN': '14.160.0.0/11',
5553 'VU': '202.80.32.0/20',
5554 'WF': '117.20.32.0/21',
5555 'WS': '202.4.32.0/19',
5556 'YE': '134.35.0.0/16',
5557 'YT': '41.242.116.0/22',
5558 'ZA': '41.0.0.0/11',
5559 'ZM': '102.144.0.0/13',
5560 'ZW': '102.177.192.0/18',
5564 def random_ipv4(cls, code_or_block):
5565 if len(code_or_block) == 2:
5566 block = cls._country_ip_map.get(code_or_block.upper())
5570 block = code_or_block
5571 addr, preflen = block.split('/')
5572 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
5573 addr_max = addr_min | (0xffffffff >> int(preflen))
5574 return compat_str(socket.inet_ntoa(
5575 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
5578 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
5579 def __init__(self, proxies=None):
5580 # Set default handlers
5581 for type in ('http', 'https'):
5582 setattr(self, '%s_open' % type,
5583 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
5584 meth(r, proxy, type))
5585 compat_urllib_request.ProxyHandler.__init__(self, proxies)
5587 def proxy_open(self, req, proxy, type):
5588 req_proxy = req.headers.get('Ytdl-request-proxy')
5589 if req_proxy is not None:
5591 del req.headers['Ytdl-request-proxy']
5593 if proxy == '__noproxy__':
5594 return None # No Proxy
5595 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
5596 req.add_header('Ytdl-socks-proxy', proxy)
5597 # yt-dlp's http/https handlers do wrapping the socket with socks
5599 return compat_urllib_request.ProxyHandler.proxy_open(
5600 self, req, proxy, type)
5603 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
5604 # released into Public Domain
5605 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
5607 def long_to_bytes(n, blocksize=0):
5608 """long_to_bytes(n:long, blocksize:int) : string
5609 Convert a long integer to a byte string.
5611 If optional blocksize is given and greater than zero, pad the front of the
5612 byte string with binary zeros so that the length is a multiple of
5615 # after much testing, this algorithm was deemed to be the fastest
5619 s = compat_struct_pack('>I', n & 0xffffffff) + s
5621 # strip off leading zeros
5622 for i in range(len(s)):
5623 if s[i] != b'\000'[0]:
5626 # only happens when n == 0
5630 # add back some pad bytes. this could be done more efficiently w.r.t. the
5631 # de-padding being done above, but sigh...
5632 if blocksize > 0 and len(s) % blocksize:
5633 s = (blocksize - len(s) % blocksize) * b'\000' + s
5637 def bytes_to_long(s):
5638 """bytes_to_long(string) : long
5639 Convert a byte string to a long integer.
5641 This is (essentially) the inverse of long_to_bytes().
5646 extra = (4 - length % 4)
5647 s = b'\000' * extra + s
5648 length = length + extra
5649 for i in range(0, length, 4):
5650 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # The payload is interpreted little-endian (bytes reversed) to match
    # OHDave's JavaScript implementation.
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
def pkcs1pad(data, length):
    """
    Pad input data with the PKCS#1 type-2 scheme (RFC 8017, EME-PKCS1-v1_5)

    @param {int[]} data        input data
    @param {int} length        target length
    @returns {int[]} padded data

    Raises ValueError when the data cannot fit in `length` bytes while
    leaving room for the mandatory 11 bytes of padding overhead.
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Per RFC 8017, the padding string PS must consist of NON-ZERO octets:
    # the single zero byte after it delimits the start of the message.
    # The previous randint(0, 254) could emit zeros (corrupting the
    # delimiter) and could never emit 255; randint(1, 255) is correct.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
def encode_base_n(num, n, table=None):
    """Render the non-negative integer *num* in base *n*.

    When *table* is omitted (or falsy), digits are taken from the standard
    0-9a-zA-Z alphabet, which caps the default base at 62.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    # Zero has no nonzero digit, so handle it up front.
    if num == 0:
        return table[0]

    ret = ''
    # Peel off the least-significant digit each iteration and prepend it.
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret
def decode_packed_codes(code):
    """Unscramble Dean Edwards' P.A.C.K.E.R.-style obfuscated JavaScript.

    The packed payload stores a template string plus a '|'-separated symbol
    list; every base-n token in the template is replaced by its symbol.
    """
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Map each token (count rendered in the packer's base) to its symbol;
    # an empty symbol means the token stands for itself.
    symbol_table = {}
    for index in range(count - 1, -1, -1):
        token = encode_base_n(index, base)
        symbol_table[token] = symbols[index] or token

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
def caesar(s, alphabet, shift):
    """Apply a Caesar cipher to *s*: each character found in *alphabet* is
    replaced by the character *shift* positions further on (cyclically);
    characters outside the alphabet pass through unchanged."""
    if shift == 0:
        return s
    size = len(alphabet)

    def _rotate(ch):
        if ch not in alphabet:
            return ch
        return alphabet[(alphabet.index(ch) + shift) % size]

    return ''.join(_rotate(ch) for ch in s)


def rot47(s):
    """ROT47: Caesar-shift by 47 over the full printable ASCII range."""
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=value,KEY="quoted,value",...')
    into a dict, stripping surrounding quotes from quoted values."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        # Quoted values may contain commas; drop the enclosing quotes only.
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
def urshift(val, n):
    """Unsigned right shift of a 32-bit quantity (JavaScript's `>>>`):
    negative values are first mapped into the unsigned 32-bit range."""
    if val >= 0:
        return val >> n
    return (val + 0x100000000) >> n
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a (truecolor, non-interlaced) PNG into raw pixel bytes.

    Returns (width, height, pixels) where pixels is a list of rows, each row
    a flat list of byte values (3 per pixel), after undoing PNG scanline
    filtering. Raises IOError for malformed input.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    # Split the stream into chunks: length(4) | type(4) | data | CRC(4).
    chunks = []
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data,
        })

    # IHDR is guaranteed first by the signature check above.
    ihdr = chunks[0]['data']
    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Image data may be split across several IDAT chunks; concatenate them.
    idat = b''
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Assumes 3 bytes per pixel (truecolor, 8-bit) — TODO confirm callers
    # only feed RGB PNGs.
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        # Look up an already-reconstructed byte by flat index.
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Each scanline is prefixed by one filter-type byte.
        base_pos = y * (1 + stride)
        filter_type = decompressed_data[base_pos]

        current_row = []
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + base_pos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Neighbours used by the filters; 0 outside the image.
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Paeth predictor: pick the neighbour closest to p.
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    """Write the extended attribute *key* = *value* (bytes) on *path*.

    Tries, in order: the pyxattr/xattr Python modules, NTFS Alternate Data
    Streams on Windows, then the setfattr/xattr command-line tools. Raises
    XAttrMetadataError on write failure and XAttrUnavailableError when no
    usable backend exists.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'yt-dlp requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr module
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:
                # The CLI tools take the value as a text argument.
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)]
                       + [encodeArgument(o) for o in opts]
                       + [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = process_communicate_or_kill(p)
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to the components of a
    random date between 1950-01-01 and 1995-12-31 (values as strings)."""
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    chosen = earliest + datetime.timedelta(random.randint(0, span_days))
    return {
        year_field: str(chosen.year),
        month_field: str(chosen.month),
        day_field: str(chosen.day),
    }
# Templates for internet shortcut files, which are plain text files.
# Each takes a %-format dict with at least a 'url' key.
DOT_URL_LINK_TEMPLATE = '''
[InternetShortcut]
URL=%(url)s
'''.lstrip()

DOT_WEBLOC_LINK_TEMPLATE = '''
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''.lstrip()

DOT_DESKTOP_LINK_TEMPLATE = '''
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''.lstrip()
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
            net_location,

            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
def to_high_limit_path(path):
    """On Windows, return *path* as an absolute extended-length path
    (\\\\?\\ prefix) to bypass the MAX_PATH limit; elsewhere return it
    unchanged. Individual path segments may still be length-limited."""
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # A raw string cannot end in a backslash, hence the rstrip trick.
    return r'\\?\ '.rstrip() + os.path.abspath(path)
def format_field(obj, field, template='%s', ignore=(None, ''), default='', func=None):
    """Look up *field* in the dict-like *obj* and render it through
    *template*. Values in *ignore* yield *default*; otherwise *func*
    (when given) transforms the value before formatting."""
    value = obj.get(field, default)
    if value in ignore:
        return default
    if func:
        value = func(value)
    return template % value
def clean_podcast_url(url):
    """Strip well-known podcast tracking/measurement redirect prefixes
    (podtrac, blubrry, chartable, acast, podcorn, podsights) from *url*."""
    return re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/''', '', url)
# Hex digits used when filling in random UUID nibbles.
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random UUIDv4-shaped string.

    NOTE(review): the 'y' nibble is drawn from the full hex range instead of
    [89ab], so the output is UUID-shaped rather than strictly RFC 4122
    conformant — confirm whether callers care.
    """
    return re.sub(r'[xy]', lambda m: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
def make_dir(path, to_screen=None):
    """Create the parent directory of *path* if it does not exist.

    Returns True on success (including when no directory component exists),
    False when creation fails; a failure message is passed to *to_screen*
    when it is callable.
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except (OSError, IOError) as err:
        # BUGFIX: was `if callable(to_screen) is not None:`, which is always
        # true (callable() returns a bool, never None) and therefore crashed
        # with TypeError when to_screen was left as None.
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
def get_executable_path():
    """Return the absolute path of the directory the program runs from,
    accounting for PyInstaller bundles and zip imports."""
    from zipimport import zipimporter
    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        location = os.path.dirname(sys.executable)
    elif isinstance(globals().get('__loader__'), zipimporter):  # Running from ZIP
        location = os.path.join(os.path.dirname(__file__), '../..')
    else:
        location = os.path.join(os.path.dirname(__file__), '..')
    return os.path.abspath(location)
def load_plugins(name, suffix, namespace):
    """Load classes whose names end in *suffix* from the ytdlp_plugins/<name>
    module (next to the executable) into *namespace*, skipping names already
    present. Returns the list of newly-added classes; missing plugin modules
    are silently ignored.
    """
    classes = []
    plugin_info = [None]
    try:
        # NOTE(review): `imp` is deprecated in favour of importlib; kept here
        # to match the module's existing usage.
        plugin_info = imp.find_module(
            name, [os.path.join(get_executable_path(), 'ytdlp_plugins')])
        plugins = imp.load_module(name, *plugin_info)
        for attr in dir(plugins):
            if attr in namespace:
                continue
            if not attr.endswith(suffix):
                continue
            klass = getattr(plugins, attr)
            classes.append(klass)
            namespace[attr] = klass
    except ImportError:
        pass
    finally:
        # imp.find_module returns an open file object as its first element;
        # close it even when loading fails.
        if plugin_info[0] is not None:
            plugin_info[0].close()
    return classes
6107 def traverse_dict(dictn
, keys
, casesense
=True):
6108 keys
= list(keys
)[::-1]
6111 if isinstance(dictn
, dict):
6113 dictn
= {k.lower(): v for k, v in dictn.items()}
6115 dictn
= dictn
.get(key
)
6116 elif isinstance(dictn
, (list, tuple, compat_str
)):
6118 key
= slice(*map(int_or_none
, key
.split(':')))
6120 key
= int_or_none(key
)
6121 dictn
= try_get(dictn
, lambda x
: x
[key
])