4 from __future__
import unicode_literals
37 import xml
.etree
.ElementTree
41 compat_HTMLParseError
,
46 compat_ctypes_WINFUNCTYPE
,
47 compat_etree_fromstring
,
50 compat_html_entities_html5
,
63 compat_urllib_parse_urlencode
,
64 compat_urllib_parse_urlparse
,
65 compat_urllib_parse_urlunparse
,
66 compat_urllib_parse_quote
,
67 compat_urllib_parse_quote_plus
,
68 compat_urllib_parse_unquote_plus
,
69 compat_urllib_request
,
def register_socks_protocols():
    """Teach urlparse that SOCKS schemes carry a network location.

    In Python < 2.6.5, urlsplit() suffers from
    https://bugs.python.org/issue7904: URLs whose scheme is missing from
    urlparse.uses_netloc are not handled correctly, so "register" ours.
    """
    netloc_schemes = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in netloc_schemes:
            netloc_schemes.append(scheme)
# The type of a compiled regular expression is not exposed publicly in
# older Pythons, so capture it once from a throwaway pattern for use in
# isinstance() checks.
compiled_regex_type = type(re.compile(''))
93 def random_user_agent():
94 _USER_AGENT_TPL
= 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
1673 return _USER_AGENT_TPL
% random
.choice(_CHROME_VERSIONS
)
1677 'User-Agent': random_user_agent(),
1678 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
1679 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
1680 'Accept-Encoding': 'gzip, deflate',
1681 'Accept-Language': 'en-us,en;q=0.5',
1686 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# Unique sentinel distinguishing "no default supplied" from an explicit None.
NO_DEFAULT = object()
# Full English month names, indexed 0 (January) through 11 (December).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April',
    'May', 'June', 'July', 'August',
    'September', 'October', 'November', 'December',
]
1697 'en': ENGLISH_MONTH_NAMES
,
1699 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
1700 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
1703 KNOWN_EXTENSIONS
= (
1704 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
1705 'flv', 'f4v', 'f4a', 'f4b',
1706 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
1707 'mkv', 'mka', 'mk3d',
1710 'asf', 'wmv', 'wma',
1716 'f4f', 'f4m', 'm3u8', 'smil')
# Container/audio extensions accepted as remux targets
# (presumably consumed by the remux post-processor -- confirm at call sites).
REMUX_EXTENSIONS = ('mp4', 'mkv', 'flv', 'webm', 'mov', 'avi', 'mp3', 'mka', 'm4a', 'ogg', 'opus')
# Accented character -> ASCII replacement table; needed for sanitizing
# filenames in restricted mode. Single characters map via zip() over the
# key string; multi-character replacements ('AE', 'ss', ...) are injected
# as one-element lists in the chained value sequence.
ACCENT_CHARS = dict(zip(
    'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
    itertools.chain(
        'AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
1746 '%Y/%m/%d %H:%M:%S',
1748 '%Y-%m-%d %H:%M:%S',
1749 '%Y-%m-%d %H:%M:%S.%f',
1752 '%Y-%m-%dT%H:%M:%SZ',
1753 '%Y-%m-%dT%H:%M:%S.%fZ',
1754 '%Y-%m-%dT%H:%M:%S.%f0Z',
1755 '%Y-%m-%dT%H:%M:%S',
1756 '%Y-%m-%dT%H:%M:%S.%f',
1758 '%b %d %Y at %H:%M',
1759 '%b %d %Y at %H:%M:%S',
1760 '%B %d %Y at %H:%M',
1761 '%B %d %Y at %H:%M:%S',
1764 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
1765 DATE_FORMATS_DAY_FIRST
.extend([
1771 '%d/%m/%Y %H:%M:%S',
1774 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
1775 DATE_FORMATS_MONTH_FIRST
.extend([
1780 '%m/%d/%Y %H:%M:%S',
# Matches the argument list of Dean Edwards' p.a.c.k.e.r eval-packed
# JavaScript: }('payload', radix, count, 'word|word|...'.split('|')
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"

# Extracts the body of a <script type="application/ld+json"> element into
# the named group 'json_ld'; \1 requires the closing quote (if any) to
# match the opening one.
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
1787 def preferredencoding():
1788 """Get preferred encoding.
1790 Returns the best encoding scheme for the system, based on
1791 locale.getpreferredencoding() and some further tweaks.
1794 pref = locale.getpreferredencoding()
1802 def write_json_file(obj, fn):
1803 """ Encode obj as JSON and write it to fn, atomically if possible """
1805 fn = encodeFilename(fn)
1806 if sys.version_info < (3, 0) and sys.platform != 'win32
':
1807 encoding = get_filesystem_encoding()
1808 # os.path.basename returns a bytes object, but NamedTemporaryFile
1809 # will fail if the filename contains non ascii characters unless we
1810 # use a unicode object
1811 path_basename = lambda f: os.path.basename(fn).decode(encoding)
1812 # the same for os.path.dirname
1813 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
1815 path_basename = os.path.basename
1816 path_dirname = os.path.dirname
1820 'prefix
': path_basename(fn) + '.',
1821 'dir': path_dirname(fn),
1825 # In Python 2.x, json.dump expects a bytestream.
1826 # In Python 3.x, it writes to a character stream
1827 if sys.version_info < (3, 0):
1832 'encoding
': 'utf
-8',
1835 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
1839 json.dump(obj, tf, default=repr)
1840 if sys.platform == 'win32
':
1841 # Need to remove existing file on Windows, else os.rename raises
1842 # WindowsError or FileExistsError.
1850 os.chmod(tf.name, 0o666 & ~mask)
1853 os.rename(tf.name, fn)
1862 if sys.version_info >= (2, 7):
1863 def find_xpath_attr(node, xpath, key, val=None):
1864 """ Find the xpath xpath[@key=val] """
1865 assert re.match(r'^
[a
-zA
-Z_
-]+$
', key)
1866 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
1867 return node.find(expr)
1869 def find_xpath_attr(node, xpath, key, val=None):
1870 for f in node.findall(compat_xpath(xpath)):
1871 if key not in f.attrib:
1873 if val is None or f.attrib.get(key) == val:
1877 # On python2.6 the xml.etree.ElementTree.Element methods don't support
1878 # the namespace parameter
1881 def xpath_with_ns(path
, ns_map
):
1882 components
= [c
.split(':') for c
in path
.split('/')]
1884 for c
in components
:
1886 replaced
.append(c
[0])
1889 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
1890 return '/'.join(replaced
)
1893 def xpath_element(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1894 def _find_xpath(xpath
):
1895 return node
.find(compat_xpath(xpath
))
1897 if isinstance(xpath
, (str, compat_str
)):
1898 n
= _find_xpath(xpath
)
1906 if default
is not NO_DEFAULT
:
1909 name
= xpath
if name
is None else name
1910 raise ExtractorError('Could not find XML element %s' % name
)
1916 def xpath_text(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1917 n
= xpath_element(node
, xpath
, name
, fatal
=fatal
, default
=default
)
1918 if n
is None or n
== default
:
1921 if default
is not NO_DEFAULT
:
1924 name
= xpath
if name
is None else name
1925 raise ExtractorError('Could not find XML element\'s text %s' % name
)
1931 def xpath_attr(node
, xpath
, key
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1932 n
= find_xpath_attr(node
, xpath
, key
)
1934 if default
is not NO_DEFAULT
:
1937 name
= '%s[@%s]' % (xpath
, key
) if name
is None else name
1938 raise ExtractorError('Could not find XML attribute %s' % name
)
1941 return n
.attrib
[key
]
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # NOTE: `id` shadows the builtin, but renaming it would break callers
    # that pass it by keyword.
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    return matches[0] if matches else None
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag whose `attribute` equals `value`, or None."""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    return matches[0] if matches else None
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # The class attribute may hold several space-separated names, so match
    # the escaped name anywhere in the attribute value.
    # NOTE(review): \b also matches at hyphens, so 'foo' would match a
    # 'foo-bar' class -- confirm whether stricter matching is wanted.
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_pattern, html, escape_value=False)
1967 def get_elements_by_attribute(attribute, value, html, escape_value=True):
1968 """Return the content of the tag with the specified attribute in the passed HTML document"""
1970 value = re.escape(value) if escape_value else value
1973 for m in re.finditer(r'''(?xs)
1975 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^
']*'|
))*?
1977 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^
"]*"|
='[^']*'|))*?
1981 ''' % (re.escape(attribute), value), html):
1982 res = m.group('content
')
1984 if res.startswith('"') or res.startswith("'"):
1987 retlist.append(unescapeHTML(res))
1992 class HTMLAttributeParser(compat_HTMLParser):
1993 """Trivial HTML parser to gather the attributes for a single element"""
1997 compat_HTMLParser.__init__(self)
1999 def handle_starttag(self, tag, attrs):
2000 self.attrs = dict(attrs)
2003 def extract_attributes(html_element):
2004 """Given a string for an HTML element such as
2006 a="foo" B="bar" c="&98;az" d=boz
2007 empty= noval entity="&"
2010 Decode and return a dictionary of attributes.
2012 'a
': 'foo
', 'b
': 'bar
', c: 'baz
', d: 'boz
',
2013 'empty
': '', 'noval
': None, 'entity
': '&',
2014 'sq
': '"', 'dq': '\''
2016 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
2017 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
2019 parser = HTMLAttributeParser()
2021 parser.feed(html_element)
2023 # Older Python may throw HTMLParseError in case of malformed HTML
2024 except compat_HTMLParseError:
2029 def clean_html(html):
2030 """Clean an HTML snippet into a readable string"""
2032 if html is None: # Convenience for sanitizing descriptions etc.
2036 html = html.replace('\n', ' ')
2037 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
2038 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
2040 html = re.sub('<.*?>', '', html)
2041 # Replace html entities
2042 html = unescapeHTML(html)
2046 def sanitize_open(filename, open_mode):
2047 """Try to open the given filename, and slightly tweak it if this fails.
2049 Attempts to open the given filename. If this fails, it tries to change
2050 the filename slightly, step by step, until it's either able to open it
2051 or it fails and raises a final exception, like the standard open()
2054 It returns the tuple (stream, definitive_file_name).
2058 if sys.platform == 'win32':
2060 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
2061 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
2062 stream = open(encodeFilename(filename), open_mode)
2063 return (stream, filename)
2064 except (IOError, OSError) as err:
2065 if err.errno in (errno.EACCES,):
2068 # In case of error, try to remove win32 forbidden chars
2069 alt_filename = sanitize_path(filename)
2070 if alt_filename == filename:
2073 # An exception here should be caught in the caller
2074 stream = open(encodeFilename(alt_filename), open_mode)
2075 return (stream, alt_filename)
2078 def timeconvert(timestr):
2079 """Convert RFC 2822 defined time string into system timestamp"""
2081 timetuple = email.utils.parsedate_tz(timestr)
2082 if timetuple is not None:
2083 timestamp = email.utils.mktime_tz(timetuple)
2087 def sanitize_filename(s, restricted=False, is_id=False):
2088 """Sanitizes a string so it could be used as part of a filename.
2089 If restricted is set, use a stricter subset of allowed characters.
2090 Set is_id if this is not an arbitrary string, but an ID that should be kept
2093 def replace_insane(char):
2094 if restricted and char in ACCENT_CHARS:
2095 return ACCENT_CHARS[char]
2096 if char == '?' or ord(char) < 32 or ord(char) == 127:
2099 return '' if restricted else '\''
2101 return '_
-' if restricted else ' -'
2102 elif char in '\\/|
*<>':
2104 if restricted and (char in '!&\'()[]{}$
;`^
,#' or char.isspace()):
2106 if restricted
and ord(char
) > 127:
2111 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
2112 result
= ''.join(map(replace_insane
, s
))
2114 while '__' in result
:
2115 result
= result
.replace('__', '_')
2116 result
= result
.strip('_')
2117 # Common case of "Foreign band name - English song title"
2118 if restricted
and result
.startswith('-_'):
2120 if result
.startswith('-'):
2121 result
= '_' + result
[len('-'):]
2122 result
= result
.lstrip('.')
2128 def sanitize_path(s
, force
=False):
2129 """Sanitizes and normalizes path on Windows"""
2130 if sys
.platform
== 'win32':
2132 drive_or_unc
, _
= os
.path
.splitdrive(s
)
2133 if sys
.version_info
< (2, 7) and not drive_or_unc
:
2134 drive_or_unc
, _
= os
.path
.splitunc(s
)
2140 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
2144 path_part
if path_part
in ['.', '..'] else re
.sub(r
'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part
)
2145 for path_part
in norm_path
]
2147 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
2148 elif force
and s
[0] == os
.path
.sep
:
2149 sanitized_path
.insert(0, os
.path
.sep
)
2150 return os
.path
.join(*sanitized_path
)
2153 def sanitize_url(url
):
2154 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
2155 # the number of unwanted failures due to missing protocol
2156 if url
.startswith('//'):
2157 return 'http:%s' % url
2158 # Fix some common typos seen so far
2160 # https://github.com/ytdl-org/youtube-dl/issues/15649
2161 (r
'^httpss://', r
'https://'),
2162 # https://bx1.be/lives/direct-tv/
2163 (r
'^rmtp([es]?)://', r
'rtmp\1://'),
2165 for mistake
, fixup
in COMMON_TYPOS
:
2166 if re
.match(mistake
, url
):
2167 return re
.sub(mistake
, fixup
, url
)
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request after normalizing the URL via sanitize_url()."""
    cleaned_url = sanitize_url(url)
    return compat_urllib_request.Request(cleaned_url, *args, **kwargs)
2176 """Expand shell variables and ~"""
2177 return os
.path
.expandvars(compat_expanduser(s
))
2180 def orderedSet(iterable
):
2181 """ Remove all duplicates from the input iterable """
2189 def _htmlentity_transform(entity_with_semicolon
):
2190 """Transforms an HTML entity to a character."""
2191 entity
= entity_with_semicolon
[:-1]
2193 # Known non-numeric HTML entity
2194 if entity
in compat_html_entities
.name2codepoint
:
2195 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
2197 # TODO: HTML5 allows entities without a semicolon. For example,
2198 # 'Éric' should be decoded as 'Éric'.
2199 if entity_with_semicolon
in compat_html_entities_html5
:
2200 return compat_html_entities_html5
[entity_with_semicolon
]
2202 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
2203 if mobj
is not None:
2204 numstr
= mobj
.group(1)
2205 if numstr
.startswith('x'):
2207 numstr
= '0%s' % numstr
2210 # See https://github.com/ytdl-org/youtube-dl/issues/7518
2212 return compat_chr(int(numstr
, base
))
2216 # Unknown entity in name, return its literal representation
2217 return '&%s;' % entity
2220 def unescapeHTML(s
):
2223 assert type(s
) == compat_str
2226 r
'&([^&;]+;)', lambda m
: _htmlentity_transform(m
.group(1)), s
)
2229 def process_communicate_or_kill(p
, *args
, **kwargs
):
2231 return p
.communicate(*args
, **kwargs
)
2232 except BaseException
: # Including KeyboardInterrupt
2238 def get_subprocess_encoding():
2239 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2240 # For subprocess calls, encode with locale encoding
2241 # Refer to http://stackoverflow.com/a/9951851/35070
2242 encoding
= preferredencoding()
2244 encoding
= sys
.getfilesystemencoding()
2245 if encoding
is None:
2250 def encodeFilename(s
, for_subprocess
=False):
2252 @param s The name of the file
2255 assert type(s
) == compat_str
2257 # Python 3 has a Unicode API
2258 if sys
.version_info
>= (3, 0):
2261 # Pass '' directly to use Unicode APIs on Windows 2000 and up
2262 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
2263 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
2264 if not for_subprocess
and sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2267 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
2268 if sys
.platform
.startswith('java'):
2271 return s
.encode(get_subprocess_encoding(), 'ignore')
2274 def decodeFilename(b
, for_subprocess
=False):
2276 if sys
.version_info
>= (3, 0):
2279 if not isinstance(b
, bytes):
2282 return b
.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for passing to a subprocess."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeArgument(b):
    """Decode a subprocess command-line argument; companion of encodeArgument()."""
    return decodeFilename(b, True)
2298 def decodeOption(optval
):
2301 if isinstance(optval
, bytes):
2302 optval
= optval
.decode(preferredencoding())
2304 assert isinstance(optval
, compat_str
)
2308 def formatSeconds(secs
, delim
=':'):
2310 return '%d%s%02d%s%02d' % (secs
// 3600, delim
, (secs
% 3600) // 60, delim
, secs
% 60)
2312 return '%d%s%02d' % (secs
// 60, delim
, secs
% 60)
2317 def make_HTTPS_handler(params
, **kwargs
):
2318 opts_no_check_certificate
= params
.get('nocheckcertificate', False)
2319 if hasattr(ssl
, 'create_default_context'): # Python >= 3.4 or 2.7.9
2320 context
= ssl
.create_default_context(ssl
.Purpose
.SERVER_AUTH
)
2321 if opts_no_check_certificate
:
2322 context
.check_hostname
= False
2323 context
.verify_mode
= ssl
.CERT_NONE
2325 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2328 # (create_default_context present but HTTPSHandler has no context=)
2331 if sys
.version_info
< (3, 2):
2332 return YoutubeDLHTTPSHandler(params
, **kwargs
)
2333 else: # Python < 3.4
2334 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLSv1
)
2335 context
.verify_mode
= (ssl
.CERT_NONE
2336 if opts_no_check_certificate
2337 else ssl
.CERT_REQUIRED
)
2338 context
.set_default_verify_paths()
2339 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2342 def bug_reports_message():
2343 if ytdl_is_updateable():
2344 update_cmd
= 'type yt-dlp -U to update'
2346 update_cmd
= 'see https://github.com/yt-dlp/yt-dlp on how to update'
2347 msg
= '; please report this issue on https://github.com/yt-dlp/yt-dlp .'
2348 msg
+= ' Make sure you are using the latest version; %s.' % update_cmd
2349 msg
+= ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
2358 class ExtractorError(YoutubeDLError
):
2359 """Error during info extraction."""
2361 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None):
2362 """ tb, if given, is the original traceback (so that it can be printed out).
2363 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
2366 if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
):
2368 if video_id
is not None:
2369 msg
= video_id
+ ': ' + msg
2371 msg
+= ' (caused by %r)' % cause
2373 msg
+= bug_reports_message()
2374 super(ExtractorError
, self
).__init
__(msg
)
2377 self
.exc_info
= sys
.exc_info() # preserve original exception
2379 self
.video_id
= video_id
2381 def format_traceback(self
):
2382 if self
.traceback
is None:
2384 return ''.join(traceback
.format_tb(self
.traceback
))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL (expected, not a bug)."""

    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
2399 class GeoRestrictedError(ExtractorError
):
2400 """Geographic restriction Error exception.
2402 This exception may be thrown when a video is not available from your
2403 geographic location due to geographic restrictions imposed by a website.
2406 def __init__(self
, msg
, countries
=None):
2407 super(GeoRestrictedError
, self
).__init
__(msg
, expected
=True)
2409 self
.countries
= countries
class DownloadError(YoutubeDLError):
    """Download Error exception.

    Thrown by FileDownloader objects that are not configured to continue
    on errors; carries the relevant error message.
    """

    def __init__(self, msg, exc_info=None):
        """exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info())."""
        super(DownloadError, self).__init__(msg)
        # Keep the originating exception info so callers can re-raise or log it.
        self.exc_info = exc_info
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    Thrown by YoutubeDL when a requested entry is not found in the
    playlist info_dict.
    """
class SameFileError(YoutubeDLError):
    """Same File exception.

    Thrown by FileDownloader objects when multiple files would have to be
    downloaded to the same file on disk.
    """
2444 class PostProcessingError(YoutubeDLError
):
2445 """Post Processing exception.
2447 This exception may be raised by PostProcessor's .run() method to
2448 indicate an error in the postprocessing task.
2451 def __init__(self
, msg
):
2452 super(PostProcessingError
, self
).__init
__(msg
)
class ExistingVideoReached(YoutubeDLError):
    # Fixed copy-pasted docstring: this exception signals --break-on-existing,
    # not the --max-downloads limit (that is MaxDownloadsReached).
    """--break-on-existing triggered: an already-downloaded video was reached."""
class RejectedVideoReached(YoutubeDLError):
    # Fixed copy-pasted docstring: this exception signals --break-on-reject,
    # not the --max-downloads limit (that is MaxDownloadsReached).
    """--break-on-reject triggered: a video matching the reject filters was reached."""
class MaxDownloadsReached(YoutubeDLError):
    """Raised once the --max-downloads limit has been reached."""
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller
    than the size the server announced first, indicating the connection
    was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Keep both byte counts so callers can inspect the shortfall.
        self.downloaded = downloaded
        self.expected = expected
2497 class XAttrMetadataError(YoutubeDLError
):
2498 def __init__(self
, code
=None, msg
='Unknown error'):
2499 super(XAttrMetadataError
, self
).__init
__(msg
)
2503 # Parsing code and msg
2504 if (self
.code
in (errno
.ENOSPC
, errno
.EDQUOT
)
2505 or 'No space left' in self
.msg
or 'Disk quota exceeded' in self
.msg
):
2506 self
.reason
= 'NO_SPACE'
2507 elif self
.code
== errno
.E2BIG
or 'Argument list too long' in self
.msg
:
2508 self
.reason
= 'VALUE_TOO_LONG'
2510 self
.reason
= 'NOT_SUPPORTED'
class XAttrUnavailableError(YoutubeDLError):
    """Raised when extended file attributes cannot be used on this system."""
    pass
2517 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
2518 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
2519 # expected HTTP responses to meet HTTP/1.0 or later (see also
2520 # https://github.com/ytdl-org/youtube-dl/issues/6727)
2521 if sys
.version_info
< (3, 0):
2522 kwargs
['strict'] = True
2523 hc
= http_class(*args
, **compat_kwargs(kwargs
))
2524 source_address
= ydl_handler
._params
.get('source_address')
2526 if source_address
is not None:
2527 # This is to workaround _create_connection() from socket where it will try all
2528 # address data from getaddrinfo() including IPv6. This filters the result from
2529 # getaddrinfo() based on the source_address value.
2530 # This is based on the cpython socket.create_connection() function.
2531 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
2532 def _create_connection(address
, timeout
=socket
._GLOBAL
_DEFAULT
_TIMEOUT
, source_address
=None):
2533 host
, port
= address
2535 addrs
= socket
.getaddrinfo(host
, port
, 0, socket
.SOCK_STREAM
)
2536 af
= socket
.AF_INET
if '.' in source_address
[0] else socket
.AF_INET6
2537 ip_addrs
= [addr
for addr
in addrs
if addr
[0] == af
]
2538 if addrs
and not ip_addrs
:
2539 ip_version
= 'v4' if af
== socket
.AF_INET
else 'v6'
2541 "No remote IP%s addresses available for connect, can't use '%s' as source address"
2542 % (ip_version
, source_address
[0]))
2543 for res
in ip_addrs
:
2544 af
, socktype
, proto
, canonname
, sa
= res
2547 sock
= socket
.socket(af
, socktype
, proto
)
2548 if timeout
is not socket
._GLOBAL
_DEFAULT
_TIMEOUT
:
2549 sock
.settimeout(timeout
)
2550 sock
.bind(source_address
)
2552 err
= None # Explicitly break reference cycle
2554 except socket
.error
as _
:
2556 if sock
is not None:
2561 raise socket
.error('getaddrinfo returns an empty list')
2562 if hasattr(hc
, '_create_connection'):
2563 hc
._create
_connection
= _create_connection
2564 sa
= (source_address
, 0)
2565 if hasattr(hc
, 'source_address'): # Python 2.7+
2566 hc
.source_address
= sa
2568 def _hc_connect(self
, *args
, **kwargs
):
2569 sock
= _create_connection(
2570 (self
.host
, self
.port
), self
.timeout
, sa
)
2572 self
.sock
= ssl
.wrap_socket(
2573 sock
, self
.key_file
, self
.cert_file
,
2574 ssl_version
=ssl
.PROTOCOL_TLSv1
)
2577 hc
.connect
= functools
.partial(_hc_connect
, hc
)
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker from `headers`.

    When the marker is present, return a new dict with both the marker and
    any Accept-Encoding header removed; otherwise return `headers` itself
    unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    # Rebuild without Accept-Encoding (case-insensitive), then drop the marker.
    filtered = dict(
        (key, value) for key, value in headers.items()
        if key.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
2592 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
2593 """Handler for HTTP requests and responses.
2595 This class, when installed with an OpenerDirector, automatically adds
2596 the standard headers to every HTTP request and handles gzipped and
2597 deflated responses from web servers. If compression is to be avoided in
2598 a particular request, the original request in the program code only has
2599 to include the HTTP header "Youtubedl-no-compression", which will be
2600 removed before making the real request.
2602 Part of this code was copied from:
2604 http://techknack.net/python-urllib2-handlers/
2606 Andrew Rowls, the author of that code, agreed to release it to the
2610 def __init__(self
, params
, *args
, **kwargs
):
2611 compat_urllib_request
.HTTPHandler
.__init
__(self
, *args
, **kwargs
)
2612 self
._params
= params
2614 def http_open(self
, req
):
2615 conn_class
= compat_http_client
.HTTPConnection
2617 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2619 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2620 del req
.headers
['Ytdl-socks-proxy']
2622 return self
.do_open(functools
.partial(
2623 _create_http_connection
, self
, conn_class
, False),
2631 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
2633 return zlib
.decompress(data
)
2635 def http_request(self
, req
):
2636 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
2637 # always respected by websites, some tend to give out URLs with non percent-encoded
2638 # non-ASCII characters (see telemb.py, ard.py [#3412])
2639 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
2640 # To work around aforementioned issue we will replace request's original URL with
2641 # percent-encoded one
2642 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
2643 # the code of this workaround has been moved here from YoutubeDL.urlopen()
2644 url
= req
.get_full_url()
2645 url_escaped
= escape_url(url
)
2647 # Substitute URL if any change after escaping
2648 if url
!= url_escaped
:
2649 req
= update_Request(req
, url
=url_escaped
)
2651 for h
, v
in std_headers
.items():
2652 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
2653 # The dict keys are capitalized because of this bug by urllib
2654 if h
.capitalize() not in req
.headers
:
2655 req
.add_header(h
, v
)
2657 req
.headers
= handle_youtubedl_headers(req
.headers
)
2659 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
2660 # Python 2.6 is brain-dead when it comes to fragments
2661 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
2662 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
2666 def http_response(self
, req
, resp
):
2669 if resp
.headers
.get('Content-encoding', '') == 'gzip':
2670 content
= resp
.read()
2671 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
2673 uncompressed
= io
.BytesIO(gz
.read())
2674 except IOError as original_ioerror
:
2675 # There may be junk add the end of the file
2676 # See http://stackoverflow.com/q/4928560/35070 for details
2677 for i
in range(1, 1024):
2679 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
2680 uncompressed
= io
.BytesIO(gz
.read())
2685 raise original_ioerror
2686 resp
= compat_urllib_request
.addinfourl(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2687 resp
.msg
= old_resp
.msg
2688 del resp
.headers
['Content-encoding']
2690 if resp
.headers
.get('Content-encoding', '') == 'deflate':
2691 gz
= io
.BytesIO(self
.deflate(resp
.read()))
2692 resp
= compat_urllib_request
.addinfourl(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2693 resp
.msg
= old_resp
.msg
2694 del resp
.headers
['Content-encoding']
2695 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
2696 # https://github.com/ytdl-org/youtube-dl/issues/6457).
2697 if 300 <= resp
.code
< 400:
2698 location
= resp
.headers
.get('Location')
2700 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
2701 if sys
.version_info
>= (3, 0):
2702 location
= location
.encode('iso-8859-1').decode('utf-8')
2704 location
= location
.decode('utf-8')
2705 location_escaped
= escape_url(location
)
2706 if location
!= location_escaped
:
2707 del resp
.headers
['Location']
2708 if sys
.version_info
< (3, 0):
2709 location_escaped
= location_escaped
.encode('utf-8')
2710 resp
.headers
['Location'] = location_escaped
2713 https_request
= http_request
2714 https_response
= http_response
2717 def make_socks_conn_class(base_class
, socks_proxy
):
2718 assert issubclass(base_class
, (
2719 compat_http_client
.HTTPConnection
, compat_http_client
.HTTPSConnection
))
2721 url_components
= compat_urlparse
.urlparse(socks_proxy
)
2722 if url_components
.scheme
.lower() == 'socks5':
2723 socks_type
= ProxyType
.SOCKS5
2724 elif url_components
.scheme
.lower() in ('socks', 'socks4'):
2725 socks_type
= ProxyType
.SOCKS4
2726 elif url_components
.scheme
.lower() == 'socks4a':
2727 socks_type
= ProxyType
.SOCKS4A
2729 def unquote_if_non_empty(s
):
2732 return compat_urllib_parse_unquote_plus(s
)
2736 url_components
.hostname
, url_components
.port
or 1080,
2738 unquote_if_non_empty(url_components
.username
),
2739 unquote_if_non_empty(url_components
.password
),
2742 class SocksConnection(base_class
):
2744 self
.sock
= sockssocket()
2745 self
.sock
.setproxy(*proxy_args
)
2746 if type(self
.timeout
) in (int, float):
2747 self
.sock
.settimeout(self
.timeout
)
2748 self
.sock
.connect((self
.host
, self
.port
))
2750 if isinstance(self
, compat_http_client
.HTTPSConnection
):
2751 if hasattr(self
, '_context'): # Python > 2.6
2752 self
.sock
= self
._context
.wrap_socket(
2753 self
.sock
, server_hostname
=self
.host
)
2755 self
.sock
= ssl
.wrap_socket(self
.sock
)
2757 return SocksConnection
2760 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
2761 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
2762 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
2763 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
2764 self
._params
= params
2766 def https_open(self
, req
):
2768 conn_class
= self
._https
_conn
_class
2770 if hasattr(self
, '_context'): # python > 2.6
2771 kwargs
['context'] = self
._context
2772 if hasattr(self
, '_check_hostname'): # python 3.x
2773 kwargs
['check_hostname'] = self
._check
_hostname
2775 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2777 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2778 del req
.headers
['Ytdl-socks-proxy']
2780 return self
.do_open(functools
.partial(
2781 _create_http_connection
, self
, conn_class
, True),
2785 class YoutubeDLCookieJar(compat_cookiejar
.MozillaCookieJar
):
2787 See [1] for cookie file format.
2789 1. https://curl.haxx.se/docs/http-cookies.html
2791 _HTTPONLY_PREFIX
= '#HttpOnly_'
2793 _HEADER
= '''# Netscape HTTP Cookie File
2794 # This file is generated by yt-dlp. Do not edit.
2797 _CookieFileEntry
= collections
.namedtuple(
2799 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
2801 def save(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2803 Save cookies to a file.
2805 Most of the code is taken from CPython 3.8 and slightly adapted
2806 to support cookie files with UTF-8 in both python 2 and 3.
2808 if filename
is None:
2809 if self
.filename
is not None:
2810 filename
= self
.filename
2812 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2814 # Store session cookies with `expires` set to 0 instead of an empty
2817 if cookie
.expires
is None:
2820 with io
.open(filename
, 'w', encoding
='utf-8') as f
:
2821 f
.write(self
._HEADER
)
2824 if not ignore_discard
and cookie
.discard
:
2826 if not ignore_expires
and cookie
.is_expired(now
):
2832 if cookie
.domain
.startswith('.'):
2833 initial_dot
= 'TRUE'
2835 initial_dot
= 'FALSE'
2836 if cookie
.expires
is not None:
2837 expires
= compat_str(cookie
.expires
)
2840 if cookie
.value
is None:
2841 # cookies.txt regards 'Set-Cookie: foo' as a cookie
2842 # with no name, whereas http.cookiejar regards it as a
2843 # cookie with no value.
2848 value
= cookie
.value
2850 '\t'.join([cookie
.domain
, initial_dot
, cookie
.path
,
2851 secure
, expires
, name
, value
]) + '\n')
2853 def load(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2854 """Load cookies from a file."""
2855 if filename
is None:
2856 if self
.filename
is not None:
2857 filename
= self
.filename
2859 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2861 def prepare_line(line
):
2862 if line
.startswith(self
._HTTPONLY
_PREFIX
):
2863 line
= line
[len(self
._HTTPONLY
_PREFIX
):]
2864 # comments and empty lines are fine
2865 if line
.startswith('#') or not line
.strip():
2867 cookie_list
= line
.split('\t')
2868 if len(cookie_list
) != self
._ENTRY
_LEN
:
2869 raise compat_cookiejar
.LoadError('invalid length %d' % len(cookie_list
))
2870 cookie
= self
._CookieFileEntry
(*cookie_list
)
2871 if cookie
.expires_at
and not cookie
.expires_at
.isdigit():
2872 raise compat_cookiejar
.LoadError('invalid expires at %s' % cookie
.expires_at
)
2876 with io
.open(filename
, encoding
='utf-8') as f
:
2879 cf
.write(prepare_line(line
))
2880 except compat_cookiejar
.LoadError
as e
:
2882 'WARNING: skipping cookie file entry due to %s: %r\n'
2883 % (e
, line
), sys
.stderr
)
2886 self
._really
_load
(cf
, filename
, ignore_discard
, ignore_expires
)
2887 # Session cookies are denoted by either `expires` field set to
2888 # an empty string or 0. MozillaCookieJar only recognizes the former
2889 # (see [1]). So we need force the latter to be recognized as session
2890 # cookies on our own.
2891 # Session cookies may be important for cookies-based authentication,
2892 # e.g. usually, when user does not check 'Remember me' check box while
2893 # logging in on a site, some important cookies are stored as session
2894 # cookies so that not recognizing them will result in failed login.
2895 # 1. https://bugs.python.org/issue17164
2897 # Treat `expires=0` cookies as session cookies
2898 if cookie
.expires
== 0:
2899 cookie
.expires
= None
2900 cookie
.discard
= True
2903 class YoutubeDLCookieProcessor(compat_urllib_request
.HTTPCookieProcessor
):
2904 def __init__(self
, cookiejar
=None):
2905 compat_urllib_request
.HTTPCookieProcessor
.__init
__(self
, cookiejar
)
2907 def http_response(self
, request
, response
):
2908 # Python 2 will choke on next HTTP request in row if there are non-ASCII
2909 # characters in Set-Cookie HTTP header of last response (see
2910 # https://github.com/ytdl-org/youtube-dl/issues/6769).
2911 # In order to at least prevent crashing we will percent encode Set-Cookie
2912 # header before HTTPCookieProcessor starts processing it.
2913 # if sys.version_info < (3, 0) and response.headers:
2914 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
2915 # set_cookie = response.headers.get(set_cookie_header)
2917 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
2918 # if set_cookie != set_cookie_escaped:
2919 # del response.headers[set_cookie_header]
2920 # response.headers[set_cookie_header] = set_cookie_escaped
2921 return compat_urllib_request
.HTTPCookieProcessor
.http_response(self
, request
, response
)
2923 https_request
= compat_urllib_request
.HTTPCookieProcessor
.http_request
2924 https_response
= http_response
2927 class YoutubeDLRedirectHandler(compat_urllib_request
.HTTPRedirectHandler
):
2928 if sys
.version_info
[0] < 3:
2929 def redirect_request(self
, req
, fp
, code
, msg
, headers
, newurl
):
2930 # On python 2 urlh.geturl() may sometimes return redirect URL
2931 # as byte string instead of unicode. This workaround allows
2932 # to force it always return unicode.
2933 return compat_urllib_request
.HTTPRedirectHandler
.redirect_request(self
, req
, fp
, code
, msg
, headers
, compat_str(newurl
))
def extract_timezone(date_str):
    """Split a textual timestamp into (utc_offset, remainder).

    Recognizes a trailing 'Z' or '+HH:MM'/'-HHMM' style designator and
    returns it as a datetime.timedelta (zero for 'Z' or when absent),
    together with the date string with that designator removed.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        # No designator found - leave the string untouched.
        return datetime.timedelta(), date_str
    # Strip the matched designator off the end of the string.
    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # A bare 'Z' means UTC, i.e. a zero offset.
        return datetime.timedelta(), date_str
    direction = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(m.group('hours')),
        minutes=direction * int(m.group('minutes')))
    return offset, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # strptime cannot digest fractional seconds - drop them up front.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        fmt = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        parsed = datetime.datetime.strptime(date_str, fmt) - timezone
        return calendar.timegm(parsed.timetuple())
    except ValueError:
        # Unparseable date: fall through and return None implicitly.
        pass
def date_formats(day_first=True):
    """Pick the strptime format list matching day-first or month-first order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
2977 def unified_strdate(date_str
, day_first
=True):
2978 """Return a string with the date in the format YYYYMMDD"""
2980 if date_str
is None:
2984 date_str
= date_str
.replace(',', ' ')
2985 # Remove AM/PM + timezone
2986 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
2987 _
, date_str
= extract_timezone(date_str
)
2989 for expression
in date_formats(day_first
):
2991 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
2994 if upload_date
is None:
2995 timetuple
= email
.utils
.parsedate_tz(date_str
)
2998 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
3001 if upload_date
is not None:
3002 return compat_str(upload_date
)
3005 def unified_timestamp(date_str
, day_first
=True):
3006 if date_str
is None:
3009 date_str
= re
.sub(r
'[,|]', '', date_str
)
3011 pm_delta
= 12 if re
.search(r
'(?i)PM', date_str
) else 0
3012 timezone
, date_str
= extract_timezone(date_str
)
3014 # Remove AM/PM + timezone
3015 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
3017 # Remove unrecognized timezones from ISO 8601 alike timestamps
3018 m
= re
.search(r
'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str
)
3020 date_str
= date_str
[:-len(m
.group('tz'))]
3022 # Python only supports microseconds, so remove nanoseconds
3023 m
= re
.search(r
'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str
)
3025 date_str
= m
.group(1)
3027 for expression
in date_formats(day_first
):
3029 dt
= datetime
.datetime
.strptime(date_str
, expression
) - timezone
+ datetime
.timedelta(hours
=pm_delta
)
3030 return calendar
.timegm(dt
.timetuple())
3033 timetuple
= email
.utils
.parsedate_tz(date_str
)
3035 return calendar
.timegm(timetuple
) + pm_delta
* 3600
def determine_ext(url, default_ext='unknown_video'):
    """Guess a media file extension from a URL, or return *default_ext*."""
    if url is None or '.' not in url:
        return default_ext
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build the subtitle filename: <base>.<sub_lang>.<sub_format>."""
    suffix = sub_lang + '.' + sub_format
    return replace_extension(filename, suffix, expected_real_ext)
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    rel = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if rel is not None:
        amount = int(rel.group('time'))
        if rel.group('sign') == '-':
            amount = -amount
        unit = rel.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit, amount = 'day', amount * 30
        elif unit == 'year':
            unit, amount = 'day', amount * 365
        # timedelta only knows plural keyword arguments (days=, weeks=, ...).
        return today + datetime.timedelta(**{unit + 's': amount})
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    m = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    # Anything that is not a plain 8-digit date is passed through untouched.
    return '-'.join(m.groups()) if m is not None else date_str
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing endpoints default to the widest representable range.
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
3125 def platform_name():
3126 """ Returns the platform name as a compat_str """
3127 res
= platform
.platform()
3128 if isinstance(res
, bytes):
3129 res
= res
.decode(preferredencoding())
3131 assert isinstance(res
, compat_str
)
3135 def _windows_write_string(s
, out
):
3136 """ Returns True if the string was written using special methods,
3137 False if it has yet to be written out."""
3138 # Adapted from http://stackoverflow.com/a/3259271/35070
3141 import ctypes
.wintypes
3149 fileno
= out
.fileno()
3150 except AttributeError:
3151 # If the output stream doesn't have a fileno, it's virtual
3153 except io
.UnsupportedOperation
:
3154 # Some strange Windows pseudo files?
3156 if fileno
not in WIN_OUTPUT_IDS
:
3159 GetStdHandle
= compat_ctypes_WINFUNCTYPE(
3160 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
3161 ('GetStdHandle', ctypes
.windll
.kernel32
))
3162 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
3164 WriteConsoleW
= compat_ctypes_WINFUNCTYPE(
3165 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
3166 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
3167 ctypes
.wintypes
.LPVOID
)(('WriteConsoleW', ctypes
.windll
.kernel32
))
3168 written
= ctypes
.wintypes
.DWORD(0)
3170 GetFileType
= compat_ctypes_WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)(('GetFileType', ctypes
.windll
.kernel32
))
3171 FILE_TYPE_CHAR
= 0x0002
3172 FILE_TYPE_REMOTE
= 0x8000
3173 GetConsoleMode
= compat_ctypes_WINFUNCTYPE(
3174 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
3175 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
3176 ('GetConsoleMode', ctypes
.windll
.kernel32
))
3177 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
3179 def not_a_console(handle
):
3180 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
3182 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
3183 or GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
3185 if not_a_console(h
):
3188 def next_nonbmp_pos(s
):
3190 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
3191 except StopIteration:
3195 count
= min(next_nonbmp_pos(s
), 1024)
3197 ret
= WriteConsoleW(
3198 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
3200 raise OSError('Failed to write string')
3201 if not count
: # We just wrote a non-BMP character
3202 assert written
.value
== 2
3205 assert written
.value
> 0
3206 s
= s
[written
.value
:]
3210 def write_string(s
, out
=None, encoding
=None):
3213 assert type(s
) == compat_str
3215 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
3216 if _windows_write_string(s
, out
):
3219 if ('b' in getattr(out
, 'mode', '')
3220 or sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
3221 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
3223 elif hasattr(out
, 'buffer'):
3224 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
3225 byt
= s
.encode(enc
, 'ignore')
3226 out
.buffer.write(byt
)
3232 def bytes_to_intlist(bs
):
3235 if isinstance(bs
[0], int): # Python 3
3238 return [ord(c
) for c
in bs
]
3241 def intlist_to_bytes(xs
):
3244 return compat_struct_pack('%dB' % len(xs
), *xs
)
3247 # Cross-platform file locking
3248 if sys
.platform
== 'win32':
3249 import ctypes
.wintypes
3252 class OVERLAPPED(ctypes
.Structure
):
3254 ('Internal', ctypes
.wintypes
.LPVOID
),
3255 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
3256 ('Offset', ctypes
.wintypes
.DWORD
),
3257 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
3258 ('hEvent', ctypes
.wintypes
.HANDLE
),
3261 kernel32
= ctypes
.windll
.kernel32
3262 LockFileEx
= kernel32
.LockFileEx
3263 LockFileEx
.argtypes
= [
3264 ctypes
.wintypes
.HANDLE
, # hFile
3265 ctypes
.wintypes
.DWORD
, # dwFlags
3266 ctypes
.wintypes
.DWORD
, # dwReserved
3267 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3268 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3269 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3271 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
3272 UnlockFileEx
= kernel32
.UnlockFileEx
3273 UnlockFileEx
.argtypes
= [
3274 ctypes
.wintypes
.HANDLE
, # hFile
3275 ctypes
.wintypes
.DWORD
, # dwReserved
3276 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3277 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3278 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3280 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
3281 whole_low
= 0xffffffff
3282 whole_high
= 0x7fffffff
3284 def _lock_file(f
, exclusive
):
3285 overlapped
= OVERLAPPED()
3286 overlapped
.Offset
= 0
3287 overlapped
.OffsetHigh
= 0
3288 overlapped
.hEvent
= 0
3289 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
3290 handle
= msvcrt
.get_osfhandle(f
.fileno())
3291 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
3292 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3293 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
3295 def _unlock_file(f
):
3296 assert f
._lock
_file
_overlapped
_p
3297 handle
= msvcrt
.get_osfhandle(f
.fileno())
3298 if not UnlockFileEx(handle
, 0,
3299 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3300 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
3303 # Some platforms, such as Jython, is missing fcntl
3307 def _lock_file(f
, exclusive
):
3308 fcntl
.flock(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
3310 def _unlock_file(f
):
3311 fcntl
.flock(f
, fcntl
.LOCK_UN
)
3313 UNSUPPORTED_MSG
= 'file locking is not supported on this platform'
3315 def _lock_file(f
, exclusive
):
3316 raise IOError(UNSUPPORTED_MSG
)
3318 def _unlock_file(f
):
3319 raise IOError(UNSUPPORTED_MSG
)
3322 class locked_file(object):
3323 def __init__(self
, filename
, mode
, encoding
=None):
3324 assert mode
in ['r', 'a', 'w']
3325 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
3328 def __enter__(self
):
3329 exclusive
= self
.mode
!= 'r'
3331 _lock_file(self
.f
, exclusive
)
3337 def __exit__(self
, etype
, value
, traceback
):
3339 _unlock_file(self
.f
)
3346 def write(self
, *args
):
3347 return self
.f
.write(*args
)
3349 def read(self
, *args
):
3350 return self
.f
.read(*args
)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8' when unknown."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
3358 def shell_quote(args
):
3360 encoding
= get_filesystem_encoding()
3362 if isinstance(a
, bytes):
3363 # We may get a filename encoded with 'encodeFilename'
3364 a
= a
.decode(encoding
)
3365 quoted_args
.append(compat_shlex_quote(a
))
3366 return ' '.join(quoted_args
)
3369 def smuggle_url(url
, data
):
3370 """ Pass additional data in a URL for internal use. """
3372 url
, idata
= unsmuggle_url(url
, {})
3374 sdata
= compat_urllib_parse_urlencode(
3375 {'__youtubedl_smuggle': json.dumps(data)}
)
3376 return url
+ '#' + sdata
3379 def unsmuggle_url(smug_url
, default
=None):
3380 if '#__youtubedl_smuggle' not in smug_url
:
3381 return smug_url
, default
3382 url
, _
, sdata
= smug_url
.rpartition('#')
3383 jsond
= compat_parse_qs(sdata
)['__youtubedl_smuggle'][0]
3384 data
= json
.loads(jsond
)
def format_bytes(bytes):
    """Render a byte count as a human-readable binary-unit string ('N/A' if None)."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # math.log(0) is undefined - zero bytes stays in the 'B' bucket.
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    units = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    return '%.2f%s' % (float(bytes) / float(1024 ** exponent), units[exponent])
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' using *unit_table* (unit -> multiplier); None on no match."""
    alternatives = '|'.join(re.escape(unit) for unit in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives, s)
    if m is None:
        return None
    # Accept a comma as the decimal separator as well.
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
3413 def parse_filesize(s
):
3417 # The lower-case forms are of course incorrect and unofficial,
3418 # but we support those too
3435 'megabytes': 1000 ** 2,
3436 'mebibytes': 1024 ** 2,
3442 'gigabytes': 1000 ** 3,
3443 'gibibytes': 1024 ** 3,
3449 'terabytes': 1000 ** 4,
3450 'tebibytes': 1024 ** 4,
3456 'petabytes': 1000 ** 5,
3457 'pebibytes': 1024 ** 5,
3463 'exabytes': 1000 ** 6,
3464 'exbibytes': 1024 ** 6,
3470 'zettabytes': 1000 ** 7,
3471 'zebibytes': 1024 ** 7,
3477 'yottabytes': 1000 ** 8,
3478 'yobibytes': 1024 ** 8,
3481 return lookup_unit_table(_UNIT_TABLE
, s
)
3490 if re
.match(r
'^[\d,.]+$', s
):
3491 return str_to_int(s
)
3502 return lookup_unit_table(_UNIT_TABLE
, s
)
def parse_resolution(s):
    """Parse 'WxH', '<N>p'/'<N>i' or '4k'/'8k' notation into a width/height dict."""
    if s is None:
        return {}

    m = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
    if m:
        return {'width': int(m.group('w')), 'height': int(m.group('h'))}

    m = re.search(r'\b(\d+)[pPiI]\b', s)
    if m:
        return {'height': int(m.group(1))}

    m = re.search(r'\b([48])[kK]\b', s)
    if m:
        # 4k -> 2160, 8k -> 4320 (540 lines per 'k')
        return {'height': int(m.group(1)) * 540}

    return {}
3527 def parse_bitrate(s
):
3528 if not isinstance(s
, compat_str
):
3530 mobj
= re
.search(r
'\b(\d+)\s*kbps', s
)
3532 return int(mobj
.group(1))
3535 def month_by_name(name
, lang
='en'):
3536 """ Return the number of a month by (locale-independently) English name """
3538 month_names
= MONTH_NAMES
.get(lang
, MONTH_NAMES
['en'])
3541 return month_names
.index(name
) + 1
3546 def month_by_abbreviation(abbrev
):
3547 """ Return the number of a month by (locale-independently) English
3551 return [s
[:3] for s
in ENGLISH_MONTH_NAMES
].index(abbrev
) + 1
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' that does not start an XML entity by '&amp;'."""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
3564 def setproctitle(title
):
3565 assert isinstance(title
, compat_str
)
3567 # ctypes in Jython is not complete
3568 # http://bugs.jython.org/issue2148
3569 if sys
.platform
.startswith('java'):
3573 libc
= ctypes
.cdll
.LoadLibrary('libc.so.6')
3577 # LoadLibrary in Windows Python 2.7.13 only expects
3578 # a bytestring, but since unicode_literals turns
3579 # every string into a unicode string, it fails.
3581 title_bytes
= title
.encode('utf-8')
3582 buf
= ctypes
.create_string_buffer(len(title_bytes
))
3583 buf
.value
= title_bytes
3585 libc
.prctl(15, buf
, 0, 0, 0)
3586 except AttributeError:
3587 return # Strange libc, just skip this
def remove_start(s, start):
    """Drop the prefix *start* from *s* when present (None-safe)."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Drop the suffix *end* from *s* when present (None-safe).

    Fix: an empty *end* is now a no-op. Previously s.endswith('') was
    always true and s[:-len('')] evaluated to s[:0], wrongly collapsing
    the whole string to ''.
    """
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Strip one layer of matching single or double quotes from *s*."""
    if s is None or len(s) < 2:
        return s
    # Only strip when the first and last characters are the same quote kind.
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
def get_domain(url):
    """Extract the bare domain ('example.com') from *url*, or None."""
    m = re.match(
        r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?',
        url)
    if m is None:
        return None
    return m.group('domain')
def url_basename(url):
    """Return the last component of the URL's path ('' when the path is empty)."""
    parsed_path = compat_urlparse.urlparse(url).path
    return parsed_path.strip('/').split('/')[-1]
3618 return re
.match(r
'https?://[^?#&]+/', url
).group()
def urljoin(base, path):
    """Join *base* and *path*, returning None unless *path* is usable and
    *base* is an http(s) or protocol-relative URL."""
    def as_text(s):
        return s.decode('utf-8') if isinstance(s, bytes) else s

    path = as_text(path)
    if not isinstance(path, compat_str) or not path:
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        # Already absolute (possibly protocol-relative) - nothing to join.
        return path
    base = as_text(base)
    if not isinstance(base, compat_str):
        return None
    if not re.match(r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
3636 class HEADRequest(compat_urllib_request
.Request
):
3637 def get_method(self
):
3641 class PUTRequest(compat_urllib_request
.Request
):
3642 def get_method(self
):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """int(v) * invscale // scale, or *default* when conversion fails.

    When *get_attr* is given, the named attribute of *v* is converted instead.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        return default
def str_or_none(v, default=None):
    """Stringify *v* with compat_str, mapping None to *default*."""
    if v is None:
        return default
    return compat_str(v)
3664 def str_to_int(int_str
):
3665 """ A more relaxed version of int_or_none """
3666 if isinstance(int_str
, compat_integer_types
):
3668 elif isinstance(int_str
, compat_str
):
3669 int_str
= re
.sub(r
'[,\.\+]', '', int_str
)
3670 return int_or_none(int_str
)
def float_or_none(v, scale=1, invscale=1, default=None):
    """float(v) * invscale / scale, or *default* when *v* is None or invalid."""
    if v is None:
        return default
    try:
        result = float(v) * invscale / scale
    except (ValueError, TypeError):
        result = default
    return result
def bool_or_none(v, default=None):
    """Pass through real booleans; anything else becomes *default*."""
    if isinstance(v, bool):
        return v
    return default
def strip_or_none(v, default=None):
    """Return v.strip() for string input; anything else becomes *default*."""
    if isinstance(v, compat_str):
        return v.strip()
    return default
3690 def url_or_none(url
):
3691 if not url
or not isinstance(url
, compat_str
):
3694 return url
if re
.match(r
'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url
) else None
3697 def strftime_or_none(timestamp
, date_format
, default
=None):
3698 datetime_object
= None
3700 if isinstance(timestamp
, compat_numeric_types
): # unix timestamp
3701 datetime_object
= datetime
.datetime
.utcfromtimestamp(timestamp
)
3702 elif isinstance(timestamp
, compat_str
): # assume YYYYMMDD
3703 datetime_object
= datetime
.datetime
.strptime(timestamp
, '%Y%m%d')
3704 return datetime_object
.strftime(date_format
)
3705 except (ValueError, TypeError, AttributeError):
3709 def parse_duration(s
):
3710 if not isinstance(s
, compat_basestring
):
3715 days
, hours
, mins
, secs
, ms
= [None] * 5
3716 m
= re
.match(r
'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s
)
3718 days
, hours
, mins
, secs
, ms
= m
.groups()
3723 [0-9]+\s*y(?:ears?)?\s*
3726 [0-9]+\s*m(?:onths?)?\s*
3729 [0-9]+\s*w(?:eeks?)?\s*
3732 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
3736 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
3739 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
3742 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
3745 days
, hours
, mins
, secs
, ms
= m
.groups()
3747 m
= re
.match(r
'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s
)
3749 hours
, mins
= m
.groups()
3755 duration
+= float(secs
)
3757 duration
+= float(mins
) * 60
3759 duration
+= float(hours
) * 60 * 60
3761 duration
+= float(days
) * 24 * 60 * 60
3763 duration
+= float(ms
)
3767 def prepend_extension(filename
, ext
, expected_real_ext
=None):
3768 name
, real_ext
= os
.path
.splitext(filename
)
3770 '{0}.{1}{2}'.format(name
, ext
, real_ext
)
3771 if not expected_real_ext
or real_ext
[1:] == expected_real_ext
3772 else '{0}.{1}'.format(filename
, ext
))
3775 def replace_extension(filename
, ext
, expected_real_ext
=None):
3776 name
, real_ext
= os
.path
.splitext(filename
)
3777 return '{0}.{1}'.format(
3778 name
if not expected_real_ext
or real_ext
[1:] == expected_real_ext
else filename
,
3782 def check_executable(exe
, args
=[]):
3783 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
3784 args can be a list of arguments for a short output (like -version) """
3786 process_communicate_or_kill(subprocess
.Popen(
3787 [exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
))
3793 def get_exe_version(exe
, args
=['--version'],
3794 version_re
=None, unrecognized
='present'):
3795 """ Returns the version of the specified executable,
3796 or False if the executable is not present """
3798 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
3799 # SIGTTOU if yt-dlp is run in the background.
3800 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
3801 out
, _
= process_communicate_or_kill(subprocess
.Popen(
3802 [encodeArgument(exe
)] + args
,
3803 stdin
=subprocess
.PIPE
,
3804 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
))
3807 if isinstance(out
, bytes): # Python 2.x
3808 out
= out
.decode('ascii', 'ignore')
3809 return detect_exe_version(out
, version_re
, unrecognized
)
3812 def detect_exe_version(output
, version_re
=None, unrecognized
='present'):
3813 assert isinstance(output
, compat_str
)
3814 if version_re
is None:
3815 version_re
= r
'version\s+([-0-9._a-zA-Z]+)'
3816 m
= re
.search(version_re
, output
)
3823 class PagedList(object):
3825 # This is only useful for tests
3826 return len(self
.getslice())
3829 class OnDemandPagedList(PagedList
):
3830 def __init__(self
, pagefunc
, pagesize
, use_cache
=True):
3831 self
._pagefunc
= pagefunc
3832 self
._pagesize
= pagesize
3833 self
._use
_cache
= use_cache
3837 def getslice(self
, start
=0, end
=None):
3839 for pagenum
in itertools
.count(start
// self
._pagesize
):
3840 firstid
= pagenum
* self
._pagesize
3841 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
3842 if start
>= nextfirstid
:
3847 page_results
= self
._cache
.get(pagenum
)
3848 if page_results
is None:
3849 page_results
= list(self
._pagefunc
(pagenum
))
3851 self
._cache
[pagenum
] = page_results
3854 start
% self
._pagesize
3855 if firstid
<= start
< nextfirstid
3859 ((end
- 1) % self
._pagesize
) + 1
3860 if (end
is not None and firstid
<= end
<= nextfirstid
)
3863 if startv
!= 0 or endv
is not None:
3864 page_results
= page_results
[startv
:endv
]
3865 res
.extend(page_results
)
3867 # A little optimization - if current page is not "full", ie. does
3868 # not contain page_size videos then we can assume that this page
3869 # is the last one - there are no more ids on further pages -
3870 # i.e. no need to query again.
3871 if len(page_results
) + startv
< self
._pagesize
:
3874 # If we got the whole page, but the next page is not interesting,
3875 # break out early as well
3876 if end
== nextfirstid
:
3881 class InAdvancePagedList(PagedList
):
3882 def __init__(self
, pagefunc
, pagecount
, pagesize
):
3883 self
._pagefunc
= pagefunc
3884 self
._pagecount
= pagecount
3885 self
._pagesize
= pagesize
3887 def getslice(self
, start
=0, end
=None):
3889 start_page
= start
// self
._pagesize
3891 self
._pagecount
if end
is None else (end
// self
._pagesize
+ 1))
3892 skip_elems
= start
- start_page
* self
._pagesize
3893 only_more
= None if end
is None else end
- start
3894 for pagenum
in range(start_page
, end_page
):
3895 page
= list(self
._pagefunc
(pagenum
))
3897 page
= page
[skip_elems
:]
3899 if only_more
is not None:
3900 if len(page
) < only_more
:
3901 only_more
-= len(page
)
3903 page
= page
[:only_more
]
3910 def uppercase_escape(s
):
3911 unicode_escape
= codecs
.getdecoder('unicode_escape')
3913 r
'\\U[0-9a-fA-F]{8}',
3914 lambda m
: unicode_escape(m
.group(0))[0],
3918 def lowercase_escape(s
):
3919 unicode_escape
= codecs
.getdecoder('unicode_escape')
3921 r
'\\u[0-9a-fA-F]{4}',
3922 lambda m
: unicode_escape(m
.group(0))[0],
3926 def escape_rfc3986(s
):
3927 """Escape non-ASCII characters as suggested by RFC 3986"""
3928 if sys
.version_info
< (3, 0) and isinstance(s
, compat_str
):
3929 s
= s
.encode('utf-8')
3930 return compat_urllib_parse
.quote(s
, b
"%/;:@&=+$,!~*'()?#[]")
3933 def escape_url(url
):
3934 """Escape URL as suggested by RFC 3986"""
3935 url_parsed
= compat_urllib_parse_urlparse(url
)
3936 return url_parsed
._replace
(
3937 netloc
=url_parsed
.netloc
.encode('idna').decode('ascii'),
3938 path
=escape_rfc3986(url_parsed
.path
),
3939 params
=escape_rfc3986(url_parsed
.params
),
3940 query
=escape_rfc3986(url_parsed
.query
),
3941 fragment
=escape_rfc3986(url_parsed
.fragment
)
3945 def read_batch_urls(batch_fd
):
3947 if not isinstance(url
, compat_str
):
3948 url
= url
.decode('utf-8', 'replace')
3949 BOM_UTF8
= ('\xef\xbb\xbf', '\ufeff')
3950 for bom
in BOM_UTF8
:
3951 if url
.startswith(bom
):
3952 url
= url
[len(bom
):]
3954 if not url
or url
.startswith(('#', ';', ']')):
3956 # "#" cannot be stripped out since it is part of the URI
3957 # However, it can be safely stipped out if follwing a whitespace
3958 return re
.split(r
'\s#', url
, 1)[0].rstrip()
3960 with contextlib
.closing(batch_fd
) as fd
:
3961 return [url
for url
in map(fixup
, fd
) if url
]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
3968 def update_url_query(url
, query
):
3971 parsed_url
= compat_urlparse
.urlparse(url
)
3972 qs
= compat_parse_qs(parsed_url
.query
)
3974 return compat_urlparse
.urlunparse(parsed_url
._replace
(
3975 query
=compat_urllib_parse_urlencode(qs
, True)))
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Clone *req*, preserving its HTTP method (HEAD/PUT/other), with an
    updated URL, body, extra headers and/or extra query parameters.

    Fix: *headers* and *query* previously used mutable {} defaults (a
    shared-state hazard); None defaults behave identically for callers.
    """
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query or {})
    req_get_method = req.get_method()
    # Pick a Request subclass that reproduces the original method.
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
def _multipart_encode_impl(data, boundary):
    """Build a multipart/form-data body from dict `data` using `boundary`.

    Returns (body_bytes, content_type). Raises ValueError if the boundary
    occurs inside any encoded field, which would corrupt the message.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')

    out = b''
    for name, value in data.items():
        out += b'--' + boundary_bytes + b'\r\n'
        if isinstance(name, compat_str):
            name = name.encode('utf-8')
        if isinstance(value, compat_str):
            value = value.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        part = (b'Content-Disposition: form-data; name="' + name
                + b'"\r\n\r\n' + value + b'\r\n')
        if boundary_bytes in part:
            raise ValueError('Boundary overlaps with data')
        out += part

    out += b'--' + boundary_bytes + b'--\r\n'
    return out, content_type
def multipart_encode(data, boundary=None):
    """
    Encode a dict to RFC 7578-compliant form-data.

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    """
    caller_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(
                random.randrange(0x0fffffff, 0xffffffff))
        try:
            body, content_type = _multipart_encode_impl(data, boundary)
        except ValueError:
            if caller_boundary:
                # A caller-supplied boundary that clashes with the payload
                # is an error the caller must handle.
                raise
            boundary = None  # retry with a fresh random boundary
        else:
            return body, content_type
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key — or the first usable of several keys — in dict `d`.

    A candidate is skipped when it is missing, None, or (when
    skip_false_values is set) falsy; `default` is returned if none match.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for candidate in key_or_keys:
        if candidate not in d:
            continue
        value = d[candidate]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
def try_get(src, getter, expected_type=None):
    """Apply one or more getter callables to `src`, returning the first
    result that neither raises (AttributeError/KeyError/TypeError/
    IndexError) nor fails the `expected_type` check; otherwise None."""
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for fn in getters:
        try:
            value = fn(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
def merge_dicts(*dicts):
    """Merge dicts left to right; earlier dicts win, except that a
    non-empty string value may replace an empty string set earlier.
    None values are never stored."""
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if v is None:
                continue
            should_store = (
                k not in merged
                or (isinstance(v, compat_str) and v
                    and isinstance(merged[k], compat_str)
                    and not merged[k]))
            if should_store:
                merged[k] = v
    return merged
def encode_compat_str(string, encoding=None, errors='strict'):
    """Coerce `string` to compat_str (text), decoding with `encoding`
    (default: the system's preferred encoding) when it isn't text already.

    The previous signature evaluated `preferredencoding()` once at import
    time (call-in-default-argument antipattern); resolving it per call
    avoids import-time work and picks up locale changes.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding or preferredencoding(), errors)
4100 TV_PARENTAL_GUIDELINES
= {
def parse_age_limit(s):
    """Parse an age limit from an int (0..21), a string like '18' or '18+',
    an MPAA rating (US_RATINGS), or a TV parental guideline such as
    'TV-MA' / 'TV_14'. Returns None when nothing matches."""
    if type(s) == int:  # exact type check: bool must not be accepted
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    age_match = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if age_match:
        return int(age_match.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    tv_match = re.match(
        r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if tv_match:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_match.group(1)]
    return None
def strip_jsonp(code):
    """Strip a JSONP wrapper (e.g. 'callback({...});') and return the
    raw JSON payload."""
    jsonp_wrapper = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_wrapper, r'\g<callback_data>', code)
def js_to_json(code, vars={}):
    """Convert a JavaScript object literal into valid JSON text.

    Handles comments, single-quoted strings, unquoted keys, hex/octal
    integer literals and trailing commas.

    vars: dict of identifier -> replacement (JSON) value to substitute.
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        v = m.group(0)
        # Keywords pass through unchanged.
        if v in ('true', 'false', 'null'):
            return v
        # Comments, '!' runs and stray commas are dropped.
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ''

        if v[0] in ("'", '"'):
            # Re-escape the string body for JSON.
            v = re.sub(r'(?s)\\.|"', lambda esc: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(esc.group(0), esc.group(0)), v[1:-1])
        else:
            # Bare token: try hex/octal integer, then a vars substitution.
            for regex, base in INTEGER_TABLE:
                int_match = re.match(regex, v)
                if int_match:
                    i = int(int_match.group(1), base)
                    return '"%d":' % i if v.endswith(':') else '%d' % i

            if v in vars:
                return vars[v]

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def rank(qid):
        # Position in the list is the quality value; unknown ids rank -1.
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return rank
4194 'default': '%(title)s [%(id)s].%(ext)s',
4195 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
4201 'description': 'description',
4202 'annotation': 'annotations.xml',
4203 'infojson': 'info.json',
4204 'pl_description': 'description',
4205 'pl_infojson': 'info.json',
4208 # As of [1] format syntax is:
4209 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
4210 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
4211 FORMAT_RE
= r
'''(?x)
4214 \({0}\) # mapping key
4215 (?:[#0\-+ ]+)? # conversion flags (optional)
4216 (?:\d+)? # minimum field width (optional)
4217 (?:\.\d+)? # precision (optional)
4218 [hlL]? # length modifier (optional)
4219 (?P<type>[diouxXeEfFgGcrs%]) # conversion type
def limit_length(s, length):
    """Truncate `s` to at most `length` characters, ending with '...'
    when truncation occurs. None passes through unchanged."""
    if s is None:
        return None
    suffix = '...'
    if len(s) <= length:
        return s
    return s[:length - len(suffix)] + suffix
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
def is_outdated_version(version, limit, assume_new=True):
    """Compare dotted version strings; True when `version` < `limit`.
    Missing or unparseable input yields `not assume_new`."""
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """
    from zipimport import zipimporter
    # Only the zip bundle and frozen (PyInstaller-style) builds self-update.
    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_from_zip or hasattr(sys, 'frozen')
def args_to_str(args):
    """Return a shell-quoted one-line representation of a subprocess
    command (for logging only, not for execution)."""
    return ' '.join(map(compat_shlex_quote, args))
def error_to_compat_str(err):
    """Return the text of an exception as a native text string."""
    text = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        text = text.decode(preferredencoding())
    return text
4269 def mimetype2ext(mt
):
4275 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
4276 # it's the most popular one
4277 'audio/mpeg': 'mp3',
4278 'audio/x-wav': 'wav',
4283 _
, _
, res
= mt
.rpartition('/')
4284 res
= res
.split(';')[0].strip().lower()
4288 'smptett+xml': 'tt',
4292 'x-mp4-fragmented': 'mp4',
4293 'x-ms-sami': 'sami',
4296 'x-mpegurl': 'm3u8',
4297 'vnd.apple.mpegurl': 'm3u8',
4301 'vnd.ms-sstr+xml': 'ism',
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs string into {'vcodec': ..., 'acodec': ...}.

    Reference: http://tools.ietf.org/html/rfc6381
    """
    if not codecs_str:
        return {}
    split_codecs = [
        stripped for stripped in
        (c.strip() for c in codecs_str.strip().strip(',').split(','))
        if stripped]
    VIDEO_FAMILIES = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora')
    AUDIO_FAMILIES = ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    vcodec = acodec = None
    for full_codec in split_codecs:
        family = full_codec.split('.')[0]
        if family in VIDEO_FAMILIES:
            if not vcodec:
                vcodec = full_codec
        elif family in AUDIO_FAMILIES:
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    # Nothing recognized: with exactly two entries, assume video + audio.
    if len(split_codecs) == 2:
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a response: the Content-Disposition
    filename first, then the Content-Type mime type."""
    header = url_handle.headers.get

    disposition = header('Content-Disposition')
    if disposition:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if m:
            ext = determine_ext(m.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(header('Content-Type'))
def encode_data_uri(data, mime_type):
    """Encode bytes `data` as a base64 'data:' URI with `mime_type`."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:
        return False  # No limit set
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # BOM table: (marker, codec); longer markers come first so that the
    # utf-32 BOMs are not mistaken for their utf-16 prefixes.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    text = None
    for marker, codec in BOMS:
        if first_bytes.startswith(marker):
            text = first_bytes[len(marker):].decode(codec, 'replace')
            break
    if text is None:
        text = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', text)
def determine_protocol(info_dict):
    """Work out the download protocol for `info_dict`: an explicit
    'protocol' entry wins, then a URL prefix (rtmp/mms/rtsp), then the
    file extension (m3u8/f4m), then the URL scheme."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data, delim=False, extraGap=0, hideEmpty=False):
    """ Render a list of rows, each as a list of values """

    def column_widths(rows):
        # Widest cell (as text) in every column.
        return [max(len(compat_str(cell)) for cell in col) for col in zip(*rows)]

    def keep_columns(row, mask):
        return [cell for keep, cell in zip(mask, row) if keep]

    if hideEmpty:
        # Drop columns whose every data cell is empty (width 0 masks them).
        widths = column_widths(data)
        header_row = keep_columns(header_row, widths)
        data = [keep_columns(row, widths) for row in data]

    table = [header_row] + data
    widths = column_widths(table)
    if delim:
        table = [header_row] + [['-' * w for w in widths]] + data
    format_str = ' '.join('%-' + compat_str(w + extraGap) + 's' for w in widths[:-1]) + ' %s'
    return '\n'.join(format_str % tuple(row) for row in table)
4431 def _match_one(filter_part
, dct
):
4432 COMPARISON_OPERATORS
= {
4440 operator_rex
= re
.compile(r
'''(?x)\s*
4442 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
4444 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
4445 (?P<quote>["\'])(?P
<quotedstrval
>(?
:\\.|
(?
!(?P
=quote
)|
\\).)+?
)(?P
=quote
)|
4446 (?P
<strval
>(?
![0-9.])[a
-z0
-9A
-Z
]*)
4449 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
4450 m = operator_rex.search(filter_part)
4452 op = COMPARISON_OPERATORS[m.group('op')]
4453 actual_value = dct.get(m.group('key'))
4454 if (m.group('quotedstrval') is not None
4455 or m.group('strval') is not None
4456 # If the original field is a string and matching comparisonvalue is
4457 # a number we should respect the origin of the original field
4458 # and process comparison value as a string (see
4459 # https://github.com/ytdl-org/youtube-dl/issues/11082).
4460 or actual_value is not None and m.group('intval') is not None
4461 and isinstance(actual_value, compat_str)):
4462 if m.group('op') not in ('=', '!='):
4464 'Operator %s does not support string values!' % m.group('op'))
4465 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
4466 quote = m.group('quote')
4467 if quote is not None:
4468 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
4471 comparison_value = int(m.group('intval'))
4473 comparison_value = parse_filesize(m.group('intval'))
4474 if comparison_value is None:
4475 comparison_value = parse_filesize(m.group('intval') + 'B')
4476 if comparison_value is None:
4478 'Invalid integer value %r in filter part %r' % (
4479 m.group('intval'), filter_part))
4480 if actual_value is None:
4481 return m.group('none_inclusive')
4482 return op(actual_value, comparison_value)
4485 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
4486 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
4488 operator_rex = re.compile(r'''(?x
)\s
*
4489 (?P
<op
>%s)\s
*(?P
<key
>[a
-z_
]+)
4491 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
4492 m = operator_rex.search(filter_part)
4494 op = UNARY_OPERATORS[m.group('op')]
4495 actual_value = dct.get(m.group('key'))
4496 return op(actual_value)
4498 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    parts = filter_str.split('&')
    return all(_match_one(part, dct) for part in parts)
def match_filter_func(filter_str):
    """Build a --match-filter callable: returns None when the video passes
    the filter, otherwise a human-readable skip message."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('12.5s' or 'HH:MM:SS.mmm') into
    seconds (float); returns None for empty or unrecognized input."""
    if not time_expr:
        return None

    m = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if m:
        return float(m.group('time_offset'))

    m = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if m:
        hours, minutes = int(m.group(1)), int(m.group(2))
        seconds = float(m.group(3).replace(':', '.'))
        return 3600 * hours + 60 * minutes + seconds
def srt_subtitles_timecode(seconds):
    """Format a second offset as an SRT timecode: HH:MM:SS,mmm."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
4535 def dfxp2srt(dfxp_data):
4537 @param dfxp_data A
bytes-like
object containing DFXP data
4538 @returns A
unicode object containing converted SRT data
4540 LEGACY_NAMESPACES = (
4541 (b'http://www.w3.org/ns/ttml', [
4542 b'http://www.w3.org/2004/11/ttaf1',
4543 b'http://www.w3.org/2006/04/ttaf1',
4544 b'http://www.w3.org/2006/10/ttaf1',
4546 (b'http://www.w3.org/ns/ttml#styling', [
4547 b'http://www.w3.org/ns/ttml#style',
4551 SUPPORTED_STYLING = [
4560 _x = functools.partial(xpath_with_ns, ns_map={
4561 'xml': 'http://www.w3.org/XML/1998/namespace',
4562 'ttml': 'http://www.w3.org/ns/ttml',
4563 'tts': 'http://www.w3.org/ns/ttml#styling',
4569 class TTMLPElementParser(object):
4571 _unclosed_elements = []
4572 _applied_styles = []
4574 def start(self, tag, attrib):
4575 if tag in (_x('ttml:br'), 'br'):
4578 unclosed_elements = []
4580 element_style_id = attrib.get('style')
4582 style.update(default_style)
4583 if element_style_id:
4584 style.update(styles.get(element_style_id, {}))
4585 for prop in SUPPORTED_STYLING:
4586 prop_val = attrib.get(_x('tts:' + prop))
4588 style[prop] = prop_val
4591 for k, v in sorted(style.items()):
4592 if self._applied_styles and self._applied_styles[-1].get(k) == v:
4595 font += ' color="%s"' % v
4596 elif k == 'fontSize':
4597 font += ' size="%s"' % v
4598 elif k == 'fontFamily':
4599 font += ' face="%s"' % v
4600 elif k == 'fontWeight' and v == 'bold':
4602 unclosed_elements.append('b')
4603 elif k == 'fontStyle' and v == 'italic':
4605 unclosed_elements.append('i')
4606 elif k == 'textDecoration' and v == 'underline':
4608 unclosed_elements.append('u')
4610 self._out += '<font' + font + '>'
4611 unclosed_elements.append('font')
4613 if self._applied_styles:
4614 applied_style.update(self._applied_styles[-1])
4615 applied_style.update(style)
4616 self._applied_styles.append(applied_style)
4617 self._unclosed_elements.append(unclosed_elements)
4620 if tag not in (_x('ttml:br'), 'br'):
4621 unclosed_elements = self._unclosed_elements.pop()
4622 for element in reversed(unclosed_elements):
4623 self._out += '</%s>' % element
4624 if unclosed_elements and self._applied_styles:
4625 self._applied_styles.pop()
4627 def data(self, data):
4631 return self._out.strip()
4633 def parse_node(node):
4634 target = TTMLPElementParser()
4635 parser = xml.etree.ElementTree.XMLParser(target=target)
4636 parser.feed(xml.etree.ElementTree.tostring(node))
4637 return parser.close()
4639 for k, v in LEGACY_NAMESPACES:
4641 dfxp_data = dfxp_data.replace(ns, k)
4643 dfxp = compat_etree_fromstring(dfxp_data)
4645 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
4648 raise ValueError('Invalid dfxp/TTML subtitle')
4652 for style in dfxp.findall(_x('.//ttml:style')):
4653 style_id = style.get('id') or style.get(_x('xml:id'))
4656 parent_style_id = style.get('style')
4658 if parent_style_id not in styles:
4661 styles[style_id] = styles[parent_style_id].copy()
4662 for prop in SUPPORTED_STYLING:
4663 prop_val = style.get(_x('tts:' + prop))
4665 styles.setdefault(style_id, {})[prop] = prop_val
4671 for p in ('body', 'div'):
4672 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4675 style = styles.get(ele.get('style'))
4678 default_style.update(style)
4680 for para, index in zip(paras, itertools.count(1)):
4681 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
4682 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
4683 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4684 if begin_time is None:
4689 end_time = begin_time + dur
4690 out.append('%d\n%s --> %s\n%s\n\n' % (
4692 srt_subtitles_timecode(begin_time),
4693 srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set (truthy
    values coerced to text); [] when it is None."""
    value = params.get(param)
    if value:
        value = compat_str(value)
    return [command_option, value] if value is not None else []
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Map a boolean param to CLI args; with `separator`, emit a single
    'option<separator>value' token instead of two list items."""
    param = params.get(param)
    if param is None:
        return []
    assert isinstance(param, bool)
    token = true_value if param else false_value
    if separator:
        return [command_option + separator + token]
    return [command_option, token]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] iff params[param] equals expected_value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Pick external-program args from `argdict` for the first of `keys`
    with a non-empty entry; legacy list/tuple input is returned as-is
    when use_compat is set."""
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)
    assert isinstance(keys, (list, tuple))

    for key_list in keys:
        if isinstance(key_list, compat_str):
            key_list = (key_list,)
        matches = [argdict.get(key.lower()) for key in key_list]
        matches = [m for m in matches if m is not None]
        if matches:
            # Flatten: every match is itself a list of arguments.
            return [arg for args in matches for arg in args]
    return default
4743 class ISO639Utils(object):
4744 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4803 'iw': 'heb', # Replaced by he in 1989 revision
4813 'in': 'ind', # Replaced by id in 1989 revision
4928 'ji': 'yid', # Replaced by yi in 1989 revision
4936 def short2long(cls, code):
4937 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4938 return cls._lang_map.get(code[:2])
4941 def long2short(cls, code):
4942 """Convert language code from ISO 639-2/T to ISO 639-1"""
4943 for short_name, long_name in cls._lang_map.items():
4944 if long_name == code:
4948 class ISO3166Utils(object):
4949 # From http://data.okfn.org/data/core/country-list
4951 'AF': 'Afghanistan',
4952 'AX': 'Åland Islands',
4955 'AS': 'American Samoa',
4960 'AG': 'Antigua and Barbuda',
4977 'BO': 'Bolivia, Plurinational State of',
4978 'BQ': 'Bonaire, Sint Eustatius and Saba',
4979 'BA': 'Bosnia and Herzegovina',
4981 'BV': 'Bouvet Island',
4983 'IO': 'British Indian Ocean Territory',
4984 'BN': 'Brunei Darussalam',
4986 'BF': 'Burkina Faso',
4992 'KY': 'Cayman Islands',
4993 'CF': 'Central African Republic',
4997 'CX': 'Christmas Island',
4998 'CC': 'Cocos (Keeling) Islands',
5002 'CD': 'Congo, the Democratic Republic of the',
5003 'CK': 'Cook Islands',
5005 'CI': 'Côte d\'Ivoire',
5010 'CZ': 'Czech Republic',
5014 'DO': 'Dominican Republic',
5017 'SV': 'El Salvador',
5018 'GQ': 'Equatorial Guinea',
5022 'FK': 'Falkland Islands (Malvinas)',
5023 'FO': 'Faroe Islands',
5027 'GF': 'French Guiana',
5028 'PF': 'French Polynesia',
5029 'TF': 'French Southern Territories',
5044 'GW': 'Guinea-Bissau',
5047 'HM': 'Heard Island and McDonald Islands',
5048 'VA': 'Holy See (Vatican City State)',
5055 'IR': 'Iran, Islamic Republic of',
5058 'IM': 'Isle of Man',
5068 'KP': 'Korea, Democratic People\'s Republic of',
5069 'KR': 'Korea, Republic of',
5072 'LA': 'Lao People\'s Democratic Republic',
5078 'LI': 'Liechtenstein',
5082 'MK': 'Macedonia, the Former Yugoslav Republic of',
5089 'MH': 'Marshall Islands',
5095 'FM': 'Micronesia, Federated States of',
5096 'MD': 'Moldova, Republic of',
5107 'NL': 'Netherlands',
5108 'NC': 'New Caledonia',
5109 'NZ': 'New Zealand',
5114 'NF': 'Norfolk Island',
5115 'MP': 'Northern Mariana Islands',
5120 'PS': 'Palestine, State of',
5122 'PG': 'Papua New Guinea',
5125 'PH': 'Philippines',
5129 'PR': 'Puerto Rico',
5133 'RU': 'Russian Federation',
5135 'BL': 'Saint Barthélemy',
5136 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
5137 'KN': 'Saint Kitts and Nevis',
5138 'LC': 'Saint Lucia',
5139 'MF': 'Saint Martin (French part)',
5140 'PM': 'Saint Pierre and Miquelon',
5141 'VC': 'Saint Vincent and the Grenadines',
5144 'ST': 'Sao Tome and Principe',
5145 'SA': 'Saudi Arabia',
5149 'SL': 'Sierra Leone',
5151 'SX': 'Sint Maarten (Dutch part)',
5154 'SB': 'Solomon Islands',
5156 'ZA': 'South Africa',
5157 'GS': 'South Georgia and the South Sandwich Islands',
5158 'SS': 'South Sudan',
5163 'SJ': 'Svalbard and Jan Mayen',
5166 'CH': 'Switzerland',
5167 'SY': 'Syrian Arab Republic',
5168 'TW': 'Taiwan, Province of China',
5170 'TZ': 'Tanzania, United Republic of',
5172 'TL': 'Timor-Leste',
5176 'TT': 'Trinidad and Tobago',
5179 'TM': 'Turkmenistan',
5180 'TC': 'Turks and Caicos Islands',
5184 'AE': 'United Arab Emirates',
5185 'GB': 'United Kingdom',
5186 'US': 'United States',
5187 'UM': 'United States Minor Outlying Islands',
5191 'VE': 'Venezuela, Bolivarian Republic of',
5193 'VG': 'Virgin Islands, British',
5194 'VI': 'Virgin Islands, U.S.',
5195 'WF': 'Wallis and Futuna',
5196 'EH': 'Western Sahara',
5203 def short2full(cls, code):
5204 """Convert an ISO 3166-2 country code to the corresponding full name"""
5205 return cls._country_map.get(code.upper())
5208 class GeoUtils(object):
5209 # Major IPv4 address blocks per country
5211 'AD': '46.172.224.0/19',
5212 'AE': '94.200.0.0/13',
5213 'AF': '149.54.0.0/17',
5214 'AG': '209.59.64.0/18',
5215 'AI': '204.14.248.0/21',
5216 'AL': '46.99.0.0/16',
5217 'AM': '46.70.0.0/15',
5218 'AO': '105.168.0.0/13',
5219 'AP': '182.50.184.0/21',
5220 'AQ': '23.154.160.0/24',
5221 'AR': '181.0.0.0/12',
5222 'AS': '202.70.112.0/20',
5223 'AT': '77.116.0.0/14',
5224 'AU': '1.128.0.0/11',
5225 'AW': '181.41.0.0/18',
5226 'AX': '185.217.4.0/22',
5227 'AZ': '5.197.0.0/16',
5228 'BA': '31.176.128.0/17',
5229 'BB': '65.48.128.0/17',
5230 'BD': '114.130.0.0/16',
5232 'BF': '102.178.0.0/15',
5233 'BG': '95.42.0.0/15',
5234 'BH': '37.131.0.0/17',
5235 'BI': '154.117.192.0/18',
5236 'BJ': '137.255.0.0/16',
5237 'BL': '185.212.72.0/23',
5238 'BM': '196.12.64.0/18',
5239 'BN': '156.31.0.0/16',
5240 'BO': '161.56.0.0/16',
5241 'BQ': '161.0.80.0/20',
5242 'BR': '191.128.0.0/12',
5243 'BS': '24.51.64.0/18',
5244 'BT': '119.2.96.0/19',
5245 'BW': '168.167.0.0/16',
5246 'BY': '178.120.0.0/13',
5247 'BZ': '179.42.192.0/18',
5248 'CA': '99.224.0.0/11',
5249 'CD': '41.243.0.0/16',
5250 'CF': '197.242.176.0/21',
5251 'CG': '160.113.0.0/16',
5252 'CH': '85.0.0.0/13',
5253 'CI': '102.136.0.0/14',
5254 'CK': '202.65.32.0/19',
5255 'CL': '152.172.0.0/14',
5256 'CM': '102.244.0.0/14',
5257 'CN': '36.128.0.0/10',
5258 'CO': '181.240.0.0/12',
5259 'CR': '201.192.0.0/12',
5260 'CU': '152.206.0.0/15',
5261 'CV': '165.90.96.0/19',
5262 'CW': '190.88.128.0/17',
5263 'CY': '31.153.0.0/16',
5264 'CZ': '88.100.0.0/14',
5266 'DJ': '197.241.0.0/17',
5267 'DK': '87.48.0.0/12',
5268 'DM': '192.243.48.0/20',
5269 'DO': '152.166.0.0/15',
5270 'DZ': '41.96.0.0/12',
5271 'EC': '186.68.0.0/15',
5272 'EE': '90.190.0.0/15',
5273 'EG': '156.160.0.0/11',
5274 'ER': '196.200.96.0/20',
5275 'ES': '88.0.0.0/11',
5276 'ET': '196.188.0.0/14',
5277 'EU': '2.16.0.0/13',
5278 'FI': '91.152.0.0/13',
5279 'FJ': '144.120.0.0/16',
5280 'FK': '80.73.208.0/21',
5281 'FM': '119.252.112.0/20',
5282 'FO': '88.85.32.0/19',
5284 'GA': '41.158.0.0/15',
5286 'GD': '74.122.88.0/21',
5287 'GE': '31.146.0.0/16',
5288 'GF': '161.22.64.0/18',
5289 'GG': '62.68.160.0/19',
5290 'GH': '154.160.0.0/12',
5291 'GI': '95.164.0.0/16',
5292 'GL': '88.83.0.0/19',
5293 'GM': '160.182.0.0/15',
5294 'GN': '197.149.192.0/18',
5295 'GP': '104.250.0.0/19',
5296 'GQ': '105.235.224.0/20',
5297 'GR': '94.64.0.0/13',
5298 'GT': '168.234.0.0/16',
5299 'GU': '168.123.0.0/16',
5300 'GW': '197.214.80.0/20',
5301 'GY': '181.41.64.0/18',
5302 'HK': '113.252.0.0/14',
5303 'HN': '181.210.0.0/16',
5304 'HR': '93.136.0.0/13',
5305 'HT': '148.102.128.0/17',
5306 'HU': '84.0.0.0/14',
5307 'ID': '39.192.0.0/10',
5308 'IE': '87.32.0.0/12',
5309 'IL': '79.176.0.0/13',
5310 'IM': '5.62.80.0/20',
5311 'IN': '117.192.0.0/10',
5312 'IO': '203.83.48.0/21',
5313 'IQ': '37.236.0.0/14',
5314 'IR': '2.176.0.0/12',
5315 'IS': '82.221.0.0/16',
5316 'IT': '79.0.0.0/10',
5317 'JE': '87.244.64.0/18',
5318 'JM': '72.27.0.0/17',
5319 'JO': '176.29.0.0/16',
5320 'JP': '133.0.0.0/8',
5321 'KE': '105.48.0.0/12',
5322 'KG': '158.181.128.0/17',
5323 'KH': '36.37.128.0/17',
5324 'KI': '103.25.140.0/22',
5325 'KM': '197.255.224.0/20',
5326 'KN': '198.167.192.0/19',
5327 'KP': '175.45.176.0/22',
5328 'KR': '175.192.0.0/10',
5329 'KW': '37.36.0.0/14',
5330 'KY': '64.96.0.0/15',
5331 'KZ': '2.72.0.0/13',
5332 'LA': '115.84.64.0/18',
5333 'LB': '178.135.0.0/16',
5334 'LC': '24.92.144.0/20',
5335 'LI': '82.117.0.0/19',
5336 'LK': '112.134.0.0/15',
5337 'LR': '102.183.0.0/16',
5338 'LS': '129.232.0.0/17',
5339 'LT': '78.56.0.0/13',
5340 'LU': '188.42.0.0/16',
5341 'LV': '46.109.0.0/16',
5342 'LY': '41.252.0.0/14',
5343 'MA': '105.128.0.0/11',
5344 'MC': '88.209.64.0/18',
5345 'MD': '37.246.0.0/16',
5346 'ME': '178.175.0.0/17',
5347 'MF': '74.112.232.0/21',
5348 'MG': '154.126.0.0/17',
5349 'MH': '117.103.88.0/21',
5350 'MK': '77.28.0.0/15',
5351 'ML': '154.118.128.0/18',
5352 'MM': '37.111.0.0/17',
5353 'MN': '49.0.128.0/17',
5354 'MO': '60.246.0.0/16',
5355 'MP': '202.88.64.0/20',
5356 'MQ': '109.203.224.0/19',
5357 'MR': '41.188.64.0/18',
5358 'MS': '208.90.112.0/22',
5359 'MT': '46.11.0.0/16',
5360 'MU': '105.16.0.0/12',
5361 'MV': '27.114.128.0/18',
5362 'MW': '102.70.0.0/15',
5363 'MX': '187.192.0.0/11',
5364 'MY': '175.136.0.0/13',
5365 'MZ': '197.218.0.0/15',
5366 'NA': '41.182.0.0/16',
5367 'NC': '101.101.0.0/18',
5368 'NE': '197.214.0.0/18',
5369 'NF': '203.17.240.0/22',
5370 'NG': '105.112.0.0/12',
5371 'NI': '186.76.0.0/15',
5372 'NL': '145.96.0.0/11',
5373 'NO': '84.208.0.0/13',
5374 'NP': '36.252.0.0/15',
5375 'NR': '203.98.224.0/19',
5376 'NU': '49.156.48.0/22',
5377 'NZ': '49.224.0.0/14',
5378 'OM': '5.36.0.0/15',
5379 'PA': '186.72.0.0/15',
5380 'PE': '186.160.0.0/14',
5381 'PF': '123.50.64.0/18',
5382 'PG': '124.240.192.0/19',
5383 'PH': '49.144.0.0/13',
5384 'PK': '39.32.0.0/11',
5385 'PL': '83.0.0.0/11',
5386 'PM': '70.36.0.0/20',
5387 'PR': '66.50.0.0/16',
5388 'PS': '188.161.0.0/16',
5389 'PT': '85.240.0.0/13',
5390 'PW': '202.124.224.0/20',
5391 'PY': '181.120.0.0/14',
5392 'QA': '37.210.0.0/15',
5393 'RE': '102.35.0.0/16',
5394 'RO': '79.112.0.0/13',
5395 'RS': '93.86.0.0/15',
5396 'RU': '5.136.0.0/13',
5397 'RW': '41.186.0.0/16',
5398 'SA': '188.48.0.0/13',
5399 'SB': '202.1.160.0/19',
5400 'SC': '154.192.0.0/11',
5401 'SD': '102.120.0.0/13',
5402 'SE': '78.64.0.0/12',
5403 'SG': '8.128.0.0/10',
5404 'SI': '188.196.0.0/14',
5405 'SK': '78.98.0.0/15',
5406 'SL': '102.143.0.0/17',
5407 'SM': '89.186.32.0/19',
5408 'SN': '41.82.0.0/15',
5409 'SO': '154.115.192.0/18',
5410 'SR': '186.179.128.0/17',
5411 'SS': '105.235.208.0/21',
5412 'ST': '197.159.160.0/19',
5413 'SV': '168.243.0.0/16',
5414 'SX': '190.102.0.0/20',
5416 'SZ': '41.84.224.0/19',
5417 'TC': '65.255.48.0/20',
5418 'TD': '154.68.128.0/19',
5419 'TG': '196.168.0.0/14',
5420 'TH': '171.96.0.0/13',
5421 'TJ': '85.9.128.0/18',
5422 'TK': '27.96.24.0/21',
5423 'TL': '180.189.160.0/20',
5424 'TM': '95.85.96.0/19',
5425 'TN': '197.0.0.0/11',
5426 'TO': '175.176.144.0/21',
5427 'TR': '78.160.0.0/11',
5428 'TT': '186.44.0.0/15',
5429 'TV': '202.2.96.0/19',
5430 'TW': '120.96.0.0/11',
5431 'TZ': '156.156.0.0/14',
5432 'UA': '37.52.0.0/14',
5433 'UG': '102.80.0.0/13',
5435 'UY': '167.56.0.0/13',
5436 'UZ': '84.54.64.0/18',
5437 'VA': '212.77.0.0/19',
5438 'VC': '207.191.240.0/21',
5439 'VE': '186.88.0.0/13',
5440 'VG': '66.81.192.0/20',
5441 'VI': '146.226.0.0/16',
5442 'VN': '14.160.0.0/11',
5443 'VU': '202.80.32.0/20',
5444 'WF': '117.20.32.0/21',
5445 'WS': '202.4.32.0/19',
5446 'YE': '134.35.0.0/16',
5447 'YT': '41.242.116.0/22',
5448 'ZA': '41.0.0.0/11',
5449 'ZM': '102.144.0.0/13',
5450 'ZW': '102.177.192.0/18',
5454 def random_ipv4(cls, code_or_block):
5455 if len(code_or_block) == 2:
5456 block = cls._country_ip_map.get(code_or_block.upper())
5460 block = code_or_block
5461 addr, preflen = block.split('/')
5462 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
5463 addr_max = addr_min | (0xffffffff >> int(preflen))
5464 return compat_str(socket.inet_ntoa(
5465 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy' header.

    '__noproxy__' disables proxying entirely; socks-scheme proxies are
    forwarded to the http/https handlers via 'Ytdl-socks-proxy'.
    """

    def __init__(self, proxies=None):
        # Install http/https openers that route through proxy_open.
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5493 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
5494 # released into Public Domain
5495 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of
    the byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # Emit 32 bits at a time, most-significant chunk first.
    out = b''
    n = int(n)
    while n > 0:
        out = compat_struct_pack('>I', n & 0xffffffff) + out
        n = n >> 32
    # Strip leading zero bytes; keep a single zero byte when n was 0.
    idx = 0
    for idx in range(len(out)):
        if out[idx] != b'\000'[0]:
            break
    else:
        out = b'\000'
        idx = 0
    out = out[idx:]
    # Left-pad with zeros up to a multiple of blocksize.
    if blocksize > 0 and len(out) % blocksize:
        out = (blocksize - len(out) % blocksize) * b'\000' + out
    return out
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    acc = 0
    length = len(s)
    if length % 4:
        # Left-pad with zero bytes so the length is a multiple of 4.
        extra = 4 - length % 4
        s = b'\000' * extra + s
        length += extra
    for offset in range(0, length, 4):
        acc = (acc << 32) + compat_struct_unpack('>I', s[offset:offset + 4])[0]
    return acc
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # OHDave treats the data as a little-endian integer, hence the reversal.
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
def pkcs1pad(data, length):
    """
    Pad input data to `length` with the PKCS#1 v1.5 type-2 scheme
    (RFC 8017, EME-PKCS1-v1_5): 0x00 0x02 <nonzero padding> 0x00 <data>.

    @param {int[]} data        input data
    @param {int} length        target length
    @returns {int[]}           padded data

    Raises ValueError when `data` leaves no room for the minimum 11 bytes
    of padding.
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # RFC 8017 requires the padding string PS to consist of NONZERO octets:
    # a zero byte would be taken as the padding terminator by the decrypter
    # (the previous randint(0, 254) could emit zeros).
    pseudo_random = [random.randint(1, 254) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
def encode_base_n(num, n, table=None):
    """Encode non-negative integer `num` in base `n`, using `table` as the
    digit alphabet (defaults to 0-9a-zA-Z truncated to `n` symbols)."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Collect least-significant digits first, then reverse.
    digits = []
    while num:
        digits.append(table[num % n])
        num = num // n
    return ''.join(reversed(digits))
def decode_packed_codes(code):
    """Unpack JavaScript obfuscated with the common P.A.C.K.E.R. encoder."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    # Map each base-n token back to its replacement symbol (a token whose
    # slot in the symbol list is empty maps to itself).
    for index in range(count - 1, -1, -1):
        token = encode_base_n(index, base)
        symbol_table[token] = symbols[index] or token

    return re.sub(
        r'\b(\w+)\b', lambda match: symbol_table[match.group(0)],
        obfuscated_code)
def caesar(s, alphabet, shift):
    """Shift every character of `s` found in `alphabet` by `shift` positions
    (wrapping around); characters outside `alphabet` pass through."""
    if shift == 0:
        return s
    size = len(alphabet)
    shifted = []
    for ch in s:
        if ch in alphabet:
            shifted.append(alphabet[(alphabet.index(ch) + shift) % size])
        else:
            shifted.append(ch)
    return ''.join(shifted)
def rot47(s):
    """Apply the ROT47 cipher: a Caesar shift of 47 over the 94 printable
    ASCII characters ('!' through '~')."""
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list into a dict, stripping the surrounding
    double quotes from quoted values."""
    parsed = {}
    pairs = re.findall(
        r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    for key, raw in pairs:
        parsed[key] = raw[1:-1] if raw.startswith('"') else raw
    return parsed
def urshift(val, n):
    """Unsigned 32-bit right shift (JavaScript's `>>>`): negative values are
    first mapped to their unsigned 32-bit representation."""
    if val >= 0:
        return val >> n
    return (val + 0x100000000) >> n
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG image bytes into a (width, height, pixels) tuple.

    pixels is a list of scanlines, each a flat list of unfiltered byte
    values. The code uses a stride of width * 3, i.e. it assumes 3 bytes
    per pixel (8-bit RGB truecolour, non-interlaced) -- other colour types
    are not handled.
    Reference: https://www.w3.org/TR/PNG/

    Raises IOError when the signature/IHDR is wrong or no IDAT data exists.
    """
    header = png_data[8:]  # everything after the 8-byte PNG signature

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Chunk integers are big-endian; pick the struct format by byte length.
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Walk the chunk stream: 4-byte length, 4-byte type, data, 4-byte CRC.
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # IHDR is always the first chunk; width/height are its first two fields.
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Concatenate all IDAT chunks: together they form a single zlib stream.
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3  # bytes per scanline (3 channel bytes per pixel)
    pixels = []

    def _get_pixel(idx):
        # Look up an already-reconstructed byte by absolute byte index.
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Each scanline is prefixed by one filter-type byte, hence 1 + stride.
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Neighbours used by the reconstruction filters; 0 at borders.
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Paeth predictor: pick the neighbour closest to p.
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    """Set extended attribute `key` to the bytes `value` on file `path`.

    Tries, in order: the pyxattr/xattr Python modules, NTFS Alternate Data
    Streams on Windows, then the setfattr/xattr command-line tools.
    Raises XAttrMetadataError when a write attempt fails, and
    XAttrUnavailableError when no usable implementation exists.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'yt-dlp requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:
                # The CLI tools take the value as a text argument.
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)]
                       + [encodeArgument(o) for o in opts]
                       + [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = process_communicate_or_kill(p)
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to the year/month/day of a
    random date between 1950-01-01 and 1995-12-31, as decimal strings."""
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    chosen = earliest + datetime.timedelta(random.randint(0, span_days))
    return {
        year_field: str(chosen.year),
        month_field: str(chosen.month),
        day_field: str(chosen.day),
    }
5841 # Templates for internet shortcut files, which are plain text files.
5842 DOT_URL_LINK_TEMPLATE
= '''
5847 DOT_WEBLOC_LINK_TEMPLATE
= '''
5848 <?xml version="1.0" encoding="UTF-8"?>
5849 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5850 <plist version="1.0">
5853 \t<string>%(url)s</string>
5858 DOT_DESKTOP_LINK_TEMPLATE
= '''
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """
    iri_parts = compat_urllib_parse_urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` values below list the characters that must NOT be
    # percent-encoded; everything else except letters, digits and '_.-'
    # is percent-encoded with an underlying UTF-8 encoding, while spans
    # that are already percent-encoded are left untouched.
    # Source: https://url.spec.whatwg.org/#percent-encoded-bytes

    netloc = ''
    if iri_parts.username:
        netloc += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            netloc += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        netloc += '@'

    # Punycode for Unicode hostnames; the 'idna' encoding produces ASCII.
    netloc += iri_parts.hostname.encode('idna').decode('utf-8')
    if iri_parts.port is not None and iri_parts.port != 80:
        netloc += ':' + str(iri_parts.port)

    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
            netloc,
            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
            # Legacy way of handling parameters; `safe` mirrors the path's.
            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
            # The spec does not explicitly cover the query component, so the
            # `safe` set here is a best effort.
            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
def to_high_limit_path(path):
    """On Windows/Cygwin, prefix the absolute form of `path` with `\\\\?\\` to
    work around the MAX_PATH limitation (individual path segments may still
    be length-limited); elsewhere, return `path` unchanged."""
    if sys.platform not in ['win32', 'cygwin']:
        return path
    return r'\\?\ '.rstrip() + os.path.abspath(path)
def format_field(obj, field, template='%s', ignore=(None, ''), default='', func=None):
    """Fetch obj[field] and render it through `template`; values listed in
    `ignore` yield `default` instead. `func`, when given, transforms the
    value before formatting (skipped for ignored values)."""
    val = obj.get(field, default)
    if func and val not in ignore:
        val = func(val)
    if val in ignore:
        return default
    return template % val
def clean_podcast_url(url):
    """Strip known podcast tracking/measurement redirect prefixes from `url`."""
    return re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/''', '', url)
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random RFC 4122 version-4 UUID string.

    Each 'x' placeholder becomes a random hex digit, the version nibble is
    a literal '4', and the variant nibble ('y') is drawn from 8/9/a/b as
    RFC 4122 requires. (The previous implementation filled 'y' with any
    hex digit, producing an invalid variant field about half the time.)
    """
    return re.sub(
        r'[xy]',
        lambda m: random.choice('89ab') if m.group(0) == 'y' else _HEX_TABLE[random.randint(0, 15)],
        'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
def make_dir(path, to_screen=None):
    """Ensure the parent directory of `path` exists.

    Returns True on success (or when there is nothing to create), False on
    failure. On failure the error is reported through `to_screen` when a
    callable is supplied.
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except (OSError, IOError) as err:
        # Bug fix: the original guard was `callable(to_screen) is not None`,
        # which is always True (callable() returns a bool), so a None
        # to_screen was invoked and raised TypeError instead of returning
        # False.
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
def get_executable_path():
    """Return the absolute directory the program runs from, covering
    PyInstaller bundles, zipped installs and plain source checkouts."""
    from zipimport import zipimporter
    if hasattr(sys, 'frozen'):
        # Running from PyInstaller: sys.executable is the bundled binary.
        base = os.path.dirname(sys.executable)
    elif isinstance(globals().get('__loader__'), zipimporter):
        # Running from ZIP: this file sits two directory levels inside it.
        base = os.path.join(os.path.dirname(__file__), '../..')
    else:
        base = os.path.join(os.path.dirname(__file__), '..')
    return os.path.abspath(base)
def load_plugins(name, type, namespace):
    """Load classes whose names end with `type` from the ytdlp_plugins
    module `name`, registering each into `namespace`; returns the list of
    loaded classes (empty when the plugin module does not exist)."""
    plugin_info = [None]
    classes = []
    try:
        plugin_info = imp.find_module(
            name, [os.path.join(get_executable_path(), 'ytdlp_plugins')])
        plugins = imp.load_module(name, *plugin_info)
        # NOTE: `name` is deliberately rebound here; namespace[name] below
        # uses the attribute name, not the module name.
        for name in dir(plugins):
            if name.endswith(type):
                klass = getattr(plugins, name)
                classes.append(klass)
                namespace[name] = klass
    except ImportError:
        pass
    finally:
        # imp.find_module returns an open file handle as its first element;
        # close it whether or not loading succeeded.
        if plugin_info[0] is not None:
            plugin_info[0].close()
    return classes
def traverse_dict(dictn, keys, casesense=True):
    """Walk nested dicts following `keys`, returning None as soon as a
    lookup fails; with casesense=False, keys compare case-insensitively."""
    if not isinstance(dictn, dict):
        return None
    first_key = keys[0]
    if not casesense:
        dictn = {key.lower(): val for key, val in dictn.items()}
        first_key = first_key.lower()
    value = dictn.get(first_key, None)
    if len(keys) < 2:
        return value
    return traverse_dict(value, keys[1:], casesense)