4 from __future__
import unicode_literals
36 import xml
.etree
.ElementTree
40 compat_HTMLParseError
,
45 compat_ctypes_WINFUNCTYPE
,
46 compat_etree_fromstring
,
49 compat_html_entities_html5
,
61 compat_urllib_parse_urlencode
,
62 compat_urllib_parse_urlparse
,
63 compat_urllib_parse_unquote_plus
,
64 compat_urllib_request
,
def register_socks_protocols():
    """Teach urlparse that socks* URL schemes carry a network location.

    "Register" SOCKS protocols: in Python < 2.6.5, urlsplit() suffers from
    bug https://bugs.python.org/issue7904 — URLs with protocols not listed
    in urlparse.uses_netloc are not handled correctly, so proxy URLs such
    as socks5://host:port would lose their host/port components.
    """
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)
# This is not clearly defined otherwise: the concrete type of a compiled
# regex object differs between Python versions, so capture it once here
# for later isinstance() checks.
compiled_regex_type = type(re.compile(''))
88 def random_user_agent():
89 _USER_AGENT_TPL
= 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
1668 return _USER_AGENT_TPL
% random
.choice(_CHROME_VERSIONS
)
1672 'User-Agent': random_user_agent(),
1673 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
1674 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
1675 'Accept-Encoding': 'gzip, deflate',
1676 'Accept-Language': 'en-us,en;q=0.5',
1681 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# Unique sentinel: lets callers distinguish "no default supplied" from a
# legitimate default of None.
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
1692 'en': ENGLISH_MONTH_NAMES
,
1694 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
1695 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
1698 KNOWN_EXTENSIONS
= (
1699 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
1700 'flv', 'f4v', 'f4a', 'f4b',
1701 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
1702 'mkv', 'mka', 'mk3d',
1705 'asf', 'wmv', 'wma',
1711 'f4f', 'f4m', 'm3u8', 'smil')
# needed for sanitizing filenames in restricted mode
# Maps each accented character to an ASCII replacement; single-character
# replacements are given as plain strings (iterated char by char by
# itertools.chain) while multi-character replacements ('AE', 'TH', 'ss',
# ...) must be wrapped in lists so they stay intact.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
1739 '%Y/%m/%d %H:%M:%S',
1741 '%Y-%m-%d %H:%M:%S',
1742 '%Y-%m-%d %H:%M:%S.%f',
1745 '%Y-%m-%dT%H:%M:%SZ',
1746 '%Y-%m-%dT%H:%M:%S.%fZ',
1747 '%Y-%m-%dT%H:%M:%S.%f0Z',
1748 '%Y-%m-%dT%H:%M:%S',
1749 '%Y-%m-%dT%H:%M:%S.%f',
1751 '%b %d %Y at %H:%M',
1752 '%b %d %Y at %H:%M:%S',
1753 '%B %d %Y at %H:%M',
1754 '%B %d %Y at %H:%M:%S',
1757 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
1758 DATE_FORMATS_DAY_FIRST
.extend([
1764 '%d/%m/%Y %H:%M:%S',
1767 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
1768 DATE_FORMATS_MONTH_FIRST
.extend([
1773 '%m/%d/%Y %H:%M:%S',
# Matches the argument list of Dean Edwards' p.a.c.k.e.r. eval-packed
# JavaScript: payload, radix, count, and the '|'-separated symbol table.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Extracts the body of a <script type="application/ld+json"> block into the
# named group "json_ld"; group 1 back-references the (optional) quote style
# around the type attribute value.
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
1780 def preferredencoding():
1781 """Get preferred encoding.
1783 Returns the best encoding scheme for the system, based on
1784 locale.getpreferredencoding() and some further tweaks.
1787 pref = locale.getpreferredencoding()
1795 def write_json_file(obj, fn):
1796 """ Encode obj as JSON and write it to fn, atomically if possible """
1798 fn = encodeFilename(fn)
1799 if sys.version_info < (3, 0) and sys.platform != 'win32
':
1800 encoding = get_filesystem_encoding()
1801 # os.path.basename returns a bytes object, but NamedTemporaryFile
1802 # will fail if the filename contains non ascii characters unless we
1803 # use a unicode object
1804 path_basename = lambda f: os.path.basename(fn).decode(encoding)
1805 # the same for os.path.dirname
1806 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
1808 path_basename = os.path.basename
1809 path_dirname = os.path.dirname
1813 'prefix
': path_basename(fn) + '.',
1814 'dir': path_dirname(fn),
1818 # In Python 2.x, json.dump expects a bytestream.
1819 # In Python 3.x, it writes to a character stream
1820 if sys.version_info < (3, 0):
1825 'encoding
': 'utf
-8',
1828 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
1833 if sys.platform == 'win32
':
1834 # Need to remove existing file on Windows, else os.rename raises
1835 # WindowsError or FileExistsError.
1843 os.chmod(tf.name, 0o666 & ~mask)
1846 os.rename(tf.name, fn)
1855 if sys.version_info >= (2, 7):
1856 def find_xpath_attr(node, xpath, key, val=None):
1857 """ Find the xpath xpath[@key=val] """
1858 assert re.match(r'^
[a
-zA
-Z_
-]+$
', key)
1859 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
1860 return node.find(expr)
1862 def find_xpath_attr(node, xpath, key, val=None):
1863 for f in node.findall(compat_xpath(xpath)):
1864 if key not in f.attrib:
1866 if val is None or f.attrib.get(key) == val:
1870 # On python2.6 the xml.etree.ElementTree.Element methods don't support
1871 # the namespace parameter
1874 def xpath_with_ns(path
, ns_map
):
1875 components
= [c
.split(':') for c
in path
.split('/')]
1877 for c
in components
:
1879 replaced
.append(c
[0])
1882 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
1883 return '/'.join(replaced
)
1886 def xpath_element(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1887 def _find_xpath(xpath
):
1888 return node
.find(compat_xpath(xpath
))
1890 if isinstance(xpath
, (str, compat_str
)):
1891 n
= _find_xpath(xpath
)
1899 if default
is not NO_DEFAULT
:
1902 name
= xpath
if name
is None else name
1903 raise ExtractorError('Could not find XML element %s' % name
)
1909 def xpath_text(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1910 n
= xpath_element(node
, xpath
, name
, fatal
=fatal
, default
=default
)
1911 if n
is None or n
== default
:
1914 if default
is not NO_DEFAULT
:
1917 name
= xpath
if name
is None else name
1918 raise ExtractorError('Could not find XML element\'s text %s' % name
)
1924 def xpath_attr(node
, xpath
, key
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1925 n
= find_xpath_attr(node
, xpath
, key
)
1927 if default
is not NO_DEFAULT
:
1930 name
= '%s[@%s]' % (xpath
, key
) if name
is None else name
1931 raise ExtractorError('Could not find XML attribute %s' % name
)
1934 return n
.attrib
[key
]
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # NOTE: `id` shadows the builtin of the same name, but is kept as-is for
    # interface compatibility with existing callers.
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag carrying the given attribute/value
    pair in the passed HTML document, or None when there is no match."""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if not matches:
        return None
    return matches[0]
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # A class attribute may hold several whitespace-separated names, so match
    # the wanted name as a whole word anywhere inside the attribute value.
    class_value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
1960 def get_elements_by_attribute(attribute, value, html, escape_value=True):
1961 """Return the content of the tag with the specified attribute in the passed HTML document"""
1963 value = re.escape(value) if escape_value else value
1966 for m in re.finditer(r'''(?xs)
1968 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^
']*'|
))*?
1970 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^
"]*"|
='[^']*'|))*?
1974 ''' % (re.escape(attribute), value), html):
1975 res = m.group('content
')
1977 if res.startswith('"') or res.startswith("'"):
1980 retlist.append(unescapeHTML(res))
1985 class HTMLAttributeParser(compat_HTMLParser):
1986 """Trivial HTML parser to gather the attributes for a single element"""
1990 compat_HTMLParser.__init__(self)
1992 def handle_starttag(self, tag, attrs):
1993 self.attrs = dict(attrs)
1996 def extract_attributes(html_element):
1997 """Given a string for an HTML element such as
1999 a="foo" B="bar" c="&98;az" d=boz
2000 empty= noval entity="&"
2003 Decode and return a dictionary of attributes.
2005 'a
': 'foo
', 'b
': 'bar
', c: 'baz
', d: 'boz
',
2006 'empty
': '', 'noval
': None, 'entity
': '&',
2007 'sq
': '"', 'dq': '\''
2009 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
2010 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
2012 parser = HTMLAttributeParser()
2014 parser.feed(html_element)
2016 # Older Python may throw HTMLParseError in case of malformed HTML
2017 except compat_HTMLParseError:
2022 def clean_html(html):
2023 """Clean an HTML snippet into a readable string"""
2025 if html is None: # Convenience for sanitizing descriptions etc.
2029 html = html.replace('\n', ' ')
2030 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
2031 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
2033 html = re.sub('<.*?>', '', html)
2034 # Replace html entities
2035 html = unescapeHTML(html)
2039 def sanitize_open(filename, open_mode):
2040 """Try to open the given filename, and slightly tweak it if this fails.
2042 Attempts to open the given filename. If this fails, it tries to change
2043 the filename slightly, step by step, until it's either able to open it
2044 or it fails and raises a final exception, like the standard open()
2047 It returns the tuple (stream, definitive_file_name).
2051 if sys.platform == 'win32':
2053 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
2054 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
2055 stream = open(encodeFilename(filename), open_mode)
2056 return (stream, filename)
2057 except (IOError, OSError) as err:
2058 if err.errno in (errno.EACCES,):
2061 # In case of error, try to remove win32 forbidden chars
2062 alt_filename = sanitize_path(filename)
2063 if alt_filename == filename:
2066 # An exception here should be caught in the caller
2067 stream = open(encodeFilename(alt_filename), open_mode)
2068 return (stream, alt_filename)
2071 def timeconvert(timestr):
2072 """Convert RFC 2822 defined time string into system timestamp"""
2074 timetuple = email.utils.parsedate_tz(timestr)
2075 if timetuple is not None:
2076 timestamp = email.utils.mktime_tz(timetuple)
2080 def sanitize_filename(s, restricted=False, is_id=False):
2081 """Sanitizes a string so it could be used as part of a filename.
2082 If restricted is set, use a stricter subset of allowed characters.
2083 Set is_id if this is not an arbitrary string, but an ID that should be kept
2086 def replace_insane(char):
2087 if restricted and char in ACCENT_CHARS:
2088 return ACCENT_CHARS[char]
2089 if char == '?' or ord(char) < 32 or ord(char) == 127:
2092 return '' if restricted else '\''
2094 return '_
-' if restricted else ' -'
2095 elif char in '\\/|
*<>':
2097 if restricted and (char in '!&\'()[]{}$
;`^
,#' or char.isspace()):
2099 if restricted
and ord(char
) > 127:
2104 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
2105 result
= ''.join(map(replace_insane
, s
))
2107 while '__' in result
:
2108 result
= result
.replace('__', '_')
2109 result
= result
.strip('_')
2110 # Common case of "Foreign band name - English song title"
2111 if restricted
and result
.startswith('-_'):
2113 if result
.startswith('-'):
2114 result
= '_' + result
[len('-'):]
2115 result
= result
.lstrip('.')
2121 def sanitize_path(s
):
2122 """Sanitizes and normalizes path on Windows"""
2123 if sys
.platform
!= 'win32':
2125 drive_or_unc
, _
= os
.path
.splitdrive(s
)
2126 if sys
.version_info
< (2, 7) and not drive_or_unc
:
2127 drive_or_unc
, _
= os
.path
.splitunc(s
)
2128 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
2132 path_part
if path_part
in ['.', '..'] else re
.sub(r
'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part
)
2133 for path_part
in norm_path
]
2135 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
2136 return os
.path
.join(*sanitized_path
)
2139 def sanitize_url(url
):
2140 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
2141 # the number of unwanted failures due to missing protocol
2142 if url
.startswith('//'):
2143 return 'http:%s' % url
2144 # Fix some common typos seen so far
2146 # https://github.com/ytdl-org/youtube-dl/issues/15649
2147 (r
'^httpss://', r
'https://'),
2148 # https://bx1.be/lives/direct-tv/
2149 (r
'^rmtp([es]?)://', r
'rtmp\1://'),
2151 for mistake
, fixup
in COMMON_TYPOS
:
2152 if re
.match(mistake
, url
):
2153 return re
.sub(mistake
, fixup
, url
)
def sanitized_Request(url, *args, **kwargs):
    # Build a urllib Request after normalizing the URL: sanitize_url()
    # prepends "http:" to protocol-less '//' URLs and fixes common scheme
    # typos before the request object is created.
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
2162 """Expand shell variables and ~"""
2163 return os
.path
.expandvars(compat_expanduser(s
))
2166 def orderedSet(iterable
):
2167 """ Remove all duplicates from the input iterable """
2175 def _htmlentity_transform(entity_with_semicolon
):
2176 """Transforms an HTML entity to a character."""
2177 entity
= entity_with_semicolon
[:-1]
2179 # Known non-numeric HTML entity
2180 if entity
in compat_html_entities
.name2codepoint
:
2181 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
2183 # TODO: HTML5 allows entities without a semicolon. For example,
2184 # 'Éric' should be decoded as 'Éric'.
2185 if entity_with_semicolon
in compat_html_entities_html5
:
2186 return compat_html_entities_html5
[entity_with_semicolon
]
2188 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
2189 if mobj
is not None:
2190 numstr
= mobj
.group(1)
2191 if numstr
.startswith('x'):
2193 numstr
= '0%s' % numstr
2196 # See https://github.com/ytdl-org/youtube-dl/issues/7518
2198 return compat_chr(int(numstr
, base
))
2202 # Unknown entity in name, return its literal representation
2203 return '&%s;' % entity
2206 def unescapeHTML(s
):
2209 assert type(s
) == compat_str
2212 r
'&([^&;]+;)', lambda m
: _htmlentity_transform(m
.group(1)), s
)
2215 def get_subprocess_encoding():
2216 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2217 # For subprocess calls, encode with locale encoding
2218 # Refer to http://stackoverflow.com/a/9951851/35070
2219 encoding
= preferredencoding()
2221 encoding
= sys
.getfilesystemencoding()
2222 if encoding
is None:
2227 def encodeFilename(s
, for_subprocess
=False):
2229 @param s The name of the file
2232 assert type(s
) == compat_str
2234 # Python 3 has a Unicode API
2235 if sys
.version_info
>= (3, 0):
2238 # Pass '' directly to use Unicode APIs on Windows 2000 and up
2239 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
2240 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
2241 if not for_subprocess
and sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2244 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
2245 if sys
.platform
.startswith('java'):
2248 return s
.encode(get_subprocess_encoding(), 'ignore')
2251 def decodeFilename(b
, for_subprocess
=False):
2253 if sys
.version_info
>= (3, 0):
2256 if not isinstance(b
, bytes):
2259 return b
.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument using the same rules as filenames
    (delegates to encodeFilename with for_subprocess=True)."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
def decodeArgument(b):
    # Inverse of encodeArgument: decode a subprocess argument with
    # for_subprocess=True semantics.
    return decodeFilename(b, True)
2275 def decodeOption(optval
):
2278 if isinstance(optval
, bytes):
2279 optval
= optval
.decode(preferredencoding())
2281 assert isinstance(optval
, compat_str
)
2285 def formatSeconds(secs
):
2287 return '%d:%02d:%02d' % (secs
// 3600, (secs
% 3600) // 60, secs
% 60)
2289 return '%d:%02d' % (secs
// 60, secs
% 60)
2294 def make_HTTPS_handler(params
, **kwargs
):
2295 opts_no_check_certificate
= params
.get('nocheckcertificate', False)
2296 if hasattr(ssl
, 'create_default_context'): # Python >= 3.4 or 2.7.9
2297 context
= ssl
.create_default_context(ssl
.Purpose
.SERVER_AUTH
)
2298 if opts_no_check_certificate
:
2299 context
.check_hostname
= False
2300 context
.verify_mode
= ssl
.CERT_NONE
2302 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2305 # (create_default_context present but HTTPSHandler has no context=)
2308 if sys
.version_info
< (3, 2):
2309 return YoutubeDLHTTPSHandler(params
, **kwargs
)
2310 else: # Python < 3.4
2311 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLSv1
)
2312 context
.verify_mode
= (ssl
.CERT_NONE
2313 if opts_no_check_certificate
2314 else ssl
.CERT_REQUIRED
)
2315 context
.set_default_verify_paths()
2316 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2319 def bug_reports_message():
2320 if ytdl_is_updateable():
2321 update_cmd
= 'type youtube-dlc -U to update'
2323 update_cmd
= 'see https://github.com/blackjack4494/yt-dlc on how to update'
2324 msg
= '; please report this issue on https://github.com/blackjack4494/yt-dlc .'
2325 msg
+= ' Make sure you are using the latest version; %s.' % update_cmd
2326 msg
+= ' Be sure to call youtube-dlc with the --verbose flag and include its complete output.'
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors.

    All exceptions raised by this module should derive from this class so
    callers can catch them with a single except clause.
    """
2335 class ExtractorError(YoutubeDLError
):
2336 """Error during info extraction."""
2338 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None):
2339 """ tb, if given, is the original traceback (so that it can be printed out).
2340 If expected is set, this is a normal error message and most likely not a bug in youtube-dlc.
2343 if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
):
2345 if video_id
is not None:
2346 msg
= video_id
+ ': ' + msg
2348 msg
+= ' (caused by %r)' % cause
2350 msg
+= bug_reports_message()
2351 super(ExtractorError
, self
).__init
__(msg
)
2354 self
.exc_info
= sys
.exc_info() # preserve original exception
2356 self
.video_id
= video_id
2358 def format_traceback(self
):
2359 if self
.traceback
is None:
2361 return ''.join(traceback
.format_tb(self
.traceback
))
2364 class UnsupportedError(ExtractorError
):
2365 def __init__(self
, url
):
2366 super(UnsupportedError
, self
).__init
__(
2367 'Unsupported URL: %s' % url
, expected
=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
2376 class GeoRestrictedError(ExtractorError
):
2377 """Geographic restriction Error exception.
2379 This exception may be thrown when a video is not available from your
2380 geographic location due to geographic restrictions imposed by a website.
2383 def __init__(self
, msg
, countries
=None):
2384 super(GeoRestrictedError
, self
).__init
__(msg
, expected
=True)
2386 self
.countries
= countries
2389 class DownloadError(YoutubeDLError
):
2390 """Download Error exception.
2392 This exception may be thrown by FileDownloader objects if they are not
2393 configured to continue on errors. They will contain the appropriate
2397 def __init__(self
, msg
, exc_info
=None):
2398 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
2399 super(DownloadError
, self
).__init
__(msg
)
2400 self
.exc_info
= exc_info
2403 class SameFileError(YoutubeDLError
):
2404 """Same File exception.
2406 This exception will be thrown by FileDownloader objects if they detect
2407 multiple files would have to be downloaded to the same file on disk.
2412 class PostProcessingError(YoutubeDLError
):
2413 """Post Processing exception.
2415 This exception may be raised by PostProcessor's .run() method to
2416 indicate an error in the postprocessing task.
2419 def __init__(self
, msg
):
2420 super(PostProcessingError
, self
).__init
__(msg
)
class MaxDownloadsReached(YoutubeDLError):
    """ --max-downloads limit has been reached. """
2429 class UnavailableVideoError(YoutubeDLError
):
2430 """Unavailable Format exception.
2432 This exception will be thrown when a video is requested
2433 in a format that is not available for that video.
2438 class ContentTooShortError(YoutubeDLError
):
2439 """Content Too Short exception.
2441 This exception may be raised by FileDownloader objects when a file they
2442 download is too small for what the server announced first, indicating
2443 the connection was probably interrupted.
2446 def __init__(self
, downloaded
, expected
):
2447 super(ContentTooShortError
, self
).__init
__(
2448 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded
, expected
)
2451 self
.downloaded
= downloaded
2452 self
.expected
= expected
2455 class XAttrMetadataError(YoutubeDLError
):
2456 def __init__(self
, code
=None, msg
='Unknown error'):
2457 super(XAttrMetadataError
, self
).__init
__(msg
)
2461 # Parsing code and msg
2462 if (self
.code
in (errno
.ENOSPC
, errno
.EDQUOT
)
2463 or 'No space left' in self
.msg
or 'Disk quota excedded' in self
.msg
):
2464 self
.reason
= 'NO_SPACE'
2465 elif self
.code
== errno
.E2BIG
or 'Argument list too long' in self
.msg
:
2466 self
.reason
= 'VALUE_TOO_LONG'
2468 self
.reason
= 'NOT_SUPPORTED'
2471 class XAttrUnavailableError(YoutubeDLError
):
2475 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
2476 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
2477 # expected HTTP responses to meet HTTP/1.0 or later (see also
2478 # https://github.com/ytdl-org/youtube-dl/issues/6727)
2479 if sys
.version_info
< (3, 0):
2480 kwargs
['strict'] = True
2481 hc
= http_class(*args
, **compat_kwargs(kwargs
))
2482 source_address
= ydl_handler
._params
.get('source_address')
2484 if source_address
is not None:
2485 # This is to workaround _create_connection() from socket where it will try all
2486 # address data from getaddrinfo() including IPv6. This filters the result from
2487 # getaddrinfo() based on the source_address value.
2488 # This is based on the cpython socket.create_connection() function.
2489 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
2490 def _create_connection(address
, timeout
=socket
._GLOBAL
_DEFAULT
_TIMEOUT
, source_address
=None):
2491 host
, port
= address
2493 addrs
= socket
.getaddrinfo(host
, port
, 0, socket
.SOCK_STREAM
)
2494 af
= socket
.AF_INET
if '.' in source_address
[0] else socket
.AF_INET6
2495 ip_addrs
= [addr
for addr
in addrs
if addr
[0] == af
]
2496 if addrs
and not ip_addrs
:
2497 ip_version
= 'v4' if af
== socket
.AF_INET
else 'v6'
2499 "No remote IP%s addresses available for connect, can't use '%s' as source address"
2500 % (ip_version
, source_address
[0]))
2501 for res
in ip_addrs
:
2502 af
, socktype
, proto
, canonname
, sa
= res
2505 sock
= socket
.socket(af
, socktype
, proto
)
2506 if timeout
is not socket
._GLOBAL
_DEFAULT
_TIMEOUT
:
2507 sock
.settimeout(timeout
)
2508 sock
.bind(source_address
)
2510 err
= None # Explicitly break reference cycle
2512 except socket
.error
as _
:
2514 if sock
is not None:
2519 raise socket
.error('getaddrinfo returns an empty list')
2520 if hasattr(hc
, '_create_connection'):
2521 hc
._create
_connection
= _create_connection
2522 sa
= (source_address
, 0)
2523 if hasattr(hc
, 'source_address'): # Python 2.7+
2524 hc
.source_address
= sa
2526 def _hc_connect(self
, *args
, **kwargs
):
2527 sock
= _create_connection(
2528 (self
.host
, self
.port
), self
.timeout
, sa
)
2530 self
.sock
= ssl
.wrap_socket(
2531 sock
, self
.key_file
, self
.cert_file
,
2532 ssl_version
=ssl
.PROTOCOL_TLSv1
)
2535 hc
.connect
= functools
.partial(_hc_connect
, hc
)
def handle_youtubedl_headers(headers):
    """Process youtube-dl internal pseudo-headers in a header mapping.

    If the 'Youtubedl-no-compression' marker is present, return a new dict
    with any Accept-Encoding header (matched case-insensitively) and the
    marker itself removed; otherwise return the mapping unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    # Rebuild without Accept-Encoding, then drop the internal marker.
    cleaned = dict((name, value) for name, value in headers.items()
                   if name.lower() != 'accept-encoding')
    del cleaned['Youtubedl-no-compression']
    return cleaned
2550 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
2551 """Handler for HTTP requests and responses.
2553 This class, when installed with an OpenerDirector, automatically adds
2554 the standard headers to every HTTP request and handles gzipped and
2555 deflated responses from web servers. If compression is to be avoided in
2556 a particular request, the original request in the program code only has
2557 to include the HTTP header "Youtubedl-no-compression", which will be
2558 removed before making the real request.
2560 Part of this code was copied from:
2562 http://techknack.net/python-urllib2-handlers/
2564 Andrew Rowls, the author of that code, agreed to release it to the
2568 def __init__(self
, params
, *args
, **kwargs
):
2569 compat_urllib_request
.HTTPHandler
.__init
__(self
, *args
, **kwargs
)
2570 self
._params
= params
2572 def http_open(self
, req
):
2573 conn_class
= compat_http_client
.HTTPConnection
2575 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2577 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2578 del req
.headers
['Ytdl-socks-proxy']
2580 return self
.do_open(functools
.partial(
2581 _create_http_connection
, self
, conn_class
, False),
2587 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
2589 return zlib
.decompress(data
)
2591 def http_request(self
, req
):
2592 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
2593 # always respected by websites, some tend to give out URLs with non percent-encoded
2594 # non-ASCII characters (see telemb.py, ard.py [#3412])
2595 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
2596 # To work around aforementioned issue we will replace request's original URL with
2597 # percent-encoded one
2598 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
2599 # the code of this workaround has been moved here from YoutubeDL.urlopen()
2600 url
= req
.get_full_url()
2601 url_escaped
= escape_url(url
)
2603 # Substitute URL if any change after escaping
2604 if url
!= url_escaped
:
2605 req
= update_Request(req
, url
=url_escaped
)
2607 for h
, v
in std_headers
.items():
2608 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
2609 # The dict keys are capitalized because of this bug by urllib
2610 if h
.capitalize() not in req
.headers
:
2611 req
.add_header(h
, v
)
2613 req
.headers
= handle_youtubedl_headers(req
.headers
)
2615 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
2616 # Python 2.6 is brain-dead when it comes to fragments
2617 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
2618 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
2622 def http_response(self
, req
, resp
):
2625 if resp
.headers
.get('Content-encoding', '') == 'gzip':
2626 content
= resp
.read()
2627 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
2629 uncompressed
= io
.BytesIO(gz
.read())
2630 except IOError as original_ioerror
:
2631 # There may be junk add the end of the file
2632 # See http://stackoverflow.com/q/4928560/35070 for details
2633 for i
in range(1, 1024):
2635 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
2636 uncompressed
= io
.BytesIO(gz
.read())
2641 raise original_ioerror
2642 resp
= compat_urllib_request
.addinfourl(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2643 resp
.msg
= old_resp
.msg
2644 del resp
.headers
['Content-encoding']
2646 if resp
.headers
.get('Content-encoding', '') == 'deflate':
2647 gz
= io
.BytesIO(self
.deflate(resp
.read()))
2648 resp
= compat_urllib_request
.addinfourl(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2649 resp
.msg
= old_resp
.msg
2650 del resp
.headers
['Content-encoding']
2651 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
2652 # https://github.com/ytdl-org/youtube-dl/issues/6457).
2653 if 300 <= resp
.code
< 400:
2654 location
= resp
.headers
.get('Location')
2656 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
2657 if sys
.version_info
>= (3, 0):
2658 location
= location
.encode('iso-8859-1').decode('utf-8')
2660 location
= location
.decode('utf-8')
2661 location_escaped
= escape_url(location
)
2662 if location
!= location_escaped
:
2663 del resp
.headers
['Location']
2664 if sys
.version_info
< (3, 0):
2665 location_escaped
= location_escaped
.encode('utf-8')
2666 resp
.headers
['Location'] = location_escaped
2669 https_request
= http_request
2670 https_response
= http_response
2673 def make_socks_conn_class(base_class
, socks_proxy
):
2674 assert issubclass(base_class
, (
2675 compat_http_client
.HTTPConnection
, compat_http_client
.HTTPSConnection
))
2677 url_components
= compat_urlparse
.urlparse(socks_proxy
)
2678 if url_components
.scheme
.lower() == 'socks5':
2679 socks_type
= ProxyType
.SOCKS5
2680 elif url_components
.scheme
.lower() in ('socks', 'socks4'):
2681 socks_type
= ProxyType
.SOCKS4
2682 elif url_components
.scheme
.lower() == 'socks4a':
2683 socks_type
= ProxyType
.SOCKS4A
2685 def unquote_if_non_empty(s
):
2688 return compat_urllib_parse_unquote_plus(s
)
2692 url_components
.hostname
, url_components
.port
or 1080,
2694 unquote_if_non_empty(url_components
.username
),
2695 unquote_if_non_empty(url_components
.password
),
2698 class SocksConnection(base_class
):
2700 self
.sock
= sockssocket()
2701 self
.sock
.setproxy(*proxy_args
)
2702 if type(self
.timeout
) in (int, float):
2703 self
.sock
.settimeout(self
.timeout
)
2704 self
.sock
.connect((self
.host
, self
.port
))
2706 if isinstance(self
, compat_http_client
.HTTPSConnection
):
2707 if hasattr(self
, '_context'): # Python > 2.6
2708 self
.sock
= self
._context
.wrap_socket(
2709 self
.sock
, server_hostname
=self
.host
)
2711 self
.sock
= ssl
.wrap_socket(self
.sock
)
2713 return SocksConnection
2716 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
2717 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
2718 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
2719 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
2720 self
._params
= params
2722 def https_open(self
, req
):
2724 conn_class
= self
._https
_conn
_class
2726 if hasattr(self
, '_context'): # python > 2.6
2727 kwargs
['context'] = self
._context
2728 if hasattr(self
, '_check_hostname'): # python 3.x
2729 kwargs
['check_hostname'] = self
._check
_hostname
2731 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2733 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2734 del req
.headers
['Ytdl-socks-proxy']
2736 return self
.do_open(functools
.partial(
2737 _create_http_connection
, self
, conn_class
, True),
2741 class YoutubeDLCookieJar(compat_cookiejar
.MozillaCookieJar
):
2743 See [1] for cookie file format.
2745 1. https://curl.haxx.se/docs/http-cookies.html
2747 _HTTPONLY_PREFIX
= '#HttpOnly_'
2749 _HEADER
= '''# Netscape HTTP Cookie File
2750 # This file is generated by youtube-dlc. Do not edit.
2753 _CookieFileEntry
= collections
.namedtuple(
2755 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
2757 def save(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2759 Save cookies to a file.
2761 Most of the code is taken from CPython 3.8 and slightly adapted
2762 to support cookie files with UTF-8 in both python 2 and 3.
2764 if filename
is None:
2765 if self
.filename
is not None:
2766 filename
= self
.filename
2768 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2770 # Store session cookies with `expires` set to 0 instead of an empty
2773 if cookie
.expires
is None:
2776 with io
.open(filename
, 'w', encoding
='utf-8') as f
:
2777 f
.write(self
._HEADER
)
2780 if not ignore_discard
and cookie
.discard
:
2782 if not ignore_expires
and cookie
.is_expired(now
):
2788 if cookie
.domain
.startswith('.'):
2789 initial_dot
= 'TRUE'
2791 initial_dot
= 'FALSE'
2792 if cookie
.expires
is not None:
2793 expires
= compat_str(cookie
.expires
)
2796 if cookie
.value
is None:
2797 # cookies.txt regards 'Set-Cookie: foo' as a cookie
2798 # with no name, whereas http.cookiejar regards it as a
2799 # cookie with no value.
2804 value
= cookie
.value
2806 '\t'.join([cookie
.domain
, initial_dot
, cookie
.path
,
2807 secure
, expires
, name
, value
]) + '\n')
2809 def load(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2810 """Load cookies from a file."""
2811 if filename
is None:
2812 if self
.filename
is not None:
2813 filename
= self
.filename
2815 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2817 def prepare_line(line
):
2818 if line
.startswith(self
._HTTPONLY
_PREFIX
):
2819 line
= line
[len(self
._HTTPONLY
_PREFIX
):]
2820 # comments and empty lines are fine
2821 if line
.startswith('#') or not line
.strip():
2823 cookie_list
= line
.split('\t')
2824 if len(cookie_list
) != self
._ENTRY
_LEN
:
2825 raise compat_cookiejar
.LoadError('invalid length %d' % len(cookie_list
))
2826 cookie
= self
._CookieFileEntry
(*cookie_list
)
2827 if cookie
.expires_at
and not cookie
.expires_at
.isdigit():
2828 raise compat_cookiejar
.LoadError('invalid expires at %s' % cookie
.expires_at
)
2832 with io
.open(filename
, encoding
='utf-8') as f
:
2835 cf
.write(prepare_line(line
))
2836 except compat_cookiejar
.LoadError
as e
:
2838 'WARNING: skipping cookie file entry due to %s: %r\n'
2839 % (e
, line
), sys
.stderr
)
2842 self
._really
_load
(cf
, filename
, ignore_discard
, ignore_expires
)
2843 # Session cookies are denoted by either `expires` field set to
2844 # an empty string or 0. MozillaCookieJar only recognizes the former
2845 # (see [1]). So we need force the latter to be recognized as session
2846 # cookies on our own.
2847 # Session cookies may be important for cookies-based authentication,
2848 # e.g. usually, when user does not check 'Remember me' check box while
2849 # logging in on a site, some important cookies are stored as session
2850 # cookies so that not recognizing them will result in failed login.
2851 # 1. https://bugs.python.org/issue17164
2853 # Treat `expires=0` cookies as session cookies
2854 if cookie
.expires
== 0:
2855 cookie
.expires
= None
2856 cookie
.discard
= True
2859 class YoutubeDLCookieProcessor(compat_urllib_request
.HTTPCookieProcessor
):
2860 def __init__(self
, cookiejar
=None):
2861 compat_urllib_request
.HTTPCookieProcessor
.__init
__(self
, cookiejar
)
2863 def http_response(self
, request
, response
):
2864 # Python 2 will choke on next HTTP request in row if there are non-ASCII
2865 # characters in Set-Cookie HTTP header of last response (see
2866 # https://github.com/ytdl-org/youtube-dl/issues/6769).
2867 # In order to at least prevent crashing we will percent encode Set-Cookie
2868 # header before HTTPCookieProcessor starts processing it.
2869 # if sys.version_info < (3, 0) and response.headers:
2870 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
2871 # set_cookie = response.headers.get(set_cookie_header)
2873 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
2874 # if set_cookie != set_cookie_escaped:
2875 # del response.headers[set_cookie_header]
2876 # response.headers[set_cookie_header] = set_cookie_escaped
2877 return compat_urllib_request
.HTTPCookieProcessor
.http_response(self
, request
, response
)
2879 https_request
= compat_urllib_request
.HTTPCookieProcessor
.http_request
2880 https_response
= http_response
2883 class YoutubeDLRedirectHandler(compat_urllib_request
.HTTPRedirectHandler
):
2884 if sys
.version_info
[0] < 3:
2885 def redirect_request(self
, req
, fp
, code
, msg
, headers
, newurl
):
2886 # On python 2 urlh.geturl() may sometimes return redirect URL
2887 # as byte string instead of unicode. This workaround allows
2888 # to force it always return unicode.
2889 return compat_urllib_request
.HTTPRedirectHandler
.redirect_request(self
, req
, fp
, code
, msg
, headers
, compat_str(newurl
))
def extract_timezone(date_str):
    """Split a UTC-offset suffix off *date_str*.

    Returns (offset, remainder): *offset* is a datetime.timedelta (zero when
    no 'Z' / '+HH:MM' / '-HHMM' suffix is recognized) and *remainder* is the
    input with any recognized suffix stripped.
    """
    offset = datetime.timedelta()
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]
        sign_char = m.group('sign')
        if sign_char:
            direction = 1 if sign_char == '+' else -1
            offset = datetime.timedelta(
                hours=direction * int(m.group('hours')),
                minutes=direction * int(m.group('minutes')))
    return offset, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # Drop fractional seconds: '%S' in strptime cannot parse them.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        fmt = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        naive_utc = datetime.datetime.strptime(date_str, fmt) - timezone
        return calendar.timegm(naive_utc.timetuple())
    except ValueError:
        # Unparseable date -> implicit None, matching the other date helpers
        pass
2929 def date_formats(day_first
=True):
2930 return DATE_FORMATS_DAY_FIRST
if day_first
else DATE_FORMATS_MONTH_FIRST
2933 def unified_strdate(date_str
, day_first
=True):
2934 """Return a string with the date in the format YYYYMMDD"""
2936 if date_str
is None:
2940 date_str
= date_str
.replace(',', ' ')
2941 # Remove AM/PM + timezone
2942 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
2943 _
, date_str
= extract_timezone(date_str
)
2945 for expression
in date_formats(day_first
):
2947 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
2950 if upload_date
is None:
2951 timetuple
= email
.utils
.parsedate_tz(date_str
)
2954 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
2957 if upload_date
is not None:
2958 return compat_str(upload_date
)
2961 def unified_timestamp(date_str
, day_first
=True):
2962 if date_str
is None:
2965 date_str
= re
.sub(r
'[,|]', '', date_str
)
2967 pm_delta
= 12 if re
.search(r
'(?i)PM', date_str
) else 0
2968 timezone
, date_str
= extract_timezone(date_str
)
2970 # Remove AM/PM + timezone
2971 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
2973 # Remove unrecognized timezones from ISO 8601 alike timestamps
2974 m
= re
.search(r
'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str
)
2976 date_str
= date_str
[:-len(m
.group('tz'))]
2978 # Python only supports microseconds, so remove nanoseconds
2979 m
= re
.search(r
'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str
)
2981 date_str
= m
.group(1)
2983 for expression
in date_formats(day_first
):
2985 dt
= datetime
.datetime
.strptime(date_str
, expression
) - timezone
+ datetime
.timedelta(hours
=pm_delta
)
2986 return calendar
.timegm(dt
.timetuple())
2989 timetuple
= email
.utils
.parsedate_tz(date_str
)
2991 return calendar
.timegm(timetuple
) + pm_delta
* 3600
def determine_ext(url, default_ext='unknown_video'):
    """Best-effort guess of the file extension contained in *url*."""
    if url is None or '.' not in url:
        return default_ext
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
3007 def subtitles_filename(filename
, sub_lang
, sub_format
, expected_real_ext
=None):
3008 return replace_extension(filename
, sub_lang
+ '.' + sub_format
, expected_real_ext
)
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is None:
        # Plain YYYYMMDD date
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # A bad approximation? Months and years are mapped to a fixed day count.
    if unit == 'month':
        unit, amount = 'day', amount * 30
    elif unit == 'year':
        unit, amount = 'day', amount * 365
    return today + datetime.timedelta(**{unit + 's': amount})
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    m = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if m is None:
        # Not an 8-digit date: hand the input back untouched
        return date_str
    return '-'.join(m.groups())
3050 class DateRange(object):
3051 """Represents a time interval between two dates"""
3053 def __init__(self
, start
=None, end
=None):
3054 """start and end must be strings in the format accepted by date"""
3055 if start
is not None:
3056 self
.start
= date_from_str(start
)
3058 self
.start
= datetime
.datetime
.min.date()
3060 self
.end
= date_from_str(end
)
3062 self
.end
= datetime
.datetime
.max.date()
3063 if self
.start
> self
.end
:
3064 raise ValueError('Date range: "%s" , the start date must be before the end date' % self
)
3068 """Returns a range that only contains the given day"""
3069 return cls(day
, day
)
3071 def __contains__(self
, date
):
3072 """Check if the date is in the range"""
3073 if not isinstance(date
, datetime
.date
):
3074 date
= date_from_str(date
)
3075 return self
.start
<= date
<= self
.end
3078 return '%s - %s' % (self
.start
.isoformat(), self
.end
.isoformat())
3081 def platform_name():
3082 """ Returns the platform name as a compat_str """
3083 res
= platform
.platform()
3084 if isinstance(res
, bytes):
3085 res
= res
.decode(preferredencoding())
3087 assert isinstance(res
, compat_str
)
3091 def _windows_write_string(s
, out
):
3092 """ Returns True if the string was written using special methods,
3093 False if it has yet to be written out."""
3094 # Adapted from http://stackoverflow.com/a/3259271/35070
3097 import ctypes
.wintypes
3105 fileno
= out
.fileno()
3106 except AttributeError:
3107 # If the output stream doesn't have a fileno, it's virtual
3109 except io
.UnsupportedOperation
:
3110 # Some strange Windows pseudo files?
3112 if fileno
not in WIN_OUTPUT_IDS
:
3115 GetStdHandle
= compat_ctypes_WINFUNCTYPE(
3116 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
3117 ('GetStdHandle', ctypes
.windll
.kernel32
))
3118 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
3120 WriteConsoleW
= compat_ctypes_WINFUNCTYPE(
3121 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
3122 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
3123 ctypes
.wintypes
.LPVOID
)(('WriteConsoleW', ctypes
.windll
.kernel32
))
3124 written
= ctypes
.wintypes
.DWORD(0)
3126 GetFileType
= compat_ctypes_WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)(('GetFileType', ctypes
.windll
.kernel32
))
3127 FILE_TYPE_CHAR
= 0x0002
3128 FILE_TYPE_REMOTE
= 0x8000
3129 GetConsoleMode
= compat_ctypes_WINFUNCTYPE(
3130 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
3131 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
3132 ('GetConsoleMode', ctypes
.windll
.kernel32
))
3133 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
3135 def not_a_console(handle
):
3136 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
3138 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
3139 or GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
3141 if not_a_console(h
):
3144 def next_nonbmp_pos(s
):
3146 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
3147 except StopIteration:
3151 count
= min(next_nonbmp_pos(s
), 1024)
3153 ret
= WriteConsoleW(
3154 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
3156 raise OSError('Failed to write string')
3157 if not count
: # We just wrote a non-BMP character
3158 assert written
.value
== 2
3161 assert written
.value
> 0
3162 s
= s
[written
.value
:]
3166 def write_string(s
, out
=None, encoding
=None):
3169 assert type(s
) == compat_str
3171 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
3172 if _windows_write_string(s
, out
):
3175 if ('b' in getattr(out
, 'mode', '')
3176 or sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
3177 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
3179 elif hasattr(out
, 'buffer'):
3180 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
3181 byt
= s
.encode(enc
, 'ignore')
3182 out
.buffer.write(byt
)
def bytes_to_intlist(bs):
    """Turn a bytes-like / str value into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3 bytes already index to ints
        return list(bs)
    # Python 2 str: indexing yields one-char strings, convert each
    return [ord(ch) for ch in bs]
3197 def intlist_to_bytes(xs
):
3200 return compat_struct_pack('%dB' % len(xs
), *xs
)
3203 # Cross-platform file locking
3204 if sys
.platform
== 'win32':
3205 import ctypes
.wintypes
3208 class OVERLAPPED(ctypes
.Structure
):
3210 ('Internal', ctypes
.wintypes
.LPVOID
),
3211 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
3212 ('Offset', ctypes
.wintypes
.DWORD
),
3213 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
3214 ('hEvent', ctypes
.wintypes
.HANDLE
),
3217 kernel32
= ctypes
.windll
.kernel32
3218 LockFileEx
= kernel32
.LockFileEx
3219 LockFileEx
.argtypes
= [
3220 ctypes
.wintypes
.HANDLE
, # hFile
3221 ctypes
.wintypes
.DWORD
, # dwFlags
3222 ctypes
.wintypes
.DWORD
, # dwReserved
3223 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3224 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3225 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3227 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
3228 UnlockFileEx
= kernel32
.UnlockFileEx
3229 UnlockFileEx
.argtypes
= [
3230 ctypes
.wintypes
.HANDLE
, # hFile
3231 ctypes
.wintypes
.DWORD
, # dwReserved
3232 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3233 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3234 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3236 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
3237 whole_low
= 0xffffffff
3238 whole_high
= 0x7fffffff
3240 def _lock_file(f
, exclusive
):
3241 overlapped
= OVERLAPPED()
3242 overlapped
.Offset
= 0
3243 overlapped
.OffsetHigh
= 0
3244 overlapped
.hEvent
= 0
3245 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
3246 handle
= msvcrt
.get_osfhandle(f
.fileno())
3247 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
3248 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3249 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
3251 def _unlock_file(f
):
3252 assert f
._lock
_file
_overlapped
_p
3253 handle
= msvcrt
.get_osfhandle(f
.fileno())
3254 if not UnlockFileEx(handle
, 0,
3255 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3256 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
3259 # Some platforms, such as Jython, is missing fcntl
3263 def _lock_file(f
, exclusive
):
3264 fcntl
.flock(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
3266 def _unlock_file(f
):
3267 fcntl
.flock(f
, fcntl
.LOCK_UN
)
3269 UNSUPPORTED_MSG
= 'file locking is not supported on this platform'
3271 def _lock_file(f
, exclusive
):
3272 raise IOError(UNSUPPORTED_MSG
)
3274 def _unlock_file(f
):
3275 raise IOError(UNSUPPORTED_MSG
)
3278 class locked_file(object):
3279 def __init__(self
, filename
, mode
, encoding
=None):
3280 assert mode
in ['r', 'a', 'w']
3281 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
3284 def __enter__(self
):
3285 exclusive
= self
.mode
!= 'r'
3287 _lock_file(self
.f
, exclusive
)
3293 def __exit__(self
, etype
, value
, traceback
):
3295 _unlock_file(self
.f
)
3302 def write(self
, *args
):
3303 return self
.f
.write(*args
)
3305 def read(self
, *args
):
3306 return self
.f
.read(*args
)
def get_filesystem_encoding():
    """Return the OS filesystem encoding, defaulting to UTF-8 when unset."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
3314 def shell_quote(args
):
3316 encoding
= get_filesystem_encoding()
3318 if isinstance(a
, bytes):
3319 # We may get a filename encoded with 'encodeFilename'
3320 a
= a
.decode(encoding
)
3321 quoted_args
.append(compat_shlex_quote(a
))
3322 return ' '.join(quoted_args
)
3325 def smuggle_url(url
, data
):
3326 """ Pass additional data in a URL for internal use. """
3328 url
, idata
= unsmuggle_url(url
, {})
3330 sdata
= compat_urllib_parse_urlencode(
3331 {'__youtubedl_smuggle': json.dumps(data)}
)
3332 return url
+ '#' + sdata
3335 def unsmuggle_url(smug_url
, default
=None):
3336 if '#__youtubedl_smuggle' not in smug_url
:
3337 return smug_url
, default
3338 url
, _
, sdata
= smug_url
.rpartition('#')
3339 jsond
= compat_parse_qs(sdata
)['__youtubedl_smuggle'][0]
3340 data
= json
.loads(jsond
)
def format_bytes(bytes):
    """Render a byte count as a human-readable '<value><unit>' string."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # log(0) is undefined, so zero bytes stays in the 'B' bucket
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    unit = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB')[exponent]
    scaled = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (scaled, unit)
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number><unit>' from *s* using *unit_table* multipliers.

    Returns the scaled integer value, or None when *s* does not start with a
    recognizable number+unit pair. A comma decimal separator is accepted.
    """
    units_re = '|'.join(re.escape(unit) for unit in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
3369 def parse_filesize(s
):
3373 # The lower-case forms are of course incorrect and unofficial,
3374 # but we support those too
3391 'megabytes': 1000 ** 2,
3392 'mebibytes': 1024 ** 2,
3398 'gigabytes': 1000 ** 3,
3399 'gibibytes': 1024 ** 3,
3405 'terabytes': 1000 ** 4,
3406 'tebibytes': 1024 ** 4,
3412 'petabytes': 1000 ** 5,
3413 'pebibytes': 1024 ** 5,
3419 'exabytes': 1000 ** 6,
3420 'exbibytes': 1024 ** 6,
3426 'zettabytes': 1000 ** 7,
3427 'zebibytes': 1024 ** 7,
3433 'yottabytes': 1000 ** 8,
3434 'yobibytes': 1024 ** 8,
3437 return lookup_unit_table(_UNIT_TABLE
, s
)
3446 if re
.match(r
'^[\d,.]+$', s
):
3447 return str_to_int(s
)
3458 return lookup_unit_table(_UNIT_TABLE
, s
)
def parse_resolution(s):
    """Extract a {'width': ..., 'height': ...} dict from a resolution string."""
    if s is None:
        return {}

    # '<width>x<height>' (also the Unicode multiplication sign)
    mobj = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
    if mobj:
        return {
            'width': int(mobj.group('w')),
            'height': int(mobj.group('h')),
        }

    # '720p' / '1080i' style labels give only the height
    mobj = re.search(r'\b(\d+)[pPiI]\b', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    # '4k' / '8k' labels: each 'k' step corresponds to 540 lines
    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        return {'height': int(mobj.group(1)) * 540}

    return {}
3483 def parse_bitrate(s
):
3484 if not isinstance(s
, compat_str
):
3486 mobj
= re
.search(r
'\b(\d+)\s*kbps', s
)
3488 return int(mobj
.group(1))
3491 def month_by_name(name
, lang
='en'):
3492 """ Return the number of a month by (locale-independently) English name """
3494 month_names
= MONTH_NAMES
.get(lang
, MONTH_NAMES
['en'])
3497 return month_names
.index(name
) + 1
3502 def month_by_abbreviation(abbrev
):
3503 """ Return the number of a month by (locale-independently) English
3507 return [s
[:3] for s
in ENGLISH_MONTH_NAMES
].index(abbrev
) + 1
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' by '&amp;' in XML, leaving existing entities intact."""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
3520 def setproctitle(title
):
3521 assert isinstance(title
, compat_str
)
3523 # ctypes in Jython is not complete
3524 # http://bugs.jython.org/issue2148
3525 if sys
.platform
.startswith('java'):
3529 libc
= ctypes
.cdll
.LoadLibrary('libc.so.6')
3533 # LoadLibrary in Windows Python 2.7.13 only expects
3534 # a bytestring, but since unicode_literals turns
3535 # every string into a unicode string, it fails.
3537 title_bytes
= title
.encode('utf-8')
3538 buf
= ctypes
.create_string_buffer(len(title_bytes
))
3539 buf
.value
= title_bytes
3541 libc
.prctl(15, buf
, 0, 0, 0)
3542 except AttributeError:
3543 return # Strange libc, just skip this
def remove_start(s, start):
    """Strip prefix *start* from *s* (None-safe; unchanged when absent)."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Strip suffix *end* from *s* (None-safe; unchanged when absent).

    Bug fix: an empty *end* is now a no-op. Every string "ends with" '',
    and the previous one-liner `s[:-len(end)]` evaluated `s[:-0]` == `s[:0]`,
    silently truncating the whole string to ''.
    """
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
def remove_quotes(s):
    """Drop one pair of matching single or double quotes wrapping *s*."""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
def get_domain(url):
    """Return the 'host.tld' part of *url* (scheme and 'www.' stripped), or None."""
    m = re.match(
        r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?',
        url)
    if m is None:
        return None
    return m.group('domain')
3568 def url_basename(url
):
3569 path
= compat_urlparse
.urlparse(url
).path
3570 return path
.strip('/').split('/')[-1]
3574 return re
.match(r
'https?://[^?#&]+/', url
).group()
3577 def urljoin(base
, path
):
3578 if isinstance(path
, bytes):
3579 path
= path
.decode('utf-8')
3580 if not isinstance(path
, compat_str
) or not path
:
3582 if re
.match(r
'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path
):
3584 if isinstance(base
, bytes):
3585 base
= base
.decode('utf-8')
3586 if not isinstance(base
, compat_str
) or not re
.match(
3587 r
'^(?:https?:)?//', base
):
3589 return compat_urlparse
.urljoin(base
, path
)
3592 class HEADRequest(compat_urllib_request
.Request
):
3593 def get_method(self
):
3597 class PUTRequest(compat_urllib_request
.Request
):
3598 def get_method(self
):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert *v* (optionally an attribute of it) to an int scaled by
    invscale/scale; return *default* when conversion is impossible."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        return default
3616 def str_or_none(v
, default
=None):
3617 return default
if v
is None else compat_str(v
)
3620 def str_to_int(int_str
):
3621 """ A more relaxed version of int_or_none """
3622 if isinstance(int_str
, compat_integer_types
):
3624 elif isinstance(int_str
, compat_str
):
3625 int_str
= re
.sub(r
'[,\.\+]', '', int_str
)
3626 return int_or_none(int_str
)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to a float scaled by invscale/scale; *default* when *v*
    is None or not convertible."""
    if v is None:
        return default
    try:
        return invscale * float(v) / scale
    except (ValueError, TypeError):
        return default
def bool_or_none(v, default=None):
    """Pass through genuine booleans; anything else becomes *default*."""
    if isinstance(v, bool):
        return v
    return default
3642 def strip_or_none(v
, default
=None):
3643 return v
.strip() if isinstance(v
, compat_str
) else default
3646 def url_or_none(url
):
3647 if not url
or not isinstance(url
, compat_str
):
3650 return url
if re
.match(r
'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url
) else None
3653 def parse_duration(s
):
3654 if not isinstance(s
, compat_basestring
):
3659 days
, hours
, mins
, secs
, ms
= [None] * 5
3660 m
= re
.match(r
'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s
)
3662 days
, hours
, mins
, secs
, ms
= m
.groups()
3667 [0-9]+\s*y(?:ears?)?\s*
3670 [0-9]+\s*m(?:onths?)?\s*
3673 [0-9]+\s*w(?:eeks?)?\s*
3676 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
3680 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
3683 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
3686 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
3689 days
, hours
, mins
, secs
, ms
= m
.groups()
3691 m
= re
.match(r
'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s
)
3693 hours
, mins
= m
.groups()
3699 duration
+= float(secs
)
3701 duration
+= float(mins
) * 60
3703 duration
+= float(hours
) * 60 * 60
3705 duration
+= float(days
) * 24 * 60 * 60
3707 duration
+= float(ms
)
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* in front of the real extension of *filename*.

    When *expected_real_ext* is given and does not match the actual
    extension, *ext* is appended after the whole name instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the extension of *filename* for *ext*.

    When *expected_real_ext* is given and does not match the actual
    extension, *ext* is appended to the full name instead.
    """
    name, real_ext = os.path.splitext(filename)
    keep_full_name = expected_real_ext and real_ext[1:] != expected_real_ext
    base = filename if keep_full_name else name
    return '{0}.{1}'.format(base, ext)
3726 def check_executable(exe
, args
=[]):
3727 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
3728 args can be a list of arguments for a short output (like -version) """
3730 subprocess
.Popen([exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
).communicate()
3736 def get_exe_version(exe
, args
=['--version'],
3737 version_re
=None, unrecognized
='present'):
3738 """ Returns the version of the specified executable,
3739 or False if the executable is not present """
3741 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
3742 # SIGTTOU if youtube-dlc is run in the background.
3743 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
3744 out
, _
= subprocess
.Popen(
3745 [encodeArgument(exe
)] + args
,
3746 stdin
=subprocess
.PIPE
,
3747 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
).communicate()
3750 if isinstance(out
, bytes): # Python 2.x
3751 out
= out
.decode('ascii', 'ignore')
3752 return detect_exe_version(out
, version_re
, unrecognized
)
3755 def detect_exe_version(output
, version_re
=None, unrecognized
='present'):
3756 assert isinstance(output
, compat_str
)
3757 if version_re
is None:
3758 version_re
= r
'version\s+([-0-9._a-zA-Z]+)'
3759 m
= re
.search(version_re
, output
)
3766 class PagedList(object):
3768 # This is only useful for tests
3769 return len(self
.getslice())
3772 class OnDemandPagedList(PagedList
):
3773 def __init__(self
, pagefunc
, pagesize
, use_cache
=True):
3774 self
._pagefunc
= pagefunc
3775 self
._pagesize
= pagesize
3776 self
._use
_cache
= use_cache
3780 def getslice(self
, start
=0, end
=None):
3782 for pagenum
in itertools
.count(start
// self
._pagesize
):
3783 firstid
= pagenum
* self
._pagesize
3784 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
3785 if start
>= nextfirstid
:
3790 page_results
= self
._cache
.get(pagenum
)
3791 if page_results
is None:
3792 page_results
= list(self
._pagefunc
(pagenum
))
3794 self
._cache
[pagenum
] = page_results
3797 start
% self
._pagesize
3798 if firstid
<= start
< nextfirstid
3802 ((end
- 1) % self
._pagesize
) + 1
3803 if (end
is not None and firstid
<= end
<= nextfirstid
)
3806 if startv
!= 0 or endv
is not None:
3807 page_results
= page_results
[startv
:endv
]
3808 res
.extend(page_results
)
3810 # A little optimization - if current page is not "full", ie. does
3811 # not contain page_size videos then we can assume that this page
3812 # is the last one - there are no more ids on further pages -
3813 # i.e. no need to query again.
3814 if len(page_results
) + startv
< self
._pagesize
:
3817 # If we got the whole page, but the next page is not interesting,
3818 # break out early as well
3819 if end
== nextfirstid
:
3824 class InAdvancePagedList(PagedList
):
3825 def __init__(self
, pagefunc
, pagecount
, pagesize
):
3826 self
._pagefunc
= pagefunc
3827 self
._pagecount
= pagecount
3828 self
._pagesize
= pagesize
3830 def getslice(self
, start
=0, end
=None):
3832 start_page
= start
// self
._pagesize
3834 self
._pagecount
if end
is None else (end
// self
._pagesize
+ 1))
3835 skip_elems
= start
- start_page
* self
._pagesize
3836 only_more
= None if end
is None else end
- start
3837 for pagenum
in range(start_page
, end_page
):
3838 page
= list(self
._pagefunc
(pagenum
))
3840 page
= page
[skip_elems
:]
3842 if only_more
is not None:
3843 if len(page
) < only_more
:
3844 only_more
-= len(page
)
3846 page
= page
[:only_more
]
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences found in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode literal '\\uXXXX' escape sequences found in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
3869 def escape_rfc3986(s
):
3870 """Escape non-ASCII characters as suggested by RFC 3986"""
3871 if sys
.version_info
< (3, 0) and isinstance(s
, compat_str
):
3872 s
= s
.encode('utf-8')
3873 return compat_urllib_parse
.quote(s
, b
"%/;:@&=+$,!~*'()?#[]")
3876 def escape_url(url
):
3877 """Escape URL as suggested by RFC 3986"""
3878 url_parsed
= compat_urllib_parse_urlparse(url
)
3879 return url_parsed
._replace
(
3880 netloc
=url_parsed
.netloc
.encode('idna').decode('ascii'),
3881 path
=escape_rfc3986(url_parsed
.path
),
3882 params
=escape_rfc3986(url_parsed
.params
),
3883 query
=escape_rfc3986(url_parsed
.query
),
3884 fragment
=escape_rfc3986(url_parsed
.fragment
)
3888 def read_batch_urls(batch_fd
):
3890 if not isinstance(url
, compat_str
):
3891 url
= url
.decode('utf-8', 'replace')
3892 BOM_UTF8
= '\xef\xbb\xbf'
3893 if url
.startswith(BOM_UTF8
):
3894 url
= url
[len(BOM_UTF8
):]
3896 if url
.startswith(('#', ';', ']')):
3900 with contextlib
.closing(batch_fd
) as fd
:
3901 return [url
for url
in map(fixup
, fd
) if url
]
3904 def urlencode_postdata(*args
, **kargs
):
3905 return compat_urllib_parse_urlencode(*args
, **kargs
).encode('ascii')
3908 def update_url_query(url
, query
):
3911 parsed_url
= compat_urlparse
.urlparse(url
)
3912 qs
= compat_parse_qs(parsed_url
.query
)
3914 return compat_urlparse
.urlunparse(parsed_url
._replace
(
3915 query
=compat_urllib_parse_urlencode(qs
, True)))
3918 def update_Request(req
, url
=None, data
=None, headers
={}, query={}
):
3919 req_headers
= req
.headers
.copy()
3920 req_headers
.update(headers
)
3921 req_data
= data
or req
.data
3922 req_url
= update_url_query(url
or req
.get_full_url(), query
)
3923 req_get_method
= req
.get_method()
3924 if req_get_method
== 'HEAD':
3925 req_type
= HEADRequest
3926 elif req_get_method
== 'PUT':
3927 req_type
= PUTRequest
3929 req_type
= compat_urllib_request
.Request
3931 req_url
, data
=req_data
, headers
=req_headers
,
3932 origin_req_host
=req
.origin_req_host
, unverifiable
=req
.unverifiable
)
3933 if hasattr(req
, 'timeout'):
3934 new_req
.timeout
= req
.timeout
3938 def _multipart_encode_impl(data
, boundary
):
3939 content_type
= 'multipart/form-data; boundary=%s' % boundary
3942 for k
, v
in data
.items():
3943 out
+= b
'--' + boundary
.encode('ascii') + b
'\r\n'
3944 if isinstance(k
, compat_str
):
3945 k
= k
.encode('utf-8')
3946 if isinstance(v
, compat_str
):
3947 v
= v
.encode('utf-8')
3948 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3949 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3950 content
= b
'Content-Disposition: form-data; name="' + k
+ b
'"\r\n\r\n' + v
+ b
'\r\n'
3951 if boundary
.encode('ascii') in content
:
3952 raise ValueError('Boundary overlaps with data')
3955 out
+= b
'--' + boundary
.encode('ascii') + b
'--\r\n'
3957 return out
, content_type
3960 def multipart_encode(data
, boundary
=None):
3962 Encode a dict to RFC 7578-compliant form-data
3965 A dict where keys and values can be either Unicode or bytes-like
3968 If specified a Unicode object, it's used as the boundary. Otherwise
3969 a random boundary is generated.
3971 Reference: https://tools.ietf.org/html/rfc7578
3973 has_specified_boundary
= boundary
is not None
3976 if boundary
is None:
3977 boundary
= '---------------' + str(random
.randrange(0x0fffffff, 0xffffffff))
3980 out
, content_type
= _multipart_encode_impl(data
, boundary
)
3983 if has_specified_boundary
:
3987 return out
, content_type
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Fetch the first usable value for one key or a sequence of keys.

    None values (and, with skip_false_values, all falsy values) are
    skipped; *default* is returned when nothing matches.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key in d:
            value = d[key]
            if value is not None and not (skip_false_values and not value):
                return value
    return default
def try_get(src, getter, expected_type=None):
    """Apply one or more getter callables to *src* and return the first
    result matching *expected_type* (any type when None). The usual lookup
    errors are swallowed; None is returned when nothing succeeds."""
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for get in getters:
        try:
            v = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(v, expected_type):
            return v
4013 def merge_dicts(*dicts
):
4015 for a_dict
in dicts
:
4016 for k
, v
in a_dict
.items():
4020 or (isinstance(v
, compat_str
) and v
4021 and isinstance(merged
[k
], compat_str
)
4022 and not merged
[k
])):
4027 def encode_compat_str(string
, encoding
=preferredencoding(), errors
='strict'):
4028 return string
if isinstance(string
, compat_str
) else compat_str(string
, encoding
, errors
)
4040 TV_PARENTAL_GUIDELINES
= {
4050 def parse_age_limit(s
):
4052 return s
if 0 <= s
<= 21 else None
4053 if not isinstance(s
, compat_basestring
):
4055 m
= re
.match(r
'^(?P<age>\d{1,2})\+?$', s
)
4057 return int(m
.group('age'))
4059 return US_RATINGS
[s
]
4060 m
= re
.match(r
'^TV[_-]?(%s)$' % '|'.join(k
[3:] for k
in TV_PARENTAL_GUIDELINES
), s
)
4062 return TV_PARENTAL_GUIDELINES
['TV-' + m
.group(1)]
4066 def strip_jsonp(code
):
4069 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
4070 (?:\s*&&\s*(?P=func_name))?
4071 \s*\(\s*(?P<callback_data>.*)\);?
4072 \s*?(?://[^\n]*)*$''',
4073 r
'\g<callback_data>', code
)
4076 def js_to_json(code
):
4077 COMMENT_RE
= r
'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
4078 SKIP_RE
= r
'\s*(?:{comment})?\s*'.format(comment
=COMMENT_RE
)
4080 (r
'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip
=SKIP_RE
), 16),
4081 (r
'(?s)^(0+[0-7]+){skip}:?$'.format(skip
=SKIP_RE
), 8),
4086 if v
in ('true', 'false', 'null'):
4088 elif v
.startswith('/*') or v
.startswith('//') or v
== ',':
4091 if v
[0] in ("'", '"'):
4092 v
= re
.sub(r
'(?s)\\.|"', lambda m
: {
4097 }.get(m
.group(0), m
.group(0)), v
[1:-1])
4099 for regex
, base
in INTEGER_TABLE
:
4100 im
= re
.match(regex
, v
)
4102 i
= int(im
.group(1), base
)
4103 return '"%d":' % i
if v
.endswith(':') else '%d' % i
4107 return re
.sub(r
'''(?sx)
4108 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
4109 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4110 {comment}|,(?={skip}[\]}}])|
4111 (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
4112 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
4114 '''.format(comment
=COMMENT_RE
, skip
=SKIP_RE
), fix_kv
, code
)
4117 def qualities(quality_ids
):
4118 """ Get a numeric quality value out of a list of possible values """
4121 return quality_ids
.index(qid
)
4127 DEFAULT_OUTTMPL
= '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Keep the total length at *length*, ellipses included
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Turn a '1.2-3' style version string into a comparable tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(int(part) for part in parts)
4144 def is_outdated_version(version
, limit
, assume_new
=True):
4146 return not assume_new
4148 return version_tuple(version
) < version_tuple(limit
)
4150 return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dlc can be updated with -U """
    from zipimport import zipimporter
    # Updatable when running from a zip bundle or from a frozen build.
    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    frozen_build = hasattr(sys, 'frozen')
    return running_from_zip or frozen_build
def args_to_str(args):
    """Get a short shell-quoted string representation for a subprocess command."""
    return ' '.join(compat_shlex_quote(part) for part in args)
def error_to_compat_str(err):
    """Render an exception's message as a text string on both Pythons."""
    message = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        message = message.decode(preferredencoding())
    return message
4174 def mimetype2ext(mt
):
4180 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
4181 # it's the most popular one
4182 'audio/mpeg': 'mp3',
4183 'audio/x-wav': 'wav',
4188 _
, _
, res
= mt
.rpartition('/')
4189 res
= res
.split(';')[0].strip().lower()
4193 'smptett+xml': 'tt',
4197 'x-mp4-fragmented': 'mp4',
4198 'x-ms-sami': 'sami',
4201 'x-mpegurl': 'm3u8',
4202 'vnd.apple.mpegurl': 'm3u8',
4206 'vnd.ms-sstr+xml': 'ism',
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs string into vcodec/acodec fields."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = [part.strip()
                    for part in codecs_str.strip().strip(',').split(',')
                    if part.strip()]
    vcodec = acodec = None
    for full_codec in split_codecs:
        # Only the first dotted component identifies the codec family.
        family = full_codec.split('.')[0]
        if family in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'):
            if not vcodec:
                vcodec = full_codec
        elif family in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # Nothing recognised: with exactly two entries assume video,audio order.
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from an HTTP response's headers."""
    getheader = url_handle.headers.get

    # Prefer the filename advertised via Content-Disposition, when present.
    content_disposition = getheader('Content-Disposition')
    if content_disposition:
        match = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"',
            content_disposition)
        if match:
            ext = determine_ext(match.group('filename'), default_ext=None)
            if ext:
                return ext

    # Otherwise fall back to mapping the MIME type.
    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 'data:' URI carrying *data* base64-encoded."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:{0};base64,{1}'.format(mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No viewer limit configured, or content available to everyone.
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    encoding = 'utf-8'
    payload = first_bytes
    # Honour a byte-order mark if one is present. The UTF-32 signatures are
    # checked before the UTF-16 ones because the latter are their prefixes.
    for bom, bom_encoding in (
            (b'\xef\xbb\xbf', 'utf-8'),
            (b'\x00\x00\xfe\xff', 'utf-32-be'),
            (b'\xff\xfe\x00\x00', 'utf-32-le'),
            (b'\xff\xfe', 'utf-16-le'),
            (b'\xfe\xff', 'utf-16-be')):
        if first_bytes.startswith(bom):
            encoding = bom_encoding
            payload = first_bytes[len(bom):]
            break
    text = payload.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
def determine_protocol(info_dict):
    """Work out the download protocol for an info dict."""
    # An explicit protocol always wins.
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    # Streaming schemes are recognisable from the URL prefix ...
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    # ... manifest-driven protocols from the file extension ...
    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    # ... and everything else falls back to the plain URL scheme.
    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Each column is as wide as its longest cell; every column except the
    # last is left-justified with one trailing space.
    widths = [max(len(compat_str(cell)) for cell in column)
              for column in zip(*table)]
    row_format = ' '.join(
        '%-' + compat_str(width + 1) + 's' for width in widths[:-1]) + '%s'
    return '\n'.join(row_format % tuple(row) for row in table)
4322 def _match_one(filter_part
, dct
):
4323 COMPARISON_OPERATORS
= {
4331 operator_rex
= re
.compile(r
'''(?x)\s*
4333 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
4335 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
4336 (?P<quote>["\'])(?P
<quotedstrval
>(?
:\\.|
(?
!(?P
=quote
)|
\\).)+?
)(?P
=quote
)|
4337 (?P
<strval
>(?
![0-9.])[a
-z0
-9A
-Z
]*)
4340 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
4341 m = operator_rex.search(filter_part)
4343 op = COMPARISON_OPERATORS[m.group('op')]
4344 actual_value = dct.get(m.group('key'))
4345 if (m.group('quotedstrval') is not None
4346 or m.group('strval') is not None
4347 # If the original field is a string and matching comparisonvalue is
4348 # a number we should respect the origin of the original field
4349 # and process comparison value as a string (see
4350 # https://github.com/ytdl-org/youtube-dl/issues/11082).
4351 or actual_value is not None and m.group('intval') is not None
4352 and isinstance(actual_value, compat_str)):
4353 if m.group('op') not in ('=', '!='):
4355 'Operator %s does not support string values!' % m.group('op'))
4356 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
4357 quote = m.group('quote')
4358 if quote is not None:
4359 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
4362 comparison_value = int(m.group('intval'))
4364 comparison_value = parse_filesize(m.group('intval'))
4365 if comparison_value is None:
4366 comparison_value = parse_filesize(m.group('intval') + 'B')
4367 if comparison_value is None:
4369 'Invalid integer value %r in filter part %r' % (
4370 m.group('intval'), filter_part))
4371 if actual_value is None:
4372 return m.group('none_inclusive')
4373 return op(actual_value, comparison_value)
4376 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
4377 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
4379 operator_rex = re.compile(r'''(?x
)\s
*
4380 (?P
<op
>%s)\s
*(?P
<key
>[a
-z_
]+)
4382 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
4383 m = operator_rex.search(filter_part)
4385 op = UNARY_OPERATORS[m.group('op')]
4386 actual_value = dct.get(m.group('key'))
4387 return op(actual_value)
4389 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&'-separated clauses must all hold for the dict to pass.
    for clause in filter_str.split('&'):
        if not _match_one(clause, dct):
            return False
    return True
def match_filter_func(filter_str):
    """Build a --match-filter callback: returns None to accept a video,
    or a human-readable skip message otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float), or None."""
    if not time_expr:
        return

    # Plain offset, e.g. '12.5' or '12.5s'
    match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if match:
        return float(match.group('time_offset'))

    # Clock time, e.g. '01:02:03.5' (some feeds use ':' for the fraction)
    match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if match:
        hours, minutes, seconds = match.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a non-negative duration in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
4426 def dfxp2srt(dfxp_data):
4428 @param dfxp_data A
bytes-like
object containing DFXP data
4429 @returns A
unicode object containing converted SRT data
4431 LEGACY_NAMESPACES = (
4432 (b'http://www.w3.org/ns/ttml', [
4433 b'http://www.w3.org/2004/11/ttaf1',
4434 b'http://www.w3.org/2006/04/ttaf1',
4435 b'http://www.w3.org/2006/10/ttaf1',
4437 (b'http://www.w3.org/ns/ttml#styling', [
4438 b'http://www.w3.org/ns/ttml#style',
4442 SUPPORTED_STYLING = [
4451 _x = functools.partial(xpath_with_ns, ns_map={
4452 'xml': 'http://www.w3.org/XML/1998/namespace',
4453 'ttml': 'http://www.w3.org/ns/ttml',
4454 'tts': 'http://www.w3.org/ns/ttml#styling',
4460 class TTMLPElementParser(object):
4462 _unclosed_elements = []
4463 _applied_styles = []
4465 def start(self, tag, attrib):
4466 if tag in (_x('ttml:br'), 'br'):
4469 unclosed_elements = []
4471 element_style_id = attrib.get('style')
4473 style.update(default_style)
4474 if element_style_id:
4475 style.update(styles.get(element_style_id, {}))
4476 for prop in SUPPORTED_STYLING:
4477 prop_val = attrib.get(_x('tts:' + prop))
4479 style[prop] = prop_val
4482 for k, v in sorted(style.items()):
4483 if self._applied_styles and self._applied_styles[-1].get(k) == v:
4486 font += ' color="%s"' % v
4487 elif k == 'fontSize':
4488 font += ' size="%s"' % v
4489 elif k == 'fontFamily':
4490 font += ' face="%s"' % v
4491 elif k == 'fontWeight' and v == 'bold':
4493 unclosed_elements.append('b')
4494 elif k == 'fontStyle' and v == 'italic':
4496 unclosed_elements.append('i')
4497 elif k == 'textDecoration' and v == 'underline':
4499 unclosed_elements.append('u')
4501 self._out += '<font' + font + '>'
4502 unclosed_elements.append('font')
4504 if self._applied_styles:
4505 applied_style.update(self._applied_styles[-1])
4506 applied_style.update(style)
4507 self._applied_styles.append(applied_style)
4508 self._unclosed_elements.append(unclosed_elements)
4511 if tag not in (_x('ttml:br'), 'br'):
4512 unclosed_elements = self._unclosed_elements.pop()
4513 for element in reversed(unclosed_elements):
4514 self._out += '</%s>' % element
4515 if unclosed_elements and self._applied_styles:
4516 self._applied_styles.pop()
4518 def data(self, data):
4522 return self._out.strip()
4524 def parse_node(node):
4525 target = TTMLPElementParser()
4526 parser = xml.etree.ElementTree.XMLParser(target=target)
4527 parser.feed(xml.etree.ElementTree.tostring(node))
4528 return parser.close()
4530 for k, v in LEGACY_NAMESPACES:
4532 dfxp_data = dfxp_data.replace(ns, k)
4534 dfxp = compat_etree_fromstring(dfxp_data)
4536 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
4539 raise ValueError('Invalid dfxp/TTML subtitle')
4543 for style in dfxp.findall(_x('.//ttml:style')):
4544 style_id = style.get('id') or style.get(_x('xml:id'))
4547 parent_style_id = style.get('style')
4549 if parent_style_id not in styles:
4552 styles[style_id] = styles[parent_style_id].copy()
4553 for prop in SUPPORTED_STYLING:
4554 prop_val = style.get(_x('tts:' + prop))
4556 styles.setdefault(style_id, {})[prop] = prop_val
4562 for p in ('body', 'div'):
4563 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4566 style = styles.get(ele.get('style'))
4569 default_style.update(style)
4571 for para, index in zip(paras, itertools.count(1)):
4572 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
4573 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
4574 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4575 if begin_time is None:
4580 end_time = begin_time + dur
4581 out.append('%d\n%s --> %s\n%s\n\n' % (
4583 srt_subtitles_timecode(begin_time),
4584 srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [command_option, value] for the setting *param*, or [] if unset."""
    value = params.get(param)
    if value:
        value = compat_str(value)
    return [command_option, value] if value is not None else []
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Map a boolean setting to CLI arguments, optionally joined by *separator*."""
    param = params.get(param)
    if param is None:
        return []
    assert isinstance(param, bool)
    rendered = true_value if param else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit *command_option* (with no value) when the setting matches *expected_value*."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Fetch a list of extra CLI arguments from *params*.

    Falls back to *default* when the key is absent; a present value must
    already be a list.
    """
    extra = params.get(param)
    if extra is None:
        return default
    assert isinstance(extra, list)
    return extra
4620 class ISO639Utils(object):
4621 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4680 'iw': 'heb', # Replaced by he in 1989 revision
4690 'in': 'ind', # Replaced by id in 1989 revision
4805 'ji': 'yid', # Replaced by yi in 1989 revision
4813 def short2long(cls, code):
4814 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4815 return cls._lang_map.get(code[:2])
4818 def long2short(cls, code):
4819 """Convert language code from ISO 639-2/T to ISO 639-1"""
4820 for short_name, long_name in cls._lang_map.items():
4821 if long_name == code:
4825 class ISO3166Utils(object):
4826 # From http://data.okfn.org/data/core/country-list
4828 'AF': 'Afghanistan',
4829 'AX': 'Åland Islands',
4832 'AS': 'American Samoa',
4837 'AG': 'Antigua and Barbuda',
4854 'BO': 'Bolivia, Plurinational State of',
4855 'BQ': 'Bonaire, Sint Eustatius and Saba',
4856 'BA': 'Bosnia and Herzegovina',
4858 'BV': 'Bouvet Island',
4860 'IO': 'British Indian Ocean Territory',
4861 'BN': 'Brunei Darussalam',
4863 'BF': 'Burkina Faso',
4869 'KY': 'Cayman Islands',
4870 'CF': 'Central African Republic',
4874 'CX': 'Christmas Island',
4875 'CC': 'Cocos (Keeling) Islands',
4879 'CD': 'Congo, the Democratic Republic of the',
4880 'CK': 'Cook Islands',
4882 'CI': 'Côte d\'Ivoire',
4887 'CZ': 'Czech Republic',
4891 'DO': 'Dominican Republic',
4894 'SV': 'El Salvador',
4895 'GQ': 'Equatorial Guinea',
4899 'FK': 'Falkland Islands (Malvinas)',
4900 'FO': 'Faroe Islands',
4904 'GF': 'French Guiana',
4905 'PF': 'French Polynesia',
4906 'TF': 'French Southern Territories',
4921 'GW': 'Guinea-Bissau',
4924 'HM': 'Heard Island and McDonald Islands',
4925 'VA': 'Holy See (Vatican City State)',
4932 'IR': 'Iran, Islamic Republic of',
4935 'IM': 'Isle of Man',
4945 'KP': 'Korea, Democratic People\'s Republic of',
4946 'KR': 'Korea, Republic of',
4949 'LA': 'Lao People\'s Democratic Republic',
4955 'LI': 'Liechtenstein',
4959 'MK': 'Macedonia, the Former Yugoslav Republic of',
4966 'MH': 'Marshall Islands',
4972 'FM': 'Micronesia, Federated States of',
4973 'MD': 'Moldova, Republic of',
4984 'NL': 'Netherlands',
4985 'NC': 'New Caledonia',
4986 'NZ': 'New Zealand',
4991 'NF': 'Norfolk Island',
4992 'MP': 'Northern Mariana Islands',
4997 'PS': 'Palestine, State of',
4999 'PG': 'Papua New Guinea',
5002 'PH': 'Philippines',
5006 'PR': 'Puerto Rico',
5010 'RU': 'Russian Federation',
5012 'BL': 'Saint Barthélemy',
5013 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
5014 'KN': 'Saint Kitts and Nevis',
5015 'LC': 'Saint Lucia',
5016 'MF': 'Saint Martin (French part)',
5017 'PM': 'Saint Pierre and Miquelon',
5018 'VC': 'Saint Vincent and the Grenadines',
5021 'ST': 'Sao Tome and Principe',
5022 'SA': 'Saudi Arabia',
5026 'SL': 'Sierra Leone',
5028 'SX': 'Sint Maarten (Dutch part)',
5031 'SB': 'Solomon Islands',
5033 'ZA': 'South Africa',
5034 'GS': 'South Georgia and the South Sandwich Islands',
5035 'SS': 'South Sudan',
5040 'SJ': 'Svalbard and Jan Mayen',
5043 'CH': 'Switzerland',
5044 'SY': 'Syrian Arab Republic',
5045 'TW': 'Taiwan, Province of China',
5047 'TZ': 'Tanzania, United Republic of',
5049 'TL': 'Timor-Leste',
5053 'TT': 'Trinidad and Tobago',
5056 'TM': 'Turkmenistan',
5057 'TC': 'Turks and Caicos Islands',
5061 'AE': 'United Arab Emirates',
5062 'GB': 'United Kingdom',
5063 'US': 'United States',
5064 'UM': 'United States Minor Outlying Islands',
5068 'VE': 'Venezuela, Bolivarian Republic of',
5070 'VG': 'Virgin Islands, British',
5071 'VI': 'Virgin Islands, U.S.',
5072 'WF': 'Wallis and Futuna',
5073 'EH': 'Western Sahara',
5080 def short2full(cls, code):
5081 """Convert an ISO 3166-2 country code to the corresponding full name"""
5082 return cls._country_map.get(code.upper())
5085 class GeoUtils(object):
5086 # Major IPv4 address blocks per country
5088 'AD': '46.172.224.0/19',
5089 'AE': '94.200.0.0/13',
5090 'AF': '149.54.0.0/17',
5091 'AG': '209.59.64.0/18',
5092 'AI': '204.14.248.0/21',
5093 'AL': '46.99.0.0/16',
5094 'AM': '46.70.0.0/15',
5095 'AO': '105.168.0.0/13',
5096 'AP': '182.50.184.0/21',
5097 'AQ': '23.154.160.0/24',
5098 'AR': '181.0.0.0/12',
5099 'AS': '202.70.112.0/20',
5100 'AT': '77.116.0.0/14',
5101 'AU': '1.128.0.0/11',
5102 'AW': '181.41.0.0/18',
5103 'AX': '185.217.4.0/22',
5104 'AZ': '5.197.0.0/16',
5105 'BA': '31.176.128.0/17',
5106 'BB': '65.48.128.0/17',
5107 'BD': '114.130.0.0/16',
5109 'BF': '102.178.0.0/15',
5110 'BG': '95.42.0.0/15',
5111 'BH': '37.131.0.0/17',
5112 'BI': '154.117.192.0/18',
5113 'BJ': '137.255.0.0/16',
5114 'BL': '185.212.72.0/23',
5115 'BM': '196.12.64.0/18',
5116 'BN': '156.31.0.0/16',
5117 'BO': '161.56.0.0/16',
5118 'BQ': '161.0.80.0/20',
5119 'BR': '191.128.0.0/12',
5120 'BS': '24.51.64.0/18',
5121 'BT': '119.2.96.0/19',
5122 'BW': '168.167.0.0/16',
5123 'BY': '178.120.0.0/13',
5124 'BZ': '179.42.192.0/18',
5125 'CA': '99.224.0.0/11',
5126 'CD': '41.243.0.0/16',
5127 'CF': '197.242.176.0/21',
5128 'CG': '160.113.0.0/16',
5129 'CH': '85.0.0.0/13',
5130 'CI': '102.136.0.0/14',
5131 'CK': '202.65.32.0/19',
5132 'CL': '152.172.0.0/14',
5133 'CM': '102.244.0.0/14',
5134 'CN': '36.128.0.0/10',
5135 'CO': '181.240.0.0/12',
5136 'CR': '201.192.0.0/12',
5137 'CU': '152.206.0.0/15',
5138 'CV': '165.90.96.0/19',
5139 'CW': '190.88.128.0/17',
5140 'CY': '31.153.0.0/16',
5141 'CZ': '88.100.0.0/14',
5143 'DJ': '197.241.0.0/17',
5144 'DK': '87.48.0.0/12',
5145 'DM': '192.243.48.0/20',
5146 'DO': '152.166.0.0/15',
5147 'DZ': '41.96.0.0/12',
5148 'EC': '186.68.0.0/15',
5149 'EE': '90.190.0.0/15',
5150 'EG': '156.160.0.0/11',
5151 'ER': '196.200.96.0/20',
5152 'ES': '88.0.0.0/11',
5153 'ET': '196.188.0.0/14',
5154 'EU': '2.16.0.0/13',
5155 'FI': '91.152.0.0/13',
5156 'FJ': '144.120.0.0/16',
5157 'FK': '80.73.208.0/21',
5158 'FM': '119.252.112.0/20',
5159 'FO': '88.85.32.0/19',
5161 'GA': '41.158.0.0/15',
5163 'GD': '74.122.88.0/21',
5164 'GE': '31.146.0.0/16',
5165 'GF': '161.22.64.0/18',
5166 'GG': '62.68.160.0/19',
5167 'GH': '154.160.0.0/12',
5168 'GI': '95.164.0.0/16',
5169 'GL': '88.83.0.0/19',
5170 'GM': '160.182.0.0/15',
5171 'GN': '197.149.192.0/18',
5172 'GP': '104.250.0.0/19',
5173 'GQ': '105.235.224.0/20',
5174 'GR': '94.64.0.0/13',
5175 'GT': '168.234.0.0/16',
5176 'GU': '168.123.0.0/16',
5177 'GW': '197.214.80.0/20',
5178 'GY': '181.41.64.0/18',
5179 'HK': '113.252.0.0/14',
5180 'HN': '181.210.0.0/16',
5181 'HR': '93.136.0.0/13',
5182 'HT': '148.102.128.0/17',
5183 'HU': '84.0.0.0/14',
5184 'ID': '39.192.0.0/10',
5185 'IE': '87.32.0.0/12',
5186 'IL': '79.176.0.0/13',
5187 'IM': '5.62.80.0/20',
5188 'IN': '117.192.0.0/10',
5189 'IO': '203.83.48.0/21',
5190 'IQ': '37.236.0.0/14',
5191 'IR': '2.176.0.0/12',
5192 'IS': '82.221.0.0/16',
5193 'IT': '79.0.0.0/10',
5194 'JE': '87.244.64.0/18',
5195 'JM': '72.27.0.0/17',
5196 'JO': '176.29.0.0/16',
5197 'JP': '133.0.0.0/8',
5198 'KE': '105.48.0.0/12',
5199 'KG': '158.181.128.0/17',
5200 'KH': '36.37.128.0/17',
5201 'KI': '103.25.140.0/22',
5202 'KM': '197.255.224.0/20',
5203 'KN': '198.167.192.0/19',
5204 'KP': '175.45.176.0/22',
5205 'KR': '175.192.0.0/10',
5206 'KW': '37.36.0.0/14',
5207 'KY': '64.96.0.0/15',
5208 'KZ': '2.72.0.0/13',
5209 'LA': '115.84.64.0/18',
5210 'LB': '178.135.0.0/16',
5211 'LC': '24.92.144.0/20',
5212 'LI': '82.117.0.0/19',
5213 'LK': '112.134.0.0/15',
5214 'LR': '102.183.0.0/16',
5215 'LS': '129.232.0.0/17',
5216 'LT': '78.56.0.0/13',
5217 'LU': '188.42.0.0/16',
5218 'LV': '46.109.0.0/16',
5219 'LY': '41.252.0.0/14',
5220 'MA': '105.128.0.0/11',
5221 'MC': '88.209.64.0/18',
5222 'MD': '37.246.0.0/16',
5223 'ME': '178.175.0.0/17',
5224 'MF': '74.112.232.0/21',
5225 'MG': '154.126.0.0/17',
5226 'MH': '117.103.88.0/21',
5227 'MK': '77.28.0.0/15',
5228 'ML': '154.118.128.0/18',
5229 'MM': '37.111.0.0/17',
5230 'MN': '49.0.128.0/17',
5231 'MO': '60.246.0.0/16',
5232 'MP': '202.88.64.0/20',
5233 'MQ': '109.203.224.0/19',
5234 'MR': '41.188.64.0/18',
5235 'MS': '208.90.112.0/22',
5236 'MT': '46.11.0.0/16',
5237 'MU': '105.16.0.0/12',
5238 'MV': '27.114.128.0/18',
5239 'MW': '102.70.0.0/15',
5240 'MX': '187.192.0.0/11',
5241 'MY': '175.136.0.0/13',
5242 'MZ': '197.218.0.0/15',
5243 'NA': '41.182.0.0/16',
5244 'NC': '101.101.0.0/18',
5245 'NE': '197.214.0.0/18',
5246 'NF': '203.17.240.0/22',
5247 'NG': '105.112.0.0/12',
5248 'NI': '186.76.0.0/15',
5249 'NL': '145.96.0.0/11',
5250 'NO': '84.208.0.0/13',
5251 'NP': '36.252.0.0/15',
5252 'NR': '203.98.224.0/19',
5253 'NU': '49.156.48.0/22',
5254 'NZ': '49.224.0.0/14',
5255 'OM': '5.36.0.0/15',
5256 'PA': '186.72.0.0/15',
5257 'PE': '186.160.0.0/14',
5258 'PF': '123.50.64.0/18',
5259 'PG': '124.240.192.0/19',
5260 'PH': '49.144.0.0/13',
5261 'PK': '39.32.0.0/11',
5262 'PL': '83.0.0.0/11',
5263 'PM': '70.36.0.0/20',
5264 'PR': '66.50.0.0/16',
5265 'PS': '188.161.0.0/16',
5266 'PT': '85.240.0.0/13',
5267 'PW': '202.124.224.0/20',
5268 'PY': '181.120.0.0/14',
5269 'QA': '37.210.0.0/15',
5270 'RE': '102.35.0.0/16',
5271 'RO': '79.112.0.0/13',
5272 'RS': '93.86.0.0/15',
5273 'RU': '5.136.0.0/13',
5274 'RW': '41.186.0.0/16',
5275 'SA': '188.48.0.0/13',
5276 'SB': '202.1.160.0/19',
5277 'SC': '154.192.0.0/11',
5278 'SD': '102.120.0.0/13',
5279 'SE': '78.64.0.0/12',
5280 'SG': '8.128.0.0/10',
5281 'SI': '188.196.0.0/14',
5282 'SK': '78.98.0.0/15',
5283 'SL': '102.143.0.0/17',
5284 'SM': '89.186.32.0/19',
5285 'SN': '41.82.0.0/15',
5286 'SO': '154.115.192.0/18',
5287 'SR': '186.179.128.0/17',
5288 'SS': '105.235.208.0/21',
5289 'ST': '197.159.160.0/19',
5290 'SV': '168.243.0.0/16',
5291 'SX': '190.102.0.0/20',
5293 'SZ': '41.84.224.0/19',
5294 'TC': '65.255.48.0/20',
5295 'TD': '154.68.128.0/19',
5296 'TG': '196.168.0.0/14',
5297 'TH': '171.96.0.0/13',
5298 'TJ': '85.9.128.0/18',
5299 'TK': '27.96.24.0/21',
5300 'TL': '180.189.160.0/20',
5301 'TM': '95.85.96.0/19',
5302 'TN': '197.0.0.0/11',
5303 'TO': '175.176.144.0/21',
5304 'TR': '78.160.0.0/11',
5305 'TT': '186.44.0.0/15',
5306 'TV': '202.2.96.0/19',
5307 'TW': '120.96.0.0/11',
5308 'TZ': '156.156.0.0/14',
5309 'UA': '37.52.0.0/14',
5310 'UG': '102.80.0.0/13',
5312 'UY': '167.56.0.0/13',
5313 'UZ': '84.54.64.0/18',
5314 'VA': '212.77.0.0/19',
5315 'VC': '207.191.240.0/21',
5316 'VE': '186.88.0.0/13',
5317 'VG': '66.81.192.0/20',
5318 'VI': '146.226.0.0/16',
5319 'VN': '14.160.0.0/11',
5320 'VU': '202.80.32.0/20',
5321 'WF': '117.20.32.0/21',
5322 'WS': '202.4.32.0/19',
5323 'YE': '134.35.0.0/16',
5324 'YT': '41.242.116.0/22',
5325 'ZA': '41.0.0.0/11',
5326 'ZM': '102.144.0.0/13',
5327 'ZW': '102.177.192.0/18',
5331 def random_ipv4(cls, code_or_block):
5332 if len(code_or_block) == 2:
5333 block = cls._country_ip_map.get(code_or_block.upper())
5337 block = code_or_block
5338 addr, preflen = block.split('/')
5339 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
5340 addr_max = addr_min | (0xffffffff >> int(preflen))
5341 return compat_str(socket.inet_ntoa(
5342 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Install default http/https openers routing through proxy_open with
        # the sentinel '__noproxy__' (i.e. a direct connection). Keyword
        # defaults bind the loop variables early on purpose.
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dlc's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5370 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
5371 # released into Public Domain
5372 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of
    the byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    chunks = []
    n = int(n)
    # Emit 32-bit big-endian words, most significant first.
    while n > 0:
        chunks.insert(0, compat_struct_pack('>I', n & 0xffffffff))
        n >>= 32
    s = b''.join(chunks)
    # Drop leading zero bytes; n == 0 encodes as a single zero byte.
    stripped = s.lstrip(b'\000')
    s = stripped if stripped else b'\000'
    # Left-pad with zeros up to a multiple of blocksize, if requested.
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # Left-pad to a multiple of 4 bytes, then fold in 32-bit big-endian words.
    padding = (-len(s)) % 4
    s = b'\000' * padding + s
    acc = 0
    for offset in range(0, len(s), 4):
        acc = (acc << 32) + compat_struct_unpack('>I', s[offset:offset + 4])[0]
    return acc
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input: data: data to encrypt, bytes-like object
           exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # OHDave's implementation interprets the data as a little-endian integer,
    # hence the byte reversal before the hex conversion.
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # RFC 8017 (PKCS#1 v1.5 encryption) requires the padding string PS to
    # consist of *nonzero* octets: the decrypting side locates the end of
    # the padding by scanning for the first zero byte, so a stray zero here
    # (as produced by the previous randint(0, 254)) would silently truncate
    # the message.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
def encode_base_n(num, n, table=None):
    """Encode a non-negative integer in base *n* using *table* as digits."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Collect least-significant digits first, then reverse.
    digits = []
    while num:
        digits.append(table[num % n])
        num //= n
    return ''.join(reversed(digits))
def decode_packed_codes(code):
    """Undo the common JavaScript 'packer' obfuscation used by embed pages."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    # Map every base-n token back to its original symbol (or itself when
    # the symbol slot is empty).
    symbol_table = {}
    while count:
        count -= 1
        token = encode_base_n(count, base)
        symbol_table[token] = symbols[count] or token

    return re.sub(
        r'\b(\w+)\b', lambda m: symbol_table[m.group(0)],
        obfuscated_code)
def caesar(s, alphabet, shift):
    """Caesar-shift the characters of *s* that appear in *alphabet*."""
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(ch):
        # Characters outside the alphabet pass through untouched.
        if ch not in alphabet:
            return ch
        return alphabet[(alphabet.index(ch) + shift) % size]

    return ''.join(rotate(ch) for ch in s)
def rot47(s):
    """Apply the ROT47 cipher: a Caesar shift of 47 over the 94 printable ASCII characters."""
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list into a dict, stripping surrounding quotes."""
    attributes = {}
    for key, raw_val in re.findall(
            r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        # Quoted values may contain commas; drop the quotes themselves.
        attributes[key] = raw_val[1:-1] if raw_val.startswith('"') else raw_val
    return attributes
def urshift(val, n):
    """Unsigned (logical) right shift of a 32-bit value, like JavaScript's '>>>'."""
    if val >= 0:
        return val >> n
    # Reinterpret a negative value as its unsigned 32-bit equivalent first.
    return (val + 0x100000000) >> n
5515 # Based on png2str() written by @gdkchan and improved by @yokrysty
5516 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
5517 def decode_png(png_data
):
5518 # Reference: https://www.w3.org/TR/PNG/
5519 header
= png_data
[8:]
5521 if png_data
[:8] != b
'\x89PNG\x0d\x0a\x1a\x0a' or header
[4:8] != b
'IHDR':
5522 raise IOError('Not a valid PNG file.')
5524 int_map
= {1: '>B', 2: '>H', 4: '>I'}
5525 unpack_integer
= lambda x
: compat_struct_unpack(int_map
[len(x
)], x
)[0]
5530 length
= unpack_integer(header
[:4])
5533 chunk_type
= header
[:4]
5536 chunk_data
= header
[:length
]
5537 header
= header
[length
:]
5539 header
= header
[4:] # Skip CRC
5547 ihdr
= chunks
[0]['data']
5549 width
= unpack_integer(ihdr
[:4])
5550 height
= unpack_integer(ihdr
[4:8])
5554 for chunk
in chunks
:
5555 if chunk
['type'] == b
'IDAT':
5556 idat
+= chunk
['data']
5559 raise IOError('Unable to read PNG data.')
5561 decompressed_data
= bytearray(zlib
.decompress(idat
))
5566 def _get_pixel(idx
):
5571 for y
in range(height
):
5572 basePos
= y
* (1 + stride
)
5573 filter_type
= decompressed_data
[basePos
]
5577 pixels
.append(current_row
)
5579 for x
in range(stride
):
5580 color
= decompressed_data
[1 + basePos
+ x
]
5581 basex
= y
* stride
+ x
5586 left
= _get_pixel(basex
- 3)
5588 up
= _get_pixel(basex
- stride
)
5590 if filter_type
== 1: # Sub
5591 color
= (color
+ left
) & 0xff
5592 elif filter_type
== 2: # Up
5593 color
= (color
+ up
) & 0xff
5594 elif filter_type
== 3: # Average
5595 color
= (color
+ ((left
+ up
) >> 1)) & 0xff
5596 elif filter_type
== 4: # Paeth
5602 c
= _get_pixel(basex
- stride
- 3)
5610 if pa
<= pb
and pa
<= pc
:
5611 color
= (color
+ a
) & 0xff
5613 color
= (color
+ b
) & 0xff
5615 color
= (color
+ c
) & 0xff
5617 current_row
.append(color
)
5619 return width
, height
, pixels
5622 def write_xattr(path
, key
, value
):
5623 # This mess below finds the best xattr tool for the job
5625 # try the pyxattr module...
5628 if hasattr(xattr
, 'set'): # pyxattr
5629 # Unicode arguments are not supported in python-pyxattr until
5631 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5632 pyxattr_required_version
= '0.5.0'
5633 if version_tuple(xattr
.__version
__) < version_tuple(pyxattr_required_version
):
5634 # TODO: fallback to CLI tools
5635 raise XAttrUnavailableError(
5636 'python-pyxattr is detected but is too old. '
5637 'youtube-dlc requires %s or above while your version is %s. '
5638 'Falling back to other xattr implementations' % (
5639 pyxattr_required_version
, xattr
.__version
__))
5641 setxattr
= xattr
.set
5643 setxattr
= xattr
.setxattr
5646 setxattr(path
, key
, value
)
5647 except EnvironmentError as e
:
5648 raise XAttrMetadataError(e
.errno
, e
.strerror
)
5651 if compat_os_name
== 'nt':
5652 # Write xattrs to NTFS Alternate Data Streams:
5653 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5654 assert ':' not in key
5655 assert os
.path
.exists(path
)
5657 ads_fn
= path
+ ':' + key
5659 with open(ads_fn
, 'wb') as f
:
5661 except EnvironmentError as e
:
5662 raise XAttrMetadataError(e
.errno
, e
.strerror
)
5664 user_has_setfattr
= check_executable('setfattr', ['--version'])
5665 user_has_xattr
= check_executable('xattr', ['-h'])
5667 if user_has_setfattr
or user_has_xattr
:
5669 value
= value
.decode('utf-8')
5670 if user_has_setfattr
:
5671 executable
= 'setfattr'
5672 opts
= ['-n', key
, '-v', value
]
5673 elif user_has_xattr
:
5674 executable
= 'xattr'
5675 opts
= ['-w', key
, value
]
5677 cmd
= ([encodeFilename(executable
, True)]
5678 + [encodeArgument(o
) for o
in opts
]
5679 + [encodeFilename(path
, True)])
5682 p
= subprocess
.Popen(
5683 cmd
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
, stdin
=subprocess
.PIPE
)
5684 except EnvironmentError as e
:
5685 raise XAttrMetadataError(e
.errno
, e
.strerror
)
5686 stdout
, stderr
= p
.communicate()
5687 stderr
= stderr
.decode('utf-8', 'replace')
5688 if p
.returncode
!= 0:
5689 raise XAttrMetadataError(p
.returncode
, stderr
)
5692 # On Unix, and can't find pyxattr, setfattr, or xattr.
5693 if sys
.platform
.startswith('linux'):
5694 raise XAttrUnavailableError(
5695 "Couldn't find a tool to set the xattrs. "
5696 "Install either the python 'pyxattr' or 'xattr' "
5697 "modules, or the GNU 'attr' package "
5698 "(which contains the 'setfattr' tool).")
5700 raise XAttrUnavailableError(
5701 "Couldn't find a tool to set the xattrs. "
5702 "Install either the python 'xattr' module, "
5703 "or the 'xattr' binary.")
5706 def random_birthday(year_field
, month_field
, day_field
):
5707 start_date
= datetime
.date(1950, 1, 1)
5708 end_date
= datetime
.date(1995, 12, 31)
5709 offset
= random
.randint(0, (end_date
- start_date
).days
)
5710 random_date
= start_date
+ datetime
.timedelta(offset
)
5712 year_field
: str(random_date
.year
),
5713 month_field
: str(random_date
.month
),
5714 day_field
: str(random_date
.day
),