from __future__ import unicode_literals

import email.utils
import errno
import functools
import gzip
import io
import itertools
import json
import locale
import os
import random
import re
import socket
import ssl
import sys
import tempfile
import traceback
import xml.etree.ElementTree
import zlib

from .compat import (
    compat_HTMLParseError,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_html_entities_html5,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_urlunparse,
    compat_urllib_parse_quote,
    compat_urllib_parse_quote_plus,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
)
def register_socks_protocols():
    """Teach urlparse that SOCKS schemes carry a netloc.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs with protocols not listed in
    urlparse.uses_netloc are not handled correctly.
    """
    netloc_schemes = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in netloc_schemes:
            netloc_schemes.append(scheme)
# Type object of a compiled regular expression pattern, for isinstance()
# checks. This is not clearly defined otherwise.
compiled_regex_type = type(re.compile(''))
def random_user_agent():
    """Return a desktop-Chrome User-Agent string with a randomly chosen
    Chrome version taken from _CHROME_VERSIONS (defined in this module)."""
    template = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    version = random.choice(_CHROME_VERSIONS)
    return template % version
1675 'User-Agent': random_user_agent(),
1676 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
1677 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
1678 'Accept-Encoding': 'gzip, deflate',
1679 'Accept-Language': 'en-us,en;q=0.5',
1684 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# Sentinel for "no default supplied" in optional parameters where None is a
# legitimate caller value; always compared with `is`.
NO_DEFAULT = object()
# Full English month names, index 0 == January; used for date parsing.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
1695 'en': ENGLISH_MONTH_NAMES
,
1697 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
1698 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
# File extensions recognized as media/manifest formats.
# NOTE(review): this copy appears to have lost some entries (the list jumps
# from 'mk3d' to 'asf') — verify against upstream before relying on it.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'asf', 'wmv', 'wma',
    'f4f', 'f4m', 'm3u8', 'smil')
# needed for sanitizing filenames in restricted mode
# Maps each accented/special Latin character to an ASCII replacement
# (single characters zipped against single chars or multi-char lists).
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
1742 '%Y/%m/%d %H:%M:%S',
1744 '%Y-%m-%d %H:%M:%S',
1745 '%Y-%m-%d %H:%M:%S.%f',
1748 '%Y-%m-%dT%H:%M:%SZ',
1749 '%Y-%m-%dT%H:%M:%S.%fZ',
1750 '%Y-%m-%dT%H:%M:%S.%f0Z',
1751 '%Y-%m-%dT%H:%M:%S',
1752 '%Y-%m-%dT%H:%M:%S.%f',
1754 '%b %d %Y at %H:%M',
1755 '%b %d %Y at %H:%M:%S',
1756 '%B %d %Y at %H:%M',
1757 '%B %d %Y at %H:%M:%S',
1760 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
1761 DATE_FORMATS_DAY_FIRST
.extend([
1767 '%d/%m/%Y %H:%M:%S',
1770 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
1771 DATE_FORMATS_MONTH_FIRST
.extend([
1776 '%m/%d/%Y %H:%M:%S',
# Matches JavaScript obfuscated with the common p,a,c,k,e,d packer: captures
# (payload, radix, count, '|'-separated keyword table).
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element (quote style of the
# type attribute may vary — backreference \1); payload lands in the
# 'json_ld' named group. Case-insensitive, dot matches newlines.
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # In this copy the computed value was never returned; restore the
    # verify-and-fall-back behavior so callers always get a usable name.
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported codec actually works
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible.

    The object is serialized into a NamedTemporaryFile in the target
    directory and then os.rename()d over fn, so readers never observe a
    partially-written file.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        # Put the temp file next to the target so the final rename is atomic
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        try:
            # Apply the process umask to the 0666 default so the result has
            # normal file permissions (NamedTemporaryFile creates it 0600)
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temp file, then re-raise
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] (any value of @key when val is None) """
        # Attribute names are interpolated into the expression, so restrict
        # them to a safe character set first
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    # Python 2.6 ElementTree has no attribute predicates; scan manually
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] (any value of @key when val is None) """
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into '{uri}tag' form.

    ns_map maps namespace prefixes to URIs; steps without a prefix are kept
    as-is. Raises KeyError for a prefix missing from ns_map.
    """
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first descendant of `node` matching `xpath`.

    xpath may be a single expression or an iterable of expressions tried in
    order. When nothing matches: return `default` if one was given, raise
    ExtractorError if fatal=True, otherwise return None.
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's text.

    Falls back to `default`/None or raises ExtractorError (fatal=True) when
    the element is missing or has no text.
    """
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    # n may already be the caller's default (element lookup failed)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute `key` of the first element matching xpath[@key].

    When no such element exists: return `default` if given, raise
    ExtractorError if fatal=True, otherwise return None.
    """
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper: an 'id' lookup is just an attribute lookup
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag carrying attribute=value, or None."""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if not matches:
        return None
    return matches[0]
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # The class attribute may hold several whitespace-separated classes, so
    # match class_name as a word anywhere inside the attribute value
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_pattern, html, escape_value=False)
def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document.

    `value` is regex-escaped unless escape_value=False (callers may pass a
    pre-built pattern). Returns a list of unescaped tag contents.
    """
    value = re.escape(value) if escape_value else value

    retlist = []
    # Match an opening tag carrying attribute=value (attribute may appear
    # among arbitrary other attributes), capture up to the matching close tag
    for m in re.finditer(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html):
        res = m.group('content')

        # Strip one layer of surrounding quotes if present
        if res.startswith('"') or res.startswith("'"):
            res = res[1:-1]

        retlist.append(unescapeHTML(res))

    return retlist
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        # attrs holds the attribute dict of the last start tag seen
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            # '-' means stdout; on Windows it must be switched to binary mode
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors won't be fixed by renaming — re-raise immediately
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Keep timestamps readable: 12:34:56 -> 12_34_56 before ':' replacement
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        # Collapse runs of underscores introduced above
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows.

    On other platforms the path is returned unchanged. Forbidden characters
    in each path component are replaced by '#'; drive/UNC prefixes are kept.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        # splitdrive() does not handle UNC paths before 2.7
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
def sanitize_url(url):
    """Normalize a URL: add a scheme to protocol-relative URLs and fix
    common scheme typos; otherwise return the URL unchanged."""
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request after running the URL through sanitize_url()."""
    clean_url = sanitize_url(url)
    return compat_urllib_request.Request(clean_url, *args, **kwargs)
def expand_path(s):
    # NOTE(review): the `def` header line was lost in this copy; restored
    # from the docstring/body so the module parses.
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first-seen order """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
def unescapeHTML(s):
    """Replace HTML entities in `s` with their characters; None passes through."""
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
    """Return the encoding to use for subprocess argument/output handling."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the result is passed to a subprocess
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename(): decode a byte filename on Python 2;
    pass values through unchanged on Python 3."""
    if sys.version_info >= (3, 0):
        return b
    if not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument via encodeFilename(for_subprocess=True)."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeArgument(b):
    """Decode a command-line argument via decodeFilename(for_subprocess=True)."""
    return decodeFilename(b, True)
def decodeOption(optval):
    """Decode a command-line option value to text; None passes through."""
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
def formatSeconds(secs, delim=':'):
    """Format a number of seconds as [H<delim>]MM<delim>SS or plain seconds.

    Components below the leading one are zero-padded to two digits.
    """
    if secs > 3600:
        return '%d%s%02d%s%02d' % (secs // 3600, delim, (secs % 3600) // 60, delim, secs % 60)
    elif secs > 60:
        return '%d%s%02d' % (secs // 60, delim, secs % 60)
    else:
        return '%d' % secs
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler with an SSL context appropriate for the
    running Python version; 'nocheckcertificate' disables verification."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    """Return the standard 'please report this issue' suffix for error messages."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dlc -U to update'
    else:
        update_cmd = 'see https://github.com/pukkandan/yt-dlc on how to update'
    msg = '; please report this issue on https://github.com/pukkandan/yt-dlc .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dlc with the --verbose flag and include its complete output.'
    return msg
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors; the more specific errors below derive from it."""
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dlc.
        """

        # Network-type failures are always "expected" (not a bug)
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string ('' when absent)."""
        if self.traceback is None:
            return ''
        return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised for URLs that no extractor can handle."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None):
        # countries: optional collection of country codes — TODO confirm
        # exact format against callers; stored as-is on the instance
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        self.countries = countries
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
        # Keep the message accessible as an attribute as well
        self.msg = msg
class MaxDownloadsReached(YoutubeDLError):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
class XAttrMetadataError(YoutubeDLError):
    """Error while writing extended file attributes; classifies the cause
    into a machine-readable `reason` attribute."""

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
class XAttrUnavailableError(YoutubeDLError):
    # NOTE(review): the class body was lost in this copy; a docstring is
    # supplied so the definition stays syntactically valid. Judging by the
    # name it is raised when xattr support is unavailable — confirm upstream.
    """Raised when extended-attribute (xattr) support is unavailable — TODO confirm."""
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate an HTTP(S) connection, optionally bound to the handler's
    configured source_address."""
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # Match the address family of the requested source address
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6: patch connect() directly
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, return a new dict without it and without any
    Accept-Encoding header (case-insensitive); otherwise return the mapping
    unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = {k: v for k, v in headers.items() if k.lower() != 'accept-encoding'}
    del filtered['Youtubedl-no-compression']
    return filtered
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        # Raw deflate streams lack the zlib header; try raw first
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class from base_class that tunnels through the
    SOCKS proxy given as a socks/socks4/socks4a/socks5 URL."""
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS counterpart of YoutubeDLHandler; supports a custom connection
    class and Ytdl-socks-proxy tunneling."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)
.MozillaCookieJar
):
2746 See [1] for cookie file format.
2748 1. https://curl.haxx.se/docs/http-cookies.html
2750 _HTTPONLY_PREFIX
= '#HttpOnly_'
2752 _HEADER
= '''# Netscape HTTP Cookie File
2753 # This file is generated by youtube-dlc. Do not edit.
2756 _CookieFileEntry
= collections
.namedtuple(
2758 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
2760 def save(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2762 Save cookies to a file.
2764 Most of the code is taken from CPython 3.8 and slightly adapted
2765 to support cookie files with UTF-8 in both python 2 and 3.
2767 if filename
is None:
2768 if self
.filename
is not None:
2769 filename
= self
.filename
2771 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2773 # Store session cookies with `expires` set to 0 instead of an empty
2776 if cookie
.expires
is None:
2779 with io
.open(filename
, 'w', encoding
='utf-8') as f
:
2780 f
.write(self
._HEADER
)
2783 if not ignore_discard
and cookie
.discard
:
2785 if not ignore_expires
and cookie
.is_expired(now
):
2791 if cookie
.domain
.startswith('.'):
2792 initial_dot
= 'TRUE'
2794 initial_dot
= 'FALSE'
2795 if cookie
.expires
is not None:
2796 expires
= compat_str(cookie
.expires
)
2799 if cookie
.value
is None:
2800 # cookies.txt regards 'Set-Cookie: foo' as a cookie
2801 # with no name, whereas http.cookiejar regards it as a
2802 # cookie with no value.
2807 value
= cookie
.value
2809 '\t'.join([cookie
.domain
, initial_dot
, cookie
.path
,
2810 secure
, expires
, name
, value
]) + '\n')
2812 def load(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2813 """Load cookies from a file."""
2814 if filename
is None:
2815 if self
.filename
is not None:
2816 filename
= self
.filename
2818 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2820 def prepare_line(line
):
2821 if line
.startswith(self
._HTTPONLY
_PREFIX
):
2822 line
= line
[len(self
._HTTPONLY
_PREFIX
):]
2823 # comments and empty lines are fine
2824 if line
.startswith('#') or not line
.strip():
2826 cookie_list
= line
.split('\t')
2827 if len(cookie_list
) != self
._ENTRY
_LEN
:
2828 raise compat_cookiejar
.LoadError('invalid length %d' % len(cookie_list
))
2829 cookie
= self
._CookieFileEntry
(*cookie_list
)
2830 if cookie
.expires_at
and not cookie
.expires_at
.isdigit():
2831 raise compat_cookiejar
.LoadError('invalid expires at %s' % cookie
.expires_at
)
2835 with io
.open(filename
, encoding
='utf-8') as f
:
2838 cf
.write(prepare_line(line
))
2839 except compat_cookiejar
.LoadError
as e
:
2841 'WARNING: skipping cookie file entry due to %s: %r\n'
2842 % (e
, line
), sys
.stderr
)
2845 self
._really
_load
(cf
, filename
, ignore_discard
, ignore_expires
)
2846 # Session cookies are denoted by either `expires` field set to
2847 # an empty string or 0. MozillaCookieJar only recognizes the former
2848 # (see [1]). So we need force the latter to be recognized as session
2849 # cookies on our own.
2850 # Session cookies may be important for cookies-based authentication,
2851 # e.g. usually, when user does not check 'Remember me' check box while
2852 # logging in on a site, some important cookies are stored as session
2853 # cookies so that not recognizing them will result in failed login.
2854 # 1. https://bugs.python.org/issue17164
2856 # Treat `expires=0` cookies as session cookies
2857 if cookie
.expires
== 0:
2858 cookie
.expires
= None
2859 cookie
.discard
= True
2862 class YoutubeDLCookieProcessor(compat_urllib_request
.HTTPCookieProcessor
):
2863 def __init__(self
, cookiejar
=None):
2864 compat_urllib_request
.HTTPCookieProcessor
.__init
__(self
, cookiejar
)
2866 def http_response(self
, request
, response
):
2867 # Python 2 will choke on next HTTP request in row if there are non-ASCII
2868 # characters in Set-Cookie HTTP header of last response (see
2869 # https://github.com/ytdl-org/youtube-dl/issues/6769).
2870 # In order to at least prevent crashing we will percent encode Set-Cookie
2871 # header before HTTPCookieProcessor starts processing it.
2872 # if sys.version_info < (3, 0) and response.headers:
2873 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
2874 # set_cookie = response.headers.get(set_cookie_header)
2876 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
2877 # if set_cookie != set_cookie_escaped:
2878 # del response.headers[set_cookie_header]
2879 # response.headers[set_cookie_header] = set_cookie_escaped
2880 return compat_urllib_request
.HTTPCookieProcessor
.http_response(self
, request
, response
)
2882 https_request
= compat_urllib_request
.HTTPCookieProcessor
.http_request
2883 https_response
= http_response
2886 class YoutubeDLRedirectHandler(compat_urllib_request
.HTTPRedirectHandler
):
2887 if sys
.version_info
[0] < 3:
2888 def redirect_request(self
, req
, fp
, code
, msg
, headers
, newurl
):
2889 # On python 2 urlh.geturl() may sometimes return redirect URL
2890 # as byte string instead of unicode. This workaround allows
2891 # to force it always return unicode.
2892 return compat_urllib_request
.HTTPRedirectHandler
.redirect_request(self
, req
, fp
, code
, msg
, headers
, compat_str(newurl
))
def extract_timezone(date_str):
    """Split a trailing UTC-offset marker off *date_str*.

    Returns a ``(timedelta, remaining_string)`` pair.  The delta is zero
    when no offset terminates the string or when it is a literal 'Z'.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        return datetime.timedelta(), date_str
    # Drop the matched timezone suffix from the date string.
    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # A bare 'Z' means UTC: zero offset.
        return datetime.timedelta(), date_str
    direction = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(m.group('hours')),
        minutes=direction * int(m.group('minutes')))
    return offset, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """Return a UNIX timestamp for an ISO 8601-like date string.

    *timezone* may be supplied as a datetime.timedelta to skip offset
    extraction.  Returns None for None or unparsable input.
    """
    if date_str is None:
        return None

    # strptime cannot digest fractional seconds, so strip them first.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    try:
        parsed = datetime.datetime.strptime(date_str, date_format) - timezone
    except ValueError:
        return None
    return calendar.timegm(parsed.timetuple())
2932 def date_formats(day_first
=True):
2933 return DATE_FORMATS_DAY_FIRST
if day_first
else DATE_FORMATS_MONTH_FIRST
2936 def unified_strdate(date_str
, day_first
=True):
2937 """Return a string with the date in the format YYYYMMDD"""
2939 if date_str
is None:
2943 date_str
= date_str
.replace(',', ' ')
2944 # Remove AM/PM + timezone
2945 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
2946 _
, date_str
= extract_timezone(date_str
)
2948 for expression
in date_formats(day_first
):
2950 upload_date
= datetime
.datetime
.strptime(date_str
, expression
).strftime('%Y%m%d')
2953 if upload_date
is None:
2954 timetuple
= email
.utils
.parsedate_tz(date_str
)
2957 upload_date
= datetime
.datetime(*timetuple
[:6]).strftime('%Y%m%d')
2960 if upload_date
is not None:
2961 return compat_str(upload_date
)
2964 def unified_timestamp(date_str
, day_first
=True):
2965 if date_str
is None:
2968 date_str
= re
.sub(r
'[,|]', '', date_str
)
2970 pm_delta
= 12 if re
.search(r
'(?i)PM', date_str
) else 0
2971 timezone
, date_str
= extract_timezone(date_str
)
2973 # Remove AM/PM + timezone
2974 date_str
= re
.sub(r
'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str
)
2976 # Remove unrecognized timezones from ISO 8601 alike timestamps
2977 m
= re
.search(r
'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str
)
2979 date_str
= date_str
[:-len(m
.group('tz'))]
2981 # Python only supports microseconds, so remove nanoseconds
2982 m
= re
.search(r
'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str
)
2984 date_str
= m
.group(1)
2986 for expression
in date_formats(day_first
):
2988 dt
= datetime
.datetime
.strptime(date_str
, expression
) - timezone
+ datetime
.timedelta(hours
=pm_delta
)
2989 return calendar
.timegm(dt
.timetuple())
2992 timetuple
= email
.utils
.parsedate_tz(date_str
)
2994 return calendar
.timegm(timetuple
) + pm_delta
* 3600
2997 def determine_ext(url
, default_ext
='unknown_video'):
2998 if url
is None or '.' not in url
:
3000 guess
= url
.partition('?')[0].rpartition('.')[2]
3001 if re
.match(r
'^[A-Za-z0-9]+$', guess
):
3003 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
3004 elif guess
.rstrip('/') in KNOWN_EXTENSIONS
:
3005 return guess
.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build a subtitle file name by swapping the media extension for '<lang>.<format>'."""
    sub_ext = '{0}.{1}'.format(sub_lang, sub_format)
    return replace_extension(filename, sub_ext, expected_real_ext)
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # A bad approximation?  Months/years become 30/365 days.
        if unit == 'month':
            unit, amount = 'day', amount * 30
        elif unit == 'year':
            unit, amount = 'day', amount * 365
        # timedelta takes the pluralized keyword ('days', 'weeks', ...).
        return today + datetime.timedelta(**{unit + 's': amount})
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
def hyphenate_date(date_str):
    """Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD'; other strings pass through."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return date_str if parts is None else '%s-%s-%s' % parts.groups()
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Open-ended boundaries default to the min/max representable dates.
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
3084 def platform_name():
3085 """ Returns the platform name as a compat_str """
3086 res
= platform
.platform()
3087 if isinstance(res
, bytes):
3088 res
= res
.decode(preferredencoding())
3090 assert isinstance(res
, compat_str
)
3094 def _windows_write_string(s
, out
):
3095 """ Returns True if the string was written using special methods,
3096 False if it has yet to be written out."""
3097 # Adapted from http://stackoverflow.com/a/3259271/35070
3100 import ctypes
.wintypes
3108 fileno
= out
.fileno()
3109 except AttributeError:
3110 # If the output stream doesn't have a fileno, it's virtual
3112 except io
.UnsupportedOperation
:
3113 # Some strange Windows pseudo files?
3115 if fileno
not in WIN_OUTPUT_IDS
:
3118 GetStdHandle
= compat_ctypes_WINFUNCTYPE(
3119 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
3120 ('GetStdHandle', ctypes
.windll
.kernel32
))
3121 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
3123 WriteConsoleW
= compat_ctypes_WINFUNCTYPE(
3124 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
3125 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
3126 ctypes
.wintypes
.LPVOID
)(('WriteConsoleW', ctypes
.windll
.kernel32
))
3127 written
= ctypes
.wintypes
.DWORD(0)
3129 GetFileType
= compat_ctypes_WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)(('GetFileType', ctypes
.windll
.kernel32
))
3130 FILE_TYPE_CHAR
= 0x0002
3131 FILE_TYPE_REMOTE
= 0x8000
3132 GetConsoleMode
= compat_ctypes_WINFUNCTYPE(
3133 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
3134 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
3135 ('GetConsoleMode', ctypes
.windll
.kernel32
))
3136 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
3138 def not_a_console(handle
):
3139 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
3141 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
3142 or GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
3144 if not_a_console(h
):
3147 def next_nonbmp_pos(s
):
3149 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
3150 except StopIteration:
3154 count
= min(next_nonbmp_pos(s
), 1024)
3156 ret
= WriteConsoleW(
3157 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
3159 raise OSError('Failed to write string')
3160 if not count
: # We just wrote a non-BMP character
3161 assert written
.value
== 2
3164 assert written
.value
> 0
3165 s
= s
[written
.value
:]
3169 def write_string(s
, out
=None, encoding
=None):
3172 assert type(s
) == compat_str
3174 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
3175 if _windows_write_string(s
, out
):
3178 if ('b' in getattr(out
, 'mode', '')
3179 or sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
3180 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
3182 elif hasattr(out
, 'buffer'):
3183 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
3184 byt
= s
.encode(enc
, 'ignore')
3185 out
.buffer.write(byt
)
def bytes_to_intlist(bs):
    """Turn a bytes (or Python 2 str) buffer into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3 bytes already index as ints
        return list(bs)
    return [ord(ch) for ch in bs]
3200 def intlist_to_bytes(xs
):
3203 return compat_struct_pack('%dB' % len(xs
), *xs
)
3206 # Cross-platform file locking
3207 if sys
.platform
== 'win32':
3208 import ctypes
.wintypes
3211 class OVERLAPPED(ctypes
.Structure
):
3213 ('Internal', ctypes
.wintypes
.LPVOID
),
3214 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
3215 ('Offset', ctypes
.wintypes
.DWORD
),
3216 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
3217 ('hEvent', ctypes
.wintypes
.HANDLE
),
3220 kernel32
= ctypes
.windll
.kernel32
3221 LockFileEx
= kernel32
.LockFileEx
3222 LockFileEx
.argtypes
= [
3223 ctypes
.wintypes
.HANDLE
, # hFile
3224 ctypes
.wintypes
.DWORD
, # dwFlags
3225 ctypes
.wintypes
.DWORD
, # dwReserved
3226 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3227 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3228 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3230 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
3231 UnlockFileEx
= kernel32
.UnlockFileEx
3232 UnlockFileEx
.argtypes
= [
3233 ctypes
.wintypes
.HANDLE
, # hFile
3234 ctypes
.wintypes
.DWORD
, # dwReserved
3235 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3236 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3237 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3239 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
3240 whole_low
= 0xffffffff
3241 whole_high
= 0x7fffffff
3243 def _lock_file(f
, exclusive
):
3244 overlapped
= OVERLAPPED()
3245 overlapped
.Offset
= 0
3246 overlapped
.OffsetHigh
= 0
3247 overlapped
.hEvent
= 0
3248 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
3249 handle
= msvcrt
.get_osfhandle(f
.fileno())
3250 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
3251 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3252 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
3254 def _unlock_file(f
):
3255 assert f
._lock
_file
_overlapped
_p
3256 handle
= msvcrt
.get_osfhandle(f
.fileno())
3257 if not UnlockFileEx(handle
, 0,
3258 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3259 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
3262 # Some platforms, such as Jython, is missing fcntl
3266 def _lock_file(f
, exclusive
):
3267 fcntl
.flock(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
3269 def _unlock_file(f
):
3270 fcntl
.flock(f
, fcntl
.LOCK_UN
)
3272 UNSUPPORTED_MSG
= 'file locking is not supported on this platform'
3274 def _lock_file(f
, exclusive
):
3275 raise IOError(UNSUPPORTED_MSG
)
3277 def _unlock_file(f
):
3278 raise IOError(UNSUPPORTED_MSG
)
3281 class locked_file(object):
3282 def __init__(self
, filename
, mode
, encoding
=None):
3283 assert mode
in ['r', 'a', 'w']
3284 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
3287 def __enter__(self
):
3288 exclusive
= self
.mode
!= 'r'
3290 _lock_file(self
.f
, exclusive
)
3296 def __exit__(self
, etype
, value
, traceback
):
3298 _unlock_file(self
.f
)
3305 def write(self
, *args
):
3306 return self
.f
.write(*args
)
3308 def read(self
, *args
):
3309 return self
.f
.read(*args
)
def get_filesystem_encoding():
    """Return the file-system encoding, falling back to 'utf-8' if undetermined."""
    detected = sys.getfilesystemencoding()
    if detected is None:
        return 'utf-8'
    return detected
3317 def shell_quote(args
):
3319 encoding
= get_filesystem_encoding()
3321 if isinstance(a
, bytes):
3322 # We may get a filename encoded with 'encodeFilename'
3323 a
= a
.decode(encoding
)
3324 quoted_args
.append(compat_shlex_quote(a
))
3325 return ' '.join(quoted_args
)
3328 def smuggle_url(url
, data
):
3329 """ Pass additional data in a URL for internal use. """
3331 url
, idata
= unsmuggle_url(url
, {})
3333 sdata
= compat_urllib_parse_urlencode(
3334 {'__youtubedl_smuggle': json.dumps(data)}
)
3335 return url
+ '#' + sdata
3338 def unsmuggle_url(smug_url
, default
=None):
3339 if '#__youtubedl_smuggle' not in smug_url
:
3340 return smug_url
, default
3341 url
, _
, sdata
= smug_url
.rpartition('#')
3342 jsond
= compat_parse_qs(sdata
)['__youtubedl_smuggle'][0]
3343 data
= json
.loads(jsond
)
def format_bytes(bytes):
    """Render a byte count as a human-readable string, e.g. 1536 -> '1.50KiB'.

    Accepts None (rendered as 'N/A'), numeric strings, ints and floats.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    _SUFFIXES = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # Clamp the exponent: without this, sub-byte values produce a
        # negative log and silently index the suffix list from the end
        # (0.5 -> 'YiB'), and absurdly large counts raise IndexError.
        exponent = min(max(int(math.log(bytes, 1024.0)), 0), len(_SUFFIXES) - 1)
    suffix = _SUFFIXES[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number> <unit>' from *s*, scaling by the unit's multiplier.

    *unit_table* maps unit names to numeric multipliers.  Returns an int,
    or None when *s* does not start with a recognizable quantity.
    """
    units_re = '|'.join(re.escape(unit) for unit in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    # Accept ',' as a decimal separator as well.
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
3372 def parse_filesize(s
):
3376 # The lower-case forms are of course incorrect and unofficial,
3377 # but we support those too
3394 'megabytes': 1000 ** 2,
3395 'mebibytes': 1024 ** 2,
3401 'gigabytes': 1000 ** 3,
3402 'gibibytes': 1024 ** 3,
3408 'terabytes': 1000 ** 4,
3409 'tebibytes': 1024 ** 4,
3415 'petabytes': 1000 ** 5,
3416 'pebibytes': 1024 ** 5,
3422 'exabytes': 1000 ** 6,
3423 'exbibytes': 1024 ** 6,
3429 'zettabytes': 1000 ** 7,
3430 'zebibytes': 1024 ** 7,
3436 'yottabytes': 1000 ** 8,
3437 'yobibytes': 1024 ** 8,
3440 return lookup_unit_table(_UNIT_TABLE
, s
)
3449 if re
.match(r
'^[\d,.]+$', s
):
3450 return str_to_int(s
)
3461 return lookup_unit_table(_UNIT_TABLE
, s
)
def parse_resolution(s):
    """Extract a {'width': ..., 'height': ...} dict from a resolution label.

    Understands 'WxH', 'NNNp'/'NNNi' and '4k'/'8k' forms; anything else
    (including None) yields an empty dict.
    """
    if s is None:
        return {}

    wh = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
    if wh:
        return {
            'width': int(wh.group('w')),
            'height': int(wh.group('h')),
        }

    scanline = re.search(r'\b(\d+)[pPiI]\b', s)
    if scanline:
        return {'height': int(scanline.group(1))}

    uhd = re.search(r'\b([48])[kK]\b', s)
    if uhd:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(uhd.group(1)) * 540}

    return {}
3486 def parse_bitrate(s
):
3487 if not isinstance(s
, compat_str
):
3489 mobj
= re
.search(r
'\b(\d+)\s*kbps', s
)
3491 return int(mobj
.group(1))
3494 def month_by_name(name
, lang
='en'):
3495 """ Return the number of a month by (locale-independently) English name """
3497 month_names
= MONTH_NAMES
.get(lang
, MONTH_NAMES
['en'])
3500 return month_names
.index(name
) + 1
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    three_letter = [name[:3] for name in ENGLISH_MONTH_NAMES]
    try:
        return three_letter.index(abbrev) + 1
    except ValueError:
        return None
3515 def fix_xml_ampersands(xml_str
):
3516 """Replace all the '&' by '&' in XML"""
3518 r
'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
3523 def setproctitle(title
):
3524 assert isinstance(title
, compat_str
)
3526 # ctypes in Jython is not complete
3527 # http://bugs.jython.org/issue2148
3528 if sys
.platform
.startswith('java'):
3532 libc
= ctypes
.cdll
.LoadLibrary('libc.so.6')
3536 # LoadLibrary in Windows Python 2.7.13 only expects
3537 # a bytestring, but since unicode_literals turns
3538 # every string into a unicode string, it fails.
3540 title_bytes
= title
.encode('utf-8')
3541 buf
= ctypes
.create_string_buffer(len(title_bytes
))
3542 buf
.value
= title_bytes
3544 libc
.prctl(15, buf
, 0, 0, 0)
3545 except AttributeError:
3546 return # Strange libc, just skip this
def remove_start(s, start):
    """Strip *start* from the beginning of *s* when present (None passes through)."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Strip *end* from the end of *s* when present (None passes through).

    Uses a non-negative slice bound: the previous ``s[:-len(end)]`` form
    truncated the entire string when *end* was empty, because every string
    "endswith" '' and ``s[:-0]`` is ''.
    """
    if s is not None and s.endswith(end):
        return s[:len(s) - len(end)]
    return s
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes from *s*."""
    if s is None or len(s) < 2:
        return s
    for quote_char in ('"', "'", ):
        if s[0] == quote_char and s[-1] == quote_char:
            return s[1:-1]
    return s
def get_domain(url):
    """Extract the bare domain (no scheme, no leading 'www.') from *url*, or None."""
    matched = re.match(
        r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if matched:
        return matched.group('domain')
    return None
3571 def url_basename(url
):
3572 path
= compat_urlparse
.urlparse(url
).path
3573 return path
.strip('/').split('/')[-1]
3577 return re
.match(r
'https?://[^?#&]+/', url
).group()
3580 def urljoin(base
, path
):
3581 if isinstance(path
, bytes):
3582 path
= path
.decode('utf-8')
3583 if not isinstance(path
, compat_str
) or not path
:
3585 if re
.match(r
'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path
):
3587 if isinstance(base
, bytes):
3588 base
= base
.decode('utf-8')
3589 if not isinstance(base
, compat_str
) or not re
.match(
3590 r
'^(?:https?:)?//', base
):
3592 return compat_urlparse
.urljoin(base
, path
)
3595 class HEADRequest(compat_urllib_request
.Request
):
3596 def get_method(self
):
3600 class PUTRequest(compat_urllib_request
.Request
):
3601 def get_method(self
):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Best-effort int conversion: *default* for None, '' or unparsable input.

    When *get_attr* is given, the named attribute of *v* is converted
    instead.  The result is multiplied by *invscale* and floor-divided
    by *scale*.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        result = int(v) * invscale // scale
    except (ValueError, TypeError):
        return default
    return result
3619 def str_or_none(v
, default
=None):
3620 return default
if v
is None else compat_str(v
)
3623 def str_to_int(int_str
):
3624 """ A more relaxed version of int_or_none """
3625 if isinstance(int_str
, compat_integer_types
):
3627 elif isinstance(int_str
, compat_str
):
3628 int_str
= re
.sub(r
'[,\.\+]', '', int_str
)
3629 return int_or_none(int_str
)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Best-effort float conversion; returns *default* for None or junk input."""
    if v is None:
        return default
    try:
        result = float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
    return result
def bool_or_none(v, default=None):
    """Return *v* only when it is a real bool; anything else yields *default*."""
    if isinstance(v, bool):
        return v
    return default
3645 def strip_or_none(v
, default
=None):
3646 return v
.strip() if isinstance(v
, compat_str
) else default
3649 def url_or_none(url
):
3650 if not url
or not isinstance(url
, compat_str
):
3653 return url
if re
.match(r
'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url
) else None
3656 def parse_duration(s
):
3657 if not isinstance(s
, compat_basestring
):
3662 days
, hours
, mins
, secs
, ms
= [None] * 5
3663 m
= re
.match(r
'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s
)
3665 days
, hours
, mins
, secs
, ms
= m
.groups()
3670 [0-9]+\s*y(?:ears?)?\s*
3673 [0-9]+\s*m(?:onths?)?\s*
3676 [0-9]+\s*w(?:eeks?)?\s*
3679 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
3683 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
3686 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
3689 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
3692 days
, hours
, mins
, secs
, ms
= m
.groups()
3694 m
= re
.match(r
'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s
)
3696 hours
, mins
= m
.groups()
3702 duration
+= float(secs
)
3704 duration
+= float(mins
) * 60
3706 duration
+= float(hours
) * 60 * 60
3708 duration
+= float(days
) * 24 * 60 * 60
3710 duration
+= float(ms
)
3714 def prepend_extension(filename
, ext
, expected_real_ext
=None):
3715 name
, real_ext
= os
.path
.splitext(filename
)
3717 '{0}.{1}{2}'.format(name
, ext
, real_ext
)
3718 if not expected_real_ext
or real_ext
[1:] == expected_real_ext
3719 else '{0}.{1}'.format(filename
, ext
))
3722 def replace_extension(filename
, ext
, expected_real_ext
=None):
3723 name
, real_ext
= os
.path
.splitext(filename
)
3724 return '{0}.{1}'.format(
3725 name
if not expected_real_ext
or real_ext
[1:] == expected_real_ext
else filename
,
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        proc = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        # Binary not found (or not runnable) on PATH.
        return False
    return exe
3739 def get_exe_version(exe
, args
=['--version'],
3740 version_re
=None, unrecognized
='present'):
3741 """ Returns the version of the specified executable,
3742 or False if the executable is not present """
3744 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
3745 # SIGTTOU if youtube-dlc is run in the background.
3746 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
3747 out
, _
= subprocess
.Popen(
3748 [encodeArgument(exe
)] + args
,
3749 stdin
=subprocess
.PIPE
,
3750 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
).communicate()
3753 if isinstance(out
, bytes): # Python 2.x
3754 out
= out
.decode('ascii', 'ignore')
3755 return detect_exe_version(out
, version_re
, unrecognized
)
3758 def detect_exe_version(output
, version_re
=None, unrecognized
='present'):
3759 assert isinstance(output
, compat_str
)
3760 if version_re
is None:
3761 version_re
= r
'version\s+([-0-9._a-zA-Z]+)'
3762 m
= re
.search(version_re
, output
)
3769 class PagedList(object):
3771 # This is only useful for tests
3772 return len(self
.getslice())
3775 class OnDemandPagedList(PagedList
):
3776 def __init__(self
, pagefunc
, pagesize
, use_cache
=True):
3777 self
._pagefunc
= pagefunc
3778 self
._pagesize
= pagesize
3779 self
._use
_cache
= use_cache
3783 def getslice(self
, start
=0, end
=None):
3785 for pagenum
in itertools
.count(start
// self
._pagesize
):
3786 firstid
= pagenum
* self
._pagesize
3787 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
3788 if start
>= nextfirstid
:
3793 page_results
= self
._cache
.get(pagenum
)
3794 if page_results
is None:
3795 page_results
= list(self
._pagefunc
(pagenum
))
3797 self
._cache
[pagenum
] = page_results
3800 start
% self
._pagesize
3801 if firstid
<= start
< nextfirstid
3805 ((end
- 1) % self
._pagesize
) + 1
3806 if (end
is not None and firstid
<= end
<= nextfirstid
)
3809 if startv
!= 0 or endv
is not None:
3810 page_results
= page_results
[startv
:endv
]
3811 res
.extend(page_results
)
3813 # A little optimization - if current page is not "full", ie. does
3814 # not contain page_size videos then we can assume that this page
3815 # is the last one - there are no more ids on further pages -
3816 # i.e. no need to query again.
3817 if len(page_results
) + startv
< self
._pagesize
:
3820 # If we got the whole page, but the next page is not interesting,
3821 # break out early as well
3822 if end
== nextfirstid
:
3827 class InAdvancePagedList(PagedList
):
3828 def __init__(self
, pagefunc
, pagecount
, pagesize
):
3829 self
._pagefunc
= pagefunc
3830 self
._pagecount
= pagecount
3831 self
._pagesize
= pagesize
3833 def getslice(self
, start
=0, end
=None):
3835 start_page
= start
// self
._pagesize
3837 self
._pagecount
if end
is None else (end
// self
._pagesize
+ 1))
3838 skip_elems
= start
- start_page
* self
._pagesize
3839 only_more
= None if end
is None else end
- start
3840 for pagenum
in range(start_page
, end_page
):
3841 page
= list(self
._pagefunc
(pagenum
))
3843 page
= page
[skip_elems
:]
3845 if only_more
is not None:
3846 if len(page
) < only_more
:
3847 only_more
-= len(page
)
3849 page
= page
[:only_more
]
3856 def uppercase_escape(s
):
3857 unicode_escape
= codecs
.getdecoder('unicode_escape')
3859 r
'\\U[0-9a-fA-F]{8}',
3860 lambda m
: unicode_escape(m
.group(0))[0],
3864 def lowercase_escape(s
):
3865 unicode_escape
= codecs
.getdecoder('unicode_escape')
3867 r
'\\u[0-9a-fA-F]{4}',
3868 lambda m
: unicode_escape(m
.group(0))[0],
3872 def escape_rfc3986(s
):
3873 """Escape non-ASCII characters as suggested by RFC 3986"""
3874 if sys
.version_info
< (3, 0) and isinstance(s
, compat_str
):
3875 s
= s
.encode('utf-8')
3876 return compat_urllib_parse
.quote(s
, b
"%/;:@&=+$,!~*'()?#[]")
3879 def escape_url(url
):
3880 """Escape URL as suggested by RFC 3986"""
3881 url_parsed
= compat_urllib_parse_urlparse(url
)
3882 return url_parsed
._replace
(
3883 netloc
=url_parsed
.netloc
.encode('idna').decode('ascii'),
3884 path
=escape_rfc3986(url_parsed
.path
),
3885 params
=escape_rfc3986(url_parsed
.params
),
3886 query
=escape_rfc3986(url_parsed
.query
),
3887 fragment
=escape_rfc3986(url_parsed
.fragment
)
3891 def read_batch_urls(batch_fd
):
3893 if not isinstance(url
, compat_str
):
3894 url
= url
.decode('utf-8', 'replace')
3895 BOM_UTF8
= '\xef\xbb\xbf'
3896 if url
.startswith(BOM_UTF8
):
3897 url
= url
[len(BOM_UTF8
):]
3899 if url
.startswith(('#', ';', ']')):
3903 with contextlib
.closing(batch_fd
) as fd
:
3904 return [url
for url
in map(fixup
, fd
) if url
]
3907 def urlencode_postdata(*args
, **kargs
):
3908 return compat_urllib_parse_urlencode(*args
, **kargs
).encode('ascii')
3911 def update_url_query(url
, query
):
3914 parsed_url
= compat_urlparse
.urlparse(url
)
3915 qs
= compat_parse_qs(parsed_url
.query
)
3917 return compat_urlparse
.urlunparse(parsed_url
._replace
(
3918 query
=compat_urllib_parse_urlencode(qs
, True)))
3921 def update_Request(req
, url
=None, data
=None, headers
={}, query={}
):
3922 req_headers
= req
.headers
.copy()
3923 req_headers
.update(headers
)
3924 req_data
= data
or req
.data
3925 req_url
= update_url_query(url
or req
.get_full_url(), query
)
3926 req_get_method
= req
.get_method()
3927 if req_get_method
== 'HEAD':
3928 req_type
= HEADRequest
3929 elif req_get_method
== 'PUT':
3930 req_type
= PUTRequest
3932 req_type
= compat_urllib_request
.Request
3934 req_url
, data
=req_data
, headers
=req_headers
,
3935 origin_req_host
=req
.origin_req_host
, unverifiable
=req
.unverifiable
)
3936 if hasattr(req
, 'timeout'):
3937 new_req
.timeout
= req
.timeout
3941 def _multipart_encode_impl(data
, boundary
):
3942 content_type
= 'multipart/form-data; boundary=%s' % boundary
3945 for k
, v
in data
.items():
3946 out
+= b
'--' + boundary
.encode('ascii') + b
'\r\n'
3947 if isinstance(k
, compat_str
):
3948 k
= k
.encode('utf-8')
3949 if isinstance(v
, compat_str
):
3950 v
= v
.encode('utf-8')
3951 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3952 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3953 content
= b
'Content-Disposition: form-data; name="' + k
+ b
'"\r\n\r\n' + v
+ b
'\r\n'
3954 if boundary
.encode('ascii') in content
:
3955 raise ValueError('Boundary overlaps with data')
3958 out
+= b
'--' + boundary
.encode('ascii') + b
'--\r\n'
3960 return out
, content_type
3963 def multipart_encode(data
, boundary
=None):
3965 Encode a dict to RFC 7578-compliant form-data
3968 A dict where keys and values can be either Unicode or bytes-like
3971 If specified a Unicode object, it's used as the boundary. Otherwise
3972 a random boundary is generated.
3974 Reference: https://tools.ietf.org/html/rfc7578
3976 has_specified_boundary
= boundary
is not None
3979 if boundary
is None:
3980 boundary
= '---------------' + str(random
.randrange(0x0fffffff, 0xffffffff))
3983 out
, content_type
= _multipart_encode_impl(data
, boundary
)
3986 if has_specified_boundary
:
3990 return out
, content_type
3993 def dict_get(d
, key_or_keys
, default
=None, skip_false_values
=True):
3994 if isinstance(key_or_keys
, (list, tuple)):
3995 for key
in key_or_keys
:
3996 if key
not in d
or d
[key
] is None or skip_false_values
and not d
[key
]:
4000 return d
.get(key_or_keys
, default
)
4003 def try_get(src
, getter
, expected_type
=None):
4004 if not isinstance(getter
, (list, tuple)):
4009 except (AttributeError, KeyError, TypeError, IndexError):
4012 if expected_type
is None or isinstance(v
, expected_type
):
4016 def merge_dicts(*dicts
):
4018 for a_dict
in dicts
:
4019 for k
, v
in a_dict
.items():
4023 or (isinstance(v
, compat_str
) and v
4024 and isinstance(merged
[k
], compat_str
)
4025 and not merged
[k
])):
4030 def encode_compat_str(string
, encoding
=preferredencoding(), errors
='strict'):
4031 return string
if isinstance(string
, compat_str
) else compat_str(string
, encoding
, errors
)
4043 TV_PARENTAL_GUIDELINES
= {
4053 def parse_age_limit(s
):
4055 return s
if 0 <= s
<= 21 else None
4056 if not isinstance(s
, compat_basestring
):
4058 m
= re
.match(r
'^(?P<age>\d{1,2})\+?$', s
)
4060 return int(m
.group('age'))
4062 return US_RATINGS
[s
]
4063 m
= re
.match(r
'^TV[_-]?(%s)$' % '|'.join(k
[3:] for k
in TV_PARENTAL_GUIDELINES
), s
)
4065 return TV_PARENTAL_GUIDELINES
['TV-' + m
.group(1)]
4069 def strip_jsonp(code
):
4072 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
4073 (?:\s*&&\s*(?P=func_name))?
4074 \s*\(\s*(?P<callback_data>.*)\);?
4075 \s*?(?://[^\n]*)*$''',
4076 r
'\g<callback_data>', code
)
4079 def js_to_json(code
):
4080 COMMENT_RE
= r
'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
4081 SKIP_RE
= r
'\s*(?:{comment})?\s*'.format(comment
=COMMENT_RE
)
4083 (r
'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip
=SKIP_RE
), 16),
4084 (r
'(?s)^(0+[0-7]+){skip}:?$'.format(skip
=SKIP_RE
), 8),
4089 if v
in ('true', 'false', 'null'):
4091 elif v
.startswith('/*') or v
.startswith('//') or v
.startswith('!') or v
== ',':
4094 if v
[0] in ("'", '"'):
4095 v
= re
.sub(r
'(?s)\\.|"', lambda m
: {
4100 }.get(m
.group(0), m
.group(0)), v
[1:-1])
4102 for regex
, base
in INTEGER_TABLE
:
4103 im
= re
.match(regex
, v
)
4105 i
= int(im
.group(1), base
)
4106 return '"%d":' % i
if v
.endswith(':') else '%d' % i
4110 return re
.sub(r
'''(?sx)
4111 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
4112 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4113 {comment}|,(?={skip}[\]}}])|
4114 (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
4115 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
4118 '''.format(comment
=COMMENT_RE
, skip
=SKIP_RE
), fix_kv
, code
)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def rank(qid):
        # Position in the list is the quality score; unknown ids rank lowest.
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return rank
4131 DEFAULT_OUTTMPL
= '%(title)s [%(id)s].%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ellipses = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ellipses)] + ellipses
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints for comparison."""
    pieces = re.split(r'[-.]', v)
    return tuple(int(piece) for piece in pieces)
4148 def is_outdated_version(version
, limit
, assume_new
=True):
4150 return not assume_new
4152 return version_tuple(version
) < version_tuple(limit
)
4154 return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dlc can be updated with -U """
    # Only a zip-bundled or frozen (e.g. py2exe) build can self-update.
    from zipimport import zipimporter
    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(map(compat_shlex_quote, args))
def error_to_compat_str(err):
    """Return the message of *err* as a text (unicode) string."""
    msg = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        msg = msg.decode(preferredencoding())
    return msg
def mimetype2ext(mt):
    """Map a MIME type to the conventional file extension; falls back to the
    MIME subtype itself when unknown."""
    if mt is None:
        return None

    # Full-type exceptions that the subtype-based table below cannot express.
    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')
    res = res.split(';')[0].strip().lower()

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
    }.get(res, res)
def parse_codecs(codecs_str):
    # http://tools.ietf.org/html/rfc6381
    """Split an RFC 6381 codecs string into {'vcodec': ..., 'acodec': ...}."""
    if not codecs_str:
        return {}
    split_codecs = [c.strip() for c in codecs_str.strip().strip(',').split(',') if c.strip()]
    vcodec, acodec = None, None
    for full_codec in split_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # Unrecognized pair: assume the conventional "video, audio" order.
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a response's Content-Disposition filename,
    falling back to its Content-Type."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build a base64 `data:` URI for *data* (bytes) with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Decode according to a recognized BOM, defaulting to UTF-8.
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Determine the download protocol for an info dict, preferring an
    explicit 'protocol' field, then URL prefix, then extension/scheme."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data, delim=False, extraGap=0, hideEmpty=False):
    """ Render a list of rows, each as a list of values """

    def get_max_lens(table):
        # Widest stringified cell per column.
        return [max(len(compat_str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        return [col for (take, col) in zip(filterArray, row) if take]

    if hideEmpty:
        # Drop columns that are empty in every data row.
        max_lens = get_max_lens(data)
        header_row = filter_using_list(header_row, max_lens)
        data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    if delim:
        # Insert a dashed separator line under the header.
        table = [header_row] + [['-' * ml for ml in max_lens]] + data
    format_str = ' '.join('%-' + compat_str(ml + extraGap) + 's' for ml in max_lens[:-1]) + ' %s'
    return '\n'.join(format_str % tuple(row) for row in table)
4342 def _match_one(filter_part
, dct
):
4343 COMPARISON_OPERATORS
= {
4351 operator_rex
= re
.compile(r
'''(?x)\s*
4353 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
4355 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
4356 (?P<quote>["\'])(?P
<quotedstrval
>(?
:\\.|
(?
!(?P
=quote
)|
\\).)+?
)(?P
=quote
)|
4357 (?P
<strval
>(?
![0-9.])[a
-z0
-9A
-Z
]*)
4360 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
4361 m = operator_rex.search(filter_part)
4363 op = COMPARISON_OPERATORS[m.group('op')]
4364 actual_value = dct.get(m.group('key'))
4365 if (m.group('quotedstrval') is not None
4366 or m.group('strval') is not None
4367 # If the original field is a string and matching comparisonvalue is
4368 # a number we should respect the origin of the original field
4369 # and process comparison value as a string (see
4370 # https://github.com/ytdl-org/youtube-dl/issues/11082).
4371 or actual_value is not None and m.group('intval') is not None
4372 and isinstance(actual_value, compat_str)):
4373 if m.group('op') not in ('=', '!='):
4375 'Operator %s does not support string values!' % m.group('op'))
4376 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
4377 quote = m.group('quote')
4378 if quote is not None:
4379 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
4382 comparison_value = int(m.group('intval'))
4384 comparison_value = parse_filesize(m.group('intval'))
4385 if comparison_value is None:
4386 comparison_value = parse_filesize(m.group('intval') + 'B')
4387 if comparison_value is None:
4389 'Invalid integer value %r in filter part %r' % (
4390 m.group('intval'), filter_part))
4391 if actual_value is None:
4392 return m.group('none_inclusive')
4393 return op(actual_value, comparison_value)
4396 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
4397 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
4399 operator_rex = re.compile(r'''(?x
)\s
*
4400 (?P
<op
>%s)\s
*(?P
<key
>[a
-z_
]+)
4402 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
4403 m = operator_rex.search(filter_part)
4405 op = UNARY_OPERATORS[m.group('op')]
4406 actual_value = dct.get(m.group('key'))
4407 return op(actual_value)
4409 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&' joins sub-filters; all of them must hold.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callable: returns None when the video passes,
    otherwise a human-readable skip reason."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('12.3s' or 'HH:MM:SS.mmm') to seconds;
    returns None when the expression is empty or unrecognized."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        # A trailing ':frames' component is treated as a decimal fraction.
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a (non-negative) duration in seconds as an SRT timecode HH:MM:SS,mmm."""
    return '%02d:%02d:%02d,%03d' % (
        seconds / 3600,
        (seconds % 3600) / 60,
        seconds % 60,
        (seconds % 1) * 1000)
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    # Older TTAF namespaces are rewritten to the modern TTML ones so a single
    # set of XPath expressions works for all of them.
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    class TTMLPElementParser(object):
        # NOTE(review): these are intentionally class attributes, matching the
        # original implementation; a fresh parser instance is created per <p>.
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already applied by an enclosing element.
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance; repeat while a parent style was not yet seen.
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style set on <body> or <div> becomes the document default.
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
def cli_option(params, command_option, param):
    """Return ['<option>', <value>] when *param* is present in *params*,
    otherwise an empty list."""
    param = params.get(param)
    if param:
        param = compat_str(param)
    return [command_option, param] if param is not None else []
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean option as CLI args; '<opt><sep><val>' when a separator
    is given, otherwise ['<opt>', '<val>']. Missing option yields []."""
    param = params.get(param)
    if param is None:
        return []
    assert isinstance(param, bool)
    rendered = true_value if param else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit a bare flag when the stored value equals *expected_value*."""
    return [command_option] if params.get(param) == expected_value else []
def cli_configuration_args(params, param, default=None):
    """Return the extra-args list stored under *param* in *params*.

    Falls back to *default* when the option is absent; when *default* is not
    given, a fresh empty list is returned.  The previous signature used a
    mutable default (``default=[]``) shared across all calls — a caller
    mutating the returned list would have corrupted every later call — so a
    ``None`` sentinel is used instead (backward compatible for callers).
    """
    ex_args = params.get(param)
    if ex_args is None:
        return [] if default is None else default
    assert isinstance(ex_args, list)
    return ex_args
4640 class ISO639Utils(object):
4641 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4700 'iw': 'heb', # Replaced by he in 1989 revision
4710 'in': 'ind', # Replaced by id in 1989 revision
4825 'ji': 'yid', # Replaced by yi in 1989 revision
4833 def short2long(cls, code):
4834 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4835 return cls._lang_map.get(code[:2])
4838 def long2short(cls, code):
4839 """Convert language code from ISO 639-2/T to ISO 639-1"""
4840 for short_name, long_name in cls._lang_map.items():
4841 if long_name == code:
4845 class ISO3166Utils(object):
4846 # From http://data.okfn.org/data/core/country-list
4848 'AF': 'Afghanistan',
4849 'AX': 'Åland Islands',
4852 'AS': 'American Samoa',
4857 'AG': 'Antigua and Barbuda',
4874 'BO': 'Bolivia, Plurinational State of',
4875 'BQ': 'Bonaire, Sint Eustatius and Saba',
4876 'BA': 'Bosnia and Herzegovina',
4878 'BV': 'Bouvet Island',
4880 'IO': 'British Indian Ocean Territory',
4881 'BN': 'Brunei Darussalam',
4883 'BF': 'Burkina Faso',
4889 'KY': 'Cayman Islands',
4890 'CF': 'Central African Republic',
4894 'CX': 'Christmas Island',
4895 'CC': 'Cocos (Keeling) Islands',
4899 'CD': 'Congo, the Democratic Republic of the',
4900 'CK': 'Cook Islands',
4902 'CI': 'Côte d\'Ivoire',
4907 'CZ': 'Czech Republic',
4911 'DO': 'Dominican Republic',
4914 'SV': 'El Salvador',
4915 'GQ': 'Equatorial Guinea',
4919 'FK': 'Falkland Islands (Malvinas)',
4920 'FO': 'Faroe Islands',
4924 'GF': 'French Guiana',
4925 'PF': 'French Polynesia',
4926 'TF': 'French Southern Territories',
4941 'GW': 'Guinea-Bissau',
4944 'HM': 'Heard Island and McDonald Islands',
4945 'VA': 'Holy See (Vatican City State)',
4952 'IR': 'Iran, Islamic Republic of',
4955 'IM': 'Isle of Man',
4965 'KP': 'Korea, Democratic People\'s Republic of',
4966 'KR': 'Korea, Republic of',
4969 'LA': 'Lao People\'s Democratic Republic',
4975 'LI': 'Liechtenstein',
4979 'MK': 'Macedonia, the Former Yugoslav Republic of',
4986 'MH': 'Marshall Islands',
4992 'FM': 'Micronesia, Federated States of',
4993 'MD': 'Moldova, Republic of',
5004 'NL': 'Netherlands',
5005 'NC': 'New Caledonia',
5006 'NZ': 'New Zealand',
5011 'NF': 'Norfolk Island',
5012 'MP': 'Northern Mariana Islands',
5017 'PS': 'Palestine, State of',
5019 'PG': 'Papua New Guinea',
5022 'PH': 'Philippines',
5026 'PR': 'Puerto Rico',
5030 'RU': 'Russian Federation',
5032 'BL': 'Saint Barthélemy',
5033 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
5034 'KN': 'Saint Kitts and Nevis',
5035 'LC': 'Saint Lucia',
5036 'MF': 'Saint Martin (French part)',
5037 'PM': 'Saint Pierre and Miquelon',
5038 'VC': 'Saint Vincent and the Grenadines',
5041 'ST': 'Sao Tome and Principe',
5042 'SA': 'Saudi Arabia',
5046 'SL': 'Sierra Leone',
5048 'SX': 'Sint Maarten (Dutch part)',
5051 'SB': 'Solomon Islands',
5053 'ZA': 'South Africa',
5054 'GS': 'South Georgia and the South Sandwich Islands',
5055 'SS': 'South Sudan',
5060 'SJ': 'Svalbard and Jan Mayen',
5063 'CH': 'Switzerland',
5064 'SY': 'Syrian Arab Republic',
5065 'TW': 'Taiwan, Province of China',
5067 'TZ': 'Tanzania, United Republic of',
5069 'TL': 'Timor-Leste',
5073 'TT': 'Trinidad and Tobago',
5076 'TM': 'Turkmenistan',
5077 'TC': 'Turks and Caicos Islands',
5081 'AE': 'United Arab Emirates',
5082 'GB': 'United Kingdom',
5083 'US': 'United States',
5084 'UM': 'United States Minor Outlying Islands',
5088 'VE': 'Venezuela, Bolivarian Republic of',
5090 'VG': 'Virgin Islands, British',
5091 'VI': 'Virgin Islands, U.S.',
5092 'WF': 'Wallis and Futuna',
5093 'EH': 'Western Sahara',
5100 def short2full(cls, code):
5101 """Convert an ISO 3166-2 country code to the corresponding full name"""
5102 return cls._country_map.get(code.upper())
5105 class GeoUtils(object):
5106 # Major IPv4 address blocks per country
5108 'AD': '46.172.224.0/19',
5109 'AE': '94.200.0.0/13',
5110 'AF': '149.54.0.0/17',
5111 'AG': '209.59.64.0/18',
5112 'AI': '204.14.248.0/21',
5113 'AL': '46.99.0.0/16',
5114 'AM': '46.70.0.0/15',
5115 'AO': '105.168.0.0/13',
5116 'AP': '182.50.184.0/21',
5117 'AQ': '23.154.160.0/24',
5118 'AR': '181.0.0.0/12',
5119 'AS': '202.70.112.0/20',
5120 'AT': '77.116.0.0/14',
5121 'AU': '1.128.0.0/11',
5122 'AW': '181.41.0.0/18',
5123 'AX': '185.217.4.0/22',
5124 'AZ': '5.197.0.0/16',
5125 'BA': '31.176.128.0/17',
5126 'BB': '65.48.128.0/17',
5127 'BD': '114.130.0.0/16',
5129 'BF': '102.178.0.0/15',
5130 'BG': '95.42.0.0/15',
5131 'BH': '37.131.0.0/17',
5132 'BI': '154.117.192.0/18',
5133 'BJ': '137.255.0.0/16',
5134 'BL': '185.212.72.0/23',
5135 'BM': '196.12.64.0/18',
5136 'BN': '156.31.0.0/16',
5137 'BO': '161.56.0.0/16',
5138 'BQ': '161.0.80.0/20',
5139 'BR': '191.128.0.0/12',
5140 'BS': '24.51.64.0/18',
5141 'BT': '119.2.96.0/19',
5142 'BW': '168.167.0.0/16',
5143 'BY': '178.120.0.0/13',
5144 'BZ': '179.42.192.0/18',
5145 'CA': '99.224.0.0/11',
5146 'CD': '41.243.0.0/16',
5147 'CF': '197.242.176.0/21',
5148 'CG': '160.113.0.0/16',
5149 'CH': '85.0.0.0/13',
5150 'CI': '102.136.0.0/14',
5151 'CK': '202.65.32.0/19',
5152 'CL': '152.172.0.0/14',
5153 'CM': '102.244.0.0/14',
5154 'CN': '36.128.0.0/10',
5155 'CO': '181.240.0.0/12',
5156 'CR': '201.192.0.0/12',
5157 'CU': '152.206.0.0/15',
5158 'CV': '165.90.96.0/19',
5159 'CW': '190.88.128.0/17',
5160 'CY': '31.153.0.0/16',
5161 'CZ': '88.100.0.0/14',
5163 'DJ': '197.241.0.0/17',
5164 'DK': '87.48.0.0/12',
5165 'DM': '192.243.48.0/20',
5166 'DO': '152.166.0.0/15',
5167 'DZ': '41.96.0.0/12',
5168 'EC': '186.68.0.0/15',
5169 'EE': '90.190.0.0/15',
5170 'EG': '156.160.0.0/11',
5171 'ER': '196.200.96.0/20',
5172 'ES': '88.0.0.0/11',
5173 'ET': '196.188.0.0/14',
5174 'EU': '2.16.0.0/13',
5175 'FI': '91.152.0.0/13',
5176 'FJ': '144.120.0.0/16',
5177 'FK': '80.73.208.0/21',
5178 'FM': '119.252.112.0/20',
5179 'FO': '88.85.32.0/19',
5181 'GA': '41.158.0.0/15',
5183 'GD': '74.122.88.0/21',
5184 'GE': '31.146.0.0/16',
5185 'GF': '161.22.64.0/18',
5186 'GG': '62.68.160.0/19',
5187 'GH': '154.160.0.0/12',
5188 'GI': '95.164.0.0/16',
5189 'GL': '88.83.0.0/19',
5190 'GM': '160.182.0.0/15',
5191 'GN': '197.149.192.0/18',
5192 'GP': '104.250.0.0/19',
5193 'GQ': '105.235.224.0/20',
5194 'GR': '94.64.0.0/13',
5195 'GT': '168.234.0.0/16',
5196 'GU': '168.123.0.0/16',
5197 'GW': '197.214.80.0/20',
5198 'GY': '181.41.64.0/18',
5199 'HK': '113.252.0.0/14',
5200 'HN': '181.210.0.0/16',
5201 'HR': '93.136.0.0/13',
5202 'HT': '148.102.128.0/17',
5203 'HU': '84.0.0.0/14',
5204 'ID': '39.192.0.0/10',
5205 'IE': '87.32.0.0/12',
5206 'IL': '79.176.0.0/13',
5207 'IM': '5.62.80.0/20',
5208 'IN': '117.192.0.0/10',
5209 'IO': '203.83.48.0/21',
5210 'IQ': '37.236.0.0/14',
5211 'IR': '2.176.0.0/12',
5212 'IS': '82.221.0.0/16',
5213 'IT': '79.0.0.0/10',
5214 'JE': '87.244.64.0/18',
5215 'JM': '72.27.0.0/17',
5216 'JO': '176.29.0.0/16',
5217 'JP': '133.0.0.0/8',
5218 'KE': '105.48.0.0/12',
5219 'KG': '158.181.128.0/17',
5220 'KH': '36.37.128.0/17',
5221 'KI': '103.25.140.0/22',
5222 'KM': '197.255.224.0/20',
5223 'KN': '198.167.192.0/19',
5224 'KP': '175.45.176.0/22',
5225 'KR': '175.192.0.0/10',
5226 'KW': '37.36.0.0/14',
5227 'KY': '64.96.0.0/15',
5228 'KZ': '2.72.0.0/13',
5229 'LA': '115.84.64.0/18',
5230 'LB': '178.135.0.0/16',
5231 'LC': '24.92.144.0/20',
5232 'LI': '82.117.0.0/19',
5233 'LK': '112.134.0.0/15',
5234 'LR': '102.183.0.0/16',
5235 'LS': '129.232.0.0/17',
5236 'LT': '78.56.0.0/13',
5237 'LU': '188.42.0.0/16',
5238 'LV': '46.109.0.0/16',
5239 'LY': '41.252.0.0/14',
5240 'MA': '105.128.0.0/11',
5241 'MC': '88.209.64.0/18',
5242 'MD': '37.246.0.0/16',
5243 'ME': '178.175.0.0/17',
5244 'MF': '74.112.232.0/21',
5245 'MG': '154.126.0.0/17',
5246 'MH': '117.103.88.0/21',
5247 'MK': '77.28.0.0/15',
5248 'ML': '154.118.128.0/18',
5249 'MM': '37.111.0.0/17',
5250 'MN': '49.0.128.0/17',
5251 'MO': '60.246.0.0/16',
5252 'MP': '202.88.64.0/20',
5253 'MQ': '109.203.224.0/19',
5254 'MR': '41.188.64.0/18',
5255 'MS': '208.90.112.0/22',
5256 'MT': '46.11.0.0/16',
5257 'MU': '105.16.0.0/12',
5258 'MV': '27.114.128.0/18',
5259 'MW': '102.70.0.0/15',
5260 'MX': '187.192.0.0/11',
5261 'MY': '175.136.0.0/13',
5262 'MZ': '197.218.0.0/15',
5263 'NA': '41.182.0.0/16',
5264 'NC': '101.101.0.0/18',
5265 'NE': '197.214.0.0/18',
5266 'NF': '203.17.240.0/22',
5267 'NG': '105.112.0.0/12',
5268 'NI': '186.76.0.0/15',
5269 'NL': '145.96.0.0/11',
5270 'NO': '84.208.0.0/13',
5271 'NP': '36.252.0.0/15',
5272 'NR': '203.98.224.0/19',
5273 'NU': '49.156.48.0/22',
5274 'NZ': '49.224.0.0/14',
5275 'OM': '5.36.0.0/15',
5276 'PA': '186.72.0.0/15',
5277 'PE': '186.160.0.0/14',
5278 'PF': '123.50.64.0/18',
5279 'PG': '124.240.192.0/19',
5280 'PH': '49.144.0.0/13',
5281 'PK': '39.32.0.0/11',
5282 'PL': '83.0.0.0/11',
5283 'PM': '70.36.0.0/20',
5284 'PR': '66.50.0.0/16',
5285 'PS': '188.161.0.0/16',
5286 'PT': '85.240.0.0/13',
5287 'PW': '202.124.224.0/20',
5288 'PY': '181.120.0.0/14',
5289 'QA': '37.210.0.0/15',
5290 'RE': '102.35.0.0/16',
5291 'RO': '79.112.0.0/13',
5292 'RS': '93.86.0.0/15',
5293 'RU': '5.136.0.0/13',
5294 'RW': '41.186.0.0/16',
5295 'SA': '188.48.0.0/13',
5296 'SB': '202.1.160.0/19',
5297 'SC': '154.192.0.0/11',
5298 'SD': '102.120.0.0/13',
5299 'SE': '78.64.0.0/12',
5300 'SG': '8.128.0.0/10',
5301 'SI': '188.196.0.0/14',
5302 'SK': '78.98.0.0/15',
5303 'SL': '102.143.0.0/17',
5304 'SM': '89.186.32.0/19',
5305 'SN': '41.82.0.0/15',
5306 'SO': '154.115.192.0/18',
5307 'SR': '186.179.128.0/17',
5308 'SS': '105.235.208.0/21',
5309 'ST': '197.159.160.0/19',
5310 'SV': '168.243.0.0/16',
5311 'SX': '190.102.0.0/20',
5313 'SZ': '41.84.224.0/19',
5314 'TC': '65.255.48.0/20',
5315 'TD': '154.68.128.0/19',
5316 'TG': '196.168.0.0/14',
5317 'TH': '171.96.0.0/13',
5318 'TJ': '85.9.128.0/18',
5319 'TK': '27.96.24.0/21',
5320 'TL': '180.189.160.0/20',
5321 'TM': '95.85.96.0/19',
5322 'TN': '197.0.0.0/11',
5323 'TO': '175.176.144.0/21',
5324 'TR': '78.160.0.0/11',
5325 'TT': '186.44.0.0/15',
5326 'TV': '202.2.96.0/19',
5327 'TW': '120.96.0.0/11',
5328 'TZ': '156.156.0.0/14',
5329 'UA': '37.52.0.0/14',
5330 'UG': '102.80.0.0/13',
5332 'UY': '167.56.0.0/13',
5333 'UZ': '84.54.64.0/18',
5334 'VA': '212.77.0.0/19',
5335 'VC': '207.191.240.0/21',
5336 'VE': '186.88.0.0/13',
5337 'VG': '66.81.192.0/20',
5338 'VI': '146.226.0.0/16',
5339 'VN': '14.160.0.0/11',
5340 'VU': '202.80.32.0/20',
5341 'WF': '117.20.32.0/21',
5342 'WS': '202.4.32.0/19',
5343 'YE': '134.35.0.0/16',
5344 'YT': '41.242.116.0/22',
5345 'ZA': '41.0.0.0/11',
5346 'ZM': '102.144.0.0/13',
5347 'ZW': '102.177.192.0/18',
5351 def random_ipv4(cls, code_or_block):
5352 if len(code_or_block) == 2:
5353 block = cls._country_ip_map.get(code_or_block.upper())
5357 block = code_or_block
5358 addr, preflen = block.split('/')
5359 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
5360 addr_max = addr_min | (0xffffffff >> int(preflen))
5361 return compat_str(socket.inet_ntoa(
5362 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler variant that honors a per-request 'Ytdl-request-proxy'
    header over the globally configured proxies."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dlc's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5390 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
5391 # released into Public Domain
5392 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # after much testing, this algorithm was deemed to be the fastest
    s = b''
    n = int(n)
    while n > 0:
        s = compat_struct_pack('>I', n & 0xffffffff) + s
        n = n >> 32
    # strip off leading zeros
    for i in range(len(s)):
        if s[i] != b'\000'[0]:
            break
    else:
        # only happens when n == 0
        s = b'\000'
        i = 0
    s = s[i:]
    # add back some pad bytes. this could be done more efficiently w.r.t. the
    # de-padding being done above, but sigh...
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    acc = 0
    length = len(s)
    # Left-pad to a multiple of 4 so the input can be consumed in 32-bit words.
    if length % 4:
        extra = (4 - length % 4)
        s = b'\000' * extra + s
        length = length + extra
    for i in range(0, length, 4):
        acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
    return acc
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The bytes are interpreted little-endian (hence the [::-1] reversal).
    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # PKCS#1 v1.5 block type 2: the padding string PS must consist of
    # strictly non-zero octets, because the single 0x00 byte is what marks
    # the end of the padding.  The previous randint(0, 254) could emit a
    # stray zero inside PS and corrupt the block boundary on decryption.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer *num* in base *n* using *table* as the
    digit alphabet (default: 0-9a-zA-Z prefix of length n)."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret
def decode_packed_codes(code):
    """Unpack JavaScript obfuscated with the common p.a.c.k.e.r scheme."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        # Each symbol index, written in the packer's base, is a token in the code.
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
def caesar(s, alphabet, shift):
    """Shift every character of *s* that occurs in *alphabet* by *shift*
    positions (wrapping around); other characters pass through unchanged."""
    if shift == 0:
        return s
    size = len(alphabet)
    return ''.join(
        alphabet[(alphabet.index(c) + shift) % size] if c in alphabet else c
        for c in s)
def rot47(s):
    """Apply the ROT47 cipher (caesar shift of 47 over printable ASCII 33-126)."""
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=val,KEY2="quoted"') into a dict,
    stripping double quotes from quoted values."""
    info = {}
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]
        info[key] = val
    return info
def urshift(val, n):
    """Unsigned (logical) 32-bit right shift, emulating JavaScript's >>>."""
    if val >= 0:
        return val >> n
    return (val + 0x100000000) >> n
5535 # Based on png2str() written by @gdkchan and improved by @yokrysty
5536 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    # Reference: https://www.w3.org/TR/PNG/
    """Decode a (24-bit RGB, non-interlaced) PNG into (width, height, pixels),
    where pixels is a list of rows of raw byte values."""
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Walk the chunk stream: 4-byte length, 4-byte type, data, 4-byte CRC.
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Each scanline is prefixed by a one-byte filter type.
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    """Write the extended attribute *key* = *value* (bytes) on *path*.

    Tries, in order: the pyxattr/xattr Python modules, NTFS Alternate Data
    Streams on Windows, and the setfattr/xattr command-line tools.
    Raises XAttrMetadataError on write failure and XAttrUnavailableError when
    no usable backend exists.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dlc requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)]
                       + [encodeArgument(o) for o in opts]
                       + [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
def random_birthday(year_field, month_field, day_field):
    """Return a dict with a random birth date (1950-1995) as strings under
    the caller-supplied field names."""
    start_date = datetime.date(1950, 1, 1)
    end_date = datetime.date(1995, 12, 31)
    offset = random.randint(0, (end_date - start_date).days)
    random_date = start_date + datetime.timedelta(offset)
    return {
        year_field: str(random_date.year),
        month_field: str(random_date.month),
        day_field: str(random_date.day),
    }
5738 # Templates for internet shortcut files, which are plain text files.
5739 DOT_URL_LINK_TEMPLATE
= '''
5744 DOT_WEBLOC_LINK_TEMPLATE
= '''
5745 <?xml version="1.0" encoding="UTF-8"?>
5746 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5747 <plist version="1.0">
5750 \t<string>%(url)s</string>
5755 DOT_DESKTOP_LINK_TEMPLATE
= '''
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    parts = compat_urllib_parse_urlparse(iri)

    if '[' in parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # Each `safe` value below lists the characters that must NOT be
    # percent-encoded. Everything else except letters, digits and '_.-'
    # gets percent-encoded as UTF-8; existing percent-escapes are kept.

    netloc = ''
    if parts.username:
        netloc += compat_urllib_parse_quote(parts.username, safe=r"!$%&'()*+,~")
        if parts.password is not None:
            netloc += ':' + compat_urllib_parse_quote(parts.password, safe=r"!$%&'()*+,~")
        netloc += '@'

    # The 'idna' codec (Punycode) turns a Unicode hostname into pure ASCII.
    netloc += parts.hostname.encode('idna').decode('utf-8')
    if parts.port is not None and parts.port != 80:
        netloc += ':' + str(parts.port)

    return compat_urllib_parse_urlunparse(
        (parts.scheme,
            netloc,

            compat_urllib_parse_quote_plus(parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Params are a legacy way of handling parameters; `safe` chosen conservatively.
            compat_urllib_parse_quote_plus(parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # NOTE(review): the spec does not explicitly call out the query component; `safe` is a best effort.
            compat_urllib_parse_quote_plus(parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            compat_urllib_parse_quote_plus(parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
def to_high_limit_path(path):
    """Return *path* in a form that sidesteps Windows' MAX_PATH limit.

    On win32/cygwin the absolute path is prefixed with '\\\\?\\', which tells
    the Windows API to skip the MAX_PATH check (individual path segments may
    still be length-limited). On other platforms the path is returned as-is.
    """
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # '\\?\' extended-length prefix; equivalent to r'\\?\ '.rstrip()
    # (a raw string literal cannot end in a backslash).
    return '\\\\?\\' + os.path.abspath(path)
def format_field(obj, field, template='%s', ignore=(None, ''), default='', func=None):
    """Look up *field* in dict-like *obj* and render it through *template*.

    The value is fetched with ``obj.get(field, default)``. If it is not one
    of the *ignore* sentinels, *func* (when given) transforms it first; if
    the value (before or after *func*) lands in *ignore*, *default* is
    returned instead of the formatted string.
    """
    value = obj.get(field, default)
    if func and value not in ignore:
        value = func(value)
    # Re-check after func: the transform may itself yield an ignored value.
    if value in ignore:
        return default
    return template % value