4 from __future__
import unicode_literals
36 import xml
.etree
.ElementTree
40 compat_HTMLParseError
,
45 compat_ctypes_WINFUNCTYPE
,
46 compat_etree_fromstring
,
49 compat_html_entities_html5
,
61 compat_urllib_parse_urlencode
,
62 compat_urllib_parse_urlparse
,
63 compat_urllib_parse_urlunparse
,
64 compat_urllib_parse_quote
,
65 compat_urllib_parse_quote_plus
,
66 compat_urllib_parse_unquote_plus
,
67 compat_urllib_request
,
def register_socks_protocols():
    """Make urlsplit() treat the SOCKS proxy schemes as having a netloc.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose scheme is not listed in
    urlparse.uses_netloc are not handled correctly, so the SOCKS schemes
    are appended to that registry here (idempotently).
    """
    registry = compat_urlparse.uses_netloc
    for socks_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if socks_scheme not in registry:
            registry.append(socks_scheme)
# This is not clearly defined otherwise
# (older Pythons expose no public name for the type of a compiled regex;
# presumably this is used for isinstance() checks elsewhere — confirm
# against callers)
compiled_regex_type = type(re.compile(''))
91 def random_user_agent():
92 _USER_AGENT_TPL
= 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
1671 return _USER_AGENT_TPL
% random
.choice(_CHROME_VERSIONS
)
1675 'User-Agent': random_user_agent(),
1676 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
1677 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
1678 'Accept-Encoding': 'gzip, deflate',
1679 'Accept-Language': 'en-us,en;q=0.5',
1684 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# Sentinel distinguishing "no default supplied" from an explicit default of
# None; the xpath_* helpers below compare against it with `is not NO_DEFAULT`.
NO_DEFAULT = object()
# English month names in calendar order; also reused as the 'en' entry of
# the per-language month-name table defined below.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
1695 'en': ENGLISH_MONTH_NAMES
,
1697 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
1698 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
1701 KNOWN_EXTENSIONS
= (
1702 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
1703 'flv', 'f4v', 'f4a', 'f4b',
1704 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
1705 'mkv', 'mka', 'mk3d',
1708 'asf', 'wmv', 'wma',
1714 'f4f', 'f4m', 'm3u8', 'smil')
# needed for sanitizing filenames in restricted mode
# Maps each accented/special character to an ASCII transliteration. zip()
# pairs the characters of the first string with the flat sequence produced by
# itertools.chain(); multi-character replacements ('AE', 'OE', 'TH', 'ss', …)
# are wrapped in one-element lists so chain() yields them whole instead of
# splitting them into single letters.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
1742 '%Y/%m/%d %H:%M:%S',
1744 '%Y-%m-%d %H:%M:%S',
1745 '%Y-%m-%d %H:%M:%S.%f',
1748 '%Y-%m-%dT%H:%M:%SZ',
1749 '%Y-%m-%dT%H:%M:%S.%fZ',
1750 '%Y-%m-%dT%H:%M:%S.%f0Z',
1751 '%Y-%m-%dT%H:%M:%S',
1752 '%Y-%m-%dT%H:%M:%S.%f',
1754 '%b %d %Y at %H:%M',
1755 '%b %d %Y at %H:%M:%S',
1756 '%B %d %Y at %H:%M',
1757 '%B %d %Y at %H:%M:%S',
1760 DATE_FORMATS_DAY_FIRST
= list(DATE_FORMATS
)
1761 DATE_FORMATS_DAY_FIRST
.extend([
1767 '%d/%m/%Y %H:%M:%S',
1770 DATE_FORMATS_MONTH_FIRST
= list(DATE_FORMATS
)
1771 DATE_FORMATS_MONTH_FIRST
.extend([
1776 '%m/%d/%Y %H:%M:%S',
# Matches the trailing call of "packed" (eval-compressed) JavaScript,
# capturing the payload, radix, symbol count and the '|'-separated symbol
# table — apparently the Dean Edwards p.a.c.k.e.r. format; confirm against
# the decoder that consumes these groups.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element; group 1 captures the
# (optional) quote around the type value and \1 requires the same closing
# quote, while the JSON payload lands in the named group 'json_ld'.
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
1783 def preferredencoding():
1784 """Get preferred encoding.
1786 Returns the best encoding scheme for the system, based on
1787 locale.getpreferredencoding() and some further tweaks.
1790 pref = locale.getpreferredencoding()
1798 def write_json_file(obj, fn):
1799 """ Encode obj as JSON and write it to fn, atomically if possible """
1801 fn = encodeFilename(fn)
1802 if sys.version_info < (3, 0) and sys.platform != 'win32
':
1803 encoding = get_filesystem_encoding()
1804 # os.path.basename returns a bytes object, but NamedTemporaryFile
1805 # will fail if the filename contains non ascii characters unless we
1806 # use a unicode object
1807 path_basename = lambda f: os.path.basename(fn).decode(encoding)
1808 # the same for os.path.dirname
1809 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
1811 path_basename = os.path.basename
1812 path_dirname = os.path.dirname
1816 'prefix
': path_basename(fn) + '.',
1817 'dir': path_dirname(fn),
1821 # In Python 2.x, json.dump expects a bytestream.
1822 # In Python 3.x, it writes to a character stream
1823 if sys.version_info < (3, 0):
1828 'encoding
': 'utf
-8',
1831 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
1836 if sys.platform == 'win32
':
1837 # Need to remove existing file on Windows, else os.rename raises
1838 # WindowsError or FileExistsError.
1846 os.chmod(tf.name, 0o666 & ~mask)
1849 os.rename(tf.name, fn)
1858 if sys.version_info >= (2, 7):
1859 def find_xpath_attr(node, xpath, key, val=None):
1860 """ Find the xpath xpath[@key=val] """
1861 assert re.match(r'^
[a
-zA
-Z_
-]+$
', key)
1862 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
1863 return node.find(expr)
1865 def find_xpath_attr(node, xpath, key, val=None):
1866 for f in node.findall(compat_xpath(xpath)):
1867 if key not in f.attrib:
1869 if val is None or f.attrib.get(key) == val:
1873 # On python2.6 the xml.etree.ElementTree.Element methods don't support
1874 # the namespace parameter
1877 def xpath_with_ns(path
, ns_map
):
1878 components
= [c
.split(':') for c
in path
.split('/')]
1880 for c
in components
:
1882 replaced
.append(c
[0])
1885 replaced
.append('{%s}%s' % (ns_map
[ns
], tag
))
1886 return '/'.join(replaced
)
1889 def xpath_element(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1890 def _find_xpath(xpath
):
1891 return node
.find(compat_xpath(xpath
))
1893 if isinstance(xpath
, (str, compat_str
)):
1894 n
= _find_xpath(xpath
)
1902 if default
is not NO_DEFAULT
:
1905 name
= xpath
if name
is None else name
1906 raise ExtractorError('Could not find XML element %s' % name
)
1912 def xpath_text(node
, xpath
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1913 n
= xpath_element(node
, xpath
, name
, fatal
=fatal
, default
=default
)
1914 if n
is None or n
== default
:
1917 if default
is not NO_DEFAULT
:
1920 name
= xpath
if name
is None else name
1921 raise ExtractorError('Could not find XML element\'s text %s' % name
)
1927 def xpath_attr(node
, xpath
, key
, name
=None, fatal
=False, default
=NO_DEFAULT
):
1928 n
= find_xpath_attr(node
, xpath
, key
)
1930 if default
is not NO_DEFAULT
:
1933 name
= '%s[@%s]' % (xpath
, key
) if name
is None else name
1934 raise ExtractorError('Could not find XML attribute %s' % name
)
1937 return n
.attrib
[key
]
def get_element_by_id(id, html):
    """Return the inner content of the first tag in *html* whose id equals *id*.

    Thin convenience wrapper around get_element_by_attribute() with the
    attribute name fixed to 'id'.
    """
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag carrying *class_name*, or None.

    Delegates to get_elements_by_class() and keeps only the first hit.
    """
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag whose *attribute* matches *value*.

    Returns None when nothing matches. *escape_value* is forwarded to
    get_elements_by_attribute() and controls regex-escaping of *value*.
    """
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    return next(iter(matches), None)
def get_elements_by_class(class_name, html):
    """Return the contents of all tags whose class attribute contains *class_name*.

    Builds a word-boundary pattern so the class name must appear as a whole
    token inside the quoted class list, then defers to
    get_elements_by_attribute() with escaping disabled (the pattern is
    already a regex).
    """
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_pattern, html, escape_value=False)
1963 def get_elements_by_attribute(attribute, value, html, escape_value=True):
1964 """Return the content of the tag with the specified attribute in the passed HTML document"""
1966 value = re.escape(value) if escape_value else value
1969 for m in re.finditer(r'''(?xs)
1971 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^
']*'|
))*?
1973 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^
"]*"|
='[^']*'|))*?
1977 ''' % (re.escape(attribute), value), html):
1978 res = m.group('content
')
1980 if res.startswith('"') or res.startswith("'"):
1983 retlist.append(unescapeHTML(res))
1988 class HTMLAttributeParser(compat_HTMLParser):
1989 """Trivial HTML parser to gather the attributes for a single element"""
1993 compat_HTMLParser.__init__(self)
1995 def handle_starttag(self, tag, attrs):
1996 self.attrs = dict(attrs)
1999 def extract_attributes(html_element):
2000 """Given a string for an HTML element such as
2002 a="foo" B="bar" c="&98;az" d=boz
2003 empty= noval entity="&"
2006 Decode and return a dictionary of attributes.
2008 'a
': 'foo
', 'b
': 'bar
', c: 'baz
', d: 'boz
',
2009 'empty
': '', 'noval
': None, 'entity
': '&',
2010 'sq
': '"', 'dq': '\''
2012 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
2013 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
2015 parser = HTMLAttributeParser()
2017 parser.feed(html_element)
2019 # Older Python may throw HTMLParseError in case of malformed HTML
2020 except compat_HTMLParseError:
2025 def clean_html(html):
2026 """Clean an HTML snippet into a readable string"""
2028 if html is None: # Convenience for sanitizing descriptions etc.
2032 html = html.replace('\n', ' ')
2033 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
2034 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
2036 html = re.sub('<.*?>', '', html)
2037 # Replace html entities
2038 html = unescapeHTML(html)
2042 def sanitize_open(filename, open_mode):
2043 """Try to open the given filename, and slightly tweak it if this fails.
2045 Attempts to open the given filename. If this fails, it tries to change
2046 the filename slightly, step by step, until it's either able to open it
2047 or it fails and raises a final exception, like the standard open()
2050 It returns the tuple (stream, definitive_file_name).
2054 if sys.platform == 'win32':
2056 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
2057 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
2058 stream = open(encodeFilename(filename), open_mode)
2059 return (stream, filename)
2060 except (IOError, OSError) as err:
2061 if err.errno in (errno.EACCES,):
2064 # In case of error, try to remove win32 forbidden chars
2065 alt_filename = sanitize_path(filename)
2066 if alt_filename == filename:
2069 # An exception here should be caught in the caller
2070 stream = open(encodeFilename(alt_filename), open_mode)
2071 return (stream, alt_filename)
2074 def timeconvert(timestr):
2075 """Convert RFC 2822 defined time string into system timestamp"""
2077 timetuple = email.utils.parsedate_tz(timestr)
2078 if timetuple is not None:
2079 timestamp = email.utils.mktime_tz(timetuple)
2083 def sanitize_filename(s, restricted=False, is_id=False):
2084 """Sanitizes a string so it could be used as part of a filename.
2085 If restricted is set, use a stricter subset of allowed characters.
2086 Set is_id if this is not an arbitrary string, but an ID that should be kept
2089 def replace_insane(char):
2090 if restricted and char in ACCENT_CHARS:
2091 return ACCENT_CHARS[char]
2092 if char == '?' or ord(char) < 32 or ord(char) == 127:
2095 return '' if restricted else '\''
2097 return '_
-' if restricted else ' -'
2098 elif char in '\\/|
*<>':
2100 if restricted and (char in '!&\'()[]{}$
;`^
,#' or char.isspace()):
2102 if restricted
and ord(char
) > 127:
2107 s
= re
.sub(r
'[0-9]+(?::[0-9]+)+', lambda m
: m
.group(0).replace(':', '_'), s
)
2108 result
= ''.join(map(replace_insane
, s
))
2110 while '__' in result
:
2111 result
= result
.replace('__', '_')
2112 result
= result
.strip('_')
2113 # Common case of "Foreign band name - English song title"
2114 if restricted
and result
.startswith('-_'):
2116 if result
.startswith('-'):
2117 result
= '_' + result
[len('-'):]
2118 result
= result
.lstrip('.')
2124 def sanitize_path(s
):
2125 """Sanitizes and normalizes path on Windows"""
2126 if sys
.platform
!= 'win32':
2128 drive_or_unc
, _
= os
.path
.splitdrive(s
)
2129 if sys
.version_info
< (2, 7) and not drive_or_unc
:
2130 drive_or_unc
, _
= os
.path
.splitunc(s
)
2131 norm_path
= os
.path
.normpath(remove_start(s
, drive_or_unc
)).split(os
.path
.sep
)
2135 path_part
if path_part
in ['.', '..'] else re
.sub(r
'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part
)
2136 for path_part
in norm_path
]
2138 sanitized_path
.insert(0, drive_or_unc
+ os
.path
.sep
)
2139 return os
.path
.join(*sanitized_path
)
2142 def sanitize_url(url
):
2143 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
2144 # the number of unwanted failures due to missing protocol
2145 if url
.startswith('//'):
2146 return 'http:%s' % url
2147 # Fix some common typos seen so far
2149 # https://github.com/ytdl-org/youtube-dl/issues/15649
2150 (r
'^httpss://', r
'https://'),
2151 # https://bx1.be/lives/direct-tv/
2152 (r
'^rmtp([es]?)://', r
'rtmp\1://'),
2154 for mistake
, fixup
in COMMON_TYPOS
:
2155 if re
.match(mistake
, url
):
2156 return re
.sub(mistake
, fixup
, url
)
def sanitized_Request(url, *args, **kwargs):
    """Build a compat_urllib_request.Request for the sanitized form of *url*.

    Extra positional/keyword arguments are passed through to Request().
    """
    clean_url = sanitize_url(url)
    return compat_urllib_request.Request(clean_url, *args, **kwargs)
2165 """Expand shell variables and ~"""
2166 return os
.path
.expandvars(compat_expanduser(s
))
2169 def orderedSet(iterable
):
2170 """ Remove all duplicates from the input iterable """
2178 def _htmlentity_transform(entity_with_semicolon
):
2179 """Transforms an HTML entity to a character."""
2180 entity
= entity_with_semicolon
[:-1]
2182 # Known non-numeric HTML entity
2183 if entity
in compat_html_entities
.name2codepoint
:
2184 return compat_chr(compat_html_entities
.name2codepoint
[entity
])
2186 # TODO: HTML5 allows entities without a semicolon. For example,
2187 # 'Éric' should be decoded as 'Éric'.
2188 if entity_with_semicolon
in compat_html_entities_html5
:
2189 return compat_html_entities_html5
[entity_with_semicolon
]
2191 mobj
= re
.match(r
'#(x[0-9a-fA-F]+|[0-9]+)', entity
)
2192 if mobj
is not None:
2193 numstr
= mobj
.group(1)
2194 if numstr
.startswith('x'):
2196 numstr
= '0%s' % numstr
2199 # See https://github.com/ytdl-org/youtube-dl/issues/7518
2201 return compat_chr(int(numstr
, base
))
2205 # Unknown entity in name, return its literal representation
2206 return '&%s;' % entity
2209 def unescapeHTML(s
):
2212 assert type(s
) == compat_str
2215 r
'&([^&;]+;)', lambda m
: _htmlentity_transform(m
.group(1)), s
)
2218 def process_communicate_or_kill(p
, *args
, **kwargs
):
2220 return p
.communicate(*args
, **kwargs
)
2221 except BaseException
: # Including KeyboardInterrupt
2227 def get_subprocess_encoding():
2228 if sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2229 # For subprocess calls, encode with locale encoding
2230 # Refer to http://stackoverflow.com/a/9951851/35070
2231 encoding
= preferredencoding()
2233 encoding
= sys
.getfilesystemencoding()
2234 if encoding
is None:
2239 def encodeFilename(s
, for_subprocess
=False):
2241 @param s The name of the file
2244 assert type(s
) == compat_str
2246 # Python 3 has a Unicode API
2247 if sys
.version_info
>= (3, 0):
2250 # Pass '' directly to use Unicode APIs on Windows 2000 and up
2251 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
2252 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
2253 if not for_subprocess
and sys
.platform
== 'win32' and sys
.getwindowsversion()[0] >= 5:
2256 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
2257 if sys
.platform
.startswith('java'):
2260 return s
.encode(get_subprocess_encoding(), 'ignore')
2263 def decodeFilename(b
, for_subprocess
=False):
2265 if sys
.version_info
>= (3, 0):
2268 if not isinstance(b
, bytes):
2271 return b
.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument via encodeFilename(..., for_subprocess=True).

    Text input is encoded directly; byte-string input is legacy and is first
    decoded as ASCII.
    """
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeArgument(b):
    """Counterpart of encodeArgument(): decode a subprocess argument to text."""
    return decodeFilename(b, True)
2287 def decodeOption(optval
):
2290 if isinstance(optval
, bytes):
2291 optval
= optval
.decode(preferredencoding())
2293 assert isinstance(optval
, compat_str
)
2297 def formatSeconds(secs
, delim
=':'):
2299 return '%d%s%02d%s%02d' % (secs
// 3600, delim
, (secs
% 3600) // 60, delim
, secs
% 60)
2301 return '%d%s%02d' % (secs
// 60, delim
, secs
% 60)
2306 def make_HTTPS_handler(params
, **kwargs
):
2307 opts_no_check_certificate
= params
.get('nocheckcertificate', False)
2308 if hasattr(ssl
, 'create_default_context'): # Python >= 3.4 or 2.7.9
2309 context
= ssl
.create_default_context(ssl
.Purpose
.SERVER_AUTH
)
2310 if opts_no_check_certificate
:
2311 context
.check_hostname
= False
2312 context
.verify_mode
= ssl
.CERT_NONE
2314 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2317 # (create_default_context present but HTTPSHandler has no context=)
2320 if sys
.version_info
< (3, 2):
2321 return YoutubeDLHTTPSHandler(params
, **kwargs
)
2322 else: # Python < 3.4
2323 context
= ssl
.SSLContext(ssl
.PROTOCOL_TLSv1
)
2324 context
.verify_mode
= (ssl
.CERT_NONE
2325 if opts_no_check_certificate
2326 else ssl
.CERT_REQUIRED
)
2327 context
.set_default_verify_paths()
2328 return YoutubeDLHTTPSHandler(params
, context
=context
, **kwargs
)
2331 def bug_reports_message():
2332 if ytdl_is_updateable():
2333 update_cmd
= 'type youtube-dlc -U to update'
2335 update_cmd
= 'see https://github.com/pukkandan/yt-dlp on how to update'
2336 msg
= '; please report this issue on https://github.com/pukkandan/yt-dlp .'
2337 msg
+= ' Make sure you are using the latest version; %s.' % update_cmd
2338 msg
+= ' Be sure to call youtube-dlc with the --verbose flag and include its complete output.'
class YoutubeDLError(Exception):
    """Root of the YoutubeDL exception hierarchy; all project errors derive from it."""
2347 class ExtractorError(YoutubeDLError
):
2348 """Error during info extraction."""
2350 def __init__(self
, msg
, tb
=None, expected
=False, cause
=None, video_id
=None):
2351 """ tb, if given, is the original traceback (so that it can be printed out).
2352 If expected is set, this is a normal error message and most likely not a bug in youtube-dlc.
2355 if sys
.exc_info()[0] in (compat_urllib_error
.URLError
, socket
.timeout
, UnavailableVideoError
):
2357 if video_id
is not None:
2358 msg
= video_id
+ ': ' + msg
2360 msg
+= ' (caused by %r)' % cause
2362 msg
+= bug_reports_message()
2363 super(ExtractorError
, self
).__init
__(msg
)
2366 self
.exc_info
= sys
.exc_info() # preserve original exception
2368 self
.video_id
= video_id
2370 def format_traceback(self
):
2371 if self
.traceback
is None:
2373 return ''.join(traceback
.format_tb(self
.traceback
))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""

    def __init__(self, url):
        # An unsupported URL is an expected condition, not an extractor bug,
        # hence expected=True.
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match (e.g. a mandatory field search failed)."""
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None):
        # Geo restriction is an expected condition, not an extractor bug.
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        # Country codes from which the video is reachable, when known.
        self.countries = countries
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
2424 class PostProcessingError(YoutubeDLError
):
2425 """Post Processing exception.
2427 This exception may be raised by PostProcessor's .run() method to
2428 indicate an error in the postprocessing task.
2431 def __init__(self
, msg
):
2432 super(PostProcessingError
, self
).__init
__(msg
)
class ExistingVideoReached(YoutubeDLError):
    """Raised when an already-downloaded video is encountered.

    NOTE(review): the original docstring read '--max-downloads limit has
    been reached.', apparently copy-pasted from MaxDownloadsReached below.
    Per the class name this signals hitting a video that already exists
    (e.g. for --break-on-existing) — confirm against the raising callers.
    """
class RejectedVideoReached(YoutubeDLError):
    """Raised when a video rejected by the configured filters is encountered.

    NOTE(review): the original docstring read '--max-downloads limit has
    been reached.', apparently copy-pasted from MaxDownloadsReached below.
    Per the class name this signals reaching a rejected video — confirm
    against the raising callers.
    """
class MaxDownloadsReached(YoutubeDLError):
    """ --max-downloads limit has been reached; stop downloading further videos. """
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller than
    what the server announced first, indicating the connection was probably
    interrupted.
    """

    def __init__(self, downloaded, expected):
        # Keep both byte counts accessible to callers for diagnostics.
        self.downloaded = downloaded
        self.expected = expected
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected))
2477 class XAttrMetadataError(YoutubeDLError
):
2478 def __init__(self
, code
=None, msg
='Unknown error'):
2479 super(XAttrMetadataError
, self
).__init
__(msg
)
2483 # Parsing code and msg
2484 if (self
.code
in (errno
.ENOSPC
, errno
.EDQUOT
)
2485 or 'No space left' in self
.msg
or 'Disk quota exceeded' in self
.msg
):
2486 self
.reason
= 'NO_SPACE'
2487 elif self
.code
== errno
.E2BIG
or 'Argument list too long' in self
.msg
:
2488 self
.reason
= 'VALUE_TOO_LONG'
2490 self
.reason
= 'NOT_SUPPORTED'
class XAttrUnavailableError(YoutubeDLError):
    """Raised when extended-attribute (xattr) support is unavailable.

    NOTE(review): inferred from the class name — the class body is not
    visible in this chunk; confirm against the raising callers.
    """
2497 def _create_http_connection(ydl_handler
, http_class
, is_https
, *args
, **kwargs
):
2498 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
2499 # expected HTTP responses to meet HTTP/1.0 or later (see also
2500 # https://github.com/ytdl-org/youtube-dl/issues/6727)
2501 if sys
.version_info
< (3, 0):
2502 kwargs
['strict'] = True
2503 hc
= http_class(*args
, **compat_kwargs(kwargs
))
2504 source_address
= ydl_handler
._params
.get('source_address')
2506 if source_address
is not None:
2507 # This is to workaround _create_connection() from socket where it will try all
2508 # address data from getaddrinfo() including IPv6. This filters the result from
2509 # getaddrinfo() based on the source_address value.
2510 # This is based on the cpython socket.create_connection() function.
2511 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
2512 def _create_connection(address
, timeout
=socket
._GLOBAL
_DEFAULT
_TIMEOUT
, source_address
=None):
2513 host
, port
= address
2515 addrs
= socket
.getaddrinfo(host
, port
, 0, socket
.SOCK_STREAM
)
2516 af
= socket
.AF_INET
if '.' in source_address
[0] else socket
.AF_INET6
2517 ip_addrs
= [addr
for addr
in addrs
if addr
[0] == af
]
2518 if addrs
and not ip_addrs
:
2519 ip_version
= 'v4' if af
== socket
.AF_INET
else 'v6'
2521 "No remote IP%s addresses available for connect, can't use '%s' as source address"
2522 % (ip_version
, source_address
[0]))
2523 for res
in ip_addrs
:
2524 af
, socktype
, proto
, canonname
, sa
= res
2527 sock
= socket
.socket(af
, socktype
, proto
)
2528 if timeout
is not socket
._GLOBAL
_DEFAULT
_TIMEOUT
:
2529 sock
.settimeout(timeout
)
2530 sock
.bind(source_address
)
2532 err
= None # Explicitly break reference cycle
2534 except socket
.error
as _
:
2536 if sock
is not None:
2541 raise socket
.error('getaddrinfo returns an empty list')
2542 if hasattr(hc
, '_create_connection'):
2543 hc
._create
_connection
= _create_connection
2544 sa
= (source_address
, 0)
2545 if hasattr(hc
, 'source_address'): # Python 2.7+
2546 hc
.source_address
= sa
2548 def _hc_connect(self
, *args
, **kwargs
):
2549 sock
= _create_connection(
2550 (self
.host
, self
.port
), self
.timeout
, sa
)
2552 self
.sock
= ssl
.wrap_socket(
2553 sock
, self
.key_file
, self
.cert_file
,
2554 ssl_version
=ssl
.PROTOCOL_TLSv1
)
2557 hc
.connect
= functools
.partial(_hc_connect
, hc
)
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker from *headers*.

    When the marker header is present, return a copy of the mapping with
    every Accept-Encoding header (matched case-insensitively) removed along
    with the marker itself, so the real request is sent without compression
    negotiation. When the marker is absent, the original mapping is returned
    unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = {}
    for name, value in headers.items():
        # Drop any casing variant of Accept-Encoding.
        if name.lower() != 'accept-encoding':
            filtered[name] = value
    del filtered['Youtubedl-no-compression']
    return filtered
2572 class YoutubeDLHandler(compat_urllib_request
.HTTPHandler
):
2573 """Handler for HTTP requests and responses.
2575 This class, when installed with an OpenerDirector, automatically adds
2576 the standard headers to every HTTP request and handles gzipped and
2577 deflated responses from web servers. If compression is to be avoided in
2578 a particular request, the original request in the program code only has
2579 to include the HTTP header "Youtubedl-no-compression", which will be
2580 removed before making the real request.
2582 Part of this code was copied from:
2584 http://techknack.net/python-urllib2-handlers/
2586 Andrew Rowls, the author of that code, agreed to release it to the
2590 def __init__(self
, params
, *args
, **kwargs
):
2591 compat_urllib_request
.HTTPHandler
.__init
__(self
, *args
, **kwargs
)
2592 self
._params
= params
2594 def http_open(self
, req
):
2595 conn_class
= compat_http_client
.HTTPConnection
2597 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2599 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2600 del req
.headers
['Ytdl-socks-proxy']
2602 return self
.do_open(functools
.partial(
2603 _create_http_connection
, self
, conn_class
, False),
2609 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
2611 return zlib
.decompress(data
)
2613 def http_request(self
, req
):
2614 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
2615 # always respected by websites, some tend to give out URLs with non percent-encoded
2616 # non-ASCII characters (see telemb.py, ard.py [#3412])
2617 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
2618 # To work around aforementioned issue we will replace request's original URL with
2619 # percent-encoded one
2620 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
2621 # the code of this workaround has been moved here from YoutubeDL.urlopen()
2622 url
= req
.get_full_url()
2623 url_escaped
= escape_url(url
)
2625 # Substitute URL if any change after escaping
2626 if url
!= url_escaped
:
2627 req
= update_Request(req
, url
=url_escaped
)
2629 for h
, v
in std_headers
.items():
2630 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
2631 # The dict keys are capitalized because of this bug by urllib
2632 if h
.capitalize() not in req
.headers
:
2633 req
.add_header(h
, v
)
2635 req
.headers
= handle_youtubedl_headers(req
.headers
)
2637 if sys
.version_info
< (2, 7) and '#' in req
.get_full_url():
2638 # Python 2.6 is brain-dead when it comes to fragments
2639 req
._Request
__original
= req
._Request
__original
.partition('#')[0]
2640 req
._Request
__r
_type
= req
._Request
__r
_type
.partition('#')[0]
2644 def http_response(self
, req
, resp
):
2647 if resp
.headers
.get('Content-encoding', '') == 'gzip':
2648 content
= resp
.read()
2649 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
), mode
='rb')
2651 uncompressed
= io
.BytesIO(gz
.read())
2652 except IOError as original_ioerror
:
2653 # There may be junk add the end of the file
2654 # See http://stackoverflow.com/q/4928560/35070 for details
2655 for i
in range(1, 1024):
2657 gz
= gzip
.GzipFile(fileobj
=io
.BytesIO(content
[:-i
]), mode
='rb')
2658 uncompressed
= io
.BytesIO(gz
.read())
2663 raise original_ioerror
2664 resp
= compat_urllib_request
.addinfourl(uncompressed
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2665 resp
.msg
= old_resp
.msg
2666 del resp
.headers
['Content-encoding']
2668 if resp
.headers
.get('Content-encoding', '') == 'deflate':
2669 gz
= io
.BytesIO(self
.deflate(resp
.read()))
2670 resp
= compat_urllib_request
.addinfourl(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
2671 resp
.msg
= old_resp
.msg
2672 del resp
.headers
['Content-encoding']
2673 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
2674 # https://github.com/ytdl-org/youtube-dl/issues/6457).
2675 if 300 <= resp
.code
< 400:
2676 location
= resp
.headers
.get('Location')
2678 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
2679 if sys
.version_info
>= (3, 0):
2680 location
= location
.encode('iso-8859-1').decode('utf-8')
2682 location
= location
.decode('utf-8')
2683 location_escaped
= escape_url(location
)
2684 if location
!= location_escaped
:
2685 del resp
.headers
['Location']
2686 if sys
.version_info
< (3, 0):
2687 location_escaped
= location_escaped
.encode('utf-8')
2688 resp
.headers
['Location'] = location_escaped
2691 https_request
= http_request
2692 https_response
= http_response
2695 def make_socks_conn_class(base_class
, socks_proxy
):
2696 assert issubclass(base_class
, (
2697 compat_http_client
.HTTPConnection
, compat_http_client
.HTTPSConnection
))
2699 url_components
= compat_urlparse
.urlparse(socks_proxy
)
2700 if url_components
.scheme
.lower() == 'socks5':
2701 socks_type
= ProxyType
.SOCKS5
2702 elif url_components
.scheme
.lower() in ('socks', 'socks4'):
2703 socks_type
= ProxyType
.SOCKS4
2704 elif url_components
.scheme
.lower() == 'socks4a':
2705 socks_type
= ProxyType
.SOCKS4A
2707 def unquote_if_non_empty(s
):
2710 return compat_urllib_parse_unquote_plus(s
)
2714 url_components
.hostname
, url_components
.port
or 1080,
2716 unquote_if_non_empty(url_components
.username
),
2717 unquote_if_non_empty(url_components
.password
),
2720 class SocksConnection(base_class
):
2722 self
.sock
= sockssocket()
2723 self
.sock
.setproxy(*proxy_args
)
2724 if type(self
.timeout
) in (int, float):
2725 self
.sock
.settimeout(self
.timeout
)
2726 self
.sock
.connect((self
.host
, self
.port
))
2728 if isinstance(self
, compat_http_client
.HTTPSConnection
):
2729 if hasattr(self
, '_context'): # Python > 2.6
2730 self
.sock
= self
._context
.wrap_socket(
2731 self
.sock
, server_hostname
=self
.host
)
2733 self
.sock
= ssl
.wrap_socket(self
.sock
)
2735 return SocksConnection
2738 class YoutubeDLHTTPSHandler(compat_urllib_request
.HTTPSHandler
):
2739 def __init__(self
, params
, https_conn_class
=None, *args
, **kwargs
):
2740 compat_urllib_request
.HTTPSHandler
.__init
__(self
, *args
, **kwargs
)
2741 self
._https
_conn
_class
= https_conn_class
or compat_http_client
.HTTPSConnection
2742 self
._params
= params
2744 def https_open(self
, req
):
2746 conn_class
= self
._https
_conn
_class
2748 if hasattr(self
, '_context'): # python > 2.6
2749 kwargs
['context'] = self
._context
2750 if hasattr(self
, '_check_hostname'): # python 3.x
2751 kwargs
['check_hostname'] = self
._check
_hostname
2753 socks_proxy
= req
.headers
.get('Ytdl-socks-proxy')
2755 conn_class
= make_socks_conn_class(conn_class
, socks_proxy
)
2756 del req
.headers
['Ytdl-socks-proxy']
2758 return self
.do_open(functools
.partial(
2759 _create_http_connection
, self
, conn_class
, True),
2763 class YoutubeDLCookieJar(compat_cookiejar
.MozillaCookieJar
):
2765 See [1] for cookie file format.
2767 1. https://curl.haxx.se/docs/http-cookies.html
2769 _HTTPONLY_PREFIX
= '#HttpOnly_'
2771 _HEADER
= '''# Netscape HTTP Cookie File
2772 # This file is generated by youtube-dlc. Do not edit.
2775 _CookieFileEntry
= collections
.namedtuple(
2777 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
2779 def save(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2781 Save cookies to a file.
2783 Most of the code is taken from CPython 3.8 and slightly adapted
2784 to support cookie files with UTF-8 in both python 2 and 3.
2786 if filename
is None:
2787 if self
.filename
is not None:
2788 filename
= self
.filename
2790 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2792 # Store session cookies with `expires` set to 0 instead of an empty
2795 if cookie
.expires
is None:
2798 with io
.open(filename
, 'w', encoding
='utf-8') as f
:
2799 f
.write(self
._HEADER
)
2802 if not ignore_discard
and cookie
.discard
:
2804 if not ignore_expires
and cookie
.is_expired(now
):
2810 if cookie
.domain
.startswith('.'):
2811 initial_dot
= 'TRUE'
2813 initial_dot
= 'FALSE'
2814 if cookie
.expires
is not None:
2815 expires
= compat_str(cookie
.expires
)
2818 if cookie
.value
is None:
2819 # cookies.txt regards 'Set-Cookie: foo' as a cookie
2820 # with no name, whereas http.cookiejar regards it as a
2821 # cookie with no value.
2826 value
= cookie
.value
2828 '\t'.join([cookie
.domain
, initial_dot
, cookie
.path
,
2829 secure
, expires
, name
, value
]) + '\n')
2831 def load(self
, filename
=None, ignore_discard
=False, ignore_expires
=False):
2832 """Load cookies from a file."""
2833 if filename
is None:
2834 if self
.filename
is not None:
2835 filename
= self
.filename
2837 raise ValueError(compat_cookiejar
.MISSING_FILENAME_TEXT
)
2839 def prepare_line(line
):
2840 if line
.startswith(self
._HTTPONLY
_PREFIX
):
2841 line
= line
[len(self
._HTTPONLY
_PREFIX
):]
2842 # comments and empty lines are fine
2843 if line
.startswith('#') or not line
.strip():
2845 cookie_list
= line
.split('\t')
2846 if len(cookie_list
) != self
._ENTRY
_LEN
:
2847 raise compat_cookiejar
.LoadError('invalid length %d' % len(cookie_list
))
2848 cookie
= self
._CookieFileEntry
(*cookie_list
)
2849 if cookie
.expires_at
and not cookie
.expires_at
.isdigit():
2850 raise compat_cookiejar
.LoadError('invalid expires at %s' % cookie
.expires_at
)
2854 with io
.open(filename
, encoding
='utf-8') as f
:
2857 cf
.write(prepare_line(line
))
2858 except compat_cookiejar
.LoadError
as e
:
2860 'WARNING: skipping cookie file entry due to %s: %r\n'
2861 % (e
, line
), sys
.stderr
)
2864 self
._really
_load
(cf
, filename
, ignore_discard
, ignore_expires
)
2865 # Session cookies are denoted by either `expires` field set to
2866 # an empty string or 0. MozillaCookieJar only recognizes the former
2867 # (see [1]). So we need force the latter to be recognized as session
2868 # cookies on our own.
2869 # Session cookies may be important for cookies-based authentication,
2870 # e.g. usually, when user does not check 'Remember me' check box while
2871 # logging in on a site, some important cookies are stored as session
2872 # cookies so that not recognizing them will result in failed login.
2873 # 1. https://bugs.python.org/issue17164
2875 # Treat `expires=0` cookies as session cookies
2876 if cookie
.expires
== 0:
2877 cookie
.expires
= None
2878 cookie
.discard
= True
2881 class YoutubeDLCookieProcessor(compat_urllib_request
.HTTPCookieProcessor
):
2882 def __init__(self
, cookiejar
=None):
2883 compat_urllib_request
.HTTPCookieProcessor
.__init
__(self
, cookiejar
)
2885 def http_response(self
, request
, response
):
2886 # Python 2 will choke on next HTTP request in row if there are non-ASCII
2887 # characters in Set-Cookie HTTP header of last response (see
2888 # https://github.com/ytdl-org/youtube-dl/issues/6769).
2889 # In order to at least prevent crashing we will percent encode Set-Cookie
2890 # header before HTTPCookieProcessor starts processing it.
2891 # if sys.version_info < (3, 0) and response.headers:
2892 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
2893 # set_cookie = response.headers.get(set_cookie_header)
2895 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
2896 # if set_cookie != set_cookie_escaped:
2897 # del response.headers[set_cookie_header]
2898 # response.headers[set_cookie_header] = set_cookie_escaped
2899 return compat_urllib_request
.HTTPCookieProcessor
.http_response(self
, request
, response
)
2901 https_request
= compat_urllib_request
.HTTPCookieProcessor
.http_request
2902 https_response
= http_response
2905 class YoutubeDLRedirectHandler(compat_urllib_request
.HTTPRedirectHandler
):
2906 if sys
.version_info
[0] < 3:
2907 def redirect_request(self
, req
, fp
, code
, msg
, headers
, newurl
):
2908 # On python 2 urlh.geturl() may sometimes return redirect URL
2909 # as byte string instead of unicode. This workaround allows
2910 # to force it always return unicode.
2911 return compat_urllib_request
.HTTPRedirectHandler
.redirect_request(self
, req
, fp
, code
, msg
, headers
, compat_str(newurl
))
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns (timezone, date_str) where timezone is a datetime.timedelta
    offset (zero for 'Z', absent, or sign-less designators) and date_str
    has the designator removed.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        return datetime.timedelta(), date_str
    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        return datetime.timedelta(), date_str
    direction = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(m.group('hours')),
        minutes=direction * int(m.group('minutes')))
    return offset, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # strptime cannot digest fractional seconds here; strip them first.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        # Unparseable date: mirror the original behavior of returning None
        return None
def date_formats(day_first=True):
    """Pick the strptime format list for day-first vs month-first dates."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Commas would defeat the format strings below
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for fmt in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
def unified_timestamp(date_str, day_first=True):
    """Best-effort conversion of a free-form date string to a UNIX timestamp."""
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for fmt in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, fmt) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass

    # Fallback: RFC 2822 style dates
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from *url*; fall back to *default_ext*."""
    if url is None or '.' not in url:
        return default_ext
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    if candidate.rstrip('/') in KNOWN_EXTENSIONS:
        return candidate.rstrip('/')
    return default_ext
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build a subtitle filename: media name with a '<lang>.<format>' extension."""
    sub_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, sub_ext, expected_real_ext)
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    m = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if m is None:
        # Not a compact date: hand it back untouched
        return date_str
    return '-'.join(m.groups())
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            # Open-ended on the left
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            # Open-ended on the right
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        # Python 2 may hand back a byte string; normalize to text
        res = res.decode(preferredencoding())
    assert isinstance(res, compat_str)
    return res
3113 def _windows_write_string(s
, out
):
3114 """ Returns True if the string was written using special methods,
3115 False if it has yet to be written out."""
3116 # Adapted from http://stackoverflow.com/a/3259271/35070
3119 import ctypes
.wintypes
3127 fileno
= out
.fileno()
3128 except AttributeError:
3129 # If the output stream doesn't have a fileno, it's virtual
3131 except io
.UnsupportedOperation
:
3132 # Some strange Windows pseudo files?
3134 if fileno
not in WIN_OUTPUT_IDS
:
3137 GetStdHandle
= compat_ctypes_WINFUNCTYPE(
3138 ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.DWORD
)(
3139 ('GetStdHandle', ctypes
.windll
.kernel32
))
3140 h
= GetStdHandle(WIN_OUTPUT_IDS
[fileno
])
3142 WriteConsoleW
= compat_ctypes_WINFUNCTYPE(
3143 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
, ctypes
.wintypes
.LPWSTR
,
3144 ctypes
.wintypes
.DWORD
, ctypes
.POINTER(ctypes
.wintypes
.DWORD
),
3145 ctypes
.wintypes
.LPVOID
)(('WriteConsoleW', ctypes
.windll
.kernel32
))
3146 written
= ctypes
.wintypes
.DWORD(0)
3148 GetFileType
= compat_ctypes_WINFUNCTYPE(ctypes
.wintypes
.DWORD
, ctypes
.wintypes
.DWORD
)(('GetFileType', ctypes
.windll
.kernel32
))
3149 FILE_TYPE_CHAR
= 0x0002
3150 FILE_TYPE_REMOTE
= 0x8000
3151 GetConsoleMode
= compat_ctypes_WINFUNCTYPE(
3152 ctypes
.wintypes
.BOOL
, ctypes
.wintypes
.HANDLE
,
3153 ctypes
.POINTER(ctypes
.wintypes
.DWORD
))(
3154 ('GetConsoleMode', ctypes
.windll
.kernel32
))
3155 INVALID_HANDLE_VALUE
= ctypes
.wintypes
.DWORD(-1).value
3157 def not_a_console(handle
):
3158 if handle
== INVALID_HANDLE_VALUE
or handle
is None:
3160 return ((GetFileType(handle
) & ~FILE_TYPE_REMOTE
) != FILE_TYPE_CHAR
3161 or GetConsoleMode(handle
, ctypes
.byref(ctypes
.wintypes
.DWORD())) == 0)
3163 if not_a_console(h
):
3166 def next_nonbmp_pos(s
):
3168 return next(i
for i
, c
in enumerate(s
) if ord(c
) > 0xffff)
3169 except StopIteration:
3173 count
= min(next_nonbmp_pos(s
), 1024)
3175 ret
= WriteConsoleW(
3176 h
, s
, count
if count
else 2, ctypes
.byref(written
), None)
3178 raise OSError('Failed to write string')
3179 if not count
: # We just wrote a non-BMP character
3180 assert written
.value
== 2
3183 assert written
.value
> 0
3184 s
= s
[written
.value
:]
3188 def write_string(s
, out
=None, encoding
=None):
3191 assert type(s
) == compat_str
3193 if sys
.platform
== 'win32' and encoding
is None and hasattr(out
, 'fileno'):
3194 if _windows_write_string(s
, out
):
3197 if ('b' in getattr(out
, 'mode', '')
3198 or sys
.version_info
[0] < 3): # Python 2 lies about mode of sys.stderr
3199 byt
= s
.encode(encoding
or preferredencoding(), 'ignore')
3201 elif hasattr(out
, 'buffer'):
3202 enc
= encoding
or getattr(out
, 'encoding', None) or preferredencoding()
3203 byt
= s
.encode(enc
, 'ignore')
3204 out
.buffer.write(byt
)
def bytes_to_intlist(bs):
    """Convert a bytes/str buffer to a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a list of integer byte values back into a bytes object."""
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)
3225 # Cross-platform file locking
3226 if sys
.platform
== 'win32':
3227 import ctypes
.wintypes
3230 class OVERLAPPED(ctypes
.Structure
):
3232 ('Internal', ctypes
.wintypes
.LPVOID
),
3233 ('InternalHigh', ctypes
.wintypes
.LPVOID
),
3234 ('Offset', ctypes
.wintypes
.DWORD
),
3235 ('OffsetHigh', ctypes
.wintypes
.DWORD
),
3236 ('hEvent', ctypes
.wintypes
.HANDLE
),
3239 kernel32
= ctypes
.windll
.kernel32
3240 LockFileEx
= kernel32
.LockFileEx
3241 LockFileEx
.argtypes
= [
3242 ctypes
.wintypes
.HANDLE
, # hFile
3243 ctypes
.wintypes
.DWORD
, # dwFlags
3244 ctypes
.wintypes
.DWORD
, # dwReserved
3245 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3246 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3247 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3249 LockFileEx
.restype
= ctypes
.wintypes
.BOOL
3250 UnlockFileEx
= kernel32
.UnlockFileEx
3251 UnlockFileEx
.argtypes
= [
3252 ctypes
.wintypes
.HANDLE
, # hFile
3253 ctypes
.wintypes
.DWORD
, # dwReserved
3254 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockLow
3255 ctypes
.wintypes
.DWORD
, # nNumberOfBytesToLockHigh
3256 ctypes
.POINTER(OVERLAPPED
) # Overlapped
3258 UnlockFileEx
.restype
= ctypes
.wintypes
.BOOL
3259 whole_low
= 0xffffffff
3260 whole_high
= 0x7fffffff
3262 def _lock_file(f
, exclusive
):
3263 overlapped
= OVERLAPPED()
3264 overlapped
.Offset
= 0
3265 overlapped
.OffsetHigh
= 0
3266 overlapped
.hEvent
= 0
3267 f
._lock
_file
_overlapped
_p
= ctypes
.pointer(overlapped
)
3268 handle
= msvcrt
.get_osfhandle(f
.fileno())
3269 if not LockFileEx(handle
, 0x2 if exclusive
else 0x0, 0,
3270 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3271 raise OSError('Locking file failed: %r' % ctypes
.FormatError())
3273 def _unlock_file(f
):
3274 assert f
._lock
_file
_overlapped
_p
3275 handle
= msvcrt
.get_osfhandle(f
.fileno())
3276 if not UnlockFileEx(handle
, 0,
3277 whole_low
, whole_high
, f
._lock
_file
_overlapped
_p
):
3278 raise OSError('Unlocking file failed: %r' % ctypes
.FormatError())
3281 # Some platforms, such as Jython, is missing fcntl
3285 def _lock_file(f
, exclusive
):
3286 fcntl
.flock(f
, fcntl
.LOCK_EX
if exclusive
else fcntl
.LOCK_SH
)
3288 def _unlock_file(f
):
3289 fcntl
.flock(f
, fcntl
.LOCK_UN
)
3291 UNSUPPORTED_MSG
= 'file locking is not supported on this platform'
3293 def _lock_file(f
, exclusive
):
3294 raise IOError(UNSUPPORTED_MSG
)
3296 def _unlock_file(f
):
3297 raise IOError(UNSUPPORTED_MSG
)
3300 class locked_file(object):
3301 def __init__(self
, filename
, mode
, encoding
=None):
3302 assert mode
in ['r', 'a', 'w']
3303 self
.f
= io
.open(filename
, mode
, encoding
=encoding
)
3306 def __enter__(self
):
3307 exclusive
= self
.mode
!= 'r'
3309 _lock_file(self
.f
, exclusive
)
3315 def __exit__(self
, etype
, value
, traceback
):
3317 _unlock_file(self
.f
)
3324 def write(self
, *args
):
3325 return self
.f
.write(*args
)
3327 def read(self
, *args
):
3328 return self
.f
.read(*args
)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), defaulting to 'utf-8' when unset."""
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
def shell_quote(args):
    """Return *args* joined into a single shell-quoted command string."""
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Merge with any data already smuggled into the URL
    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata
def unsmuggle_url(smug_url, default=None):
    """Split data smuggled by smuggle_url() back out of the URL fragment."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
def format_bytes(bytes):
    """Format a byte count as a human-readable binary-unit string.

    Accepts None (-> 'N/A'), numeric strings, ints and floats.
    The parameter name shadows the builtin but is kept for backward
    compatibility with keyword callers.
    """
    if bytes is None:
        return 'N/A'
    # Fix: use isinstance() instead of `type(bytes) is str` so str
    # subclasses are converted too (exact-type checks are un-idiomatic).
    if isinstance(bytes, str):
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' from *s* using the multipliers in *unit_table*."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    # Accept European-style decimal commas
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
3391 def parse_filesize(s
):
3395 # The lower-case forms are of course incorrect and unofficial,
3396 # but we support those too
3413 'megabytes': 1000 ** 2,
3414 'mebibytes': 1024 ** 2,
3420 'gigabytes': 1000 ** 3,
3421 'gibibytes': 1024 ** 3,
3427 'terabytes': 1000 ** 4,
3428 'tebibytes': 1024 ** 4,
3434 'petabytes': 1000 ** 5,
3435 'pebibytes': 1024 ** 5,
3441 'exabytes': 1000 ** 6,
3442 'exbibytes': 1024 ** 6,
3448 'zettabytes': 1000 ** 7,
3449 'zebibytes': 1024 ** 7,
3455 'yottabytes': 1000 ** 8,
3456 'yobibytes': 1024 ** 8,
3459 return lookup_unit_table(_UNIT_TABLE
, s
)
3468 if re
.match(r
'^[\d,.]+$', s
):
3469 return str_to_int(s
)
3480 return lookup_unit_table(_UNIT_TABLE
, s
)
def parse_resolution(s):
    """Extract width/height from strings like '1920x1080', '720p' or '4k'."""
    if s is None:
        return {}

    mobj = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
    if mobj:
        return {
            'width': int(mobj.group('w')),
            'height': int(mobj.group('h')),
        }

    mobj = re.search(r'\b(\d+)[pPiI]\b', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(mobj.group(1)) * 540}

    return {}
def parse_bitrate(s):
    """Extract an integer kbps figure from strings like '1500 kbps', else None."""
    if not isinstance(s, compat_str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    if mobj:
        return int(mobj.group(1))
    return None
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    try:
        return month_names.index(name) + 1
    except ValueError:
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviation """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' (not already part of an entity) by '&amp;'."""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
3542 def setproctitle(title
):
3543 assert isinstance(title
, compat_str
)
3545 # ctypes in Jython is not complete
3546 # http://bugs.jython.org/issue2148
3547 if sys
.platform
.startswith('java'):
3551 libc
= ctypes
.cdll
.LoadLibrary('libc.so.6')
3555 # LoadLibrary in Windows Python 2.7.13 only expects
3556 # a bytestring, but since unicode_literals turns
3557 # every string into a unicode string, it fails.
3559 title_bytes
= title
.encode('utf-8')
3560 buf
= ctypes
.create_string_buffer(len(title_bytes
))
3561 buf
.value
= title_bytes
3563 libc
.prctl(15, buf
, 0, 0, 0)
3564 except AttributeError:
3565 return # Strange libc, just skip this
def remove_start(s, start):
    """Strip *start* from the beginning of *s*; no-op for None or no match."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Strip *end* from the end of *s*; no-op for None or no match."""
    if s is not None and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Strip one layer of matching single or double quotes from *s*."""
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s
def get_domain(url):
    """Extract the bare domain ('www.' stripped) from *url*, or None."""
    m = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    return m.group('domain') if m else None
def url_basename(url):
    """Return the last path component of *url* (no query, no slashes)."""
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').split('/')[-1]
3596 return re
.match(r
'https?://[^?#&]+/', url
).group()
def urljoin(base, path):
    """Join *base* and *path* like urlparse.urljoin, tolerating bytes input.

    Returns None when either part is unusable; absolute/protocol-relative
    paths are returned as-is.
    """
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not isinstance(path, compat_str) or not path:
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        # Already absolute (or protocol-relative)
        return path
    if isinstance(base, bytes):
        base = base.decode('utf-8')
    if not isinstance(base, compat_str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request that always issues HTTP HEAD."""

    def get_method(self):
        return 'HEAD'
class PUTRequest(compat_urllib_request.Request):
    """A urllib Request that always issues HTTP PUT."""

    def get_method(self):
        return 'PUT'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Best-effort int conversion with optional attribute lookup and scaling."""
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        return default
def str_or_none(v, default=None):
    """Coerce *v* to compat_str; return *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, compat_integer_types):
        return int_str
    elif isinstance(int_str, compat_str):
        # Drop thousands separators and leading '+'
        int_str = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Best-effort float conversion with optional scaling."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
def bool_or_none(v, default=None):
    """Return *v* only when it is a real bool; otherwise *default*."""
    if isinstance(v, bool):
        return v
    return default
def strip_or_none(v, default=None):
    """Return v.strip() for string inputs; *default* for anything else."""
    if isinstance(v, compat_str):
        return v.strip()
    return default
def url_or_none(url):
    """Return *url* stripped if it looks like a supported URL scheme, else None."""
    if not url or not isinstance(url, compat_str):
        return None
    url = url.strip()
    return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
3675 def parse_duration(s
):
3676 if not isinstance(s
, compat_basestring
):
3681 days
, hours
, mins
, secs
, ms
= [None] * 5
3682 m
= re
.match(r
'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s
)
3684 days
, hours
, mins
, secs
, ms
= m
.groups()
3689 [0-9]+\s*y(?:ears?)?\s*
3692 [0-9]+\s*m(?:onths?)?\s*
3695 [0-9]+\s*w(?:eeks?)?\s*
3698 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
3702 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
3705 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
3708 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
3711 days
, hours
, mins
, secs
, ms
= m
.groups()
3713 m
= re
.match(r
'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s
)
3715 hours
, mins
= m
.groups()
3721 duration
+= float(secs
)
3723 duration
+= float(mins
) * 60
3725 duration
+= float(hours
) * 60 * 60
3727 duration
+= float(days
) * 24 * 60 * 60
3729 duration
+= float(ms
)
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension of *filename*.

    When *expected_real_ext* is given and does not match, *ext* is
    appended after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(name, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of *filename* with *ext*.

    When *expected_real_ext* is given and does not match, *ext* is
    appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        stem = name
    else:
        stem = filename
    return '{0}.{1}'.format(stem, ext)
3748 def check_executable(exe
, args
=[]):
3749 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
3750 args can be a list of arguments for a short output (like -version) """
3752 process_communicate_or_kill(subprocess
.Popen(
3753 [exe
] + args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
))
3759 def get_exe_version(exe
, args
=['--version'],
3760 version_re
=None, unrecognized
='present'):
3761 """ Returns the version of the specified executable,
3762 or False if the executable is not present """
3764 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
3765 # SIGTTOU if youtube-dlc is run in the background.
3766 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
3767 out
, _
= process_communicate_or_kill(subprocess
.Popen(
3768 [encodeArgument(exe
)] + args
,
3769 stdin
=subprocess
.PIPE
,
3770 stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
))
3773 if isinstance(out
, bytes): # Python 2.x
3774 out
= out
.decode('ascii', 'ignore')
3775 return detect_exe_version(out
, version_re
, unrecognized
)
3778 def detect_exe_version(output
, version_re
=None, unrecognized
='present'):
3779 assert isinstance(output
, compat_str
)
3780 if version_re
is None:
3781 version_re
= r
'version\s+([-0-9._a-zA-Z]+)'
3782 m
= re
.search(version_re
, output
)
3789 class PagedList(object):
3791 # This is only useful for tests
3792 return len(self
.getslice())
3795 class OnDemandPagedList(PagedList
):
3796 def __init__(self
, pagefunc
, pagesize
, use_cache
=True):
3797 self
._pagefunc
= pagefunc
3798 self
._pagesize
= pagesize
3799 self
._use
_cache
= use_cache
3803 def getslice(self
, start
=0, end
=None):
3805 for pagenum
in itertools
.count(start
// self
._pagesize
):
3806 firstid
= pagenum
* self
._pagesize
3807 nextfirstid
= pagenum
* self
._pagesize
+ self
._pagesize
3808 if start
>= nextfirstid
:
3813 page_results
= self
._cache
.get(pagenum
)
3814 if page_results
is None:
3815 page_results
= list(self
._pagefunc
(pagenum
))
3817 self
._cache
[pagenum
] = page_results
3820 start
% self
._pagesize
3821 if firstid
<= start
< nextfirstid
3825 ((end
- 1) % self
._pagesize
) + 1
3826 if (end
is not None and firstid
<= end
<= nextfirstid
)
3829 if startv
!= 0 or endv
is not None:
3830 page_results
= page_results
[startv
:endv
]
3831 res
.extend(page_results
)
3833 # A little optimization - if current page is not "full", ie. does
3834 # not contain page_size videos then we can assume that this page
3835 # is the last one - there are no more ids on further pages -
3836 # i.e. no need to query again.
3837 if len(page_results
) + startv
< self
._pagesize
:
3840 # If we got the whole page, but the next page is not interesting,
3841 # break out early as well
3842 if end
== nextfirstid
:
3847 class InAdvancePagedList(PagedList
):
3848 def __init__(self
, pagefunc
, pagecount
, pagesize
):
3849 self
._pagefunc
= pagefunc
3850 self
._pagecount
= pagecount
3851 self
._pagesize
= pagesize
3853 def getslice(self
, start
=0, end
=None):
3855 start_page
= start
// self
._pagesize
3857 self
._pagecount
if end
is None else (end
// self
._pagesize
+ 1))
3858 skip_elems
= start
- start_page
* self
._pagesize
3859 only_more
= None if end
is None else end
- start
3860 for pagenum
in range(start_page
, end_page
):
3861 page
= list(self
._pagefunc
(pagenum
))
3863 page
= page
[skip_elems
:]
3865 if only_more
is not None:
3866 if len(page
) < only_more
:
3867 only_more
-= len(page
)
3869 page
= page
[:only_more
]
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences embedded in *s*."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode literal '\\uXXXX' escape sequences embedded in *s*."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        # Python 2 quote() chokes on unicode input
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        # IDNA-encode the host, percent-escape everything else
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
):
3913 if not isinstance(url
, compat_str
):
3914 url
= url
.decode('utf-8', 'replace')
3915 BOM_UTF8
= ('\xef\xbb\xbf', '\ufeff')
3916 for bom
in BOM_UTF8
:
3917 if url
.startswith(bom
):
3918 url
= url
[len(bom
):]
3920 if not url
or url
.startswith(('#', ';', ']')):
3922 # "#" cannot be stripped out since it is part of the URI
3923 # However, it can be safely stipped out if follwing a whitespace
3924 return re
.split(r
'\s#', url
, 1)[0].rstrip()
3926 with contextlib
.closing(batch_fd
) as fd
:
3927 return [url
for url
in map(fixup
, fd
) if url
]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
def update_url_query(url, query):
    """Merge the *query* mapping into *url*'s query string and rebuild it."""
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
3944 def update_Request(req
, url
=None, data
=None, headers
={}, query={}
):
3945 req_headers
= req
.headers
.copy()
3946 req_headers
.update(headers
)
3947 req_data
= data
or req
.data
3948 req_url
= update_url_query(url
or req
.get_full_url(), query
)
3949 req_get_method
= req
.get_method()
3950 if req_get_method
== 'HEAD':
3951 req_type
= HEADRequest
3952 elif req_get_method
== 'PUT':
3953 req_type
= PUTRequest
3955 req_type
= compat_urllib_request
.Request
3957 req_url
, data
=req_data
, headers
=req_headers
,
3958 origin_req_host
=req
.origin_req_host
, unverifiable
=req
.unverifiable
)
3959 if hasattr(req
, 'timeout'):
3960 new_req
.timeout
= req
.timeout
3964 def _multipart_encode_impl(data
, boundary
):
3965 content_type
= 'multipart/form-data; boundary=%s' % boundary
3968 for k
, v
in data
.items():
3969 out
+= b
'--' + boundary
.encode('ascii') + b
'\r\n'
3970 if isinstance(k
, compat_str
):
3971 k
= k
.encode('utf-8')
3972 if isinstance(v
, compat_str
):
3973 v
= v
.encode('utf-8')
3974 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3975 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3976 content
= b
'Content-Disposition: form-data; name="' + k
+ b
'"\r\n\r\n' + v
+ b
'\r\n'
3977 if boundary
.encode('ascii') in content
:
3978 raise ValueError('Boundary overlaps with data')
3981 out
+= b
'--' + boundary
.encode('ascii') + b
'--\r\n'
3983 return out
, content_type
3986 def multipart_encode(data
, boundary
=None):
3988 Encode a dict to RFC 7578-compliant form-data
3991 A dict where keys and values can be either Unicode or bytes-like
3994 If specified a Unicode object, it's used as the boundary. Otherwise
3995 a random boundary is generated.
3997 Reference: https://tools.ietf.org/html/rfc7578
3999 has_specified_boundary
= boundary
is not None
4002 if boundary
is None:
4003 boundary
= '---------------' + str(random
.randrange(0x0fffffff, 0xffffffff))
4006 out
, content_type
= _multipart_encode_impl(data
, boundary
)
4009 if has_specified_boundary
:
4013 return out
, content_type
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key (or the first usable of several keys) in *d*.

    With *skip_false_values* (the default), falsy values are skipped
    just like missing/None ones.
    """
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            if key not in d or d[key] is None or skip_false_values and not d[key]:
                continue
            return d[key]
        return default
    return d.get(key_or_keys, default)
def try_get(src, getter, expected_type=None):
    """Apply getter callables to *src*, returning the first value that
    succeeds (and matches *expected_type* when given); else None."""
    if not isinstance(getter, (list, tuple)):
        getter = [getter]
    for get in getter:
        try:
            v = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            pass
        else:
            if expected_type is None or isinstance(v, expected_type):
                return v
def merge_dicts(*dicts):
    """Merge dicts left-to-right, skipping None values; a later non-empty
    string may replace an earlier empty-string value for the same key."""
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if v is None:
                continue
            if (k not in merged
                    or (isinstance(v, compat_str) and v
                        and isinstance(merged[k], compat_str)
                        and not merged[k])):
                merged[k] = v
    return merged
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Coerce *string* to compat_str, decoding bytes with *encoding*.

    NOTE: the encoding default is evaluated once at import time (kept
    from the original definition).
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
4066 TV_PARENTAL_GUIDELINES
= {
4076 def parse_age_limit(s
):
4078 return s
if 0 <= s
<= 21 else None
4079 if not isinstance(s
, compat_basestring
):
4081 m
= re
.match(r
'^(?P<age>\d{1,2})\+?$', s
)
4083 return int(m
.group('age'))
4085 return US_RATINGS
[s
]
4086 m
= re
.match(r
'^TV[_-]?(%s)$' % '|'.join(k
[3:] for k
in TV_PARENTAL_GUIDELINES
), s
)
4088 return TV_PARENTAL_GUIDELINES
['TV-' + m
.group(1)]
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback invocation) and return the payload."""
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
4102 def js_to_json(code
, vars={}):
4103 # vars is a dict of var, val pairs to substitute
4104 COMMENT_RE
= r
'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
4105 SKIP_RE
= r
'\s*(?:{comment})?\s*'.format(comment
=COMMENT_RE
)
4107 (r
'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip
=SKIP_RE
), 16),
4108 (r
'(?s)^(0+[0-7]+){skip}:?$'.format(skip
=SKIP_RE
), 8),
4113 if v
in ('true', 'false', 'null'):
4115 elif v
.startswith('/*') or v
.startswith('//') or v
.startswith('!') or v
== ',':
4118 if v
[0] in ("'", '"'):
4119 v
= re
.sub(r
'(?s)\\.|"', lambda m
: {
4124 }.get(m
.group(0), m
.group(0)), v
[1:-1])
4126 for regex
, base
in INTEGER_TABLE
:
4127 im
= re
.match(regex
, v
)
4129 i
= int(im
.group(1), base
)
4130 return '"%d":' % i
if v
.endswith(':') else '%d' % i
4137 return re
.sub(r
'''(?sx)
4138 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
4139 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4140 {comment}|,(?={skip}[\]}}])|
4141 (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
4142 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
4145 '''.format(comment
=COMMENT_RE
, skip
=SKIP_RE
), fix_kv
, code
)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown qualities sort below every known one
            return -1
    return q
4158 DEFAULT_OUTTMPL
= '%(title)s [%(id)s].%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(int(part) for part in re.split(r'[-.]', v))
def is_outdated_version(version, limit, assume_new=True):
    """Compare *version* against *limit*; unparsable input falls back to
    the *assume_new* presumption."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dlc can be updated with -U """
    from zipimport import zipimporter

    # Updateable when running from a zip bundle or a frozen executable
    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    """Stringify an exception, decoding byte messages on Python 2."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
def mimetype2ext(mt):
    """Map a MIME type to a file extension; falls back to the subtype itself."""
    if mt is None:
        return None

    # Full-type special cases first.
    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
    }.get(mt)
    if ext is not None:
        return ext

    # Otherwise key on the (parameter-stripped, lowercased) subtype.
    _, _, res = mt.rpartition('/')
    res = res.split(';')[0].strip().lower()

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
    }.get(res, res)
def parse_codecs(codecs_str):
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in split_codecs:
        # The first dotted component identifies the codec family.
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # Neither codec recognised: with exactly two entries, assume the
        # conventional "video, audio" ordering.
        if len(split_codecs) == 2:
            return {
                'vcodec': split_codecs[0],
                'acodec': split_codecs[1],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a response: Content-Disposition filename
    first, then the Content-Type header."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build a base64 `data:` URI from raw bytes and a MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    # Block when the viewer's allowed age is below the content's rating.
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Longer BOMs are listed before their prefixes (utf-32-le starts with
    # the utf-16-le BOM) so the first match is the right one.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    text = None
    for bom, encoding in BOMS:
        if first_bytes.startswith(bom):
            text = first_bytes[len(bom):].decode(encoding, 'replace')
            break
    if text is None:
        # No BOM: assume UTF-8; 'replace' keeps undecodable input harmless.
        text = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', text)
def determine_protocol(info_dict):
    """Derive the download protocol for a format dict.

    An explicit 'protocol' entry wins; otherwise the URL prefix, then the
    file extension (m3u8/f4m), then the URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data, delim=False, extraGap=0, hideEmpty=False):
    """ Render a list of rows, each as a list of values """

    def get_max_lens(table):
        # Widest cell per column across the whole table.
        return [max(len(compat_str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Keep only cells whose column flag is truthy.
        return [col for (take, col) in zip(filterArray, row) if take]

    if hideEmpty:
        # Drop columns that are empty in every data row.
        max_lens = get_max_lens(data)
        header_row = filter_using_list(header_row, max_lens)
        data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    if delim:
        # Insert a dashed separator row between header and data.
        table = [header_row] + [['-' * ml for ml in max_lens]] + data
    format_str = ' '.join('%-' + compat_str(ml + extraGap) + 's' for ml in max_lens[:-1]) + ' %s'
    return '\n'.join(format_str % tuple(row) for row in table)
4369 def _match_one(filter_part
, dct
):
4370 COMPARISON_OPERATORS
= {
4378 operator_rex
= re
.compile(r
'''(?x)\s*
4380 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
4382 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
4383 (?P<quote>["\'])(?P
<quotedstrval
>(?
:\\.|
(?
!(?P
=quote
)|
\\).)+?
)(?P
=quote
)|
4384 (?P
<strval
>(?
![0-9.])[a
-z0
-9A
-Z
]*)
4387 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
4388 m = operator_rex.search(filter_part)
4390 op = COMPARISON_OPERATORS[m.group('op')]
4391 actual_value = dct.get(m.group('key'))
4392 if (m.group('quotedstrval') is not None
4393 or m.group('strval') is not None
4394 # If the original field is a string and matching comparisonvalue is
4395 # a number we should respect the origin of the original field
4396 # and process comparison value as a string (see
4397 # https://github.com/ytdl-org/youtube-dl/issues/11082).
4398 or actual_value is not None and m.group('intval') is not None
4399 and isinstance(actual_value, compat_str)):
4400 if m.group('op') not in ('=', '!='):
4402 'Operator %s does not support string values!' % m.group('op'))
4403 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
4404 quote = m.group('quote')
4405 if quote is not None:
4406 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
4409 comparison_value = int(m.group('intval'))
4411 comparison_value = parse_filesize(m.group('intval'))
4412 if comparison_value is None:
4413 comparison_value = parse_filesize(m.group('intval') + 'B')
4414 if comparison_value is None:
4416 'Invalid integer value %r in filter part %r' % (
4417 m.group('intval'), filter_part))
4418 if actual_value is None:
4419 return m.group('none_inclusive')
4420 return op(actual_value, comparison_value)
4423 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
4424 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
4426 operator_rex = re.compile(r'''(?x
)\s
*
4427 (?P
<op
>%s)\s
*(?P
<key
>[a
-z_
]+)
4429 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
4430 m = operator_rex.search(filter_part)
4432 op = UNARY_OPERATORS[m.group('op')]
4433 actual_value = dct.get(m.group('key'))
4434 return op(actual_value)
4436 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&' joins sub-filters; every one of them must pass.
    return all(
        _match_one(part, dct) for part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a --match-filter callback: None when the video passes,
    otherwise a human-readable skip message."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression to seconds (float), or None."""
    if not time_expr:
        return

    # Plain offset, optionally suffixed with 's': "12.3" / "12.3s"
    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    # Clock format HH:MM:SS[.fff] — some streams use ':' for the fraction.
    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a second count as an SRT timecode: HH:MM:SS,mmm."""
    hrs = seconds / 3600
    mins = (seconds % 3600) / 60
    secs = seconds % 60
    msecs = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, msecs)
4473 def dfxp2srt(dfxp_data):
4475 @param dfxp_data A
bytes-like
object containing DFXP data
4476 @returns A
unicode object containing converted SRT data
4478 LEGACY_NAMESPACES = (
4479 (b'http://www.w3.org/ns/ttml', [
4480 b'http://www.w3.org/2004/11/ttaf1',
4481 b'http://www.w3.org/2006/04/ttaf1',
4482 b'http://www.w3.org/2006/10/ttaf1',
4484 (b'http://www.w3.org/ns/ttml#styling', [
4485 b'http://www.w3.org/ns/ttml#style',
4489 SUPPORTED_STYLING = [
4498 _x = functools.partial(xpath_with_ns, ns_map={
4499 'xml': 'http://www.w3.org/XML/1998/namespace',
4500 'ttml': 'http://www.w3.org/ns/ttml',
4501 'tts': 'http://www.w3.org/ns/ttml#styling',
4507 class TTMLPElementParser(object):
4509 _unclosed_elements = []
4510 _applied_styles = []
4512 def start(self, tag, attrib):
4513 if tag in (_x('ttml:br'), 'br'):
4516 unclosed_elements = []
4518 element_style_id = attrib.get('style')
4520 style.update(default_style)
4521 if element_style_id:
4522 style.update(styles.get(element_style_id, {}))
4523 for prop in SUPPORTED_STYLING:
4524 prop_val = attrib.get(_x('tts:' + prop))
4526 style[prop] = prop_val
4529 for k, v in sorted(style.items()):
4530 if self._applied_styles and self._applied_styles[-1].get(k) == v:
4533 font += ' color="%s"' % v
4534 elif k == 'fontSize':
4535 font += ' size="%s"' % v
4536 elif k == 'fontFamily':
4537 font += ' face="%s"' % v
4538 elif k == 'fontWeight' and v == 'bold':
4540 unclosed_elements.append('b')
4541 elif k == 'fontStyle' and v == 'italic':
4543 unclosed_elements.append('i')
4544 elif k == 'textDecoration' and v == 'underline':
4546 unclosed_elements.append('u')
4548 self._out += '<font' + font + '>'
4549 unclosed_elements.append('font')
4551 if self._applied_styles:
4552 applied_style.update(self._applied_styles[-1])
4553 applied_style.update(style)
4554 self._applied_styles.append(applied_style)
4555 self._unclosed_elements.append(unclosed_elements)
4558 if tag not in (_x('ttml:br'), 'br'):
4559 unclosed_elements = self._unclosed_elements.pop()
4560 for element in reversed(unclosed_elements):
4561 self._out += '</%s>' % element
4562 if unclosed_elements and self._applied_styles:
4563 self._applied_styles.pop()
4565 def data(self, data):
4569 return self._out.strip()
4571 def parse_node(node):
4572 target = TTMLPElementParser()
4573 parser = xml.etree.ElementTree.XMLParser(target=target)
4574 parser.feed(xml.etree.ElementTree.tostring(node))
4575 return parser.close()
4577 for k, v in LEGACY_NAMESPACES:
4579 dfxp_data = dfxp_data.replace(ns, k)
4581 dfxp = compat_etree_fromstring(dfxp_data)
4583 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
4586 raise ValueError('Invalid dfxp/TTML subtitle')
4590 for style in dfxp.findall(_x('.//ttml:style')):
4591 style_id = style.get('id') or style.get(_x('xml:id'))
4594 parent_style_id = style.get('style')
4596 if parent_style_id not in styles:
4599 styles[style_id] = styles[parent_style_id].copy()
4600 for prop in SUPPORTED_STYLING:
4601 prop_val = style.get(_x('tts:' + prop))
4603 styles.setdefault(style_id, {})[prop] = prop_val
4609 for p in ('body', 'div'):
4610 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4613 style = styles.get(ele.get('style'))
4616 default_style.update(style)
4618 for para, index in zip(paras, itertools.count(1)):
4619 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
4620 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
4621 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4622 if begin_time is None:
4627 end_time = begin_time + dur
4628 out.append('%d\n%s --> %s\n%s\n\n' % (
4630 srt_subtitles_timecode(begin_time),
4631 srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [option, value] for a set string option, else []."""
    value = params.get(param)
    if value:
        value = compat_str(value)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean option; [] when unset.  With `separator` the value is
    glued onto the option ('--opt=true'), otherwise passed as a second item."""
    param = params.get(param)
    if param is None:
        return []
    assert isinstance(param, bool)
    chosen = true_value if param else false_value
    if separator:
        return [command_option + separator + chosen]
    return [command_option, chosen]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit a bare flag when params[param] equals the expected value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, arg_name, key, default=[], exe=None):  # returns arg, for_compat
    """Look up external-downloader style arguments.

    `argdict` may be a plain list (legacy config -> for_compat=True) or a dict
    keyed by downloader name, 'key+exe', or 'default'.
    """
    argdict = params.get(arg_name, {})
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        return argdict, True

    if argdict is None:
        return default, False
    assert isinstance(argdict, dict)

    assert isinstance(key, compat_str)
    key = key.lower()

    args = exe_args = None
    if exe is not None:
        assert isinstance(exe, compat_str)
        exe = exe.lower()
        # Most specific first: 'key+exe', then the bare exe name.
        args = argdict.get('%s+%s' % (key, exe))
        if args is None:
            exe_args = argdict.get(exe)

    if args is None:
        args = argdict.get(key) if key != exe else None
    if args is None and exe_args is None:
        args = argdict.get('default', default)

    args, exe_args = args or [], exe_args or []
    assert isinstance(args, (list, tuple))
    assert isinstance(exe_args, (list, tuple))
    return args + exe_args, False
4690 class ISO639Utils(object):
4691 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4750 'iw': 'heb', # Replaced by he in 1989 revision
4760 'in': 'ind', # Replaced by id in 1989 revision
4875 'ji': 'yid', # Replaced by yi in 1989 revision
4883 def short2long(cls, code):
4884 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4885 return cls._lang_map.get(code[:2])
4888 def long2short(cls, code):
4889 """Convert language code from ISO 639-2/T to ISO 639-1"""
4890 for short_name, long_name in cls._lang_map.items():
4891 if long_name == code:
4895 class ISO3166Utils(object):
4896 # From http://data.okfn.org/data/core/country-list
4898 'AF': 'Afghanistan',
4899 'AX': 'Åland Islands',
4902 'AS': 'American Samoa',
4907 'AG': 'Antigua and Barbuda',
4924 'BO': 'Bolivia, Plurinational State of',
4925 'BQ': 'Bonaire, Sint Eustatius and Saba',
4926 'BA': 'Bosnia and Herzegovina',
4928 'BV': 'Bouvet Island',
4930 'IO': 'British Indian Ocean Territory',
4931 'BN': 'Brunei Darussalam',
4933 'BF': 'Burkina Faso',
4939 'KY': 'Cayman Islands',
4940 'CF': 'Central African Republic',
4944 'CX': 'Christmas Island',
4945 'CC': 'Cocos (Keeling) Islands',
4949 'CD': 'Congo, the Democratic Republic of the',
4950 'CK': 'Cook Islands',
4952 'CI': 'Côte d\'Ivoire',
4957 'CZ': 'Czech Republic',
4961 'DO': 'Dominican Republic',
4964 'SV': 'El Salvador',
4965 'GQ': 'Equatorial Guinea',
4969 'FK': 'Falkland Islands (Malvinas)',
4970 'FO': 'Faroe Islands',
4974 'GF': 'French Guiana',
4975 'PF': 'French Polynesia',
4976 'TF': 'French Southern Territories',
4991 'GW': 'Guinea-Bissau',
4994 'HM': 'Heard Island and McDonald Islands',
4995 'VA': 'Holy See (Vatican City State)',
5002 'IR': 'Iran, Islamic Republic of',
5005 'IM': 'Isle of Man',
5015 'KP': 'Korea, Democratic People\'s Republic of',
5016 'KR': 'Korea, Republic of',
5019 'LA': 'Lao People\'s Democratic Republic',
5025 'LI': 'Liechtenstein',
5029 'MK': 'Macedonia, the Former Yugoslav Republic of',
5036 'MH': 'Marshall Islands',
5042 'FM': 'Micronesia, Federated States of',
5043 'MD': 'Moldova, Republic of',
5054 'NL': 'Netherlands',
5055 'NC': 'New Caledonia',
5056 'NZ': 'New Zealand',
5061 'NF': 'Norfolk Island',
5062 'MP': 'Northern Mariana Islands',
5067 'PS': 'Palestine, State of',
5069 'PG': 'Papua New Guinea',
5072 'PH': 'Philippines',
5076 'PR': 'Puerto Rico',
5080 'RU': 'Russian Federation',
5082 'BL': 'Saint Barthélemy',
5083 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
5084 'KN': 'Saint Kitts and Nevis',
5085 'LC': 'Saint Lucia',
5086 'MF': 'Saint Martin (French part)',
5087 'PM': 'Saint Pierre and Miquelon',
5088 'VC': 'Saint Vincent and the Grenadines',
5091 'ST': 'Sao Tome and Principe',
5092 'SA': 'Saudi Arabia',
5096 'SL': 'Sierra Leone',
5098 'SX': 'Sint Maarten (Dutch part)',
5101 'SB': 'Solomon Islands',
5103 'ZA': 'South Africa',
5104 'GS': 'South Georgia and the South Sandwich Islands',
5105 'SS': 'South Sudan',
5110 'SJ': 'Svalbard and Jan Mayen',
5113 'CH': 'Switzerland',
5114 'SY': 'Syrian Arab Republic',
5115 'TW': 'Taiwan, Province of China',
5117 'TZ': 'Tanzania, United Republic of',
5119 'TL': 'Timor-Leste',
5123 'TT': 'Trinidad and Tobago',
5126 'TM': 'Turkmenistan',
5127 'TC': 'Turks and Caicos Islands',
5131 'AE': 'United Arab Emirates',
5132 'GB': 'United Kingdom',
5133 'US': 'United States',
5134 'UM': 'United States Minor Outlying Islands',
5138 'VE': 'Venezuela, Bolivarian Republic of',
5140 'VG': 'Virgin Islands, British',
5141 'VI': 'Virgin Islands, U.S.',
5142 'WF': 'Wallis and Futuna',
5143 'EH': 'Western Sahara',
5150 def short2full(cls, code):
5151 """Convert an ISO 3166-2 country code to the corresponding full name"""
5152 return cls._country_map.get(code.upper())
5155 class GeoUtils(object):
5156 # Major IPv4 address blocks per country
5158 'AD': '46.172.224.0/19',
5159 'AE': '94.200.0.0/13',
5160 'AF': '149.54.0.0/17',
5161 'AG': '209.59.64.0/18',
5162 'AI': '204.14.248.0/21',
5163 'AL': '46.99.0.0/16',
5164 'AM': '46.70.0.0/15',
5165 'AO': '105.168.0.0/13',
5166 'AP': '182.50.184.0/21',
5167 'AQ': '23.154.160.0/24',
5168 'AR': '181.0.0.0/12',
5169 'AS': '202.70.112.0/20',
5170 'AT': '77.116.0.0/14',
5171 'AU': '1.128.0.0/11',
5172 'AW': '181.41.0.0/18',
5173 'AX': '185.217.4.0/22',
5174 'AZ': '5.197.0.0/16',
5175 'BA': '31.176.128.0/17',
5176 'BB': '65.48.128.0/17',
5177 'BD': '114.130.0.0/16',
5179 'BF': '102.178.0.0/15',
5180 'BG': '95.42.0.0/15',
5181 'BH': '37.131.0.0/17',
5182 'BI': '154.117.192.0/18',
5183 'BJ': '137.255.0.0/16',
5184 'BL': '185.212.72.0/23',
5185 'BM': '196.12.64.0/18',
5186 'BN': '156.31.0.0/16',
5187 'BO': '161.56.0.0/16',
5188 'BQ': '161.0.80.0/20',
5189 'BR': '191.128.0.0/12',
5190 'BS': '24.51.64.0/18',
5191 'BT': '119.2.96.0/19',
5192 'BW': '168.167.0.0/16',
5193 'BY': '178.120.0.0/13',
5194 'BZ': '179.42.192.0/18',
5195 'CA': '99.224.0.0/11',
5196 'CD': '41.243.0.0/16',
5197 'CF': '197.242.176.0/21',
5198 'CG': '160.113.0.0/16',
5199 'CH': '85.0.0.0/13',
5200 'CI': '102.136.0.0/14',
5201 'CK': '202.65.32.0/19',
5202 'CL': '152.172.0.0/14',
5203 'CM': '102.244.0.0/14',
5204 'CN': '36.128.0.0/10',
5205 'CO': '181.240.0.0/12',
5206 'CR': '201.192.0.0/12',
5207 'CU': '152.206.0.0/15',
5208 'CV': '165.90.96.0/19',
5209 'CW': '190.88.128.0/17',
5210 'CY': '31.153.0.0/16',
5211 'CZ': '88.100.0.0/14',
5213 'DJ': '197.241.0.0/17',
5214 'DK': '87.48.0.0/12',
5215 'DM': '192.243.48.0/20',
5216 'DO': '152.166.0.0/15',
5217 'DZ': '41.96.0.0/12',
5218 'EC': '186.68.0.0/15',
5219 'EE': '90.190.0.0/15',
5220 'EG': '156.160.0.0/11',
5221 'ER': '196.200.96.0/20',
5222 'ES': '88.0.0.0/11',
5223 'ET': '196.188.0.0/14',
5224 'EU': '2.16.0.0/13',
5225 'FI': '91.152.0.0/13',
5226 'FJ': '144.120.0.0/16',
5227 'FK': '80.73.208.0/21',
5228 'FM': '119.252.112.0/20',
5229 'FO': '88.85.32.0/19',
5231 'GA': '41.158.0.0/15',
5233 'GD': '74.122.88.0/21',
5234 'GE': '31.146.0.0/16',
5235 'GF': '161.22.64.0/18',
5236 'GG': '62.68.160.0/19',
5237 'GH': '154.160.0.0/12',
5238 'GI': '95.164.0.0/16',
5239 'GL': '88.83.0.0/19',
5240 'GM': '160.182.0.0/15',
5241 'GN': '197.149.192.0/18',
5242 'GP': '104.250.0.0/19',
5243 'GQ': '105.235.224.0/20',
5244 'GR': '94.64.0.0/13',
5245 'GT': '168.234.0.0/16',
5246 'GU': '168.123.0.0/16',
5247 'GW': '197.214.80.0/20',
5248 'GY': '181.41.64.0/18',
5249 'HK': '113.252.0.0/14',
5250 'HN': '181.210.0.0/16',
5251 'HR': '93.136.0.0/13',
5252 'HT': '148.102.128.0/17',
5253 'HU': '84.0.0.0/14',
5254 'ID': '39.192.0.0/10',
5255 'IE': '87.32.0.0/12',
5256 'IL': '79.176.0.0/13',
5257 'IM': '5.62.80.0/20',
5258 'IN': '117.192.0.0/10',
5259 'IO': '203.83.48.0/21',
5260 'IQ': '37.236.0.0/14',
5261 'IR': '2.176.0.0/12',
5262 'IS': '82.221.0.0/16',
5263 'IT': '79.0.0.0/10',
5264 'JE': '87.244.64.0/18',
5265 'JM': '72.27.0.0/17',
5266 'JO': '176.29.0.0/16',
5267 'JP': '133.0.0.0/8',
5268 'KE': '105.48.0.0/12',
5269 'KG': '158.181.128.0/17',
5270 'KH': '36.37.128.0/17',
5271 'KI': '103.25.140.0/22',
5272 'KM': '197.255.224.0/20',
5273 'KN': '198.167.192.0/19',
5274 'KP': '175.45.176.0/22',
5275 'KR': '175.192.0.0/10',
5276 'KW': '37.36.0.0/14',
5277 'KY': '64.96.0.0/15',
5278 'KZ': '2.72.0.0/13',
5279 'LA': '115.84.64.0/18',
5280 'LB': '178.135.0.0/16',
5281 'LC': '24.92.144.0/20',
5282 'LI': '82.117.0.0/19',
5283 'LK': '112.134.0.0/15',
5284 'LR': '102.183.0.0/16',
5285 'LS': '129.232.0.0/17',
5286 'LT': '78.56.0.0/13',
5287 'LU': '188.42.0.0/16',
5288 'LV': '46.109.0.0/16',
5289 'LY': '41.252.0.0/14',
5290 'MA': '105.128.0.0/11',
5291 'MC': '88.209.64.0/18',
5292 'MD': '37.246.0.0/16',
5293 'ME': '178.175.0.0/17',
5294 'MF': '74.112.232.0/21',
5295 'MG': '154.126.0.0/17',
5296 'MH': '117.103.88.0/21',
5297 'MK': '77.28.0.0/15',
5298 'ML': '154.118.128.0/18',
5299 'MM': '37.111.0.0/17',
5300 'MN': '49.0.128.0/17',
5301 'MO': '60.246.0.0/16',
5302 'MP': '202.88.64.0/20',
5303 'MQ': '109.203.224.0/19',
5304 'MR': '41.188.64.0/18',
5305 'MS': '208.90.112.0/22',
5306 'MT': '46.11.0.0/16',
5307 'MU': '105.16.0.0/12',
5308 'MV': '27.114.128.0/18',
5309 'MW': '102.70.0.0/15',
5310 'MX': '187.192.0.0/11',
5311 'MY': '175.136.0.0/13',
5312 'MZ': '197.218.0.0/15',
5313 'NA': '41.182.0.0/16',
5314 'NC': '101.101.0.0/18',
5315 'NE': '197.214.0.0/18',
5316 'NF': '203.17.240.0/22',
5317 'NG': '105.112.0.0/12',
5318 'NI': '186.76.0.0/15',
5319 'NL': '145.96.0.0/11',
5320 'NO': '84.208.0.0/13',
5321 'NP': '36.252.0.0/15',
5322 'NR': '203.98.224.0/19',
5323 'NU': '49.156.48.0/22',
5324 'NZ': '49.224.0.0/14',
5325 'OM': '5.36.0.0/15',
5326 'PA': '186.72.0.0/15',
5327 'PE': '186.160.0.0/14',
5328 'PF': '123.50.64.0/18',
5329 'PG': '124.240.192.0/19',
5330 'PH': '49.144.0.0/13',
5331 'PK': '39.32.0.0/11',
5332 'PL': '83.0.0.0/11',
5333 'PM': '70.36.0.0/20',
5334 'PR': '66.50.0.0/16',
5335 'PS': '188.161.0.0/16',
5336 'PT': '85.240.0.0/13',
5337 'PW': '202.124.224.0/20',
5338 'PY': '181.120.0.0/14',
5339 'QA': '37.210.0.0/15',
5340 'RE': '102.35.0.0/16',
5341 'RO': '79.112.0.0/13',
5342 'RS': '93.86.0.0/15',
5343 'RU': '5.136.0.0/13',
5344 'RW': '41.186.0.0/16',
5345 'SA': '188.48.0.0/13',
5346 'SB': '202.1.160.0/19',
5347 'SC': '154.192.0.0/11',
5348 'SD': '102.120.0.0/13',
5349 'SE': '78.64.0.0/12',
5350 'SG': '8.128.0.0/10',
5351 'SI': '188.196.0.0/14',
5352 'SK': '78.98.0.0/15',
5353 'SL': '102.143.0.0/17',
5354 'SM': '89.186.32.0/19',
5355 'SN': '41.82.0.0/15',
5356 'SO': '154.115.192.0/18',
5357 'SR': '186.179.128.0/17',
5358 'SS': '105.235.208.0/21',
5359 'ST': '197.159.160.0/19',
5360 'SV': '168.243.0.0/16',
5361 'SX': '190.102.0.0/20',
5363 'SZ': '41.84.224.0/19',
5364 'TC': '65.255.48.0/20',
5365 'TD': '154.68.128.0/19',
5366 'TG': '196.168.0.0/14',
5367 'TH': '171.96.0.0/13',
5368 'TJ': '85.9.128.0/18',
5369 'TK': '27.96.24.0/21',
5370 'TL': '180.189.160.0/20',
5371 'TM': '95.85.96.0/19',
5372 'TN': '197.0.0.0/11',
5373 'TO': '175.176.144.0/21',
5374 'TR': '78.160.0.0/11',
5375 'TT': '186.44.0.0/15',
5376 'TV': '202.2.96.0/19',
5377 'TW': '120.96.0.0/11',
5378 'TZ': '156.156.0.0/14',
5379 'UA': '37.52.0.0/14',
5380 'UG': '102.80.0.0/13',
5382 'UY': '167.56.0.0/13',
5383 'UZ': '84.54.64.0/18',
5384 'VA': '212.77.0.0/19',
5385 'VC': '207.191.240.0/21',
5386 'VE': '186.88.0.0/13',
5387 'VG': '66.81.192.0/20',
5388 'VI': '146.226.0.0/16',
5389 'VN': '14.160.0.0/11',
5390 'VU': '202.80.32.0/20',
5391 'WF': '117.20.32.0/21',
5392 'WS': '202.4.32.0/19',
5393 'YE': '134.35.0.0/16',
5394 'YT': '41.242.116.0/22',
5395 'ZA': '41.0.0.0/11',
5396 'ZM': '102.144.0.0/13',
5397 'ZW': '102.177.192.0/18',
5401 def random_ipv4(cls, code_or_block):
5402 if len(code_or_block) == 2:
5403 block = cls._country_ip_map.get(code_or_block.upper())
5407 block = code_or_block
5408 addr, preflen = block.split('/')
5409 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
5410 addr_max = addr_min | (0xffffffff >> int(preflen))
5411 return compat_str(socket.inet_ntoa(
5412 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler honouring a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dlc's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5440 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
5441 # released into Public Domain
5442 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # after much testing, this algorithm was deemed to be the fastest
    s = b''
    n = int(n)
    while n > 0:
        s = compat_struct_pack('>I', n & 0xffffffff) + s
        n = n >> 32
    # strip off leading zeros
    for i in range(len(s)):
        if s[i] != b'\000'[0]:
            break
    else:
        # only happens when n == 0
        s = b'\000'
        i = 0
    s = s[i:]
    # add back some pad bytes. this could be done more efficiently w.r.t. the
    # de-padding being done above, but sigh...
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    acc = 0
    length = len(s)
    if length % 4:
        # Front-pad to a multiple of 4 so the 32-bit unpack loop fits exactly.
        extra = (4 - length % 4)
        s = b'\000' * extra + s
        length = length + extra
    for i in range(0, length, 4):
        acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
    return acc
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input: data: data to encrypt, bytes-like object
           exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The scheme interprets the data little-endian, hence the reversal.
    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data     input data
    @param {int} length     target length
    @returns {int[]}        padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Block layout: 0x00 0x02 <non-zero random filler> 0x00 <data>
    pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
def encode_base_n(num, n, table=None):
    """Render a non-negative integer in base `n` using `table` as digits
    (defaults to 0-9a-zA-Z)."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret
def decode_packed_codes(code):
    """Unpack Dean Edwards style p.a.c.k.e.r. obfuscated JavaScript."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        # Each symbol index, rendered in the pack's base, is the token that
        # appears in the obfuscated body.
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
def caesar(s, alphabet, shift):
    """Shift each character of `s` that occurs in `alphabet` by `shift`
    positions (wrapping); other characters pass through unchanged."""
    if shift == 0:
        return s
    l = len(alphabet)
    return ''.join(
        alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
        for c in s)
def rot47(s):
    """Apply the ROT47 cipher (Caesar shift of 47 over printable ASCII 33-126)."""
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=val,KEY2="v,2"') into a dict,
    stripping surrounding double quotes from quoted values."""
    info = {}
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]
        info[key] = val
    return info
def urshift(val, n):
    """Unsigned 32-bit right shift (JavaScript's >>> operator)."""
    if val >= 0:
        return val >> n
    # Map the negative value into unsigned 32-bit space before shifting.
    return (val + 0x100000000) >> n
5585 # Based on png2str() written by @gdkchan and improved by @yokrysty
5586 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
5587 def decode_png(png_data
):
5588 # Reference: https://www.w3.org/TR/PNG/
5589 header
= png_data
[8:]
5591 if png_data
[:8] != b
'\x89PNG\x0d\x0a\x1a\x0a' or header
[4:8] != b
'IHDR':
5592 raise IOError('Not a valid PNG file.')
5594 int_map
= {1: '>B', 2: '>H', 4: '>I'}
5595 unpack_integer
= lambda x
: compat_struct_unpack(int_map
[len(x
)], x
)[0]
5600 length
= unpack_integer(header
[:4])
5603 chunk_type
= header
[:4]
5606 chunk_data
= header
[:length
]
5607 header
= header
[length
:]
5609 header
= header
[4:] # Skip CRC
5617 ihdr
= chunks
[0]['data']
5619 width
= unpack_integer(ihdr
[:4])
5620 height
= unpack_integer(ihdr
[4:8])
5624 for chunk
in chunks
:
5625 if chunk
['type'] == b
'IDAT':
5626 idat
+= chunk
['data']
5629 raise IOError('Unable to read PNG data.')
5631 decompressed_data
= bytearray(zlib
.decompress(idat
))
5636 def _get_pixel(idx
):
5641 for y
in range(height
):
5642 basePos
= y
* (1 + stride
)
5643 filter_type
= decompressed_data
[basePos
]
5647 pixels
.append(current_row
)
5649 for x
in range(stride
):
5650 color
= decompressed_data
[1 + basePos
+ x
]
5651 basex
= y
* stride
+ x
5656 left
= _get_pixel(basex
- 3)
5658 up
= _get_pixel(basex
- stride
)
5660 if filter_type
== 1: # Sub
5661 color
= (color
+ left
) & 0xff
5662 elif filter_type
== 2: # Up
5663 color
= (color
+ up
) & 0xff
5664 elif filter_type
== 3: # Average
5665 color
= (color
+ ((left
+ up
) >> 1)) & 0xff
5666 elif filter_type
== 4: # Paeth
5672 c
= _get_pixel(basex
- stride
- 3)
5680 if pa
<= pb
and pa
<= pc
:
5681 color
= (color
+ a
) & 0xff
5683 color
= (color
+ b
) & 0xff
5685 color
= (color
+ c
) & 0xff
5687 current_row
.append(color
)
5689 return width
, height
, pixels
5692 def write_xattr(path
, key
, value
):
5693 # This mess below finds the best xattr tool for the job
5695 # try the pyxattr module...
5698 if hasattr(xattr
, 'set'): # pyxattr
5699 # Unicode arguments are not supported in python-pyxattr until
5701 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5702 pyxattr_required_version
= '0.5.0'
5703 if version_tuple(xattr
.__version
__) < version_tuple(pyxattr_required_version
):
5704 # TODO: fallback to CLI tools
5705 raise XAttrUnavailableError(
5706 'python-pyxattr is detected but is too old. '
5707 'youtube-dlc requires %s or above while your version is %s. '
5708 'Falling back to other xattr implementations' % (
5709 pyxattr_required_version
, xattr
.__version
__))
5711 setxattr
= xattr
.set
5713 setxattr
= xattr
.setxattr
5716 setxattr(path
, key
, value
)
5717 except EnvironmentError as e
:
5718 raise XAttrMetadataError(e
.errno
, e
.strerror
)
5721 if compat_os_name
== 'nt':
5722 # Write xattrs to NTFS Alternate Data Streams:
5723 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5724 assert ':' not in key
5725 assert os
.path
.exists(path
)
5727 ads_fn
= path
+ ':' + key
5729 with open(ads_fn
, 'wb') as f
:
5731 except EnvironmentError as e
:
5732 raise XAttrMetadataError(e
.errno
, e
.strerror
)
5734 user_has_setfattr
= check_executable('setfattr', ['--version'])
5735 user_has_xattr
= check_executable('xattr', ['-h'])
5737 if user_has_setfattr
or user_has_xattr
:
5739 value
= value
.decode('utf-8')
5740 if user_has_setfattr
:
5741 executable
= 'setfattr'
5742 opts
= ['-n', key
, '-v', value
]
5743 elif user_has_xattr
:
5744 executable
= 'xattr'
5745 opts
= ['-w', key
, value
]
5747 cmd
= ([encodeFilename(executable
, True)]
5748 + [encodeArgument(o
) for o
in opts
]
5749 + [encodeFilename(path
, True)])
5752 p
= subprocess
.Popen(
5753 cmd
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
, stdin
=subprocess
.PIPE
)
5754 except EnvironmentError as e
:
5755 raise XAttrMetadataError(e
.errno
, e
.strerror
)
5756 stdout
, stderr
= process_communicate_or_kill(p
)
5757 stderr
= stderr
.decode('utf-8', 'replace')
5758 if p
.returncode
!= 0:
5759 raise XAttrMetadataError(p
.returncode
, stderr
)
5762 # On Unix, and can't find pyxattr, setfattr, or xattr.
5763 if sys
.platform
.startswith('linux'):
5764 raise XAttrUnavailableError(
5765 "Couldn't find a tool to set the xattrs. "
5766 "Install either the python 'pyxattr' or 'xattr' "
5767 "modules, or the GNU 'attr' package "
5768 "(which contains the 'setfattr' tool).")
5770 raise XAttrUnavailableError(
5771 "Couldn't find a tool to set the xattrs. "
5772 "Install either the python 'xattr' module, "
5773 "or the 'xattr' binary.")
def random_birthday(year_field, month_field, day_field):
    """Build a dict mapping the given form-field names to a random birthday.

    The date is drawn uniformly between 1950-01-01 and 1995-12-31 and the
    year/month/day components are returned as strings (as expected by the
    age-gate forms this helper is used to fill in).
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, span_days))
    return {
        year_field: str(birthday.year),
        month_field: str(birthday.month),
        day_field: str(birthday.day),
    }
5788 # Templates for internet shortcut files, which are plain text files.
5789 DOT_URL_LINK_TEMPLATE
= '''
5794 DOT_WEBLOC_LINK_TEMPLATE
= '''
5795 <?xml version="1.0" encoding="UTF-8"?>
5796 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5797 <plist version="1.0">
5800 \t<string>%(url)s</string>
5805 DOT_DESKTOP_LINK_TEMPLATE
= '''
def iri_to_uri(iri):
    """
    Convert an IRI (Internationalized Resource Identifier, which may contain
    Unicode characters) to an ASCII-only URI.

    No additional layer of escaping is added: existing percent-escapes such
    as `%3C` are left untouched (not turned into `%253C`); only characters
    outside the allowed sets are percent-encoded, using an underlying UTF-8
    encoding.
    """
    iri_parts = compat_urllib_parse_urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Accessing `.netloc` with a single, unmatched bracket also raises a ValueError.

    # The `safe` values below list characters that must NOT be percent-encoded.
    # Everything else except letters, digits and '_.-' is percent-encoded with
    # an underlying UTF-8 encoding; pre-existing escapes are preserved.
    # Source for the `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # The 'idna' codec yields Punycode for Unicode hostnames; its output is ASCII.
    net_location += iri_parts.hostname.encode('idna').decode('utf-8')
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
            net_location,

            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Params are a legacy way of carrying parameters; the `safe` set mirrors the path's.
            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # The spec does not explicitly single out the query component; this set follows it closely.
            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
def to_high_limit_path(path):
    """Return *path* in a form not subject to Windows' MAX_PATH limit.

    On win32/cygwin the absolute path is prefixed with the extended-length
    marker ``\\\\?\\`` (the length of individual path segments may still be
    limited).  On every other platform the path is returned unchanged —
    as visible, the function previously fell off the end and implicitly
    returned None for non-Windows platforms; the explicit ``return path``
    fixes that.
    """
    if sys.platform in ('win32', 'cygwin'):
        # r'\\?\ '.rstrip() is a trick to spell the \\?\ prefix without the
        # raw string ending in a backslash (which Python forbids).
        return r'\\?\ '.rstrip() + os.path.abspath(path)

    return path
def format_field(obj, field, template='%s', ignore=(None, ''), default='', func=None):
    """Look up ``obj[field]`` and render it through *template*.

    Values listed in *ignore* (before or after applying *func*) yield
    *default* instead; otherwise *func* (if given) transforms the value
    before it is interpolated into *template*.
    """
    val = obj.get(field, default)
    if func and val not in ignore:
        val = func(val)
    # Re-check after func: a callable may map a real value into an ignored one.
    if val in ignore:
        return default
    return template % val
def clean_podcast_url(url):
    """Strip well-known podcast tracking/analytics redirect prefixes from *url*."""
    return re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/''', '', url)
# Hexadecimal digits used when synthesising random hex strings.
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random RFC 4122 version-4 UUID string.

    Bug fix: the 'y' position carries the variant bits and must be one of
    8, 9, a or b (RFC 4122 section 4.1.1); the previous implementation drew
    it from the full hex range, producing invalid v4 UUIDs.
    """
    def _fill(match):
        if match.group(0) == 'x':
            return _HEX_TABLE[random.randint(0, 15)]
        return _HEX_TABLE[8 + random.randint(0, 3)]  # variant nibble: 8, 9, a or b
    return re.sub(r'[xy]', _fill, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
def make_dir(path, to_screen=None):
    """Ensure the parent directory of *path* exists.

    Returns True on success (including when nothing needed creating) and
    False when the directory could not be created.  If *to_screen* is a
    callable, it is invoked with a human-readable error message on failure.
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except (OSError, IOError) as err:
        # Bug fix: the old guard `callable(to_screen) is not None` compared a
        # bool against None and was therefore always True, so a None
        # to_screen was called and raised TypeError.  Only call to_screen
        # when it really is callable.
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
))