1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4from __future__ import unicode_literals
5
6import calendar
7import codecs
8import contextlib
9import ctypes
10import datetime
11import email.utils
12import errno
13import functools
14import gzip
15import itertools
16import io
17import json
18import locale
19import math
20import os
21import pipes
22import platform
23import re
24import ssl
25import socket
26import struct
27import subprocess
28import sys
29import tempfile
30import traceback
31import xml.etree.ElementTree
32import zlib
33
34from .compat import (
35 compat_chr,
36 compat_getenv,
37 compat_html_entities,
38 compat_http_client,
39 compat_parse_qs,
40 compat_socket_create_connection,
41 compat_str,
42 compat_urllib_error,
43 compat_urllib_parse,
44 compat_urllib_parse_urlparse,
45 compat_urllib_request,
46 compat_urlparse,
47 shlex_quote,
48)
49
50
51# This is not clearly defined otherwise
52compiled_regex_type = type(re.compile(''))
53
54std_headers = {
55 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
56 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
57 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
58 'Accept-Encoding': 'gzip, deflate',
59 'Accept-Language': 'en-us,en;q=0.5',
60}
61
62
63def preferredencoding():
64 """Get preferred encoding.
65
66 Returns the best encoding scheme for the system, based on
67 locale.getpreferredencoding() and some further tweaks.
68 """
69 try:
70 pref = locale.getpreferredencoding()
71 'TEST'.encode(pref)
72 except:
73 pref = 'UTF-8'
74
75 return pref
76
77
78def write_json_file(obj, fn):
79 """ Encode obj as JSON and write it to fn, atomically if possible """
80
81 fn = encodeFilename(fn)
82 if sys.version_info < (3, 0) and sys.platform != 'win32':
83 encoding = get_filesystem_encoding()
84 # os.path.basename returns a bytes object, but NamedTemporaryFile
85 # will fail if the filename contains non-ASCII characters unless we
86 # use a unicode object
87 path_basename = lambda f: os.path.basename(f).decode(encoding)
88 # the same for os.path.dirname
89 path_dirname = lambda f: os.path.dirname(f).decode(encoding)
90 else:
91 path_basename = os.path.basename
92 path_dirname = os.path.dirname
93
94 args = {
95 'suffix': '.tmp',
96 'prefix': path_basename(fn) + '.',
97 'dir': path_dirname(fn),
98 'delete': False,
99 }
100
101 # In Python 2.x, json.dump expects a bytestream.
102 # In Python 3.x, it writes to a character stream
103 if sys.version_info < (3, 0):
104 args['mode'] = 'wb'
105 else:
106 args.update({
107 'mode': 'w',
108 'encoding': 'utf-8',
109 })
110
111 tf = tempfile.NamedTemporaryFile(**args)
112
113 try:
114 with tf:
115 json.dump(obj, tf)
116 if sys.platform == 'win32':
117 # Need to remove existing file on Windows, else os.rename raises
118 # WindowsError or FileExistsError.
119 try:
120 os.unlink(fn)
121 except OSError:
122 pass
123 os.rename(tf.name, fn)
124 except:
125 try:
126 os.remove(tf.name)
127 except OSError:
128 pass
129 raise
130
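# Illustrative usage (not part of the original module; the file name below is
# made up):
#
#     write_json_file({'id': 'abc', 'title': 'Example'}, 'example.info.json')
#
# The data is first written to a temporary '<name>.<random>.tmp' file in the
# same directory and then renamed into place, so an interrupted write cannot
# leave a truncated JSON file behind.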
131
132if sys.version_info >= (2, 7):
133 def find_xpath_attr(node, xpath, key, val):
134 """ Find the xpath xpath[@key=val] """
135 assert re.match(r'^[a-zA-Z-]+$', key)
136 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
137 expr = xpath + "[@%s='%s']" % (key, val)
138 return node.find(expr)
139else:
140 def find_xpath_attr(node, xpath, key, val):
141 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
142 # .//node does not match if a node is a direct child of . !
143 if isinstance(xpath, unicode):
144 xpath = xpath.encode('ascii')
145
146 for f in node.findall(xpath):
147 if f.attrib.get(key) == val:
148 return f
149 return None
150
151# On python2.6 the xml.etree.ElementTree.Element methods don't support
152# the namespace parameter
153
154
155def xpath_with_ns(path, ns_map):
156 components = [c.split(':') for c in path.split('/')]
157 replaced = []
158 for c in components:
159 if len(c) == 1:
160 replaced.append(c[0])
161 else:
162 ns, tag = c
163 replaced.append('{%s}%s' % (ns_map[ns], tag))
164 return '/'.join(replaced)
165
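# Example sketch (illustrative, with a made-up namespace map): the function
# only rewrites 'prefix:tag' path components, it does not touch the tree.
#
#     _x = lambda p: xpath_with_ns(p, {'media': 'http://search.yahoo.com/mrss/'})
#     _x('media:group/media:title')
#     # -> '{http://search.yahoo.com/mrss/}group/{http://search.yahoo.com/mrss/}title'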
166
167def xpath_text(node, xpath, name=None, fatal=False):
168 if sys.version_info < (2, 7): # Crazy 2.6
169 xpath = xpath.encode('ascii')
170
171 n = node.find(xpath)
172 if n is None or n.text is None:
173 if fatal:
174 name = xpath if name is None else name
175 raise ExtractorError('Could not find XML element %s' % name)
176 else:
177 return None
178 return n.text
179
180
181def get_element_by_id(id, html):
182 """Return the content of the tag with the specified ID in the passed HTML document"""
183 return get_element_by_attribute("id", id, html)
184
185
186def get_element_by_attribute(attribute, value, html):
187 """Return the content of the tag with the specified attribute in the passed HTML document"""
188
189 m = re.search(r'''(?xs)
190 <([a-zA-Z0-9:._-]+)
191 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
192 \s+%s=['"]?%s['"]?
193 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
194 \s*>
195 (?P<content>.*?)
196 </\1>
197 ''' % (re.escape(attribute), re.escape(value)), html)
198
199 if not m:
200 return None
201 res = m.group('content')
202
203 if res.startswith('"') or res.startswith("'"):
204 res = res[1:-1]
205
206 return unescapeHTML(res)
207
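# Example (illustrative HTML snippet): both helpers return the tag's inner
# content, already run through unescapeHTML, or None if nothing matches.
#
#     get_element_by_id('description', '<p id="description">A &amp; B</p>')
#     # -> 'A & B'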
208
209def clean_html(html):
210 """Clean an HTML snippet into a readable string"""
211
212 if html is None: # Convenience for sanitizing descriptions etc.
213 return html
214
215 # Newline vs <br />
216 html = html.replace('\n', ' ')
217 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
218 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
219 # Strip html tags
220 html = re.sub('<.*?>', '', html)
221 # Replace html entities
222 html = unescapeHTML(html)
223 return html.strip()
224
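# Example (illustrative input): <br> tags become newlines, remaining markup is
# stripped and entities are decoded.
#
#     clean_html('Line one<br/>Line two &amp; <b>more</b>')
#     # -> 'Line one\nLine two & more'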
225
226def sanitize_open(filename, open_mode):
227 """Try to open the given filename, and slightly tweak it if this fails.
228
229 Attempts to open the given filename. If this fails, it tries to change
230 the filename slightly, step by step, until it's either able to open it
231 or it fails and raises a final exception, like the standard open()
232 function.
233
234 It returns the tuple (stream, definitive_file_name).
235 """
236 try:
237 if filename == '-':
238 if sys.platform == 'win32':
239 import msvcrt
240 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
241 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
242 stream = open(encodeFilename(filename), open_mode)
243 return (stream, filename)
244 except (IOError, OSError) as err:
245 if err.errno in (errno.EACCES,):
246 raise
247
248 # In case of error, try to remove win32 forbidden chars
249 alt_filename = os.path.join(*(
250 re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
251 for path_part in os.path.split(filename)
252 ))
253 if alt_filename == filename:
254 raise
255 else:
256 # An exception here should be caught in the caller
257 stream = open(encodeFilename(filename), open_mode)
258 return (stream, alt_filename)
259
260
261def timeconvert(timestr):
262 """Convert RFC 2822 defined time string into system timestamp"""
263 timestamp = None
264 timetuple = email.utils.parsedate_tz(timestr)
265 if timetuple is not None:
266 timestamp = email.utils.mktime_tz(timetuple)
267 return timestamp
268
269
270def sanitize_filename(s, restricted=False, is_id=False):
271 """Sanitizes a string so it could be used as part of a filename.
272 If restricted is set, use a stricter subset of allowed characters.
273 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
274 """
275 def replace_insane(char):
276 if char == '?' or ord(char) < 32 or ord(char) == 127:
277 return ''
278 elif char == '"':
279 return '' if restricted else '\''
280 elif char == ':':
281 return '_-' if restricted else ' -'
282 elif char in '\\/|*<>':
283 return '_'
284 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
285 return '_'
286 if restricted and ord(char) > 127:
287 return '_'
288 return char
289
290 # Handle timestamps
291 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
292 result = ''.join(map(replace_insane, s))
293 if not is_id:
294 while '__' in result:
295 result = result.replace('__', '_')
296 result = result.strip('_')
297 # Common case of "Foreign band name - English song title"
298 if restricted and result.startswith('-_'):
299 result = result[2:]
300 if not result:
301 result = '_'
302 return result
303
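# Illustrative examples (made-up titles). In the default mode only characters
# that are unsafe on common filesystems are touched; in restricted mode the
# result is plain ASCII with underscores:
#
#     sanitize_filename('Brüno: 12:30')                   # -> 'Brüno - 12_30'
#     sanitize_filename('Brüno: 12:30', restricted=True)  # -> 'Br_no_-_12_30'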
304
305def orderedSet(iterable):
306 """ Remove all duplicates from the input iterable """
307 res = []
308 for el in iterable:
309 if el not in res:
310 res.append(el)
311 return res
312
313
314def _htmlentity_transform(entity):
315 """Transforms an HTML entity to a character."""
316 # Known non-numeric HTML entity
317 if entity in compat_html_entities.name2codepoint:
318 return compat_chr(compat_html_entities.name2codepoint[entity])
319
320 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
321 if mobj is not None:
322 numstr = mobj.group(1)
323 if numstr.startswith('x'):
324 base = 16
325 numstr = '0%s' % numstr
326 else:
327 base = 10
328 return compat_chr(int(numstr, base))
329
330 # Unknown entity in name, return its literal representation
331 return ('&%s;' % entity)
332
333
334def unescapeHTML(s):
335 if s is None:
336 return None
337 assert type(s) == compat_str
338
339 return re.sub(
340 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
341
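# Example (illustrative strings): named, decimal and hexadecimal entities are
# all resolved; unknown entities are left as-is.
#
#     unescapeHTML('Tom &amp; Jerry &#39;08 &#x2F; trailer')
#     # -> "Tom & Jerry '08 / trailer"
#     unescapeHTML('&bogus;')   # -> '&bogus;'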
342
343def encodeFilename(s, for_subprocess=False):
344 """
345 @param s The name of the file
346 """
347
348 assert type(s) == compat_str
349
350 # Python 3 has a Unicode API
351 if sys.version_info >= (3, 0):
352 return s
353
354 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
355 # Pass '' directly to use Unicode APIs on Windows 2000 and up
356 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
357 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
358 if not for_subprocess:
359 return s
360 else:
361 # For subprocess calls, encode with locale encoding
362 # Refer to http://stackoverflow.com/a/9951851/35070
363 encoding = preferredencoding()
364 else:
365 encoding = sys.getfilesystemencoding()
366 if encoding is None:
367 encoding = 'utf-8'
368 return s.encode(encoding, 'ignore')
369
370
371def encodeArgument(s):
372 if not isinstance(s, compat_str):
373 # Legacy code that uses byte strings
374 # Uncomment the following line after fixing all post processors
375 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
376 s = s.decode('ascii')
377 return encodeFilename(s, True)
378
379
380def decodeOption(optval):
381 if optval is None:
382 return optval
383 if isinstance(optval, bytes):
384 optval = optval.decode(preferredencoding())
385
386 assert isinstance(optval, compat_str)
387 return optval
388
389
390def formatSeconds(secs):
391 if secs > 3600:
392 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
393 elif secs > 60:
394 return '%d:%02d' % (secs // 60, secs % 60)
395 else:
396 return '%d' % secs
397
398
399def make_HTTPS_handler(params, **kwargs):
400 opts_no_check_certificate = params.get('nocheckcertificate', False)
401 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
402 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
403 if opts_no_check_certificate:
404 context.check_hostname = False
405 context.verify_mode = ssl.CERT_NONE
406 try:
407 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
408 except TypeError:
409 # Python 2.7.8
410 # (create_default_context present but HTTPSHandler has no context=)
411 pass
412
413 if sys.version_info < (3, 2):
414 return YoutubeDLHTTPSHandler(params, **kwargs)
415 else: # Python < 3.4
416 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
417 context.verify_mode = (ssl.CERT_NONE
418 if opts_no_check_certificate
419 else ssl.CERT_REQUIRED)
420 context.set_default_verify_paths()
421 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
422
423
424class ExtractorError(Exception):
425 """Error during info extraction."""
426
427 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
428 """ tb, if given, is the original traceback (so that it can be printed out).
429 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
430 """
431
432 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
433 expected = True
434 if video_id is not None:
435 msg = video_id + ': ' + msg
436 if cause:
437 msg += ' (caused by %r)' % cause
438 if not expected:
439 if ytdl_is_updateable():
440 update_cmd = 'type youtube-dl -U to update'
441 else:
442 update_cmd = 'see https://yt-dl.org/update on how to update'
443 msg += '; please report this issue on https://yt-dl.org/bug .'
444 msg += ' Make sure you are using the latest version; %s.' % update_cmd
445 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
446 super(ExtractorError, self).__init__(msg)
447
448 self.traceback = tb
449 self.exc_info = sys.exc_info() # preserve original exception
450 self.cause = cause
451 self.video_id = video_id
452
453 def format_traceback(self):
454 if self.traceback is None:
455 return None
456 return ''.join(traceback.format_tb(self.traceback))
457
458
459class UnsupportedError(ExtractorError):
460 def __init__(self, url):
461 super(UnsupportedError, self).__init__(
462 'Unsupported URL: %s' % url, expected=True)
463 self.url = url
464
465
466class RegexNotFoundError(ExtractorError):
467 """Error when a regex didn't match"""
468 pass
469
470
471class DownloadError(Exception):
472 """Download Error exception.
473
474 This exception may be thrown by FileDownloader objects if they are not
475 configured to continue on errors. They will contain the appropriate
476 error message.
477 """
478
479 def __init__(self, msg, exc_info=None):
480 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
481 super(DownloadError, self).__init__(msg)
482 self.exc_info = exc_info
483
484
485class SameFileError(Exception):
486 """Same File exception.
487
488 This exception will be thrown by FileDownloader objects if they detect
489 multiple files would have to be downloaded to the same file on disk.
490 """
491 pass
492
493
494class PostProcessingError(Exception):
495 """Post Processing exception.
496
497 This exception may be raised by PostProcessor's .run() method to
498 indicate an error in the postprocessing task.
499 """
500
501 def __init__(self, msg):
502 self.msg = msg
503
504
505class MaxDownloadsReached(Exception):
506 """ --max-downloads limit has been reached. """
507 pass
508
509
510class UnavailableVideoError(Exception):
511 """Unavailable Format exception.
512
513 This exception will be thrown when a video is requested
514 in a format that is not available for that video.
515 """
516 pass
517
518
519class ContentTooShortError(Exception):
520 """Content Too Short exception.
521
522 This exception may be raised by FileDownloader objects when a file they
523 download is too small for what the server announced first, indicating
524 the connection was probably interrupted.
525 """
526 # Both in bytes
527 downloaded = None
528 expected = None
529
530 def __init__(self, downloaded, expected):
531 self.downloaded = downloaded
532 self.expected = expected
533
534
535def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
536 hc = http_class(*args, **kwargs)
537 source_address = ydl_handler._params.get('source_address')
538 if source_address is not None:
539 sa = (source_address, 0)
540 if hasattr(hc, 'source_address'): # Python 2.7+
541 hc.source_address = sa
542 else: # Python 2.6
543 def _hc_connect(self, *args, **kwargs):
544 sock = compat_socket_create_connection(
545 (self.host, self.port), self.timeout, sa)
546 if is_https:
547 self.sock = ssl.wrap_socket(
548 sock, self.key_file, self.cert_file,
549 ssl_version=ssl.PROTOCOL_TLSv1)
550 else:
551 self.sock = sock
552 hc.connect = functools.partial(_hc_connect, hc)
553
554 return hc
555
556
557class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
558 """Handler for HTTP requests and responses.
559
560 This class, when installed with an OpenerDirector, automatically adds
561 the standard headers to every HTTP request and handles gzipped and
562 deflated responses from web servers. If compression is to be avoided in
563 a particular request, the original request in the program code only has
564 to include the HTTP header "Youtubedl-No-Compression", which will be
565 removed before making the real request.
566
567 Part of this code was copied from:
568
569 http://techknack.net/python-urllib2-handlers/
570
571 Andrew Rowls, the author of that code, agreed to release it to the
572 public domain.
573 """
574
575 def __init__(self, params, *args, **kwargs):
576 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
577 self._params = params
578
579 def http_open(self, req):
580 return self.do_open(functools.partial(
581 _create_http_connection, self, compat_http_client.HTTPConnection, False),
582 req)
583
584 @staticmethod
585 def deflate(data):
586 try:
587 return zlib.decompress(data, -zlib.MAX_WBITS)
588 except zlib.error:
589 return zlib.decompress(data)
590
591 @staticmethod
592 def addinfourl_wrapper(stream, headers, url, code):
593 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
594 return compat_urllib_request.addinfourl(stream, headers, url, code)
595 ret = compat_urllib_request.addinfourl(stream, headers, url)
596 ret.code = code
597 return ret
598
599 def http_request(self, req):
600 for h, v in std_headers.items():
601 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
602 # The dict keys are capitalized by urllib because of this bug
603 if h.capitalize() not in req.headers:
604 req.add_header(h, v)
605 if 'Youtubedl-no-compression' in req.headers:
606 if 'Accept-encoding' in req.headers:
607 del req.headers['Accept-encoding']
608 del req.headers['Youtubedl-no-compression']
609 if 'Youtubedl-user-agent' in req.headers:
610 if 'User-agent' in req.headers:
611 del req.headers['User-agent']
612 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
613 del req.headers['Youtubedl-user-agent']
614
615 if sys.version_info < (2, 7) and '#' in req.get_full_url():
616 # Python 2.6 is brain-dead when it comes to fragments
617 req._Request__original = req._Request__original.partition('#')[0]
618 req._Request__r_type = req._Request__r_type.partition('#')[0]
619
620 return req
621
622 def http_response(self, req, resp):
623 old_resp = resp
624 # gzip
625 if resp.headers.get('Content-encoding', '') == 'gzip':
626 content = resp.read()
627 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
628 try:
629 uncompressed = io.BytesIO(gz.read())
630 except IOError as original_ioerror:
631 # There may be junk at the end of the file
632 # See http://stackoverflow.com/q/4928560/35070 for details
633 for i in range(1, 1024):
634 try:
635 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
636 uncompressed = io.BytesIO(gz.read())
637 except IOError:
638 continue
639 break
640 else:
641 raise original_ioerror
642 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
643 resp.msg = old_resp.msg
644 # deflate
645 if resp.headers.get('Content-encoding', '') == 'deflate':
646 gz = io.BytesIO(self.deflate(resp.read()))
647 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
648 resp.msg = old_resp.msg
649 return resp
650
651 https_request = http_request
652 https_response = http_response
653
654
655class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
656 def __init__(self, params, https_conn_class=None, *args, **kwargs):
657 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
658 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
659 self._params = params
660
661 def https_open(self, req):
662 return self.do_open(functools.partial(
663 _create_http_connection, self, self._https_conn_class, True),
664 req)
665
666
667def parse_iso8601(date_str, delimiter='T'):
668 """ Return a UNIX timestamp from the given date """
669
670 if date_str is None:
671 return None
672
673 m = re.search(
674 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
675 date_str)
676 if not m:
677 timezone = datetime.timedelta()
678 else:
679 date_str = date_str[:-len(m.group(0))]
680 if not m.group('sign'):
681 timezone = datetime.timedelta()
682 else:
683 sign = 1 if m.group('sign') == '+' else -1
684 timezone = datetime.timedelta(
685 hours=sign * int(m.group('hours')),
686 minutes=sign * int(m.group('minutes')))
687 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
688 dt = datetime.datetime.strptime(date_str, date_format) - timezone
689 return calendar.timegm(dt.timetuple())
690
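# Example (illustrative timestamps): the optional timezone offset is applied
# before converting to a UTC UNIX timestamp.
#
#     parse_iso8601('2014-12-07T13:30:00+0100')   # -> 1417955400
#     parse_iso8601('2014-12-07T12:30:00Z')       # -> 1417955400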
691
692def unified_strdate(date_str, day_first=True):
693 """Return a string with the date in the format YYYYMMDD"""
694
695 if date_str is None:
696 return None
697 upload_date = None
698 # Replace commas
699 date_str = date_str.replace(',', ' ')
700 # %z (UTC offset) is only supported in python>=3.2
701 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
702 # Remove AM/PM + timezone
703 date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
704
705 format_expressions = [
706 '%d %B %Y',
707 '%d %b %Y',
708 '%B %d %Y',
709 '%b %d %Y',
710 '%b %dst %Y %I:%M%p',
711 '%b %dnd %Y %I:%M%p',
712 '%b %dth %Y %I:%M%p',
713 '%Y %m %d',
714 '%Y-%m-%d',
715 '%Y/%m/%d',
716 '%Y/%m/%d %H:%M:%S',
717 '%Y-%m-%d %H:%M:%S',
718 '%Y-%m-%d %H:%M:%S.%f',
719 '%d.%m.%Y %H:%M',
720 '%d.%m.%Y %H.%M',
721 '%Y-%m-%dT%H:%M:%SZ',
722 '%Y-%m-%dT%H:%M:%S.%fZ',
723 '%Y-%m-%dT%H:%M:%S.%f0Z',
724 '%Y-%m-%dT%H:%M:%S',
725 '%Y-%m-%dT%H:%M:%S.%f',
726 '%Y-%m-%dT%H:%M',
727 ]
728 if day_first:
729 format_expressions.extend([
730 '%d.%m.%Y',
731 '%d/%m/%Y',
732 '%d/%m/%y',
733 '%d/%m/%Y %H:%M:%S',
734 ])
735 else:
736 format_expressions.extend([
737 '%m.%d.%Y',
738 '%m/%d/%Y',
739 '%m/%d/%y',
740 '%m/%d/%Y %H:%M:%S',
741 ])
742 for expression in format_expressions:
743 try:
744 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
745 except ValueError:
746 pass
747 if upload_date is None:
748 timetuple = email.utils.parsedate_tz(date_str)
749 if timetuple:
750 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
751 return upload_date
752
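# Examples (illustrative dates): the day_first flag decides how ambiguous
# numeric dates are read.
#
#     unified_strdate('December 21, 2010')          # -> '20101221'
#     unified_strdate('8/7/2009')                   # -> '20090708'
#     unified_strdate('8/7/2009', day_first=False)  # -> '20090807'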
753
754def determine_ext(url, default_ext='unknown_video'):
755 if url is None:
756 return default_ext
757 guess = url.partition('?')[0].rpartition('.')[2]
758 if re.match(r'^[A-Za-z0-9]+$', guess):
759 return guess
760 else:
761 return default_ext
762
763
764def subtitles_filename(filename, sub_lang, sub_format):
765 return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
766
767
768def date_from_str(date_str):
769 """
770 Return a datetime object from a string in the format YYYYMMDD or
771 (now|today)[+-][0-9](day|week|month|year)(s)?"""
772 today = datetime.date.today()
773 if date_str in ('now', 'today'):
774 return today
775 if date_str == 'yesterday':
776 return today - datetime.timedelta(days=1)
777 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
778 if match is not None:
779 sign = match.group('sign')
780 time = int(match.group('time'))
781 if sign == '-':
782 time = -time
783 unit = match.group('unit')
784 # A bad approximation?
785 if unit == 'month':
786 unit = 'day'
787 time *= 30
788 elif unit == 'year':
789 unit = 'day'
790 time *= 365
791 unit += 's'
792 delta = datetime.timedelta(**{unit: time})
793 return today + delta
794 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
795
796
797def hyphenate_date(date_str):
798 """
799 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
800 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
801 if match is not None:
802 return '-'.join(match.groups())
803 else:
804 return date_str
805
806
807class DateRange(object):
808 """Represents a time interval between two dates"""
809
810 def __init__(self, start=None, end=None):
811 """start and end must be strings in the format accepted by date"""
812 if start is not None:
813 self.start = date_from_str(start)
814 else:
815 self.start = datetime.datetime.min.date()
816 if end is not None:
817 self.end = date_from_str(end)
818 else:
819 self.end = datetime.datetime.max.date()
820 if self.start > self.end:
821 raise ValueError('Date range "%s": the start date must be before the end date' % self)
822
823 @classmethod
824 def day(cls, day):
825 """Returns a range that only contains the given day"""
826 return cls(day, day)
827
828 def __contains__(self, date):
829 """Check if the date is in the range"""
830 if not isinstance(date, datetime.date):
831 date = date_from_str(date)
832 return self.start <= date <= self.end
833
834 def __str__(self):
835 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
836
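# Example (illustrative): relative dates and ranges as used by the --date,
# --dateafter and --datebefore options.
#
#     date_from_str('now-1week')   # -> today's date minus 7 days
#     '20141215' in DateRange('20141201', '20141231')   # -> True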
837
838def platform_name():
839 """ Returns the platform name as a compat_str """
840 res = platform.platform()
841 if isinstance(res, bytes):
842 res = res.decode(preferredencoding())
843
844 assert isinstance(res, compat_str)
845 return res
846
847
848def _windows_write_string(s, out):
849 """ Returns True if the string was written using special methods,
850 False if it has yet to be written out."""
851 # Adapted from http://stackoverflow.com/a/3259271/35070
852
853 import ctypes
854 import ctypes.wintypes
855
856 WIN_OUTPUT_IDS = {
857 1: -11,
858 2: -12,
859 }
860
861 try:
862 fileno = out.fileno()
863 except AttributeError:
864 # If the output stream doesn't have a fileno, it's virtual
865 return False
866 except io.UnsupportedOperation:
867 # Some strange Windows pseudo files?
868 return False
869 if fileno not in WIN_OUTPUT_IDS:
870 return False
871
872 GetStdHandle = ctypes.WINFUNCTYPE(
873 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
874 (b"GetStdHandle", ctypes.windll.kernel32))
875 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
876
877 WriteConsoleW = ctypes.WINFUNCTYPE(
878 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
879 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
880 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
881 written = ctypes.wintypes.DWORD(0)
882
883 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
884 FILE_TYPE_CHAR = 0x0002
885 FILE_TYPE_REMOTE = 0x8000
886 GetConsoleMode = ctypes.WINFUNCTYPE(
887 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
888 ctypes.POINTER(ctypes.wintypes.DWORD))(
889 (b"GetConsoleMode", ctypes.windll.kernel32))
890 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
891
892 def not_a_console(handle):
893 if handle == INVALID_HANDLE_VALUE or handle is None:
894 return True
895 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
896 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
897
898 if not_a_console(h):
899 return False
900
901 def next_nonbmp_pos(s):
902 try:
903 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
904 except StopIteration:
905 return len(s)
906
907 while s:
908 count = min(next_nonbmp_pos(s), 1024)
909
910 ret = WriteConsoleW(
911 h, s, count if count else 2, ctypes.byref(written), None)
912 if ret == 0:
913 raise OSError('Failed to write string')
914 if not count: # We just wrote a non-BMP character
915 assert written.value == 2
916 s = s[1:]
917 else:
918 assert written.value > 0
919 s = s[written.value:]
920 return True
921
922
923def write_string(s, out=None, encoding=None):
924 if out is None:
925 out = sys.stderr
926 assert type(s) == compat_str
927
928 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
929 if _windows_write_string(s, out):
930 return
931
932 if ('b' in getattr(out, 'mode', '') or
933 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
934 byt = s.encode(encoding or preferredencoding(), 'ignore')
935 out.write(byt)
936 elif hasattr(out, 'buffer'):
937 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
938 byt = s.encode(enc, 'ignore')
939 out.buffer.write(byt)
940 else:
941 out.write(s)
942 out.flush()
943
944
945def bytes_to_intlist(bs):
946 if not bs:
947 return []
948 if isinstance(bs[0], int): # Python 3
949 return list(bs)
950 else:
951 return [ord(c) for c in bs]
952
953
954def intlist_to_bytes(xs):
955 if not xs:
956 return b''
957 return struct_pack('%dB' % len(xs), *xs)
958
959
960# Cross-platform file locking
961if sys.platform == 'win32':
962 import ctypes.wintypes
963 import msvcrt
964
965 class OVERLAPPED(ctypes.Structure):
966 _fields_ = [
967 ('Internal', ctypes.wintypes.LPVOID),
968 ('InternalHigh', ctypes.wintypes.LPVOID),
969 ('Offset', ctypes.wintypes.DWORD),
970 ('OffsetHigh', ctypes.wintypes.DWORD),
971 ('hEvent', ctypes.wintypes.HANDLE),
972 ]
973
974 kernel32 = ctypes.windll.kernel32
975 LockFileEx = kernel32.LockFileEx
976 LockFileEx.argtypes = [
977 ctypes.wintypes.HANDLE, # hFile
978 ctypes.wintypes.DWORD, # dwFlags
979 ctypes.wintypes.DWORD, # dwReserved
980 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
981 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
982 ctypes.POINTER(OVERLAPPED) # Overlapped
983 ]
984 LockFileEx.restype = ctypes.wintypes.BOOL
985 UnlockFileEx = kernel32.UnlockFileEx
986 UnlockFileEx.argtypes = [
987 ctypes.wintypes.HANDLE, # hFile
988 ctypes.wintypes.DWORD, # dwReserved
989 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
990 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
991 ctypes.POINTER(OVERLAPPED) # Overlapped
992 ]
993 UnlockFileEx.restype = ctypes.wintypes.BOOL
994 whole_low = 0xffffffff
995 whole_high = 0x7fffffff
996
997 def _lock_file(f, exclusive):
998 overlapped = OVERLAPPED()
999 overlapped.Offset = 0
1000 overlapped.OffsetHigh = 0
1001 overlapped.hEvent = 0
1002 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1003 handle = msvcrt.get_osfhandle(f.fileno())
1004 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1005 whole_low, whole_high, f._lock_file_overlapped_p):
1006 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1007
1008 def _unlock_file(f):
1009 assert f._lock_file_overlapped_p
1010 handle = msvcrt.get_osfhandle(f.fileno())
1011 if not UnlockFileEx(handle, 0,
1012 whole_low, whole_high, f._lock_file_overlapped_p):
1013 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1014
1015else:
1016 import fcntl
1017
1018 def _lock_file(f, exclusive):
1019 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1020
1021 def _unlock_file(f):
1022 fcntl.flock(f, fcntl.LOCK_UN)
1023
1024
1025class locked_file(object):
1026 def __init__(self, filename, mode, encoding=None):
1027 assert mode in ['r', 'a', 'w']
1028 self.f = io.open(filename, mode, encoding=encoding)
1029 self.mode = mode
1030
1031 def __enter__(self):
1032 exclusive = self.mode != 'r'
1033 try:
1034 _lock_file(self.f, exclusive)
1035 except IOError:
1036 self.f.close()
1037 raise
1038 return self
1039
1040 def __exit__(self, etype, value, traceback):
1041 try:
1042 _unlock_file(self.f)
1043 finally:
1044 self.f.close()
1045
1046 def __iter__(self):
1047 return iter(self.f)
1048
1049 def write(self, *args):
1050 return self.f.write(*args)
1051
1052 def read(self, *args):
1053 return self.f.read(*args)
1054
1055
1056def get_filesystem_encoding():
1057 encoding = sys.getfilesystemencoding()
1058 return encoding if encoding is not None else 'utf-8'
1059
1060
1061def shell_quote(args):
1062 quoted_args = []
1063 encoding = get_filesystem_encoding()
1064 for a in args:
1065 if isinstance(a, bytes):
1066 # We may get a filename encoded with 'encodeFilename'
1067 a = a.decode(encoding)
1068 quoted_args.append(pipes.quote(a))
1069 return ' '.join(quoted_args)
1070
1071
1072def takewhile_inclusive(pred, seq):
1073 """ Like itertools.takewhile, but include the latest evaluated element
1074 (i.e. the first element for which pred(e) is false) """
1075 for e in seq:
1076 yield e
1077 if not pred(e):
1078 return
1079
1080
1081def smuggle_url(url, data):
1082 """ Pass additional data in a URL for internal use. """
1083
1084 sdata = compat_urllib_parse.urlencode(
1085 {'__youtubedl_smuggle': json.dumps(data)})
1086 return url + '#' + sdata
1087
1088
1089def unsmuggle_url(smug_url, default=None):
1090 if '#__youtubedl_smuggle' not in smug_url:
1091 return smug_url, default
1092 url, _, sdata = smug_url.rpartition('#')
1093 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1094 data = json.loads(jsond)
1095 return url, data
1096
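# Example (illustrative payload key): the data survives a round trip through
# the URL fragment and the original URL is recovered unchanged.
#
#     url = smuggle_url('http://example.com/video', {'force_videoid': '42'})
#     unsmuggle_url(url)   # -> ('http://example.com/video', {'force_videoid': '42'})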
1097
1098def format_bytes(bytes):
1099 if bytes is None:
1100 return 'N/A'
1101 if type(bytes) is str:
1102 bytes = float(bytes)
1103 if bytes == 0.0:
1104 exponent = 0
1105 else:
1106 exponent = int(math.log(bytes, 1024.0))
1107 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1108 converted = float(bytes) / float(1024 ** exponent)
1109 return '%.2f%s' % (converted, suffix)
1110
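# Example values (illustrative): sizes are rendered with binary (1024-based)
# units and two decimals.
#
#     format_bytes(0)        # -> '0.00B'
#     format_bytes(1536)     # -> '1.50KiB'
#     format_bytes(10 ** 9)  # -> '953.67MiB'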
1111
1112def parse_filesize(s):
1113 if s is None:
1114 return None
1115
1116 # The lower-case forms are of course incorrect and unofficial,
1117 # but we support those too
1118 _UNIT_TABLE = {
1119 'B': 1,
1120 'b': 1,
1121 'KiB': 1024,
1122 'KB': 1000,
1123 'kB': 1024,
1124 'Kb': 1000,
1125 'MiB': 1024 ** 2,
1126 'MB': 1000 ** 2,
1127 'mB': 1024 ** 2,
1128 'Mb': 1000 ** 2,
1129 'GiB': 1024 ** 3,
1130 'GB': 1000 ** 3,
1131 'gB': 1024 ** 3,
1132 'Gb': 1000 ** 3,
1133 'TiB': 1024 ** 4,
1134 'TB': 1000 ** 4,
1135 'tB': 1024 ** 4,
1136 'Tb': 1000 ** 4,
1137 'PiB': 1024 ** 5,
1138 'PB': 1000 ** 5,
1139 'pB': 1024 ** 5,
1140 'Pb': 1000 ** 5,
1141 'EiB': 1024 ** 6,
1142 'EB': 1000 ** 6,
1143 'eB': 1024 ** 6,
1144 'Eb': 1000 ** 6,
1145 'ZiB': 1024 ** 7,
1146 'ZB': 1000 ** 7,
1147 'zB': 1024 ** 7,
1148 'Zb': 1000 ** 7,
1149 'YiB': 1024 ** 8,
1150 'YB': 1000 ** 8,
1151 'yB': 1024 ** 8,
1152 'Yb': 1000 ** 8,
1153 }
1154
1155 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1156 m = re.match(
1157 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1158 if not m:
1159 return None
1160
1161 num_str = m.group('num').replace(',', '.')
1162 mult = _UNIT_TABLE[m.group('unit')]
1163 return int(float(num_str) * mult)
1164
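# Example values (illustrative): decimal units use powers of 1000, binary
# units powers of 1024; a comma is accepted as decimal separator.
#
#     parse_filesize('20 MB')    # -> 20000000
#     parse_filesize('5 KiB')    # -> 5120
#     parse_filesize('1,5 GB')   # -> 1500000000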
1165
1166def get_term_width():
1167 columns = compat_getenv('COLUMNS', None)
1168 if columns:
1169 return int(columns)
1170
1171 try:
1172 sp = subprocess.Popen(
1173 ['stty', 'size'],
1174 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1175 out, err = sp.communicate()
1176 return int(out.split()[1])
1177 except:
1178 pass
1179 return None
1180
1181
1182def month_by_name(name):
1183 """ Return the number of a month by (locale-independently) English name """
1184
1185 ENGLISH_NAMES = [
1186 'January', 'February', 'March', 'April', 'May', 'June',
1187 'July', 'August', 'September', 'October', 'November', 'December']
1188 try:
1189 return ENGLISH_NAMES.index(name) + 1
1190 except ValueError:
1191 return None
1192
1193
1194def fix_xml_ampersands(xml_str):
1195 """Replace all the '&' by '&amp;' in XML"""
1196 return re.sub(
1197 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1198 '&amp;',
1199 xml_str)
1200
1201
1202def setproctitle(title):
1203 assert isinstance(title, compat_str)
1204 try:
1205 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1206 except OSError:
1207 return
1208 title_bytes = title.encode('utf-8')
1209 buf = ctypes.create_string_buffer(len(title_bytes))
1210 buf.value = title_bytes
1211 try:
1212 libc.prctl(15, buf, 0, 0, 0)
1213 except AttributeError:
1214 return # Strange libc, just skip this
1215
1216
1217def remove_start(s, start):
1218 if s.startswith(start):
1219 return s[len(start):]
1220 return s
1221
1222
1223def remove_end(s, end):
1224 if s.endswith(end):
1225 return s[:-len(end)]
1226 return s
1227
1228
1229def url_basename(url):
1230 path = compat_urlparse.urlparse(url).path
1231 return path.strip('/').split('/')[-1]
1232
1233
1234class HEADRequest(compat_urllib_request.Request):
1235 def get_method(self):
1236 return "HEAD"
1237
1238
1239def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1240 if get_attr:
1241 if v is not None:
1242 v = getattr(v, get_attr, None)
1243 if v == '':
1244 v = None
1245 return default if v is None else (int(v) * invscale // scale)
1246
1247
1248def str_or_none(v, default=None):
1249 return default if v is None else compat_str(v)
1250
1251
1252def str_to_int(int_str):
1253 """ A more relaxed version of int_or_none """
1254 if int_str is None:
1255 return None
1256 int_str = re.sub(r'[,\.\+]', '', int_str)
1257 return int(int_str)
1258
1259
1260def float_or_none(v, scale=1, invscale=1, default=None):
1261 return default if v is None else (float(v) * invscale / scale)
1262
1263
1264def parse_duration(s):
1265 if not isinstance(s, basestring if sys.version_info < (3, 0) else compat_str):
1266 return None
1267
1268 s = s.strip()
1269
1270 m = re.match(
1271 r'''(?ix)(?:P?T)?
1272 (?:
1273 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1274 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1275
1276 (?:
1277 (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
1278 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1279 )?
1280 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1281 )$''', s)
1282 if not m:
1283 return None
1284 res = 0
1285 if m.group('only_mins'):
1286 return float_or_none(m.group('only_mins'), invscale=60)
1287 if m.group('only_hours'):
1288 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1289 if m.group('secs'):
1290 res += int(m.group('secs'))
1291 if m.group('mins'):
1292 res += int(m.group('mins')) * 60
1293 if m.group('hours'):
1294 res += int(m.group('hours')) * 60 * 60
1295 if m.group('ms'):
1296 res += float(m.group('ms'))
1297 return res
1298
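# Example values (illustrative): both colon-separated and spelled-out
# durations are understood; the result is in seconds.
#
#     parse_duration('1:30:05')     # -> 5405
#     parse_duration('9 minutes')   # -> 540.0
#     parse_duration('3 min 10s')   # -> 190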
1299
1300def prepend_extension(filename, ext):
1301 name, real_ext = os.path.splitext(filename)
1302 return '{0}.{1}{2}'.format(name, ext, real_ext)
1303
1304
1305def check_executable(exe, args=[]):
1306 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1307 args can be a list of arguments for a short output (like -version) """
1308 try:
1309 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1310 except OSError:
1311 return False
1312 return exe
1313
1314
1315def get_exe_version(exe, args=['--version'],
1316 version_re=None, unrecognized='present'):
1317 """ Returns the version of the specified executable,
1318 or False if the executable is not present """
1319 try:
1320 out, _ = subprocess.Popen(
1321 [exe] + args,
1322 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1323 except OSError:
1324 return False
1325 if isinstance(out, bytes): # Python 2.x
1326 out = out.decode('ascii', 'ignore')
1327 return detect_exe_version(out, version_re, unrecognized)
1328
1329
1330def detect_exe_version(output, version_re=None, unrecognized='present'):
1331 assert isinstance(output, compat_str)
1332 if version_re is None:
1333 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1334 m = re.search(version_re, output)
1335 if m:
1336 return m.group(1)
1337 else:
1338 return unrecognized
1339
1340
1341class PagedList(object):
1342 def __len__(self):
1343 # This is only useful for tests
1344 return len(self.getslice())
1345
1346
1347class OnDemandPagedList(PagedList):
1348 def __init__(self, pagefunc, pagesize):
1349 self._pagefunc = pagefunc
1350 self._pagesize = pagesize
1351
1352 def getslice(self, start=0, end=None):
1353 res = []
1354 for pagenum in itertools.count(start // self._pagesize):
1355 firstid = pagenum * self._pagesize
1356 nextfirstid = pagenum * self._pagesize + self._pagesize
1357 if start >= nextfirstid:
1358 continue
1359
1360 page_results = list(self._pagefunc(pagenum))
1361
1362 startv = (
1363 start % self._pagesize
1364 if firstid <= start < nextfirstid
1365 else 0)
1366
1367 endv = (
1368 ((end - 1) % self._pagesize) + 1
1369 if (end is not None and firstid <= end <= nextfirstid)
1370 else None)
1371
1372 if startv != 0 or endv is not None:
1373 page_results = page_results[startv:endv]
1374 res.extend(page_results)
1375
1376 # A little optimization: if the current page is not "full", i.e. does
1377 # not contain page_size videos, then we can assume that this page
1378 # is the last one - there are no more ids on further pages,
1379 # so there is no need to query again.
1380 if len(page_results) + startv < self._pagesize:
1381 break
1382
1383 # If we got the whole page, but the next page is not interesting,
1384 # break out early as well
1385 if end == nextfirstid:
1386 break
1387 return res
1388
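# Usage sketch (illustrative page function): pages are fetched lazily and only
# as far as the requested slice needs.
#
#     fetch_page = lambda n: list(range(10))[n * 2:(n + 1) * 2]  # made-up source
#     OnDemandPagedList(fetch_page, 2).getslice(3, 7)   # -> [3, 4, 5, 6]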
1389
1390class InAdvancePagedList(PagedList):
1391 def __init__(self, pagefunc, pagecount, pagesize):
1392 self._pagefunc = pagefunc
1393 self._pagecount = pagecount
1394 self._pagesize = pagesize
1395
1396 def getslice(self, start=0, end=None):
1397 res = []
1398 start_page = start // self._pagesize
1399 end_page = (
1400 self._pagecount if end is None else (end // self._pagesize + 1))
1401 skip_elems = start - start_page * self._pagesize
1402 only_more = None if end is None else end - start
1403 for pagenum in range(start_page, end_page):
1404 page = list(self._pagefunc(pagenum))
1405 if skip_elems:
1406 page = page[skip_elems:]
1407 skip_elems = None
1408 if only_more is not None:
1409 if len(page) < only_more:
1410 only_more -= len(page)
1411 else:
1412 page = page[:only_more]
1413 res.extend(page)
1414 break
1415 res.extend(page)
1416 return res
1417
1418
1419def uppercase_escape(s):
1420 unicode_escape = codecs.getdecoder('unicode_escape')
1421 return re.sub(
1422 r'\\U[0-9a-fA-F]{8}',
1423 lambda m: unicode_escape(m.group(0))[0],
1424 s)
1425
1426
1427def escape_rfc3986(s):
1428 """Escape non-ASCII characters as suggested by RFC 3986"""
1429 if sys.version_info < (3, 0) and isinstance(s, unicode):
1430 s = s.encode('utf-8')
1431 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1432
1433
1434def escape_url(url):
1435 """Escape URL as suggested by RFC 3986"""
1436 url_parsed = compat_urllib_parse_urlparse(url)
1437 return url_parsed._replace(
1438 path=escape_rfc3986(url_parsed.path),
1439 params=escape_rfc3986(url_parsed.params),
1440 query=escape_rfc3986(url_parsed.query),
1441 fragment=escape_rfc3986(url_parsed.fragment)
1442 ).geturl()
1443
1444try:
1445 struct.pack('!I', 0)
1446except TypeError:
1447 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1448 def struct_pack(spec, *args):
1449 if isinstance(spec, compat_str):
1450 spec = spec.encode('ascii')
1451 return struct.pack(spec, *args)
1452
1453 def struct_unpack(spec, *args):
1454 if isinstance(spec, compat_str):
1455 spec = spec.encode('ascii')
1456 return struct.unpack(spec, *args)
1457else:
1458 struct_pack = struct.pack
1459 struct_unpack = struct.unpack
1460
1461
1462def read_batch_urls(batch_fd):
1463 def fixup(url):
1464 if not isinstance(url, compat_str):
1465 url = url.decode('utf-8', 'replace')
1466 BOM_UTF8 = '\xef\xbb\xbf'
1467 if url.startswith(BOM_UTF8):
1468 url = url[len(BOM_UTF8):]
1469 url = url.strip()
1470 if url.startswith(('#', ';', ']')):
1471 return False
1472 return url
1473
1474 with contextlib.closing(batch_fd) as fd:
1475 return [url for url in map(fixup, fd) if url]
1476
1477
1478def urlencode_postdata(*args, **kargs):
1479 return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1480
1481
1482try:
1483 etree_iter = xml.etree.ElementTree.Element.iter
1484except AttributeError: # Python <=2.6
1485 etree_iter = lambda n: n.findall('.//*')
1486
1487
1488def parse_xml(s):
1489 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1490 def doctype(self, name, pubid, system):
1491 pass # Ignore doctypes
1492
1493 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1494 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1495 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1496 # Fix up XML parser in Python 2.x
1497 if sys.version_info < (3, 0):
1498 for n in etree_iter(tree):
1499 if n.text is not None:
1500 if not isinstance(n.text, compat_str):
1501 n.text = n.text.decode('utf-8')
1502 return tree
1503
1504
1505US_RATINGS = {
1506 'G': 0,
1507 'PG': 10,
1508 'PG-13': 13,
1509 'R': 16,
1510 'NC': 18,
1511}
1512
1513
1514def parse_age_limit(s):
1515 if s is None:
1516 return None
1517 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1518 return int(m.group('age')) if m else US_RATINGS.get(s, None)
1519
1520
1521def strip_jsonp(code):
1522 return re.sub(
1523 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1524
1525
1526def js_to_json(code):
1527 def fix_kv(m):
1528 v = m.group(0)
1529 if v in ('true', 'false', 'null'):
1530 return v
1531 if v.startswith('"'):
1532 return v
1533 if v.startswith("'"):
1534 v = v[1:-1]
1535 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1536 '\\\\': '\\\\',
1537 "\\'": "'",
1538 '"': '\\"',
1539 }[m.group(0)], v)
1540 return '"%s"' % v
1541
1542 res = re.sub(r'''(?x)
1543 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1544 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1545 [a-zA-Z_][a-zA-Z_0-9]*
1546 ''', fix_kv, code)
1547 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1548 return res
1549
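# Example (illustrative snippet): single-quoted strings and bare keys are
# rewritten so that the result can be fed to json.loads; this is a best-effort
# transformation, not a full JavaScript parser.
#
#     js_to_json("{'abc': true}")   # -> '{"abc": true}'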
1550
1551def qualities(quality_ids):
1552 """ Get a numeric quality value out of a list of possible values """
1553 def q(qid):
1554 try:
1555 return quality_ids.index(qid)
1556 except ValueError:
1557 return -1
1558 return q
1559
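# Example (illustrative format ids): a higher list position means a higher
# preference; unknown ids sort below everything else.
#
#     q = qualities(['small', 'medium', 'hd720'])
#     q('hd720'), q('small'), q('4k')   # -> (2, 0, -1)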
1560
1561DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1562
1563
1564def limit_length(s, length):
1565 """ Add ellipses to overly long strings """
1566 if s is None:
1567 return None
1568 ELLIPSES = '...'
1569 if len(s) > length:
1570 return s[:length - len(ELLIPSES)] + ELLIPSES
1571 return s
1572
1573
1574def version_tuple(v):
1575 return tuple(int(e) for e in re.split(r'[-.]', v))
1576
1577
1578def is_outdated_version(version, limit, assume_new=True):
1579 if not version:
1580 return not assume_new
1581 try:
1582 return version_tuple(version) < version_tuple(limit)
1583 except ValueError:
1584 return not assume_new
1585
1586
1587def ytdl_is_updateable():
1588 """ Returns if youtube-dl can be updated with -U """
1589 from zipimport import zipimporter
1590
1591 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
1592
1593
1594def args_to_str(args):
1595 # Get a short string representation for a subprocess command
1596 return ' '.join(shlex_quote(a) for a in args)
1597
1598
1599def urlhandle_detect_ext(url_handle):
1600 try:
1601 url_handle.headers
1602 getheader = lambda h: url_handle.headers[h]
1603 except AttributeError: # Python < 3
1604 getheader = url_handle.info().getheader
1605
1606 cd = getheader('Content-Disposition')
1607 if cd:
1608 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1609 if m:
1610 e = determine_ext(m.group('filename'), default_ext=None)
1611 if e:
1612 return e
1613
1614 return getheader('Content-Type').split("/")[1]
1615
1616
1617def age_restricted(content_limit, age_limit):
1618 """ Returns True iff the content should be blocked """
1619
1620 if age_limit is None: # No limit set
1621 return False
1622 if content_limit is None:
1623 return False # Content available for everyone
1624 return age_limit < content_limit
1625
1626
1627def is_html(first_bytes):
1628 """ Detect whether a file contains HTML by examining its first bytes. """
1629
1630 BOMS = [
1631 (b'\xef\xbb\xbf', 'utf-8'),
1632 (b'\x00\x00\xfe\xff', 'utf-32-be'),
1633 (b'\xff\xfe\x00\x00', 'utf-32-le'),
1634 (b'\xff\xfe', 'utf-16-le'),
1635 (b'\xfe\xff', 'utf-16-be'),
1636 ]
1637 for bom, enc in BOMS:
1638 if first_bytes.startswith(bom):
1639 s = first_bytes[len(bom):].decode(enc, 'replace')
1640 break
1641 else:
1642 s = first_bytes.decode('utf-8', 'replace')
1643
1644 return re.match(r'^\s*<', s)
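# Example (illustrative byte strings): a BOM, if present, is honoured before
# sniffing for a leading '<'.
#
#     bool(is_html(b'\xef\xbb\xbf  <!DOCTYPE html><html>'))   # -> True
#     bool(is_html(b'{"formats": []}'))                       # -> False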