youtube_dl / extractor / common.py
1from __future__ import unicode_literals
2
3import base64
4import hashlib
5import json
6import netrc
7import os
8import re
9import socket
10import sys
11import time
12import xml.etree.ElementTree
13
14from ..utils import (
15 compat_http_client,
16 compat_urllib_error,
17 compat_urllib_parse_urlparse,
18 compat_str,
19
20 clean_html,
21 compiled_regex_type,
22 ExtractorError,
23 int_or_none,
24 RegexNotFoundError,
25 sanitize_filename,
26 unescapeHTML,
27)
28_NO_DEFAULT = object()
29
30
31class InfoExtractor(object):
32 """Information Extractor class.
33
34 Information extractors are the classes that, given a URL, extract
35 information about the video (or videos) the URL refers to. This
36 information includes the real video URL, the video title, author and
37 others. The information is stored in a dictionary which is then
38 passed to the FileDownloader. The FileDownloader processes this
39 information possibly downloading the video to the file system, among
40 other possible outcomes.
41
42 The dictionaries must include the following fields:
43
44 id: Video identifier.
45 title: Video title, unescaped.
46
47 Additionally, it must contain either a formats entry or a url one:
48
49 formats: A list of dictionaries for each format available, ordered
50 from worst to best quality.
51
52 Potential fields:
53 * url Mandatory. The URL of the video file
54 * ext Will be calculated from url if missing
55 * format A human-readable description of the format
56 ("mp4 container with h264/opus").
57 Calculated from the format_id, width, height,
58 and format_note fields if missing.
59 * format_id A short description of the format
60 ("mp4_h264_opus" or "19").
61 Technically optional, but strongly recommended.
62 * format_note Additional info about the format
63 ("3D" or "DASH video")
64 * width Width of the video, if known
65 * height Height of the video, if known
66 * resolution Textual description of width and height
67 * tbr Average bitrate of audio and video in KBit/s
68 * abr Average audio bitrate in KBit/s
69 * acodec Name of the audio codec in use
70 * asr Audio sampling rate in Hertz
71 * vbr Average video bitrate in KBit/s
72 * vcodec Name of the video codec in use
73 * container Name of the container format
74 * filesize The number of bytes, if known in advance
75 * filesize_approx An estimate for the number of bytes
76 * player_url SWF Player URL (used for rtmpdump).
77 * protocol The protocol that will be used for the actual
78 download, lower-case.
79 "http", "https", "rtsp", "rtmp", "m3u8" or so.
80 * preference Order number of this format. If this field is
81 present and not None, the formats get sorted
82 by this field, regardless of all other values.
83 -1 for default (order by other properties),
84 -2 or smaller for less than default.
85 * quality Order number of the video quality of this
86 format, irrespective of the file format.
87 -1 for default (order by other properties),
88 -2 or smaller for less than default.
89 * http_referer HTTP Referer header value to set.
90 * http_method HTTP method to use for the download.
91 * http_headers A dictionary of additional HTTP headers
92 to add to the request.
93 * http_post_data Additional data to send with a POST
94 request.
95 url: Final video URL.
96 ext: Video filename extension.
97 format: The video format, defaults to ext (used for --get-format)
98 player_url: SWF Player URL (used for rtmpdump).
99
100 The following fields are optional:
101
102 display_id: An alternative identifier for the video, not necessarily
103 unique, but available before title. Typically, id is
104 something like "4234987", title "Dancing naked mole rats",
105 and display_id "dancing-naked-mole-rats"
106 thumbnails: A list of dictionaries, with the following entries:
107 * "url"
108 * "width" (optional, int)
109 * "height" (optional, int)
110 * "resolution" (optional, string "{width}x{height"},
111 deprecated)
112 thumbnail: Full URL to a video thumbnail image.
113 description: One-line video description.
114 uploader: Full name of the video uploader.
115 timestamp: UNIX timestamp of the moment the video became available.
116 upload_date: Video upload date (YYYYMMDD).
117 If not explicitly set, calculated from timestamp.
118 uploader_id: Nickname or id of the video uploader.
119 location: Physical location where the video was filmed.
120 subtitles: The subtitle file contents as a dictionary in the format
121 {language: subtitles}.
122 duration: Length of the video in seconds, as an integer.
123 view_count: How many users have watched the video on the platform.
124 like_count: Number of positive ratings of the video
125 dislike_count: Number of negative ratings of the video
126 comment_count: Number of comments on the video
127 age_limit: Age restriction for the video, as an integer (years)
128 webpage_url: The URL to the video webpage; if given to youtube-dl it
129 should allow getting the same result again. (It will be set
130 by YoutubeDL if it's missing)
131 categories: A list of categories that the video falls in, for example
132 ["Sports", "Berlin"]
133
134 Unless mentioned otherwise, the fields should be Unicode strings.
135
136 Subclasses of this one should re-define the _real_initialize() and
137 _real_extract() methods and define a _VALID_URL regexp.
138 Probably, they should also be added to the list of extractors.
139
140 Finally, the _WORKING attribute should be set to False for broken IEs
141 in order to warn the users and skip the tests.
142 """
143
144 _ready = False
145 _downloader = None
146 _WORKING = True
147
148 def __init__(self, downloader=None):
149 """Constructor. Receives an optional downloader."""
150 self._ready = False
151 self.set_downloader(downloader)
152
153 @classmethod
154 def suitable(cls, url):
155 """Receives a URL and returns True if suitable for this IE."""
156
157 # This does not use has/getattr intentionally - we want to know whether
158 # we have cached the regexp for *this* class, whereas getattr would also
159 # match the superclass
160 if '_VALID_URL_RE' not in cls.__dict__:
161 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
162 return cls._VALID_URL_RE.match(url) is not None
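# Continuing the invented _VALID_URL sketched above, for a hypothetical
# subclass ExampleIE, suitable() would behave roughly like:
#
#     ExampleIE.suitable('https://example.com/watch/4242')   # -> True
#     ExampleIE.suitable('https://example.com/about')        # -> False
#
# The compiled pattern is cached on each class the first time it is needed.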
163
164 @classmethod
165 def working(cls):
166 """Getter method for _WORKING."""
167 return cls._WORKING
168
169 def initialize(self):
170 """Initializes an instance (authentication, etc)."""
171 if not self._ready:
172 self._real_initialize()
173 self._ready = True
174
175 def extract(self, url):
176 """Extracts URL information and returns it in list of dicts."""
177 self.initialize()
178 return self._real_extract(url)
179
180 def set_downloader(self, downloader):
181 """Sets the downloader for this IE."""
182 self._downloader = downloader
183
184 def _real_initialize(self):
185 """Real initialization process. Redefine in subclasses."""
186 pass
187
188 def _real_extract(self, url):
189 """Real extraction process. Redefine in subclasses."""
190 pass
191
192 @classmethod
193 def ie_key(cls):
194 """A string for getting the InfoExtractor with get_info_extractor"""
195 return cls.__name__[:-2]
196
197 @property
198 def IE_NAME(self):
199 return type(self).__name__[:-2]
200
201 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
202 """ Returns the response handle """
203 if note is None:
204 self.report_download_webpage(video_id)
205 elif note is not False:
206 if video_id is None:
207 self.to_screen('%s' % (note,))
208 else:
209 self.to_screen('%s: %s' % (video_id, note))
210 try:
211 return self._downloader.urlopen(url_or_request)
212 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
213 if errnote is False:
214 return False
215 if errnote is None:
216 errnote = 'Unable to download webpage'
217 errmsg = '%s: %s' % (errnote, compat_str(err))
218 if fatal:
219 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
220 else:
221 self._downloader.report_warning(errmsg)
222 return False
223
224 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
225 """ Returns a tuple (page content as string, URL handle) """
226
227 # Strip hashes from the URL (#1038)
228 if isinstance(url_or_request, (compat_str, str)):
229 url_or_request = url_or_request.partition('#')[0]
230
231 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
232 if urlh is False:
233 assert not fatal
234 return False
235 content_type = urlh.headers.get('Content-Type', '')
236 webpage_bytes = urlh.read()
237 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
238 if m:
239 encoding = m.group(1)
240 else:
241 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
242 webpage_bytes[:1024])
243 if m:
244 encoding = m.group(1).decode('ascii')
245 elif webpage_bytes.startswith(b'\xff\xfe'):
246 encoding = 'utf-16'
247 else:
248 encoding = 'utf-8'
249 if self._downloader.params.get('dump_intermediate_pages', False):
250 try:
251 url = url_or_request.get_full_url()
252 except AttributeError:
253 url = url_or_request
254 self.to_screen('Dumping request to ' + url)
255 dump = base64.b64encode(webpage_bytes).decode('ascii')
256 self._downloader.to_screen(dump)
257 if self._downloader.params.get('write_pages', False):
258 try:
259 url = url_or_request.get_full_url()
260 except AttributeError:
261 url = url_or_request
262 basen = '%s_%s' % (video_id, url)
263 if len(basen) > 240:
264 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
265 basen = basen[:240 - len(h)] + h
266 raw_filename = basen + '.dump'
267 filename = sanitize_filename(raw_filename, restricted=True)
268 self.to_screen('Saving request to ' + filename)
269 with open(filename, 'wb') as outf:
270 outf.write(webpage_bytes)
271
272 try:
273 content = webpage_bytes.decode(encoding, 'replace')
274 except LookupError:
275 content = webpage_bytes.decode('utf-8', 'replace')
276
277 if ('<title>Access to this site is blocked</title>' in content and
278 'Websense' in content[:512]):
279 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
280 blocked_iframe = self._html_search_regex(
281 r'<iframe src="([^"]+)"', content,
282 'Websense information URL', default=None)
283 if blocked_iframe:
284 msg += ' Visit %s for more details' % blocked_iframe
285 raise ExtractorError(msg, expected=True)
286
287 return (content, urlh)
288
289 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
290 """ Returns the data of the page as a string """
291 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
292 if res is False:
293 return res
294 else:
295 content, _ = res
296 return content
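# Typical use from a subclass's _real_extract(), with invented variable names:
#
#     webpage = self._download_webpage(url, video_id)
#     # or, when a failed download should not abort the extraction:
#     webpage = self._download_webpage(url, video_id, fatal=False)
#     if webpage is False:
#         webpage = None  # fall back to whatever other data is available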
297
298 def _download_xml(self, url_or_request, video_id,
299 note='Downloading XML', errnote='Unable to download XML',
300 transform_source=None, fatal=True):
301 """Return the xml as an xml.etree.ElementTree.Element"""
302 xml_string = self._download_webpage(
303 url_or_request, video_id, note, errnote, fatal=fatal)
304 if xml_string is False:
305 return xml_string
306 if transform_source:
307 xml_string = transform_source(xml_string)
308 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
309
310 def _download_json(self, url_or_request, video_id,
311 note='Downloading JSON metadata',
312 errnote='Unable to download JSON metadata',
313 transform_source=None,
314 fatal=True):
315 json_string = self._download_webpage(
316 url_or_request, video_id, note, errnote, fatal=fatal)
317 if (not fatal) and json_string is False:
318 return None
319 if transform_source:
320 json_string = transform_source(json_string)
321 try:
322 return json.loads(json_string)
323 except ValueError as ve:
324 raise ExtractorError('Failed to download JSON', cause=ve)
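# Sketch of a JSON metadata fetch against a hypothetical API endpoint:
#
#     data = self._download_json(
#         'https://api.example.com/videos/%s' % video_id, video_id,
#         note='Downloading video metadata')
#     title = data['title']
#     duration = int_or_none(data.get('duration'))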
325
326 def report_warning(self, msg, video_id=None):
327 idstr = '' if video_id is None else '%s: ' % video_id
328 self._downloader.report_warning(
329 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
330
331 def to_screen(self, msg):
332 """Print msg to screen, prefixing it with '[ie_name]'"""
333 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
334
335 def report_extraction(self, id_or_name):
336 """Report information extraction."""
337 self.to_screen('%s: Extracting information' % id_or_name)
338
339 def report_download_webpage(self, video_id):
340 """Report webpage download."""
341 self.to_screen('%s: Downloading webpage' % video_id)
342
343 def report_age_confirmation(self):
344 """Report attempt to confirm age."""
345 self.to_screen('Confirming age')
346
347 def report_login(self):
348 """Report attempt to log in."""
349 self.to_screen('Logging in')
350
351 # Methods for following #608
352 @staticmethod
353 def url_result(url, ie=None, video_id=None):
354 """Returns a url that points to a page that should be processed"""
356 # TODO: ie should be the class used for getting the info
356 video_info = {'_type': 'url',
357 'url': url,
358 'ie_key': ie}
359 if video_id is not None:
360 video_info['id'] = video_id
361 return video_info
362 @staticmethod
363 def playlist_result(entries, playlist_id=None, playlist_title=None):
364 """Returns a playlist"""
365 video_info = {'_type': 'playlist',
366 'entries': entries}
367 if playlist_id:
368 video_info['id'] = playlist_id
369 if playlist_title:
370 video_info['title'] = playlist_title
371 return video_info
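# Sketch of how a playlist extractor might combine these helpers (the entry
# URLs and the 'Example' ie_key are invented):
#
#     entries = [
#         self.url_result('https://example.com/watch/%s' % eid, 'Example')
#         for eid in entry_ids
#     ]
#     return self.playlist_result(entries, playlist_id, playlist_title)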
372
373 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
374 """
375 Perform a regex search on the given string, using a single pattern or a
376 list of patterns, and return the first matching group.
377 In case of failure, return the default value if given; otherwise raise a
378 RegexNotFoundError or emit a warning, depending on fatal, naming the field.
379 """
380 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
381 mobj = re.search(pattern, string, flags)
382 else:
383 for p in pattern:
384 mobj = re.search(p, string, flags)
385 if mobj:
386 break
387
388 if os.name != 'nt' and sys.stderr.isatty():
389 _name = '\033[0;34m%s\033[0m' % name
390 else:
391 _name = name
392
393 if mobj:
394 # return the first matching group
395 return next(g for g in mobj.groups() if g is not None)
396 elif default is not _NO_DEFAULT:
397 return default
398 elif fatal:
399 raise RegexNotFoundError('Unable to extract %s' % _name)
400 else:
401 self._downloader.report_warning('unable to extract %s; '
402 'please report this issue on http://yt-dl.org/bug' % _name)
403 return None
404
405 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
406 """
407 Like _search_regex, but strips HTML tags and unescapes entities.
408 """
409 res = self._search_regex(pattern, string, name, default, fatal, flags)
410 if res:
411 return clean_html(res).strip()
412 else:
413 return res
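# Example call with a list of fallback patterns and a non-fatal default
# (the patterns and the field name are illustrative):
#
#     title = self._html_search_regex(
#         [r'<h1 class="title">(.+?)</h1>', r'<title>(.+?)</title>'],
#         webpage, 'title', default=None)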
414
415 def _get_login_info(self):
416 """
417 Get the login info as (username, password)
418 It will look in the netrc file using the _NETRC_MACHINE value
419 If there's no info available, return (None, None)
420 """
421 if self._downloader is None:
422 return (None, None)
423
424 username = None
425 password = None
426 downloader_params = self._downloader.params
427
428 # Attempt to use provided username and password or .netrc data
429 if downloader_params.get('username', None) is not None:
430 username = downloader_params['username']
431 password = downloader_params['password']
432 elif downloader_params.get('usenetrc', False):
433 try:
434 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
435 if info is not None:
436 username = info[0]
437 password = info[2]
438 else:
439 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
440 except (IOError, netrc.NetrcParseError) as err:
441 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
442
443 return (username, password)
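# When --netrc is used, the corresponding ~/.netrc line would look like this,
# with the machine name taken from the extractor's _NETRC_MACHINE and the
# credentials being placeholders:
#
#     machine youtube login myusername password mypassword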
444
445 def _get_tfa_info(self):
446 """
447 Get the two-factor authentication info
448 TODO: asking the user will be required for SMS/phone verification;
449 currently this just uses the command-line option.
450 If there's no info available, return None
451 """
452 if self._downloader is None:
453 return None
454 downloader_params = self._downloader.params
455
456 if downloader_params.get('twofactor', None) is not None:
457 return downloader_params['twofactor']
458
459 return None
460
461 # Helper functions for extracting OpenGraph info
462 @staticmethod
463 def _og_regexes(prop):
464 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
465 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
466 template = r'<meta[^>]+?%s[^>]+?%s'
467 return [
468 template % (property_re, content_re),
469 template % (content_re, property_re),
470 ]
471
472 def _og_search_property(self, prop, html, name=None, **kargs):
473 if name is None:
474 name = 'OpenGraph %s' % prop
475 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
476 if escaped is None:
477 return None
478 return unescapeHTML(escaped)
479
480 def _og_search_thumbnail(self, html, **kargs):
481 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
482
483 def _og_search_description(self, html, **kargs):
484 return self._og_search_property('description', html, fatal=False, **kargs)
485
486 def _og_search_title(self, html, **kargs):
487 return self._og_search_property('title', html, **kargs)
488
489 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
490 regexes = self._og_regexes('video') + self._og_regexes('video:url')
491 if secure:
492 regexes = self._og_regexes('video:secure_url') + regexes
493 return self._html_search_regex(regexes, html, name, **kargs)
494
495 def _og_search_url(self, html, **kargs):
496 return self._og_search_property('url', html, **kargs)
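# These helpers target standard Open Graph markup; for example, given
#
#     <meta property="og:title" content="Example title" />
#     <meta property="og:image" content="https://example.com/thumb.jpg" />
#
# _og_search_title(webpage) would return 'Example title' and
# _og_search_thumbnail(webpage) the image URL.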
497
498 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
499 if display_name is None:
500 display_name = name
501 return self._html_search_regex(
502 r'''(?ix)<meta
503 (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
504 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
505 html, display_name, fatal=fatal, **kwargs)
506
507 def _dc_search_uploader(self, html):
508 return self._html_search_meta('dc.creator', html, 'uploader')
509
510 def _rta_search(self, html):
511 # See http://www.rtalabel.org/index.php?content=howtofaq#single
512 if re.search(r'(?ix)<meta\s+name="rating"\s+'
513 r' content="RTA-5042-1996-1400-1577-RTA"',
514 html):
515 return 18
516 return 0
517
518 def _media_rating_search(self, html):
519 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
520 rating = self._html_search_meta('rating', html)
521
522 if not rating:
523 return None
524
525 RATING_TABLE = {
526 'safe for kids': 0,
527 'general': 8,
528 '14 years': 14,
529 'mature': 17,
530 'restricted': 19,
531 }
532 return RATING_TABLE.get(rating.lower(), None)
533
534 def _twitter_search_player(self, html):
535 return self._html_search_meta('twitter:player', html,
536 'twitter card player')
537
538 def _sort_formats(self, formats):
539 if not formats:
540 raise ExtractorError('No video formats found')
541
542 def _formats_key(f):
543 # TODO remove the following workaround
544 from ..utils import determine_ext
545 if not f.get('ext') and 'url' in f:
546 f['ext'] = determine_ext(f['url'])
547
548 preference = f.get('preference')
549 if preference is None:
550 proto = f.get('protocol')
551 if proto is None:
552 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
553
554 preference = 0 if proto in ['http', 'https'] else -0.1
555 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
556 preference -= 0.5
557
558 if f.get('vcodec') == 'none': # audio only
559 if self._downloader.params.get('prefer_free_formats'):
560 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
561 else:
562 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
563 ext_preference = 0
564 try:
565 audio_ext_preference = ORDER.index(f['ext'])
566 except ValueError:
567 audio_ext_preference = -1
568 else:
569 if self._downloader.params.get('prefer_free_formats'):
570 ORDER = ['flv', 'mp4', 'webm']
571 else:
572 ORDER = ['webm', 'flv', 'mp4']
573 try:
574 ext_preference = ORDER.index(f['ext'])
575 except ValueError:
576 ext_preference = -1
577 audio_ext_preference = 0
578
579 return (
580 preference,
581 f.get('quality') if f.get('quality') is not None else -1,
582 f.get('height') if f.get('height') is not None else -1,
583 f.get('width') if f.get('width') is not None else -1,
584 ext_preference,
585 f.get('tbr') if f.get('tbr') is not None else -1,
586 f.get('vbr') if f.get('vbr') is not None else -1,
587 f.get('abr') if f.get('abr') is not None else -1,
588 audio_ext_preference,
589 f.get('filesize') if f.get('filesize') is not None else -1,
590 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
591 f.get('format_id'),
592 )
593 formats.sort(key=_formats_key)
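# Rough illustration with made-up formats: after _sort_formats(), entries end
# up ordered worst to best, e.g.
#
#     formats = [
#         {'url': 'https://example.com/hi.mp4', 'height': 720},
#         {'url': 'https://example.com/audio.m4a', 'vcodec': 'none'},
#         {'url': 'https://example.com/lo.mp4', 'height': 360},
#     ]
#     self._sort_formats(formats)
#     # -> audio-only first, then 360p, then 720p (same protocol/preference,
#     #    so height is the deciding key here)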
594
595 def http_scheme(self):
596 """ Either "https:" or "https:", depending on the user's preferences """
597 return (
598 'http:'
599 if self._downloader.params.get('prefer_insecure', False)
600 else 'https:')
601
602 def _proto_relative_url(self, url, scheme=None):
603 if url is None:
604 return url
605 if url.startswith('//'):
606 if scheme is None:
607 scheme = self.http_scheme()
608 return scheme + url
609 else:
610 return url
611
612 def _sleep(self, timeout, video_id, msg_template=None):
613 if msg_template is None:
614 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
615 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
616 self.to_screen(msg)
617 time.sleep(timeout)
618
619 def _extract_f4m_formats(self, manifest_url, video_id):
620 manifest = self._download_xml(
621 manifest_url, video_id, 'Downloading f4m manifest',
622 'Unable to download f4m manifest')
623
624 formats = []
625 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
626 for i, media_el in enumerate(media_nodes):
627 tbr = int_or_none(media_el.attrib.get('bitrate'))
628 format_id = 'f4m-%d' % (i if tbr is None else tbr)
629 formats.append({
630 'format_id': format_id,
631 'url': manifest_url,
632 'ext': 'flv',
633 'tbr': tbr,
634 'width': int_or_none(media_el.attrib.get('width')),
635 'height': int_or_none(media_el.attrib.get('height')),
636 })
637 self._sort_formats(formats)
638
639 return formats
640
641 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None):
642 formats = [{
643 'format_id': 'm3u8-meta',
644 'url': m3u8_url,
645 'ext': ext,
646 'protocol': 'm3u8',
647 'preference': -1,
648 'resolution': 'multiple',
649 'format_note': 'Quality selection URL',
650 }]
651
652 m3u8_doc = self._download_webpage(m3u8_url, video_id)
653 last_info = None
654 kv_rex = re.compile(
655 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
656 for line in m3u8_doc.splitlines():
657 if line.startswith('#EXT-X-STREAM-INF:'):
658 last_info = {}
659 for m in kv_rex.finditer(line):
660 v = m.group('val')
661 if v.startswith('"'):
662 v = v[1:-1]
663 last_info[m.group('key')] = v
664 elif line.startswith('#') or not line.strip():
665 continue
666 else:
667 if last_info is None:
668 formats.append({'url': line})
669 continue
670 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
671
672 f = {
673 'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
674 'url': line.strip(),
675 'tbr': tbr,
676 'ext': ext,
677 }
678 codecs = last_info.get('CODECS')
679 if codecs:
680 video, audio = codecs.split(',')
681 f['vcodec'] = video.partition('.')[0]
682 f['acodec'] = audio.partition('.')[0]
683 resolution = last_info.get('RESOLUTION')
684 if resolution:
685 width_str, height_str = resolution.split('x')
686 f['width'] = int(width_str)
687 f['height'] = int(height_str)
688 formats.append(f)
689 last_info = {}
690 self._sort_formats(formats)
691 return formats
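# For reference, a master playlist entry such as (values invented)
#
#     #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360,CODECS="avc1.42001e,mp4a.40.2"
#     http://example.com/low/index.m3u8
#
# would produce a format with tbr=1280, width=640, height=360, vcodec='avc1'
# and acodec='mp4a', alongside the catch-all 'm3u8-meta' entry added above.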
692
693
694class SearchInfoExtractor(InfoExtractor):
695 """
696 Base class for paged search queries extractors.
697 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
698 Instances should define _SEARCH_KEY and _MAX_RESULTS.
699 """
700
701 @classmethod
702 def _make_valid_url(cls):
703 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
704
705 @classmethod
706 def suitable(cls, url):
707 return re.match(cls._make_valid_url(), url) is not None
708
709 def _real_extract(self, query):
710 mobj = re.match(self._make_valid_url(), query)
711 if mobj is None:
712 raise ExtractorError('Invalid search query "%s"' % query)
713
714 prefix = mobj.group('prefix')
715 query = mobj.group('query')
716 if prefix == '':
717 return self._get_n_results(query, 1)
718 elif prefix == 'all':
719 return self._get_n_results(query, self._MAX_RESULTS)
720 else:
721 n = int(prefix)
722 if n <= 0:
723 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
724 elif n > self._MAX_RESULTS:
725 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
726 n = self._MAX_RESULTS
727 return self._get_n_results(query, n)
728
729 def _get_n_results(self, query, n):
730 """Get a specified number of results for a query"""
731 raise NotImplementedError("This method must be implemented by subclasses")
732
733 @property
734 def SEARCH_KEY(self):
735 return self._SEARCH_KEY
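# A hypothetical subclass, to illustrate the contract (all names invented):
#
# class ExampleSearchIE(SearchInfoExtractor):
#     _SEARCH_KEY = 'exsearch'
#     _MAX_RESULTS = 50
#
#     def _get_n_results(self, query, n):
#         entries = [
#             self.url_result('https://example.com/watch/%d' % i, 'Example')
#             for i in range(n)
#         ]
#         return self.playlist_result(entries, playlist_title=query)
#
# With this in place, 'exsearch5:kittens' downloads the first five results and
# 'exsearchall:kittens' everything up to _MAX_RESULTS.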