jfr.im git - yt-dlp.git/blame_incremental - youtube

... / ...

Commit	Line	Data
	1	from __future__ import unicode_literals
	2
	3	import re
	4
	5	from .common import InfoExtractor
	6	from ..compat import (
	7	compat_str,
	8	compat_xpath,
	9	)
	10	from ..utils import (
	11	ExtractorError,
	12	find_xpath_attr,
	13	fix_xml_ampersands,
	14	float_or_none,
	15	HEADRequest,
	16	RegexNotFoundError,
	17	sanitized_Request,
	18	strip_or_none,
	19	timeconvert,
	20	try_get,
	21	unescapeHTML,
	22	update_url_query,
	23	url_basename,
	24	xpath_text,
	25	)
	26
	27
	28	def _media_xml_tag(tag):
	29	return '{http://search.yahoo.com/mrss/}%s' % tag
	30
	31
	32	class MTVServicesInfoExtractor(InfoExtractor):
	33	_MOBILE_TEMPLATE = None
	34	_LANG = None
	35
	36	@staticmethod
	37	def _id_from_uri(uri):
	38	return uri.split(':')[-1]
	39
	40	@staticmethod
	41	def _remove_template_parameter(url):
	42	# Remove the templates, like &device={device}
	43	return re.sub(r'&[^=]?={.?}(?=(&\|$))', '', url)
	44
	45	def _get_feed_url(self, uri):
	46	return self._FEED_URL
	47
	48	def _get_thumbnail_url(self, uri, itemdoc):
	49	search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
	50	thumb_node = itemdoc.find(search_path)
	51	if thumb_node is None:
	52	return None
	53	return thumb_node.get('url') or thumb_node.text or None
	54
	55	def _extract_mobile_video_formats(self, mtvn_id):
	56	webpage_url = self._MOBILE_TEMPLATE % mtvn_id
	57	req = sanitized_Request(webpage_url)
	58	# Otherwise we get a webpage that would execute some javascript
	59	req.add_header('User-Agent', 'curl/7')
	60	webpage = self._download_webpage(req, mtvn_id,
	61	'Downloading mobile page')
	62	metrics_url = unescapeHTML(self._search_regex(r'<a href="(http://metrics.+?)"', webpage, 'url'))
	63	req = HEADRequest(metrics_url)
	64	response = self._request_webpage(req, mtvn_id, 'Resolving url')
	65	url = response.geturl()
	66	# Transform the url to get the best quality:
	67	url = re.sub(r'.+pxE=mp4', 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639+_pxE=mp4', url, 1)
	68	return [{'url': url, 'ext': 'mp4'}]
	69
	70	def _extract_video_formats(self, mdoc, mtvn_id, video_id):
	71	if re.match(r'.*/(error_country_block\.swf\|geoblock\.mp4\|copyright_error\.flv(?:\?geo\b.+?)?)$', mdoc.find('.//src').text) is not None:
	72	if mtvn_id is not None and self._MOBILE_TEMPLATE is not None:
	73	self.to_screen('The normal version is not available from your '
	74	'country, trying with the mobile version')
	75	return self._extract_mobile_video_formats(mtvn_id)
	76	raise ExtractorError('This video is not available from your country.',
	77	expected=True)
	78
	79	formats = []
	80	for rendition in mdoc.findall('.//rendition'):
	81	if rendition.get('method') == 'hls':
	82	hls_url = rendition.find('./src').text
	83	formats.extend(self._extract_m3u8_formats(
	84	hls_url, video_id, ext='mp4', entry_protocol='m3u8_native',
	85	m3u8_id='hls', fatal=False))
	86	else:
	87	# fms
	88	try:
	89	_, _, ext = rendition.attrib['type'].partition('/')
	90	rtmp_video_url = rendition.find('./src').text
	91	if 'error_not_available.swf' in rtmp_video_url:
	92	raise ExtractorError(
	93	'%s said: video is not available' % self.IE_NAME,
	94	expected=True)
	95	if rtmp_video_url.endswith('siteunavail.png'):
	96	continue
	97	formats.extend([{
	98	'ext': 'flv' if rtmp_video_url.startswith('rtmp') else ext,
	99	'url': rtmp_video_url,
	100	'format_id': '-'.join(filter(None, [
	101	'rtmp' if rtmp_video_url.startswith('rtmp') else None,
	102	rendition.get('bitrate')])),
	103	'width': int(rendition.get('width')),
	104	'height': int(rendition.get('height')),
	105	}])
	106	except (KeyError, TypeError):
	107	raise ExtractorError('Invalid rendition field.')
	108	if formats:
	109	self._sort_formats(formats)
	110	return formats
	111
	112	def _extract_subtitles(self, mdoc, mtvn_id):
	113	subtitles = {}
	114	for transcript in mdoc.findall('.//transcript'):
	115	if transcript.get('kind') != 'captions':
	116	continue
	117	lang = transcript.get('srclang')
	118	for typographic in transcript.findall('./typographic'):
	119	sub_src = typographic.get('src')
	120	if not sub_src:
	121	continue
	122	ext = typographic.get('format')
	123	if ext == 'cea-608':
	124	ext = 'scc'
	125	subtitles.setdefault(lang, []).append({
	126	'url': compat_str(sub_src),
	127	'ext': ext
	128	})
	129	return subtitles
	130
	131	def _get_video_info(self, itemdoc, use_hls=True):
	132	uri = itemdoc.find('guid').text
	133	video_id = self._id_from_uri(uri)
	134	self.report_extraction(video_id)
	135	content_el = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content')))
	136	mediagen_url = self._remove_template_parameter(content_el.attrib['url'])
	137	mediagen_url = mediagen_url.replace('device={device}', '')
	138	if 'acceptMethods' not in mediagen_url:
	139	mediagen_url += '&' if '?' in mediagen_url else '?'
	140	mediagen_url += 'acceptMethods='
	141	mediagen_url += 'hls' if use_hls else 'fms'
	142
	143	mediagen_doc = self._download_xml(
	144	mediagen_url, video_id, 'Downloading video urls', fatal=False)
	145
	146	if mediagen_doc is False:
	147	return None
	148
	149	item = mediagen_doc.find('./video/item')
	150	if item is not None and item.get('type') == 'text':
	151	message = '%s returned error: ' % self.IE_NAME
	152	if item.get('code') is not None:
	153	message += '%s - ' % item.get('code')
	154	message += item.text
	155	raise ExtractorError(message, expected=True)
	156
	157	description = strip_or_none(xpath_text(itemdoc, 'description'))
	158
	159	timestamp = timeconvert(xpath_text(itemdoc, 'pubDate'))
	160
	161	title_el = None
	162	if title_el is None:
	163	title_el = find_xpath_attr(
	164	itemdoc, './/{http://search.yahoo.com/mrss/}category',
	165	'scheme', 'urn:mtvn:video_title')
	166	if title_el is None:
	167	title_el = itemdoc.find(compat_xpath('.//{http://search.yahoo.com/mrss/}title'))
	168	if title_el is None:
	169	title_el = itemdoc.find(compat_xpath('.//title'))
	170	if title_el.text is None:
	171	title_el = None
	172
	173	title = title_el.text
	174	if title is None:
	175	raise ExtractorError('Could not find video title')
	176	title = title.strip()
	177
	178	# This a short id that's used in the webpage urls
	179	mtvn_id = None
	180	mtvn_id_node = find_xpath_attr(itemdoc, './/{http://search.yahoo.com/mrss/}category',
	181	'scheme', 'urn:mtvn:id')
	182	if mtvn_id_node is not None:
	183	mtvn_id = mtvn_id_node.text
	184
	185	formats = self._extract_video_formats(mediagen_doc, mtvn_id, video_id)
	186
	187	# Some parts of complete video may be missing (e.g. missing Act 3 in
	188	# http://www.southpark.de/alle-episoden/s14e01-sexual-healing)
	189	if not formats:
	190	return None
	191
	192	self._sort_formats(formats)
	193
	194	return {
	195	'title': title,
	196	'formats': formats,
	197	'subtitles': self._extract_subtitles(mediagen_doc, mtvn_id),
	198	'id': video_id,
	199	'thumbnail': self._get_thumbnail_url(uri, itemdoc),
	200	'description': description,
	201	'duration': float_or_none(content_el.attrib.get('duration')),
	202	'timestamp': timestamp,
	203	}
	204
	205	def _get_feed_query(self, uri):
	206	data = {'uri': uri}
	207	if self._LANG:
	208	data['lang'] = self._LANG
	209	return data
	210
	211	def _get_videos_info(self, uri, use_hls=True):
	212	video_id = self._id_from_uri(uri)
	213	feed_url = self._get_feed_url(uri)
	214	info_url = update_url_query(feed_url, self._get_feed_query(uri))
	215	return self._get_videos_info_from_url(info_url, video_id, use_hls)
	216
	217	def _get_videos_info_from_url(self, url, video_id, use_hls=True):
	218	idoc = self._download_xml(
	219	url, video_id,
	220	'Downloading info', transform_source=fix_xml_ampersands)
	221
	222	title = xpath_text(idoc, './channel/title')
	223	description = xpath_text(idoc, './channel/description')
	224
	225	entries = []
	226	for item in idoc.findall('.//item'):
	227	info = self._get_video_info(item, use_hls)
	228	if info:
	229	entries.append(info)
	230
	231	return self.playlist_result(
	232	entries, playlist_title=title, playlist_description=description)
	233
	234	def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None):
	235	triforce_feed = self._parse_json(self._search_regex(
	236	r'triforceManifestFeed\s=\s({.+?})\s;\s\n', webpage,
	237	'triforce feed', default='{}'), video_id, fatal=False)
	238
	239	data_zone = self._search_regex(
	240	r'data-zone=(["\'])(?P<zone>.+?_lc_promo.*?)\1', webpage,
	241	'data zone', default=data_zone, group='zone')
	242
	243	feed_url = try_get(
	244	triforce_feed, lambda x: x['manifest']['zones'][data_zone]['feed'],
	245	compat_str)
	246	if not feed_url:
	247	return
	248
	249	feed = self._download_json(feed_url, video_id, fatal=False)
	250	if not feed:
	251	return
	252
	253	return try_get(feed, lambda x: x['result']['data']['id'], compat_str)
	254
	255	def _extract_mgid(self, webpage):
	256	try:
	257	# the url can be http://media.mtvnservices.com/fb/{mgid}.swf
	258	# or http://media.mtvnservices.com/{mgid}
	259	og_url = self._og_search_video_url(webpage)
	260	mgid = url_basename(og_url)
	261	if mgid.endswith('.swf'):
	262	mgid = mgid[:-4]
	263	except RegexNotFoundError:
	264	mgid = None
	265
	266	if mgid is None or ':' not in mgid:
	267	mgid = self._search_regex(
	268	[r'data-mgid="(.?)"', r'swfobject\.embedSWF\(".?(mgid:.*?)"'],
	269	webpage, 'mgid', default=None)
	270
	271	if not mgid:
	272	sm4_embed = self._html_search_meta(
	273	'sm4:video:embed', webpage, 'sm4 embed', default='')
	274	mgid = self._search_regex(
	275	r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid', default=None)
	276
	277	if not mgid:
	278	mgid = self._extract_triforce_mgid(webpage)
	279
	280	return mgid
	281
	282	def _real_extract(self, url):
	283	title = url_basename(url)
	284	webpage = self._download_webpage(url, title)
	285	mgid = self._extract_mgid(webpage)
	286	videos_info = self._get_videos_info(mgid)
	287	return videos_info
	288
	289
	290	class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
	291	IE_NAME = 'mtvservices:embedded'
	292	_VALID_URL = r'https?://media\.mtvnservices\.com/embed/(?P<mgid>.+?)(\?\|/\|$)'
	293
	294	_TEST = {
	295	# From http://www.thewrap.com/peter-dinklage-sums-up-game-of-thrones-in-45-seconds-video/
	296	'url': 'http://media.mtvnservices.com/embed/mgid:uma:video:mtv.com:1043906/cp~vid%3D1043906%26uri%3Dmgid%3Auma%3Avideo%3Amtv.com%3A1043906',
	297	'md5': 'cb349b21a7897164cede95bd7bf3fbb9',
	298	'info_dict': {
	299	'id': '1043906',
	300	'ext': 'mp4',
	301	'title': 'Peter Dinklage Sums Up \'Game Of Thrones\' In 45 Seconds',
	302	'description': '"Sexy sexy sexy, stabby stabby stabby, beautiful language," says Peter Dinklage as he tries summarizing "Game of Thrones" in under a minute.',
	303	'timestamp': 1400126400,
	304	'upload_date': '20140515',
	305	},
	306	}
	307
	308	@staticmethod
	309	def _extract_url(webpage):
	310	mobj = re.search(
	311	r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media.mtvnservices.com/embed/.+?)\1', webpage)
	312	if mobj:
	313	return mobj.group('url')
	314
	315	def _get_feed_url(self, uri):
	316	video_id = self._id_from_uri(uri)
	317	config = self._download_json(
	318	'http://media.mtvnservices.com/pmt/e1/access/index.html?uri=%s&configtype=edge' % uri, video_id)
	319	return self._remove_template_parameter(config['feedWithQueryParams'])
	320
	321	def _real_extract(self, url):
	322	mobj = re.match(self._VALID_URL, url)
	323	mgid = mobj.group('mgid')
	324	return self._get_videos_info(mgid)
	325
	326
	327	class MTVIE(MTVServicesInfoExtractor):
	328	IE_NAME = 'mtv'
	329	_VALID_URL = r'https?://(?:www\.)?mtv\.com/(?:video-clips\|(?:full-)?episodes)/(?P<id>[^/?#.]+)'
	330	_FEED_URL = 'http://www.mtv.com/feeds/mrss/'
	331
	332	_TESTS = [{
	333	'url': 'http://www.mtv.com/video-clips/vl8qof/unlocking-the-truth-trailer',
	334	'md5': '1edbcdf1e7628e414a8c5dcebca3d32b',
	335	'info_dict': {
	336	'id': '5e14040d-18a4-47c4-a582-43ff602de88e',
	337	'ext': 'mp4',
	338	'title': 'Unlocking The Truth\|July 18, 2016\|1\|101\|Trailer',
	339	'description': '"Unlocking the Truth" premieres August 17th at 11/10c.',
	340	'timestamp': 1468846800,
	341	'upload_date': '20160718',
	342	},
	343	}, {
	344	'url': 'http://www.mtv.com/full-episodes/94tujl/unlocking-the-truth-gates-of-hell-season-1-ep-101',
	345	'only_matching': True,
	346	}, {
	347	'url': 'http://www.mtv.com/episodes/g8xu7q/teen-mom-2-breaking-the-wall-season-7-ep-713',
	348	'only_matching': True,
	349	}]
	350
	351
	352	class MTV81IE(InfoExtractor):
	353	IE_NAME = 'mtv81'
	354	_VALID_URL = r'https?://(?:www\.)?mtv81\.com/videos/(?P<id>[^/?#.]+)'
	355
	356	_TEST = {
	357	'url': 'http://www.mtv81.com/videos/artist-to-watch/the-godfather-of-japanese-hip-hop-segment-1/',
	358	'md5': '1edbcdf1e7628e414a8c5dcebca3d32b',
	359	'info_dict': {
	360	'id': '5e14040d-18a4-47c4-a582-43ff602de88e',
	361	'ext': 'mp4',
	362	'title': 'Unlocking The Truth\|July 18, 2016\|1\|101\|Trailer',
	363	'description': '"Unlocking the Truth" premieres August 17th at 11/10c.',
	364	'timestamp': 1468846800,
	365	'upload_date': '20160718',
	366	},
	367	}
	368
	369	def _extract_mgid(self, webpage):
	370	return self._search_regex(
	371	r'getTheVideo\((["\'])(?P<id>mgid:.+?)\1', webpage,
	372	'mgid', group='id')
	373
	374	def _real_extract(self, url):
	375	video_id = self._match_id(url)
	376	webpage = self._download_webpage(url, video_id)
	377	mgid = self._extract_mgid(webpage)
	378	return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid)
	379
	380
	381	class MTVVideoIE(MTVServicesInfoExtractor):
	382	IE_NAME = 'mtv:video'
	383	_VALID_URL = r'''(?x)^https?://
	384	(?:(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$\|
	385	m\.mtv\.com/videos/video\.rbml\?.*?id=(?P<mgid>[^&]+))'''
	386
	387	_FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/'
	388
	389	_TESTS = [
	390	{
	391	'url': 'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
	392	'md5': '850f3f143316b1e71fa56a4edfd6e0f8',
	393	'info_dict': {
	394	'id': '853555',
	395	'ext': 'mp4',
	396	'title': 'Taylor Swift - "Ours (VH1 Storytellers)"',
	397	'description': 'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
	398	'timestamp': 1352610000,
	399	'upload_date': '20121111',
	400	},
	401	},
	402	]
	403
	404	def _get_thumbnail_url(self, uri, itemdoc):
	405	return 'http://mtv.mtvnimages.com/uri/' + uri
	406
	407	def _real_extract(self, url):
	408	mobj = re.match(self._VALID_URL, url)
	409	video_id = mobj.group('videoid')
	410	uri = mobj.groupdict().get('mgid')
	411	if uri is None:
	412	webpage = self._download_webpage(url, video_id)
	413
	414	# Some videos come from Vevo.com
	415	m_vevo = re.search(
	416	r'(?s)isVevoVideo = true;.?vevoVideoId = "(.?)";', webpage)
	417	if m_vevo:
	418	vevo_id = m_vevo.group(1)
	419	self.to_screen('Vevo video detected: %s' % vevo_id)
	420	return self.url_result('vevo:%s' % vevo_id, ie='Vevo')
	421
	422	uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, 'uri')
	423	return self._get_videos_info(uri)
	424
	425
	426	class MTVDEIE(MTVServicesInfoExtractor):
	427	IE_NAME = 'mtv.de'
	428	_VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:artists\|shows\|news)/(?:[^/]+/)(?P<id>\d+)-[^/#?]+/(?:[#?].*)?$'
	429	_TESTS = [{
	430	'url': 'http://www.mtv.de/artists/10571-cro/videos/61131-traum',
	431	'info_dict': {
	432	'id': 'music_video-a50bc5f0b3aa4b3190aa',
	433	'ext': 'flv',
	434	'title': 'MusicVideo_cro-traum',
	435	'description': 'Cro - Traum',
	436	},
	437	'params': {
	438	# rtmp download
	439	'skip_download': True,
	440	},
	441	'skip': 'Blocked at Travis CI',
	442	}, {
	443	# mediagen URL without query (e.g. http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97)
	444	'url': 'http://www.mtv.de/shows/933-teen-mom-2/staffeln/5353/folgen/63565-enthullungen',
	445	'info_dict': {
	446	'id': 'local_playlist-f5ae778b9832cc837189',
	447	'ext': 'flv',
	448	'title': 'Episode_teen-mom-2_shows_season-5_episode-1_full-episode_part1',
	449	},
	450	'params': {
	451	# rtmp download
	452	'skip_download': True,
	453	},
	454	'skip': 'Blocked at Travis CI',
	455	}, {
	456	'url': 'http://www.mtv.de/news/77491-mtv-movies-spotlight-pixels-teil-3',
	457	'info_dict': {
	458	'id': 'local_playlist-4e760566473c4c8c5344',
	459	'ext': 'mp4',
	460	'title': 'Article_mtv-movies-spotlight-pixels-teil-3_short-clips_part1',
	461	'description': 'MTV Movies Supercut',
	462	},
	463	'params': {
	464	# rtmp download
	465	'skip_download': True,
	466	},
	467	'skip': 'Das Video kann zur Zeit nicht abgespielt werden.',
	468	}]
	469
	470	def _real_extract(self, url):
	471	video_id = self._match_id(url)
	472
	473	webpage = self._download_webpage(url, video_id)
	474
	475	playlist = self._parse_json(
	476	self._search_regex(
	477	r'window\.pagePlaylist\s=\s(\[.+?\]);\n', webpage, 'page playlist'),
	478	video_id)
	479
	480	def _mrss_url(item):
	481	return item['mrss'] + item.get('mrssvars', '')
	482
	483	# news pages contain single video in playlist with different id
	484	if len(playlist) == 1:
	485	return self._get_videos_info_from_url(_mrss_url(playlist[0]), video_id)
	486
	487	for item in playlist:
	488	item_id = item.get('id')
	489	if item_id and compat_str(item_id) == video_id:
	490	return self._get_videos_info_from_url(_mrss_url(item), video_id)