jfr.im git - yt-dlp.git/blame_incremental - youtube

... / ...

Commit	Line	Data
	1	from __future__ import unicode_literals
	2
	3	from .common import InfoExtractor
	4	from ..compat import compat_urlparse
	5	from ..utils import (
	6	determine_ext,
	7	ExtractorError,
	8	find_xpath_attr,
	9	fix_xml_ampersands,
	10	int_or_none,
	11	parse_duration,
	12	unified_strdate,
	13	update_url_query,
	14	xpath_text,
	15	)
	16
	17
	18	class RaiBaseIE(InfoExtractor):
	19	def _extract_relinker_formats(self, relinker_url, video_id):
	20	formats = []
	21
	22	for platform in ('mon', 'flash', 'native'):
	23	relinker = self._download_xml(
	24	relinker_url, video_id,
	25	note='Downloading XML metadata for platform %s' % platform,
	26	transform_source=fix_xml_ampersands,
	27	query={'output': 45, 'pl': platform},
	28	headers=self.geo_verification_headers())
	29
	30	media_url = find_xpath_attr(relinker, './url', 'type', 'content').text
	31	if media_url == 'http://download.rai.it/video_no_available.mp4':
	32	self.raise_geo_restricted()
	33
	34	ext = determine_ext(media_url)
	35	if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'):
	36	continue
	37
	38	if ext == 'm3u8':
	39	formats.extend(self._extract_m3u8_formats(
	40	media_url, video_id, 'mp4', 'm3u8_native',
	41	m3u8_id='hls', fatal=False))
	42	elif ext == 'f4m':
	43	manifest_url = update_url_query(
	44	media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'),
	45	{'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'})
	46	formats.extend(self._extract_f4m_formats(
	47	manifest_url, video_id, f4m_id='hds', fatal=False))
	48	else:
	49	bitrate = int_or_none(xpath_text(relinker, 'bitrate'))
	50	formats.append({
	51	'url': media_url,
	52	'tbr': bitrate if bitrate > 0 else None,
	53	'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http',
	54	})
	55
	56	return formats
	57
	58	def _extract_from_content_id(self, content_id, base_url):
	59	media = self._download_json(
	60	'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id,
	61	content_id, 'Downloading video JSON')
	62
	63	thumbnails = []
	64	for image_type in ('image', 'image_medium', 'image_300'):
	65	thumbnail_url = media.get(image_type)
	66	if thumbnail_url:
	67	thumbnails.append({
	68	'url': compat_urlparse.urljoin(base_url, thumbnail_url),
	69	})
	70
	71	formats = []
	72	media_type = media['type']
	73	if 'Audio' in media_type:
	74	formats.append({
	75	'format_id': media.get('formatoAudio'),
	76	'url': media['audioUrl'],
	77	'ext': media.get('formatoAudio'),
	78	})
	79	elif 'Video' in media_type:
	80	formats.extend(self._extract_relinker_formats(media['mediaUri'], content_id))
	81	self._sort_formats(formats)
	82	else:
	83	raise ExtractorError('not a media file')
	84
	85	subtitles = {}
	86	captions = media.get('subtitlesUrl')
	87	if captions:
	88	STL_EXT = '.stl'
	89	SRT_EXT = '.srt'
	90	if captions.endswith(STL_EXT):
	91	captions = captions[:-len(STL_EXT)] + SRT_EXT
	92	subtitles['it'] = [{
	93	'ext': 'srt',
	94	'url': captions,
	95	}]
	96
	97	return {
	98	'id': content_id,
	99	'title': media['name'],
	100	'description': media.get('desc'),
	101	'thumbnails': thumbnails,
	102	'uploader': media.get('author'),
	103	'upload_date': unified_strdate(media.get('date')),
	104	'duration': parse_duration(media.get('length')),
	105	'formats': formats,
	106	'subtitles': subtitles,
	107	}
	108
	109
	110	class RaiTVIE(RaiBaseIE):
	111	_VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it\|rai\.tv\|rainews\.it)/dl/(?:[^/]+/)+(?:media\|ondemand)/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
	112	_TESTS = [
	113	{
	114	'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
	115	'md5': '8970abf8caf8aef4696e7b1f2adfc696',
	116	'info_dict': {
	117	'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',
	118	'ext': 'mp4',
	119	'title': 'Report del 07/04/2014',
	120	'description': 'md5:f27c544694cacb46a078db84ec35d2d9',
	121	'upload_date': '20140407',
	122	'duration': 6160,
	123	'thumbnail': 're:^https?://.*\.jpg$',
	124	}
	125	},
	126	{
	127	# no m3u8 stream
	128	'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html',
	129	# HDS download, MD5 is unstable
	130	'info_dict': {
	131	'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9',
	132	'ext': 'flv',
	133	'title': 'TG PRIMO TEMPO',
	134	'upload_date': '20140612',
	135	'duration': 1758,
	136	'thumbnail': 're:^https?://.*\.jpg$',
	137	},
	138	'skip': 'Geo-restricted to Italy',
	139	},
	140	{
	141	'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html',
	142	'md5': '35cf7c229f22eeef43e48b5cf923bef0',
	143	'info_dict': {
	144	'id': '7aafdea9-0e5d-49d5-88a6-7e65da67ae13',
	145	'ext': 'mp4',
	146	'title': 'State of the Net, Antonella La Carpia: regole virali',
	147	'description': 'md5:b0ba04a324126903e3da7763272ae63c',
	148	'upload_date': '20140613',
	149	},
	150	'skip': 'Error 404',
	151	},
	152	{
	153	'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-b4a49761-e0cc-4b14-8736-2729f6f73132-tg2.html',
	154	'info_dict': {
	155	'id': 'b4a49761-e0cc-4b14-8736-2729f6f73132',
	156	'ext': 'mp4',
	157	'title': 'Alluvione in Sardegna e dissesto idrogeologico',
	158	'description': 'Edizione delle ore 20:30 ',
	159	},
	160	'skip': 'invalid urls',
	161	},
	162	{
	163	'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html',
	164	'md5': 'e57493e1cb8bc7c564663f363b171847',
	165	'info_dict': {
	166	'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6',
	167	'ext': 'mp4',
	168	'title': 'Il Candidato - Primo episodio: "Le Primarie"',
	169	'description': 'md5:364b604f7db50594678f483353164fb8',
	170	'upload_date': '20140923',
	171	'duration': 386,
	172	'thumbnail': 're:^https?://.*\.jpg$',
	173	}
	174	},
	175	]
	176
	177	def _real_extract(self, url):
	178	video_id = self._match_id(url)
	179
	180	return self._extract_from_content_id(video_id, url)
	181
	182
	183	class RaiIE(RaiBaseIE):
	184	_VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it\|rai\.tv\|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
	185	_TESTS = [
	186	{
	187	'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html',
	188	'md5': '2dd727e61114e1ee9c47f0da6914e178',
	189	'info_dict': {
	190	'id': '59d69d28-6bb6-409d-a4b5-ed44096560af',
	191	'ext': 'mp4',
	192	'title': 'Il pacco',
	193	'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a',
	194	'upload_date': '20141221',
	195	},
	196	},
	197	{
	198	# Direct relinker URL
	199	'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews',
	200	# HDS live stream, MD5 is unstable
	201	'info_dict': {
	202	'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc',
	203	'ext': 'flv',
	204	'title': 'EuroNews',
	205	},
	206	'skip': 'Geo-restricted to Italy',
	207	},
	208	{
	209	# Embedded content item ID
	210	'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined',
	211	'md5': '84c1135ce960e8822ae63cec34441d63',
	212	'info_dict': {
	213	'id': '0960e765-62c8-474a-ac4b-7eb3e2be39c8',
	214	'ext': 'mp4',
	215	'title': 'TG1 ore 20:00 del 02/07/2016',
	216	'upload_date': '20160702',
	217	},
	218	},
	219	{
	220	'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html',
	221	# HDS live stream, MD5 is unstable
	222	'info_dict': {
	223	'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9',
	224	'ext': 'flv',
	225	'title': 'La diretta di Rainews24',
	226	},
	227	},
	228	]
	229
	230	@classmethod
	231	def suitable(cls, url):
	232	return False if RaiTVIE.suitable(url) else super(RaiIE, cls).suitable(url)
	233
	234	def _real_extract(self, url):
	235	video_id = self._match_id(url)
	236	webpage = self._download_webpage(url, video_id)
	237
	238	iframe_url = self._search_regex(
	239	[r'<iframe[^>]+src="([^"]/dl/[^"]+\?iframe\b[^"])"',
	240	r'drawMediaRaiTV\(["\'](.+?)["\']'],
	241	webpage, 'iframe', default=None)
	242	if iframe_url:
	243	if not iframe_url.startswith('http'):
	244	iframe_url = compat_urlparse.urljoin(url, iframe_url)
	245	return self.url_result(iframe_url)
	246
	247	content_item_id = self._search_regex(
	248	r'initEdizione\((?P<q1>[\'"])ContentItem-(?P<content_id>[^\'"]+)(?P=q1)',
	249	webpage, 'content item ID', group='content_id', default=None)
	250	if content_item_id:
	251	return self._extract_from_content_id(content_item_id, url)
	252
	253	relinker_url = compat_urlparse.urljoin(url, self._search_regex(
	254	r'(?:var\s+videoURL\|mediaInfo\.mediaUri)\s=\s(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)',
	255	webpage, 'relinker URL', group='url'))
	256	formats = self._extract_relinker_formats(relinker_url, video_id)
	257	self._sort_formats(formats)
	258
	259	title = self._search_regex(
	260	r'var\s+videoTitolo\s=\s([\'"])(?P<title>[^\'"]+)\1',
	261	webpage, 'title', group='title', default=None) or self._og_search_title(webpage)
	262
	263	return {
	264	'id': video_id,
	265	'title': title,
	266	'formats': formats,
	267	}