jfr.im git - yt-dlp.git/blame_incremental

... / ...

Commit	Line	Data
	1	# coding: utf-8
	2	from __future__ import unicode_literals
	3
	4	import json
	5
	6	from .common import InfoExtractor
	7	from .brightcove import BrightcoveNewIE
	8	from ..utils import (
	9	clean_html,
	10	determine_ext,
	11	extract_attributes,
	12	get_element_by_class,
	13	JSON_LD_RE,
	14	merge_dicts,
	15	parse_duration,
	16	smuggle_url,
	17	try_get,
	18	url_or_none,
	19	)
	20
	21
	22	class ITVIE(InfoExtractor):
	23	_VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)'
	24	_GEO_COUNTRIES = ['GB']
	25	_TESTS = [{
	26	'url': 'https://www.itv.com/hub/liar/2a4547a0012',
	27	'info_dict': {
	28	'id': '2a4547a0012',
	29	'ext': 'mp4',
	30	'title': 'Liar - Series 2 - Episode 6',
	31	'description': 'md5:d0f91536569dec79ea184f0a44cca089',
	32	'series': 'Liar',
	33	'season_number': 2,
	34	'episode_number': 6,
	35	},
	36	'params': {
	37	# m3u8 download
	38	'skip_download': True,
	39	},
	40	}, {
	41	# unavailable via data-playlist-url
	42	'url': 'https://www.itv.com/hub/through-the-keyhole/2a2271a0033',
	43	'only_matching': True,
	44	}, {
	45	# InvalidVodcrid
	46	'url': 'https://www.itv.com/hub/james-martins-saturday-morning/2a5159a0034',
	47	'only_matching': True,
	48	}, {
	49	# ContentUnavailable
	50	'url': 'https://www.itv.com/hub/whos-doing-the-dishes/2a2898a0024',
	51	'only_matching': True,
	52	}]
	53
	54	def _real_extract(self, url):
	55	video_id = self._match_id(url)
	56	webpage = self._download_webpage(url, video_id)
	57	params = extract_attributes(self._search_regex(
	58	r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params'))
	59
	60	ios_playlist_url = params.get('data-video-playlist') or params['data-video-id']
	61	hmac = params['data-video-hmac']
	62	headers = self.geo_verification_headers()
	63	headers.update({
	64	'Accept': 'application/vnd.itv.vod.playlist.v2+json',
	65	'Content-Type': 'application/json',
	66	'hmac': hmac.upper(),
	67	})
	68	ios_playlist = self._download_json(
	69	ios_playlist_url, video_id, data=json.dumps({
	70	'user': {
	71	'itvUserId': '',
	72	'entitlements': [],
	73	'token': ''
	74	},
	75	'device': {
	76	'manufacturer': 'Safari',
	77	'model': '5',
	78	'os': {
	79	'name': 'Windows NT',
	80	'version': '6.1',
	81	'type': 'desktop'
	82	}
	83	},
	84	'client': {
	85	'version': '4.1',
	86	'id': 'browser'
	87	},
	88	'variantAvailability': {
	89	'featureset': {
	90	'min': ['hls', 'aes', 'outband-webvtt'],
	91	'max': ['hls', 'aes', 'outband-webvtt']
	92	},
	93	'platformTag': 'dotcom'
	94	}
	95	}).encode(), headers=headers)
	96	video_data = ios_playlist['Playlist']['Video']
	97	ios_base_url = video_data.get('Base')
	98
	99	formats = []
	100	for media_file in (video_data.get('MediaFiles') or []):
	101	href = media_file.get('Href')
	102	if not href:
	103	continue
	104	if ios_base_url:
	105	href = ios_base_url + href
	106	ext = determine_ext(href)
	107	if ext == 'm3u8':
	108	formats.extend(self._extract_m3u8_formats(
	109	href, video_id, 'mp4', entry_protocol='m3u8_native',
	110	m3u8_id='hls', fatal=False))
	111	else:
	112	formats.append({
	113	'url': href,
	114	})
	115	self._sort_formats(formats)
	116
	117	subtitles = {}
	118	subs = video_data.get('Subtitles') or []
	119	for sub in subs:
	120	if not isinstance(sub, dict):
	121	continue
	122	href = url_or_none(sub.get('Href'))
	123	if not href:
	124	continue
	125	subtitles.setdefault('en', []).append({
	126	'url': href,
	127	'ext': determine_ext(href, 'vtt'),
	128	})
	129
	130	info = self._search_json_ld(webpage, video_id, default={})
	131	if not info:
	132	json_ld = self._parse_json(self._search_regex(
	133	JSON_LD_RE, webpage, 'JSON-LD', '{}',
	134	group='json_ld'), video_id, fatal=False)
	135	if json_ld and json_ld.get('@type') == 'BreadcrumbList':
	136	for ile in (json_ld.get('itemListElement:') or []):
	137	item = ile.get('item:') or {}
	138	if item.get('@type') == 'TVEpisode':
	139	item['@context'] = 'http://schema.org'
	140	info = self._json_ld(item, video_id, fatal=False) or {}
	141	break
	142
	143	return merge_dicts({
	144	'id': video_id,
	145	'title': self._html_search_meta(['og:title', 'twitter:title'], webpage),
	146	'formats': formats,
	147	'subtitles': subtitles,
	148	'duration': parse_duration(video_data.get('Duration')),
	149	'description': clean_html(get_element_by_class('episode-info__synopsis', webpage)),
	150	}, info)
	151
	152
	153	class ITVBTCCIE(InfoExtractor):
	154	_VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)'
	155	_TEST = {
	156	'url': 'https://www.itv.com/btcc/articles/btcc-2019-brands-hatch-gp-race-action',
	157	'info_dict': {
	158	'id': 'btcc-2019-brands-hatch-gp-race-action',
	159	'title': 'BTCC 2019: Brands Hatch GP race action',
	160	},
	161	'playlist_count': 12,
	162	}
	163	BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s'
	164
	165	def _real_extract(self, url):
	166	playlist_id = self._match_id(url)
	167
	168	webpage = self._download_webpage(url, playlist_id)
	169
	170	json_map = try_get(self._parse_json(self._html_search_regex(
	171	'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[^>]*>([^<]+)</script>', webpage, 'json_map'), playlist_id),
	172	lambda x: x['props']['pageProps']['article']['body']['content']) or []
	173
	174	# Discard empty objects
	175	video_ids = []
	176	for video in json_map:
	177	if video['data'].get('id'):
	178	video_ids.append(video['data']['id'])
	179
	180	entries = [
	181	self.url_result(
	182	smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {
	183	# ITV does not like some GB IP ranges, so here are some
	184	# IP blocks it accepts
	185	'geo_ip_blocks': [
	186	'193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21'
	187	],
	188	'referrer': url,
	189	}),
	190	ie=BrightcoveNewIE.ie_key(), video_id=video_id)
	191	for video_id in video_ids]
	192
	193	title = self._og_search_title(webpage, fatal=False)
	194
	195	return self.playlist_result(entries, playlist_id, title)