jfr.im git - yt-dlp.git/blame_incremental

... / ...

Commit	Line	Data
	1	import re
	2
	3	from .common import InfoExtractor
	4	from ..utils import (
	5	ExtractorError,
	6	GeoRestrictedError,
	7	int_or_none,
	8	parse_iso8601,
	9	parse_qs,
	10	strip_or_none,
	11	traverse_obj,
	12	url_or_none,
	13	)
	14
	15
	16	class ArteTVBaseIE(InfoExtractor):
	17	_ARTE_LANGUAGES = 'fr\|de\|en\|es\|it\|pl'
	18	_API_BASE = 'https://api.arte.tv/api/player/v2'
	19
	20
	21	class ArteTVIE(ArteTVBaseIE):
	22	_VALID_URL = r'''(?x)
	23	(?:https?://
	24	(?:
	25	(?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos\|
	26	api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
	27	)
	28	\|arte://program)
	29	/(?P<id>\d{6}-\d{3}-[AF]\|LIVE)
	30	''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
	31	_TESTS = [{
	32	'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
	33	'only_matching': True,
	34	}, {
	35	'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
	36	'info_dict': {
	37	'id': '100103-000-A',
	38	'title': 'USA: Dyskryminacja na porodówce',
	39	'description': 'md5:242017b7cce59ffae340a54baefcafb1',
	40	'alt_title': 'ARTE Reportage',
	41	'upload_date': '20201103',
	42	'duration': 554,
	43	'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
	44	'timestamp': 1604417980,
	45	'ext': 'mp4',
	46	},
	47	'params': {'skip_download': 'm3u8'}
	48	}, {
	49	'note': 'No alt_title',
	50	'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
	51	'info_dict': {
	52	'id': '110371-000-A',
	53	'ext': 'mp4',
	54	'upload_date': '20220718',
	55	'duration': 154,
	56	'timestamp': 1658162460,
	57	'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
	58	'title': 'La chaleur, supplice des arbres de rue',
	59	'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
	60	},
	61	'params': {'skip_download': 'm3u8'}
	62	}, {
	63	'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
	64	'only_matching': True,
	65	}, {
	66	'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
	67	'only_matching': True,
	68	}]
	69
	70	_GEO_BYPASS = True
	71
	72	_LANG_MAP = { # ISO639 -> French abbreviations
	73	'fr': 'F',
	74	'de': 'A',
	75	'en': 'E[ANG]',
	76	'es': 'E[ESP]',
	77	'it': 'E[ITA]',
	78	'pl': 'E[POL]',
	79	# XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
	80	# uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
	81	'mul': 'EU',
	82	}
	83
	84	_VERSION_CODE_RE = re.compile(r'''(?x)
	85	V
	86	(?P<original_voice>O?)
	87	(?P<vlang>[FA]\|E\[[A-Z]+\]\|EU)?
	88	(?P<audio_desc>AUD\|)
	89	(?:
	90	(?P<has_sub>-ST)
	91	(?P<sdh_sub>M?)
	92	(?P<sub_lang>[FA]\|E\[[A-Z]+\]\|EU)
	93	)?
	94	''')
	95
	96	# all obtained by exhaustive testing
	97	_COUNTRIES_MAP = {
	98	'DE_FR': {
	99	'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
	100	'PF', 'PM', 'RE', 'WF', 'YT',
	101	},
	102	# with both of the below 'BE' sometimes works, sometimes doesn't
	103	'EUR_DE_FR': {
	104	'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
	105	'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
	106	'YT',
	107	},
	108	'SAT': {
	109	'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
	110	'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
	111	'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
	112	'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
	113	'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
	114	'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
	115	},
	116	}
	117
	118	def _real_extract(self, url):
	119	mobj = self._match_valid_url(url)
	120	video_id = mobj.group('id')
	121	lang = mobj.group('lang') or mobj.group('lang_2')
	122	langauge_code = self._LANG_MAP.get(lang)
	123
	124	config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)
	125
	126	geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
	127	if geoblocking.get('restrictedArea'):
	128	raise GeoRestrictedError(f'Video restricted to {geoblocking["code"]!r}',
	129	countries=self._COUNTRIES_MAP.get(geoblocking['code'], ('DE', 'FR')))
	130
	131	if not traverse_obj(config, ('data', 'attributes', 'rights')):
	132	# Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
	133	# Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
	134	raise ExtractorError(
	135	'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)
	136
	137	formats, subtitles = [], {}
	138	for stream in config['data']['attributes']['streams']:
	139	# official player contains code like `e.get("versions")[0].eStat.ml5`
	140	stream_version = stream['versions'][0]
	141	stream_version_code = stream_version['eStat']['ml5']
	142
	143	lang_pref = -1
	144	m = self._VERSION_CODE_RE.match(stream_version_code)
	145	if m:
	146	lang_pref = int(''.join('01'[x] for x in (
	147	m.group('vlang') == langauge_code, # we prefer voice in the requested language
	148	not m.group('audio_desc'), # and not the audio description version
	149	bool(m.group('original_voice')), # but if voice is not in the requested language, at least choose the original voice
	150	m.group('sub_lang') == langauge_code, # if subtitles are present, we prefer them in the requested language
	151	not m.group('has_sub'), # but we prefer no subtitles otherwise
	152	not m.group('sdh_sub'), # and we prefer not the hard-of-hearing subtitles if there are subtitles
	153	)))
	154
	155	if stream['protocol'].startswith('HLS'):
	156	fmts, subs = self._extract_m3u8_formats_and_subtitles(
	157	stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
	158	for fmt in fmts:
	159	fmt.update({
	160	'format_note': f'{stream_version.get("label", "unknown")} [{stream_version.get("shortLabel", "?")}]',
	161	'language_preference': lang_pref,
	162	})
	163	formats.extend(fmts)
	164	self._merge_subtitles(subs, target=subtitles)
	165
	166	elif stream['protocol'] in ('HTTPS', 'RTMP'):
	167	formats.append({
	168	'format_id': f'{stream["protocol"]}-{stream_version_code}',
	169	'url': stream['url'],
	170	'format_note': f'{stream_version.get("label", "unknown")} [{stream_version.get("shortLabel", "?")}]',
	171	'language_preference': lang_pref,
	172	# 'ext': 'mp4', # XXX: may or may not be necessary, at least for HTTPS
	173	})
	174
	175	else:
	176	self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')
	177
	178	# TODO: chapters from stream['segments']?
	179	# The JS also looks for chapters in config['data']['attributes']['chapters'],
	180	# but I am yet to find a video having those
	181
	182	self._sort_formats(formats)
	183
	184	metadata = config['data']['attributes']['metadata']
	185
	186	return {
	187	'id': metadata['providerId'],
	188	'webpage_url': traverse_obj(metadata, ('link', 'url')),
	189	'title': traverse_obj(metadata, 'subtitle', 'title'),
	190	'alt_title': metadata.get('subtitle') and metadata.get('title'),
	191	'description': metadata.get('description'),
	192	'duration': traverse_obj(metadata, ('duration', 'seconds')),
	193	'language': metadata.get('language'),
	194	'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
	195	'is_live': config['data']['attributes'].get('live', False),
	196	'formats': formats,
	197	'subtitles': subtitles,
	198	'thumbnails': [
	199	{'url': image['url'], 'id': image.get('caption')}
	200	for image in metadata.get('images') or [] if url_or_none(image.get('url'))
	201	],
	202	}
	203
	204
	205	class ArteTVEmbedIE(InfoExtractor):
	206	_VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
	207	_EMBED_REGEX = [r'<(?:iframe\|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
	208	_TESTS = [{
	209	'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
	210	'info_dict': {
	211	'id': '100605-013-A',
	212	'ext': 'mp4',
	213	'title': 'United we Stream November Lockdown Edition #13',
	214	'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
	215	'upload_date': '20201116',
	216	},
	217	'skip': 'No video available'
	218	}, {
	219	'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
	220	'only_matching': True,
	221	}]
	222
	223	def _real_extract(self, url):
	224	qs = parse_qs(url)
	225	json_url = qs['json_url'][0]
	226	video_id = ArteTVIE._match_id(json_url)
	227	return self.url_result(
	228	json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
	229
	230
	231	class ArteTVPlaylistIE(ArteTVBaseIE):
	232	_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
	233	_TESTS = [{
	234	'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
	235	'only_matching': True,
	236	}, {
	237	'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
	238	'playlist_mincount': 100,
	239	'info_dict': {
	240	'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
	241	'id': 'RC-014123',
	242	'title': 'ARTE Reportage - najlepsze reportaże',
	243	},
	244	}]
	245
	246	def _real_extract(self, url):
	247	lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
	248	playlist = self._download_json(
	249	f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)['data']['attributes']
	250
	251	entries = [{
	252	'_type': 'url_transparent',
	253	'url': video['config']['url'],
	254	'ie_key': ArteTVIE.ie_key(),
	255	'id': video.get('providerId'),
	256	'title': video.get('title'),
	257	'alt_title': video.get('subtitle'),
	258	'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
	259	'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
	260	} for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))]
	261
	262	return self.playlist_result(entries, playlist_id,
	263	traverse_obj(playlist, ('metadata', 'title')),
	264	traverse_obj(playlist, ('metadata', 'description')))
	265
	266
	267	class ArteTVCategoryIE(ArteTVBaseIE):
	268	_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+))/?\s$' % ArteTVBaseIE._ARTE_LANGUAGES
	269	_TESTS = [{
	270	'url': 'https://www.arte.tv/en/videos/politics-and-society/',
	271	'info_dict': {
	272	'id': 'politics-and-society',
	273	'title': 'Politics and society',
	274	'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
	275	},
	276	'playlist_mincount': 13,
	277	}]
	278
	279	@classmethod
	280	def suitable(cls, url):
	281	return (
	282	not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
	283	and super().suitable(url))
	284
	285	def _real_extract(self, url):
	286	lang, playlist_id = self._match_valid_url(url).groups()
	287	webpage = self._download_webpage(url, playlist_id)
	288
	289	items = []
	290	for video in re.finditer(
	291	r'<a\b[^>]?href\s=\s*(?P<q>"\|\'\|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
	292	webpage):
	293	video = video.group('url')
	294	if video == url:
	295	continue
	296	if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
	297	items.append(video)
	298
	299	title = (self._og_search_title(webpage, default=None)
	300	or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title>', default=None))
	301	title = strip_or_none(title.rsplit('\|', 1)[0]) or self._generic_title(url)
	302
	303	return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
	304	description=self._og_search_description(webpage, default=None))