jfr.im git - yt-dlp.git/blame_incremental

... / ...

Commit	Line	Data
	1	import re
	2
	3	from .common import InfoExtractor
	4	from ..utils import (
	5	ExtractorError,
	6	GeoRestrictedError,
	7	int_or_none,
	8	parse_iso8601,
	9	parse_qs,
	10	strip_or_none,
	11	traverse_obj,
	12	url_or_none,
	13	)
	14
	15
	16	class ArteTVBaseIE(InfoExtractor):
	17	_ARTE_LANGUAGES = 'fr\|de\|en\|es\|it\|pl'
	18	_API_BASE = 'https://api.arte.tv/api/player/v2'
	19
	20
	21	class ArteTVIE(ArteTVBaseIE):
	22	_VALID_URL = r'''(?x)
	23	(?:https?://
	24	(?:
	25	(?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos\|
	26	api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
	27	)
	28	\|arte://program)
	29	/(?P<id>\d{6}-\d{3}-[AF]\|LIVE)
	30	''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
	31	_TESTS = [{
	32	'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
	33	'only_matching': True,
	34	}, {
	35	'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
	36	'info_dict': {
	37	'id': '100103-000-A',
	38	'title': 'USA: Dyskryminacja na porodówce',
	39	'description': 'md5:242017b7cce59ffae340a54baefcafb1',
	40	'alt_title': 'ARTE Reportage',
	41	'upload_date': '20201103',
	42	'duration': 554,
	43	'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
	44	'timestamp': 1604417980,
	45	'ext': 'mp4',
	46	},
	47	'params': {'skip_download': 'm3u8'}
	48	}, {
	49	'note': 'No alt_title',
	50	'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
	51	'only_matching': True,
	52	}, {
	53	'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
	54	'only_matching': True,
	55	}, {
	56	'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
	57	'only_matching': True,
	58	}, {
	59	'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
	60	'only_matching': True,
	61	}, {
	62	'note': 'age-restricted',
	63	'url': 'https://www.arte.tv/de/videos/006785-000-A/the-element-of-crime/',
	64	'info_dict': {
	65	'id': '006785-000-A',
	66	'description': 'md5:c2f94fdfefc8a280e4dab68ab96ab0ba',
	67	'title': 'The Element of Crime',
	68	'timestamp': 1696111200,
	69	'duration': 5849,
	70	'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/q82dTTfyuCXupPsGxXsd7B/940x530',
	71	'upload_date': '20230930',
	72	'ext': 'mp4',
	73	}
	74	}]
	75
	76	_GEO_BYPASS = True
	77
	78	_LANG_MAP = { # ISO639 -> French abbreviations
	79	'fr': 'F',
	80	'de': 'A',
	81	'en': 'E[ANG]',
	82	'es': 'E[ESP]',
	83	'it': 'E[ITA]',
	84	'pl': 'E[POL]',
	85	# XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
	86	# uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
	87	'mul': 'EU',
	88	}
	89
	90	_VERSION_CODE_RE = re.compile(r'''(?x)
	91	V
	92	(?P<original_voice>O?)
	93	(?P<vlang>[FA]\|E\[[A-Z]+\]\|EU)?
	94	(?P<audio_desc>AUD\|)
	95	(?:
	96	(?P<has_sub>-ST)
	97	(?P<sdh_sub>M?)
	98	(?P<sub_lang>[FA]\|E\[[A-Z]+\]\|EU)
	99	)?
	100	''')
	101
	102	# all obtained by exhaustive testing
	103	_COUNTRIES_MAP = {
	104	'DE_FR': (
	105	'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
	106	'PF', 'PM', 'RE', 'WF', 'YT',
	107	),
	108	# with both of the below 'BE' sometimes works, sometimes doesn't
	109	'EUR_DE_FR': (
	110	'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
	111	'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
	112	'YT',
	113	),
	114	'SAT': (
	115	'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
	116	'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
	117	'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
	118	'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
	119	'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
	120	'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
	121	),
	122	}
	123
	124	def _real_extract(self, url):
	125	mobj = self._match_valid_url(url)
	126	video_id = mobj.group('id')
	127	lang = mobj.group('lang') or mobj.group('lang_2')
	128	langauge_code = self._LANG_MAP.get(lang)
	129
	130	config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id, headers={
	131	'x-validated-age': '18'
	132	})
	133
	134	geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
	135	if geoblocking.get('restrictedArea'):
	136	raise GeoRestrictedError(f'Video restricted to {geoblocking["code"]!r}',
	137	countries=self._COUNTRIES_MAP.get(geoblocking['code'], ('DE', 'FR')))
	138
	139	if not traverse_obj(config, ('data', 'attributes', 'rights')):
	140	# Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
	141	# Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
	142	raise ExtractorError(
	143	'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)
	144
	145	formats, subtitles = [], {}
	146	secondary_formats = []
	147	for stream in config['data']['attributes']['streams']:
	148	# official player contains code like `e.get("versions")[0].eStat.ml5`
	149	stream_version = stream['versions'][0]
	150	stream_version_code = stream_version['eStat']['ml5']
	151
	152	lang_pref = -1
	153	m = self._VERSION_CODE_RE.match(stream_version_code)
	154	if m:
	155	lang_pref = int(''.join('01'[x] for x in (
	156	m.group('vlang') == langauge_code, # we prefer voice in the requested language
	157	not m.group('audio_desc'), # and not the audio description version
	158	bool(m.group('original_voice')), # but if voice is not in the requested language, at least choose the original voice
	159	m.group('sub_lang') == langauge_code, # if subtitles are present, we prefer them in the requested language
	160	not m.group('has_sub'), # but we prefer no subtitles otherwise
	161	not m.group('sdh_sub'), # and we prefer not the hard-of-hearing subtitles if there are subtitles
	162	)))
	163
	164	short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
	165	if 'HLS' in stream['protocol']:
	166	fmts, subs = self._extract_m3u8_formats_and_subtitles(
	167	stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
	168	for fmt in fmts:
	169	fmt.update({
	170	'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
	171	'language_preference': lang_pref,
	172	})
	173	if any(map(short_label.startswith, ('cc', 'OGsub'))):
	174	secondary_formats.extend(fmts)
	175	else:
	176	formats.extend(fmts)
	177	self._merge_subtitles(subs, target=subtitles)
	178
	179	elif stream['protocol'] in ('HTTPS', 'RTMP'):
	180	formats.append({
	181	'format_id': f'{stream["protocol"]}-{stream_version_code}',
	182	'url': stream['url'],
	183	'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
	184	'language_preference': lang_pref,
	185	# 'ext': 'mp4', # XXX: may or may not be necessary, at least for HTTPS
	186	})
	187
	188	else:
	189	self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')
	190
	191	formats.extend(secondary_formats)
	192	self._remove_duplicate_formats(formats)
	193
	194	metadata = config['data']['attributes']['metadata']
	195
	196	return {
	197	'id': metadata['providerId'],
	198	'webpage_url': traverse_obj(metadata, ('link', 'url')),
	199	'title': traverse_obj(metadata, 'subtitle', 'title'),
	200	'alt_title': metadata.get('subtitle') and metadata.get('title'),
	201	'description': metadata.get('description'),
	202	'duration': traverse_obj(metadata, ('duration', 'seconds')),
	203	'language': metadata.get('language'),
	204	'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
	205	'is_live': config['data']['attributes'].get('live', False),
	206	'formats': formats,
	207	'subtitles': subtitles,
	208	'thumbnails': [
	209	{'url': image['url'], 'id': image.get('caption')}
	210	for image in metadata.get('images') or [] if url_or_none(image.get('url'))
	211	],
	212	# TODO: chapters may also be in stream['segments']?
	213	'chapters': traverse_obj(config, ('data', 'attributes', 'chapters', 'elements', ..., {
	214	'start_time': 'startTime',
	215	'title': 'title',
	216	})) or None,
	217	}
	218
	219
	220	class ArteTVEmbedIE(InfoExtractor):
	221	_VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
	222	_EMBED_REGEX = [r'<(?:iframe\|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
	223	_TESTS = [{
	224	'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
	225	'info_dict': {
	226	'id': '100605-013-A',
	227	'ext': 'mp4',
	228	'title': 'United we Stream November Lockdown Edition #13',
	229	'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
	230	'upload_date': '20201116',
	231	},
	232	'skip': 'No video available'
	233	}, {
	234	'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
	235	'only_matching': True,
	236	}]
	237
	238	def _real_extract(self, url):
	239	qs = parse_qs(url)
	240	json_url = qs['json_url'][0]
	241	video_id = ArteTVIE._match_id(json_url)
	242	return self.url_result(
	243	json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
	244
	245
	246	class ArteTVPlaylistIE(ArteTVBaseIE):
	247	_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
	248	_TESTS = [{
	249	'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
	250	'only_matching': True,
	251	}, {
	252	'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
	253	'playlist_mincount': 100,
	254	'info_dict': {
	255	'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
	256	'id': 'RC-014123',
	257	'title': 'ARTE Reportage - najlepsze reportaże',
	258	},
	259	}]
	260
	261	def _real_extract(self, url):
	262	lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
	263	playlist = self._download_json(
	264	f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)['data']['attributes']
	265
	266	entries = [{
	267	'_type': 'url_transparent',
	268	'url': video['config']['url'],
	269	'ie_key': ArteTVIE.ie_key(),
	270	'id': video.get('providerId'),
	271	'title': video.get('title'),
	272	'alt_title': video.get('subtitle'),
	273	'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
	274	'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
	275	} for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))]
	276
	277	return self.playlist_result(entries, playlist_id,
	278	traverse_obj(playlist, ('metadata', 'title')),
	279	traverse_obj(playlist, ('metadata', 'description')))
	280
	281
	282	class ArteTVCategoryIE(ArteTVBaseIE):
	283	_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+))/?\s$' % ArteTVBaseIE._ARTE_LANGUAGES
	284	_TESTS = [{
	285	'url': 'https://www.arte.tv/en/videos/politics-and-society/',
	286	'info_dict': {
	287	'id': 'politics-and-society',
	288	'title': 'Politics and society',
	289	'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
	290	},
	291	'playlist_mincount': 13,
	292	}]
	293
	294	@classmethod
	295	def suitable(cls, url):
	296	return (
	297	not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
	298	and super().suitable(url))
	299
	300	def _real_extract(self, url):
	301	lang, playlist_id = self._match_valid_url(url).groups()
	302	webpage = self._download_webpage(url, playlist_id)
	303
	304	items = []
	305	for video in re.finditer(
	306	r'<a\b[^>]?href\s=\s*(?P<q>"\|\'\|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
	307	webpage):
	308	video = video.group('url')
	309	if video == url:
	310	continue
	311	if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
	312	items.append(video)
	313
	314	title = strip_or_none(self._generic_title('', webpage, default='').rsplit('\|', 1)[0]) or None
	315
	316	return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
	317	description=self._og_search_description(webpage, default=None))