jfr.im git - yt-dlp.git/blame_incremental - yt

... / ...

Commit	Line	Data
	1	import itertools
	2	import re
	3	import urllib.parse
	4
	5	from .common import InfoExtractor
	6	from ..utils import (
	7	int_or_none,
	8	join_nonempty,
	9	js_to_json,
	10	parse_duration,
	11	strftime_or_none,
	12	traverse_obj,
	13	unified_strdate,
	14	urljoin,
	15	)
	16
	17
	18	class RadioFranceIE(InfoExtractor):
	19	_VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
	20	IE_NAME = 'radiofrance'
	21
	22	_TEST = {
	23	'url': 'http://maison.radiofrance.fr/radiovisions/one-one',
	24	'md5': 'bdbb28ace95ed0e04faab32ba3160daf',
	25	'info_dict': {
	26	'id': 'one-one',
	27	'ext': 'ogg',
	28	'title': 'One to one',
	29	'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
	30	'uploader': 'Thomas Hercouët',
	31	},
	32	}
	33
	34	def _real_extract(self, url):
	35	m = self._match_valid_url(url)
	36	video_id = m.group('id')
	37
	38	webpage = self._download_webpage(url, video_id)
	39	title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
	40	description = self._html_search_regex(
	41	r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
	42	webpage, 'description', fatal=False)
	43	uploader = self._html_search_regex(
	44	r'<div class="credit">  © (.*?)</div>',
	45	webpage, 'uploader', fatal=False)
	46
	47	formats_str = self._html_search_regex(
	48	r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
	49	webpage, 'audio URLs')
	50	formats = [
	51	{
	52	'format_id': fm[0],
	53	'url': fm[1],
	54	'vcodec': 'none',
	55	'quality': i,
	56	}
	57	for i, fm in
	58	enumerate(re.findall(r"([a-z0-9]+)\s:\s'([^']+)'", formats_str))
	59	]
	60
	61	return {
	62	'id': video_id,
	63	'title': title,
	64	'formats': formats,
	65	'description': description,
	66	'uploader': uploader,
	67	}
	68
	69
	70	class RadioFranceBaseIE(InfoExtractor):
	71	_VALID_URL_BASE = r'https?://(?:www\.)?radiofrance\.fr'
	72
	73	_STATIONS_RE = '\|'.join(map(re.escape, (
	74	'franceculture',
	75	'franceinfo',
	76	'franceinter',
	77	'francemusique',
	78	'fip',
	79	'mouv',
	80	)))
	81
	82	def _extract_data_from_webpage(self, webpage, display_id, key):
	83	return traverse_obj(self._search_json(
	84	r'\bconst\s+data\s*=', webpage, key, display_id,
	85	contains_pattern=r'\[\{(?s:.+)\}\]', transform_source=js_to_json),
	86	(..., 'data', key, {dict}), get_all=False) or {}
	87
	88
	89	class FranceCultureIE(RadioFranceBaseIE):
	90	_VALID_URL = rf'''(?x)
	91	{RadioFranceBaseIE._VALID_URL_BASE}
	92	/(?:{RadioFranceBaseIE._STATIONS_RE})
	93	/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d{{6,}})(?:$\|[?#])
	94	'''
	95
	96	_TESTS = [
	97	{
	98	'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
	99	'info_dict': {
	100	'id': '8440487',
	101	'display_id': 'la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau',
	102	'ext': 'mp3',
	103	'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau ?',
	104	'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?',
	105	'thumbnail': r're:^https?://.*\.(?:jpg\|png)',
	106	'upload_date': '20220514',
	107	'duration': 2750,
	108	},
	109	},
	110	{
	111	'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9-30/le-7-9-30-du-vendredi-10-mars-2023-2107675',
	112	'info_dict': {
	113	'id': '2107675',
	114	'display_id': 'le-7-9-30-du-vendredi-10-mars-2023',
	115	'title': 'Inflation alimentaire : comment en sortir ? - Régis Debray et Claude Grange - Cybèle Idelot',
	116	'description': 'md5:36ee74351ede77a314fdebb94026b916',
	117	'thumbnail': r're:^https?://.*\.(?:jpg\|png)',
	118	'upload_date': '20230310',
	119	'duration': 8977,
	120	'ext': 'mp3',
	121	},
	122	},
	123	{
	124	'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507',
	125	'only_matching': True,
	126	}, {
	127	'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-sciences/sante-bientot-un-vaccin-contre-l-asthme-allergique-3057200',
	128	'only_matching': True,
	129	}
	130	]
	131
	132	def _real_extract(self, url):
	133	video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
	134	webpage = self._download_webpage(url, display_id)
	135
	136	# _search_json_ld doesn't correctly handle this. See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846
	137	video_data = self._search_json('', webpage, 'audio data', display_id, contains_pattern=r'{\s"@type"\s:\s*"AudioObject".+}')
	138
	139	return {
	140	'id': video_id,
	141	'display_id': display_id,
	142	'url': video_data['contentUrl'],
	143	'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None,
	144	'duration': parse_duration(video_data.get('duration')),
	145	'title': self._html_search_regex(r'(?s)<h1[^>]itemprop="[^"]name[^"]"[^>]>(.+?)</h1>',
	146	webpage, 'title', default=self._og_search_title(webpage)),
	147	'description': self._html_search_regex(
	148	r'(?s)<meta name="description"\s*content="([^"]+)', webpage, 'description', default=None),
	149	'thumbnail': self._og_search_thumbnail(webpage),
	150	'uploader': self._html_search_regex(
	151	r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None),
	152	'upload_date': unified_strdate(self._search_regex(
	153	r'"datePublished"\s:\s"([^"]+)', webpage, 'timestamp', fatal=False))
	154	}
	155
	156
	157	class RadioFranceLiveIE(RadioFranceBaseIE):
	158	_VALID_URL = rf'''(?x)
	159	https?://(?:www\.)?radiofrance\.fr
	160	/(?P<id>{RadioFranceBaseIE._STATIONS_RE})
	161	/?(?P<substation_id>radio-[\w-]+)?(?:[#?]\|$)
	162	'''
	163
	164	_TESTS = [{
	165	'url': 'https://www.radiofrance.fr/franceinter/',
	166	'info_dict': {
	167	'id': 'franceinter',
	168	'title': str,
	169	'live_status': 'is_live',
	170	'ext': 'aac',
	171	},
	172	'params': {
	173	'skip_download': 'Livestream',
	174	},
	175	}, {
	176	'url': 'https://www.radiofrance.fr/franceculture',
	177	'info_dict': {
	178	'id': 'franceculture',
	179	'title': str,
	180	'live_status': 'is_live',
	181	'ext': 'aac',
	182	},
	183	'params': {
	184	'skip_download': 'Livestream',
	185	},
	186	}, {
	187	'url': 'https://www.radiofrance.fr/mouv/radio-musique-kids-family',
	188	'info_dict': {
	189	'id': 'mouv-radio-musique-kids-family',
	190	'title': str,
	191	'live_status': 'is_live',
	192	'ext': 'aac',
	193	},
	194	'params': {
	195	'skip_download': 'Livestream',
	196	},
	197	}, {
	198	'url': 'https://www.radiofrance.fr/mouv/radio-rnb-soul',
	199	'info_dict': {
	200	'id': 'mouv-radio-rnb-soul',
	201	'title': str,
	202	'live_status': 'is_live',
	203	'ext': 'aac',
	204	},
	205	'params': {
	206	'skip_download': 'Livestream',
	207	},
	208	}, {
	209	'url': 'https://www.radiofrance.fr/mouv/radio-musique-mix',
	210	'info_dict': {
	211	'id': 'mouv-radio-musique-mix',
	212	'title': str,
	213	'live_status': 'is_live',
	214	'ext': 'aac',
	215	},
	216	'params': {
	217	'skip_download': 'Livestream',
	218	},
	219	}, {
	220	'url': 'https://www.radiofrance.fr/fip/radio-rock',
	221	'info_dict': {
	222	'id': 'fip-radio-rock',
	223	'title': str,
	224	'live_status': 'is_live',
	225	'ext': 'aac',
	226	},
	227	'params': {
	228	'skip_download': 'Livestream',
	229	},
	230	}, {
	231	'url': 'https://www.radiofrance.fr/mouv',
	232	'only_matching': True,
	233	}]
	234
	235	def _real_extract(self, url):
	236	station_id, substation_id = self._match_valid_url(url).group('id', 'substation_id')
	237
	238	if substation_id:
	239	webpage = self._download_webpage(url, station_id)
	240	api_response = self._extract_data_from_webpage(webpage, station_id, 'webRadioData')
	241	else:
	242	api_response = self._download_json(
	243	f'https://www.radiofrance.fr/{station_id}/api/live', station_id)
	244
	245	formats, subtitles = [], {}
	246	for media_source in traverse_obj(api_response, (('now', None), 'media', 'sources', lambda _, v: v['url'])):
	247	if media_source.get('format') == 'hls':
	248	fmts, subs = self._extract_m3u8_formats_and_subtitles(media_source['url'], station_id, fatal=False)
	249	formats.extend(fmts)
	250	self._merge_subtitles(subs, target=subtitles)
	251	else:
	252	formats.append({
	253	'url': media_source['url'],
	254	'abr': media_source.get('bitrate'),
	255	})
	256
	257	return {
	258	'id': join_nonempty(station_id, substation_id),
	259	'title': traverse_obj(api_response, ('visual', 'legend')) or join_nonempty(
	260	('now', 'firstLine', 'title'), ('now', 'secondLine', 'title'), from_dict=api_response, delim=' - '),
	261	'formats': formats,
	262	'subtitles': subtitles,
	263	'is_live': True,
	264	}
	265
	266
	267	class RadioFrancePlaylistBaseIE(RadioFranceBaseIE):
	268	"""Subclasses must set _METADATA_KEY"""
	269
	270	def _call_api(self, content_id, cursor, page_num):
	271	raise NotImplementedError('This method must be implemented by subclasses')
	272
	273	def _generate_playlist_entries(self, content_id, content_response):
	274	for page_num in itertools.count(2):
	275	for entry in content_response['items']:
	276	yield self.url_result(
	277	f'https://www.radiofrance.fr/{entry["path"]}', url_transparent=True, **traverse_obj(entry, {
	278	'title': 'title',
	279	'description': 'standFirst',
	280	'timestamp': ('publishedDate', {int_or_none}),
	281	'thumbnail': ('visual', 'src'),
	282	}))
	283
	284	next_cursor = traverse_obj(content_response, (('pagination', None), 'next'), get_all=False)
	285	if not next_cursor:
	286	break
	287
	288	content_response = self._call_api(content_id, next_cursor, page_num)
	289
	290	def _real_extract(self, url):
	291	display_id = self._match_id(url)
	292
	293	metadata = self._download_json(
	294	'https://www.radiofrance.fr/api/v2.1/path', display_id,
	295	query={'value': urllib.parse.urlparse(url).path})['content']
	296
	297	content_id = metadata['id']
	298
	299	return self.playlist_result(
	300	self._generate_playlist_entries(content_id, metadata[self._METADATA_KEY]), content_id,
	301	display_id=display_id, {traverse_obj(metadata, {
	302	'title': 'title',
	303	'description': 'standFirst',
	304	'thumbnail': ('visual', 'src'),
	305	}), **traverse_obj(metadata, {
	306	'title': 'name',
	307	'description': 'role',
	308	})})
	309
	310
	311	class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
	312	_VALID_URL = rf'''(?x)
	313	{RadioFranceBaseIE._VALID_URL_BASE}
	314	/(?:{RadioFranceBaseIE._STATIONS_RE})
	315	/podcasts/(?P<id>[\w-]+)/?(?:[?#]\|$)
	316	'''
	317
	318	_TESTS = [{
	319	'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-vert',
	320	'info_dict': {
	321	'id': 'eaf6ef81-a980-4f1c-a7d1-8a75ecd54b17',
	322	'display_id': 'le-billet-vert',
	323	'title': 'Le billet sciences',
	324	'description': 'md5:eb1007b34b0c0a680daaa71525bbd4c1',
	325	'thumbnail': r're:^https?://.*\.(?:jpg\|png)',
	326	},
	327	'playlist_mincount': 11,
	328	}, {
	329	'url': 'https://www.radiofrance.fr/franceinter/podcasts/jean-marie-le-pen-l-obsession-nationale',
	330	'info_dict': {
	331	'id': '566fd524-3074-4fbc-ac69-8696f2152a54',
	332	'display_id': 'jean-marie-le-pen-l-obsession-nationale',
	333	'title': 'Jean-Marie Le Pen, l\'obsession nationale',
	334	'description': 'md5:a07c0cfb894f6d07a62d0ad12c4b7d73',
	335	'thumbnail': r're:^https?://.*\.(?:jpg\|png)',
	336	},
	337	'playlist_count': 7,
	338	}, {
	339	'url': 'https://www.radiofrance.fr/franceculture/podcasts/serie-thomas-grjebine',
	340	'info_dict': {
	341	'id': '63c1ddc9-9f15-457a-98b2-411bac63f48d',
	342	'display_id': 'serie-thomas-grjebine',
	343	'title': 'Thomas Grjebine',
	344	},
	345	'playlist_count': 1,
	346	}, {
	347	'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip',
	348	'info_dict': {
	349	'id': '143dff38-e956-4a5d-8576-1c0b7242b99e',
	350	'display_id': 'certains-l-aiment-fip',
	351	'title': 'Certains l’aiment Fip',
	352	'description': 'md5:ff974672ba00d4fd5be80fb001c5b27e',
	353	'thumbnail': r're:^https?://.*\.(?:jpg\|png)',
	354	},
	355	'playlist_mincount': 321,
	356	}, {
	357	'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9',
	358	'only_matching': True,
	359	}, {
	360	'url': 'https://www.radiofrance.fr/mouv/podcasts/dirty-mix',
	361	'only_matching': True,
	362	}]
	363
	364	_METADATA_KEY = 'expressions'
	365
	366	def _call_api(self, podcast_id, cursor, page_num):
	367	return self._download_json(
	368	f'https://www.radiofrance.fr/api/v2.1/concepts/{podcast_id}/expressions', podcast_id,
	369	note=f'Downloading page {page_num}', query={'pageCursor': cursor})
	370
	371
	372	class RadioFranceProfileIE(RadioFrancePlaylistBaseIE):
	373	_VALID_URL = rf'{RadioFranceBaseIE._VALID_URL_BASE}/personnes/(?P<id>[\w-]+)'
	374
	375	_TESTS = [{
	376	'url': 'https://www.radiofrance.fr/personnes/thomas-pesquet?p=3',
	377	'info_dict': {
	378	'id': '86c62790-e481-11e2-9f7b-782bcb6744eb',
	379	'display_id': 'thomas-pesquet',
	380	'title': 'Thomas Pesquet',
	381	'description': 'Astronaute à l\'agence spatiale européenne',
	382	},
	383	'playlist_mincount': 212,
	384	}, {
	385	'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie',
	386	'info_dict': {
	387	'id': '9593050b-0183-4972-a0b5-d8f699079e02',
	388	'display_id': 'eugenie-bastie',
	389	'title': 'Eugénie Bastié',
	390	'description': 'Journaliste et essayiste',
	391	'thumbnail': r're:^https?://.*\.(?:jpg\|png)',
	392	},
	393	'playlist_mincount': 39,
	394	}, {
	395	'url': 'https://www.radiofrance.fr/personnes/lea-salame',
	396	'only_matching': True,
	397	}]
	398
	399	_METADATA_KEY = 'documents'
	400
	401	def _call_api(self, profile_id, cursor, page_num):
	402	resp = self._download_json(
	403	f'https://www.radiofrance.fr/api/v2.1/taxonomy/{profile_id}/documents', profile_id,
	404	note=f'Downloading page {page_num}', query={
	405	'relation': 'personality',
	406	'cursor': cursor,
	407	})
	408
	409	resp['next'] = traverse_obj(resp, ('pagination', 'next'))
	410	return resp
	411
	412
	413	class RadioFranceProgramScheduleIE(RadioFranceBaseIE):
	414	_VALID_URL = rf'''(?x)
	415	{RadioFranceBaseIE._VALID_URL_BASE}
	416	/(?P<station>{RadioFranceBaseIE._STATIONS_RE})
	417	/grille-programmes(?:\?date=(?P<date>[\d-]+))?
	418	'''
	419
	420	_TESTS = [{
	421	'url': 'https://www.radiofrance.fr/franceinter/grille-programmes?date=17-02-2023',
	422	'info_dict': {
	423	'id': 'franceinter-program-20230217',
	424	'upload_date': '20230217',
	425	},
	426	'playlist_count': 25,
	427	}, {
	428	'url': 'https://www.radiofrance.fr/franceculture/grille-programmes?date=01-02-2023',
	429	'info_dict': {
	430	'id': 'franceculture-program-20230201',
	431	'upload_date': '20230201',
	432	},
	433	'playlist_count': 25,
	434	}, {
	435	'url': 'https://www.radiofrance.fr/mouv/grille-programmes?date=19-03-2023',
	436	'info_dict': {
	437	'id': 'mouv-program-20230319',
	438	'upload_date': '20230319',
	439	},
	440	'playlist_count': 3,
	441	}, {
	442	'url': 'https://www.radiofrance.fr/francemusique/grille-programmes?date=18-03-2023',
	443	'info_dict': {
	444	'id': 'francemusique-program-20230318',
	445	'upload_date': '20230318',
	446	},
	447	'playlist_count': 15,
	448	}, {
	449	'url': 'https://www.radiofrance.fr/franceculture/grille-programmes',
	450	'only_matching': True,
	451	}]
	452
	453	def _generate_playlist_entries(self, webpage_url, api_response):
	454	for entry in traverse_obj(api_response, ('steps', lambda _, v: v['expression']['path'])):
	455	yield self.url_result(
	456	urljoin(webpage_url, f'/{entry["expression"]["path"]}'), ie=FranceCultureIE,
	457	url_transparent=True, **traverse_obj(entry, {
	458	'title': ('expression', 'title'),
	459	'thumbnail': ('expression', 'visual', 'src'),
	460	'timestamp': ('startTime', {int_or_none}),
	461	'series_id': ('concept', 'id'),
	462	'series': ('concept', 'title'),
	463	}))
	464
	465	def _real_extract(self, url):
	466	station, date = self._match_valid_url(url).group('station', 'date')
	467	webpage = self._download_webpage(url, station)
	468	grid_data = self._extract_data_from_webpage(webpage, station, 'grid')
	469	upload_date = strftime_or_none(grid_data.get('date'), '%Y%m%d')
	470
	471	return self.playlist_result(
	472	self._generate_playlist_entries(url, grid_data),
	473	join_nonempty(station, 'program', upload_date), upload_date=upload_date)