]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/radiofrance.py
[cleanup] Misc (#8598)
[yt-dlp.git] / yt_dlp / extractor / radiofrance.py
1 import itertools
2 import re
3 import urllib.parse
4
5 from .common import InfoExtractor
6 from ..utils import (
7 int_or_none,
8 join_nonempty,
9 js_to_json,
10 parse_duration,
11 strftime_or_none,
12 traverse_obj,
13 unified_strdate,
14 urljoin,
15 )
16
17
class RadioFranceIE(InfoExtractor):
    """Extractor for "radiovisions" pages on maison.radiofrance.fr."""

    _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
    IE_NAME = 'radiofrance'

    _TEST = {
        'url': 'http://maison.radiofrance.fr/radiovisions/one-one',
        'md5': 'bdbb28ace95ed0e04faab32ba3160daf',
        'info_dict': {
            'id': 'one-one',
            'ext': 'ogg',
            'title': 'One to one',
            'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
            'uploader': 'Thomas Hercouët',
        },
    }

    def _real_extract(self, url):
        """Scrape title, description, uploader and audio formats from the page.

        The title and the audio source list are mandatory lookups; the
        description and uploader are looked up with ``fatal=False``.
        """
        video_id = self._match_valid_url(url).group('id')
        page = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<h1>(.*?)</h1>', page, 'title')
        description = self._html_search_regex(
            r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
            page, 'description', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="credit">&nbsp;&nbsp;&copy;&nbsp;(.*?)</div>',
            page, 'uploader', fatal=False)

        # The player element's data-source attribute holds "name: 'url'" pairs
        sources = self._html_search_regex(
            r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
            page, 'audio URLs')

        formats = []
        source_pairs = re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", sources)
        for position, (format_id, format_url) in enumerate(source_pairs):
            formats.append({
                'format_id': format_id,
                'url': format_url,
                'vcodec': 'none',
                # Position in the source list is used as the quality value
                'quality': position,
            })

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': description,
            'uploader': uploader,
        }
68
69
class RadioFranceBaseIE(InfoExtractor):
    """Shared URL fragments and page-data helper for radiofrance.fr extractors."""

    # Scheme and host prefix, for use inside subclasses' _VALID_URL
    _VALID_URL_BASE = r'https?://(?:www\.)?radiofrance\.fr'

    # Regex alternation of the station slugs, each escaped
    _STATIONS_RE = '|'.join(re.escape(station) for station in (
        'franceculture',
        'franceinfo',
        'franceinter',
        'francemusique',
        'fip',
        'mouv',
    ))

    def _extract_data_from_webpage(self, webpage, display_id, key):
        """Return the first dict stored under ``data.<key>`` in the page data.

        The data is the ``[{...}]`` array literal assigned to ``const data``
        in the webpage, converted with ``js_to_json`` before parsing. An empty
        dict is returned when no matching dict is found.
        """
        page_data = self._search_json(
            r'\bconst\s+data\s*=', webpage, key, display_id,
            contains_pattern=r'\[\{(?s:.+)\}\]', transform_source=js_to_json)
        match = traverse_obj(
            page_data, (..., 'data', key, {dict}), get_all=False)
        return match or {}
87
88
class FranceCultureIE(RadioFranceBaseIE):
    """Extractor for single podcast episode pages of the radiofrance.fr stations."""

    _VALID_URL = rf'''(?x)
        {RadioFranceBaseIE._VALID_URL_BASE}
        /(?:{RadioFranceBaseIE._STATIONS_RE})
        /podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d{{6,}})(?:$|[?#])
    '''

    _TESTS = [
        {
            'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
            'info_dict': {
                'id': '8440487',
                'display_id': 'la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau',
                'ext': 'mp3',
                'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau ?',
                'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?',
                'thumbnail': r're:^https?://.*\.(?:jpg|png)',
                'upload_date': '20220514',
                'duration': 2750,
            },
        },
        {
            'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9-30/le-7-9-30-du-vendredi-10-mars-2023-2107675',
            'info_dict': {
                'id': '2107675',
                'display_id': 'le-7-9-30-du-vendredi-10-mars-2023',
                'title': 'Inflation alimentaire : comment en sortir ? - Régis Debray et Claude Grange - Cybèle Idelot',
                'description': 'md5:36ee74351ede77a314fdebb94026b916',
                'thumbnail': r're:^https?://.*\.(?:jpg|png)',
                'upload_date': '20230310',
                'duration': 8977,
                'ext': 'mp3',
            },
        },
        {
            'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507',
            'only_matching': True,
        }, {
            'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-sciences/sante-bientot-un-vaccin-contre-l-asthme-allergique-3057200',
            'only_matching': True,
        }
    ]

    def _real_extract(self, url):
        """Build the info dict from the page's ``AudioObject`` JSON and HTML metadata.

        The audio URL, encoding format and duration come from the JSON object
        whose ``@type`` is ``AudioObject``; the remaining fields are scraped
        from the page markup.
        """
        video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
        webpage = self._download_webpage(url, display_id)

        # _search_json_ld doesn't correctly handle this. See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846
        video_data = self._search_json('', webpage, 'audio data', display_id, contains_pattern=r'{\s*"@type"\s*:\s*"AudioObject".+}')

        return {
            'id': video_id,
            'display_id': display_id,
            'url': video_data['contentUrl'],
            'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None,
            'duration': parse_duration(video_data.get('duration')),
            # The OpenGraph title is only a fallback: look it up lazily so that
            # a page without og:title still works when the <h1> title is found
            'title': self._html_search_regex(
                r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
                webpage, 'title', default=None) or self._og_search_title(webpage),
            'description': self._html_search_regex(
                r'(?s)<meta name="description"\s*content="([^"]+)', webpage, 'description', default=None),
            'thumbnail': self._og_search_thumbnail(webpage),
            'uploader': self._html_search_regex(
                r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None),
            'upload_date': unified_strdate(self._search_regex(
                r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False))
        }
155
156
class RadioFranceLiveIE(RadioFranceBaseIE):
    """Extractor for live streams of radiofrance.fr stations and their "radio-*" substations."""

    # Uses the shared host pattern, like the other radiofrance.fr extractors
    _VALID_URL = rf'''(?x)
        {RadioFranceBaseIE._VALID_URL_BASE}
        /(?P<id>{RadioFranceBaseIE._STATIONS_RE})
        /?(?P<substation_id>radio-[\w-]+)?(?:[#?]|$)
    '''

    _TESTS = [{
        'url': 'https://www.radiofrance.fr/franceinter/',
        'info_dict': {
            'id': 'franceinter',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/franceculture',
        'info_dict': {
            'id': 'franceculture',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/mouv/radio-musique-kids-family',
        'info_dict': {
            'id': 'mouv-radio-musique-kids-family',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/mouv/radio-rnb-soul',
        'info_dict': {
            'id': 'mouv-radio-rnb-soul',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/mouv/radio-musique-mix',
        'info_dict': {
            'id': 'mouv-radio-musique-mix',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/fip/radio-rock',
        'info_dict': {
            'id': 'fip-radio-rock',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/mouv',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Collect the live formats for a station or one of its substations.

        Substation pages are scraped for the ``webRadioData`` object embedded
        in the webpage; plain station URLs use the station's ``api/live``
        JSON endpoint instead.
        """
        station_id, substation_id = self._match_valid_url(url).group('id', 'substation_id')

        if substation_id:
            webpage = self._download_webpage(url, station_id)
            api_response = self._extract_data_from_webpage(webpage, station_id, 'webRadioData')
        else:
            api_response = self._download_json(
                f'https://www.radiofrance.fr/{station_id}/api/live', station_id)

        formats, subtitles = [], {}
        # Sources are looked up both under 'now' and at the top level of the
        # response; entries without a 'url' are filtered out by the lambda
        for media_source in traverse_obj(api_response, (('now', None), 'media', 'sources', lambda _, v: v['url'])):
            if media_source.get('format') == 'hls':
                fmts, subs = self._extract_m3u8_formats_and_subtitles(media_source['url'], station_id, fatal=False)
                formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)
            else:
                formats.append({
                    'url': media_source['url'],
                    'abr': media_source.get('bitrate'),
                })

        return {
            'id': join_nonempty(station_id, substation_id),
            # Prefer the visual legend; otherwise join the two "now" titles
            'title': traverse_obj(api_response, ('visual', 'legend')) or join_nonempty(
                ('now', 'firstLine', 'title'), ('now', 'secondLine', 'title'), from_dict=api_response, delim=' - '),
            'formats': formats,
            'subtitles': subtitles,
            'is_live': True,
        }
265
266
class RadioFrancePlaylistBaseIE(RadioFranceBaseIE):
    """Subclasses must set _METADATA_KEY"""

    def _call_api(self, content_id, cursor, page_num):
        """Fetch one further page of items; subclasses have to override this."""
        raise NotImplementedError('This method must be implemented by subclasses')

    def _generate_playlist_entries(self, content_id, content_response):
        """Yield a url_transparent result per item, following pagination cursors.

        ``content_response`` is the first page. Further pages are requested
        through ``_call_api`` (numbered from 2) for as long as the current
        page provides a ``next`` cursor.
        """
        for page_num in itertools.count(2):
            for item in content_response['items']:
                item_url = f'https://www.radiofrance.fr/{item["path"]}'
                item_info = traverse_obj(item, {
                    'title': 'title',
                    'description': 'standFirst',
                    'timestamp': ('publishedDate', {int_or_none}),
                    'thumbnail': ('visual', 'src'),
                })
                yield self.url_result(item_url, url_transparent=True, **item_info)

            # The cursor is read from 'pagination' first, then from the top level
            next_cursor = traverse_obj(
                content_response, (('pagination', None), 'next'), get_all=False)
            if not next_cursor:
                return

            content_response = self._call_api(content_id, next_cursor, page_num)

    def _real_extract(self, url):
        """Resolve the URL path through the ``path`` API and return a playlist.

        Playlist fields taken from ``name``/``role`` override those taken
        from ``title``/``standFirst`` when both are present.
        """
        display_id = self._match_id(url)

        path_response = self._download_json(
            'https://www.radiofrance.fr/api/v2.1/path', display_id,
            query={'value': urllib.parse.urlparse(url).path})
        metadata = path_response['content']

        content_id = metadata['id']
        entries = self._generate_playlist_entries(
            content_id, metadata[self._METADATA_KEY])

        playlist_info = traverse_obj(metadata, {
            'title': 'title',
            'description': 'standFirst',
            'thumbnail': ('visual', 'src'),
        })
        playlist_info.update(traverse_obj(metadata, {
            'title': 'name',
            'description': 'role',
        }))

        return self.playlist_result(
            entries, content_id, display_id=display_id, **playlist_info)
309
310
class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
    """Playlist extractor for podcast overview pages of the radiofrance.fr stations."""

    _VALID_URL = rf'''(?x)
        {RadioFranceBaseIE._VALID_URL_BASE}
        /(?:{RadioFranceBaseIE._STATIONS_RE})
        /podcasts/(?P<id>[\w-]+)/?(?:[?#]|$)
    '''

    # Key of the first page of items inside the path API's content object
    _METADATA_KEY = 'expressions'

    _TESTS = [{
        'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-vert',
        'info_dict': {
            'id': 'eaf6ef81-a980-4f1c-a7d1-8a75ecd54b17',
            'display_id': 'le-billet-vert',
            'title': 'Le billet sciences',
            'description': 'md5:eb1007b34b0c0a680daaa71525bbd4c1',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'https://www.radiofrance.fr/franceinter/podcasts/jean-marie-le-pen-l-obsession-nationale',
        'info_dict': {
            'id': '566fd524-3074-4fbc-ac69-8696f2152a54',
            'display_id': 'jean-marie-le-pen-l-obsession-nationale',
            'title': 'Jean-Marie Le Pen, l\'obsession nationale',
            'description': 'md5:a07c0cfb894f6d07a62d0ad12c4b7d73',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_count': 7,
    }, {
        'url': 'https://www.radiofrance.fr/franceculture/podcasts/serie-thomas-grjebine',
        'info_dict': {
            'id': '63c1ddc9-9f15-457a-98b2-411bac63f48d',
            'display_id': 'serie-thomas-grjebine',
            'title': 'Thomas Grjebine',
        },
        'playlist_count': 1,
    }, {
        'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip',
        'info_dict': {
            'id': '143dff38-e956-4a5d-8576-1c0b7242b99e',
            'display_id': 'certains-l-aiment-fip',
            'title': 'Certains l’aiment Fip',
            'description': 'md5:ff974672ba00d4fd5be80fb001c5b27e',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 321,
    }, {
        'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9',
        'only_matching': True,
    }, {
        'url': 'https://www.radiofrance.fr/mouv/podcasts/dirty-mix',
        'only_matching': True,
    }]

    def _call_api(self, podcast_id, cursor, page_num):
        """Download the page of the podcast's expressions identified by ``cursor``."""
        api_url = f'https://www.radiofrance.fr/api/v2.1/concepts/{podcast_id}/expressions'
        return self._download_json(
            api_url, podcast_id,
            note=f'Downloading page {page_num}', query={'pageCursor': cursor})
370
371
class RadioFranceProfileIE(RadioFrancePlaylistBaseIE):
    """Playlist extractor for radiofrance.fr "personnes" (profile) pages."""

    _VALID_URL = rf'{RadioFranceBaseIE._VALID_URL_BASE}/personnes/(?P<id>[\w-]+)'

    # Key of the first page of items inside the path API's content object
    _METADATA_KEY = 'documents'

    _TESTS = [{
        'url': 'https://www.radiofrance.fr/personnes/thomas-pesquet?p=3',
        'info_dict': {
            'id': '86c62790-e481-11e2-9f7b-782bcb6744eb',
            'display_id': 'thomas-pesquet',
            'title': 'Thomas Pesquet',
            'description': 'Astronaute à l\'agence spatiale européenne',
        },
        'playlist_mincount': 212,
    }, {
        'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie',
        'info_dict': {
            'id': '9593050b-0183-4972-a0b5-d8f699079e02',
            'display_id': 'eugenie-bastie',
            'title': 'Eugénie Bastié',
            'description': 'Journaliste et essayiste',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 39,
    }, {
        'url': 'https://www.radiofrance.fr/personnes/lea-salame',
        'only_matching': True,
    }]

    def _call_api(self, profile_id, cursor, page_num):
        """Download one page of the profile's documents.

        The cursor found under ``pagination.next`` is also copied to a
        top-level ``next`` key of the returned response.
        """
        documents_url = f'https://www.radiofrance.fr/api/v2.1/taxonomy/{profile_id}/documents'
        query = {
            'relation': 'personality',
            'cursor': cursor,
        }
        response = self._download_json(
            documents_url, profile_id,
            note=f'Downloading page {page_num}', query=query)

        response['next'] = traverse_obj(response, ('pagination', 'next'))
        return response
411
412
class RadioFranceProgramScheduleIE(RadioFranceBaseIE):
    """Playlist extractor for a station's "grille-programmes" (program schedule) page."""

    _VALID_URL = rf'''(?x)
        {RadioFranceBaseIE._VALID_URL_BASE}
        /(?P<station>{RadioFranceBaseIE._STATIONS_RE})
        /grille-programmes(?:\?date=(?P<date>[\d-]+))?
    '''

    _TESTS = [{
        'url': 'https://www.radiofrance.fr/franceinter/grille-programmes?date=17-02-2023',
        'info_dict': {
            'id': 'franceinter-program-20230217',
            'upload_date': '20230217',
        },
        'playlist_count': 25,
    }, {
        'url': 'https://www.radiofrance.fr/franceculture/grille-programmes?date=01-02-2023',
        'info_dict': {
            'id': 'franceculture-program-20230201',
            'upload_date': '20230201',
        },
        'playlist_count': 25,
    }, {
        'url': 'https://www.radiofrance.fr/mouv/grille-programmes?date=19-03-2023',
        'info_dict': {
            'id': 'mouv-program-20230319',
            'upload_date': '20230319',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.radiofrance.fr/francemusique/grille-programmes?date=18-03-2023',
        'info_dict': {
            'id': 'francemusique-program-20230318',
            'upload_date': '20230318',
        },
        'playlist_count': 15,
    }, {
        'url': 'https://www.radiofrance.fr/franceculture/grille-programmes',
        'only_matching': True,
    }]

    def _generate_playlist_entries(self, webpage_url, api_response):
        """Yield a url_transparent FranceCultureIE result per schedule step.

        Only steps that have an ``expression`` with a ``path`` are used; the
        path is resolved against ``webpage_url``.
        """
        for entry in traverse_obj(api_response, ('steps', lambda _, v: v['expression']['path'])):
            yield self.url_result(
                urljoin(webpage_url, f'/{entry["expression"]["path"]}'), ie=FranceCultureIE,
                url_transparent=True, **traverse_obj(entry, {
                    'title': ('expression', 'title'),
                    'thumbnail': ('expression', 'visual', 'src'),
                    'timestamp': ('startTime', {int_or_none}),
                    'series_id': ('concept', 'id'),
                    'series': ('concept', 'title'),
                }))

    def _real_extract(self, url):
        """Return the schedule embedded in the page's ``grid`` data as a playlist.

        The playlist date is taken from the grid data itself, not from the
        URL's ``date`` query value, so only the station is read from the URL.
        """
        station = self._match_valid_url(url).group('station')
        webpage = self._download_webpage(url, station)
        grid_data = self._extract_data_from_webpage(webpage, station, 'grid')
        upload_date = strftime_or_none(grid_data.get('date'), '%Y%m%d')

        return self.playlist_result(
            self._generate_playlist_entries(url, grid_data),
            join_nonempty(station, 'program', upload_date), upload_date=upload_date)