]>
Commit | Line | Data |
---|---|---|
1 | import itertools | |
2 | import re | |
3 | import urllib.parse | |
4 | ||
5 | from .common import InfoExtractor | |
6 | from ..utils import ( | |
7 | int_or_none, | |
8 | join_nonempty, | |
9 | js_to_json, | |
10 | parse_duration, | |
11 | strftime_or_none, | |
12 | traverse_obj, | |
13 | unified_strdate, | |
14 | urljoin, | |
15 | ) | |
16 | ||
17 | ||
class RadioFranceIE(InfoExtractor):
    """Extractor for ``maison.radiofrance.fr/radiovisions/<id>`` pages."""

    _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
    IE_NAME = 'radiofrance'

    _TEST = {
        'url': 'http://maison.radiofrance.fr/radiovisions/one-one',
        'md5': 'bdbb28ace95ed0e04faab32ba3160daf',
        'info_dict': {
            'id': 'one-one',
            'ext': 'ogg',
            'title': 'One to one',
            'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
            'uploader': 'Thomas Hercouët',
        },
    }

    def _real_extract(self, url):
        """Scrape title, description, uploader and audio formats from the page.

        The title and the ``data-source`` attribute are required; description
        and uploader are looked up with ``fatal=False``.
        """
        video_id = self._match_valid_url(url).group('id')
        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
        description = self._html_search_regex(
            r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
            webpage, 'description', fatal=False)
        # NOTE(review): this pattern contains a literal '©' surrounded by
        # spaces -- confirm the page markup does not use HTML entities here.
        uploader = self._html_search_regex(
            r'<div class="credit"> © (.*?)</div>',
            webpage, 'uploader', fatal=False)

        formats_str = self._html_search_regex(
            r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
            webpage, 'audio URLs')

        # The attribute holds ``name: 'url'`` pairs; the position of each
        # pair in the attribute is used as its quality value.
        formats = []
        pairs = re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str)
        for rank, (format_id, format_url) in enumerate(pairs):
            formats.append({
                'format_id': format_id,
                'url': format_url,
                'vcodec': 'none',
                'quality': rank,
            })

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': description,
            'uploader': uploader,
        }
68 | ||
69 | ||
class RadioFranceBaseIE(InfoExtractor):
    """Shared URL fragments and page-data helper for the radiofrance.fr extractors."""

    _VALID_URL_BASE = r'https?://(?:www\.)?radiofrance\.fr'

    # Regex alternation of the (escaped) station slugs used in URL paths.
    _STATIONS_RE = '|'.join(re.escape(station) for station in (
        'franceculture',
        'franceinfo',
        'franceinter',
        'francemusique',
        'fip',
        'mouv',
    ))

    def _extract_data_from_webpage(self, webpage, display_id, key):
        """Return the first dict stored under ``data -> key`` in the page's ``const data`` array.

        The JavaScript literal assigned to ``const data`` is converted with
        ``js_to_json`` before parsing. An empty dict is returned when no
        matching dict is found.
        """
        page_data = self._search_json(
            r'\bconst\s+data\s*=', webpage, key, display_id,
            contains_pattern=r'\[\{(?s:.+)\}\]', transform_source=js_to_json)
        found = traverse_obj(page_data, (..., 'data', key, {dict}), get_all=False)
        return found or {}
87 | ||
88 | ||
class FranceCultureIE(RadioFranceBaseIE):
    """Extractor for single podcast episodes (``/<station>/podcasts/...-<numeric id>``)."""

    _VALID_URL = rf'''(?x)
        {RadioFranceBaseIE._VALID_URL_BASE}
        /(?:{RadioFranceBaseIE._STATIONS_RE})
        /podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d{{6,}})(?:$|[?#])
    '''

    _TESTS = [
        {
            'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
            'info_dict': {
                'id': '8440487',
                'display_id': 'la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau',
                'ext': 'mp3',
                'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau ?',
                'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?',
                'thumbnail': r're:^https?://.*\.(?:jpg|png)',
                'upload_date': '20220514',
                'duration': 2750,
            },
        },
        {
            'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9-30/le-7-9-30-du-vendredi-10-mars-2023-2107675',
            'info_dict': {
                'id': '2107675',
                'display_id': 'le-7-9-30-du-vendredi-10-mars-2023',
                'title': 'Inflation alimentaire : comment en sortir ? - Régis Debray et Claude Grange - Cybèle Idelot',
                'description': 'md5:36ee74351ede77a314fdebb94026b916',
                'thumbnail': r're:^https?://.*\.(?:jpg|png)',
                'upload_date': '20230310',
                'duration': 8977,
                'ext': 'mp3',
            },
        },
        {
            'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507',
            'only_matching': True,
        },
        {
            'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-sciences/sante-bientot-un-vaccin-contre-l-asthme-allergique-3057200',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Build the info dict from the page's ``AudioObject`` JSON and HTML metadata."""
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        display_id = mobj.group('display_id')
        webpage = self._download_webpage(url, display_id)

        # _search_json_ld doesn't correctly handle this. See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846
        video_data = self._search_json(
            '', webpage, 'audio data', display_id,
            contains_pattern=r'{\s*"@type"\s*:\s*"AudioObject".+}')

        media_url = video_data['contentUrl']

        # Only mark the stream as audio-only when the page declares mp3.
        vcodec = None
        if video_data.get('encodingFormat') == 'mp3':
            vcodec = 'none'

        duration = parse_duration(video_data.get('duration'))

        # The og:title is the fallback when no itemprop="name" heading exists.
        title = self._html_search_regex(
            r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
            webpage, 'title', default=self._og_search_title(webpage))
        description = self._html_search_regex(
            r'(?s)<meta name="description"\s*content="([^"]+)',
            webpage, 'description', default=None)
        thumbnail = self._og_search_thumbnail(webpage)
        uploader = self._html_search_regex(
            r'(?s)<span class="author">(.*?)</span>',
            webpage, 'uploader', default=None)
        date_published = self._search_regex(
            r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False)

        return {
            'id': video_id,
            'display_id': display_id,
            'url': media_url,
            'vcodec': vcodec,
            'duration': duration,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'upload_date': unified_strdate(date_published),
        }
155 | ||
156 | ||
class RadioFranceLiveIE(RadioFranceBaseIE):
    """Extractor for station live streams and their ``radio-*`` sub-stations."""

    _VALID_URL = rf'''(?x)
        https?://(?:www\.)?radiofrance\.fr
        /(?P<id>{RadioFranceBaseIE._STATIONS_RE})
        /?(?P<substation_id>radio-[\w-]+)?(?:[#?]|$)
    '''

    _TESTS = [
        {
            'url': 'https://www.radiofrance.fr/franceinter/',
            'info_dict': {
                'id': 'franceinter',
                'title': str,
                'live_status': 'is_live',
                'ext': 'aac',
            },
            'params': {
                'skip_download': 'Livestream',
            },
        },
        {
            'url': 'https://www.radiofrance.fr/franceculture',
            'info_dict': {
                'id': 'franceculture',
                'title': str,
                'live_status': 'is_live',
                'ext': 'aac',
            },
            'params': {
                'skip_download': 'Livestream',
            },
        },
        {
            'url': 'https://www.radiofrance.fr/mouv/radio-musique-kids-family',
            'info_dict': {
                'id': 'mouv-radio-musique-kids-family',
                'title': str,
                'live_status': 'is_live',
                'ext': 'aac',
            },
            'params': {
                'skip_download': 'Livestream',
            },
        },
        {
            'url': 'https://www.radiofrance.fr/mouv/radio-rnb-soul',
            'info_dict': {
                'id': 'mouv-radio-rnb-soul',
                'title': str,
                'live_status': 'is_live',
                'ext': 'aac',
            },
            'params': {
                'skip_download': 'Livestream',
            },
        },
        {
            'url': 'https://www.radiofrance.fr/mouv/radio-musique-mix',
            'info_dict': {
                'id': 'mouv-radio-musique-mix',
                'title': str,
                'live_status': 'is_live',
                'ext': 'aac',
            },
            'params': {
                'skip_download': 'Livestream',
            },
        },
        {
            'url': 'https://www.radiofrance.fr/fip/radio-rock',
            'info_dict': {
                'id': 'fip-radio-rock',
                'title': str,
                'live_status': 'is_live',
                'ext': 'aac',
            },
            'params': {
                'skip_download': 'Livestream',
            },
        },
        {
            'url': 'https://www.radiofrance.fr/mouv',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Collect the live formats for a station or one of its sub-stations.

        A main station is resolved through its ``/api/live`` JSON endpoint; a
        sub-station is read from the ``webRadioData`` entry embedded in its
        web page.
        """
        mobj = self._match_valid_url(url)
        station_id = mobj.group('id')
        substation_id = mobj.group('substation_id')

        if not substation_id:
            api_response = self._download_json(
                f'https://www.radiofrance.fr/{station_id}/api/live', station_id)
        else:
            webpage = self._download_webpage(url, station_id)
            api_response = self._extract_data_from_webpage(webpage, station_id, 'webRadioData')

        formats = []
        subtitles = {}
        # Sources are looked up both under 'now' and at the top level of the
        # response; entries without a 'url' are filtered out by the lambda.
        sources = traverse_obj(
            api_response, (('now', None), 'media', 'sources', lambda _, v: v['url']))
        for source in sources:
            if source.get('format') != 'hls':
                formats.append({
                    'url': source['url'],
                    'abr': source.get('bitrate'),
                })
                continue

            hls_formats, hls_subs = self._extract_m3u8_formats_and_subtitles(
                source['url'], station_id, fatal=False)
            formats.extend(hls_formats)
            self._merge_subtitles(hls_subs, target=subtitles)

        live_id = join_nonempty(station_id, substation_id)

        # Prefer the visual legend; otherwise join the two "now" title lines.
        title = traverse_obj(api_response, ('visual', 'legend'))
        if not title:
            title = join_nonempty(
                ('now', 'firstLine', 'title'), ('now', 'secondLine', 'title'),
                from_dict=api_response, delim=' - ')

        return {
            'id': live_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'is_live': True,
        }
265 | ||
266 | ||
class RadioFrancePlaylistBaseIE(RadioFranceBaseIE):
    """Subclasses must set _METADATA_KEY"""

    def _call_api(self, content_id, cursor, page_num):
        """Fetch one further page of items; concrete extractors override this."""
        raise NotImplementedError('This method must be implemented by subclasses')

    def _generate_playlist_entries(self, content_id, content_response):
        """Yield a url_transparent result per item, following pagination cursors.

        ``content_response`` is the first page. Further pages are requested
        through ``_call_api`` and are numbered from 2, matching the order in
        which they are downloaded.
        """
        next_cursor = None
        for page_num in itertools.count(1):
            if page_num > 1:
                content_response = self._call_api(content_id, next_cursor, page_num)

            for item in content_response['items']:
                item_url = f'https://www.radiofrance.fr/{item["path"]}'
                item_info = traverse_obj(item, {
                    'title': 'title',
                    'description': 'standFirst',
                    'timestamp': ('publishedDate', {int_or_none}),
                    'thumbnail': ('visual', 'src'),
                })
                yield self.url_result(item_url, url_transparent=True, **item_info)

            # The cursor is read from 'pagination' first, then from the
            # top level of the response.
            next_cursor = traverse_obj(
                content_response, (('pagination', None), 'next'), get_all=False)
            if not next_cursor:
                return

    def _real_extract(self, url):
        """Resolve the URL path through the path API and return the playlist."""
        display_id = self._match_id(url)

        url_path = urllib.parse.urlparse(url).path
        path_response = self._download_json(
            'https://www.radiofrance.fr/api/v2.1/path', display_id,
            query={'value': url_path})
        metadata = path_response['content']

        content_id = metadata['id']
        entries = self._generate_playlist_entries(content_id, metadata[self._METADATA_KEY])

        playlist_info = traverse_obj(metadata, {
            'title': 'title',
            'description': 'standFirst',
            'thumbnail': ('visual', 'src'),
        })
        # 'name' / 'role' take precedence over 'title' / 'standFirst' when present.
        playlist_info.update(traverse_obj(metadata, {
            'title': 'name',
            'description': 'role',
        }))

        return self.playlist_result(
            entries, content_id, display_id=display_id, **playlist_info)
309 | ||
310 | ||
class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
    """Playlist extractor for podcast overview pages (``/<station>/podcasts/<slug>``)."""

    _VALID_URL = rf'''(?x)
        {RadioFranceBaseIE._VALID_URL_BASE}
        /(?:{RadioFranceBaseIE._STATIONS_RE})
        /podcasts/(?P<id>[\w-]+)/?(?:[?#]|$)
    '''

    _TESTS = [
        {
            'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-vert',
            'info_dict': {
                'id': 'eaf6ef81-a980-4f1c-a7d1-8a75ecd54b17',
                'display_id': 'le-billet-vert',
                'title': 'Le billet sciences',
                'description': 'md5:eb1007b34b0c0a680daaa71525bbd4c1',
                'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            },
            'playlist_mincount': 11,
        },
        {
            'url': 'https://www.radiofrance.fr/franceinter/podcasts/jean-marie-le-pen-l-obsession-nationale',
            'info_dict': {
                'id': '566fd524-3074-4fbc-ac69-8696f2152a54',
                'display_id': 'jean-marie-le-pen-l-obsession-nationale',
                'title': 'Jean-Marie Le Pen, l\'obsession nationale',
                'description': 'md5:a07c0cfb894f6d07a62d0ad12c4b7d73',
                'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            },
            'playlist_count': 7,
        },
        {
            'url': 'https://www.radiofrance.fr/franceculture/podcasts/serie-thomas-grjebine',
            'info_dict': {
                'id': '63c1ddc9-9f15-457a-98b2-411bac63f48d',
                'display_id': 'serie-thomas-grjebine',
                'title': 'Thomas Grjebine',
            },
            'playlist_count': 1,
        },
        {
            'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip',
            'info_dict': {
                'id': '143dff38-e956-4a5d-8576-1c0b7242b99e',
                'display_id': 'certains-l-aiment-fip',
                'title': 'Certains l’aiment Fip',
                'description': 'md5:ff974672ba00d4fd5be80fb001c5b27e',
                'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            },
            'playlist_mincount': 321,
        },
        {
            'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9',
            'only_matching': True,
        },
        {
            'url': 'https://www.radiofrance.fr/mouv/podcasts/dirty-mix',
            'only_matching': True,
        },
    ]

    # Key of the path-API metadata that holds the first page of entries.
    _METADATA_KEY = 'expressions'

    def _call_api(self, podcast_id, cursor, page_num):
        """Download one further page of a podcast's expressions for ``cursor``."""
        api_url = f'https://www.radiofrance.fr/api/v2.1/concepts/{podcast_id}/expressions'
        progress_note = f'Downloading page {page_num}'
        return self._download_json(
            api_url, podcast_id, note=progress_note, query={'pageCursor': cursor})
370 | ||
371 | ||
class RadioFranceProfileIE(RadioFrancePlaylistBaseIE):
    """Playlist extractor for person pages (``/personnes/<slug>``)."""

    _VALID_URL = rf'{RadioFranceBaseIE._VALID_URL_BASE}/personnes/(?P<id>[\w-]+)'

    _TESTS = [
        {
            'url': 'https://www.radiofrance.fr/personnes/thomas-pesquet?p=3',
            'info_dict': {
                'id': '86c62790-e481-11e2-9f7b-782bcb6744eb',
                'display_id': 'thomas-pesquet',
                'title': 'Thomas Pesquet',
                'description': 'Astronaute à l\'agence spatiale européenne',
            },
            'playlist_mincount': 212,
        },
        {
            'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie',
            'info_dict': {
                'id': '9593050b-0183-4972-a0b5-d8f699079e02',
                'display_id': 'eugenie-bastie',
                'title': 'Eugénie Bastié',
                'description': 'Journaliste et essayiste',
                'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            },
            'playlist_mincount': 39,
        },
        {
            'url': 'https://www.radiofrance.fr/personnes/lea-salame',
            'only_matching': True,
        },
    ]

    # Key of the path-API metadata that holds the first page of entries.
    _METADATA_KEY = 'documents'

    def _call_api(self, profile_id, cursor, page_num):
        """Download one further page of a person's documents for ``cursor``.

        The nested ``pagination -> next`` value is also copied to a top-level
        ``next`` key of the returned response.
        """
        api_url = f'https://www.radiofrance.fr/api/v2.1/taxonomy/{profile_id}/documents'
        query = {
            'relation': 'personality',
            'cursor': cursor,
        }
        response = self._download_json(
            api_url, profile_id, note=f'Downloading page {page_num}', query=query)

        response['next'] = traverse_obj(response, ('pagination', 'next'))
        return response
411 | ||
412 | ||
class RadioFranceProgramScheduleIE(RadioFranceBaseIE):
    """Playlist extractor for a station's programme grid (``/<station>/grille-programmes``)."""

    _VALID_URL = rf'''(?x)
        {RadioFranceBaseIE._VALID_URL_BASE}
        /(?P<station>{RadioFranceBaseIE._STATIONS_RE})
        /grille-programmes(?:\?date=(?P<date>[\d-]+))?
    '''

    _TESTS = [{
        'url': 'https://www.radiofrance.fr/franceinter/grille-programmes?date=17-02-2023',
        'info_dict': {
            'id': 'franceinter-program-20230217',
            'upload_date': '20230217',
        },
        'playlist_count': 25,
    }, {
        'url': 'https://www.radiofrance.fr/franceculture/grille-programmes?date=01-02-2023',
        'info_dict': {
            'id': 'franceculture-program-20230201',
            'upload_date': '20230201',
        },
        'playlist_count': 25,
    }, {
        'url': 'https://www.radiofrance.fr/mouv/grille-programmes?date=19-03-2023',
        'info_dict': {
            'id': 'mouv-program-20230319',
            'upload_date': '20230319',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.radiofrance.fr/francemusique/grille-programmes?date=18-03-2023',
        'info_dict': {
            'id': 'francemusique-program-20230318',
            'upload_date': '20230318',
        },
        'playlist_count': 15,
    }, {
        'url': 'https://www.radiofrance.fr/franceculture/grille-programmes',
        'only_matching': True,
    }]

    def _generate_playlist_entries(self, webpage_url, api_response):
        """Yield one url_transparent result per grid step that has an expression path.

        Steps without ``expression -> path`` are dropped by the traversal
        filter. Each result is handed to ``FranceCultureIE`` and carries the
        title, thumbnail, start time and series fields found on the step.
        """
        for entry in traverse_obj(api_response, ('steps', lambda _, v: v['expression']['path'])):
            yield self.url_result(
                urljoin(webpage_url, f'/{entry["expression"]["path"]}'), ie=FranceCultureIE,
                url_transparent=True, **traverse_obj(entry, {
                    'title': ('expression', 'title'),
                    'thumbnail': ('expression', 'visual', 'src'),
                    'timestamp': ('startTime', {int_or_none}),
                    'series_id': ('concept', 'id'),
                    'series': ('concept', 'title'),
                }))

    def _real_extract(self, url):
        """Download the grid page and return its steps as a playlist.

        The playlist id is ``<station>-program-<YYYYMMDD>``, with the date
        part omitted when the grid data yields no date.
        """
        # Only the station is read from the URL: the playlist date comes from
        # the page's grid data, so the optional ``date`` group is not needed.
        station = self._match_valid_url(url).group('station')
        webpage = self._download_webpage(url, station)
        grid_data = self._extract_data_from_webpage(webpage, station, 'grid')
        upload_date = strftime_or_none(grid_data.get('date'), '%Y%m%d')

        return self.playlist_result(
            self._generate_playlist_entries(url, grid_data),
            join_nonempty(station, 'program', upload_date), upload_date=upload_date)