]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/radiofrance.py
[cleanup] Misc (#8182)
[yt-dlp.git] / yt_dlp / extractor / radiofrance.py
CommitLineData
ba8e9eb2 1import itertools
0e2a436d 2import re
ba8e9eb2 3import urllib.parse
0e2a436d
PH
4
5from .common import InfoExtractor
ba8e9eb2
E
6from ..utils import (
7 int_or_none,
8 join_nonempty,
9 js_to_json,
10 parse_duration,
11 strftime_or_none,
12 traverse_obj,
13 unified_strdate,
14 urljoin,
15)
0e2a436d
PH
16
17
class RadioFranceIE(InfoExtractor):
    """Extractor for maison.radiofrance.fr "radiovisions" pages."""

    _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
    IE_NAME = 'radiofrance'

    _TEST = {
        'url': 'http://maison.radiofrance.fr/radiovisions/one-one',
        'md5': 'bdbb28ace95ed0e04faab32ba3160daf',
        'info_dict': {
            'id': 'one-one',
            'ext': 'ogg',
            'title': 'One to one',
            'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
            'uploader': 'Thomas Hercouët',
        },
    }

    def _real_extract(self, url):
        """Build the info dict for a radiovisions page by scraping its HTML.

        The id is the ``id`` group of ``_VALID_URL``.  Description and
        uploader are looked up with ``fatal=False``; the title and the audio
        source list are looked up without that flag.
        """
        video_id = self._match_valid_url(url).group('id')
        page = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<h1>(.*?)</h1>', page, 'title')
        description = self._html_search_regex(
            r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
            page, 'description', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="credit">&nbsp;&nbsp;&copy;&nbsp;(.*?)</div>',
            page, 'uploader', fatal=False)

        # The player element's data-source attribute holds "name: 'url'" pairs
        sources = self._html_search_regex(
            r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
            page, 'audio URLs')

        formats = []
        source_pairs = re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", sources)
        for rank, (format_id, format_url) in enumerate(source_pairs):
            formats.append({
                'format_id': format_id,
                'url': format_url,
                'vcodec': 'none',
                # Position in the source list doubles as the quality rank
                'quality': rank,
            })

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': description,
            'uploader': uploader,
        }
56ba69e4 68
69
ba8e9eb2
E
class RadioFranceBaseIE(InfoExtractor):
    """Shared URL fragments and page-data helper for the radiofrance.fr extractors."""

    _VALID_URL_BASE = r'https?://(?:www\.)?radiofrance\.fr'

    # Regex alternation of the known station slugs, each one escaped
    _STATIONS_RE = '|'.join(re.escape(station) for station in (
        'franceculture',
        'franceinfo',
        'franceinter',
        'francemusique',
        'fip',
        'mouv',
    ))

    def _extract_data_from_webpage(self, webpage, display_id, key):
        """Return the dict stored under ``data -> key`` in the page's ``const data = [...]``.

        The array literal following ``const data =`` is converted with
        ``js_to_json`` and searched; only the first dict match is returned.
        An empty dict is returned when nothing matches.
        """
        page_data = self._search_json(
            r'\bconst\s+data\s*=', webpage, key, display_id,
            contains_pattern=r'\[\{(?s:.+)\}\]', transform_source=js_to_json)
        match = traverse_obj(page_data, (..., 'data', key, {dict}), get_all=False)
        return match or {}
87
88
class FranceCultureIE(RadioFranceBaseIE):
    """Extractor for single podcast episodes (URLs ending in ``-<numeric id>``)."""

    _VALID_URL = rf'''(?x)
        {RadioFranceBaseIE._VALID_URL_BASE}
        /(?:{RadioFranceBaseIE._STATIONS_RE})
        /podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d{{6,}})(?:$|[?#])
    '''

    _TESTS = [
        {
            'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
            'info_dict': {
                'id': '8440487',
                'display_id': 'la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau',
                'ext': 'mp3',
                'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau ?',
                'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?',
                'thumbnail': r're:^https?://.*\.(?:jpg|png)',
                'upload_date': '20220514',
                'duration': 2750,
            },
        },
        {
            'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9-30/le-7-9-30-du-vendredi-10-mars-2023-2107675',
            'info_dict': {
                'id': '2107675',
                'display_id': 'le-7-9-30-du-vendredi-10-mars-2023',
                'title': 'Inflation alimentaire : comment en sortir ? - Régis Debray et Claude Grange - Cybèle Idelot',
                'description': 'md5:36ee74351ede77a314fdebb94026b916',
                'thumbnail': r're:^https?://.*\.(?:jpg|png)',
                'upload_date': '20230310',
                'duration': 8977,
                'ext': 'mp3',
            },
        },
        {
            'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507',
            'only_matching': True,
        }, {
            'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-sciences/sante-bientot-un-vaccin-contre-l-asthme-allergique-3057200',
            'only_matching': True,
        }
    ]

    def _real_extract(self, url):
        """Extract one episode from the ``AudioObject`` JSON embedded in its page.

        The media URL, encoding format and duration come from that JSON
        object; title, description, thumbnail, uploader and upload date are
        scraped from the surrounding HTML.
        """
        video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
        webpage = self._download_webpage(url, display_id)

        # _search_json_ld doesn't correctly handle this. See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846
        audio_object = self._search_json(
            '', webpage, 'audio data', display_id,
            contains_pattern=r'{\s*"@type"\s*:\s*"AudioObject".+}')

        media_url = audio_object['contentUrl']
        vcodec = 'none' if audio_object.get('encodingFormat') == 'mp3' else None
        duration = parse_duration(audio_object.get('duration'))

        # The og:title is looked up first and used when no matching <h1> exists
        og_title = self._og_search_title(webpage)
        title = self._html_search_regex(
            r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
            webpage, 'title', default=og_title)
        description = self._html_search_regex(
            r'(?s)<meta name="description"\s*content="([^"]+)', webpage, 'description', default=None)
        thumbnail = self._og_search_thumbnail(webpage)
        uploader = self._html_search_regex(
            r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None)
        date_published = self._search_regex(
            r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False)

        return {
            'id': video_id,
            'display_id': display_id,
            'url': media_url,
            'vcodec': vcodec,
            'duration': duration,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'upload_date': unified_strdate(date_published),
        }
ba8e9eb2
E
155
156
class RadioFranceLiveIE(RadioFranceBaseIE):
    """Extractor for station live streams and their ``radio-*`` sub-stations."""

    # Built on the shared base so the host pattern is defined in one place only
    _VALID_URL = rf'''(?x)
        {RadioFranceBaseIE._VALID_URL_BASE}
        /(?P<id>{RadioFranceBaseIE._STATIONS_RE})
        /?(?P<substation_id>radio-[\w-]+)?(?:[#?]|$)
    '''

    _TESTS = [{
        'url': 'https://www.radiofrance.fr/franceinter/',
        'info_dict': {
            'id': 'franceinter',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/franceculture',
        'info_dict': {
            'id': 'franceculture',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/mouv/radio-musique-kids-family',
        'info_dict': {
            'id': 'mouv-radio-musique-kids-family',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/mouv/radio-rnb-soul',
        'info_dict': {
            'id': 'mouv-radio-rnb-soul',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/mouv/radio-musique-mix',
        'info_dict': {
            'id': 'mouv-radio-musique-mix',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/fip/radio-rock',
        'info_dict': {
            'id': 'fip-radio-rock',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/mouv',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the live-stream info dict for a station or sub-station page.

        Sub-station URLs are resolved from the ``webRadioData`` object embedded
        in the page; plain station URLs use the station's ``/api/live``
        endpoint.  The id joins the station and (if any) sub-station ids.
        """
        station_id, substation_id = self._match_valid_url(url).group('id', 'substation_id')

        if substation_id:
            webpage = self._download_webpage(url, station_id)
            api_response = self._extract_data_from_webpage(webpage, station_id, 'webRadioData')
        else:
            api_response = self._download_json(
                f'https://www.radiofrance.fr/{station_id}/api/live', station_id)

        formats, subtitles = [], {}
        # Sources are looked up both under 'now' and at the top level of the
        # response; only entries with a truthy 'url' are kept
        for media_source in traverse_obj(api_response, (('now', None), 'media', 'sources', lambda _, v: v['url'])):
            if media_source.get('format') == 'hls':
                fmts, subs = self._extract_m3u8_formats_and_subtitles(media_source['url'], station_id, fatal=False)
                formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)
            else:
                formats.append({
                    'url': media_source['url'],
                    'abr': media_source.get('bitrate'),
                })

        return {
            'id': join_nonempty(station_id, substation_id),
            # Prefer the visual legend, else "<first line> - <second line>" of the current programme
            'title': traverse_obj(api_response, ('visual', 'legend')) or join_nonempty(
                ('now', 'firstLine', 'title'), ('now', 'secondLine', 'title'), from_dict=api_response, delim=' - '),
            'formats': formats,
            'subtitles': subtitles,
            'is_live': True,
        }
265
266
class RadioFrancePlaylistBase(RadioFranceBaseIE):
    """Subclasses must set _METADATA_KEY"""

    def _call_api(self, content_id, cursor, page_num):
        """Fetch one further page of items; must be overridden by subclasses."""
        raise NotImplementedError('This method must be implemented by subclasses')

    def _generate_playlist_entries(self, content_id, content_response):
        """Yield url_transparent results for every item, following pagination.

        ``content_response`` is the first page.  Further pages are requested
        through ``_call_api`` (numbered from 2) for as long as a ``next``
        cursor is found under ``pagination`` or at the top level of the page.
        """
        page_num = 2
        while True:
            for item in content_response['items']:
                item_info = traverse_obj(item, {
                    'title': 'title',
                    'description': 'standFirst',
                    'timestamp': ('publishedDate', {int_or_none}),
                    'thumbnail': ('visual', 'src'),
                })
                yield self.url_result(
                    f'https://www.radiofrance.fr/{item["path"]}', url_transparent=True, **item_info)

            next_cursor = traverse_obj(content_response, (('pagination', None), 'next'), get_all=False)
            if not next_cursor:
                return

            content_response = self._call_api(content_id, next_cursor, page_num)
            page_num += 1

    def _real_extract(self, url):
        """Resolve the page path through the ``path`` API and build a playlist.

        The first page of entries is taken from ``metadata[_METADATA_KEY]``.
        """
        display_id = self._match_id(url)

        api_result = self._download_json(
            'https://www.radiofrance.fr/api/v2.1/path', display_id,
            query={'value': urllib.parse.urlparse(url).path})
        metadata = api_result['content']

        content_id = metadata['id']
        entries = self._generate_playlist_entries(content_id, metadata[self._METADATA_KEY])

        content_fields = traverse_obj(metadata, {
            'title': 'title',
            'description': 'standFirst',
            'thumbnail': ('visual', 'src'),
        })
        # 'name' / 'role' take precedence over 'title' / 'standFirst' when present
        named_fields = traverse_obj(metadata, {
            'title': 'name',
            'description': 'role',
        })

        return self.playlist_result(
            entries, content_id, display_id=display_id, **{**content_fields, **named_fields})
309
310
class RadioFrancePodcastIE(RadioFrancePlaylistBase):
    """Playlist extractor for a podcast's landing page (``/podcasts/<slug>``)."""

    _VALID_URL = rf'''(?x)
        {RadioFranceBaseIE._VALID_URL_BASE}
        /(?:{RadioFranceBaseIE._STATIONS_RE})
        /podcasts/(?P<id>[\w-]+)/?(?:[?#]|$)
    '''

    _TESTS = [{
        'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-vert',
        'info_dict': {
            'id': 'eaf6ef81-a980-4f1c-a7d1-8a75ecd54b17',
            'display_id': 'le-billet-vert',
            'title': 'Le billet sciences',
            'description': 'md5:eb1007b34b0c0a680daaa71525bbd4c1',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'https://www.radiofrance.fr/franceinter/podcasts/jean-marie-le-pen-l-obsession-nationale',
        'info_dict': {
            'id': '566fd524-3074-4fbc-ac69-8696f2152a54',
            'display_id': 'jean-marie-le-pen-l-obsession-nationale',
            'title': 'Jean-Marie Le Pen, l\'obsession nationale',
            'description': 'md5:a07c0cfb894f6d07a62d0ad12c4b7d73',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_count': 7,
    }, {
        'url': 'https://www.radiofrance.fr/franceculture/podcasts/serie-thomas-grjebine',
        'info_dict': {
            'id': '63c1ddc9-9f15-457a-98b2-411bac63f48d',
            'display_id': 'serie-thomas-grjebine',
            'title': 'Thomas Grjebine',
        },
        'playlist_count': 1,
    }, {
        'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip',
        'info_dict': {
            'id': '143dff38-e956-4a5d-8576-1c0b7242b99e',
            'display_id': 'certains-l-aiment-fip',
            'title': 'Certains l’aiment Fip',
            'description': 'md5:ff974672ba00d4fd5be80fb001c5b27e',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 321,
    }, {
        'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9',
        'only_matching': True,
    }, {
        'url': 'https://www.radiofrance.fr/mouv/podcasts/dirty-mix',
        'only_matching': True,
    }]

    _METADATA_KEY = 'expressions'

    def _call_api(self, podcast_id, cursor, page_num):
        """Download the ``expressions`` page of a podcast identified by ``cursor``."""
        api_url = f'https://www.radiofrance.fr/api/v2.1/concepts/{podcast_id}/expressions'
        return self._download_json(
            api_url, podcast_id,
            note=f'Downloading page {page_num}', query={'pageCursor': cursor})
370
371
class RadioFranceProfileIE(RadioFrancePlaylistBase):
    """Playlist extractor for a person's page (``/personnes/<slug>``)."""

    _VALID_URL = rf'{RadioFranceBaseIE._VALID_URL_BASE}/personnes/(?P<id>[\w-]+)'

    _TESTS = [{
        'url': 'https://www.radiofrance.fr/personnes/thomas-pesquet?p=3',
        'info_dict': {
            'id': '86c62790-e481-11e2-9f7b-782bcb6744eb',
            'display_id': 'thomas-pesquet',
            'title': 'Thomas Pesquet',
            'description': 'Astronaute à l\'agence spatiale européenne',
        },
        'playlist_mincount': 212,
    }, {
        'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie',
        'info_dict': {
            'id': '9593050b-0183-4972-a0b5-d8f699079e02',
            'display_id': 'eugenie-bastie',
            'title': 'Eugénie Bastié',
            'description': 'Journaliste et essayiste',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 39,
    }, {
        'url': 'https://www.radiofrance.fr/personnes/lea-salame',
        'only_matching': True,
    }]

    _METADATA_KEY = 'documents'

    def _call_api(self, profile_id, cursor, page_num):
        """Download one ``documents`` page for a person.

        The ``pagination -> next`` value of the response is also stored under
        its top-level ``next`` key before the response is returned.
        """
        api_url = f'https://www.radiofrance.fr/api/v2.1/taxonomy/{profile_id}/documents'
        query = {
            'relation': 'personality',
            'cursor': cursor,
        }
        page = self._download_json(
            api_url, profile_id, note=f'Downloading page {page_num}', query=query)

        page['next'] = traverse_obj(page, ('pagination', 'next'))
        return page
411
412
class RadioFranceProgramScheduleIE(RadioFranceBaseIE):
    """Playlist extractor for a station's programme grid (``/grille-programmes``)."""

    _VALID_URL = rf'''(?x)
        {RadioFranceBaseIE._VALID_URL_BASE}
        /(?P<station>{RadioFranceBaseIE._STATIONS_RE})
        /grille-programmes(?:\?date=(?P<date>[\d-]+))?
    '''

    _TESTS = [{
        'url': 'https://www.radiofrance.fr/franceinter/grille-programmes?date=17-02-2023',
        'info_dict': {
            'id': 'franceinter-program-20230217',
            'upload_date': '20230217',
        },
        'playlist_count': 25,
    }, {
        'url': 'https://www.radiofrance.fr/franceculture/grille-programmes?date=01-02-2023',
        'info_dict': {
            'id': 'franceculture-program-20230201',
            'upload_date': '20230201',
        },
        'playlist_count': 25,
    }, {
        'url': 'https://www.radiofrance.fr/mouv/grille-programmes?date=19-03-2023',
        'info_dict': {
            'id': 'mouv-program-20230319',
            'upload_date': '20230319',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.radiofrance.fr/francemusique/grille-programmes?date=18-03-2023',
        'info_dict': {
            'id': 'francemusique-program-20230318',
            'upload_date': '20230318',
        },
        'playlist_count': 15,
    }, {
        'url': 'https://www.radiofrance.fr/franceculture/grille-programmes',
        'only_matching': True,
    }]

    def _generate_playlist_entries(self, webpage_url, api_response):
        """Yield a url_transparent result for each grid step that has an expression path.

        Entry URLs are resolved against ``webpage_url`` and handed to
        ``FranceCultureIE``.
        """
        for entry in traverse_obj(api_response, ('steps', lambda _, v: v['expression']['path'])):
            yield self.url_result(
                urljoin(webpage_url, f'/{entry["expression"]["path"]}'), ie=FranceCultureIE,
                url_transparent=True, **traverse_obj(entry, {
                    'title': ('expression', 'title'),
                    'thumbnail': ('expression', 'visual', 'src'),
                    'timestamp': ('startTime', {int_or_none}),
                    'series_id': ('concept', 'id'),
                    'series': ('concept', 'title'),
                }))

    def _real_extract(self, url):
        """Build the playlist for the programme grid embedded in the page.

        The playlist date is taken from the ``grid`` data itself, not from the
        URL's optional ``date`` query value, so only the station group of the
        URL match is needed here.
        """
        station = self._match_valid_url(url).group('station')
        webpage = self._download_webpage(url, station)
        grid_data = self._extract_data_from_webpage(webpage, station, 'grid')
        # Grid date formatted as YYYYMMDD
        upload_date = strftime_or_none(grid_data.get('date'), '%Y%m%d')

        return self.playlist_result(
            self._generate_playlist_entries(url, grid_data),
            join_nonempty(station, 'program', upload_date), upload_date=upload_date)