]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/radiofrance.py
[cleanup] Misc fixes
[yt-dlp.git] / yt_dlp / extractor / radiofrance.py
CommitLineData
0e2a436d
PH
1import re
2
3from .common import InfoExtractor
56ba69e4 4from ..utils import parse_duration, unified_strdate
0e2a436d
PH
5
6
# Extractor for audio pieces published under maison.radiofrance.fr/radiovisions.
class RadioFranceIE(InfoExtractor):
    _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
    IE_NAME = 'radiofrance'

    _TEST = {
        'url': 'http://maison.radiofrance.fr/radiovisions/one-one',
        'md5': 'bdbb28ace95ed0e04faab32ba3160daf',
        'info_dict': {
            'id': 'one-one',
            'ext': 'ogg',
            'title': 'One to one',
            'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
            'uploader': 'Thomas Hercouët',
        },
    }

    def _real_extract(self, url):
        """Scrape the page at *url* and return an info dict for its audio.

        The title and the audio source list are looked up with the default
        (mandatory) settings; description and uploader are looked up with
        ``fatal=False``.
        """
        video_id = self._match_valid_url(url).group('id')
        page = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<h1>(.*?)</h1>', page, 'title')
        description = self._html_search_regex(
            r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
            page, 'description', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="credit">&nbsp;&nbsp;&copy;&nbsp;(.*?)</div>',
            page, 'uploader', fatal=False)

        # The player element carries its sources in a `data-source` attribute
        # as a list of "<name>: '<url>'" pairs.
        sources = self._html_search_regex(
            r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
            page, 'audio URLs')

        formats = []
        source_pairs = re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", sources)
        for position, (format_id, format_url) in enumerate(source_pairs):
            formats.append({
                'format_id': format_id,
                'url': format_url,
                'vcodec': 'none',
                # Position in the source list doubles as the quality rank.
                'quality': position,
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': description,
            'uploader': uploader,
        }
56ba69e4 58
59
# Extractor for France Culture podcast episode pages hosted on radiofrance.fr.
class FranceCultureIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?radiofrance\.fr/franceculture/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d+)($|[?#])'
    _TESTS = [
        {
            'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
            'info_dict': {
                'id': '8440487',
                'display_id': 'la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau',
                'ext': 'mp3',
                'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau ?',
                'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?',
                'thumbnail': 'https://cdn.radiofrance.fr/s3/cruiser-production/2022/05/d184e7a3-4827-4494-bf94-04ed7b120db4/1200x630_gettyimages-200171095-001.jpg',
                'upload_date': '20220514',
                'duration': 2750,
            },
        },
    ]

    def _real_extract(self, url):
        """Return the info dict for the episode page at *url*.

        The media URL, format and duration are read from the page's
        ``AudioObject`` JSON; the remaining fields are scraped from the HTML.
        """
        url_match = self._match_valid_url(url)
        video_id, display_id = url_match.group('id', 'display_id')
        webpage = self._download_webpage(url, display_id)

        # _search_json_ld doesn't correctly handle this. See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846
        video_data = self._search_json(
            '', webpage, 'audio data', display_id,
            contains_pattern=r'\s*"@type"\s*:\s*"AudioObject"\s*.+')

        # Fields are computed in the same order the result dict lists them.
        media_url = video_data['contentUrl']
        audio_format = video_data.get('encodingFormat')
        if audio_format == 'mp3':
            vcodec = 'none'
        else:
            vcodec = None
        duration = parse_duration(video_data.get('duration'))

        # The Open Graph title is the fallback when no matching <h1> is found.
        fallback_title = self._og_search_title(webpage)
        title = self._html_search_regex(
            r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
            webpage, 'title', default=fallback_title)
        description = self._html_search_regex(
            r'(?s)<meta name="description"\s*content="([^"]+)',
            webpage, 'description', default=None)
        thumbnail = self._og_search_thumbnail(webpage)
        uploader = self._html_search_regex(
            r'(?s)<span class="author">(.*?)</span>',
            webpage, 'uploader', default=None)
        published = self._search_regex(
            r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False)

        return {
            'id': video_id,
            'display_id': display_id,
            'url': media_url,
            'ext': audio_format,
            'vcodec': vcodec,
            'duration': duration,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'upload_date': unified_strdate(published),
        }