]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/arte.py
[ie/zingmp3] Add support for radio and podcasts (#7189)
[yt-dlp.git] / yt_dlp / extractor / arte.py
CommitLineData
d5822b96 1import re
d5822b96
PH
2
3from .common import InfoExtractor
4from ..utils import (
c0892b2b 5 ExtractorError,
051d6b45 6 GeoRestrictedError,
d24a2b20 7 int_or_none,
051d6b45 8 parse_iso8601,
4dfbf869 9 parse_qs,
50e93e03 10 strip_or_none,
051d6b45 11 traverse_obj,
8bdd16b4 12 url_or_none,
d5822b96
PH
13)
14
d5822b96 15
class ArteTVBaseIE(InfoExtractor):
    """Constants shared by the Arte.tv extractors defined in this module."""

    # Language codes joined with '|' so the string can be interpolated
    # directly into a regex alternation.
    _ARTE_LANGUAGES = '|'.join(('fr', 'de', 'en', 'es', 'it', 'pl'))
    # Prefix of the player API endpoints (`/config/...`, `/playlist/...`).
    _API_BASE = 'https://api.arte.tv/api/player/v2'
8bdd16b4 19
20
class ArteTVIE(ArteTVBaseIE):
    """Extractor for single Arte.tv programmes.

    Matches regular video pages, player-API config URLs and the internal
    ``arte://program/<id>`` scheme, then builds the info dict from the
    player API ``config`` endpoint.
    """

    _VALID_URL = r'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                    |arte://program)
                        /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
        'info_dict': {
            'id': '100103-000-A',
            'title': 'USA: Dyskryminacja na porodówce',
            'description': 'md5:242017b7cce59ffae340a54baefcafb1',
            'alt_title': 'ARTE Reportage',
            'upload_date': '20201103',
            'duration': 554,
            'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
            'timestamp': 1604417980,
            'ext': 'mp4',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
        'only_matching': True,
    }, {
        'note': 'age-restricted',
        'url': 'https://www.arte.tv/de/videos/006785-000-A/the-element-of-crime/',
        'info_dict': {
            'id': '006785-000-A',
            'description': 'md5:c2f94fdfefc8a280e4dab68ab96ab0ba',
            'title': 'The Element of Crime',
            'timestamp': 1696111200,
            'duration': 5849,
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/q82dTTfyuCXupPsGxXsd7B/940x530',
            'upload_date': '20230930',
            'ext': 'mp4',
        }
    }]

    _GEO_BYPASS = True

    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Splits a stream version code (the `eStat.ml5` field) into the named
    # parts that `_real_extract` uses to rank streams.
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': (
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        ),
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': (
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        ),
        'SAT': (
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        ),
    }

    def _real_extract(self, url):
        """Download the player config for `url` and build the info dict.

        Raises:
            GeoRestrictedError: the config marks the video as restricted to
                an area (`restriction.geoblocking.restrictedArea`).
            ExtractorError: the config carries no `rights` entry.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        # NOTE(review): for `arte://program/...` URLs neither language group
        # matches, so `lang` is None and is interpolated as-is into the API
        # path below -- confirm whether a default language is wanted.
        lang = mobj.group('lang') or mobj.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        config = self._download_json(
            f'{self._API_BASE}/config/{lang}/{video_id}', video_id,
            headers={'x-validated-age': '18'})

        geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            # Read the area code defensively: a restricted response without a
            # `code` must still surface as a geo restriction, not a KeyError.
            geo_code = geoblocking.get('code')
            raise GeoRestrictedError(
                f'Video restricted to {geo_code!r}',
                countries=self._COUNTRIES_MAP.get(geo_code, ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        formats, subtitles = [], {}
        # HLS formats whose short label starts with 'cc' or 'OGsub'; they are
        # appended after all other formats once the loop is done.
        secondary_formats = []
        for stream in config['data']['attributes']['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            stream_version = stream['versions'][0]
            stream_version_code = stream_version['eStat']['ml5']

            lang_pref = -1
            m = self._VERSION_CODE_RE.match(stream_version_code)
            if m:
                # Each criterion becomes one digit ('0' or '1'); earlier
                # criteria are more significant in the resulting integer.
                lang_pref = int(''.join('01'[x] for x in (
                    m.group('vlang') == language_code,  # we prefer voice in the requested language
                    not m.group('audio_desc'),  # and not the audio description version
                    bool(m.group('original_voice')),  # but if voice is not in the requested language, at least choose the original voice
                    m.group('sub_lang') == language_code,  # if subtitles are present, we prefer them in the requested language
                    not m.group('has_sub'),  # but we prefer no subtitles otherwise
                    not m.group('sdh_sub'),  # and we prefer not the hard-of-hearing subtitles if there are subtitles
                )))

            short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
            format_note = f'{stream_version.get("label", "unknown")} [{short_label}]'
            if 'HLS' in stream['protocol']:
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
                for fmt in fmts:
                    fmt.update({
                        'format_note': format_note,
                        'language_preference': lang_pref,
                    })
                if short_label.startswith(('cc', 'OGsub')):
                    secondary_formats.extend(fmts)
                else:
                    formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)

            elif stream['protocol'] in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{stream["protocol"]}-{stream_version_code}',
                    'url': stream['url'],
                    'format_note': format_note,
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')

        formats.extend(secondary_formats)
        self._remove_duplicate_formats(formats)

        metadata = config['data']['attributes']['metadata']

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # The API `subtitle` is preferred as title; when it exists the
            # API `title` is reported as `alt_title` instead.
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
            'is_live': config['data']['attributes'].get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [
                {'url': image['url'], 'id': image.get('caption')}
                for image in metadata.get('images') or [] if url_or_none(image.get('url'))
            ],
            # TODO: chapters may also be in stream['segments']?
            'chapters': traverse_obj(config, ('data', 'attributes', 'chapters', 'elements', ..., {
                'start_time': 'startTime',
                'title': 'title',
            })) or None,
        }
c40f5cf4 218
24114fee 219
class ArteTVEmbedIE(InfoExtractor):
    """Extractor for the embeddable Arte player (`/player/vN/index.php`).

    The player URL carries the real config URL in its `json_url` query
    parameter; extraction is delegated to `ArteTVIE` for that URL.
    """

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available'
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a URL result pointing `ArteTVIE` at the embedded config URL."""
        # First value of the `json_url` query parameter is the config URL.
        config_url = parse_qs(url)['json_url'][0]
        programme_id = ArteTVIE._match_id(config_url)
        return self.url_result(config_url, ie=ArteTVIE.ie_key(), video_id=programme_id)
4b492e35
S
244
245
class ArteTVPlaylistIE(ArteTVBaseIE):
    """Extractor for Arte.tv collections (`/videos/RC-xxxxxx`)."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        """Fetch the playlist from the player API and return a playlist result.

        Each item that has a config URL becomes a `url_transparent` entry
        handled by `ArteTVIE`, pre-filled with the metadata the playlist
        response already provides.
        """
        mobj = self._match_valid_url(url)
        lang, playlist_id = mobj.group('lang'), mobj.group('id')
        response = self._download_json(
            f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)
        playlist = response['data']['attributes']

        entries = []
        # The filter lambda keeps only items whose `config.url` is truthy.
        for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url'])):
            entries.append({
                '_type': 'url_transparent',
                'url': video['config']['url'],
                'ie_key': ArteTVIE.ie_key(),
                'id': video.get('providerId'),
                'title': video.get('title'),
                'alt_title': video.get('subtitle'),
                'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
                'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
            })

        playlist_title = traverse_obj(playlist, ('metadata', 'title'))
        playlist_description = traverse_obj(playlist, ('metadata', 'description'))
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
50e93e03 280
281
class ArteTVCategoryIE(ArteTVBaseIE):
    """Extractor for Arte.tv category pages (`/videos/<category>/...`)."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Match only URLs that the video and playlist extractors do not claim."""
        if ArteTVIE.suitable(url) or ArteTVPlaylistIE.suitable(url):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Scrape the category page for video/playlist links and return them as a playlist."""
        lang, playlist_id = self._match_valid_url(url).groups()
        webpage = self._download_webpage(url, playlist_id)

        # Anchors pointing at a `/videos/` page in the same language edition;
        # the href may be double-quoted, single-quoted or unquoted.
        link_re = r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang
        hrefs = (match.group('url') for match in re.finditer(link_re, webpage))
        # Drop links back to this page and anything that is neither a single
        # video nor a collection.
        items = [
            href for href in hrefs
            if href != url and (ArteTVIE.suitable(href) or ArteTVPlaylistIE.suitable(href))
        ]

        # Keep the part of the page title before the last '|'.
        raw_title = self._generic_title('', webpage, default='')
        title = strip_or_none(raw_title.rsplit('|', 1)[0]) or None
        description = self._og_search_description(webpage, default=None)

        return self.playlist_from_matches(
            items, playlist_id=playlist_id, playlist_title=title, description=description)