jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/arte.py
[ie/orf:on] Improve extraction (#9677)
[yt-dlp.git] / yt_dlp / extractor / arte.py
1 import re
2
3 from .common import InfoExtractor
4 from ..utils import (
5 ExtractorError,
6 GeoRestrictedError,
7 int_or_none,
8 join_nonempty,
9 parse_iso8601,
10 parse_qs,
11 strip_or_none,
12 traverse_obj,
13 url_or_none,
14 )
15
16
# Common base for the ARTE extractors below; it only carries shared constants.
class ArteTVBaseIE(InfoExtractor):
    # Root of the ARTE player API; the extractors append
    # `/config/<lang>/<id>` or `/playlist/<lang>/<id>` to it
    _API_BASE = 'https://api.arte.tv/api/player/v2'
    # Language codes accepted in ARTE URLs, written as a regex alternation so
    # the string can be interpolated directly into `_VALID_URL` patterns
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
20
21
# Extractor for a single ARTE program (or the live stream), addressed through
# an arte.tv video page, a player API config URL or an `arte://program/` URL.
class ArteTVIE(ArteTVBaseIE):
    _VALID_URL = r'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                    |arte://program)
                        /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/fr/videos/109067-000-A/la-loi-de-teheran/',
        'info_dict': {
            'id': '109067-000-A',
            'ext': 'mp4',
            'description': 'md5:d2ca367b8ecee028dddaa8bd1aebc739',
            'timestamp': 1713927600,
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/3rR6PLzfbigSkkeHtkCZNF/940x530',
            'duration': 7599,
            'title': 'La loi de Téhéran',
            'upload_date': '20240424',
            'subtitles': {
                'fr': 'mincount:1',
                'fr-acc': 'mincount:1',
                'fr-forced': 'mincount:1',
            },
        },
    }, {
        'note': 'age-restricted',
        'url': 'https://www.arte.tv/de/videos/006785-000-A/the-element-of-crime/',
        'info_dict': {
            'id': '006785-000-A',
            'description': 'md5:c2f94fdfefc8a280e4dab68ab96ab0ba',
            'title': 'The Element of Crime',
            'timestamp': 1696111200,
            'duration': 5849,
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/q82dTTfyuCXupPsGxXsd7B/940x530',
            'upload_date': '20230930',
            'ext': 'mp4',
        },
        'skip': '404 Not Found',
    }]

    _GEO_BYPASS = True

    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Parses the `eStat.ml5` version code of a stream into its voice language,
    # audio-description flag and (optional) subtitle information
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': (
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        ),
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': (
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        ),
        'SAT': (
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        ),
    }

    @staticmethod
    def _fix_accessible_subs_locale(subs):
        """Re-key subtitle tracks so special variants get their own language tag.

        For every track in ``subs`` (a mapping of language -> list of track
        dicts) the track URL decides the key it is filed under in the result:
        URLs ending in ``-MAL.m3u8`` get the suffix ``acc``, URLs that do not
        contain ``_VO`` get the suffix ``forced``, and all others keep the
        plain language key. A new dict is returned; ``subs`` is not modified.
        """
        updated_subs = {}
        for lang, sub_formats in subs.items():
            for fmt in sub_formats:
                url = fmt.get('url') or ''
                suffix = ('acc' if url.endswith('-MAL.m3u8')
                          else 'forced' if '_VO' not in url
                          else None)
                updated_subs.setdefault(join_nonempty(lang, suffix), []).append(fmt)
        return updated_subs

    def _language_preference(self, version_code, language_code):
        """Rank a stream version code against the requested language.

        Returns -1 when ``version_code`` does not match ``_VERSION_CODE_RE``.
        Otherwise six yes/no criteria are written as a string of 0/1 digits,
        most important first, and that string is parsed as an integer, so
        satisfying an earlier criterion always outweighs all later ones.
        """
        m = self._VERSION_CODE_RE.match(version_code)
        if not m:
            return -1
        return int(''.join('01'[x] for x in (
            m.group('vlang') == language_code,      # we prefer voice in the requested language
            not m.group('audio_desc'),              # and not the audio description version
            bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
            m.group('sub_lang') == language_code,   # if subtitles are present, we prefer them in the requested language
            not m.group('has_sub'),                 # but we prefer no subtitles otherwise
            not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
        )))

    def _real_extract(self, url):
        """Download the player config for ``url`` and build the info dict.

        Raises GeoRestrictedError when the config flags the video as
        geo-restricted, and an expected ExtractorError when the config
        carries no ``rights`` entry.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        # NOTE(review): `arte://program/...` URLs carry no language group, so
        # `lang` is None for them and the text 'None' is formatted into the
        # API path below - confirm whether a default language is wanted
        lang = mobj.group('lang') or mobj.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id, headers={
            'x-validated-age': '18'
        })

        geoblocking = traverse_obj(
            config, ('data', 'attributes', 'restriction', 'geoblocking'), expected_type=dict) or {}
        if geoblocking.get('restrictedArea'):
            # Look the code up defensively: a missing 'code' must not turn the
            # geo-restriction report into an unrelated KeyError
            geo_code = geoblocking.get('code')
            raise GeoRestrictedError(
                f'Video restricted to {geo_code!r}',
                countries=self._COUNTRIES_MAP.get(geo_code, ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        formats, subtitles = [], {}
        # HLS variants whose short label starts with 'cc' or 'OGsub' are
        # collected here and appended after all other formats
        secondary_formats = []
        for stream in config['data']['attributes']['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            stream_version = stream['versions'][0]
            stream_version_code = stream_version['eStat']['ml5']

            lang_pref = self._language_preference(stream_version_code, language_code)

            short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
            if 'HLS' in stream['protocol']:
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
                for fmt in fmts:
                    fmt.update({
                        'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
                        'language_preference': lang_pref,
                    })
                if short_label.startswith(('cc', 'OGsub')):
                    secondary_formats.extend(fmts)
                else:
                    formats.extend(fmts)
                subs = self._fix_accessible_subs_locale(subs)
                self._merge_subtitles(subs, target=subtitles)

            elif stream['protocol'] in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{stream["protocol"]}-{stream_version_code}',
                    'url': stream['url'],
                    'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')

        formats.extend(secondary_formats)
        self._remove_duplicate_formats(formats)

        metadata = config['data']['attributes']['metadata']

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # the API 'subtitle' is preferred as title; when it is present the
            # API 'title' is reported as alt_title instead
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
            'is_live': config['data']['attributes'].get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [
                {'url': image['url'], 'id': image.get('caption')}
                for image in metadata.get('images') or [] if url_or_none(image.get('url'))
            ],
            # TODO: chapters may also be in stream['segments']?
            'chapters': traverse_obj(config, ('data', 'attributes', 'chapters', 'elements', ..., {
                'start_time': 'startTime',
                'title': 'title',
            })) or None,
        }
236
237
# Extractor for the embeddable ARTE player page; the real work is delegated to
# ArteTVIE through the API URL found in the `json_url` query parameter.
class ArteTVEmbedIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available',
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a url_result pointing ArteTVIE at the embedded ``json_url``."""
        # `_VALID_URL` requires a `json_url=` parameter; take its first value
        target_url = parse_qs(url)['json_url'][0]
        program_id = ArteTVIE._match_id(target_url)
        return self.url_result(target_url, ie=ArteTVIE.ie_key(), video_id=program_id)
262
263
# Extractor for ARTE collections (`RC-xxxxxx` ids), resolved through the
# playlist endpoint of the player API.
class ArteTVPlaylistIE(ArteTVBaseIE):
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        """Fetch the playlist from the API and return it as a playlist result.

        Each item becomes a ``url_transparent`` entry handled by ArteTVIE,
        pre-filled with the metadata the playlist response already provides.
        """
        mobj = self._match_valid_url(url)
        lang, playlist_id = mobj.group('lang'), mobj.group('id')
        response = self._download_json(
            f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)
        playlist = response['data']['attributes']

        entries = []
        # the filter only lets through items that have a truthy config URL
        for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url'])):
            entries.append({
                '_type': 'url_transparent',
                'url': video['config']['url'],
                'ie_key': ArteTVIE.ie_key(),
                'id': video.get('providerId'),
                'title': video.get('title'),
                'alt_title': video.get('subtitle'),
                'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
                'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
            })

        playlist_title = traverse_obj(playlist, ('metadata', 'title'))
        playlist_description = traverse_obj(playlist, ('metadata', 'description'))
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
298
299
# Extractor for ARTE category pages: scrapes the page for links that ArteTVIE
# or ArteTVPlaylistIE can handle and returns them as a playlist.
class ArteTVCategoryIE(ArteTVBaseIE):
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Claim ``url`` only if neither ArteTVIE nor ArteTVPlaylistIE does."""
        # `_VALID_URL` here is broad enough to also match video and
        # collection URLs, so the more specific extractors get first refusal
        for ie in (ArteTVIE, ArteTVPlaylistIE):
            if ie.suitable(url):
                return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Collect the video/collection links of a category page as a playlist."""
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        webpage = self._download_webpage(url, playlist_id)

        link_re = r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang
        hrefs = (mobj.group('url') for mobj in re.finditer(link_re, webpage))
        # drop self-links and anything the video/collection extractors reject
        items = [
            href for href in hrefs
            if href != url and any(ie.suitable(href) for ie in (ArteTVIE, ArteTVPlaylistIE))
        ]

        # keep only the part of the page title before the last '|'
        page_title = self._generic_title('', webpage, default='')
        title = strip_or_none(page_title.rsplit('|', 1)[0]) or None
        description = self._og_search_description(webpage, default=None)

        return self.playlist_from_matches(
            items, playlist_id=playlist_id, playlist_title=title, description=description)