]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/arte.py
[cleanup] Add more ruff rules (#10149)
[yt-dlp.git] / yt_dlp / extractor / arte.py
CommitLineData
d5822b96 1import re
d5822b96
PH
2
3from .common import InfoExtractor
4from ..utils import (
c0892b2b 5 ExtractorError,
051d6b45 6 GeoRestrictedError,
d24a2b20 7 int_or_none,
7b567494 8 join_nonempty,
051d6b45 9 parse_iso8601,
4dfbf869 10 parse_qs,
50e93e03 11 strip_or_none,
051d6b45 12 traverse_obj,
8bdd16b4 13 url_or_none,
d5822b96
PH
14)
15
d5822b96 16
class ArteTVBaseIE(InfoExtractor):
    # Shared constants for the Arte extractors in this module.

    # Regex alternation of the language codes accepted in arte.tv URLs;
    # it is interpolated into the _VALID_URL patterns of the subclasses.
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
    # Root of the player API; '/config/...' and '/playlist/...' are appended to it.
    _API_BASE = 'https://api.arte.tv/api/player/v2'
8bdd16b4 20
21
class ArteTVIE(ArteTVBaseIE):
    # Extractor for single Arte programmes, addressed either through the
    # website, through the player API config URL, or through arte://program.
    _VALID_URL = rf'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>{ArteTVBaseIE._ARTE_LANGUAGES})/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>{ArteTVBaseIE._ARTE_LANGUAGES})
                        )
                    |arte://program)
                        /(?P<id>\d{{6}}-\d{{3}}-[AF]|LIVE)
                    '''
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/fr/videos/109067-000-A/la-loi-de-teheran/',
        'info_dict': {
            'id': '109067-000-A',
            'ext': 'mp4',
            'description': 'md5:d2ca367b8ecee028dddaa8bd1aebc739',
            'timestamp': 1713927600,
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/3rR6PLzfbigSkkeHtkCZNF/940x530',
            'duration': 7599,
            'title': 'La loi de Téhéran',
            'upload_date': '20240424',
            'subtitles': {
                'fr': 'mincount:1',
                'fr-acc': 'mincount:1',
                'fr-forced': 'mincount:1',
            },
        },
    }, {
        'note': 'age-restricted',
        'url': 'https://www.arte.tv/de/videos/006785-000-A/the-element-of-crime/',
        'info_dict': {
            'id': '006785-000-A',
            'description': 'md5:c2f94fdfefc8a280e4dab68ab96ab0ba',
            'title': 'The Element of Crime',
            'timestamp': 1696111200,
            'duration': 5849,
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/q82dTTfyuCXupPsGxXsd7B/940x530',
            'upload_date': '20230930',
            'ext': 'mp4',
        },
        'skip': '404 Not Found',
    }]

    _GEO_BYPASS = True

    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Splits a stream version code (the `eStat.ml5` value) into named parts:
    # voice flags/language first, then an optional '-ST...' subtitle part.
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': (
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        ),
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': (
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        ),
        'SAT': (
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        ),
    }

    @staticmethod
    def _fix_accessible_subs_locale(subs):
        """Regroup subtitle tracks under keys derived from their URLs.

        A track whose URL ends with '-MAL.m3u8' is filed under '<lang>-acc',
        any other track whose URL does not contain '_VO' is filed under
        '<lang>-forced', and the remaining tracks stay under '<lang>'.
        Returns a new dict; the input mapping itself is not modified.
        """
        regrouped = {}
        for lang, tracks in subs.items():
            for track in tracks:
                # A missing or empty URL is treated as '' and thus ends up as 'forced'
                track_url = track.get('url') or ''
                if track_url.endswith('-MAL.m3u8'):
                    suffix = 'acc'
                elif '_VO' not in track_url:
                    suffix = 'forced'
                else:
                    suffix = None
                regrouped.setdefault(join_nonempty(lang, suffix), []).append(track)
        return regrouped

    def _language_preference(self, version_code, language_code):
        """Rank a stream version code against the requested language.

        Returns -1 when the code does not match _VERSION_CODE_RE. Otherwise
        each criterion below contributes one '0'/'1' digit, most important
        first, and the digit string is read as an integer.
        """
        parts = self._VERSION_CODE_RE.match(version_code)
        if not parts:
            return -1

        criteria = (
            parts.group('vlang') == language_code,  # we prefer voice in the requested language
            not parts.group('audio_desc'),  # and not the audio description version
            bool(parts.group('original_voice')),  # but if voice is not in the requested language, at least choose the original voice
            parts.group('sub_lang') == language_code,  # if subtitles are present, we prefer them in the requested language
            not parts.group('has_sub'),  # but we prefer no subtitles otherwise
            not parts.group('sdh_sub'),  # and we prefer not the hard-of-hearing subtitles if there are subtitles
        )
        return int(''.join(str(int(flag)) for flag in criteria))

    def _real_extract(self, url):
        """Fetch the player config for the URL and build the info dict.

        Raises GeoRestrictedError when the config flags a restricted area and
        ExtractorError (expected) when the config carries no rights data.
        """
        matched = self._match_valid_url(url)
        video_id = matched.group('id')
        # Only one of the two language groups can be set, depending on the URL form
        lang = matched.group('lang') or matched.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        config = self._download_json(
            f'{self._API_BASE}/config/{lang}/{video_id}', video_id,
            headers={'x-validated-age': '18'})

        geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            area_code = geoblocking['code']
            raise GeoRestrictedError(
                f'Video restricted to {area_code!r}',
                # Unknown area codes fall back to Germany and France
                countries=self._COUNTRIES_MAP.get(area_code, ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        # The rights check above succeeded, so this path is known to exist
        attributes = config['data']['attributes']

        formats, secondary_formats, subtitles = [], [], {}
        for stream in attributes['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            version = stream['versions'][0]
            version_code = version['eStat']['ml5']
            lang_pref = self._language_preference(version_code, language_code)

            short_label = traverse_obj(version, 'shortLabel', expected_type=str, default='?')
            protocol = stream['protocol']
            format_note = f'{version.get("label", "unknown")} [{short_label}]'

            if 'HLS' in protocol:
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=version_code, fatal=False)
                for fmt in fmts:
                    fmt['format_note'] = format_note
                    fmt['language_preference'] = lang_pref
                # Versions labelled 'cc...'/'OGsub...' are appended after all other formats
                if short_label.startswith(('cc', 'OGsub')):
                    secondary_formats.extend(fmts)
                else:
                    formats.extend(fmts)
                self._merge_subtitles(self._fix_accessible_subs_locale(subs), target=subtitles)
            elif protocol in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{protocol}-{version_code}',
                    'url': stream['url'],
                    'format_note': format_note,
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })
            else:
                self.report_warning(f'Skipping stream with unknown protocol {protocol}')

        formats.extend(secondary_formats)
        self._remove_duplicate_formats(formats)

        metadata = attributes['metadata']

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # The subtitle, when present, becomes the title and the title the alt_title
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(attributes, ('rights', 'begin'), expected_type=parse_iso8601),
            'is_live': attributes.get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [
                {'url': image['url'], 'id': image.get('caption')}
                for image in metadata.get('images') or [] if url_or_none(image.get('url'))
            ],
            # TODO: chapters may also be in stream['segments']?
            'chapters': traverse_obj(attributes, ('chapters', 'elements', ..., {
                'start_time': 'startTime',
                'title': 'title',
            })) or None,
        }
c40f5cf4 236
24114fee 237
class ArteTVEmbedIE(InfoExtractor):
    # Handles the embeddable player page, whose `json_url` query parameter
    # carries the API config URL that ArteTVIE knows how to extract.
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available',
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Hand the URL found in the `json_url` query parameter over to ArteTVIE."""
        # First value of the `json_url` parameter
        config_url = parse_qs(url)['json_url'][0]
        programme_id = ArteTVIE._match_id(config_url)
        return self.url_result(config_url, ie=ArteTVIE.ie_key(), video_id=programme_id)
4b492e35
S
262
263
class ArteTVPlaylistIE(ArteTVBaseIE):
    # Extractor for collection pages, whose IDs have the form 'RC-' + six digits.
    _VALID_URL = rf'https?://(?:www\.)?arte\.tv/(?P<lang>{ArteTVBaseIE._ARTE_LANGUAGES})/videos/(?P<id>RC-\d{{6}})'
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        """Download the playlist JSON and return one entry per item with a config URL."""
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        api_url = f'{self._API_BASE}/playlist/{lang}/{playlist_id}'
        playlist = self._download_json(api_url, playlist_id)['data']['attributes']

        entries = []
        # The filter keeps only items that carry a truthy config URL
        for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url'])):
            entries.append({
                '_type': 'url_transparent',
                'url': video['config']['url'],
                'ie_key': ArteTVIE.ie_key(),
                'id': video.get('providerId'),
                'title': video.get('title'),
                'alt_title': video.get('subtitle'),
                'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
                'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
            })

        playlist_title = traverse_obj(playlist, ('metadata', 'title'))
        playlist_description = traverse_obj(playlist, ('metadata', 'description'))
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
50e93e03 298
299
class ArteTVCategoryIE(ArteTVBaseIE):
    # Extractor for category pages; the ID is the URL path below '/videos/'.
    _VALID_URL = rf'https?://(?:www\.)?arte\.tv/(?P<lang>{ArteTVBaseIE._ARTE_LANGUAGES})/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$'
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Match only URLs that neither ArteTVIE nor ArteTVPlaylistIE accepts."""
        # The more specific extractors take precedence over this one
        if ArteTVIE.suitable(url) or ArteTVPlaylistIE.suitable(url):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Collect the video and playlist links on the page into a playlist."""
        lang, playlist_id = self._match_valid_url(url).groups()
        webpage = self._download_webpage(url, playlist_id)

        # Anchors linking to same-language '/videos/' pages; the href may be
        # double-quoted, single-quoted or unquoted.
        link_re = rf'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/{lang}/videos/[\w/-]+)(?P=q)'
        links = (found.group('url') for found in re.finditer(link_re, webpage))
        # Skip links back to this page and keep only those another Arte extractor handles
        items = [
            link for link in links
            if link != url and (ArteTVIE.suitable(link) or ArteTVPlaylistIE.suitable(link))
        ]

        # Keep the part of the page title before its last '|'
        page_title = self._generic_title('', webpage, default='')
        title = strip_or_none(page_title.rsplit('|', 1)[0]) or None

        return self.playlist_from_matches(
            items, playlist_id=playlist_id, playlist_title=title,
            description=self._og_search_description(webpage, default=None))