]>
Commit | Line | Data |
---|---|---|
d5822b96 | 1 | import re |
d5822b96 PH |
2 | |
3 | from .common import InfoExtractor | |
4 | from ..utils import ( | |
c0892b2b | 5 | ExtractorError, |
051d6b45 | 6 | GeoRestrictedError, |
d24a2b20 | 7 | int_or_none, |
7b567494 | 8 | join_nonempty, |
051d6b45 | 9 | parse_iso8601, |
4dfbf869 | 10 | parse_qs, |
50e93e03 | 11 | strip_or_none, |
051d6b45 | 12 | traverse_obj, |
8bdd16b4 | 13 | url_or_none, |
d5822b96 PH |
14 | ) |
15 | ||
d5822b96 | 16 | |
class ArteTVBaseIE(InfoExtractor):
    """Base class holding constants reused by the Arte extractors that subclass it."""

    # Root of the Arte player API; subclasses append `/config/...` or `/playlist/...`
    _API_BASE = 'https://api.arte.tv/api/player/v2'
    # Regex alternation of the language codes accepted in arte.tv URLs
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
8bdd16b4 | 20 | |
21 | ||
class ArteTVIE(ArteTVBaseIE):
    """Extractor for a single Arte program (or the LIVE stream) via the player config API."""

    # Matches arte.tv video pages, direct player-config API URLs and the
    # `arte://program/<id>` scheme. Only the first two carry a language
    # (groups `lang` / `lang_2`).
    _VALID_URL = rf'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>{ArteTVBaseIE._ARTE_LANGUAGES})/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>{ArteTVBaseIE._ARTE_LANGUAGES})
                        )
                    |arte://program)
                        /(?P<id>\d{{6}}-\d{{3}}-[AF]|LIVE)
                    '''
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/fr/videos/109067-000-A/la-loi-de-teheran/',
        'info_dict': {
            'id': '109067-000-A',
            'ext': 'mp4',
            'description': 'md5:d2ca367b8ecee028dddaa8bd1aebc739',
            'timestamp': 1713927600,
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/3rR6PLzfbigSkkeHtkCZNF/940x530',
            'duration': 7599,
            'title': 'La loi de Téhéran',
            'upload_date': '20240424',
            'subtitles': {
                'fr': 'mincount:1',
                'fr-acc': 'mincount:1',
                'fr-forced': 'mincount:1',
            },
        },
    }, {
        'note': 'age-restricted',
        'url': 'https://www.arte.tv/de/videos/006785-000-A/the-element-of-crime/',
        'info_dict': {
            'id': '006785-000-A',
            'description': 'md5:c2f94fdfefc8a280e4dab68ab96ab0ba',
            'title': 'The Element of Crime',
            'timestamp': 1696111200,
            'duration': 5849,
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/q82dTTfyuCXupPsGxXsd7B/940x530',
            'upload_date': '20230930',
            'ext': 'mp4',
        },
        'skip': '404 Not Found',
    }]

    _GEO_BYPASS = True

    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Parses a stream version code (the `eStat.ml5` value of a stream version)
    # into: original-voice flag, voice language, audio-description flag and an
    # optional subtitle part (presence, hard-of-hearing flag, subtitle language).
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': (
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        ),
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': (
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        ),
        'SAT': (
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        ),
    }

    @staticmethod
    def _fix_accessible_subs_locale(subs):
        """Re-key subtitle tracks so accessible and forced variants get their own language tag.

        For every track in `subs` (a mapping of language -> list of subtitle
        dicts), a suffix is chosen from the track URL: 'acc' when the URL ends
        with '-MAL.m3u8', otherwise 'forced' when the URL does not contain
        '_VO', otherwise no suffix. The track is filed under the language
        joined with that suffix via `join_nonempty`. A new dict is returned;
        `subs` itself is not modified.
        """
        updated_subs = {}
        for lang, sub_formats in subs.items():
            for fmt in sub_formats:
                url = fmt.get('url') or ''
                if url.endswith('-MAL.m3u8'):
                    suffix = 'acc'
                elif '_VO' not in url:
                    suffix = 'forced'
                else:
                    suffix = None
                updated_subs.setdefault(join_nonempty(lang, suffix), []).append(fmt)
        return updated_subs

    @classmethod
    def _language_preference(cls, version_code, language_code):
        """Rank a stream version code against the requested language.

        Returns -1 when `version_code` does not match `_VERSION_CODE_RE`.
        Otherwise returns an int whose decimal digits are 0/1 flags, most
        significant criterion first, so a larger value is a better match.
        """
        m = cls._VERSION_CODE_RE.match(version_code)
        if not m:
            return -1
        return int(''.join('01'[x] for x in (
            m.group('vlang') == language_code,      # we prefer voice in the requested language
            not m.group('audio_desc'),              # and not the audio description version
            bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
            m.group('sub_lang') == language_code,   # if subtitles are present, we prefer them in the requested language
            not m.group('has_sub'),                 # but we prefer no subtitles otherwise
            not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
        )))

    def _real_extract(self, url):
        """Download the player config for `url` and build the info dict.

        Raises GeoRestrictedError when the config reports a restricted area and
        ExtractorError (expected) when the config carries no `rights` entry.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        # NOTE(review): for `arte://program/...` URLs neither language group
        # matches, so `lang` is None and is interpolated as-is into the API
        # URL below - confirm this is intended.
        lang = mobj.group('lang') or mobj.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id, headers={
            'x-validated-age': '18',
        })

        geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            # Read the code leniently: a missing 'code' must still surface as a
            # geo-restriction error rather than a KeyError.
            geo_code = geoblocking.get('code')
            raise GeoRestrictedError(f'Video restricted to {geo_code!r}',
                                     countries=self._COUNTRIES_MAP.get(geo_code, ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        formats, subtitles = [], {}
        # Formats whose short label starts with 'cc' or 'OGsub' are collected
        # separately and appended after all other formats.
        secondary_formats = []
        for stream in config['data']['attributes']['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            stream_version = stream['versions'][0]
            stream_version_code = stream_version['eStat']['ml5']

            lang_pref = self._language_preference(stream_version_code, language_code)

            short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
            format_note = f'{stream_version.get("label", "unknown")} [{short_label}]'
            protocol = stream['protocol']

            if 'HLS' in protocol:
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
                for fmt in fmts:
                    fmt.update({
                        'format_note': format_note,
                        'language_preference': lang_pref,
                    })
                if short_label.startswith(('cc', 'OGsub')):
                    secondary_formats.extend(fmts)
                else:
                    formats.extend(fmts)
                subs = self._fix_accessible_subs_locale(subs)
                self._merge_subtitles(subs, target=subtitles)

            elif protocol in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{protocol}-{stream_version_code}',
                    'url': stream['url'],
                    'format_note': format_note,
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(f'Skipping stream with unknown protocol {protocol}')

        formats.extend(secondary_formats)
        self._remove_duplicate_formats(formats)

        metadata = config['data']['attributes']['metadata']

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # The API 'subtitle' becomes the title when present; the API
            # 'title' is then exposed as alt_title.
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            # Coerced the same way ArteTVPlaylistIE coerces entry durations
            'duration': int_or_none(traverse_obj(metadata, ('duration', 'seconds'))),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
            'is_live': config['data']['attributes'].get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [
                {'url': image['url'], 'id': image.get('caption')}
                for image in metadata.get('images') or [] if url_or_none(image.get('url'))
            ],
            # TODO: chapters may also be in stream['segments']?
            'chapters': traverse_obj(config, ('data', 'attributes', 'chapters', 'elements', ..., {
                'start_time': 'startTime',
                'title': 'title',
            })) or None,
        }
c40f5cf4 | 236 | |
24114fee | 237 | |
class ArteTVEmbedIE(InfoExtractor):
    """Extractor for the embeddable Arte player; hands the embedded config URL to ArteTVIE."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available',
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a url_result pointing ArteTVIE at the `json_url` query parameter of `url`."""
        # The first `json_url` value in the query string is the player config URL
        config_url = parse_qs(url)['json_url'][0]
        program_id = ArteTVIE._match_id(config_url)
        return self.url_result(config_url, ie=ArteTVIE.ie_key(), video_id=program_id)
4b492e35 S |
262 | |
263 | ||
class ArteTVPlaylistIE(ArteTVBaseIE):
    """Extractor for Arte collections (`RC-xxxxxx` ids) via the player playlist API."""

    _VALID_URL = rf'https?://(?:www\.)?arte\.tv/(?P<lang>{ArteTVBaseIE._ARTE_LANGUAGES})/videos/(?P<id>RC-\d{{6}})'
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        """Fetch the playlist JSON and return a playlist of url_transparent entries for ArteTVIE."""
        match = self._match_valid_url(url)
        lang = match.group('lang')
        playlist_id = match.group('id')

        response = self._download_json(
            f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)
        playlist = response['data']['attributes']

        entries = []
        # Only items that carry a config URL are kept (the filter lambda
        # selects on `v['config']['url']`).
        for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url'])):
            main_image_url = traverse_obj(video, ('mainImage', 'url'))
            seconds = traverse_obj(video, ('duration', 'seconds'))
            entries.append({
                '_type': 'url_transparent',
                'url': video['config']['url'],
                'ie_key': ArteTVIE.ie_key(),
                'id': video.get('providerId'),
                'title': video.get('title'),
                'alt_title': video.get('subtitle'),
                'thumbnail': url_or_none(main_image_url),
                'duration': int_or_none(seconds),
            })

        playlist_title = traverse_obj(playlist, ('metadata', 'title'))
        playlist_description = traverse_obj(playlist, ('metadata', 'description'))
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
50e93e03 | 298 | |
299 | ||
class ArteTVCategoryIE(ArteTVBaseIE):
    """Extractor for Arte category pages; collects the video and collection links found on the page."""

    _VALID_URL = rf'https?://(?:www\.)?arte\.tv/(?P<lang>{ArteTVBaseIE._ARTE_LANGUAGES})/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$'
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept `url` only if neither ArteTVIE nor ArteTVPlaylistIE claims it."""
        if ArteTVIE.suitable(url) or ArteTVPlaylistIE.suitable(url):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Download the category page and return a playlist of the Arte links it contains."""
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        webpage = self._download_webpage(url, playlist_id)

        # Anchor tags linking to same-language arte.tv video URLs
        link_pattern = rf'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/{lang}/videos/[\w/-]+)(?P=q)'
        candidates = (m.group('url') for m in re.finditer(link_pattern, webpage))
        # Skip links back to this page; keep only links another Arte extractor handles
        items = [
            link for link in candidates
            if link != url and (ArteTVIE.suitable(link) or ArteTVPlaylistIE.suitable(link))
        ]

        # Keep the part of the page title before the last '|'
        page_title = self._generic_title('', webpage, default='')
        title = strip_or_none(page_title.rsplit('|', 1)[0]) or None

        return self.playlist_from_matches(
            items, playlist_id=playlist_id, playlist_title=title,
            description=self._og_search_description(webpage, default=None))