]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/arte.py
[ie/nebula] Overhaul extractors (#8566)
[yt-dlp.git] / yt_dlp / extractor / arte.py
1 import re
2
3 from .common import InfoExtractor
4 from ..utils import (
5 ExtractorError,
6 GeoRestrictedError,
7 int_or_none,
8 parse_iso8601,
9 parse_qs,
10 strip_or_none,
11 traverse_obj,
12 url_or_none,
13 )
14
15
class ArteTVBaseIE(InfoExtractor):
    """Common constants shared by the ARTE extractors defined in this module."""

    # Regex alternation of the site language prefixes; interpolated into the
    # `_VALID_URL` patterns of the subclasses.
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'

    # Root of the player API; subclasses append `/config/...` or `/playlist/...`.
    _API_BASE = 'https://api.arte.tv/api/player/v2'
19
20
class ArteTVIE(ArteTVBaseIE):
    """Extractor for a single ARTE programme (or the LIVE stream).

    The programme is resolved through the player "config" API endpoint, from
    which the streams, subtitles and metadata are read.
    """

    _VALID_URL = r'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                    |arte://program)
                        /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
        'info_dict': {
            'id': '100103-000-A',
            'title': 'USA: Dyskryminacja na porodówce',
            'description': 'md5:242017b7cce59ffae340a54baefcafb1',
            'alt_title': 'ARTE Reportage',
            'upload_date': '20201103',
            'duration': 554,
            'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
            'timestamp': 1604417980,
            'ext': 'mp4',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
        'only_matching': True,
    }, {
        'note': 'age-restricted',
        'url': 'https://www.arte.tv/de/videos/006785-000-A/the-element-of-crime/',
        'info_dict': {
            'id': '006785-000-A',
            'description': 'md5:c2f94fdfefc8a280e4dab68ab96ab0ba',
            'title': 'The Element of Crime',
            'timestamp': 1696111200,
            'duration': 5849,
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/q82dTTfyuCXupPsGxXsd7B/940x530',
            'upload_date': '20230930',
            'ext': 'mp4',
        }
    }]

    _GEO_BYPASS = True

    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Parses a stream "version code" (the `eStat.ml5` field of a stream
    # version) into voice language, audio-description and subtitle flags.
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # Geoblocking code -> countries passed to GeoRestrictedError.
    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': (
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        ),
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': (
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        ),
        'SAT': (
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        ),
    }

    def _language_preference(self, version_code, language_code):
        """Score a stream version code against the requested language.

        Returns -1 when `version_code` does not match `_VERSION_CODE_RE`.
        Otherwise returns an integer whose decimal digits are 0/1 flags,
        most significant criterion first, so that a larger number means a
        more preferred stream.
        """
        m = self._VERSION_CODE_RE.match(version_code)
        if not m:
            return -1

        criteria = (
            m.group('vlang') == language_code,  # we prefer voice in the requested language
            not m.group('audio_desc'),  # and not the audio description version
            bool(m.group('original_voice')),  # but if voice is not in the requested language, at least choose the original voice
            m.group('sub_lang') == language_code,  # if subtitles are present, we prefer them in the requested language
            not m.group('has_sub'),  # but we prefer no subtitles otherwise
            not m.group('sdh_sub'),  # and we prefer not the hard-of-hearing subtitles if there are subtitles
        )
        # Each bool indexes into '01'; the digit string is read as a base-10 int.
        return int(''.join('01'[flag] for flag in criteria))

    def _real_extract(self, url):
        """Download the player config for `url` and build the info dict.

        Raises:
            GeoRestrictedError: the config marks the video as geo-restricted.
            ExtractorError: the config carries no `rights` entry (video not
                available in this language edition or rights expired).
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        # NOTE(review): for `arte://program/...` URLs neither language group can
        # match, so `lang` is None here and ends up in the API URL — confirm intended.
        lang = mobj.group('lang') or mobj.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        # The header presumably lets the API serve age-restricted programmes
        # (see the 'age-restricted' test) — TODO confirm.
        config = self._download_json(
            f'{self._API_BASE}/config/{lang}/{video_id}', video_id,
            headers={'x-validated-age': '18'})

        geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            # Use .get() so that a missing 'code' still yields the intended
            # GeoRestrictedError (with the fallback countries) instead of a KeyError.
            geo_code = geoblocking.get('code')
            raise GeoRestrictedError(
                f'Video restricted to {geo_code!r}',
                countries=self._COUNTRIES_MAP.get(geo_code, ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        formats, subtitles = [], {}
        # HLS formats whose short label starts with 'cc' or 'OGsub' are collected
        # separately and appended after all other formats.
        secondary_formats = []
        for stream in config['data']['attributes']['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            stream_version = stream['versions'][0]
            stream_version_code = stream_version['eStat']['ml5']

            lang_pref = self._language_preference(stream_version_code, language_code)
            short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
            format_note = f'{stream_version.get("label", "unknown")} [{short_label}]'

            if 'HLS' in stream['protocol']:
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
                for fmt in fmts:
                    fmt.update({
                        'format_note': format_note,
                        'language_preference': lang_pref,
                    })
                if short_label.startswith(('cc', 'OGsub')):
                    secondary_formats.extend(fmts)
                else:
                    formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)

            elif stream['protocol'] in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{stream["protocol"]}-{stream_version_code}',
                    'url': stream['url'],
                    'format_note': format_note,
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')

        formats.extend(secondary_formats)
        self._remove_duplicate_formats(formats)

        metadata = config['data']['attributes']['metadata']

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # When a subtitle exists it becomes the title and the API title
            # becomes the alt_title; otherwise the API title is used alone.
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
            'is_live': config['data']['attributes'].get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [
                {'url': image['url'], 'id': image.get('caption')}
                for image in metadata.get('images') or [] if url_or_none(image.get('url'))
            ],
            # TODO: chapters may also be in stream['segments']?
            'chapters': traverse_obj(config, ('data', 'attributes', 'chapters', 'elements', ..., {
                'start_time': 'startTime',
                'title': 'title',
            })) or None,
        }
218
219
class ArteTVEmbedIE(InfoExtractor):
    """Extractor for the embeddable ARTE player page.

    The player URL carries the real config URL in its `json_url` query
    parameter; extraction is delegated to ArteTVIE with that URL.
    """

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available'
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Hand the embedded `json_url` over to ArteTVIE."""
        # First value of the `json_url` query parameter is the target URL.
        target_url = parse_qs(url)['json_url'][0]
        programme_id = ArteTVIE._match_id(target_url)
        return self.url_result(target_url, ie=ArteTVIE.ie_key(), video_id=programme_id)
244
245
class ArteTVPlaylistIE(ArteTVBaseIE):
    """Extractor for ARTE collections (`RC-xxxxxx` ids) via the playlist API."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        """Fetch the playlist attributes and emit one transparent entry per item."""
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        api_url = f'{self._API_BASE}/playlist/{lang}/{playlist_id}'
        attributes = self._download_json(api_url, playlist_id)['data']['attributes']

        entries = []
        # Only items that actually carry a config URL are kept.
        for item in traverse_obj(attributes, ('items', lambda _, v: v['config']['url'])):
            entries.append({
                '_type': 'url_transparent',
                'url': item['config']['url'],
                'ie_key': ArteTVIE.ie_key(),
                'id': item.get('providerId'),
                'title': item.get('title'),
                'alt_title': item.get('subtitle'),
                'thumbnail': url_or_none(traverse_obj(item, ('mainImage', 'url'))),
                'duration': int_or_none(traverse_obj(item, ('duration', 'seconds'))),
            })

        playlist_title = traverse_obj(attributes, ('metadata', 'title'))
        playlist_description = traverse_obj(attributes, ('metadata', 'description'))
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
280
281
class ArteTVCategoryIE(ArteTVBaseIE):
    """Extractor for ARTE category pages.

    The page HTML is scanned for links that ArteTVIE or ArteTVPlaylistIE
    can handle, and those links form the resulting playlist.
    """

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept `url` only if the more specific ARTE extractors reject it."""
        # This pattern is broad, so video and collection URLs are excluded first.
        if ArteTVIE.suitable(url) or ArteTVPlaylistIE.suitable(url):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Download the category page and build a playlist from its video links."""
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        webpage = self._download_webpage(url, playlist_id)

        link_re = r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang
        candidates = (m.group('url') for m in re.finditer(link_re, webpage))
        # Skip self-links and keep only URLs another ARTE extractor can handle.
        items = [
            link for link in candidates
            if link != url and (ArteTVIE.suitable(link) or ArteTVPlaylistIE.suitable(link))
        ]

        # Page titles look like "<name> | <site>"; keep the part before the last '|'.
        raw_title = self._generic_title('', webpage, default='')
        title = strip_or_none(raw_title.rsplit('|', 1)[0]) or None
        description = self._og_search_description(webpage, default=None)

        return self.playlist_from_matches(
            items, playlist_id=playlist_id, playlist_title=title, description=description)