]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/arte.py
[ie/crunchyroll] Fix stream extraction (#10005)
[yt-dlp.git] / yt_dlp / extractor / arte.py
1 import re
2
3 from .common import InfoExtractor
4 from ..utils import (
5 ExtractorError,
6 GeoRestrictedError,
7 int_or_none,
8 parse_iso8601,
9 parse_qs,
10 strip_or_none,
11 traverse_obj,
12 url_or_none,
13 )
14
15
class ArteTVBaseIE(InfoExtractor):
    # Constants shared by the ARTE extractors that subclass this base.

    # Regex alternation of the language codes accepted in ARTE URLs;
    # interpolated into the subclasses' _VALID_URL patterns.
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'

    # Root of the player API; subclasses append '/config/...' or
    # '/playlist/...' to it.
    _API_BASE = 'https://api.arte.tv/api/player/v2'
19
20
class ArteTVIE(ArteTVBaseIE):
    # Extractor for a single ARTE programme, addressed by a website URL,
    # a player-API config URL, or an internal ``arte://program/<id>`` URL.
    _VALID_URL = r'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                    |arte://program)
                        /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
        'info_dict': {
            'id': '100103-000-A',
            'title': 'USA: Dyskryminacja na porodówce',
            'description': 'md5:242017b7cce59ffae340a54baefcafb1',
            'alt_title': 'ARTE Reportage',
            'upload_date': '20201103',
            'duration': 554,
            'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
            'timestamp': 1604417980,
            'ext': 'mp4',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
        'only_matching': True,
    }, {
        'note': 'age-restricted',
        'url': 'https://www.arte.tv/de/videos/006785-000-A/the-element-of-crime/',
        'info_dict': {
            'id': '006785-000-A',
            'description': 'md5:c2f94fdfefc8a280e4dab68ab96ab0ba',
            'title': 'The Element of Crime',
            'timestamp': 1696111200,
            'duration': 5849,
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/q82dTTfyuCXupPsGxXsd7B/940x530',
            'upload_date': '20230930',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://www.arte.tv/de/videos/085374-003-A/im-hohen-norden-geboren/',
        'info_dict': {
            'id': '085374-003-A',
            'ext': 'mp4',
            'description': 'md5:ab79ec7cc472a93164415b4e4916abf9',
            'timestamp': 1702872000,
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/TnyHBfPxv3v2GEY3suXGZP/940x530',
            'duration': 2594,
            'title': 'Die kurze Zeit der Jugend',
            'alt_title': 'Im hohen Norden geboren',
            'upload_date': '20231218',
            'subtitles': {
                'fr': 'mincount:1',
                'fr-acc': 'mincount:1',
            },
        },
    }]

    _GEO_BYPASS = True

    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Parses a stream "version code" (the ``eStat.ml5`` value read in
    # _real_extract) into voice language, original-voice flag, audio
    # description flag and optional subtitle info. The language tokens are
    # the French abbreviations used as values of _LANG_MAP.
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': (
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        ),
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': (
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        ),
        'SAT': (
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        ),
    }

    @staticmethod
    def _fix_accessible_subs_locale(subs):
        """Re-key subtitle tracks whose URL ends in ``-MAL.m3u8``.

        Takes a ``{lang: [format_dict, ...]}`` mapping and returns a new
        mapping in which every format whose ``url`` ends with ``-MAL.m3u8``
        is filed under ``'<lang>-acc'``; all other formats stay under their
        original ``lang`` key. The input mapping is not modified.
        """
        updated_subs = {}
        for lang, sub_formats in subs.items():
            for fmt in sub_formats:
                # Decide the key per format from the *unmodified* language.
                # Mutating ``lang`` here would leak the '-acc' suffix onto
                # every following format of the same language (and stack
                # '-acc-acc' on a second match).
                sub_url = fmt.get('url') or ''  # tolerate a missing/None url
                key = f'{lang}-acc' if sub_url.endswith('-MAL.m3u8') else lang
                updated_subs.setdefault(key, []).append(fmt)
        return updated_subs

    def _real_extract(self, url):
        """Download the player config for ``url`` and build the info dict.

        Raises GeoRestrictedError when the config reports a restricted area,
        and an expected ExtractorError when the config carries no ``rights``.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        # NOTE(review): for 'arte://program/...' URLs neither language group
        # can match, so ``lang`` is None here and is interpolated as such
        # into the config URL below - confirm this is intended.
        lang = mobj.group('lang') or mobj.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        # The age header is sent with every request; presumably it is what
        # lets the 'age-restricted' test programme resolve - TODO confirm.
        config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id, headers={
            'x-validated-age': '18'
        })

        geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            # Unknown restriction codes fall back to ('DE', 'FR')
            raise GeoRestrictedError(f'Video restricted to {geoblocking["code"]!r}',
                                     countries=self._COUNTRIES_MAP.get(geoblocking['code'], ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        formats, subtitles = [], {}
        # HLS formats whose short label starts with 'cc'/'OGsub'; they are
        # appended after all other formats once every stream is processed.
        secondary_formats = []
        for stream in config['data']['attributes']['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            stream_version = stream['versions'][0]
            stream_version_code = stream_version['eStat']['ml5']

            # -1 when the version code cannot be parsed; otherwise the six
            # booleans below are written as the digits 0/1 (most important
            # first) and that digit string is read as a decimal integer.
            lang_pref = -1
            m = self._VERSION_CODE_RE.match(stream_version_code)
            if m:
                lang_pref = int(''.join('01'[x] for x in (
                    m.group('vlang') == language_code,      # we prefer voice in the requested language
                    not m.group('audio_desc'),              # and not the audio description version
                    bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
                    m.group('sub_lang') == language_code,   # if subtitles are present, we prefer them in the requested language
                    not m.group('has_sub'),                 # but we prefer no subtitles otherwise
                    not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
                )))

            short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
            format_note = f'{stream_version.get("label", "unknown")} [{short_label}]'
            if 'HLS' in stream['protocol']:
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
                for fmt in fmts:
                    fmt.update({
                        'format_note': format_note,
                        'language_preference': lang_pref,
                    })
                if short_label.startswith(('cc', 'OGsub')):
                    secondary_formats.extend(fmts)
                else:
                    formats.extend(fmts)
                subs = self._fix_accessible_subs_locale(subs)
                self._merge_subtitles(subs, target=subtitles)

            elif stream['protocol'] in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{stream["protocol"]}-{stream_version_code}',
                    'url': stream['url'],
                    'format_note': format_note,
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')

        formats.extend(secondary_formats)
        self._remove_duplicate_formats(formats)

        metadata = config['data']['attributes']['metadata']

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # When a subtitle exists it becomes the title and the programme
            # title moves to alt_title; otherwise alt_title stays empty.
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
            'is_live': config['data']['attributes'].get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [
                {'url': image['url'], 'id': image.get('caption')}
                for image in metadata.get('images') or [] if url_or_none(image.get('url'))
            ],
            # TODO: chapters may also be in stream['segments']?
            'chapters': traverse_obj(config, ('data', 'attributes', 'chapters', 'elements', ..., {
                'start_time': 'startTime',
                'title': 'title',
            })) or None,
        }
246
247
class ArteTVEmbedIE(InfoExtractor):
    # Extractor for embedded-player URLs that carry the real config URL in
    # their ``json_url`` query parameter.
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available'
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Delegate to ArteTVIE using the first ``json_url`` query value."""
        target_url = parse_qs(url)['json_url'][0]
        # The id is taken from the target URL with ArteTVIE's own pattern
        return self.url_result(
            target_url, video_id=ArteTVIE._match_id(target_url), ie=ArteTVIE.ie_key())
272
273
class ArteTVPlaylistIE(ArteTVBaseIE):
    # Extractor for ARTE collections, identified by an ``RC-<6 digits>`` id.
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        """Fetch the playlist from the player API and return its entries."""
        mobj = self._match_valid_url(url)
        lang = mobj.group('lang')
        playlist_id = mobj.group('id')

        api_url = f'{self._API_BASE}/playlist/{lang}/{playlist_id}'
        playlist = self._download_json(api_url, playlist_id)['data']['attributes']

        entries = []
        # Only items that expose a config URL are selected by the filter
        for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url'])):
            entries.append({
                '_type': 'url_transparent',
                'url': video['config']['url'],
                'ie_key': ArteTVIE.ie_key(),
                'id': video.get('providerId'),
                'title': video.get('title'),
                'alt_title': video.get('subtitle'),
                'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
                'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
            })

        playlist_title = traverse_obj(playlist, ('metadata', 'title'))
        playlist_description = traverse_obj(playlist, ('metadata', 'description'))
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
308
309
class ArteTVCategoryIE(ArteTVBaseIE):
    # Extractor for category pages: any /<lang>/videos/<path> URL that is
    # claimed neither by ArteTVIE nor by ArteTVPlaylistIE (see suitable()).
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept ``url`` only if the more specific ARTE extractors decline it."""
        if any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE)):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Scrape the category page for links to programmes and collections."""
        lang, playlist_id = self._match_valid_url(url).groups()
        webpage = self._download_webpage(url, playlist_id)

        # Anchors pointing at same-language /videos/ pages; the href may be
        # double-quoted, single-quoted or unquoted.
        link_re = r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang
        candidates = (m.group('url') for m in re.finditer(link_re, webpage))
        # Drop self-links and keep only URLs a video/playlist extractor accepts
        items = [
            link for link in candidates
            if link != url
            and any(ie.suitable(link) for ie in (ArteTVIE, ArteTVPlaylistIE))
        ]

        # Use the part of the page title before the last '|'
        page_title = self._generic_title('', webpage, default='')
        title = strip_or_none(page_title.rsplit('|', 1)[0]) or None

        return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
                                          description=self._og_search_description(webpage, default=None))