]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/arte.py
[ie/crunchyroll] Fix stream extraction (#10005)
[yt-dlp.git] / yt_dlp / extractor / arte.py
CommitLineData
d5822b96 1import re
d5822b96
PH
2
3from .common import InfoExtractor
4from ..utils import (
c0892b2b 5 ExtractorError,
051d6b45 6 GeoRestrictedError,
d24a2b20 7 int_or_none,
051d6b45 8 parse_iso8601,
4dfbf869 9 parse_qs,
50e93e03 10 strip_or_none,
051d6b45 11 traverse_obj,
8bdd16b4 12 url_or_none,
d5822b96
PH
13)
14
d5822b96 15
class ArteTVBaseIE(InfoExtractor):
    """Constants shared by the ARTE extractors defined in this module."""

    # Root of the ARTE player API; subclasses append '/config/...' or '/playlist/...'
    _API_BASE = 'https://api.arte.tv/api/player/v2'
    # Language codes accepted in ARTE URLs, written as a regex alternation
    # so the string can be interpolated directly into _VALID_URL patterns
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
8bdd16b4 19
20
class ArteTVIE(ArteTVBaseIE):
    """Extractor for a single ARTE program.

    Accepts the public site URL, the player API config URL, or an
    ``arte://program/<id>`` URL (see _VALID_URL).
    """

    _VALID_URL = r'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                    |arte://program)
                        /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
        'info_dict': {
            'id': '100103-000-A',
            'title': 'USA: Dyskryminacja na porodówce',
            'description': 'md5:242017b7cce59ffae340a54baefcafb1',
            'alt_title': 'ARTE Reportage',
            'upload_date': '20201103',
            'duration': 554,
            'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
            'timestamp': 1604417980,
            'ext': 'mp4',
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
        'only_matching': True,
    }, {
        'note': 'age-restricted',
        'url': 'https://www.arte.tv/de/videos/006785-000-A/the-element-of-crime/',
        'info_dict': {
            'id': '006785-000-A',
            'description': 'md5:c2f94fdfefc8a280e4dab68ab96ab0ba',
            'title': 'The Element of Crime',
            'timestamp': 1696111200,
            'duration': 5849,
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/q82dTTfyuCXupPsGxXsd7B/940x530',
            'upload_date': '20230930',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://www.arte.tv/de/videos/085374-003-A/im-hohen-norden-geboren/',
        'info_dict': {
            'id': '085374-003-A',
            'ext': 'mp4',
            'description': 'md5:ab79ec7cc472a93164415b4e4916abf9',
            'timestamp': 1702872000,
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/TnyHBfPxv3v2GEY3suXGZP/940x530',
            'duration': 2594,
            'title': 'Die kurze Zeit der Jugend',
            'alt_title': 'Im hohen Norden geboren',
            'upload_date': '20231218',
            'subtitles': {
                'fr': 'mincount:1',
                'fr-acc': 'mincount:1',
            },
        },
    }]

    _GEO_BYPASS = True

    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Splits a stream version code (the `eStat.ml5` field) into the voice
    # language, audio-description marker and optional subtitle part
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': (
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        ),
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': (
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        ),
        'SAT': (
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        ),
    }

    @staticmethod
    def _fix_accessible_subs_locale(subs):
        """Regroup subtitle tracks so accessibility tracks get their own key.

        A track whose URL ends in ``-MAL.m3u8`` is filed under ``<lang>-acc``;
        every other track stays under its original language key.

        ``subs`` maps a language code to a list of subtitle format dicts.
        A new dict is returned; ``subs`` itself is only read.
        """
        updated_subs = {}
        for lang, sub_formats in subs.items():
            for fmt in sub_formats:
                # Choose the key per track. Appending the suffix to `lang`
                # itself would carry it over to the remaining tracks of the
                # same language (and stack up as '-acc-acc' on a second match).
                # `or ''` also covers an explicit None url.
                is_accessible = (fmt.get('url') or '').endswith('-MAL.m3u8')
                key = f'{lang}-acc' if is_accessible else lang
                updated_subs.setdefault(key, []).append(fmt)
        return updated_subs

    def _real_extract(self, url):
        """Download the player config for the program and build its info dict.

        Raises GeoRestrictedError when the config reports a restricted area,
        and ExtractorError (expected) when the config carries no rights data.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        lang = mobj.group('lang') or mobj.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        # NOTE(review): the header presumably unlocks age-restricted programs
        # (see the 'age-restricted' test) - confirm against the API
        config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id, headers={
            'x-validated-age': '18',
        })

        geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            # The code may be absent or unknown; fall back to DE/FR instead
            # of failing with a KeyError before the real error is reported
            geo_code = geoblocking.get('code')
            raise GeoRestrictedError(f'Video restricted to {geo_code!r}',
                                     countries=self._COUNTRIES_MAP.get(geo_code, ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        formats, subtitles = [], {}
        # Collected separately so they can be appended after all other formats
        secondary_formats = []
        for stream in config['data']['attributes']['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            stream_version = stream['versions'][0]
            stream_version_code = stream_version['eStat']['ml5']

            lang_pref = -1
            m = self._VERSION_CODE_RE.match(stream_version_code)
            if m:
                # Each criterion contributes one digit (0 or 1), most
                # significant first, so earlier criteria dominate later ones
                lang_pref = int(''.join('01'[x] for x in (
                    m.group('vlang') == language_code,  # we prefer voice in the requested language
                    not m.group('audio_desc'),  # and not the audio description version
                    bool(m.group('original_voice')),  # but if voice is not in the requested language, at least choose the original voice
                    m.group('sub_lang') == language_code,  # if subtitles are present, we prefer them in the requested language
                    not m.group('has_sub'),  # but we prefer no subtitles otherwise
                    not m.group('sdh_sub'),  # and we prefer not the hard-of-hearing subtitles if there are subtitles
                )))

            short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
            if 'HLS' in stream['protocol']:
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
                for fmt in fmts:
                    fmt.update({
                        'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
                        'language_preference': lang_pref,
                    })
                if short_label.startswith(('cc', 'OGsub')):
                    secondary_formats.extend(fmts)
                else:
                    formats.extend(fmts)
                subs = self._fix_accessible_subs_locale(subs)
                self._merge_subtitles(subs, target=subtitles)

            elif stream['protocol'] in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{stream["protocol"]}-{stream_version_code}',
                    'url': stream['url'],
                    'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')

        formats.extend(secondary_formats)
        self._remove_duplicate_formats(formats)

        metadata = config['data']['attributes']['metadata']

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # Prefer the subtitle as title; the main title then becomes alt_title
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
            'is_live': config['data']['attributes'].get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [
                {'url': image['url'], 'id': image.get('caption')}
                for image in metadata.get('images') or [] if url_or_none(image.get('url'))
            ],
            # TODO: chapters may also be in stream['segments']?
            'chapters': traverse_obj(config, ('data', 'attributes', 'chapters', 'elements', ..., {
                'start_time': 'startTime',
                'title': 'title',
            })) or None,
        }
c40f5cf4 246
24114fee 247
class ArteTVEmbedIE(InfoExtractor):
    """Extractor for the embeddable ARTE player page, whose ``json_url`` parameter names the config URL."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available',
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Delegate to ArteTVIE using the config URL found in the query string."""
        # Only the first json_url value of the query string is used
        config_url = parse_qs(url)['json_url'][0]
        program_id = ArteTVIE._match_id(config_url)
        return self.url_result(config_url, ie=ArteTVIE.ie_key(), video_id=program_id)
4b492e35
S
272
273
class ArteTVPlaylistIE(ArteTVBaseIE):
    """Extractor for ARTE collections (``RC-`` ids), read from the player API playlist endpoint."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are resolved by ArteTVIE."""
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        response = self._download_json(
            f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)
        playlist = response['data']['attributes']

        entries = []
        # Keep only the items that carry a config URL to pass on to ArteTVIE
        for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url'])):
            entries.append({
                '_type': 'url_transparent',
                'url': video['config']['url'],
                'ie_key': ArteTVIE.ie_key(),
                'id': video.get('providerId'),
                'title': video.get('title'),
                'alt_title': video.get('subtitle'),
                'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
                'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
            })

        playlist_title = traverse_obj(playlist, ('metadata', 'title'))
        playlist_description = traverse_obj(playlist, ('metadata', 'description'))
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
50e93e03 308
309
class ArteTVCategoryIE(ArteTVBaseIE):
    """Extractor for ARTE category pages, built by scraping program and collection links from the page."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Match only URLs that neither ArteTVIE nor ArteTVPlaylistIE accepts."""
        # The category pattern is broad, so the more specific extractors win
        if any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE)):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Collect the program and collection links of the page into a playlist."""
        lang, playlist_id = self._match_valid_url(url).groups()
        webpage = self._download_webpage(url, playlist_id)

        link_matches = re.finditer(
            r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
            webpage)
        # Skip links back to this page and keep only those that another
        # ARTE extractor can handle
        items = [
            link for link in (found.group('url') for found in link_matches)
            if link != url and any(ie.suitable(link) for ie in (ArteTVIE, ArteTVPlaylistIE))
        ]

        # Use the part of the page title before the last '|'
        raw_title = self._generic_title('', webpage, default='')
        title = strip_or_none(raw_title.rsplit('|', 1)[0]) or None

        return self.playlist_from_matches(
            items, playlist_id=playlist_id, playlist_title=title,
            description=self._og_search_description(webpage, default=None))