]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/arte.py
dfbfe03c3ca7550860813b3880100b87ac2fb5bb
[yt-dlp.git] / yt_dlp / extractor / arte.py
1 import re
2
3 from .common import InfoExtractor
4 from ..utils import (
5 ExtractorError,
6 GeoRestrictedError,
7 int_or_none,
8 parse_iso8601,
9 parse_qs,
10 strip_or_none,
11 traverse_obj,
12 url_or_none,
13 )
14
15
class ArteTVBaseIE(InfoExtractor):
    # Regex alternation of the language codes that may appear in ARTE URLs;
    # interpolated into the _VALID_URL patterns of the extractors below
    _ARTE_LANGUAGES = '|'.join(('fr', 'de', 'en', 'es', 'it', 'pl'))
    # Common prefix of the player API endpoints (config/playlist) queried below
    _API_BASE = 'https://api.arte.tv/api/player/v2'
19
20
class ArteTVIE(ArteTVBaseIE):
    # Accepts regular video pages, direct player-API config URLs and
    # `arte://program/<id>` URLs; only the first two forms carry a language code
    _VALID_URL = r'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                    |arte://program)
                        /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
        'info_dict': {
            'id': '100103-000-A',
            'title': 'USA: Dyskryminacja na porodówce',
            'description': 'md5:242017b7cce59ffae340a54baefcafb1',
            'alt_title': 'ARTE Reportage',
            'upload_date': '20201103',
            'duration': 554,
            'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
            'timestamp': 1604417980,
            'ext': 'mp4',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'info_dict': {
            'id': '110371-000-A',
            'ext': 'mp4',
            'upload_date': '20220718',
            'duration': 154,
            'timestamp': 1658162460,
            'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
            'title': 'La chaleur, supplice des arbres de rue',
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
        'info_dict': {
            'id': '110203-006-A',
            'chapters': 'count:16',
            'description': 'md5:cf592f1df52fe52007e3f8eac813c084',
            'alt_title': 'Zaz',
            'title': 'Baloise Session 2022',
            'timestamp': 1668445200,
            'duration': 4054,
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/ubQjmVCGyRx3hmBuZEK9QZ/940x530',
            'upload_date': '20221114',
            'ext': 'mp4',
        },
        'expected_warnings': ['geo restricted']
    }]

    _GEO_BYPASS = True

    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Splits a stream version code (read from `eStat.ml5` below) into its
    # voice-language, audio-description and subtitle components
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': (
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        ),
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': (
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        ),
        'SAT': (
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        ),
    }

    def _real_extract(self, url):
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        # NOTE(review): neither language group can match for `arte://program/...` URLs,
        # so `lang` is None there and is interpolated as such into the API URL - confirm intent
        lang = mobj.group('lang') or mobj.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)

        geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            # Read the code leniently so that a response without it still results in
            # a GeoRestrictedError (with the default countries) rather than a KeyError
            geo_code = geoblocking.get('code')
            raise GeoRestrictedError(
                f'Video restricted to {geo_code!r}',
                countries=self._COUNTRIES_MAP.get(geo_code, ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        formats, subtitles = [], {}
        # HLS formats whose short label starts with 'cc' or 'OGsub'; appended after all other formats
        secondary_formats = []
        for stream in config['data']['attributes']['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            stream_version = stream['versions'][0]
            stream_version_code = stream_version['eStat']['ml5']

            lang_pref = -1
            m = self._VERSION_CODE_RE.match(stream_version_code)
            if m:
                # Each criterion contributes one digit (0 or 1); earlier criteria are more significant
                lang_pref = int(''.join('01'[x] for x in (
                    m.group('vlang') == language_code,  # we prefer voice in the requested language
                    not m.group('audio_desc'),  # and not the audio description version
                    bool(m.group('original_voice')),  # but if voice is not in the requested language, at least choose the original voice
                    m.group('sub_lang') == language_code,  # if subtitles are present, we prefer them in the requested language
                    not m.group('has_sub'),  # but we prefer no subtitles otherwise
                    not m.group('sdh_sub'),  # and we prefer not the hard-of-hearing subtitles if there are subtitles
                )))

            short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
            format_note = f'{stream_version.get("label", "unknown")} [{short_label}]'
            if stream['protocol'].startswith('HLS'):
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
                for fmt in fmts:
                    fmt.update({
                        'format_note': format_note,
                        'language_preference': lang_pref,
                    })
                if short_label.startswith(('cc', 'OGsub')):
                    secondary_formats.extend(fmts)
                else:
                    formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)

            elif stream['protocol'] in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{stream["protocol"]}-{stream_version_code}',
                    'url': stream['url'],
                    'format_note': format_note,
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')

        formats.extend(secondary_formats)
        self._remove_duplicate_formats(formats)

        metadata = config['data']['attributes']['metadata']

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # The API's `subtitle` is preferred as the title; `title` then becomes the alt_title
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
            'is_live': config['data']['attributes'].get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [
                {'url': image['url'], 'id': image.get('caption')}
                for image in metadata.get('images') or [] if url_or_none(image.get('url'))
            ],
            # TODO: chapters may also be in stream['segments']?
            'chapters': traverse_obj(config, ('data', 'attributes', 'chapters', 'elements', ..., {
                'start_time': 'startTime',
                'title': 'title',
            })) or None,
        }
226
227
class ArteTVEmbedIE(InfoExtractor):
    # Player embed pages; the video itself is referenced through the `json_url` query parameter
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available'
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Take the first `json_url` value of the query string and delegate
        # the actual extraction of that URL to ArteTVIE
        config_url = parse_qs(url)['json_url'][0]
        return self.url_result(
            config_url,
            ie=ArteTVIE.ie_key(),
            video_id=ArteTVIE._match_id(config_url))
252
253
class ArteTVPlaylistIE(ArteTVBaseIE):
    # Collection pages, identified by an `RC-<six digits>` id
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        mobj = self._match_valid_url(url)
        lang = mobj.group('lang')
        playlist_id = mobj.group('id')

        api_response = self._download_json(
            f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)
        playlist = api_response['data']['attributes']

        entries = []
        # Only items for which `config.url` is present and truthy are selected
        for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url'])):
            entries.append({
                # The per-video metadata below accompanies the result that ArteTVIE extracts
                '_type': 'url_transparent',
                'url': video['config']['url'],
                'ie_key': ArteTVIE.ie_key(),
                'id': video.get('providerId'),
                'title': video.get('title'),
                'alt_title': video.get('subtitle'),
                'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
                'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
            })

        playlist_title = traverse_obj(playlist, ('metadata', 'title'))
        playlist_description = traverse_obj(playlist, ('metadata', 'description'))
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
288
289
class ArteTVCategoryIE(ArteTVBaseIE):
    # Any other path below /<lang>/videos/; URLs that the video or playlist
    # extractors accept are rejected in suitable()
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        # Give precedence to the more specific extractors
        if any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE)):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        webpage = self._download_webpage(url, playlist_id)

        # Absolute links to pages of the same language edition found in <a href=...>
        link_re = r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang
        linked_urls = (mobj.group('url') for mobj in re.finditer(link_re, webpage))
        # Keep the links that one of the other extractors handles, skipping links to this page itself
        items = [
            linked_url for linked_url in linked_urls
            if linked_url != url
            and any(ie.suitable(linked_url) for ie in (ArteTVIE, ArteTVPlaylistIE))
        ]

        # Use the part of the page title before the last '|'
        page_title = self._generic_title('', webpage, default='')
        title = strip_or_none(page_title.rsplit('|', 1)[0]) or None
        description = self._og_search_description(webpage, default=None)

        return self.playlist_from_matches(
            items, playlist_id=playlist_id, playlist_title=title, description=description)