]> jfr.im git - yt-dlp.git/blame_incremental - yt_dlp/extractor/arte.py
[extractors] Use new framework for existing embeds (#4307)
[yt-dlp.git] / yt_dlp / extractor / arte.py
... / ...
CommitLineData
1import re
2
3from .common import InfoExtractor
4from ..utils import (
5 ExtractorError,
6 GeoRestrictedError,
7 int_or_none,
8 parse_iso8601,
9 parse_qs,
10 strip_or_none,
11 traverse_obj,
12 url_or_none,
13)
14
15
class ArteTVBaseIE(InfoExtractor):
    """Shared constants for the Arte extractors defined in this module."""

    # Regex alternation of the language codes interpolated into the
    # `_VALID_URL` patterns of the subclasses
    _ARTE_LANGUAGES = '|'.join(('fr', 'de', 'en', 'es', 'it', 'pl'))
    # Root of the player API; subclasses append `/config/...` or `/playlist/...`
    _API_BASE = 'https://api.arte.tv/api/player/v2'
19
20
class ArteTVIE(ArteTVBaseIE):
    """Extractor for a single Arte programme (or the LIVE stream).

    Matches website video pages, player API config URLs and the internal
    `arte://program/<id>` form.
    """

    _VALID_URL = r'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                    |arte://program)
                        /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
        'info_dict': {
            'id': '100103-000-A',
            'title': 'USA: Dyskryminacja na porodówce',
            'description': 'md5:242017b7cce59ffae340a54baefcafb1',
            'alt_title': 'ARTE Reportage',
            'upload_date': '20201103',
            'duration': 554,
            'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
            'timestamp': 1604417980,
            'ext': 'mp4',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'info_dict': {
            'id': '110371-000-A',
            'ext': 'mp4',
            'upload_date': '20220718',
            'duration': 154,
            'timestamp': 1658162460,
            'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
            'title': 'La chaleur, supplice des arbres de rue',
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }]

    _GEO_BYPASS = True

    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Parses a stream version code: "V", optional "O" (original voice),
    # optional voice language, optional "AUD" (audio description) and an
    # optional "-ST[M]<lang>" subtitle suffix
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # Geoblocking code -> country codes passed to GeoRestrictedError;
    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': {
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        },
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': {
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        },
        'SAT': {
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        },
    }

    def _language_preference(self, version_code, language_code):
        """Rank a stream version code against the requested language.

        Returns -1 when `version_code` does not match `_VERSION_CODE_RE`.
        Otherwise returns the integer whose decimal digits are the 0/1
        outcomes of the criteria below, most important criterion first.
        """
        parsed = self._VERSION_CODE_RE.match(version_code)
        if not parsed:
            return -1

        criteria = (
            # we prefer voice in the requested language
            parsed.group('vlang') == language_code,
            # and not the audio description version
            not parsed.group('audio_desc'),
            # but if voice is not in the requested language, at least choose the original voice
            bool(parsed.group('original_voice')),
            # if subtitles are present, we prefer them in the requested language
            parsed.group('sub_lang') == language_code,
            # but we prefer no subtitles otherwise
            not parsed.group('has_sub'),
            # and we prefer not the hard-of-hearing subtitles if there are subtitles
            not parsed.group('sdh_sub'),
        )
        return int(''.join(str(int(flag)) for flag in criteria))

    def _real_extract(self, url):
        """Download the player config for `url` and build the info dict.

        Raises GeoRestrictedError when the config reports a restricted area
        and ExtractorError (expected) when the config carries no rights.
        """
        url_match = self._match_valid_url(url)
        video_id = url_match.group('id')
        lang = url_match.group('lang') or url_match.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)

        geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            area_code = geoblocking['code']
            raise GeoRestrictedError(
                f'Video restricted to {area_code!r}',
                countries=self._COUNTRIES_MAP.get(area_code, ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        # Plain subscripting from here on: a config that passed the checks
        # above but lacks these keys is treated as a hard error
        attributes = config['data']['attributes']

        formats = []
        subtitles = {}
        for stream in attributes['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            version = stream['versions'][0]
            version_code = version['eStat']['ml5']

            lang_pref = self._language_preference(version_code, language_code)
            format_note = f'{version.get("label", "unknown")} [{version.get("shortLabel", "?")}]'
            protocol = stream['protocol']

            if protocol.startswith('HLS'):
                hls_formats, hls_subtitles = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=version_code, fatal=False)
                for hls_format in hls_formats:
                    hls_format['format_note'] = format_note
                    hls_format['language_preference'] = lang_pref
                formats.extend(hls_formats)
                self._merge_subtitles(hls_subtitles, target=subtitles)

            elif protocol in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{protocol}-{version_code}',
                    'url': stream['url'],
                    'format_note': format_note,
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(f'Skipping stream with unknown protocol {protocol}')

            # TODO: chapters from stream['segments']?
            # The JS also looks for chapters in config['data']['attributes']['chapters'],
            # but I am yet to find a video having those

        self._sort_formats(formats)

        metadata = attributes['metadata']

        thumbnails = []
        for image in metadata.get('images') or []:
            # skip images whose URL is missing or not a usable URL
            if url_or_none(image.get('url')):
                thumbnails.append({'url': image['url'], 'id': image.get('caption')})

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # the subtitle, when present, becomes the title and the
            # original title is demoted to alt_title
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
            'is_live': attributes.get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': thumbnails,
        }
203
204
class ArteTVEmbedIE(InfoExtractor):
    """Extractor for the embeddable Arte player (`/player/vN/index.php`).

    The player URL carries the API config URL in its `json_url` query
    parameter; extraction is delegated to ArteTVIE with that URL.
    """

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available'
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Hand the `json_url` query parameter of `url` over to ArteTVIE."""
        # first value of the `json_url` parameter
        config_url = parse_qs(url)['json_url'][0]
        programme_id = ArteTVIE._match_id(config_url)
        return self.url_result(config_url, ie=ArteTVIE.ie_key(), video_id=programme_id)
229
230
class ArteTVPlaylistIE(ArteTVBaseIE):
    """Extractor for Arte collections (`/videos/RC-<6 digits>` URLs)."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _playlist_entries(self, playlist):
        """Yield one url_transparent entry per playlist item with a config URL."""
        # the lambda filter keeps only items for which item['config']['url'] is truthy
        for item in traverse_obj(playlist, ('items', lambda _, v: v['config']['url'])):
            yield {
                '_type': 'url_transparent',
                'url': item['config']['url'],
                'ie_key': ArteTVIE.ie_key(),
                'id': item.get('providerId'),
                'title': item.get('title'),
                'alt_title': item.get('subtitle'),
                'thumbnail': url_or_none(traverse_obj(item, ('mainImage', 'url'))),
                'duration': int_or_none(traverse_obj(item, ('duration', 'seconds'))),
            }

    def _real_extract(self, url):
        """Fetch the playlist from the player API and return a playlist result."""
        url_match = self._match_valid_url(url)
        lang = url_match.group('lang')
        playlist_id = url_match.group('id')

        response = self._download_json(
            f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)
        playlist = response['data']['attributes']

        entries = list(self._playlist_entries(playlist))
        playlist_title = traverse_obj(playlist, ('metadata', 'title'))
        playlist_description = traverse_obj(playlist, ('metadata', 'description'))

        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
265
266
class ArteTVCategoryIE(ArteTVBaseIE):
    """Extractor for Arte category pages (`/<lang>/videos/<category path>/`).

    The page is scraped for links to individual videos and collections,
    which are returned as a playlist.
    """

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Match only URLs that neither ArteTVIE nor ArteTVPlaylistIE claims."""
        # this class's own pattern also matches video and collection URLs,
        # so the more specific extractors are checked first
        return (
            not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
            and super().suitable(url))

    def _real_extract(self, url):
        """Scrape the category page and return its video/collection links as a playlist."""
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        webpage = self._download_webpage(url, playlist_id)

        items = []
        for link in re.finditer(
                r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
                webpage):
            video_url = link.group('url')
            # ignore self-links back to the category page
            if video_url == url:
                continue
            # keep only links that a video or collection extractor can handle
            if any(ie.suitable(video_url) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
                items.append(video_url)

        # _html_search_regex needs the string to search and a field name in
        # addition to the pattern; without them the fallback raised TypeError
        title = (self._og_search_title(webpage, default=None)
                 or self._html_search_regex(
                     r'<title\b[^>]*>([^<]+)</title>', webpage, 'title', default=None))
        # keep the part before the last '|'; `title` may be None when neither
        # source matched, in which case the generic title is used
        title = strip_or_none((title or '').rsplit('|', 1)[0]) or self._generic_title(url)

        return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
                                          description=self._og_search_description(webpage, default=None))