]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/arte.py
[extractor] Support multiple archive ids for one video (#4307)
[yt-dlp.git] / yt_dlp / extractor / arte.py
1 import re
2
3 from .common import InfoExtractor
4 from ..utils import (
5 ExtractorError,
6 GeoRestrictedError,
7 int_or_none,
8 parse_iso8601,
9 parse_qs,
10 strip_or_none,
11 traverse_obj,
12 url_or_none,
13 )
14
15
class ArteTVBaseIE(InfoExtractor):
    """Constants shared by the ARTE extractors defined in this module."""

    # Regex alternation of the language codes accepted in ARTE URLs;
    # interpolated into the _VALID_URL patterns of the subclasses
    _ARTE_LANGUAGES = '|'.join(('fr', 'de', 'en', 'es', 'it', 'pl'))
    # Root of the player API; '/config/...' and '/playlist/...' are appended to it
    _API_BASE = 'https://api.arte.tv/api/player/v2'
19
20
class ArteTVIE(ArteTVBaseIE):
    """Extractor for single ARTE programs, addressed by website URL, player API config URL or arte:// URL."""

    _VALID_URL = r'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                    |arte://program)
                        /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
        'info_dict': {
            'id': '100103-000-A',
            'title': 'USA: Dyskryminacja na porodówce',
            'description': 'md5:242017b7cce59ffae340a54baefcafb1',
            'alt_title': 'ARTE Reportage',
            'upload_date': '20201103',
            'duration': 554,
            'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
            'timestamp': 1604417980,
            'ext': 'mp4',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'info_dict': {
            'id': '110371-000-A',
            'ext': 'mp4',
            'upload_date': '20220718',
            'duration': 154,
            'timestamp': 1658162460,
            'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
            'title': 'La chaleur, supplice des arbres de rue',
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }]

    _GEO_BYPASS = True

    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Splits a stream version code into its voice part
    # ("V", optional "O", language, optional "AUD") and an optional
    # subtitle part ("-ST", optional "M", language)
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': {
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        },
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': {
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        },
        'SAT': {
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        },
    }

    def _version_language_preference(self, version_code, language_code):
        """Score a stream version code against the requested language.

        Returns -1 when the code does not match _VERSION_CODE_RE. Otherwise
        returns an integer whose decimal digits are the 0/1 outcomes of the
        criteria below, most significant first, so that a larger value means
        a more preferable stream.
        """
        version = self._VERSION_CODE_RE.match(version_code)
        if not version:
            return -1

        criteria = (
            # we prefer voice in the requested language
            version.group('vlang') == language_code,
            # and not the audio description version
            not version.group('audio_desc'),
            # but if voice is not in the requested language, at least choose the original voice
            bool(version.group('original_voice')),
            # if subtitles are present, we prefer them in the requested language
            version.group('sub_lang') == language_code,
            # but we prefer no subtitles otherwise
            not version.group('has_sub'),
            # and we prefer not the hard-of-hearing subtitles if there are subtitles
            not version.group('sdh_sub'),
        )
        return int(''.join('1' if satisfied else '0' for satisfied in criteria))

    def _real_extract(self, url):
        """Download the player config for the program in ``url`` and build its info dict.

        Raises GeoRestrictedError when the config reports a restricted area
        and ExtractorError when the config carries no rights information.
        """
        match = self._match_valid_url(url)
        video_id = match.group('id')
        lang = match.group('lang') or match.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)

        geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            area_code = geoblocking['code']
            raise GeoRestrictedError(
                f'Video restricted to {area_code!r}',
                countries=self._COUNTRIES_MAP.get(area_code, ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        # Only looked up after the rights check above has succeeded
        attributes = config['data']['attributes']

        formats, subtitles = [], {}
        for stream in attributes['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            version = stream['versions'][0]
            version_code = version['eStat']['ml5']
            preference = self._version_language_preference(version_code, language_code)

            protocol = stream['protocol']
            format_note = f'{version.get("label", "unknown")} [{version.get("shortLabel", "?")}]'

            if protocol.startswith('HLS'):
                hls_formats, hls_subtitles = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=version_code, fatal=False)
                for hls_format in hls_formats:
                    hls_format['format_note'] = format_note
                    hls_format['language_preference'] = preference
                formats.extend(hls_formats)
                self._merge_subtitles(hls_subtitles, target=subtitles)

            elif protocol in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{protocol}-{version_code}',
                    'url': stream['url'],
                    'format_note': format_note,
                    'language_preference': preference,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(f'Skipping stream with unknown protocol {protocol}')

            # TODO: chapters from stream['segments']?
            # The JS also looks for chapters in config['data']['attributes']['chapters'],
            # but I am yet to find a video having those

        self._sort_formats(formats)

        metadata = attributes['metadata']

        # Keep only images that carry a usable URL
        thumbnails = [
            {'url': image['url'], 'id': image.get('caption')}
            for image in metadata.get('images') or []
            if url_or_none(image.get('url'))
        ]

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # 'subtitle' is used as the title when present; 'title' then becomes alt_title
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(attributes, ('rights', 'begin'), expected_type=parse_iso8601),
            'is_live': attributes.get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': thumbnails,
        }
203
204
class ArteTVEmbedIE(InfoExtractor):
    """Extractor for ARTE player embed URLs, which carry the config URL in ``json_url``."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available'
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the ``src`` of every iframe/script tag in ``webpage`` that points at the ARTE player."""
        embeds = re.finditer(
            r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1',
            webpage)
        return [embed.group('url') for embed in embeds]

    def _real_extract(self, url):
        """Hand the URL found in the ``json_url`` query parameter over to ArteTVIE."""
        # _VALID_URL requires a json_url parameter, so the key is expected to exist
        json_url = parse_qs(url)['json_url'][0]
        program_id = ArteTVIE._match_id(json_url)
        return self.url_result(json_url, ie=ArteTVIE.ie_key(), video_id=program_id)
234
235
class ArteTVPlaylistIE(ArteTVBaseIE):
    """Extractor for ARTE collections (``RC-`` ids), resolved through the player playlist API."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        """Build a playlist whose entries defer to ArteTVIE via each item's config URL."""
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        response = self._download_json(
            f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)
        playlist = response['data']['attributes']

        # Only items that provide a config URL are turned into entries
        videos = traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))

        entries = []
        for video in videos:
            thumbnail = url_or_none(traverse_obj(video, ('mainImage', 'url')))
            duration = int_or_none(traverse_obj(video, ('duration', 'seconds')))
            entries.append({
                '_type': 'url_transparent',
                'url': video['config']['url'],
                'ie_key': ArteTVIE.ie_key(),
                'id': video.get('providerId'),
                'title': video.get('title'),
                'alt_title': video.get('subtitle'),
                'thumbnail': thumbnail,
                'duration': duration,
            })

        playlist_title = traverse_obj(playlist, ('metadata', 'title'))
        playlist_description = traverse_obj(playlist, ('metadata', 'description'))
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
270
271
class ArteTVCategoryIE(ArteTVBaseIE):
    """Extractor for ARTE category pages, returned as a playlist of the videos and collections they link to."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept ``url`` only if neither ArteTVIE nor ArteTVPlaylistIE claims it."""
        return (
            not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
            and super().suitable(url))

    def _real_extract(self, url):
        """Scrape the category page for links handled by ArteTVIE or ArteTVPlaylistIE.

        The playlist title is taken from og:title, then from the <title> tag,
        and finally from the generic title derived from ``url``.
        """
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        webpage = self._download_webpage(url, playlist_id)

        items = []
        for video in re.finditer(
                r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
                webpage):
            video = video.group('url')
            # Skip links pointing back at the page being extracted
            if video == url:
                continue
            if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
                items.append(video)

        # _html_search_regex needs the text to search and a field name; without them the
        # <title> fallback raised TypeError whenever og:title was missing. The trailing
        # `or ''` keeps rsplit() from being called on None when neither lookup succeeds.
        title = (self._og_search_title(webpage, default=None)
                 or self._html_search_regex(
                     r'<title\b[^>]*>([^<]+)</title>', webpage, 'title', default=None)
                 or '')
        # Keep only the part before the last '|'
        title = strip_or_none(title.rsplit('|', 1)[0]) or self._generic_title(url)

        return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
                                          description=self._og_search_description(webpage, default=None))