]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/arte.py
[extractors] Use new framework for existing embeds (#4307)
[yt-dlp.git] / yt_dlp / extractor / arte.py
CommitLineData
d5822b96 1import re
d5822b96
PH
2
3from .common import InfoExtractor
4from ..utils import (
c0892b2b 5 ExtractorError,
051d6b45 6 GeoRestrictedError,
d24a2b20 7 int_or_none,
051d6b45 8 parse_iso8601,
4dfbf869 9 parse_qs,
50e93e03 10 strip_or_none,
051d6b45 11 traverse_obj,
8bdd16b4 12 url_or_none,
d5822b96
PH
13)
14
d5822b96 15
class ArteTVBaseIE(InfoExtractor):
    # Regex alternation of the language codes accepted in the <lang> part of ARTE URLs;
    # it is interpolated into the _VALID_URL patterns of the extractors below
    _ARTE_LANGUAGES = '|'.join(('fr', 'de', 'en', 'es', 'it', 'pl'))
    # Root of the player API; '/config/<lang>/<id>' and '/playlist/<lang>/<id>' are appended to it
    _API_BASE = 'https://api.arte.tv/api/player/v2'
8bdd16b4 19
20
class ArteTVIE(ArteTVBaseIE):
    # Matches arte.tv video pages, player-API config URLs and internal arte://program URLs
    _VALID_URL = r'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                    |arte://program)
                        /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
        'info_dict': {
            'id': '100103-000-A',
            'title': 'USA: Dyskryminacja na porodówce',
            'description': 'md5:242017b7cce59ffae340a54baefcafb1',
            'alt_title': 'ARTE Reportage',
            'upload_date': '20201103',
            'duration': 554,
            'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
            'timestamp': 1604417980,
            'ext': 'mp4',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'info_dict': {
            'id': '110371-000-A',
            'ext': 'mp4',
            'upload_date': '20220718',
            'duration': 154,
            'timestamp': 1658162460,
            'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
            'title': 'La chaleur, supplice des arbres de rue',
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }]

    _GEO_BYPASS = True

    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Breaks a stream version code (the 'ml5' value) into voice / audio-description / subtitle parts
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': {
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        },
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': {
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        },
        'SAT': {
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        },
    }

    def _real_extract(self, url):
        """Fetch the player config for ``url`` and build the info dict from it.

        Raises GeoRestrictedError when the config flags a restricted area and
        ExtractorError (expected) when the config carries no ``rights`` entry.
        """
        match = self._match_valid_url(url)
        video_id = match.group('id')
        lang = match.group('lang') or match.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)

        geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            area_code = geoblocking['code']
            raise GeoRestrictedError(
                f'Video restricted to {area_code!r}',
                countries=self._COUNTRIES_MAP.get(area_code, ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        # The rights check above succeeded, so this path is known to exist
        attributes = config['data']['attributes']

        formats, subtitles = [], {}
        for stream in attributes['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            version = stream['versions'][0]
            version_code = version['eStat']['ml5']

            code_match = self._VERSION_CODE_RE.match(version_code)
            if code_match is None:
                lang_pref = -1
            else:
                # Ordered from most to least significant digit of the preference number
                criteria = (
                    code_match.group('vlang') == language_code,  # we prefer voice in the requested language
                    not code_match.group('audio_desc'),  # and not the audio description version
                    bool(code_match.group('original_voice')),  # but if voice is not in the requested language, at least choose the original voice
                    code_match.group('sub_lang') == language_code,  # if subtitles are present, we prefer them in the requested language
                    not code_match.group('has_sub'),  # but we prefer no subtitles otherwise
                    not code_match.group('sdh_sub'),  # and we prefer not the hard-of-hearing subtitles if there are subtitles
                )
                lang_pref = int(''.join(str(int(flag)) for flag in criteria))

            protocol = stream['protocol']
            format_note = f'{version.get("label", "unknown")} [{version.get("shortLabel", "?")}]'

            if protocol.startswith('HLS'):
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=version_code, fatal=False)
                for fmt in fmts:
                    fmt['format_note'] = format_note
                    fmt['language_preference'] = lang_pref
                formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)

            elif protocol in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{protocol}-{version_code}',
                    'url': stream['url'],
                    'format_note': format_note,
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(f'Skipping stream with unknown protocol {protocol}')

            # TODO: chapters from stream['segments']?
            # The JS also looks for chapters in config['data']['attributes']['chapters'],
            # but I am yet to find a video having those

        self._sort_formats(formats)

        metadata = attributes['metadata']
        thumbnails = [
            {'url': image['url'], 'id': image.get('caption')}
            for image in metadata.get('images') or []
            if url_or_none(image.get('url'))
        ]

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # The subtitle, when present, becomes the title and the title the alt_title
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(attributes, ('rights', 'begin'), expected_type=parse_iso8601),
            'is_live': attributes.get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': thumbnails,
        }
c40f5cf4 203
24114fee 204
class ArteTVEmbedIE(InfoExtractor):
    # Handles the embeddable player page, whose `json_url` query parameter points at the player-API config
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available'
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Hand the config URL found in the ``json_url`` query parameter over to ArteTVIE."""
        config_url = parse_qs(url)['json_url'][0]
        return self.url_result(
            config_url,
            ie=ArteTVIE.ie_key(),
            video_id=ArteTVIE._match_id(config_url))
4b492e35
S
229
230
class ArteTVPlaylistIE(ArteTVBaseIE):
    # Collections are addressed by an `RC-<6 digits>` id in place of a programme id
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        """Download the playlist JSON and return one url_transparent entry per item with a config URL."""
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        response = self._download_json(
            f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)
        playlist = response['data']['attributes']

        entries = []
        # The lambda filter keeps only the items that carry a truthy config URL
        for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url'])):
            entries.append({
                '_type': 'url_transparent',
                'url': video['config']['url'],
                'ie_key': ArteTVIE.ie_key(),
                'id': video.get('providerId'),
                'title': video.get('title'),
                'alt_title': video.get('subtitle'),
                'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
                'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
            })

        playlist_title = traverse_obj(playlist, ('metadata', 'title'))
        playlist_description = traverse_obj(playlist, ('metadata', 'description'))
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
50e93e03 265
266
class ArteTVCategoryIE(ArteTVBaseIE):
    # Category pages live under /<lang>/videos/<slug>[/<sub-slug>...]; see suitable() for
    # how URLs that belong to ArteTVIE / ArteTVPlaylistIE are excluded
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept ``url`` only if neither ArteTVIE nor ArteTVPlaylistIE claims it."""
        return (
            not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
            and super().suitable(url))

    def _real_extract(self, url):
        """Scrape the category page for links to ARTE videos/playlists and return them as a playlist.

        The playlist title comes from the og:title, then the HTML <title>
        (with anything after the last '|' dropped), then the generic title
        derived from the URL.
        """
        lang, playlist_id = self._match_valid_url(url).groups()
        webpage = self._download_webpage(url, playlist_id)

        items = []
        for link in re.finditer(
                r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
                webpage):
            video_url = link.group('url')
            if video_url == url:
                # Do not let the category page list itself
                continue
            if any(ie.suitable(video_url) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
                items.append(video_url)

        # Fix: _html_search_regex() needs the text to search and a field name; the previous
        # call passed neither and raised TypeError whenever the og:title was missing
        title = (self._og_search_title(webpage, default=None)
                 or self._html_search_regex(
                     r'<title\b[^>]*>([^<]+)</title>', webpage, 'title', default=None))
        # Fix: tolerate a page with no title at all instead of crashing on None.rsplit()
        title = strip_or_none((title or '').rsplit('|', 1)[0]) or self._generic_title(url)

        return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
                                          description=self._og_search_description(webpage, default=None))