]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/arte.py
[extractor/arte] Fix title extraction
[yt-dlp.git] / yt_dlp / extractor / arte.py
CommitLineData
d5822b96 1import re
d5822b96
PH
2
3from .common import InfoExtractor
4from ..utils import (
c0892b2b 5 ExtractorError,
051d6b45 6 GeoRestrictedError,
d24a2b20 7 int_or_none,
051d6b45 8 parse_iso8601,
4dfbf869 9 parse_qs,
50e93e03 10 strip_or_none,
051d6b45 11 traverse_obj,
8bdd16b4 12 url_or_none,
d5822b96
PH
13)
14
d5822b96 15
class ArteTVBaseIE(InfoExtractor):
    """Constants shared by the ARTE extractors defined in this module."""

    # Root of the ARTE player API; subclasses append e.g. '/config/...' or '/playlist/...'
    _API_BASE = 'https://api.arte.tv/api/player/v2'
    # Regex alternation of the language codes accepted in ARTE URLs
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
8bdd16b4 19
20
class ArteTVIE(ArteTVBaseIE):
    """Extract a single ARTE programme through the player config API."""

    _VALID_URL = r'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                    |arte://program)
                        /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
        'info_dict': {
            'id': '100103-000-A',
            'title': 'USA: Dyskryminacja na porodówce',
            'description': 'md5:242017b7cce59ffae340a54baefcafb1',
            'alt_title': 'ARTE Reportage',
            'upload_date': '20201103',
            'duration': 554,
            'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
            'timestamp': 1604417980,
            'ext': 'mp4',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'info_dict': {
            'id': '110371-000-A',
            'ext': 'mp4',
            'upload_date': '20220718',
            'duration': 154,
            'timestamp': 1658162460,
            'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
            'title': 'La chaleur, supplice des arbres de rue',
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }]

    _GEO_BYPASS = True

    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Splits a stream version code (the `eStat.ml5` value) into voice and subtitle parts
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': {
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        },
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': {
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        },
        'SAT': {
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        },
    }

    def _real_extract(self, url):
        """Download the player config for ``url`` and build the info dict.

        Raises GeoRestrictedError when the config reports a restricted area and
        ExtractorError when the config carries no ``rights`` entry.
        """
        url_match = self._match_valid_url(url)
        video_id = url_match.group('id')
        # NOTE(review): neither language group is part of the `arte://program` branch
        # of _VALID_URL, so `lang` is None for such URLs - confirm this is intended
        lang = url_match.group('lang') or url_match.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)

        geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            raise GeoRestrictedError(
                f'Video restricted to {geoblocking["code"]!r}',
                countries=self._COUNTRIES_MAP.get(geoblocking['code'], ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        attributes = config['data']['attributes']

        formats, subtitles = [], {}
        for stream in attributes['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            version = stream['versions'][0]
            version_code = version['eStat']['ml5']

            code_match = self._VERSION_CODE_RE.match(version_code)
            if code_match is None:
                lang_pref = -1
            else:
                # One digit per criterion, most significant first, so that
                # earlier criteria dominate later ones when the scores are compared
                criteria = (
                    code_match.group('vlang') == language_code,  # we prefer voice in the requested language
                    not code_match.group('audio_desc'),  # and not the audio description version
                    bool(code_match.group('original_voice')),  # but if voice is not in the requested language, at least choose the original voice
                    code_match.group('sub_lang') == language_code,  # if subtitles are present, we prefer them in the requested language
                    not code_match.group('has_sub'),  # but we prefer no subtitles otherwise
                    not code_match.group('sdh_sub'),  # and we prefer not the hard-of-hearing subtitles if there are subtitles
                )
                lang_pref = int(''.join('1' if met else '0' for met in criteria))

            protocol = stream['protocol']
            format_note = f'{version.get("label", "unknown")} [{version.get("shortLabel", "?")}]'

            if protocol.startswith('HLS'):
                hls_formats, hls_subtitles = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=version_code, fatal=False)
                for hls_format in hls_formats:
                    hls_format.update({
                        'format_note': format_note,
                        'language_preference': lang_pref,
                    })
                formats.extend(hls_formats)
                self._merge_subtitles(hls_subtitles, target=subtitles)

            elif protocol in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{stream["protocol"]}-{version_code}',
                    'url': stream['url'],
                    'format_note': format_note,
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')

            # TODO: chapters from stream['segments']?
            # The JS also looks for chapters in config['data']['attributes']['chapters'],
            # but I am yet to find a video having those

        self._sort_formats(formats)

        metadata = attributes['metadata']
        thumbnails = [
            {'url': image['url'], 'id': image.get('caption')}
            for image in metadata.get('images') or [] if url_or_none(image.get('url'))
        ]

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # the subtitle is used as the title when present; the title then becomes alt_title
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
            'is_live': attributes.get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': thumbnails,
        }
c40f5cf4 203
24114fee 204
class ArteTVEmbedIE(InfoExtractor):
    """Resolve embedded ARTE player URLs to the config URL handled by ArteTVIE."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available'
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return every ARTE player URL used as an iframe/script ``src`` in ``webpage``."""
        embed_re = r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1'
        return [embed.group('url') for embed in re.finditer(embed_re, webpage)]

    def _real_extract(self, url):
        """Hand the URL from the ``json_url`` query parameter over to ArteTVIE."""
        config_url = parse_qs(url)['json_url'][0]
        return self.url_result(
            config_url, ie=ArteTVIE.ie_key(), video_id=ArteTVIE._match_id(config_url))
4b492e35
S
234
235
class ArteTVPlaylistIE(ArteTVBaseIE):
    """Extract ARTE collections (``RC-xxxxxx`` ids) through the playlist API."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        """Download the playlist and return its items as url_transparent entries for ArteTVIE."""
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        api_url = f'{self._API_BASE}/playlist/{lang}/{playlist_id}'
        playlist = self._download_json(api_url, playlist_id)['data']['attributes']

        entries = []
        # the lambda filter keeps only the items that carry a config URL
        for item in traverse_obj(playlist, ('items', lambda _, v: v['config']['url'])):
            entries.append({
                '_type': 'url_transparent',
                'url': item['config']['url'],
                'ie_key': ArteTVIE.ie_key(),
                'id': item.get('providerId'),
                'title': item.get('title'),
                'alt_title': item.get('subtitle'),
                'thumbnail': url_or_none(traverse_obj(item, ('mainImage', 'url'))),
                'duration': int_or_none(traverse_obj(item, ('duration', 'seconds'))),
            })

        playlist_title = traverse_obj(playlist, ('metadata', 'title'))
        playlist_description = traverse_obj(playlist, ('metadata', 'description'))
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
50e93e03 270
271
class ArteTVCategoryIE(ArteTVBaseIE):
    """Extract ARTE category pages as playlists of the videos and collections they link to."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Match only URLs that neither ArteTVIE nor ArteTVPlaylistIE accepts."""
        return (
            not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
            and super().suitable(url))

    def _real_extract(self, url):
        """Scrape the category page for video/collection links and return them as a playlist."""
        lang, playlist_id = self._match_valid_url(url).groups()
        webpage = self._download_webpage(url, playlist_id)

        items = []
        for link in re.finditer(
                r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
                webpage):
            video_url = link.group('url')
            # skip links back to the category page itself
            if video_url == url:
                continue
            if any(ie.suitable(video_url) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
                items.append(video_url)

        # Fixed: the <title> fallback used to be called without the text to search
        # and the field name, so it raised TypeError instead of returning a value
        title = (self._og_search_title(webpage, default=None)
                 or self._html_search_regex(
                     r'<title\b[^>]*>([^<]+)</title>', webpage, 'title', default=None))
        # Fixed: `title` can still be None here; `or ''` keeps rsplit() from raising
        # so that the generic title fallback below is actually reachable
        title = strip_or_none((title or '').rsplit('|', 1)[0]) or self._generic_title(url)

        return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
                                          description=self._og_search_description(webpage, default=None))