]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/arte.py
[extractor] Improve `_generic_title`
[yt-dlp.git] / yt_dlp / extractor / arte.py
1 import re
2
3 from .common import InfoExtractor
4 from ..utils import (
5 ExtractorError,
6 GeoRestrictedError,
7 int_or_none,
8 parse_iso8601,
9 parse_qs,
10 strip_or_none,
11 traverse_obj,
12 url_or_none,
13 )
14
15
class ArteTVBaseIE(InfoExtractor):
    """Constants shared by the Arte extractors defined in this module."""

    # Regex alternation of the language codes accepted in Arte URLs;
    # interpolated into the `_VALID_URL` patterns of the subclasses
    _ARTE_LANGUAGES = '|'.join(('fr', 'de', 'en', 'es', 'it', 'pl'))
    # Root of the player API; subclasses append `/config/...` or `/playlist/...`
    _API_BASE = 'https://api.arte.tv/api/player/v2'
19
20
class ArteTVIE(ArteTVBaseIE):
    """Extractor for single Arte programs (website, player API config and `arte://program` URLs)."""

    _VALID_URL = r'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                    |arte://program)
                        /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
        'info_dict': {
            'id': '100103-000-A',
            'title': 'USA: Dyskryminacja na porodówce',
            'description': 'md5:242017b7cce59ffae340a54baefcafb1',
            'alt_title': 'ARTE Reportage',
            'upload_date': '20201103',
            'duration': 554,
            'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
            'timestamp': 1604417980,
            'ext': 'mp4',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'info_dict': {
            'id': '110371-000-A',
            'ext': 'mp4',
            'upload_date': '20220718',
            'duration': 154,
            'timestamp': 1658162460,
            'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
            'title': 'La chaleur, supplice des arbres de rue',
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }]

    _GEO_BYPASS = True

    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Parses the `eStat.ml5` version code of a stream: voice language, audio
    # description marker and (optional) subtitle information
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': (
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        ),
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': (
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        ),
        'SAT': (
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        ),
    }

    def _stream_language_preference(self, version_code, language_code):
        """Rank a stream version code against the requested language.

        Returns -1 when `version_code` does not match `_VERSION_CODE_RE`.
        Otherwise returns an int whose decimal digits are 0/1 flags, most
        significant first, so that a larger value means a more preferred stream.
        """
        m = self._VERSION_CODE_RE.match(version_code)
        if not m:
            return -1
        flags = (
            m.group('vlang') == language_code,  # we prefer voice in the requested language
            not m.group('audio_desc'),  # and not the audio description version
            bool(m.group('original_voice')),  # but if voice is not in the requested language, at least choose the original voice
            m.group('sub_lang') == language_code,  # if subtitles are present, we prefer them in the requested language
            not m.group('has_sub'),  # but we prefer no subtitles otherwise
            not m.group('sdh_sub'),  # and we prefer not the hard-of-hearing subtitles if there are subtitles
        )
        return int(''.join('01'[flag] for flag in flags))

    def _real_extract(self, url):
        """Download the player config for `url` and build the info dict.

        Raises GeoRestrictedError when the config reports a restricted area,
        and an expected ExtractorError when the config carries no `rights`.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        # NOTE(review): for `arte://program/...` URLs neither lang group participates,
        # so `lang` is None and is formatted into the config URL as-is - confirm this is intended
        lang = mobj.group('lang') or mobj.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)

        geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            # Read the code defensively: a missing `code` must still surface as a
            # geo-restriction error (with the fallback countries), not as a KeyError
            restriction_code = geoblocking.get('code')
            raise GeoRestrictedError(
                f'Video restricted to {restriction_code!r}',
                countries=self._COUNTRIES_MAP.get(restriction_code, ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        formats, subtitles = [], {}
        # Formats whose short label starts with 'cc' or 'OGsub'; appended after all others
        secondary_formats = []
        for stream in config['data']['attributes']['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            stream_version = stream['versions'][0]
            stream_version_code = stream_version['eStat']['ml5']

            lang_pref = self._stream_language_preference(stream_version_code, language_code)

            short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
            format_note = f'{stream_version.get("label", "unknown")} [{short_label}]'
            if stream['protocol'].startswith('HLS'):
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
                for fmt in fmts:
                    fmt.update({
                        'format_note': format_note,
                        'language_preference': lang_pref,
                    })
                if any(map(short_label.startswith, ('cc', 'OGsub'))):
                    secondary_formats.extend(fmts)
                else:
                    formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)

            elif stream['protocol'] in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{stream["protocol"]}-{stream_version_code}',
                    'url': stream['url'],
                    'format_note': format_note,
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')

            # TODO: chapters from stream['segments']?
            # The JS also looks for chapters in config['data']['attributes']['chapters'],
            # but I am yet to find a video having those

        formats.extend(secondary_formats)
        self._remove_duplicate_formats(formats)
        self._sort_formats(formats)

        metadata = config['data']['attributes']['metadata']

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # The subtitle, when present, is used as the title and the title becomes alt_title
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
            'is_live': config['data']['attributes'].get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [
                {'url': image['url'], 'id': image.get('caption')}
                for image in metadata.get('images') or [] if url_or_none(image.get('url'))
            ],
        }
210
211
class ArteTVEmbedIE(InfoExtractor):
    """Extractor for Arte player embed URLs that carry the config URL in `json_url`."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available'
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Hand the URL found in the `json_url` query parameter over to ArteTVIE."""
        # First value of the `json_url` query parameter
        config_url = parse_qs(url)['json_url'][0]
        return self.url_result(
            config_url,
            ie=ArteTVIE.ie_key(),
            video_id=ArteTVIE._match_id(config_url))
236
237
class ArteTVPlaylistIE(ArteTVBaseIE):
    """Extractor for Arte collection pages (`RC-` followed by six digits)."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        """Fetch the playlist from the player API and return its items as url_transparent entries."""
        mobj = self._match_valid_url(url)
        lang, playlist_id = mobj.group('lang'), mobj.group('id')

        api_url = f'{self._API_BASE}/playlist/{lang}/{playlist_id}'
        playlist = self._download_json(api_url, playlist_id)['data']['attributes']

        entries = []
        # Only items for which the filter can read `config.url` are selected
        for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url'])):
            entries.append({
                '_type': 'url_transparent',
                'url': video['config']['url'],
                'ie_key': ArteTVIE.ie_key(),
                'id': video.get('providerId'),
                'title': video.get('title'),
                'alt_title': video.get('subtitle'),
                'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
                'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
            })

        playlist_title = traverse_obj(playlist, ('metadata', 'title'))
        playlist_description = traverse_obj(playlist, ('metadata', 'description'))
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
272
273
class ArteTVCategoryIE(ArteTVBaseIE):
    """Extractor for Arte category pages; collects the program and collection links on the page."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Match only URLs that neither ArteTVIE nor ArteTVPlaylistIE accepts."""
        for ie in (ArteTVIE, ArteTVPlaylistIE):
            if ie.suitable(url):
                return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Scrape the category page for links handled by ArteTVIE or ArteTVPlaylistIE."""
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        webpage = self._download_webpage(url, playlist_id)

        # Anchors pointing at same-language `/videos/` URLs; the href may be
        # double-quoted, single-quoted or unquoted
        link_re = r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang
        candidates = (mobj.group('url') for mobj in re.finditer(link_re, webpage))
        items = [
            link for link in candidates
            # skip a link back to this page and anything the other extractors cannot handle
            if link != url and any(ie.suitable(link) for ie in (ArteTVIE, ArteTVPlaylistIE))
        ]

        # Keep the part of the page title before the last '|'; an empty result becomes None
        page_title = self._generic_title('', webpage, default='')
        title = strip_or_none(page_title.rsplit('|', 1)[0]) or None

        return self.playlist_from_matches(
            items, playlist_id=playlist_id, playlist_title=title,
            description=self._og_search_description(webpage, default=None))