]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/arte.py
[extractor] Improve `_generic_title`
[yt-dlp.git] / yt_dlp / extractor / arte.py
CommitLineData
d5822b96 1import re
d5822b96
PH
2
3from .common import InfoExtractor
4from ..utils import (
c0892b2b 5 ExtractorError,
051d6b45 6 GeoRestrictedError,
d24a2b20 7 int_or_none,
051d6b45 8 parse_iso8601,
4dfbf869 9 parse_qs,
50e93e03 10 strip_or_none,
051d6b45 11 traverse_obj,
8bdd16b4 12 url_or_none,
d5822b96
PH
13)
14
d5822b96 15
class ArteTVBaseIE(InfoExtractor):
    """Base class holding constants shared by the arte.tv extractors."""

    # Regex alternation of the language codes accepted in arte.tv URLs
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
    # Root URL of the player API (v2) used to build config/playlist requests
    _API_BASE = 'https://api.arte.tv/api/player/v2'
8bdd16b4 19
20
class ArteTVIE(ArteTVBaseIE):
    """Extractor for single arte.tv videos (website, player API and ``arte://program`` URLs)."""

    _VALID_URL = r'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                    |arte://program)
                        /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
        'info_dict': {
            'id': '100103-000-A',
            'title': 'USA: Dyskryminacja na porodówce',
            'description': 'md5:242017b7cce59ffae340a54baefcafb1',
            'alt_title': 'ARTE Reportage',
            'upload_date': '20201103',
            'duration': 554,
            'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
            'timestamp': 1604417980,
            'ext': 'mp4',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'info_dict': {
            'id': '110371-000-A',
            'ext': 'mp4',
            'upload_date': '20220718',
            'duration': 154,
            'timestamp': 1658162460,
            'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
            'title': 'La chaleur, supplice des arbres de rue',
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }]

    _GEO_BYPASS = True

    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Parses the stream version code (`eStat.ml5`) into voice/subtitle attributes
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # Geoblocking code -> countries passed to GeoRestrictedError
    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': (
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        ),
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': (
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        ),
        'SAT': (
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        ),
    }

    def _real_extract(self, url):
        """Download the player config for ``url`` and build the info dict.

        Raises GeoRestrictedError when the config marks the video as
        geo-restricted, and ExtractorError when the config carries no
        ``rights`` entry.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        lang = mobj.group('lang') or mobj.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)

        geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            # Read `code` defensively so that a missing key still surfaces
            # as a geo-restriction error rather than a KeyError
            geo_code = geoblocking.get('code')
            raise GeoRestrictedError(
                f'Video restricted to {geo_code!r}',
                countries=self._COUNTRIES_MAP.get(geo_code, ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        formats, subtitles = [], {}
        # HLS formats whose short label starts with 'cc' or 'OGsub';
        # they are appended after all other formats
        secondary_formats = []
        for stream in config['data']['attributes']['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            stream_version = stream['versions'][0]
            stream_version_code = stream_version['eStat']['ml5']

            lang_pref = -1
            m = self._VERSION_CODE_RE.match(stream_version_code)
            if m:
                # Each criterion contributes one digit ('0' or '1'); earlier criteria are more significant
                lang_pref = int(''.join('01'[x] for x in (
                    m.group('vlang') == language_code,      # we prefer voice in the requested language
                    not m.group('audio_desc'),              # and not the audio description version
                    bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
                    m.group('sub_lang') == language_code,   # if subtitles are present, we prefer them in the requested language
                    not m.group('has_sub'),                 # but we prefer no subtitles otherwise
                    not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
                )))

            short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
            format_note = f'{stream_version.get("label", "unknown")} [{short_label}]'
            if stream['protocol'].startswith('HLS'):
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
                for fmt in fmts:
                    fmt.update({
                        'format_note': format_note,
                        'language_preference': lang_pref,
                    })
                if short_label.startswith(('cc', 'OGsub')):
                    secondary_formats.extend(fmts)
                else:
                    formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)

            elif stream['protocol'] in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{stream["protocol"]}-{stream_version_code}',
                    'url': stream['url'],
                    'format_note': format_note,
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')

            # TODO: chapters from stream['segments']?
            # The JS also looks for chapters in config['data']['attributes']['chapters'],
            # but I am yet to find a video having those

        formats.extend(secondary_formats)
        self._remove_duplicate_formats(formats)
        self._sort_formats(formats)

        metadata = config['data']['attributes']['metadata']

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # The subtitle, when present, is used as the title and the
            # title then becomes the alt_title
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
            'is_live': config['data']['attributes'].get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [
                {'url': image['url'], 'id': image.get('caption')}
                for image in metadata.get('images') or [] if url_or_none(image.get('url'))
            ],
        }
c40f5cf4 210
24114fee 211
class ArteTVEmbedIE(InfoExtractor):
    """Extractor for arte.tv embedded player URLs carrying a ``json_url`` query parameter."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available'
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Delegate to ArteTVIE using the URL found in the ``json_url`` parameter.

        Raises ExtractorError when the query string yields no ``json_url`` value.
        """
        # _VALID_URL only checks that the text `json_url=` is followed by
        # something; the parsed query may still lack the key (presumably
        # for a blank value, e.g. `?json_url=&lang=de` -- verify against
        # parse_qs), so look it up defensively instead of indexing
        json_urls = parse_qs(url).get('json_url')
        if not json_urls:
            raise ExtractorError('Unable to find json_url in the embed URL', expected=True)
        json_url = json_urls[0]
        video_id = ArteTVIE._match_id(json_url)
        return self.url_result(
            json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
4b492e35
S
236
237
class ArteTVPlaylistIE(ArteTVBaseIE):
    """Extractor for arte.tv collections, identified by an ``RC-<6 digits>`` id."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        """Fetch the playlist from the player API and return it as a playlist result."""
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        playlist = self._download_json(
            f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)['data']['attributes']

        # Each entry is resolved by ArteTVIE through its config URL; the
        # metadata given here is attached via `url_transparent`.
        # The lambda filter keeps only items that have a truthy config URL.
        entries = [{
            '_type': 'url_transparent',
            'url': video['config']['url'],
            'ie_key': ArteTVIE.ie_key(),
            'id': video.get('providerId'),
            'title': video.get('title'),
            'alt_title': video.get('subtitle'),
            'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
            'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
        } for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))]

        return self.playlist_result(entries, playlist_id,
                                    traverse_obj(playlist, ('metadata', 'title')),
                                    traverse_obj(playlist, ('metadata', 'description')))
50e93e03 272
273
class ArteTVCategoryIE(ArteTVBaseIE):
    """Extractor for arte.tv category pages, returned as a playlist of the videos linked from the page."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Match only URLs that neither ArteTVIE nor ArteTVPlaylistIE accepts."""
        return (
            not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
            and super().suitable(url))

    def _real_extract(self, url):
        """Collect the video and playlist links found on the category page."""
        lang, playlist_id = self._match_valid_url(url).groups()
        webpage = self._download_webpage(url, playlist_id)

        items = []
        for mobj in re.finditer(
                r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
                webpage):
            video_url = mobj.group('url')
            # Skip links pointing back at the page being extracted
            if video_url == url:
                continue
            if any(ie.suitable(video_url) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
                items.append(video_url)

        # Keep only the part of the page title before the last '|'
        title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|', 1)[0]) or None

        return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
                                          description=self._og_search_description(webpage, default=None))