]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/arte.py
[extractor/youtube] Ignore incomplete data for comment threads by default (#7475)
[yt-dlp.git] / yt_dlp / extractor / arte.py
1 import re
2
3 from .common import InfoExtractor
4 from ..utils import (
5 ExtractorError,
6 GeoRestrictedError,
7 int_or_none,
8 parse_iso8601,
9 parse_qs,
10 strip_or_none,
11 traverse_obj,
12 url_or_none,
13 )
14
15
class ArteTVBaseIE(InfoExtractor):
    """Constants shared by the ARTE extractors in this module."""

    # '|'-separated so the string can be dropped straight into a regex
    # alternation when the subclasses build their _VALID_URL.
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
    # Root of the player API; subclasses append '/config/...' or '/playlist/...'.
    _API_BASE = 'https://api.arte.tv/api/player/v2'
19
20
class ArteTVIE(ArteTVBaseIE):
    """Extractor for a single ARTE programme (ids like ``100103-000-A``, or ``LIVE``)."""

    _VALID_URL = r'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                    |arte://program)
                        /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
        'info_dict': {
            'id': '100103-000-A',
            'title': 'USA: Dyskryminacja na porodówce',
            'description': 'md5:242017b7cce59ffae340a54baefcafb1',
            'alt_title': 'ARTE Reportage',
            'upload_date': '20201103',
            'duration': 554,
            'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
            'timestamp': 1604417980,
            'ext': 'mp4',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'info_dict': {
            'id': '110371-000-A',
            'ext': 'mp4',
            'upload_date': '20220718',
            'duration': 154,
            'timestamp': 1658162460,
            'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
            'title': 'La chaleur, supplice des arbres de rue',
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
        'info_dict': {
            'id': '110203-006-A',
            'chapters': 'count:16',
            'description': 'md5:cf592f1df52fe52007e3f8eac813c084',
            'alt_title': 'Zaz',
            'title': 'Baloise Session 2022',
            'timestamp': 1668445200,
            'duration': 4054,
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/ubQjmVCGyRx3hmBuZEK9QZ/940x530',
            'upload_date': '20221114',
            'ext': 'mp4',
        },
        'expected_warnings': ['geo restricted']
    }]

    _GEO_BYPASS = True

    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Parses a stream version code (the `eStat.ml5` value) into its voice,
    # audio-description and subtitle components.
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': (
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        ),
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': (
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        ),
        'SAT': (
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        ),
    }

    def _language_preference(self, version_code, language_code):
        """Rank a stream version code against the requested language.

        Returns -1 when `version_code` does not match `_VERSION_CODE_RE`.
        Otherwise returns an int built from six 0/1 digits, most significant
        criterion first, so that a larger value means a better match.
        """
        m = self._VERSION_CODE_RE.match(version_code)
        if not m:
            return -1
        return int(''.join('01'[x] for x in (
            m.group('vlang') == language_code,      # we prefer voice in the requested language
            not m.group('audio_desc'),              # and not the audio description version
            bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
            m.group('sub_lang') == language_code,   # if subtitles are present, we prefer them in the requested language
            not m.group('has_sub'),                 # but we prefer no subtitles otherwise
            not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
        )))

    def _real_extract(self, url):
        """Download the player config for `url` and build the info dict.

        Raises GeoRestrictedError when the config flags a restricted area, and
        ExtractorError (expected) when the config carries no `rights` entry.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        lang = mobj.group('lang') or mobj.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)

        geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            # Read the code with .get(): a restricted config without a 'code' must
            # still surface as a geo restriction (using the default countries)
            # rather than as a KeyError.
            restriction_code = geoblocking.get('code')
            raise GeoRestrictedError(
                f'Video restricted to {restriction_code!r}',
                countries=self._COUNTRIES_MAP.get(restriction_code, ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        formats, subtitles = [], {}
        # HLS variants whose short label starts with 'cc'/'OGsub' are collected
        # separately and appended after all other formats.
        secondary_formats = []
        for stream in config['data']['attributes']['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            stream_version = stream['versions'][0]
            stream_version_code = stream_version['eStat']['ml5']

            lang_pref = self._language_preference(stream_version_code, language_code)

            short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
            protocol = stream['protocol']
            format_note = f'{stream_version.get("label", "unknown")} [{short_label}]'

            if protocol.startswith('HLS'):
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
                for fmt in fmts:
                    fmt.update({
                        'format_note': format_note,
                        'language_preference': lang_pref,
                    })
                if short_label.startswith(('cc', 'OGsub')):
                    secondary_formats.extend(fmts)
                else:
                    formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)

            elif protocol in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{protocol}-{stream_version_code}',
                    'url': stream['url'],
                    'format_note': format_note,
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(f'Skipping stream with unknown protocol {protocol}')

        formats.extend(secondary_formats)
        self._remove_duplicate_formats(formats)

        metadata = config['data']['attributes']['metadata']

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            # when a subtitle exists it is used as the title, so the original
            # title moves to alt_title; otherwise alt_title stays falsy
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
            'is_live': config['data']['attributes'].get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [
                {'url': image['url'], 'id': image.get('caption')}
                for image in metadata.get('images') or [] if url_or_none(image.get('url'))
            ],
            # TODO: chapters may also be in stream['segments']?
            'chapters': traverse_obj(config, ('data', 'attributes', 'chapters', 'elements', ..., {
                'start_time': 'startTime',
                'title': 'title',
            })) or None,
        }
225
226
class ArteTVEmbedIE(InfoExtractor):
    """Resolve ARTE embedded-player URLs to the config URL carried in `json_url`."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available'
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Hand the URL found in the `json_url` query parameter over to ArteTVIE."""
        # _VALID_URL requires a json_url parameter; take its first value.
        config_url = parse_qs(url)['json_url'][0]
        return self.url_result(
            config_url,
            ie=ArteTVIE.ie_key(),
            video_id=ArteTVIE._match_id(config_url))
251
252
class ArteTVPlaylistIE(ArteTVBaseIE):
    """Extractor for ARTE collections, identified by ids of the form ``RC-<6 digits>``."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        """Fetch the playlist from the player API and return a playlist result.

        Each item that has a `config.url` becomes a `url_transparent` entry
        delegated to ArteTVIE.
        """
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        response = self._download_json(
            f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)
        playlist = response['data']['attributes']

        entries = []
        # The lambda filter keeps only items carrying a truthy config URL.
        for item in traverse_obj(playlist, ('items', lambda _, v: v['config']['url'])):
            entries.append({
                '_type': 'url_transparent',
                'url': item['config']['url'],
                'ie_key': ArteTVIE.ie_key(),
                'id': item.get('providerId'),
                'title': item.get('title'),
                'alt_title': item.get('subtitle'),
                'thumbnail': url_or_none(traverse_obj(item, ('mainImage', 'url'))),
                'duration': int_or_none(traverse_obj(item, ('duration', 'seconds'))),
            })

        playlist_title = traverse_obj(playlist, ('metadata', 'title'))
        playlist_description = traverse_obj(playlist, ('metadata', 'description'))
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
287
288
class ArteTVCategoryIE(ArteTVBaseIE):
    """Extractor for ARTE category pages; yields the video/collection links found on the page."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Match only URLs that neither ArteTVIE nor ArteTVPlaylistIE accepts."""
        # The more specific extractors take precedence over this one.
        for ie in (ArteTVIE, ArteTVPlaylistIE):
            if ie.suitable(url):
                return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Scrape the category page for links handled by ArteTVIE / ArteTVPlaylistIE."""
        lang, playlist_id = self._match_valid_url(url).groups()
        webpage = self._download_webpage(url, playlist_id)

        # Anchors pointing at same-language /videos/ URLs; the href value may be
        # double-quoted, single-quoted or unquoted.
        link_re = r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang
        linked_urls = (mobj.group('url') for mobj in re.finditer(link_re, webpage))
        items = [
            link for link in linked_urls
            # skip self-links, keep only what the video/collection extractors accept
            if link != url and (ArteTVIE.suitable(link) or ArteTVPlaylistIE.suitable(link))
        ]

        # Keep the part of the page title before the last '|'.
        page_title = self._generic_title('', webpage, default='')
        title = strip_or_none(page_title.rsplit('|', 1)[0]) or None

        return self.playlist_from_matches(
            items, playlist_id=playlist_id, playlist_title=title,
            description=self._og_search_description(webpage, default=None))