]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/arte.py
[extractor] Deprecate `_sort_formats`
[yt-dlp.git] / yt_dlp / extractor / arte.py
1 import re
2
3 from .common import InfoExtractor
4 from ..utils import (
5 ExtractorError,
6 GeoRestrictedError,
7 int_or_none,
8 parse_iso8601,
9 parse_qs,
10 strip_or_none,
11 traverse_obj,
12 url_or_none,
13 )
14
15
class ArteTVBaseIE(InfoExtractor):
    """Shared constants for the ARTE extractors defined in this module."""

    # Root of the player API; subclasses append `/config/...` or `/playlist/...` to it
    _API_BASE = 'https://api.arte.tv/api/player/v2'
    # Regex alternation of the language codes accepted in ARTE URLs;
    # it is interpolated into the `_VALID_URL` patterns of the subclasses
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
19
20
class ArteTVIE(ArteTVBaseIE):
    """Extractor for a single ARTE video.

    Accepts website video pages, player API config URLs and `arte://program` URLs
    (see `_VALID_URL`), and builds the info dict from the player API config JSON.
    """

    _VALID_URL = r'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                    |arte://program)
                        /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
        'info_dict': {
            'id': '100103-000-A',
            'title': 'USA: Dyskryminacja na porodówce',
            'description': 'md5:242017b7cce59ffae340a54baefcafb1',
            'alt_title': 'ARTE Reportage',
            'upload_date': '20201103',
            'duration': 554,
            'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
            'timestamp': 1604417980,
            'ext': 'mp4',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'info_dict': {
            'id': '110371-000-A',
            'ext': 'mp4',
            'upload_date': '20220718',
            'duration': 154,
            'timestamp': 1658162460,
            'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
            'title': 'La chaleur, supplice des arbres de rue',
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }]

    _GEO_BYPASS = True

    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Parses a stream version code (the `eStat.ml5` value) into voice/subtitle components
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': (
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        ),
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': (
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        ),
        'SAT': (
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        ),
    }

    def _get_language_preference(self, version_code, language_code):
        """Score a stream version code against the requested language.

        Returns -1 when `version_code` does not match `_VERSION_CODE_RE`.
        Otherwise returns an int whose decimal digits are the 0/1 results of
        the checks below, first check = most significant digit, so an earlier
        check always outweighs all later ones.
        (Presumably a higher `language_preference` ranks a format higher in
        format sorting -- confirm against the format sorter.)
        """
        m = self._VERSION_CODE_RE.match(version_code)
        if not m:
            return -1
        return int(''.join('01'[x] for x in (
            m.group('vlang') == language_code,      # we prefer voice in the requested language
            not m.group('audio_desc'),              # and not the audio description version
            bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
            m.group('sub_lang') == language_code,   # if subtitles are present, we prefer them in the requested language
            not m.group('has_sub'),                 # but we prefer no subtitles otherwise
            not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
        )))

    def _real_extract(self, url):
        """Download the player config for `url` and build the info dict.

        Raises:
            GeoRestrictedError: the config marks the video as geo-restricted.
            ExtractorError: the config carries no `rights` entry.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        # NOTE(review): for `arte://program/...` URLs neither language group matches,
        # so `lang` is None and gets formatted into the request path as 'None'
        # -- confirm whether a default language edition should be used instead
        lang = mobj.group('lang') or mobj.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)

        geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            # `.get` so that a restriction without a `code` still surfaces as a
            # geo-restriction error (with the default countries) rather than a KeyError
            geo_code = geoblocking.get('code')
            raise GeoRestrictedError(f'Video restricted to {geo_code!r}',
                                     countries=self._COUNTRIES_MAP.get(geo_code, ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        attributes = config['data']['attributes']

        formats, subtitles = [], {}
        # Formats whose short label starts with 'cc' or 'OGsub'; appended after all other formats
        secondary_formats = []
        for stream in attributes['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            stream_version = stream['versions'][0]
            stream_version_code = stream_version['eStat']['ml5']

            lang_pref = self._get_language_preference(stream_version_code, language_code)

            short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
            format_note = f'{stream_version.get("label", "unknown")} [{short_label}]'
            protocol = stream['protocol']

            if protocol.startswith('HLS'):
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
                for fmt in fmts:
                    fmt.update({
                        'format_note': format_note,
                        'language_preference': lang_pref,
                    })
                if short_label.startswith(('cc', 'OGsub')):
                    secondary_formats.extend(fmts)
                else:
                    formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)

            elif protocol in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{protocol}-{stream_version_code}',
                    'url': stream['url'],
                    'format_note': format_note,
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(f'Skipping stream with unknown protocol {protocol}')

            # TODO: chapters from stream['segments']?
            # The JS also looks for chapters in config['data']['attributes']['chapters'],
            # but I am yet to find a video having those

        formats.extend(secondary_formats)
        self._remove_duplicate_formats(formats)

        metadata = attributes['metadata']

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # The subtitle, when present, is used as the title and the title becomes alt_title
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(attributes, ('rights', 'begin'), expected_type=parse_iso8601),
            'is_live': attributes.get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [
                {'url': image['url'], 'id': image.get('caption')}
                for image in metadata.get('images') or [] if url_or_none(image.get('url'))
            ],
        }
209
210
class ArteTVEmbedIE(InfoExtractor):
    """Extractor for ARTE embedded-player URLs; hands the wrapped `json_url` over to ArteTVIE."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available'
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a url_result pointing at the player config URL found in the `json_url` query parameter."""
        # `_VALID_URL` requires a `json_url=` parameter, so the lookup below is expected to succeed
        config_url = parse_qs(url)['json_url'][0]
        # The video id is taken from the config URL using ArteTVIE's own URL pattern
        program_id = ArteTVIE._match_id(config_url)
        return self.url_result(config_url, ie=ArteTVIE.ie_key(), video_id=program_id)
235
236
class ArteTVPlaylistIE(ArteTVBaseIE):
    """Extractor for ARTE collections (`RC-` ids), resolved through the player API playlist endpoint."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        """Fetch the playlist JSON and return a playlist of url_transparent entries handled by ArteTVIE."""
        mobj = self._match_valid_url(url)
        lang = mobj.group('lang')
        playlist_id = mobj.group('id')

        response = self._download_json(
            f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)
        playlist = response['data']['attributes']

        entries = []
        # The filter keeps only the items that carry a truthy config URL
        for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url'])):
            entries.append({
                '_type': 'url_transparent',
                'url': video['config']['url'],
                'ie_key': ArteTVIE.ie_key(),
                'id': video.get('providerId'),
                'title': video.get('title'),
                'alt_title': video.get('subtitle'),
                'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
                'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
            })

        playlist_title = traverse_obj(playlist, ('metadata', 'title'))
        playlist_description = traverse_obj(playlist, ('metadata', 'description'))
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
271
272
class ArteTVCategoryIE(ArteTVBaseIE):
    """Extractor for ARTE category pages; collects the video and collection links found in the page HTML."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Match only URLs that neither ArteTVIE nor ArteTVPlaylistIE already claims."""
        if ArteTVIE.suitable(url) or ArteTVPlaylistIE.suitable(url):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Download the category page and return a playlist of the ARTE video/collection links on it."""
        lang, playlist_id = self._match_valid_url(url).groups()
        webpage = self._download_webpage(url, playlist_id)

        # Anchors whose href (quoted or bare) points into the same language edition's /videos/ tree
        link_re = r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang
        candidates = (mobj.group('url') for mobj in re.finditer(link_re, webpage))
        # Drop links back to this very page and keep only what the other two extractors accept
        items = [
            link for link in candidates
            if link != url and (ArteTVIE.suitable(link) or ArteTVPlaylistIE.suitable(link))
        ]

        # Keep the part of the page title before the last '|'; an empty result becomes None
        raw_title = self._generic_title('', webpage, default='')
        title = strip_or_none(raw_title.rsplit('|', 1)[0]) or None

        return self.playlist_from_matches(
            items, playlist_id=playlist_id, playlist_title=title,
            description=self._og_search_description(webpage, default=None))