]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/arte.py
[extractor] Deprecate `_sort_formats`
[yt-dlp.git] / yt_dlp / extractor / arte.py
CommitLineData
d5822b96 1import re
d5822b96
PH
2
3from .common import InfoExtractor
4from ..utils import (
c0892b2b 5 ExtractorError,
051d6b45 6 GeoRestrictedError,
d24a2b20 7 int_or_none,
051d6b45 8 parse_iso8601,
4dfbf869 9 parse_qs,
50e93e03 10 strip_or_none,
051d6b45 11 traverse_obj,
8bdd16b4 12 url_or_none,
d5822b96
PH
13)
14
d5822b96 15
class ArteTVBaseIE(InfoExtractor):
    # Constants shared by the Arte extractors that subclass this base.

    # Regex alternation of the language codes accepted in arte.tv URLs;
    # interpolated into the subclasses' _VALID_URL patterns.
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
    # Root of the player API; subclasses append `/config/...` or `/playlist/...`.
    _API_BASE = 'https://api.arte.tv/api/player/v2'
8bdd16b4 19
20
class ArteTVIE(ArteTVBaseIE):
    # Extractor for single Arte programmes, addressed either through the
    # website (`arte.tv/<lang>/videos/<id>`), the player API config URL, or
    # the internal `arte://program/<id>` scheme.
    _VALID_URL = r'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                    |arte://program)
                        /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
        'info_dict': {
            'id': '100103-000-A',
            'title': 'USA: Dyskryminacja na porodówce',
            'description': 'md5:242017b7cce59ffae340a54baefcafb1',
            'alt_title': 'ARTE Reportage',
            'upload_date': '20201103',
            'duration': 554,
            'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
            'timestamp': 1604417980,
            'ext': 'mp4',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'info_dict': {
            'id': '110371-000-A',
            'ext': 'mp4',
            'upload_date': '20220718',
            'duration': 154,
            'timestamp': 1658162460,
            'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
            'title': 'La chaleur, supplice des arbres de rue',
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }]

    _GEO_BYPASS = True

    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Parses a stream version code (the `eStat.ml5` value) into its parts:
    # original-voice flag, voice language, audio-description flag and an
    # optional subtitle suffix (`-ST`, optional `M`, subtitle language).
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': (
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        ),
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': (
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        ),
        'SAT': (
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        ),
    }

    def _real_extract(self, url):
        # Fetch the player config for the programme and turn its streams and
        # metadata into an info dict.
        #
        # Raises GeoRestrictedError when the config flags a restricted area,
        # and an expected ExtractorError when the config carries no rights.
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        # NOTE(review): the `arte://program/...` alternative of _VALID_URL captures
        # neither `lang` nor `lang_2`, so `lang` is None for such URLs and the literal
        # string 'None' ends up in the config URL below -- confirm whether that is intended
        lang = mobj.group('lang') or mobj.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)

        geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            # Use .get() so that a missing 'code' still surfaces as a
            # GeoRestrictedError (with the default countries) rather than a KeyError
            geo_code = geoblocking.get('code')
            raise GeoRestrictedError(
                f'Video restricted to {geo_code!r}',
                countries=self._COUNTRIES_MAP.get(geo_code, ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        attributes = config['data']['attributes']

        formats, subtitles = [], {}
        # HLS formats whose short label starts with 'cc' or 'OGsub'; they are
        # appended after all the other formats once the loop is done
        secondary_formats = []
        for stream in attributes['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            stream_version = stream['versions'][0]
            stream_version_code = stream_version['eStat']['ml5']

            lang_pref = -1
            version_match = self._VERSION_CODE_RE.match(stream_version_code)
            if version_match:
                # Each criterion contributes one digit (0 or 1), most significant first;
                # the resulting digit string is then read as an integer
                lang_pref = int(''.join('01'[x] for x in (
                    version_match.group('vlang') == language_code,  # we prefer voice in the requested language
                    not version_match.group('audio_desc'),  # and not the audio description version
                    bool(version_match.group('original_voice')),  # but if voice is not in the requested language, at least choose the original voice
                    version_match.group('sub_lang') == language_code,  # if subtitles are present, we prefer them in the requested language
                    not version_match.group('has_sub'),  # but we prefer no subtitles otherwise
                    not version_match.group('sdh_sub'),  # and we prefer not the hard-of-hearing subtitles if there are subtitles
                )))

            short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
            format_note = f'{stream_version.get("label", "unknown")} [{short_label}]'
            protocol = stream['protocol']

            if protocol.startswith('HLS'):
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
                for fmt in fmts:
                    fmt.update({
                        'format_note': format_note,
                        'language_preference': lang_pref,
                    })
                if short_label.startswith(('cc', 'OGsub')):
                    secondary_formats.extend(fmts)
                else:
                    formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)

            elif protocol in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{protocol}-{stream_version_code}',
                    'url': stream['url'],
                    'format_note': format_note,
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(f'Skipping stream with unknown protocol {protocol}')

            # TODO: chapters from stream['segments']?
            # The JS also looks for chapters in config['data']['attributes']['chapters'],
            # but I am yet to find a video having those

        formats.extend(secondary_formats)
        self._remove_duplicate_formats(formats)

        metadata = attributes['metadata']

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # Prefer the subtitle as title; the programme title then becomes alt_title
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(attributes, ('rights', 'begin'), expected_type=parse_iso8601),
            'is_live': attributes.get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [
                {'url': image['url'], 'id': image.get('caption')}
                for image in metadata.get('images') or [] if url_or_none(image.get('url'))
            ],
        }
c40f5cf4 209
24114fee 210
class ArteTVEmbedIE(InfoExtractor):
    # Resolves embedded Arte player URLs by handing the API URL found in
    # their `json_url` query parameter over to ArteTVIE.
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available',
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # The first `json_url` query value is the API config URL of the programme
        json_url = parse_qs(url)['json_url'][0]
        target_id = ArteTVIE._match_id(json_url)
        return self.url_result(json_url, ie=ArteTVIE.ie_key(), video_id=target_id)
4b492e35
S
235
236
class ArteTVPlaylistIE(ArteTVBaseIE):
    # Extractor for Arte collections (`RC-<six digits>` ids), resolved
    # through the player API's playlist endpoint.
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        # Download the playlist description and emit one url_transparent
        # entry per item, each delegated to ArteTVIE.
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        api_url = f'{self._API_BASE}/playlist/{lang}/{playlist_id}'
        playlist = self._download_json(api_url, playlist_id)['data']['attributes']

        entries = []
        # The lambda filter keeps only the items that expose a config URL
        for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url'])):
            entries.append({
                '_type': 'url_transparent',
                'url': video['config']['url'],
                'ie_key': ArteTVIE.ie_key(),
                'id': video.get('providerId'),
                'title': video.get('title'),
                'alt_title': video.get('subtitle'),
                'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
                'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
            })

        playlist_title = traverse_obj(playlist, ('metadata', 'title'))
        playlist_description = traverse_obj(playlist, ('metadata', 'description'))
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
50e93e03 271
272
class ArteTVCategoryIE(ArteTVBaseIE):
    # Extractor for category pages: collects the links on the page that
    # ArteTVIE or ArteTVPlaylistIE can handle and returns them as a playlist.
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        # Video and collection URLs also match this extractor's pattern,
        # so leave them to the more specific extractors
        if ArteTVIE.suitable(url) or ArteTVPlaylistIE.suitable(url):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        lang, playlist_id = self._match_valid_url(url).groups()
        webpage = self._download_webpage(url, playlist_id)

        # Anchors pointing at same-language /videos/ pages; the href value may be
        # double-quoted, single-quoted or unquoted
        link_re = r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang
        linked_urls = (link.group('url') for link in re.finditer(link_re, webpage))
        # Skip links back to this page and anything neither extractor can handle
        items = [
            linked_url for linked_url in linked_urls
            if linked_url != url
            and (ArteTVIE.suitable(linked_url) or ArteTVPlaylistIE.suitable(linked_url))
        ]

        # Keep only what precedes the last '|' of the page title
        page_title = self._generic_title('', webpage, default='')
        title = strip_or_none(page_title.rsplit('|', 1)[0]) or None
        description = self._og_search_description(webpage, default=None)

        return self.playlist_from_matches(
            items, playlist_id=playlist_id, playlist_title=title, description=description)