3 from .common
import InfoExtractor
16 class ArteTVBaseIE(InfoExtractor
):
17 _ARTE_LANGUAGES
= 'fr|de|en|es|it|pl'
18 _API_BASE
= 'https://api.arte.tv/api/player/v2'
21 class ArteTVIE(ArteTVBaseIE
):
25 (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
26 api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
29 /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
30 ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
32 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
33 'only_matching': True,
35 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
38 'title': 'USA: Dyskryminacja na porodówce',
39 'description': 'md5:242017b7cce59ffae340a54baefcafb1',
40 'alt_title': 'ARTE Reportage',
41 'upload_date': '20201103',
43 'thumbnail': r
're:https://api-cdn\.arte\.tv/.+940x530',
44 'timestamp': 1604417980,
47 'params': {'skip_download': 'm3u8'}
49 'note': 'No alt_title',
50 'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
51 'only_matching': True,
53 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
54 'only_matching': True,
56 'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
57 'only_matching': True,
59 'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
60 'only_matching': True,
62 'note': 'age-restricted',
63 'url': 'https://www.arte.tv/de/videos/006785-000-A/the-element-of-crime/',
66 'description': 'md5:c2f94fdfefc8a280e4dab68ab96ab0ba',
67 'title': 'The Element of Crime',
68 'timestamp': 1696111200,
70 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/q82dTTfyuCXupPsGxXsd7B/940x530',
71 'upload_date': '20230930',
75 'url': 'https://www.arte.tv/de/videos/085374-003-A/im-hohen-norden-geboren/',
79 'description': 'md5:ab79ec7cc472a93164415b4e4916abf9',
80 'timestamp': 1702872000,
81 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/TnyHBfPxv3v2GEY3suXGZP/940x530',
83 'title': 'Die kurze Zeit der Jugend',
84 'alt_title': 'Im hohen Norden geboren',
85 'upload_date': '20231218',
88 'fr-acc': 'mincount:1',
95 _LANG_MAP
= { # ISO639 -> French abbreviations
102 # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
103 # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
107 _VERSION_CODE_RE
= re
.compile(r
'''(?x)
109 (?P<original_voice>O?)
110 (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
115 (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
119 # all obtained by exhaustive testing
122 'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
123 'PF', 'PM', 'RE', 'WF', 'YT',
125 # with both of the below 'BE' sometimes works, sometimes doesn't
127 'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
128 'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
132 'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
133 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
134 'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
135 'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
136 'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
137 'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
142 def _fix_accessible_subs_locale(subs
):
144 for lang
, sub_formats
in subs
.items():
145 for format
in sub_formats
:
146 if format
.get('url', '').endswith('-MAL.m3u8'):
148 updated_subs
.setdefault(lang
, []).append(format
)
151 def _real_extract(self
, url
):
152 mobj
= self
._match
_valid
_url
(url
)
153 video_id
= mobj
.group('id')
154 lang
= mobj
.group('lang') or mobj
.group('lang_2')
155 langauge_code
= self
._LANG
_MAP
.get(lang
)
157 config
= self
._download
_json
(f
'{self._API_BASE}/config/{lang}/{video_id}', video_id
, headers
={
158 'x-validated-age': '18'
161 geoblocking
= traverse_obj(config
, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
162 if geoblocking
.get('restrictedArea'):
163 raise GeoRestrictedError(f
'Video restricted to {geoblocking["code"]!r}',
164 countries
=self
._COUNTRIES
_MAP
.get(geoblocking
['code'], ('DE', 'FR')))
166 if not traverse_obj(config
, ('data', 'attributes', 'rights')):
167 # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
168 # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
169 raise ExtractorError(
170 'Video is not available in this language edition of Arte or broadcast rights expired', expected
=True)
172 formats
, subtitles
= [], {}
173 secondary_formats
= []
174 for stream
in config
['data']['attributes']['streams']:
175 # official player contains code like `e.get("versions")[0].eStat.ml5`
176 stream_version
= stream
['versions'][0]
177 stream_version_code
= stream_version
['eStat']['ml5']
180 m
= self
._VERSION
_CODE
_RE
.match(stream_version_code
)
182 lang_pref
= int(''.join('01'[x
] for x
in (
183 m
.group('vlang') == langauge_code
, # we prefer voice in the requested language
184 not m
.group('audio_desc'), # and not the audio description version
185 bool(m
.group('original_voice')), # but if voice is not in the requested language, at least choose the original voice
186 m
.group('sub_lang') == langauge_code
, # if subtitles are present, we prefer them in the requested language
187 not m
.group('has_sub'), # but we prefer no subtitles otherwise
188 not m
.group('sdh_sub'), # and we prefer not the hard-of-hearing subtitles if there are subtitles
191 short_label
= traverse_obj(stream_version
, 'shortLabel', expected_type
=str, default
='?')
192 if 'HLS' in stream
['protocol']:
193 fmts
, subs
= self
._extract
_m
3u8_formats
_and
_subtitles
(
194 stream
['url'], video_id
=video_id
, ext
='mp4', m3u8_id
=stream_version_code
, fatal
=False)
197 'format_note': f
'{stream_version.get("label", "unknown")} [{short_label}]',
198 'language_preference': lang_pref
,
200 if any(map(short_label
.startswith
, ('cc', 'OGsub'))):
201 secondary_formats
.extend(fmts
)
204 subs
= self
._fix
_accessible
_subs
_locale
(subs
)
205 self
._merge
_subtitles
(subs
, target
=subtitles
)
207 elif stream
['protocol'] in ('HTTPS', 'RTMP'):
209 'format_id': f
'{stream["protocol"]}-{stream_version_code}',
210 'url': stream
['url'],
211 'format_note': f
'{stream_version.get("label", "unknown")} [{short_label}]',
212 'language_preference': lang_pref
,
213 # 'ext': 'mp4', # XXX: may or may not be necessary, at least for HTTPS
217 self
.report_warning(f
'Skipping stream with unknown protocol {stream["protocol"]}')
219 formats
.extend(secondary_formats
)
220 self
._remove
_duplicate
_formats
(formats
)
222 metadata
= config
['data']['attributes']['metadata']
225 'id': metadata
['providerId'],
226 'webpage_url': traverse_obj(metadata
, ('link', 'url')),
227 'title': traverse_obj(metadata
, 'subtitle', 'title'),
228 'alt_title': metadata
.get('subtitle') and metadata
.get('title'),
229 'description': metadata
.get('description'),
230 'duration': traverse_obj(metadata
, ('duration', 'seconds')),
231 'language': metadata
.get('language'),
232 'timestamp': traverse_obj(config
, ('data', 'attributes', 'rights', 'begin'), expected_type
=parse_iso8601
),
233 'is_live': config
['data']['attributes'].get('live', False),
235 'subtitles': subtitles
,
237 {'url': image['url'], 'id': image.get('caption')}
238 for image
in metadata
.get('images') or [] if url_or_none(image
.get('url'))
240 # TODO: chapters may also be in stream['segments']?
241 'chapters': traverse_obj(config
, ('data', 'attributes', 'chapters', 'elements', ..., {
242 'start_time': 'startTime',
248 class ArteTVEmbedIE(InfoExtractor
):
249 _VALID_URL
= r
'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
250 _EMBED_REGEX
= [r
'<(?:iframe|script)[^>]+src=(["\'])(?P
<url
>(?
:https?
:)?
//(?
:www\
.)?arte\
.tv
/player
/v\d
+/index\
.php
\?.*?
\bjson
_url
=.+?
)\
1']
252 'url
': 'https
://www
.arte
.tv
/player
/v5
/index
.php?json_url
=https
%3A
%2F
%2Fapi
.arte
.tv
%2Fapi
%2Fplayer
%2Fv2
%2Fconfig
%2Fde
%2F100605
-013-A
&lang
=de
&autoplay
=true
&mute
=0100605-013-A
',
254 'id': '100605-013-A
',
256 'title
': 'United we Stream November Lockdown Edition
#13',
257 'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
258 'upload_date': '20201116',
260 'skip': 'No video available'
262 'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
263 'only_matching': True,
266 def _real_extract(self
, url
):
268 json_url
= qs
['json_url'][0]
269 video_id
= ArteTVIE
._match
_id
(json_url
)
270 return self
.url_result(
271 json_url
, ie
=ArteTVIE
.ie_key(), video_id
=video_id
)
274 class ArteTVPlaylistIE(ArteTVBaseIE
):
275 _VALID_URL
= r
'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE
._ARTE
_LANGUAGES
277 'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
278 'only_matching': True,
280 'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
281 'playlist_mincount': 100,
283 'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
285 'title': 'ARTE Reportage - najlepsze reportaże',
289 def _real_extract(self
, url
):
290 lang
, playlist_id
= self
._match
_valid
_url
(url
).group('lang', 'id')
291 playlist
= self
._download
_json
(
292 f
'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id
)['data']['attributes']
295 '_type': 'url_transparent',
296 'url': video
['config']['url'],
297 'ie_key': ArteTVIE
.ie_key(),
298 'id': video
.get('providerId'),
299 'title': video
.get('title'),
300 'alt_title': video
.get('subtitle'),
301 'thumbnail': url_or_none(traverse_obj(video
, ('mainImage', 'url'))),
302 'duration': int_or_none(traverse_obj(video
, ('duration', 'seconds'))),
303 } for video
in traverse_obj(playlist
, ('items', lambda _
, v
: v
['config']['url']))]
305 return self
.playlist_result(entries
, playlist_id
,
306 traverse_obj(playlist
, ('metadata', 'title')),
307 traverse_obj(playlist
, ('metadata', 'description')))
310 class ArteTVCategoryIE(ArteTVBaseIE
):
311 _VALID_URL
= r
'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE
._ARTE
_LANGUAGES
313 'url': 'https://www.arte.tv/en/videos/politics-and-society/',
315 'id': 'politics-and-society',
316 'title': 'Politics and society',
317 'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
319 'playlist_mincount': 13,
323 def suitable(cls
, url
):
325 not any(ie
.suitable(url
) for ie
in (ArteTVIE
, ArteTVPlaylistIE
, ))
326 and super().suitable(url
))
328 def _real_extract(self
, url
):
329 lang
, playlist_id
= self
._match
_valid
_url
(url
).groups()
330 webpage
= self
._download
_webpage
(url
, playlist_id
)
333 for video
in re
.finditer(
334 r
'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|
\b)(?P
<url
>https?
://www\
.arte\
.tv
/%s/videos
/[\w
/-]+)(?P
=q
)' % lang,
336 video = video.group('url
')
339 if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
342 title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|
', 1)[0]) or None
344 return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
345 description=self._og_search_description(webpage, default=None))