3 from .common
import InfoExtractor
class ArteTVBaseIE(InfoExtractor):
    # Constants shared by the Arte extractors defined in this file.

    # '|'-separated language codes; interpolated into the URL patterns of
    # ArteTVIE, ArteTVPlaylistIE and ArteTVCategoryIE.
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'

    # Prefix from which the `/config/...` and `/playlist/...` request URLs
    # are built.
    _API_BASE = 'https://api.arte.tv/api/player/v2'
21 class ArteTVIE(ArteTVBaseIE
):
25 (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
26 api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
29 /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
30 ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
32 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
33 'only_matching': True,
35 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
38 'title': 'USA: Dyskryminacja na porodówce',
39 'description': 'md5:242017b7cce59ffae340a54baefcafb1',
40 'alt_title': 'ARTE Reportage',
41 'upload_date': '20201103',
43 'thumbnail': r
're:https://api-cdn\.arte\.tv/.+940x530',
44 'timestamp': 1604417980,
47 'params': {'skip_download': 'm3u8'}
49 'note': 'No alt_title',
50 'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
51 'only_matching': True,
53 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
54 'only_matching': True,
56 'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
57 'only_matching': True,
59 'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
60 'only_matching': True,
62 'note': 'age-restricted',
63 'url': 'https://www.arte.tv/de/videos/006785-000-A/the-element-of-crime/',
66 'description': 'md5:c2f94fdfefc8a280e4dab68ab96ab0ba',
67 'title': 'The Element of Crime',
68 'timestamp': 1696111200,
70 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/q82dTTfyuCXupPsGxXsd7B/940x530',
71 'upload_date': '20230930',
78 _LANG_MAP
= { # ISO639 -> French abbreviations
85 # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
86 # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
90 _VERSION_CODE_RE
= re
.compile(r
'''(?x)
92 (?P<original_voice>O?)
93 (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
98 (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
102 # all obtained by exhaustive testing
105 'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
106 'PF', 'PM', 'RE', 'WF', 'YT',
108 # with both of the below 'BE' sometimes works, sometimes doesn't
110 'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
111 'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
115 'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
116 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
117 'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
118 'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
119 'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
120 'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
124 def _real_extract(self
, url
):
125 mobj
= self
._match
_valid
_url
(url
)
126 video_id
= mobj
.group('id')
127 lang
= mobj
.group('lang') or mobj
.group('lang_2')
128 langauge_code
= self
._LANG
_MAP
.get(lang
)
130 config
= self
._download
_json
(f
'{self._API_BASE}/config/{lang}/{video_id}', video_id
, headers
={
131 'x-validated-age': '18'
134 geoblocking
= traverse_obj(config
, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
135 if geoblocking
.get('restrictedArea'):
136 raise GeoRestrictedError(f
'Video restricted to {geoblocking["code"]!r}',
137 countries
=self
._COUNTRIES
_MAP
.get(geoblocking
['code'], ('DE', 'FR')))
139 if not traverse_obj(config
, ('data', 'attributes', 'rights')):
140 # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
141 # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
142 raise ExtractorError(
143 'Video is not available in this language edition of Arte or broadcast rights expired', expected
=True)
145 formats
, subtitles
= [], {}
146 secondary_formats
= []
147 for stream
in config
['data']['attributes']['streams']:
148 # official player contains code like `e.get("versions")[0].eStat.ml5`
149 stream_version
= stream
['versions'][0]
150 stream_version_code
= stream_version
['eStat']['ml5']
153 m
= self
._VERSION
_CODE
_RE
.match(stream_version_code
)
155 lang_pref
= int(''.join('01'[x
] for x
in (
156 m
.group('vlang') == langauge_code
, # we prefer voice in the requested language
157 not m
.group('audio_desc'), # and not the audio description version
158 bool(m
.group('original_voice')), # but if voice is not in the requested language, at least choose the original voice
159 m
.group('sub_lang') == langauge_code
, # if subtitles are present, we prefer them in the requested language
160 not m
.group('has_sub'), # but we prefer no subtitles otherwise
161 not m
.group('sdh_sub'), # and we prefer not the hard-of-hearing subtitles if there are subtitles
164 short_label
= traverse_obj(stream_version
, 'shortLabel', expected_type
=str, default
='?')
165 if 'HLS' in stream
['protocol']:
166 fmts
, subs
= self
._extract
_m
3u8_formats
_and
_subtitles
(
167 stream
['url'], video_id
=video_id
, ext
='mp4', m3u8_id
=stream_version_code
, fatal
=False)
170 'format_note': f
'{stream_version.get("label", "unknown")} [{short_label}]',
171 'language_preference': lang_pref
,
173 if any(map(short_label
.startswith
, ('cc', 'OGsub'))):
174 secondary_formats
.extend(fmts
)
177 self
._merge
_subtitles
(subs
, target
=subtitles
)
179 elif stream
['protocol'] in ('HTTPS', 'RTMP'):
181 'format_id': f
'{stream["protocol"]}-{stream_version_code}',
182 'url': stream
['url'],
183 'format_note': f
'{stream_version.get("label", "unknown")} [{short_label}]',
184 'language_preference': lang_pref
,
185 # 'ext': 'mp4', # XXX: may or may not be necessary, at least for HTTPS
189 self
.report_warning(f
'Skipping stream with unknown protocol {stream["protocol"]}')
191 formats
.extend(secondary_formats
)
192 self
._remove
_duplicate
_formats
(formats
)
194 metadata
= config
['data']['attributes']['metadata']
197 'id': metadata
['providerId'],
198 'webpage_url': traverse_obj(metadata
, ('link', 'url')),
199 'title': traverse_obj(metadata
, 'subtitle', 'title'),
200 'alt_title': metadata
.get('subtitle') and metadata
.get('title'),
201 'description': metadata
.get('description'),
202 'duration': traverse_obj(metadata
, ('duration', 'seconds')),
203 'language': metadata
.get('language'),
204 'timestamp': traverse_obj(config
, ('data', 'attributes', 'rights', 'begin'), expected_type
=parse_iso8601
),
205 'is_live': config
['data']['attributes'].get('live', False),
207 'subtitles': subtitles
,
209 {'url': image['url'], 'id': image.get('caption')}
210 for image
in metadata
.get('images') or [] if url_or_none(image
.get('url'))
212 # TODO: chapters may also be in stream['segments']?
213 'chapters': traverse_obj(config
, ('data', 'attributes', 'chapters', 'elements', ..., {
214 'start_time': 'startTime',
220 class ArteTVEmbedIE(InfoExtractor
):
221 _VALID_URL
= r
'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
222 _EMBED_REGEX
= [r
'<(?:iframe|script)[^>]+src=(["\'])(?P
<url
>(?
:https?
:)?
//(?
:www\
.)?arte\
.tv
/player
/v\d
+/index\
.php
\?.*?
\bjson
_url
=.+?
)\
1']
224 'url
': 'https
://www
.arte
.tv
/player
/v5
/index
.php?json_url
=https
%3A
%2F
%2Fapi
.arte
.tv
%2Fapi
%2Fplayer
%2Fv2
%2Fconfig
%2Fde
%2F100605
-013-A
&lang
=de
&autoplay
=true
&mute
=0100605-013-A
',
226 'id': '100605-013-A
',
228 'title
': 'United we Stream November Lockdown Edition
#13',
229 'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
230 'upload_date': '20201116',
232 'skip': 'No video available'
234 'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
235 'only_matching': True,
238 def _real_extract(self
, url
):
240 json_url
= qs
['json_url'][0]
241 video_id
= ArteTVIE
._match
_id
(json_url
)
242 return self
.url_result(
243 json_url
, ie
=ArteTVIE
.ie_key(), video_id
=video_id
)
246 class ArteTVPlaylistIE(ArteTVBaseIE
):
247 _VALID_URL
= r
'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE
._ARTE
_LANGUAGES
249 'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
250 'only_matching': True,
252 'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
253 'playlist_mincount': 100,
255 'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
257 'title': 'ARTE Reportage - najlepsze reportaże',
def _real_extract(self, url):
    """Build a playlist result from the playlist API response for `url`.

    The language and playlist id are taken from the matched URL, the
    playlist JSON is downloaded, and every selected item becomes a
    `url_transparent` entry handled by ArteTVIE.
    """
    lang, playlist_id = self._match_valid_url(url).group('lang', 'id')

    api_url = f'{self._API_BASE}/playlist/{lang}/{playlist_id}'
    playlist = self._download_json(api_url, playlist_id)['data']['attributes']

    # Items are selected through their player config URL
    # (presumably this drops items that lack one -- confirm against
    # traverse_obj's handling of callable keys).
    videos = traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))

    entries = []
    for video in videos:
        thumbnail = url_or_none(traverse_obj(video, ('mainImage', 'url')))
        duration = int_or_none(traverse_obj(video, ('duration', 'seconds')))
        entries.append({
            '_type': 'url_transparent',
            'url': video['config']['url'],
            'ie_key': ArteTVIE.ie_key(),
            'id': video.get('providerId'),
            'title': video.get('title'),
            'alt_title': video.get('subtitle'),
            'thumbnail': thumbnail,
            'duration': duration,
        })

    playlist_title = traverse_obj(playlist, ('metadata', 'title'))
    playlist_description = traverse_obj(playlist, ('metadata', 'description'))
    return self.playlist_result(
        entries, playlist_id, playlist_title, playlist_description)
282 class ArteTVCategoryIE(ArteTVBaseIE
):
283 _VALID_URL
= r
'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE
._ARTE
_LANGUAGES
285 'url': 'https://www.arte.tv/en/videos/politics-and-society/',
287 'id': 'politics-and-society',
288 'title': 'Politics and society',
289 'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
291 'playlist_mincount': 13,
@classmethod
def suitable(cls, url):
    """Return whether this extractor should handle `url`.

    URLs that ArteTVIE or ArteTVPlaylistIE accept are rejected here, so
    those extractors take precedence; anything else is decided by the
    inherited `suitable` check.
    """
    for ie in (ArteTVIE, ArteTVPlaylistIE):
        if ie.suitable(url):
            return False
    return super().suitable(url)
300 def _real_extract(self
, url
):
301 lang
, playlist_id
= self
._match
_valid
_url
(url
).groups()
302 webpage
= self
._download
_webpage
(url
, playlist_id
)
305 for video
in re
.finditer(
306 r
'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|
\b)(?P
<url
>https?
://www\
.arte\
.tv
/%s/videos
/[\w
/-]+)(?P
=q
)' % lang,
308 video = video.group('url
')
311 if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
314 title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|
', 1)[0]) or None
316 return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
317 description=self._og_search_description(webpage, default=None))