3 from .common
import InfoExtractor
class ArteTVBaseIE(InfoExtractor):
    """Shared constants for the ARTE extractors in this module."""

    # Alternation of site language codes, interpolated into the subclasses'
    # _VALID_URL patterns as a regex fragment.
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
    # Root of the player API; subclasses append '/config/...' or '/playlist/...'.
    _API_BASE = 'https://api.arte.tv/api/player/v2'
21 class ArteTVIE(ArteTVBaseIE
):
25 (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
26 api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
29 /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
30 ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
32 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
33 'only_matching': True,
35 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
38 'title': 'USA: Dyskryminacja na porodówce',
39 'description': 'md5:242017b7cce59ffae340a54baefcafb1',
40 'alt_title': 'ARTE Reportage',
41 'upload_date': '20201103',
43 'thumbnail': r
're:https://api-cdn\.arte\.tv/.+940x530',
44 'timestamp': 1604417980,
47 'params': {'skip_download': 'm3u8'}
49 'note': 'No alt_title',
50 'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
54 'upload_date': '20220718',
56 'timestamp': 1658162460,
57 'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
58 'title': 'La chaleur, supplice des arbres de rue',
59 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
61 'params': {'skip_download': 'm3u8'}
63 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
64 'only_matching': True,
66 'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
67 'only_matching': True,
69 'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
72 'chapters': 'count:16',
73 'description': 'md5:cf592f1df52fe52007e3f8eac813c084',
75 'title': 'Baloise Session 2022',
76 'timestamp': 1668445200,
78 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/ubQjmVCGyRx3hmBuZEK9QZ/940x530',
79 'upload_date': '20221114',
82 'expected_warnings': ['geo restricted']
87 _LANG_MAP
= { # ISO639 -> French abbreviations
94 # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
95 # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
99 _VERSION_CODE_RE
= re
.compile(r
'''(?x)
101 (?P<original_voice>O?)
102 (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
107 (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
111 # all obtained by exhaustive testing
114 'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
115 'PF', 'PM', 'RE', 'WF', 'YT',
117 # with both of the below 'BE' sometimes works, sometimes doesn't
119 'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
120 'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
124 'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
125 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
126 'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
127 'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
128 'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
129 'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
133 def _real_extract(self
, url
):
134 mobj
= self
._match
_valid
_url
(url
)
135 video_id
= mobj
.group('id')
136 lang
= mobj
.group('lang') or mobj
.group('lang_2')
137 langauge_code
= self
._LANG
_MAP
.get(lang
)
139 config
= self
._download
_json
(f
'{self._API_BASE}/config/{lang}/{video_id}', video_id
)
141 geoblocking
= traverse_obj(config
, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
142 if geoblocking
.get('restrictedArea'):
143 raise GeoRestrictedError(f
'Video restricted to {geoblocking["code"]!r}',
144 countries
=self
._COUNTRIES
_MAP
.get(geoblocking
['code'], ('DE', 'FR')))
146 if not traverse_obj(config
, ('data', 'attributes', 'rights')):
147 # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
148 # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
149 raise ExtractorError(
150 'Video is not available in this language edition of Arte or broadcast rights expired', expected
=True)
152 formats
, subtitles
= [], {}
153 secondary_formats
= []
154 for stream
in config
['data']['attributes']['streams']:
155 # official player contains code like `e.get("versions")[0].eStat.ml5`
156 stream_version
= stream
['versions'][0]
157 stream_version_code
= stream_version
['eStat']['ml5']
160 m
= self
._VERSION
_CODE
_RE
.match(stream_version_code
)
162 lang_pref
= int(''.join('01'[x
] for x
in (
163 m
.group('vlang') == langauge_code
, # we prefer voice in the requested language
164 not m
.group('audio_desc'), # and not the audio description version
165 bool(m
.group('original_voice')), # but if voice is not in the requested language, at least choose the original voice
166 m
.group('sub_lang') == langauge_code
, # if subtitles are present, we prefer them in the requested language
167 not m
.group('has_sub'), # but we prefer no subtitles otherwise
168 not m
.group('sdh_sub'), # and we prefer not the hard-of-hearing subtitles if there are subtitles
171 short_label
= traverse_obj(stream_version
, 'shortLabel', expected_type
=str, default
='?')
172 if stream
['protocol'].startswith('HLS'):
173 fmts
, subs
= self
._extract
_m
3u8_formats
_and
_subtitles
(
174 stream
['url'], video_id
=video_id
, ext
='mp4', m3u8_id
=stream_version_code
, fatal
=False)
177 'format_note': f
'{stream_version.get("label", "unknown")} [{short_label}]',
178 'language_preference': lang_pref
,
180 if any(map(short_label
.startswith
, ('cc', 'OGsub'))):
181 secondary_formats
.extend(fmts
)
184 self
._merge
_subtitles
(subs
, target
=subtitles
)
186 elif stream
['protocol'] in ('HTTPS', 'RTMP'):
188 'format_id': f
'{stream["protocol"]}-{stream_version_code}',
189 'url': stream
['url'],
190 'format_note': f
'{stream_version.get("label", "unknown")} [{short_label}]',
191 'language_preference': lang_pref
,
192 # 'ext': 'mp4', # XXX: may or may not be necessary, at least for HTTPS
196 self
.report_warning(f
'Skipping stream with unknown protocol {stream["protocol"]}')
198 formats
.extend(secondary_formats
)
199 self
._remove
_duplicate
_formats
(formats
)
201 metadata
= config
['data']['attributes']['metadata']
204 'id': metadata
['providerId'],
205 'webpage_url': traverse_obj(metadata
, ('link', 'url')),
206 'title': traverse_obj(metadata
, 'subtitle', 'title'),
207 'alt_title': metadata
.get('subtitle') and metadata
.get('title'),
208 'description': metadata
.get('description'),
209 'duration': traverse_obj(metadata
, ('duration', 'seconds')),
210 'language': metadata
.get('language'),
211 'timestamp': traverse_obj(config
, ('data', 'attributes', 'rights', 'begin'), expected_type
=parse_iso8601
),
212 'is_live': config
['data']['attributes'].get('live', False),
214 'subtitles': subtitles
,
216 {'url': image['url'], 'id': image.get('caption')}
217 for image
in metadata
.get('images') or [] if url_or_none(image
.get('url'))
219 # TODO: chapters may also be in stream['segments']?
220 'chapters': traverse_obj(config
, ('data', 'attributes', 'chapters', 'elements', ..., {
221 'start_time': 'startTime',
class ArteTVEmbedIE(InfoExtractor):
    """Resolve ARTE embedded-player URLs to the player-config URL they wrap.

    The embed URL carries the real target in its ``json_url`` query
    parameter; extraction is delegated to ArteTVIE for that target.
    """

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
    # NOTE(review): the list/dict delimiters and the 'info_dict' wrapper key
    # were restored following yt-dlp's test-table convention; only entries
    # that were visible are listed -- confirm against upstream.
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available'
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a url_result pointing ArteTVIE at the embedded ``json_url``."""
        # Function-local import keeps this block self-contained: the module's
        # import section is not fully visible from here.
        import urllib.parse

        qs = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
        # parse_qs maps each key to a list of values; the first one is used.
        json_url = qs['json_url'][0]
        video_id = ArteTVIE._match_id(json_url)
        return self.url_result(
            json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
class ArteTVPlaylistIE(ArteTVBaseIE):
    """Extract ARTE collections (``RC-nnnnnn`` ids) as playlists."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    # NOTE(review): the list/dict delimiters and the 'info_dict' wrapper key
    # were restored following yt-dlp's test-table convention; only entries
    # that were visible are listed -- confirm against upstream.
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        """Download the playlist document and return a playlist result.

        Each playlist item becomes a ``url_transparent`` entry that hands
        its ``config.url`` to ArteTVIE, with id/title/alt_title/thumbnail/
        duration pre-filled from the playlist item.
        """
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        playlist = self._download_json(
            f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)['data']['attributes']

        # The lambda in the traversal path selects only the items whose
        # config.url is set, so the direct video['config']['url'] lookup
        # below is safe.
        entries = [{
            '_type': 'url_transparent',
            'url': video['config']['url'],
            'ie_key': ArteTVIE.ie_key(),
            'id': video.get('providerId'),
            'title': video.get('title'),
            'alt_title': video.get('subtitle'),
            'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
            'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
        } for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))]

        return self.playlist_result(entries, playlist_id,
                                    traverse_obj(playlist, ('metadata', 'title')),
                                    traverse_obj(playlist, ('metadata', 'description')))
class ArteTVCategoryIE(ArteTVBaseIE):
    """Extract ARTE category pages as playlists of the videos they link to."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    # NOTE(review): the list/dict delimiters and the 'info_dict' wrapper key
    # were restored following yt-dlp's test-table convention -- confirm
    # against upstream.
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept *url* only if neither ArteTVIE nor ArteTVPlaylistIE does.

        This class's _VALID_URL is broad enough to also match single-video
        and collection URLs, so those extractors are given precedence.
        """
        return (
            not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
            and super().suitable(url))

    def _real_extract(self, url):
        """Scrape the category page for video/collection links.

        Returns a playlist built from every same-language
        ``/videos/...`` link on the page that ArteTVIE or ArteTVPlaylistIE
        can handle.
        """
        lang, playlist_id = self._match_valid_url(url).groups()
        webpage = self._download_webpage(url, playlist_id)

        # NOTE(review): two original lines between the `video = ...`
        # assignment and the `if any(...)` filter were not visible and are
        # not reproduced here -- confirm against upstream.
        items = []
        for video in re.finditer(
                r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
                webpage):
            video = video.group('url')
            if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
                items.append(video)

        # Keep only the part of the page title before the last '|' separator.
        title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|', 1)[0]) or None

        return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
                                          description=self._og_search_description(webpage, default=None))