3 from .common
import InfoExtractor
class ArteTVBaseIE(InfoExtractor):
    """Constants shared by the Arte extractors defined in this module."""

    # Regex alternation of the site's language codes; it is interpolated
    # into the URL patterns of the extractors that derive from this class.
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
    # Root of the player API; callers append `/config/...` or `/playlist/...`.
    _API_BASE = 'https://api.arte.tv/api/player/v2'
21 class ArteTVIE(ArteTVBaseIE
):
25 (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
26 api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
29 /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
30 ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
32 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
33 'only_matching': True,
35 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
38 'title': 'USA: Dyskryminacja na porodówce',
39 'description': 'md5:242017b7cce59ffae340a54baefcafb1',
40 'alt_title': 'ARTE Reportage',
41 'upload_date': '20201103',
43 'thumbnail': r
're:https://api-cdn\.arte\.tv/.+940x530',
44 'timestamp': 1604417980,
47 'params': {'skip_download': 'm3u8'}
49 'note': 'No alt_title',
50 'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
54 'upload_date': '20220718',
56 'timestamp': 1658162460,
57 'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
58 'title': 'La chaleur, supplice des arbres de rue',
59 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
61 'params': {'skip_download': 'm3u8'}
63 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
64 'only_matching': True,
66 'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
67 'only_matching': True,
72 _LANG_MAP
= { # ISO639 -> French abbreviations
79 # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
80 # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
84 _VERSION_CODE_RE
= re
.compile(r
'''(?x)
86 (?P<original_voice>O?)
87 (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
92 (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
96 # all obtained by exhaustive testing
99 'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
100 'PF', 'PM', 'RE', 'WF', 'YT',
102 # with both of the below 'BE' sometimes works, sometimes doesn't
104 'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
105 'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
109 'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
110 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
111 'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
112 'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
113 'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
114 'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
118 def _real_extract(self
, url
):
119 mobj
= self
._match
_valid
_url
(url
)
120 video_id
= mobj
.group('id')
121 lang
= mobj
.group('lang') or mobj
.group('lang_2')
122 langauge_code
= self
._LANG
_MAP
.get(lang
)
124 config
= self
._download
_json
(f
'{self._API_BASE}/config/{lang}/{video_id}', video_id
)
126 geoblocking
= traverse_obj(config
, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
127 if geoblocking
.get('restrictedArea'):
128 raise GeoRestrictedError(f
'Video restricted to {geoblocking["code"]!r}',
129 countries
=self
._COUNTRIES
_MAP
.get(geoblocking
['code'], ('DE', 'FR')))
131 if not traverse_obj(config
, ('data', 'attributes', 'rights')):
132 # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
133 # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
134 raise ExtractorError(
135 'Video is not available in this language edition of Arte or broadcast rights expired', expected
=True)
137 formats
, subtitles
= [], {}
138 secondary_formats
= []
139 for stream
in config
['data']['attributes']['streams']:
140 # official player contains code like `e.get("versions")[0].eStat.ml5`
141 stream_version
= stream
['versions'][0]
142 stream_version_code
= stream_version
['eStat']['ml5']
145 m
= self
._VERSION
_CODE
_RE
.match(stream_version_code
)
147 lang_pref
= int(''.join('01'[x
] for x
in (
148 m
.group('vlang') == langauge_code
, # we prefer voice in the requested language
149 not m
.group('audio_desc'), # and not the audio description version
150 bool(m
.group('original_voice')), # but if voice is not in the requested language, at least choose the original voice
151 m
.group('sub_lang') == langauge_code
, # if subtitles are present, we prefer them in the requested language
152 not m
.group('has_sub'), # but we prefer no subtitles otherwise
153 not m
.group('sdh_sub'), # and we prefer not the hard-of-hearing subtitles if there are subtitles
156 short_label
= traverse_obj(stream_version
, 'shortLabel', expected_type
=str, default
='?')
157 if stream
['protocol'].startswith('HLS'):
158 fmts
, subs
= self
._extract
_m
3u8_formats
_and
_subtitles
(
159 stream
['url'], video_id
=video_id
, ext
='mp4', m3u8_id
=stream_version_code
, fatal
=False)
162 'format_note': f
'{stream_version.get("label", "unknown")} [{short_label}]',
163 'language_preference': lang_pref
,
165 if any(map(short_label
.startswith
, ('cc', 'OGsub'))):
166 secondary_formats
.extend(fmts
)
169 self
._merge
_subtitles
(subs
, target
=subtitles
)
171 elif stream
['protocol'] in ('HTTPS', 'RTMP'):
173 'format_id': f
'{stream["protocol"]}-{stream_version_code}',
174 'url': stream
['url'],
175 'format_note': f
'{stream_version.get("label", "unknown")} [{short_label}]',
176 'language_preference': lang_pref
,
177 # 'ext': 'mp4', # XXX: may or may not be necessary, at least for HTTPS
181 self
.report_warning(f
'Skipping stream with unknown protocol {stream["protocol"]}')
183 # TODO: chapters from stream['segments']?
184 # The JS also looks for chapters in config['data']['attributes']['chapters'],
185 # but I am yet to find a video having those
187 formats
.extend(secondary_formats
)
188 self
._remove
_duplicate
_formats
(formats
)
190 metadata
= config
['data']['attributes']['metadata']
193 'id': metadata
['providerId'],
194 'webpage_url': traverse_obj(metadata
, ('link', 'url')),
195 'title': traverse_obj(metadata
, 'subtitle', 'title'),
196 'alt_title': metadata
.get('subtitle') and metadata
.get('title'),
197 'description': metadata
.get('description'),
198 'duration': traverse_obj(metadata
, ('duration', 'seconds')),
199 'language': metadata
.get('language'),
200 'timestamp': traverse_obj(config
, ('data', 'attributes', 'rights', 'begin'), expected_type
=parse_iso8601
),
201 'is_live': config
['data']['attributes'].get('live', False),
203 'subtitles': subtitles
,
205 {'url': image['url'], 'id': image.get('caption')}
206 for image
in metadata
.get('images') or [] if url_or_none(image
.get('url'))
211 class ArteTVEmbedIE(InfoExtractor
):
212 _VALID_URL
= r
'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
213 _EMBED_REGEX
= [r
'<(?:iframe|script)[^>]+src=(["\'])(?P
<url
>(?
:https?
:)?
//(?
:www\
.)?arte\
.tv
/player
/v\d
+/index\
.php
\?.*?
\bjson
_url
=.+?
)\
1']
215 'url
': 'https
://www
.arte
.tv
/player
/v5
/index
.php?json_url
=https
%3A
%2F
%2Fapi
.arte
.tv
%2Fapi
%2Fplayer
%2Fv2
%2Fconfig
%2Fde
%2F100605
-013-A
&lang
=de
&autoplay
=true
&mute
=0100605-013-A
',
217 'id': '100605-013-A
',
219 'title
': 'United we Stream November Lockdown Edition
#13',
220 'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
221 'upload_date': '20201116',
223 'skip': 'No video available'
225 'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
226 'only_matching': True,
def _real_extract(self, url):
    """Delegate an embedded-player URL to ArteTVIE.

    The embed URL carries the player-config API URL in its ``json_url``
    query parameter (``_VALID_URL`` requires it); that API URL is handed
    to ArteTVIE together with the video id parsed out of it.

    :param url: embed URL matched by ``_VALID_URL``
    :return: the result of ``self.url_result`` for ``json_url``
    """
    # Function-scope import: the module's import block is not visible from
    # this block, so the dependency is kept local.
    import urllib.parse

    # parse_qs maps every parameter name to a list of percent-decoded
    # values, hence the [0] below.
    qs = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
    json_url = qs['json_url'][0]
    video_id = ArteTVIE._match_id(json_url)
    return self.url_result(
        json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
237 class ArteTVPlaylistIE(ArteTVBaseIE
):
238 _VALID_URL
= r
'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE
._ARTE
_LANGUAGES
240 'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
241 'only_matching': True,
243 'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
244 'playlist_mincount': 100,
246 'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
248 'title': 'ARTE Reportage - najlepsze reportaże',
def _real_extract(self, url):
    """Build a playlist from the player API's playlist endpoint.

    The language and playlist id are taken from the URL, the playlist
    document is fetched from ``{_API_BASE}/playlist/{lang}/{id}``, and each
    item becomes a ``url_transparent`` entry that ArteTVIE resolves.

    :param url: playlist URL matched by ``_VALID_URL``
    :return: the result of ``self.playlist_result`` with the entries, the
        playlist id, and the title/description read from the playlist's
        ``metadata`` object
    """
    lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
    playlist = self._download_json(
        f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)['data']['attributes']

    # The lambda filters items on their config URL.
    # NOTE(review): this relies on traverse_obj treating a failing or falsy
    # filter as "drop the item" - confirm against the helper's documentation.
    entries = [{
        '_type': 'url_transparent',
        'url': video['config']['url'],
        'ie_key': ArteTVIE.ie_key(),
        'id': video.get('providerId'),
        'title': video.get('title'),
        'alt_title': video.get('subtitle'),
        'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
        'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
    } for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))]

    return self.playlist_result(entries, playlist_id,
                                traverse_obj(playlist, ('metadata', 'title')),
                                traverse_obj(playlist, ('metadata', 'description')))
273 class ArteTVCategoryIE(ArteTVBaseIE
):
274 _VALID_URL
= r
'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE
._ARTE
_LANGUAGES
276 'url': 'https://www.arte.tv/en/videos/politics-and-society/',
278 'id': 'politics-and-society',
279 'title': 'Politics and society',
280 'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
282 'playlist_mincount': 13,
@classmethod
def suitable(cls, url):
    """Return whether this extractor should handle *url*.

    The category URL pattern is broad enough to also match single-video
    and playlist URLs, so this extractor steps aside whenever ArteTVIE or
    ArteTVPlaylistIE claims the URL, and otherwise falls back to the
    inherited ``suitable`` check.

    :param url: candidate URL
    :return: the negated "claimed by a more specific extractor" check
        and-ed with the inherited ``suitable`` result
    """
    return (
        not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
        and super().suitable(url))
291 def _real_extract(self
, url
):
292 lang
, playlist_id
= self
._match
_valid
_url
(url
).groups()
293 webpage
= self
._download
_webpage
(url
, playlist_id
)
296 for video
in re
.finditer(
297 r
'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|
\b)(?P
<url
>https?
://www\
.arte\
.tv
/%s/videos
/[\w
/-]+)(?P
=q
)' % lang,
299 video = video.group('url
')
302 if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
305 title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|
', 1)[0]) or None
307 return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
308 description=self._og_search_description(webpage, default=None))