3 from .common
import InfoExtractor
class ArteTVBaseIE(InfoExtractor):
    # Root of the ARTE player API; subclasses append `/config/...` or
    # `/playlist/...` to it when requesting JSON.
    _API_BASE = 'https://api.arte.tv/api/player/v2'

    # Regex alternation of the site's language editions; it is interpolated
    # into the `_VALID_URL` patterns of the extractors derived from this class.
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
21 class ArteTVIE(ArteTVBaseIE
):
25 (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
26 api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
29 /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
30 ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
32 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
33 'only_matching': True,
35 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
38 'title': 'USA: Dyskryminacja na porodówce',
39 'description': 'md5:242017b7cce59ffae340a54baefcafb1',
40 'alt_title': 'ARTE Reportage',
41 'upload_date': '20201103',
43 'thumbnail': r
're:https://api-cdn\.arte\.tv/.+940x530',
44 'timestamp': 1604417980,
47 'params': {'skip_download': 'm3u8'}
49 'note': 'No alt_title',
50 'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
54 'upload_date': '20220718',
56 'timestamp': 1658162460,
57 'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
58 'title': 'La chaleur, supplice des arbres de rue',
59 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
61 'params': {'skip_download': 'm3u8'}
63 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
64 'only_matching': True,
66 'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
67 'only_matching': True,
72 _LANG_MAP
= { # ISO639 -> French abbreviations
79 # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
80 # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
84 _VERSION_CODE_RE
= re
.compile(r
'''(?x)
86 (?P<original_voice>O?)
87 (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
92 (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
96 # all obtained by exhaustive testing
99 'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
100 'PF', 'PM', 'RE', 'WF', 'YT',
102 # with both of the below 'BE' sometimes works, sometimes doesn't
104 'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
105 'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
109 'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
110 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
111 'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
112 'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
113 'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
114 'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
def _real_extract(self, url):
    """Extract one ARTE programme (or the live stream) through the player config API.

    The language edition and the programme id are taken from the URL, the
    matching player configuration is downloaded, and every advertised
    stream is turned into format entries.

    Returns:
        An info dict with the programme metadata, ``formats``,
        ``subtitles`` and ``thumbnails``.

    Raises:
        GeoRestrictedError: the configuration flags the video as restricted
            to a geographic area.
        ExtractorError: the configuration carries no ``rights`` entry
            (not available in this language edition, or rights expired).
    """
    mobj = self._match_valid_url(url)
    video_id = mobj.group('id')
    # Website URLs capture the language as `lang`, API config URLs as `lang_2`
    lang = mobj.group('lang') or mobj.group('lang_2')
    language_code = self._LANG_MAP.get(lang)

    config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)

    geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
    if geoblocking.get('restrictedArea'):
        raise GeoRestrictedError(
            f'Video restricted to {geoblocking["code"]!r}',
            countries=self._COUNTRIES_MAP.get(geoblocking['code'], ('DE', 'FR')))

    if not traverse_obj(config, ('data', 'attributes', 'rights')):
        # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
        # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
        raise ExtractorError(
            'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

    formats, subtitles = [], {}
    # Formats whose short label starts with 'cc' or 'OGsub' are collected here
    # and appended after all other formats once every stream has been processed
    secondary_formats = []
    for stream in config['data']['attributes']['streams']:
        # official player contains code like `e.get("versions")[0].eStat.ml5`
        stream_version = stream['versions'][0]
        stream_version_code = stream_version['eStat']['ml5']

        # Fallback for version codes the regex cannot parse: lower than any
        # value a parsed code can produce (those are always >= 0)
        lang_pref = -1
        m = self._VERSION_CODE_RE.match(stream_version_code)
        if m:
            # Each criterion contributes one digit ('0' or '1'); the digits are
            # read as a decimal number, so earlier criteria weigh more
            lang_pref = int(''.join('01'[x] for x in (
                m.group('vlang') == language_code,      # we prefer voice in the requested language
                not m.group('audio_desc'),              # and not the audio description version
                bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
                m.group('sub_lang') == language_code,   # if subtitles are present, we prefer them in the requested language
                not m.group('has_sub'),                 # but we prefer no subtitles otherwise
                not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
            )))

        short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?')
        if stream['protocol'].startswith('HLS'):
            fmts, subs = self._extract_m3u8_formats_and_subtitles(
                stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
            for fmt in fmts:
                fmt.update({
                    'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
                    'language_preference': lang_pref,
                })
            if any(map(short_label.startswith, ('cc', 'OGsub'))):
                secondary_formats.extend(fmts)
            else:
                formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)

        elif stream['protocol'] in ('HTTPS', 'RTMP'):
            formats.append({
                'format_id': f'{stream["protocol"]}-{stream_version_code}',
                'url': stream['url'],
                'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]',
                'language_preference': lang_pref,
                # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
            })

        else:
            self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')

        # TODO: chapters from stream['segments']?
        # The JS also looks for chapters in config['data']['attributes']['chapters'],
        # but I am yet to find a video having those

    formats.extend(secondary_formats)
    self._remove_duplicate_formats(formats)
    self._sort_formats(formats)

    metadata = config['data']['attributes']['metadata']

    return {
        'id': metadata['providerId'],
        'webpage_url': traverse_obj(metadata, ('link', 'url')),
        # When a subtitle exists it becomes the title and the original title
        # moves to alt_title; without a subtitle there is no alt_title
        'title': traverse_obj(metadata, 'subtitle', 'title'),
        'alt_title': metadata.get('subtitle') and metadata.get('title'),
        'description': metadata.get('description'),
        'duration': traverse_obj(metadata, ('duration', 'seconds')),
        'language': metadata.get('language'),
        'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
        'is_live': config['data']['attributes'].get('live', False),
        'formats': formats,
        'subtitles': subtitles,
        'thumbnails': [
            {'url': image['url'], 'id': image.get('caption')}
            for image in metadata.get('images') or [] if url_or_none(image.get('url'))
        ],
    }
212 class ArteTVEmbedIE(InfoExtractor
):
213 _VALID_URL
= r
'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
214 _EMBED_REGEX
= [r
'<(?:iframe|script)[^>]+src=(["\'])(?P
<url
>(?
:https?
:)?
//(?
:www\
.)?arte\
.tv
/player
/v\d
+/index\
.php
\?.*?
\bjson
_url
=.+?
)\
1']
216 'url
': 'https
://www
.arte
.tv
/player
/v5
/index
.php?json_url
=https
%3A
%2F
%2Fapi
.arte
.tv
%2Fapi
%2Fplayer
%2Fv2
%2Fconfig
%2Fde
%2F100605
-013-A
&lang
=de
&autoplay
=true
&mute
=0100605-013-A
',
218 'id': '100605-013-A
',
220 'title
': 'United we Stream November Lockdown Edition
#13',
221 'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
222 'upload_date': '20201116',
224 'skip': 'No video available'
226 'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
227 'only_matching': True,
def _real_extract(self, url):
    """Resolve an embedded-player URL to the API config URL it wraps.

    The player URL carries the real target in its ``json_url`` query
    parameter; extraction is delegated to ``ArteTVIE`` for that URL.

    Returns:
        A ``url_result`` pointing at the decoded ``json_url``.
    """
    # Function-scope import keeps the block self-contained
    import urllib.parse

    # parse_qs percent-decodes the values, so an encoded json_url comes back
    # as a plain URL; each key maps to a list of values
    qs = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
    json_url = qs['json_url'][0]
    video_id = ArteTVIE._match_id(json_url)
    return self.url_result(json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
238 class ArteTVPlaylistIE(ArteTVBaseIE
):
239 _VALID_URL
= r
'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE
._ARTE
_LANGUAGES
241 'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
242 'only_matching': True,
244 'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
245 'playlist_mincount': 100,
247 'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
249 'title': 'ARTE Reportage - najlepsze reportaże',
def _real_extract(self, url):
    """Build a playlist from the ARTE playlist API for an ``RC-xxxxxx`` collection.

    Returns:
        A playlist result whose entries are ``url_transparent`` references
        to the per-video config URLs, handled by ``ArteTVIE``.
    """
    match = self._match_valid_url(url)
    lang, playlist_id = match.group('lang', 'id')

    api_url = f'{self._API_BASE}/playlist/{lang}/{playlist_id}'
    playlist = self._download_json(api_url, playlist_id)['data']['attributes']

    entries = []
    # Only items for which the config URL lookup yields a truthy value are kept
    for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url'])):
        entries.append({
            '_type': 'url_transparent',
            'url': video['config']['url'],
            'ie_key': ArteTVIE.ie_key(),
            'id': video.get('providerId'),
            'title': video.get('title'),
            'alt_title': video.get('subtitle'),
            'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
            'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
        })

    playlist_title = traverse_obj(playlist, ('metadata', 'title'))
    playlist_description = traverse_obj(playlist, ('metadata', 'description'))
    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
274 class ArteTVCategoryIE(ArteTVBaseIE
):
275 _VALID_URL
= r
'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE
._ARTE
_LANGUAGES
277 'url': 'https://www.arte.tv/en/videos/politics-and-society/',
279 'id': 'politics-and-society',
280 'title': 'Politics and society',
281 'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
283 'playlist_mincount': 13,
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    The category pattern is broad enough to also match single-video and
    playlist URLs, so those are left to their dedicated extractors.

    Returns:
        ``False`` when ``ArteTVIE`` or ``ArteTVPlaylistIE`` accepts the URL,
        otherwise whatever the inherited ``suitable`` check returns.
    """
    if any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE)):
        return False
    return super().suitable(url)
def _real_extract(self, url):
    """Collect the video and playlist links found on a category page.

    The page is downloaded and scanned for ``<a href=...>`` links into the
    same language edition; only links that ``ArteTVIE`` or
    ``ArteTVPlaylistIE`` can handle become playlist entries.

    Returns:
        A playlist built from the matching links, titled after the page
        title (text before the last ``|``) and described by the page's
        OpenGraph description when present.
    """
    lang, playlist_id = self._match_valid_url(url).groups()
    webpage = self._download_webpage(url, playlist_id)

    # The href value may be double-quoted, single-quoted or unquoted; the
    # `q` group captures the opening delimiter and `(?P=q)` requires it again
    link_re = r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang
    items = [
        link
        for link in (m.group('url') for m in re.finditer(link_re, webpage))
        if any(ie.suitable(link) for ie in (ArteTVIE, ArteTVPlaylistIE))
    ]

    # Keep the part of the page title before the last '|'; an empty result becomes None
    title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|', 1)[0]) or None

    return self.playlist_from_matches(
        items, playlist_id=playlist_id, playlist_title=title,
        description=self._og_search_description(webpage, default=None))