3 from .common
import InfoExtractor
class ArteTVBaseIE(InfoExtractor):
    """Constants shared by the ARTE extractors that subclass this base."""

    # Language codes accepted in ARTE URLs, joined into a regex alternation
    # ('fr|de|en|es|it|pl') so that it can be interpolated into _VALID_URL patterns.
    _ARTE_LANGUAGES = '|'.join(('fr', 'de', 'en', 'es', 'it', 'pl'))

    # Root of the player API (v2); subclasses append '/config/...' or '/playlist/...'.
    _API_BASE = 'https://api.arte.tv/api/player/v2'
class ArteTVIE(ArteTVBaseIE):
    """Extractor for a single ARTE programme (or the LIVE stream).

    The programme is resolved through the player API
    (`{_API_BASE}/config/{lang}/{video_id}`); every stream listed in the
    response becomes one or more formats, ranked by how well its audio and
    subtitle version matches the language requested in the URL.
    """

    # NOTE(review): the scheme prefix and the 'arte://program' alternative were
    # not visible in the reviewed listing and are restored here - confirm.
    _VALID_URL = r'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                    |arte://program)
                        /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}

    # NOTE(review): 'id'/'ext' rows are restored from the URLs and the m3u8
    # extraction below; any further fixture rows that were not visible in the
    # reviewed listing (e.g. durations) are not reproduced - confirm.
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
        'info_dict': {
            'id': '100103-000-A',
            'title': 'USA: Dyskryminacja na porodówce',
            'description': 'md5:242017b7cce59ffae340a54baefcafb1',
            'alt_title': 'ARTE Reportage',
            'upload_date': '20201103',
            'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
            'timestamp': 1604417980,
            'ext': 'mp4',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'info_dict': {
            'id': '110371-000-A',
            'ext': 'mp4',
            'upload_date': '20220718',
            'timestamp': 1658162460,
            'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
            'title': 'La chaleur, supplice des arbres de rue',
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }]

    # NOTE(review): the individual entries were not visible in the reviewed
    # listing; they are restored to agree with _VERSION_CODE_RE - confirm.
    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Splits a stream version code into the named parts read by
    # _language_preference(): original_voice, vlang, audio_desc, has_sub,
    # sdh_sub and sub_lang.
    # NOTE(review): the literal prefix and the audio_desc/has_sub/sdh_sub
    # sub-patterns were not visible in the reviewed listing - confirm.
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # all obtained by exhaustive testing
    # NOTE(review): the key names were not visible in the reviewed listing - confirm.
    _COUNTRIES_MAP = {
        'DE_FR': (
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        ),
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': (
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        ),
        'SAT': (
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        ),
    }

    def _language_preference(self, version_code, language_code):
        """Rank a stream version code against the requested language.

        Returns the integer spelled by six 0/1 digits (most significant
        criterion first), or -1 when `version_code` does not match
        `_VERSION_CODE_RE`, so that an unrecognised code ranks below every
        recognised one instead of raising on a `None` match.
        """
        m = self._VERSION_CODE_RE.match(version_code)
        if not m:
            return -1
        return int(''.join('01'[x] for x in (
            m.group('vlang') == language_code,      # we prefer voice in the requested language
            not m.group('audio_desc'),              # and not the audio description version
            bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
            m.group('sub_lang') == language_code,   # if subtitles are present, we prefer them in the requested language
            not m.group('has_sub'),                 # but we prefer no subtitles otherwise
            not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
        )))

    def _real_extract(self, url):
        """Build the info dict for the programme addressed by `url`.

        Raises GeoRestrictedError when the API response marks the programme as
        restricted to an area, and ExtractorError (expected) when the response
        carries no `rights` entry.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        lang = mobj.group('lang') or mobj.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)

        geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            raise GeoRestrictedError(f'Video restricted to {geoblocking["code"]!r}',
                                     countries=self._COUNTRIES_MAP.get(geoblocking['code'], ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        formats, subtitles = [], {}
        for stream in config['data']['attributes']['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            stream_version = stream['versions'][0]
            stream_version_code = stream_version['eStat']['ml5']

            lang_pref = self._language_preference(stream_version_code, language_code)

            if stream['protocol'].startswith('HLS'):
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
                # Tag every variant of this manifest with the version label and its rank
                for fmt in fmts:
                    fmt.update({
                        'format_note': f'{stream_version.get("label", "unknown")} [{stream_version.get("shortLabel", "?")}]',
                        'language_preference': lang_pref,
                    })
                formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)

            elif stream['protocol'] in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{stream["protocol"]}-{stream_version_code}',
                    'url': stream['url'],
                    'format_note': f'{stream_version.get("label", "unknown")} [{stream_version.get("shortLabel", "?")}]',
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')

            # TODO: chapters from stream['segments']?
            # The JS also looks for chapters in config['data']['attributes']['chapters'],
            # but I am yet to find a video having those

        self._sort_formats(formats)

        metadata = config['data']['attributes']['metadata']

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # Prefer the subtitle as title; the main title then becomes alt_title
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
            'is_live': config['data']['attributes'].get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [
                {'url': image['url'], 'id': image.get('caption')}
                for image in metadata.get('images') or [] if url_or_none(image.get('url'))
            ],
        }
class ArteTVEmbedIE(InfoExtractor):
    """Extractor for the embeddable ARTE player page.

    The player URL carries the real programme address in its `json_url`
    query parameter; extraction is delegated to ArteTVIE for that address.
    """

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    # NOTE(review): the 'ext' row was not visible in the reviewed listing - confirm.
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available'
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the player URLs found in iframe/script `src` attributes of `webpage`."""
        # Group 1 is the opening quote (re-matched by \1); only the URL group is kept.
        return [url for _, url in re.findall(
            r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1',
            webpage)]

    def _real_extract(self, url):
        """Hand the programme address found in `json_url` over to ArteTVIE."""
        # Imported locally so that this method does not depend on a module-level helper.
        import urllib.parse

        # parse_qs percent-decodes the values, so an encoded json_url is usable as is
        qs = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
        json_url = qs['json_url'][0]
        video_id = ArteTVIE._match_id(json_url)
        return self.url_result(
            json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
class ArteTVPlaylistIE(ArteTVBaseIE):
    """Extractor for ARTE collections (`RC-` ids), resolved through the playlist API."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    # NOTE(review): the 'id' row of the info_dict was not visible in the reviewed
    # listing and is restored from the URL - confirm.
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries defer to ArteTVIE via `url_transparent`."""
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        api_url = f'{self._API_BASE}/playlist/{lang}/{playlist_id}'
        playlist = self._download_json(api_url, playlist_id)['data']['attributes']

        # Keep only the items that carry a config URL to hand over to ArteTVIE
        playable_items = traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))

        entries = []
        for video in playable_items:
            thumbnail = url_or_none(traverse_obj(video, ('mainImage', 'url')))
            duration = int_or_none(traverse_obj(video, ('duration', 'seconds')))
            entries.append({
                '_type': 'url_transparent',
                'url': video['config']['url'],
                'ie_key': ArteTVIE.ie_key(),
                'id': video.get('providerId'),
                'title': video.get('title'),
                'alt_title': video.get('subtitle'),
                'thumbnail': thumbnail,
                'duration': duration,
            })

        playlist_title = traverse_obj(playlist, ('metadata', 'title'))
        playlist_description = traverse_obj(playlist, ('metadata', 'description'))
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
class ArteTVCategoryIE(ArteTVBaseIE):
    """Extractor for ARTE category pages.

    The page is scraped for links to programmes and collections; each link
    that ArteTVIE or ArteTVPlaylistIE accepts becomes a playlist entry.
    """

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept `url` only if neither ArteTVIE nor ArteTVPlaylistIE claims it."""
        return (
            not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
            and super().suitable(url))

    def _real_extract(self, url):
        """Return a playlist of the programme/collection links found on the page."""
        lang, playlist_id = self._match_valid_url(url).groups()
        webpage = self._download_webpage(url, playlist_id)

        items = []
        for video in re.finditer(
                r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
                webpage):
            video = video.group('url')
            # A link back to the category page itself is not an entry
            if video == url:
                continue
            if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
                items.append(video)

        # The <title> fallback needs the page and a field name to search at all;
        # without them the call raised TypeError whenever og:title was absent.
        title = (self._og_search_title(webpage, default=None)
                 or self._html_search_regex(
                     r'<title\b[^>]*>([^<]+)</title>', webpage, 'title', default=None))
        # Both lookups may come back empty, so guard the split before falling back
        title = strip_or_none((title or '').rsplit('|', 1)[0]) or self._generic_title(url)

        return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
                                          description=self._og_search_description(webpage, default=None))