]>
Commit | Line | Data |
---|---|---|
1 | import re | |
2 | ||
3 | from .common import InfoExtractor | |
4 | from ..utils import ( | |
5 | ExtractorError, | |
6 | GeoRestrictedError, | |
7 | int_or_none, | |
8 | parse_iso8601, | |
9 | parse_qs, | |
10 | strip_or_none, | |
11 | traverse_obj, | |
12 | url_or_none, | |
13 | ) | |
14 | ||
15 | ||
class ArteTVBaseIE(InfoExtractor):
    # Constants shared by the ARTE extractors in this module.

    # Root of the ARTE player API; subclasses append "/config/..." or "/playlist/..."
    _API_BASE = 'https://api.arte.tv/api/player/v2'
    # Regex alternation of the language codes accepted in arte.tv URL paths
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
19 | ||
20 | ||
class ArteTVIE(ArteTVBaseIE):
    # Extractor for single ARTE programs, addressed either through the website
    # (arte.tv/<lang>/videos/<id>), the player API config URL, or arte://program/<id>.
    _VALID_URL = r'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                    |arte://program)
                        /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
        'info_dict': {
            'id': '100103-000-A',
            'title': 'USA: Dyskryminacja na porodówce',
            'description': 'md5:242017b7cce59ffae340a54baefcafb1',
            'alt_title': 'ARTE Reportage',
            'upload_date': '20201103',
            'duration': 554,
            'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
            'timestamp': 1604417980,
            'ext': 'mp4',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'info_dict': {
            'id': '110371-000-A',
            'ext': 'mp4',
            'upload_date': '20220718',
            'duration': 154,
            'timestamp': 1658162460,
            'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
            'title': 'La chaleur, supplice des arbres de rue',
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
        },
        'params': {'skip_download': 'm3u8'}
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }]

    _GEO_BYPASS = True

    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Parses the stream version code (the `eStat.ml5` field of a stream version)
    # into its voice-language, audio-description and subtitle components.
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': {
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        },
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': {
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        },
        'SAT': {
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        },
    }

    def _real_extract(self, url):
        """Download the player config for the program in *url* and build its info dict.

        Raises GeoRestrictedError when the config flags the video as geo-restricted,
        and an expected ExtractorError when the config carries no 'rights' entry.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        # NOTE(review): for arte://program/... URLs neither language group matches,
        # so `lang` is None here and ends up in the config URL -- confirm this is intended.
        lang = mobj.group('lang') or mobj.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id)

        geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            # Use .get() so that a restriction entry without a 'code' still surfaces
            # as a geo-restriction error instead of crashing with KeyError
            geo_code = geoblocking.get('code')
            raise GeoRestrictedError(
                f'Video restricted to {geo_code!r}',
                countries=self._COUNTRIES_MAP.get(geo_code, ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired', expected=True)

        formats, subtitles = [], {}
        for stream in config['data']['attributes']['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            stream_version = stream['versions'][0]
            stream_version_code = stream_version['eStat']['ml5']
            format_note = f'{stream_version.get("label", "unknown")} [{stream_version.get("shortLabel", "?")}]'

            # Each criterion below contributes one 0/1 digit; the digits are joined and
            # read by int() as a decimal number, so earlier criteria weigh more.
            # Version codes that the regex does not recognise keep the lowest preference.
            lang_pref = -1
            m = self._VERSION_CODE_RE.match(stream_version_code)
            if m:
                lang_pref = int(''.join('01'[x] for x in (
                    m.group('vlang') == language_code,      # we prefer voice in the requested language
                    not m.group('audio_desc'),              # and not the audio description version
                    bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
                    m.group('sub_lang') == language_code,   # if subtitles are present, we prefer them in the requested language
                    not m.group('has_sub'),                 # but we prefer no subtitles otherwise
                    not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
                )))

            if stream['protocol'].startswith('HLS'):
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False)
                for fmt in fmts:
                    fmt.update({
                        'format_note': format_note,
                        'language_preference': lang_pref,
                    })
                formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)

            elif stream['protocol'] in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': f'{stream["protocol"]}-{stream_version_code}',
                    'url': stream['url'],
                    'format_note': format_note,
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')

            # TODO: chapters from stream['segments']?
            # The JS also looks for chapters in config['data']['attributes']['chapters'],
            # but I am yet to find a video having those

        self._sort_formats(formats)

        metadata = config['data']['attributes']['metadata']

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # The API 'subtitle' (when present) is used as the title, and the API
            # 'title' then becomes alt_title; without a subtitle there is no alt_title
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
            'is_live': config['data']['attributes'].get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [
                {'url': image['url'], 'id': image.get('caption')}
                for image in metadata.get('images') or [] if url_or_none(image.get('url'))
            ],
        }
203 | ||
204 | ||
class ArteTVEmbedIE(InfoExtractor):
    # Resolves embedded ARTE player URLs to the API config URL they wrap.
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
        'skip': 'No video available'
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Hand the config URL found in the json_url query parameter over to ArteTVIE."""
        config_url = parse_qs(url)['json_url'][0]
        return self.url_result(
            config_url,
            ie=ArteTVIE.ie_key(),
            video_id=ArteTVIE._match_id(config_url))
229 | ||
230 | ||
class ArteTVPlaylistIE(ArteTVBaseIE):
    # Extractor for ARTE collections (RC-xxxxxx), listed through the player API.
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
    }]

    def _real_extract(self, url):
        """Fetch the playlist from the API and return one url_transparent entry per item."""
        mobj = self._match_valid_url(url)
        lang, playlist_id = mobj.group('lang'), mobj.group('id')

        response = self._download_json(
            f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)
        attributes = response['data']['attributes']

        entries = []
        # Only items that expose a config URL are kept; that URL is what ArteTVIE resolves
        for item in traverse_obj(attributes, ('items', lambda _, v: v['config']['url'])):
            entries.append({
                '_type': 'url_transparent',
                'url': item['config']['url'],
                'ie_key': ArteTVIE.ie_key(),
                'id': item.get('providerId'),
                'title': item.get('title'),
                'alt_title': item.get('subtitle'),
                'thumbnail': url_or_none(traverse_obj(item, ('mainImage', 'url'))),
                'duration': int_or_none(traverse_obj(item, ('duration', 'seconds'))),
            })

        playlist_title = traverse_obj(attributes, ('metadata', 'title'))
        playlist_description = traverse_obj(attributes, ('metadata', 'description'))
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
265 | ||
266 | ||
class ArteTVCategoryIE(ArteTVBaseIE):
    # Extractor for ARTE category pages: collects the program and collection
    # links found in the page HTML into a playlist.
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Match only URLs that neither ArteTVIE nor ArteTVPlaylistIE already handles."""
        return (
            not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
            and super().suitable(url))

    def _real_extract(self, url):
        """Scrape the category page and return a playlist of the videos/collections it links to."""
        lang, playlist_id = self._match_valid_url(url).groups()
        webpage = self._download_webpage(url, playlist_id)

        items = []
        for link in re.finditer(
                r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
                webpage):
            video_url = link.group('url')
            # Skip self-references to the category page itself
            if video_url == url:
                continue
            # Keep only links that one of the concrete ARTE extractors can handle
            if any(ie.suitable(video_url) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
                items.append(video_url)

        # _html_search_regex needs the text to search and a field name; without them
        # the <title> fallback raised TypeError whenever og:title was missing
        title = (self._og_search_title(webpage, default=None)
                 or self._html_search_regex(
                     r'<title\b[^>]*>([^<]+)</title>', webpage, 'title', default=None))
        # Drop the trailing "| <site name>" part; tolerate a missing title so that
        # the generic-title fallback is actually reachable
        title = strip_or_none((title or '').rsplit('|', 1)[0]) or self._generic_title(url)

        return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
                                          description=self._og_search_description(webpage, default=None))