]>
Commit | Line | Data |
---|---|---|
1 | import re | |
2 | ||
3 | from .common import InfoExtractor | |
4 | from ..networking import HEADRequest | |
5 | from ..utils import ( | |
6 | clean_html, | |
7 | determine_ext, | |
8 | ExtractorError, | |
9 | filter_dict, | |
10 | GeoRestrictedError, | |
11 | int_or_none, | |
12 | join_nonempty, | |
13 | parse_duration, | |
14 | remove_start, | |
15 | strip_or_none, | |
16 | traverse_obj, | |
17 | try_get, | |
18 | unified_strdate, | |
19 | unified_timestamp, | |
20 | update_url_query, | |
21 | urljoin, | |
22 | xpath_text, | |
23 | ) | |
24 | ||
25 | ||
class RaiBaseIE(InfoExtractor):
    # Helpers shared by the Rai extractors in this file: relinker resolution,
    # HLS codec fix-ups, direct MP4 URL generation, thumbnails and subtitles.
    _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
    _GEO_COUNTRIES = ['IT']
    _GEO_BYPASS = False

    def _fix_m3u8_formats(self, media_url, video_id):
        """Extract the HLS formats of media_url and normalise their codec fields.

        Returns the list of format dicts produced by the m3u8 extraction
        (non-fatal, so it may be empty).
        """
        fmts = self._extract_m3u8_formats(
            media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)

        # Fix malformed m3u8 manifests by setting audio-only/video-only formats
        for f in fmts:
            if not f.get('acodec'):
                f['acodec'] = 'mp4a'
            if not f.get('vcodec'):
                f['vcodec'] = 'avc1'
            man_url = f['url']
            if re.search(r'chunklist(?:_b\d+)*_ao[_.]', man_url):  # audio only
                f['vcodec'] = 'none'
            elif re.search(r'chunklist(?:_b\d+)*_vo[_.]', man_url):  # video only
                f['acodec'] = 'none'
            else:  # video+audio
                if f['acodec'] == 'none':
                    f['acodec'] = 'mp4a'
                if f['vcodec'] == 'none':
                    f['vcodec'] = 'avc1'

        return fmts

    def _extract_relinker_info(self, relinker_url, video_id, audio_only=False):
        """Resolve a relinker URL into a partial info dict.

        Non-http(s) URLs are returned as a single direct format. Otherwise the
        relinker XML is downloaded and turned into formats according to the
        extension of the media URL it points to.

        Returns a dict with 'formats' and, when known, 'is_live' and
        'duration' (None values are dropped by filter_dict).
        Raises ExtractorError when the media type is not recognised.
        """
        def fix_cdata(s):
            # remove \r\n\t before and after <![CDATA[ ]]> to avoid
            # polluted text with xpath_text
            s = re.sub(r'(\]\]>)[\r\n\t]+(</)', '\\1\\2', s)
            return re.sub(r'(>)[\r\n\t]+(<!\[CDATA\[)', '\\1\\2', s)

        if not re.match(r'https?://', relinker_url):
            return {'formats': [{'url': relinker_url}]}

        # set User-Agent to generic 'Rai' to avoid quality filtering from
        # the media server and get the maximum qualities available
        relinker = self._download_xml(
            relinker_url, video_id, note='Downloading XML metadata',
            transform_source=fix_cdata, query={'output': 64},
            headers={**self.geo_verification_headers(), 'User-Agent': 'Rai'})

        if xpath_text(relinker, './license_url', default='{}') != '{}':
            self.report_drm(video_id)

        is_live = xpath_text(relinker, './is_live', default='N') == 'Y'
        duration = parse_duration(xpath_text(relinker, './duration', default=None))
        media_url = xpath_text(relinker, './url[@type="content"]', default=None)

        if not media_url:
            self.raise_no_formats('The relinker returned no media url')

        # geo flag is a bit unreliable and not properly set all the time
        geoprotection = xpath_text(relinker, './geoprotection', default='N') == 'Y'

        ext = determine_ext(media_url)
        formats = []

        if ext == 'mp3':
            formats.append({
                'url': media_url,
                'vcodec': 'none',
                'acodec': 'mp3',
                'format_id': 'https-mp3',
            })
        elif ext == 'm3u8' or 'format=m3u8' in media_url:
            formats.extend(self._fix_m3u8_formats(media_url, video_id))
        elif ext == 'f4m':
            # very likely no longer needed. Cannot find any url that uses it.
            manifest_url = update_url_query(
                media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'),
                {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'})
            formats.extend(self._extract_f4m_formats(
                manifest_url, video_id, f4m_id='hds', fatal=False))
        elif ext == 'mp4':
            bitrate = int_or_none(xpath_text(relinker, './bitrate'))
            formats.append({
                'url': media_url,
                # bitrate is None when <bitrate> is missing or not numeric;
                # guard the comparison so it cannot raise TypeError
                'tbr': bitrate if bitrate and bitrate > 0 else None,
                'format_id': join_nonempty('https', bitrate, delim='-'),
            })
        else:
            raise ExtractorError('Unrecognized media file found')

        if (not formats and geoprotection is True) or '/video_no_available.mp4' in media_url:
            self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)

        if not audio_only and not is_live:
            formats.extend(self._create_http_urls(media_url, relinker_url, formats, video_id))

        return filter_dict({
            'is_live': is_live,
            'duration': duration,
            'formats': formats,
        })

    def _create_http_urls(self, manifest_url, relinker_url, fmts, video_id):
        """Build direct HTTPS MP4 formats, one per quality found in manifest_url.

        Returns an empty list when the MP4 availability probe fails or when
        manifest_url does not match the expected playlist pattern.
        """
        _MANIFEST_REG = r'/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4)?(?:\.csmil)?/playlist\.m3u8'
        _MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s'
        _QUALITY = {
            # tbr: w, h
            250: [352, 198],
            400: [512, 288],
            600: [512, 288],
            700: [512, 288],
            800: [700, 394],
            1200: [736, 414],
            1500: [920, 518],
            1800: [1024, 576],
            2400: [1280, 720],
            3200: [1440, 810],
            3600: [1440, 810],
            5000: [1920, 1080],
            10000: [1920, 1080],
        }

        def percentage(number, target, pc=20, roof=125):
            """check if the target is in the range of number +/- percent"""
            if not number or number < 0:
                return False
            return abs(target - number) < min(float(number) * float(pc) / 100.0, roof)

        def get_format_info(tbr):
            import math
            br = int_or_none(tbr)
            if len(fmts) == 1 and not br:
                br = fmts[0].get('tbr')
            if br and br > 300:
                tbr = math.floor(br / 100) * 100
            else:
                tbr = 250

            # try extracting info from available m3u8 formats
            format_copy = [None, None]
            for f in fmts:
                if f.get('tbr'):
                    if percentage(tbr, f['tbr']):
                        format_copy[0] = f.copy()
                if [f.get('width'), f.get('height')] == _QUALITY.get(tbr):
                    format_copy[1] = f.copy()
                    format_copy[1]['tbr'] = tbr

            # prefer format with similar bitrate because there might be
            # multiple video with the same resolution but different bitrate
            format_copy = format_copy[0] or format_copy[1] or {}
            if format_copy:
                return {
                    'format_id': f'https-{tbr}',
                    'width': format_copy.get('width'),
                    'height': format_copy.get('height'),
                    'tbr': format_copy.get('tbr') or tbr,
                    'vcodec': format_copy.get('vcodec') or 'avc1',
                    'acodec': format_copy.get('acodec') or 'mp4a',
                    'fps': format_copy.get('fps') or 25,
                }

            # No comparable HLS format: fall back to the static table. A
            # bitrate that is not listed there gets unknown dimensions
            # instead of raising KeyError.
            width, height = _QUALITY.get(tbr) or (None, None)
            return {
                'format_id': f'https-{tbr}',
                'width': width,
                'height': height,
                'tbr': tbr,
                'vcodec': 'avc1',
                'acodec': 'mp4a',
                'fps': 25,
            }

        # Check if MP4 download is available
        try:
            self._request_webpage(
                HEADRequest(_MP4_TMPL % (relinker_url, '*')), video_id, 'Checking MP4 availability')
        except ExtractorError as e:
            self.to_screen(f'{video_id}: MP4 direct download is not available: {e.cause}')
            return []

        # filter out single-stream formats
        fmts = [f for f in fmts
                if f.get('vcodec') != 'none' and f.get('acodec') != 'none']

        mobj = re.search(_MANIFEST_REG, manifest_url)
        if not mobj:
            return []
        available_qualities = mobj.group('quality').split(',') if mobj.group('quality') else ['*']

        formats = []
        for q in filter(None, available_qualities):
            self.write_debug(f'Creating https format for quality {q}')
            formats.append({
                'url': _MP4_TMPL % (relinker_url, q),
                'protocol': 'https',
                'ext': 'mp4',
                **get_format_info(q),
            })
        return formats

    @staticmethod
    def _get_thumbnails_list(thumbs, url):
        """Return thumbnail dicts for every non-empty value of the thumbs mapping,
        resolved against url."""
        return [{
            'url': urljoin(url, thumb_url),
        } for thumb_url in (thumbs or {}).values() if thumb_url]

    @staticmethod
    def _extract_subtitles(url, video_data):
        """Collect subtitles from video_data, keyed by language ('it' by default).

        For every .stl subtitle an additional entry with the same URL but an
        .srt extension is added.
        """
        STL_EXT = 'stl'
        SRT_EXT = 'srt'
        subtitles = {}
        # Work on a copy: appending to the list stored in video_data would
        # modify the caller's metadata.
        subtitles_array = list(
            video_data.get('subtitlesArray') or video_data.get('subtitleList') or [])
        for k in ('subtitles', 'subtitlesUrl'):
            subtitles_array.append({'url': video_data.get(k)})
        for subtitle in subtitles_array:
            sub_url = subtitle.get('url')
            if sub_url and isinstance(sub_url, str):
                sub_lang = subtitle.get('language') or 'it'
                sub_url = urljoin(url, sub_url)
                sub_ext = determine_ext(sub_url, SRT_EXT)
                subtitles.setdefault(sub_lang, []).append({
                    'ext': sub_ext,
                    'url': sub_url,
                })
                if STL_EXT == sub_ext:
                    subtitles[sub_lang].append({
                        'ext': SRT_EXT,
                        'url': sub_url[:-len(STL_EXT)] + SRT_EXT,
                    })
        return subtitles
250 | ||
251 | ||
class RaiPlayIE(RaiBaseIE):
    _VALID_URL = rf'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>{RaiBaseIE._UUID_RE}))\.(?:html|json)'
    _TESTS = [{
        'url': 'https://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
        'md5': '8970abf8caf8aef4696e7b1f2adfc696',
        'info_dict': {
            'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',
            'ext': 'mp4',
            'title': 'Report del 07/04/2014',
            'alt_title': 'St 2013/14 - Report - Espresso nel caffè - 07/04/2014',
            'description': 'md5:d730c168a58f4bb35600fc2f881ec04e',
            'thumbnail': r're:^https?://www\.raiplay\.it/.+\.jpg',
            'uploader': 'Rai 3',
            'creator': 'Rai 3',
            'duration': 6160,
            'series': 'Report',
            'season': '2013/14',
            'subtitles': {'it': 'count:4'},
            'release_year': 2024,
            'episode': 'Espresso nel caffè - 07/04/2014',
            'timestamp': 1396919880,
            'upload_date': '20140408',
            'formats': 'count:4',
        },
        'params': {'skip_download': True},
    }, {
        # 1080p
        'url': 'https://www.raiplay.it/video/2021/11/Blanca-S1E1-Senza-occhi-b1255a4a-8e72-4a2f-b9f3-fc1308e00736.html',
        'md5': 'aeda7243115380b2dd5e881fd42d949a',
        'info_dict': {
            'id': 'b1255a4a-8e72-4a2f-b9f3-fc1308e00736',
            'ext': 'mp4',
            'title': 'Blanca - S1E1 - Senza occhi',
            'alt_title': 'St 1 Ep 1 - Blanca - Senza occhi',
            'description': 'md5:75f95d5c030ec8bac263b1212322e28c',
            'thumbnail': r're:^https://www\.raiplay\.it/dl/img/.+\.jpg',
            'uploader': 'Rai Premium',
            'creator': 'Rai Fiction',
            'duration': 6493,
            'series': 'Blanca',
            'season': 'Season 1',
            'episode_number': 1,
            'release_year': 2021,
            'season_number': 1,
            'episode': 'Senza occhi',
            'timestamp': 1637318940,
            'upload_date': '20211119',
            'formats': 'count:7',
        },
        'params': {'skip_download': True},
        'expected_warnings': ['Video not available. Likely due to geo-restriction.'],
    }, {
        # 1500 quality
        'url': 'https://www.raiplay.it/video/2012/09/S1E11---Tutto-cio-che-luccica-0cab3323-732e-45d6-8e86-7704acab6598.html',
        'md5': 'a634d20e8ab2d43724c273563f6bf87a',
        'info_dict': {
            'id': '0cab3323-732e-45d6-8e86-7704acab6598',
            'ext': 'mp4',
            'title': 'Mia and Me - S1E11 - Tutto ciò che luccica',
            'alt_title': 'St 1 Ep 11 - Mia and Me - Tutto ciò che luccica',
            'description': 'md5:4969e594184b1920c4c1f2b704da9dea',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'Rai Gulp',
            'series': 'Mia and Me',
            'season': 'Season 1',
            'episode_number': 11,
            'release_year': 2015,
            'season_number': 1,
            'episode': 'Tutto ciò che luccica',
            'timestamp': 1348495020,
            'upload_date': '20120924',
        },
    }, {
        'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?',
        'only_matching': True,
    }, {
        # subtitles at 'subtitlesArray' key (see #27698)
        'url': 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html',
        'only_matching': True,
    }, {
        # DRM protected
        'url': 'https://www.raiplay.it/video/2021/06/Lo-straordinario-mondo-di-Zoey-S2E1-Lo-straordinario-ritorno-di-Zoey-3ba992de-2332-41ad-9214-73e32ab209f4.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download the JSON twin of the page and build the info dict from it."""
        mobj = self._match_valid_url(url)
        base, video_id = mobj.group('base'), mobj.group('id')

        media = self._download_json(f'{base}.json', video_id, 'Downloading video JSON')

        # The DRM flag is looked up both under 'program_info' and at the top level
        if (not self.get_param('allow_unplayable_formats')
                and traverse_obj(media, (('program_info', None), 'rights_management', 'rights', 'drm'))):
            self.report_drm(video_id)

        video = media['video']
        relinker_info = self._extract_relinker_info(video['content_url'], video_id)

        published = join_nonempty(
            media.get('date_published'), media.get('time_published'), delim=' ')
        alt_title = join_nonempty(media.get('subtitle'), media.get('toptitle'), delim=' - ')
        channel = traverse_obj(media, ('program_info', 'channel')) or media.get('channel')
        editor = traverse_obj(media, ('program_info', 'editor')) or media.get('editor')

        # A purely numeric season goes to 'season_number'; anything else is
        # kept as the textual 'season'
        season = media.get('season')
        season_name = None
        if season and not season.isdigit():
            season_name = season

        return {
            'id': remove_start(media.get('id'), 'ContentItem-') or video_id,
            'display_id': video_id,
            'title': media.get('name'),
            'alt_title': strip_or_none(alt_title or None),
            'description': media.get('description'),
            'uploader': strip_or_none(channel or None),
            'creator': strip_or_none(editor or None),
            'duration': parse_duration(video.get('duration')),
            'timestamp': unified_timestamp(published),
            'thumbnails': self._get_thumbnails_list(media.get('images'), url),
            'series': traverse_obj(media, ('program_info', 'name')),
            'season_number': int_or_none(season),
            'season': season_name,
            'episode': media.get('episode_title'),
            'episode_number': int_or_none(media.get('episode')),
            'subtitles': self._extract_subtitles(url, video),
            'release_year': int_or_none(traverse_obj(media, ('track_info', 'edit_year'))),
            **relinker_info,
        }
378 | ||
379 | ||
class RaiPlayLiveIE(RaiPlayIE):  # XXX: Do not subclass from concrete IE
    # Live channel pages ("dirette"); extraction is inherited from RaiPlayIE,
    # only the URL pattern differs.
    _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+))'
    _TESTS = [
        {
            'url': 'http://www.raiplay.it/dirette/rainews24',
            'info_dict': {
                'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c',
                'display_id': 'rainews24',
                'ext': 'mp4',
                'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
                'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497',
                'uploader': 'Rai News 24',
                'creator': 'Rai News 24',
                'is_live': True,
                'live_status': 'is_live',
                'upload_date': '20090502',
                'timestamp': 1241276220,
                'formats': 'count:3',
            },
            'params': {'skip_download': True},
        },
    ]
400 | ||
401 | ||
class RaiPlayPlaylistIE(InfoExtractor):
    _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))(?:/(?P<extra_id>[^?#&]+))?'
    _TESTS = [{
        # entire series episodes + extras...
        'url': 'https://www.raiplay.it/programmi/nondirloalmiocapo/',
        'info_dict': {
            'id': 'nondirloalmiocapo',
            'title': 'Non dirlo al mio capo',
            'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b',
        },
        'playlist_mincount': 30,
    }, {
        # single season
        'url': 'https://www.raiplay.it/programmi/nondirloalmiocapo/episodi/stagione-2/',
        'info_dict': {
            'id': 'nondirloalmiocapo',
            'title': 'Non dirlo al mio capo - Stagione 2',
            'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b',
        },
        'playlist_count': 12,
    }]

    def _real_extract(self, url):
        """Build a playlist from the content sets of a programme page.

        When the URL carries an extra path, only the set whose
        "<block name>/<set name>" matches it is included.
        """
        base, playlist_id, extra_id = self._match_valid_url(url).group('base', 'id', 'extra_id')

        program = self._download_json(
            f'{base}.json', playlist_id, 'Downloading program JSON')

        # Normalised form of the extra path, compared against set names below
        wanted_set = extra_id.upper().rstrip('/') if extra_id else extra_id

        playlist_title = program.get('name')
        entries = []
        for block in program.get('blocks') or []:
            for content_set in block.get('sets') or []:
                if wanted_set:
                    set_key = join_nonempty(
                        block.get('name'), content_set.get('name'), delim='/')
                    if set_key.replace(' ', '-').upper() != wanted_set:
                        continue
                    playlist_title = join_nonempty(
                        playlist_title, content_set.get('name'), delim=' - ')

                set_id = content_set.get('id')
                if not set_id:
                    continue
                medias = self._download_json(
                    f'{base}/{set_id}.json', set_id,
                    'Downloading content set JSON', fatal=False)
                if not medias:
                    continue

                for item in medias.get('items') or []:
                    path_id = item.get('path_id')
                    if path_id:
                        video_url = urljoin(url, path_id)
                        entries.append(self.url_result(
                            video_url, ie=RaiPlayIE.ie_key(),
                            video_id=RaiPlayIE._match_id(video_url)))

        description = try_get(program, lambda x: x['program_info']['description'])
        return self.playlist_result(entries, playlist_id, playlist_title, description)
463 | ||
464 | ||
class RaiPlaySoundIE(RaiBaseIE):
    _VALID_URL = rf'(?P<base>https?://(?:www\.)?raiplaysound\.it/.+?-(?P<id>{RaiBaseIE._UUID_RE}))\.(?:html|json)'
    _TESTS = [{
        'url': 'https://www.raiplaysound.it/audio/2021/12/IL-RUGGITO-DEL-CONIGLIO-1ebae2a7-7cdb-42bb-842e-fe0d193e9707.html',
        'md5': '8970abf8caf8aef4696e7b1f2adfc696',
        'info_dict': {
            'id': '1ebae2a7-7cdb-42bb-842e-fe0d193e9707',
            'ext': 'mp3',
            'title': 'Il Ruggito del Coniglio del 10/12/2021',
            'alt_title': 'md5:0e6476cd57858bb0f3fcc835d305b455',
            'description': 'md5:2a17d2107e59a4a8faa0e18334139ee2',
            'thumbnail': r're:^https?://.+\.jpg$',
            'uploader': 'rai radio 2',
            'duration': 5685,
            'series': 'Il Ruggito del Coniglio',
            'episode': 'Il Ruggito del Coniglio del 10/12/2021',
            'creator': 'rai radio 2',
            'timestamp': 1638346620,
            'upload_date': '20211201',
        },
        'params': {'skip_download': True},
    }]

    def _real_extract(self, url):
        """Download the JSON twin of the page and resolve every audio relinker in it."""
        mobj = self._match_valid_url(url)
        base, audio_id = mobj.group('base'), mobj.group('id')
        media = self._download_json(f'{base}.json', audio_id, 'Downloading audio JSON')

        def strip_uid_prefixes(data):
            return remove_start(remove_start(data['uniquename'], 'ContentItem-'), 'Page-')

        uid = try_get(media, strip_uid_prefixes)

        # De-duplicated relinker URLs from the downloadable, on-demand and
        # live sections of the JSON
        relinker_urls = set(traverse_obj(
            media, (('downloadable_audio', 'audio', ('live', 'cards', 0, 'audio')), 'url')))

        info, formats = {}, []
        for relinker_url in relinker_urls:
            # 'info' keeps the metadata of the last relinker processed
            info = self._extract_relinker_info(relinker_url, audio_id, True)
            formats.extend(info.get('formats'))

        date_getters = (
            lambda x: f'{x["create_date"]} {x.get("create_time") or ""}',
            lambda x: x['live']['create_date'],
        )
        date_published = try_get(media, date_getters)

        podcast_info = traverse_obj(media, 'podcast_info', ('live', 'cards', 0)) or {}

        result = dict(info)
        result.update({
            'id': uid or audio_id,
            'display_id': audio_id,
            'title': traverse_obj(media, 'title', 'episode_title'),
            'alt_title': traverse_obj(media, ('track_info', 'media_name'), expected_type=strip_or_none),
            'description': media.get('description'),
            'uploader': traverse_obj(media, ('track_info', 'channel'), expected_type=strip_or_none),
            'creator': traverse_obj(media, ('track_info', 'editor'), expected_type=strip_or_none),
            'timestamp': unified_timestamp(date_published),
            'thumbnails': self._get_thumbnails_list(podcast_info.get('images'), url),
            'series': podcast_info.get('title'),
            'season_number': int_or_none(media.get('season')),
            'episode': media.get('episode_title'),
            'episode_number': int_or_none(media.get('episode')),
            'formats': formats,
        })
        return result
522 | ||
523 | ||
class RaiPlaySoundLiveIE(RaiPlaySoundIE):  # XXX: Do not subclass from concrete IE
    # Live radio channel pages; extraction is inherited from RaiPlaySoundIE,
    # only the URL pattern differs.
    _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/(?P<id>[^/?#&]+)$)'
    _TESTS = [
        {
            'url': 'https://www.raiplaysound.it/radio2',
            'info_dict': {
                'id': 'b00a50e6-f404-4af6-8f8c-ff3b9af73a44',
                'display_id': 'radio2',
                'ext': 'mp4',
                'title': r're:Rai Radio 2 \d+-\d+-\d+ \d+:\d+',
                'thumbnail': r're:^https://www\.raiplaysound\.it/dl/img/.+\.png',
                'uploader': 'rai radio 2',
                'series': 'Rai Radio 2',
                'creator': 'raiplaysound',
                'is_live': True,
                'live_status': 'is_live',
            },
            'params': {'skip_download': True},
        },
    ]
542 | ||
543 | ||
class RaiPlaySoundPlaylistIE(InfoExtractor):
    _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/(?:programmi|playlist|audiolibri)/(?P<id>[^/?#&]+))(?:/(?P<extra_id>[^?#&]+))?'
    _TESTS = [{
        # entire show
        'url': 'https://www.raiplaysound.it/programmi/ilruggitodelconiglio',
        'info_dict': {
            'id': 'ilruggitodelconiglio',
            'title': 'Il Ruggito del Coniglio',
            'description': 'md5:62a627b3a2d0635d08fa8b6e0a04f27e',
        },
        'playlist_mincount': 65,
    }, {
        # single season
        'url': 'https://www.raiplaysound.it/programmi/ilruggitodelconiglio/puntate/prima-stagione-1995',
        'info_dict': {
            'id': 'ilruggitodelconiglio_puntate_prima-stagione-1995',
            'title': 'Prima Stagione 1995',
        },
        'playlist_count': 1,
    }]

    def _real_extract(self, url):
        """Build a playlist from the cards of a programme JSON.

        When the URL carries an extra path, the programme filter whose
        'weblink' contains it is followed and its JSON is used instead.

        Raises ExtractorError when no filter matches the extra path.
        """
        base, playlist_id, extra_id = self._match_valid_url(url).group('base', 'id', 'extra_id')
        url = f'{base}.json'
        program = self._download_json(url, playlist_id, 'Downloading program JSON')

        if extra_id:
            extra_id = extra_id.rstrip('/')
            playlist_id += '_' + extra_id.replace('/', '_')
            # Tolerate filters without 'weblink'/'path_id' and report a
            # missing match explicitly instead of leaking StopIteration
            path = next((
                c['path_id'] for c in program.get('filters') or []
                if c.get('path_id') and extra_id in (c.get('weblink') or '')), None)
            if not path:
                raise ExtractorError(f'Unable to find a filter matching "{extra_id}"')
            program = self._download_json(
                urljoin('https://www.raiplaysound.it', path), playlist_id, 'Downloading program secondary JSON')

        entries = [
            self.url_result(urljoin(base, c['path_id']), ie=RaiPlaySoundIE.ie_key())
            for c in traverse_obj(program, 'cards', ('block', 'cards')) or []
            if c.get('path_id')]

        return self.playlist_result(entries, playlist_id, program.get('title'),
                                    traverse_obj(program, ('podcast_info', 'description')))
584 | ||
585 | ||
class RaiIE(RaiBaseIE):
    _VALID_URL = rf'https?://[^/]+\.(?:rai\.(?:it|tv))/.+?-(?P<id>{RaiBaseIE._UUID_RE})(?:-.+?)?\.html'
    _TESTS = [{
        'url': 'https://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html',
        'info_dict': {
            'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9',
            'ext': 'mp4',
            'title': 'TG PRIMO TEMPO',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 1758,
            'upload_date': '20140612',
        },
        'params': {'skip_download': True},
        'expected_warnings': ['Video not available. Likely due to geo-restriction.'],
    }, {
        'url': 'https://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html',
        'info_dict': {
            'id': 'efb17665-691c-45d5-a60c-5301333cbb0c',
            'ext': 'mp4',
            'title': 'TG1 ore 20:00 del 03/11/2016',
            'description': 'TG1 edizione integrale ore 20:00 del giorno 03/11/2016',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 2214,
            'upload_date': '20161103',
        },
        'params': {'skip_download': True},
    }, {
        # Direct MMS: Media URL no longer works.
        'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract a ContentItem through its '?json' endpoint.

        Returns None when the JSON cannot be downloaded; raises
        ExtractorError when the item is neither audio nor video.
        """
        content_id = self._match_id(url)
        json_url = f'https://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-{content_id}.html?json'
        media = self._download_json(
            json_url, content_id, 'Downloading video JSON', fatal=False, expected_status=404)

        if media is None:
            return None

        media_type = media['type']
        if 'Audio' in media_type:
            # Audio items carry a direct URL; no relinker round-trip needed
            audio_format = media.get('formatoAudio')
            relinker_info = {
                'formats': [{
                    'format_id': join_nonempty('https', audio_format, delim='-'),
                    'url': media['audioUrl'],
                    'ext': audio_format,
                    'vcodec': 'none',
                    'acodec': audio_format,
                }],
            }
        elif 'Video' in media_type:
            relinker_info = self._extract_relinker_info(media['mediaUri'], content_id)
        else:
            raise ExtractorError('not a media file')

        image_keys = ('image', 'image_medium', 'image_300')
        thumbnails = self._get_thumbnails_list(
            {key: media.get(key) for key in image_keys}, url)

        return {
            'id': content_id,
            'title': strip_or_none(media.get('name') or media.get('title')),
            'description': strip_or_none(media.get('desc')) or None,
            'thumbnails': thumbnails,
            'uploader': strip_or_none(media.get('author')) or None,
            'upload_date': unified_strdate(media.get('date')),
            'duration': parse_duration(media.get('length')),
            'subtitles': self._extract_subtitles(url, media),
            **relinker_info,
        }
657 | ||
658 | ||
class RaiNewsIE(RaiBaseIE):
    _VALID_URL = rf'https?://(www\.)?rainews\.it/(?!articoli)[^?#]+-(?P<id>{RaiBaseIE._UUID_RE})(?:-[^/?#]+)?\.html'
    _EMBED_REGEX = [rf'<iframe[^>]+data-src="(?P<url>/iframe/[^?#]+?{RaiBaseIE._UUID_RE}\.html)']
    _TESTS = [{
        # new rainews player (#3911)
        'url': 'https://www.rainews.it/video/2024/02/membri-della-croce-rossa-evacuano-gli-abitanti-di-un-villaggio-nella-regione-ucraina-di-kharkiv-il-filmato-dallucraina--31e8017c-845c-43f5-9c48-245b43c3a079.html',
        'info_dict': {
            'id': '31e8017c-845c-43f5-9c48-245b43c3a079',
            'ext': 'mp4',
            'title': 'md5:1e81364b09de4a149042bac3c7d36f0b',
            'duration': 196,
            'upload_date': '20240225',
            'uploader': 'rainews',
            'formats': 'count:2',
        },
        'params': {'skip_download': True},
    }, {
        # old content with fallback method to extract media urls
        'url': 'https://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html',
        'info_dict': {
            'id': '1632c009-c843-4836-bb65-80c33084a64b',
            'ext': 'mp4',
            'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"',
            'description': 'I film in uscita questa settimana.',
            'thumbnail': r're:^https?://.*\.png$',
            'duration': 833,
            'upload_date': '20161103',
            'formats': 'count:8',
        },
        'params': {'skip_download': True},
        'expected_warnings': ['unable to extract player_data'],
    }, {
        # iframe + drm
        'url': 'https://www.rainews.it/iframe/video/2022/07/euro2022-europei-calcio-femminile-italia-belgio-gol-0-1-video-4de06a69-de75-4e32-a657-02f0885f8118.html',
        'only_matching': True,
    }]
    # Interpolated into the player element name: <rai{_PLAYER_TAG}-player ...>
    _PLAYER_TAG = 'news'

    def _real_extract(self, url):
        """Extract the video from the player JSON embedded in the page.

        Falls back to RaiIE's extraction when the page exposes no relinker
        URL. Raises ExtractorError when that fallback fails as well
        (GeoRestrictedError is re-raised unchanged).
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        player_data = self._search_json(
            rf'<rai{self._PLAYER_TAG}-player\s*data=\'', webpage, 'player_data', video_id,
            transform_source=clean_html, default={})
        # 'track_info' may be absent from the player data; default to an
        # empty dict so the lookups below cannot hit None
        track_info = player_data.get('track_info') or {}
        relinker_url = traverse_obj(player_data, 'mediapolis', 'content_url')

        if not relinker_url:
            # fallback on old implementation for some old content
            try:
                return RaiIE._real_extract(self, url)
            except GeoRestrictedError:
                raise
            except ExtractorError as e:
                raise ExtractorError('Relinker URL not found', cause=e)

        relinker_info = self._extract_relinker_info(urljoin(url, relinker_url), video_id)

        return {
            'id': video_id,
            'title': player_data.get('title') or track_info.get('title') or self._og_search_title(webpage),
            'upload_date': unified_strdate(track_info.get('date')),
            'uploader': strip_or_none(track_info.get('editor') or None),
            **relinker_info,
        }
726 | ||
727 | ||
class RaiCulturaIE(RaiNewsIE):  # XXX: Do not subclass from concrete IE
    # Same extraction as RaiNewsIE; only the URL pattern and player tag differ.
    _VALID_URL = rf'https?://(www\.)?raicultura\.it/(?!articoli)[^?#]+-(?P<id>{RaiBaseIE._UUID_RE})(?:-[^/?#]+)?\.html'
    _EMBED_REGEX = [rf'<iframe[^>]+data-src="(?P<url>/iframe/[^?#]+?{RaiBaseIE._UUID_RE}\.html)']
    _TESTS = [
        {
            'url': 'https://www.raicultura.it/letteratura/articoli/2018/12/Alberto-Asor-Rosa-Letteratura-e-potere-05ba8775-82b5-45c5-a89d-dd955fbde1fb.html',
            'info_dict': {
                'id': '05ba8775-82b5-45c5-a89d-dd955fbde1fb',
                'ext': 'mp4',
                'title': 'Alberto Asor Rosa: Letteratura e potere',
                'duration': 1756,
                'upload_date': '20181206',
                'uploader': 'raicultura',
                'formats': 'count:2',
            },
            'params': {'skip_download': True},
        },
    ]
    _PLAYER_TAG = 'cultura'
745 | ||
746 | ||
class RaiSudtirolIE(RaiBaseIE):
    _VALID_URL = r'https?://raisudtirol\.rai\.it/.+media=(?P<id>\w+)'
    _TESTS = [{
        # mp4 file
        'url': 'https://raisudtirol.rai.it/la/index.php?media=Ptv1619729460',
        'info_dict': {
            'id': 'Ptv1619729460',
            'ext': 'mp4',
            'title': 'Euro: trasmisciun d\'economia - 29-04-2021 20:51',
            'series': 'Euro: trasmisciun d\'economia',
            'upload_date': '20210429',
            'thumbnail': r're:https://raisudtirol\.rai\.it/img/.+\.jpg',
            'uploader': 'raisudtirol',
            'formats': 'count:1',
        },
        'params': {'skip_download': True},
    }, {
        # m3u manifest
        'url': 'https://raisudtirol.rai.it/it/kidsplayer.php?lang=it&media=GUGGUG_P1.smil',
        'info_dict': {
            'id': 'GUGGUG_P1',
            'ext': 'mp4',
            'title': 'GUGGUG! La Prospettiva - Die Perspektive',
            'uploader': 'raisudtirol',
            'formats': 'count:6',
        },
        'params': {'skip_download': True},
    }]

    def _real_extract(self, url):
        """Scrape title, date, thumbnail and media URL from the page markup."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        video_date = self._html_search_regex(
            r'<span class="med_data">(.+?)</span>', webpage, 'video_date', default=None)

        title_patterns = [r'<span class="med_title">(.+?)</span>', r'title: \'(.+?)\',']
        video_title = self._html_search_regex(
            title_patterns, webpage, 'video_title', default=None)

        media_patterns = [
            r'sources:\s*\[\{file:\s*"(.+?)"\}\]',
            r'<source\s+src="(.+?)"\s+type="application/x-mpegURL"',
        ]
        video_url = self._html_search_regex(
            media_patterns, webpage, 'video_url', default=None)

        formats = []
        ext = determine_ext(video_url)
        if ext == 'm3u8':
            formats = self._extract_m3u8_formats(video_url, video_id)
        elif ext == 'mp4':
            # Plain MP4 files are reported with fixed stream properties
            formats = [{
                'format_id': 'https-mp4',
                'url': self._proto_relative_url(video_url),
                'width': 1024,
                'height': 576,
                'fps': 25,
                'vcodec': 'avc1',
                'acodec': 'mp4a',
            }]
        else:
            self.raise_no_formats(f'Unrecognized media file: {video_url}')

        thumbnail_path = self._html_search_regex(
            r'image: \'(.+?)\'', webpage, 'video_thumb', default=None)

        return {
            'id': video_id,
            'title': join_nonempty(video_title, video_date, delim=' - '),
            # The bare title is used as the series name only when a date is present
            'series': video_title if video_date else None,
            'upload_date': unified_strdate(video_date),
            'thumbnail': urljoin('https://raisudtirol.rai.it/', thumbnail_path),
            'uploader': 'raisudtirol',
            'formats': formats,
        }
814 | 'uploader': 'raisudtirol', | |
815 | 'formats': formats, | |
816 | } |