]>
Commit | Line | Data |
---|---|---|
1 | import re | |
2 | ||
3 | from .common import InfoExtractor | |
4 | from ..networking import HEADRequest | |
5 | from ..utils import ( | |
6 | ExtractorError, | |
7 | base_url, | |
8 | clean_html, | |
9 | extract_attributes, | |
10 | get_element_html_by_class, | |
11 | get_element_html_by_id, | |
12 | int_or_none, | |
13 | js_to_json, | |
14 | mimetype2ext, | |
15 | sanitize_url, | |
16 | traverse_obj, | |
17 | try_call, | |
18 | url_basename, | |
19 | urljoin, | |
20 | ) | |
21 | ||
22 | ||
class RCSBaseIE(InfoExtractor):
    """Common extraction logic shared by the RCS extractors in this file.

    Subclasses provide ``_VALID_URL`` (with ``cdn`` and ``id`` groups) and
    their tests; ``_real_extract`` below does the actual work.
    """

    # based on VideoPlayerLoader.prototype.getVideoSrc
    # and VideoPlayerLoader.prototype.transformSrc from
    # https://js2.corriereobjects.it/includes2013/LIBS/js/corriere_video.sjs
    _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
    _RCS_ID_RE = r'[\w-]+-\d{10}'
    # Maps the host part (without the trailing ".net") of legacy stream URLs
    # to the path segment used below https://vod.rcsobjects.it/hls/
    _MIGRATION_MAP = {
        'videoamica-vh.akamaihd': 'amica',
        'media2-amica-it.akamaized': 'amica',
        'corrierevam-vh.akamaihd': 'corriere',
        'media2vam-corriere-it.akamaized': 'corriere',
        'cormezzogiorno-vh.akamaihd': 'corrieredelmezzogiorno',
        'media2vam-mezzogiorno-corriere-it.akamaized': 'corrieredelmezzogiorno',
        'corveneto-vh.akamaihd': 'corrieredelveneto',
        'media2vam-veneto-corriere-it.akamaized': 'corrieredelveneto',
        'corbologna-vh.akamaihd': 'corrieredibologna',
        'media2vam-bologna-corriere-it.akamaized': 'corrieredibologna',
        'corfiorentino-vh.akamaihd': 'corrierefiorentino',
        'media2vam-fiorentino-corriere-it.akamaized': 'corrierefiorentino',
        'corinnovazione-vh.akamaihd': 'corriereinnovazione',
        'media2-gazzanet-gazzetta-it.akamaized': 'gazzanet',
        'videogazzanet-vh.akamaihd': 'gazzanet',
        'videogazzaworld-vh.akamaihd': 'gazzaworld',
        'gazzettavam-vh.akamaihd': 'gazzetta',
        'media2vam-gazzetta-it.akamaized': 'gazzetta',
        'videoiodonna-vh.akamaihd': 'iodonna',
        'media2-leitv-it.akamaized': 'leitv',
        'videoleitv-vh.akamaihd': 'leitv',
        'videoliving-vh.akamaihd': 'living',
        'media2-living-corriere-it.akamaized': 'living',
        'media2-oggi-it.akamaized': 'oggi',
        'videooggi-vh.akamaihd': 'oggi',
        'media2-quimamme-it.akamaized': 'quimamme',
        'quimamme-vh.akamaihd': 'quimamme',
        'videorunning-vh.akamaihd': 'running',
        'media2-style-corriere-it.akamaized': 'style',
        'style-vh.akamaihd': 'style',
        'videostyle-vh.akamaihd': 'style',
        'media2-stylepiccoli-it.akamaized': 'stylepiccoli',
        'stylepiccoli-vh.akamaihd': 'stylepiccoli',
        'doveviaggi-vh.akamaihd': 'viaggi',
        'media2-doveviaggi-it.akamaized': 'viaggi',
        'media2-vivimilano-corriere-it.akamaized': 'vivimilano',
        'vivimilano-vh.akamaihd': 'vivimilano',
        'media2-youreporter-it.akamaized': 'youreporter'
    }

    def _get_video_src(self, video):
        """Yield one source descriptor per usable media file of *video*.

        Each yielded dict has the keys ``type`` (extension derived from the
        entry's MIME type), ``url`` (the rewritten media URL) and ``bitrate``
        (taken verbatim from the entry, ``None`` when absent).  Entries
        lacking a ``mimeType`` or a ``value`` are skipped.
        """
        for source in traverse_obj(video, (
                'mediaProfile', 'mediaFile', lambda _, v: v.get('mimeType'))):
            url = source.get('value')
            if not url:
                # nothing to download for this entry; do not crash on it
                continue
            for s, r in (
                ('media2vam.corriere.it.edgesuite.net', 'media2vam-corriere-it.akamaized.net'),
                ('media.youreporter.it.edgesuite.net', 'media-youreporter-it.akamaized.net'),
                ('corrierepmd.corriere.it.edgesuite.net', 'corrierepmd-corriere-it.akamaized.net'),
                ('media2vam-corriere-it.akamaized.net/fcs.quotidiani/vr/videos/', 'video.corriere.it/vr360/videos/'),
                ('http://', 'https://'),
            ):
                url = url.replace(s, r)

            type_ = mimetype2ext(source['mimeType'])
            if type_ == 'm3u8' and '-vh.akamaihd' in url:
                # still needed for some old content: see _TESTS #3
                matches = re.search(r'(?:https?:)?//(?P<host>[\w\.\-]+)\.net/i(?P<path>.+)$', url)
                # An unknown host is left untouched instead of raising KeyError
                migrated = matches and self._MIGRATION_MAP.get(matches.group('host'))
                if migrated:
                    url = f'https://vod.rcsobjects.it/hls/{migrated}{matches.group("path")}'
            if traverse_obj(video, ('mediaProfile', 'geoblocking')) or (
                    type_ == 'm3u8' and 'fcs.quotidiani_!' in url):
                url = url.replace('vod.rcsobjects', 'vod-it.rcsobjects')
            if type_ == 'm3u8' and 'vod' in url:
                url = url.replace('.csmil', '.urlset')
            if type_ == 'mp3':
                url = url.replace('media2vam-corriere-it.akamaized.net', 'vod.rcsobjects.it/corriere')

            yield {
                'type': type_,
                'url': url,
                'bitrate': source.get('bitrate')
            }

    def _create_http_formats(self, m3u8_formats, video_id):
        """Yield direct-download variants of the given HLS formats.

        For every non audio-only format whose URL has the shape
        ``<origin>/hls/<path>.mp4<suffix>``, the ``/hls`` segment and the
        suffix are dropped and the resulting URL is probed with a HEAD
        request.  Formats whose probe fails are skipped; the others are
        yielded as copies with ``protocol`` set to ``https`` and
        ``filesize_approx`` taken from the ``Content-Length`` header.
        """
        for f in m3u8_formats:
            # .get(): a format without codec information must not abort the loop
            if f.get('vcodec') == 'none':
                continue
            http_url = re.sub(r'(https?://[^/]+)/hls/([^?#]+?\.mp4).+', r'\g<1>/\g<2>', f['url'])
            if http_url == f['url']:
                # URL does not follow the expected pattern, nothing to derive
                continue

            http_f = f.copy()
            http_f.pop('manifest_url', None)
            format_id = try_call(lambda: http_f['format_id'].replace('hls-', 'https-'))
            urlh = self._request_webpage(HEADRequest(http_url), video_id, fatal=False,
                                         note=f'Check filesize for {format_id}')
            if not urlh:
                continue

            http_f.update({
                'format_id': format_id,
                'url': http_url,
                'protocol': 'https',
                'filesize_approx': int_or_none(urlh.headers.get('Content-Length')),
            })
            yield http_f

    def _create_formats(self, sources, video_id):
        """Yield format dicts for the descriptors produced by ``_get_video_src``.

        ``m3u8`` sources are expanded into their HLS formats followed by the
        derived HTTP formats; ``mp3`` sources become a single audio-only
        format.  Sources of any other type are ignored.
        """
        for source in sources:
            if source['type'] == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    source['url'], video_id, 'mp4', m3u8_id='hls', fatal=False)
                yield from m3u8_formats
                yield from self._create_http_formats(m3u8_formats, video_id)
            elif source['type'] == 'mp3':
                yield {
                    'format_id': 'https-mp3',
                    'ext': 'mp3',
                    'acodec': 'mp3',
                    'vcodec': 'none',
                    'abr': source.get('bitrate'),
                    'url': source['url'],
                }

    def _real_extract(self, url):
        """Locate the video data for *url* and return the info dict.

        Returns a ``url_transparent`` result pointing at ``RCSEmbedsIE`` when
        the page carries no video data of its own but embeds a player.

        Raises:
            ExtractorError: if no video data can be found.
        """
        cdn, video_id = self._match_valid_url(url).group('cdn', 'id')
        display_id, video_data = None, None

        if re.match(self._UUID_RE, video_id) or re.match(self._RCS_ID_RE, video_id):
            # the id from the URL can be used directly
            url = f'https://video.{cdn}/video-json/{video_id}'
        else:
            webpage = self._download_webpage(url, video_id)
            data_config = get_element_html_by_id('divVideoPlayer', webpage) or get_element_html_by_class('divVideoPlayer', webpage)

            if data_config:
                # Only parse when the attribute is present; a player element
                # without data-config is treated like an empty config
                config_attr = extract_attributes(data_config).get('data-config')
                data_config = (config_attr and self._parse_json(
                    config_attr, video_id, fatal=False)) or {}
                if data_config.get('newspaper'):
                    cdn = f'{data_config["newspaper"]}.it'
                display_id, video_id = video_id, data_config.get('uuid') or video_id
                url = f'https://video.{cdn}/video-json/{video_id}'
            else:
                json_url = self._search_regex(
                    r'''(?x)url\s*=\s*(["'])
                    (?P<url>
                        (?:https?:)?//video\.rcs\.it
                        /fragment-includes/video-includes/[^"']+?\.json
                    )\1;''',
                    webpage, video_id, group='url', default=None)
                if json_url:
                    video_data = self._download_json(sanitize_url(json_url, scheme='https'), video_id)
                    display_id, video_id = video_id, video_data.get('id') or video_id

        if not video_data:
            webpage = self._download_webpage(url, video_id)

            video_data = self._search_json(
                '##start-video##', webpage, 'video data', video_id, default=None,
                end_pattern='##end-video##', transform_source=js_to_json)

            if not video_data:
                # try search for iframes
                emb = RCSEmbedsIE._extract_url(webpage)
                if emb:
                    return {
                        '_type': 'url_transparent',
                        'url': emb,
                        'ie_key': RCSEmbedsIE.ie_key()
                    }

        if not video_data:
            raise ExtractorError('Video data not found in the page')

        return {
            'id': video_id,
            'display_id': display_id,
            'title': video_data.get('title'),
            'description': (clean_html(video_data.get('description'))
                            or clean_html(video_data.get('htmlDescription'))
                            or self._html_search_meta('description', webpage)),
            'uploader': video_data.get('provider') or cdn,
            'formats': list(self._create_formats(self._get_video_src(video_data), video_id)),
        }
204 | ||
205 | ||
class RCSEmbedsIE(RCSBaseIE):
    """Extractor for ``/video-embed/<id>`` player URLs.

    Matches the ``video.`` subdomain of rcs.it, corriere.it (optionally with a
    ``corriere*`` sub-site) and gazzetta.it (optionally ``gazzanet``), and
    detects such players embedded in third-party pages via ``_EMBED_REGEX``.
    """

    _VALID_URL = r'''(?x)
                    https?://(?P<vid>video)\.
                    (?P<cdn>
                    (?:
                        rcs|
                        (?:corriere\w+\.)?corriere|
                        (?:gazzanet\.)?gazzetta
                    )\.it)
                    /video-embed/(?P<id>[^/=&\?]+?)(?:$|\?)'''
    _EMBED_REGEX = [r'''(?x)
            (?:
                data-frame-src=|
                <iframe[^\n]+src=
            )
            (["'])
                (?P<url>(?:https?:)?//video\.
                    (?:
                        rcs|
                        (?:corriere\w+\.)?corriere|
                        (?:gazzanet\.)?gazzetta
                    )
                \.it/video-embed/.+?)
            \1''']
    _TESTS = [
        {
            'url': 'https://video.rcs.it/video-embed/iodonna-0001585037',
            'md5': '0faca97df525032bb9847f690bc3720c',
            'info_dict': {
                'id': 'iodonna-0001585037',
                'ext': 'mp4',
                'title': 'Sky Arte racconta Madonna nella serie "Artist to icon"',
                'description': 'md5:65b09633df9ffee57f48b39e34c9e067',
                'uploader': 'rcs.it',
            },
        },
        {
            'url': 'https://video.gazzanet.gazzetta.it/video-embed/gazzanet-mo05-0000260789',
            'only_matching': True,
        },
        {
            'url': 'https://video.gazzetta.it/video-embed/49612410-00ca-11eb-bcd8-30d4253e0140',
            'only_matching': True,
        },
    ]
    _WEBPAGE_TESTS = [
        {
            'url': 'https://www.iodonna.it/video-iodonna/personaggi-video/monica-bellucci-piu-del-lavoro-oggi-per-me-sono-importanti-lamicizia-e-la-famiglia/',
            'info_dict': {
                'id': 'iodonna-0002033648',
                'ext': 'mp4',
                'title': 'Monica Bellucci: «Più del lavoro, oggi per me sono importanti l\'amicizia e la famiglia»',
                'description': 'md5:daea6d9837351e56b1ab615c06bebac1',
                'uploader': 'rcs.it',
            },
        },
    ]

    @staticmethod
    def _sanitize_url(url):
        """Return *url* forced to https and rebuilt from its base and basename."""
        https_url = sanitize_url(url, scheme='https')
        parent, leaf = base_url(https_url), url_basename(https_url)
        # presumably this drops any query string/fragment — confirm against
        # the base_url/url_basename helpers
        return urljoin(parent, leaf)

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Return an iterator over the sanitized embed URLs found in *webpage*."""
        found = super()._extract_embed_urls(url, webpage)
        return map(cls._sanitize_url, found)
266 | ||
267 | ||
class RCSIE(RCSBaseIE):
    """Extractor for regular video pages on corriere.it and gazzetta.it.

    Matches the ``video.`` and ``viaggi.`` subdomains of corriere.it
    (including the regional editions listed in the pattern) and of
    gazzetta.it (optionally ``gazzanet``).  ``/video-embed/`` paths are
    excluded by the pattern.
    """

    # The last path component is captured as the id; it may be followed by a
    # query string or a single trailing slash
    _VALID_URL = r'''(?x)https?://(?P<vid>video|viaggi)\.
                    (?P<cdn>
                    (?:
                        corrieredelmezzogiorno\.
                        |corrieredelveneto\.
                        |corrieredibologna\.
                        |corrierefiorentino\.
                    )?corriere\.it
                    |(?:gazzanet\.)?gazzetta\.it)
                    /(?!video-embed/)[^?#]+?/(?P<id>[^/\?]+)(?=\?|/$|$)'''
    _TESTS = [{
        # json iframe directly from id
        'url': 'https://video.corriere.it/sport/formula-1/vettel-guida-ferrari-sf90-mugello-suo-fianco-c-elecrerc-bendato-video-esilarante/b727632a-f9d0-11ea-91b0-38d50a849abb',
        'md5': '14946840dec46ecfddf66ba4eea7d2b2',
        'info_dict': {
            'id': 'b727632a-f9d0-11ea-91b0-38d50a849abb',
            'ext': 'mp4',
            'title': 'Vettel guida la Ferrari SF90 al Mugello e al suo fianco c\'è Leclerc (bendato): il video è esilarante',
            'description': 'md5:3915ce5ebb3d2571deb69a5eb85ac9b5',
            'uploader': 'Corriere Tv',
        }
    }, {
        # search for video id inside the page
        'url': 'https://viaggi.corriere.it/video/norvegia-il-nuovo-ponte-spettacolare-sopra-la-cascata-di-voringsfossen/',
        'md5': 'f22a92d9e666e80f2fffbf2825359c81',
        'info_dict': {
            'id': '5b7cd134-e2c1-11ea-89b3-b56dd0df2aa2',
            'display_id': 'norvegia-il-nuovo-ponte-spettacolare-sopra-la-cascata-di-voringsfossen',
            'ext': 'mp4',
            'title': 'La nuova spettacolare attrazione in Norvegia: il ponte sopra Vøringsfossen',
            'description': 'md5:18b35a291f6746c0c8dacd16e5f5f4f8',
            'uploader': 'DOVE Viaggi',
        }
    }, {
        # only audio format https://github.com/yt-dlp/yt-dlp/issues/5683
        'url': 'https://video.corriere.it/cronaca/audio-telefonata-il-papa-becciu-santita-lettera-che-mi-ha-inviato-condanna/b94c0d20-70c2-11ed-9572-e4b947a0ebd2',
        'md5': 'aaffb08d02f2ce4292a4654694c78150',
        'info_dict': {
            'id': 'b94c0d20-70c2-11ed-9572-e4b947a0ebd2',
            'ext': 'mp3',
            'title': 'L\'audio della telefonata tra il Papa e Becciu: «Santità, la lettera che mi ha inviato è una condanna»',
            'description': 'md5:c0ddb61bd94a8d4e0d4bb9cda50a689b',
            'uploader': 'Corriere Tv',
            'formats': [{'format_id': 'https-mp3', 'ext': 'mp3'}],
        }
    }, {
        # old content still needs cdn migration
        'url': 'https://viaggi.corriere.it/video/milano-varallo-sesia-sul-treno-a-vapore/',
        'md5': '2dfdce7af249654ad27eeba03fe1e08d',
        'info_dict': {
            'id': 'd8f6c8d0-f7d7-11e8-bfca-f74cf4634191',
            'display_id': 'milano-varallo-sesia-sul-treno-a-vapore',
            'ext': 'mp4',
            'title': 'Milano-Varallo Sesia sul treno a vapore',
            'description': 'md5:6348f47aac230397fe341a74f7678d53',
            'uploader': 'DOVE Viaggi',
        }
    }, {
        'url': 'https://video.corriere.it/video-360/metro-copenaghen-tutta-italiana/a248a7f0-e2db-11e9-9830-af2de6b1f945',
        'only_matching': True
    }]
330 | ||
331 | ||
class RCSVariousIE(RCSBaseIE):
    """Extractor for pages on www.leitv.it, www.youreporter.it and www.amica.it."""

    # At most one leading path segment is allowed before the id; the id ends
    # at the end of the URL, at a query string or at the next slash
    _VALID_URL = r'''(?x)https?://www\.
                    (?P<cdn>
                        leitv\.it|
                        youreporter\.it|
                        amica\.it
                    )/(?:[^/]+/)?(?P<id>[^/]+?)(?:$|\?|/)'''
    _TESTS = [{
        'url': 'https://www.leitv.it/benessere/mal-di-testa/',
        'md5': '3b7a683d105a7313ec7513b014443631',
        'info_dict': {
            'id': 'leitv-0000125151',
            'display_id': 'mal-di-testa',
            'ext': 'mp4',
            'title': 'Cervicalgia e mal di testa, il video con i suggerimenti dell\'esperto',
            'description': 'md5:ae21418f34cee0b8d02a487f55bcabb5',
            'uploader': 'leitv.it',
        }
    }, {
        'url': 'https://www.youreporter.it/fiume-sesia-3-ottobre-2020/',
        'md5': '3989b6d603482611a2abd2f32b79f739',
        'info_dict': {
            'id': 'youreporter-0000332574',
            'display_id': 'fiume-sesia-3-ottobre-2020',
            'ext': 'mp4',
            'title': 'Fiume Sesia 3 ottobre 2020',
            'description': 'md5:0070eef1cc884d13c970a4125063de55',
            'uploader': 'youreporter.it',
        }
    }, {
        'url': 'https://www.amica.it/video-post/saint-omer-al-cinema-il-film-leone-dargento-che-ribalta-gli-stereotipi/',
        'md5': '187cce524dfd0343c95646c047375fc4',
        'info_dict': {
            'id': 'amica-0001225365',
            'display_id': 'saint-omer-al-cinema-il-film-leone-dargento-che-ribalta-gli-stereotipi',
            'ext': 'mp4',
            'title': '"Saint Omer": al cinema il film Leone d\'argento che ribalta gli stereotipi',
            'description': 'md5:b1c8869c2dcfd6073a2a311ba0008aa8',
            'uploader': 'rcs.it',
        }
    }]