]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/mainstreaming.py
c144c75928b118ab1420c2b67aa97c7faa682dc3
[yt-dlp.git] / yt_dlp / extractor / mainstreaming.py
1 import re
2
3 from .common import InfoExtractor
4
5 from ..utils import (
6 int_or_none,
7 js_to_json,
8 parse_duration,
9 traverse_obj,
10 try_get,
11 urljoin
12 )
13
14
class MainStreamingIE(InfoExtractor):
    """Extractor for content served through the MainStreaming player (msvdn.net).

    Handles the three content types distinguished in `_real_extract`:
    regular videos (type 10), live streams (type 20) and playlists (type 31).
    """

    # The optional "webtools-"/"webtools" prefix is matched outside of the
    # `host` group, so `host` never includes it. Both dots of the domain are
    # escaped so that only the literal "msvdn.net" domain is accepted.
    _VALID_URL = r'https?://(?:webtools-?)?(?P<host>[A-Za-z0-9-]*\.msvdn\.net)/(?:embed|amp_embed|content)/(?P<id>\w+)'
    IE_DESC = 'MainStreaming Player'

    _TESTS = [
        {
            # Live stream offline, has alternative content id
            'url': 'https://webtools-e18da6642b684f8aa9ae449862783a56.msvdn.net/embed/53EN6GxbWaJC',
            'info_dict': {
                'id': '53EN6GxbWaJC',
                'title': 'Diretta homepage 2021-12-31 12:00',
                'description': '',
                'live_status': 'was_live',
                'ext': 'mp4',
                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
            },
            'expected_warnings': [
                'Ignoring alternative content ID: WDAF1KOWUpH3',
                'MainStreaming said: Live event is OFFLINE'
            ],
            'skip': 'live stream offline'
        }, {
            # playlist
            'url': 'https://webtools-e18da6642b684f8aa9ae449862783a56.msvdn.net/embed/WDAF1KOWUpH3',
            'info_dict': {
                'id': 'WDAF1KOWUpH3',
                'title': 'Playlist homepage',
            },
            'playlist_mincount': 2
        }, {
            # livestream
            'url': 'https://webtools-859c1818ed614cc5b0047439470927b0.msvdn.net/embed/tDoFkZD3T1Lw',
            'info_dict': {
                'id': 'tDoFkZD3T1Lw',
                'title': r're:Class CNBC Live \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
                'live_status': 'is_live',
                'ext': 'mp4',
                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
            },
            'skip': 'live stream'
        }, {
            'url': 'https://webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/EUlZfGWkGpOd?autoPlay=false',
            'info_dict': {
                'id': 'EUlZfGWkGpOd',
                'title': 'La Settimana ',
                'description': '03 Ottobre ore 02:00',
                'ext': 'mp4',
                'live_status': 'not_live',
                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
                'duration': 1512
            }
        }, {
            # video without webtools- prefix
            'url': 'https://f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/MfuWmzL2lGkA?autoplay=false&T=1635860445',
            'info_dict': {
                'id': 'MfuWmzL2lGkA',
                'title': 'TG Mattina',
                'description': '06 Ottobre ore 08:00',
                'ext': 'mp4',
                'live_status': 'not_live',
                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
                'duration': 789.04
            }
        }, {
            # always-on livestream with DVR
            'url': 'https://webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/HVvPMzy',
            'info_dict': {
                'id': 'HVvPMzy',
                'title': r're:^Diretta LaC News24 \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
                'description': 'canale all news',
                'live_status': 'is_live',
                'ext': 'mp4',
                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
            },
            'params': {
                'skip_download': True,
            },
        }, {
            # no host
            'url': 'https://webtools.msvdn.net/embed/MfuWmzL2lGkA',
            'only_matching': True
        }, {
            'url': 'https://859c1818ed614cc5b0047439470927b0.msvdn.net/amp_embed/tDoFkZD3T1Lw',
            'only_matching': True
        }, {
            'url': 'https://859c1818ed614cc5b0047439470927b0.msvdn.net/content/tDoFkZD3T1Lw#',
            'only_matching': True
        }
    ]

    @staticmethod
    def _extract_urls(webpage):
        """Return the URLs of all MainStreaming player iframes found in *webpage*.

        Always returns a list; it is empty when no matching iframe exists.
        """
        # _VALID_URL contains its own named groups, so select the outer `url`
        # group by name instead of relying on tuple positions.
        return [
            mobj.group('url') for mobj in re.finditer(
                r'<iframe[^>]+?src=["\']?(?P<url>%s)["\']?' % MainStreamingIE._VALID_URL, webpage)]

    def _playlist_entries(self, host, playlist_content):
        """Yield one `url`-type result per playlist item.

        *playlist_content* is an iterable of dicts (or None, treated as empty).
        Items without a `contentID` are skipped because no embed URL can be
        built for them.
        """
        for entry in playlist_content or []:
            content_id = entry.get('contentID')
            if not content_id:
                continue
            yield {
                '_type': 'url',
                'ie_key': MainStreamingIE.ie_key(),
                'id': content_id,
                'duration': int_or_none(traverse_obj(entry, ('duration', 'totalSeconds'))),
                'title': entry.get('title'),
                'url': f'https://{host}/embed/{content_id}'
            }

    @staticmethod
    def _get_webtools_host(host):
        """Return *host* with the `webtools` prefix added if it is missing.

        A host starting with '.' (the bare domain) gets `webtools` prepended
        directly; any other host is joined with a '-' separator.
        """
        if not host.startswith('webtools'):
            host = 'webtools' + ('-' if not host.startswith('.') else '') + host
        return host

    def _get_webtools_base_url(self, host):
        """Return the scheme-qualified base URL of the webtools host for *host*."""
        return f'{self.http_scheme()}//{self._get_webtools_host(host)}'

    def _call_api(self, host: str, path: str, item_id: str, query=None, note='Downloading API JSON', fatal=False):
        """Download JSON from the `/api/v2/` endpoint *path* on the webtools host."""
        # JSON API, does not appear to be documented
        return self._call_webtools_api(host, '/api/v2/' + path, item_id, query, note, fatal)

    def _call_webtools_api(self, host: str, path: str, item_id: str, query=None, note='Downloading webtools API JSON', fatal=False):
        """Download JSON from *path* on the webtools host derived from *host*."""
        # webtools docs: https://webtools.msvdn.net/
        return self._download_json(
            urljoin(self._get_webtools_base_url(host), path), item_id, query=query, note=note, fatal=fatal)

    def _real_extract(self, url):
        """Extract a video, live stream or playlist from a MainStreaming player URL."""
        host, video_id = self._match_valid_url(url).groups()
        content_info = try_get(
            self._call_api(
                host, f'content/{video_id}', video_id, note='Downloading content info API JSON'), lambda x: x['playerContentInfo'])
        # Fallback
        if not content_info:
            webpage = self._download_webpage(url, video_id)
            player_config = self._parse_json(
                self._search_regex(
                    r'config\s*=\s*({.+?})\s*;', webpage, 'mainstreaming player config',
                    default='{}', flags=re.DOTALL),
                video_id, transform_source=js_to_json, fatal=False) or {}
            # The config is deliberately defaulted to {} above, so do not index
            # it directly: with no content info the content type is None and
            # the "Unknown content type" branch below reports the failure
            # instead of a bare KeyError.
            content_info = player_config.get('contentInfo') or {}

        host = content_info.get('host') or host
        video_id = content_info.get('contentID') or video_id
        title = content_info.get('title')
        description = traverse_obj(content_info, 'longDescription', 'shortDescription', expected_type=str)
        live_status = 'not_live'
        if content_info.get('drmEnabled'):
            self.report_drm(video_id)

        alternative_content_id = content_info.get('alternativeContentID')
        if alternative_content_id:
            self.report_warning(f'Ignoring alternative content ID: {alternative_content_id}')

        content_type = int_or_none(content_info.get('contentType'))
        format_base_url = None
        formats = []
        subtitles = {}
        # Live content
        if content_type == 20:
            dvr_enabled = traverse_obj(content_info, ('playerSettings', 'dvrEnabled'), expected_type=bool)
            format_base_url = f"https://{host}/live/{content_info['liveSourceID']}/{video_id}/%s{'?DVR' if dvr_enabled else ''}"
            live_status = 'is_live'
            heartbeat = self._call_api(host, f'heartbeat/{video_id}', video_id, note='Checking stream status') or {}
            if heartbeat.get('heartBeatUp') is False:
                # NOTE(review): raise_no_formats presumably only warns in some
                # configurations (see 'expected_warnings' in _TESTS), in which
                # case extraction continues with the stream marked as ended.
                self.raise_no_formats(f'MainStreaming said: {heartbeat.get("responseMessage")}', expected=True)
                live_status = 'was_live'

        # Playlist
        elif content_type == 31:
            return self.playlist_result(
                self._playlist_entries(host, content_info.get('playlistContents')), video_id, title, description)
        # Normal video content?
        elif content_type == 10:
            format_base_url = f'https://{host}/vod/{video_id}/%s'
            # Progressive format
            # Note: in https://webtools.msvdn.net/loader/playerV2.js there is mention of original.mp3 format,
            # however it seems to be the same as original.mp4?
            formats.append({'url': format_base_url % 'original.mp4', 'format_note': 'original', 'quality': 1})
        else:
            self.raise_no_formats(f'Unknown content type {content_type}')

        if format_base_url:
            # Both manifests are optional (fatal=False); whatever is available
            # is merged into a single list of formats and subtitles.
            m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
                format_base_url % 'playlist.m3u8', video_id=video_id, fatal=False)
            mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
                format_base_url % 'manifest.mpd', video_id=video_id, fatal=False)

            subtitles = self._merge_subtitles(m3u8_subs, mpd_subs)
            formats.extend(m3u8_formats + mpd_formats)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'live_status': live_status,
            'duration': parse_duration(content_info.get('duration')),
            'tags': content_info.get('tags'),
            'subtitles': subtitles,
            'thumbnail': urljoin(self._get_webtools_base_url(host), f'image/{video_id}/poster')
        }