yt_dlp/extractor/mainstreaming.py

   1 # coding: utf-8
   2 import re
   3
   4 from .common import InfoExtractor
   5
   6 from ..utils import (
   7     int_or_none,
   8     js_to_json,
   9     parse_duration,
  10     traverse_obj,
  11     try_get,
  12     urljoin
  13 )
  14
  15
  16 class MainStreamingIE(InfoExtractor):
  17     _VALID_URL = r'https?://(?:webtools-?)?(?P<host>[A-Za-z0-9-]*\.msvdn.net)/(?:embed|amp_embed|content)/(?P<id>\w+)'
  18     IE_DESC = 'MainStreaming Player'
  19
  20     _TESTS = [
  21         {
  22             # Live stream offline, has alternative content id
  23             'url': 'https://webtools-e18da6642b684f8aa9ae449862783a56.msvdn.net/embed/53EN6GxbWaJC',
  24             'info_dict': {
  25                 'id': '53EN6GxbWaJC',
  26                 'title': 'Diretta homepage 2021-12-31 12:00',
  27                 'description': '',
  28                 'live_status': 'was_live',
  29                 'ext': 'mp4',
  30                 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
  31             },
  32             'expected_warnings': [
  33                 'Ignoring alternative content ID: WDAF1KOWUpH3',
  34                 'MainStreaming said: Live event is OFFLINE'
  35             ],
  36             'skip': 'live stream offline'
  37         }, {
  38             # playlist
  39             'url': 'https://webtools-e18da6642b684f8aa9ae449862783a56.msvdn.net/embed/WDAF1KOWUpH3',
  40             'info_dict': {
  41                 'id': 'WDAF1KOWUpH3',
  42                 'title': 'Playlist homepage',
  43             },
  44             'playlist_mincount': 2
  45         }, {
  46             # livestream
  47             'url': 'https://webtools-859c1818ed614cc5b0047439470927b0.msvdn.net/embed/tDoFkZD3T1Lw',
  48             'info_dict': {
  49                 'id': 'tDoFkZD3T1Lw',
  50                 'title': r're:Class CNBC Live \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
  51                 'live_status': 'is_live',
  52                 'ext': 'mp4',
  53                 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
  54             },
  55             'skip': 'live stream'
  56         }, {
  57             'url': 'https://webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/EUlZfGWkGpOd?autoPlay=false',
  58             'info_dict': {
  59                 'id': 'EUlZfGWkGpOd',
  60                 'title': 'La Settimana ',
  61                 'description': '03 Ottobre ore 02:00',
  62                 'ext': 'mp4',
  63                 'live_status': 'not_live',
  64                 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
  65                 'duration': 1512
  66             }
  67         }, {
  68             # video without webtools- prefix
  69             'url': 'https://f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/MfuWmzL2lGkA?autoplay=false&T=1635860445',
  70             'info_dict': {
  71                 'id': 'MfuWmzL2lGkA',
  72                 'title': 'TG Mattina',
  73                 'description': '06 Ottobre ore 08:00',
  74                 'ext': 'mp4',
  75                 'live_status': 'not_live',
  76                 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
  77                 'duration': 789.04
  78             }
  79         }, {
  80             # always-on livestream with DVR
  81             'url': 'https://webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/HVvPMzy',
  82             'info_dict': {
  83                 'id': 'HVvPMzy',
  84                 'title': r're:^Diretta LaC News24 \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
  85                 'description': 'canale all news',
  86                 'live_status': 'is_live',
  87                 'ext': 'mp4',
  88                 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
  89             },
  90             'params': {
  91                 'skip_download': True,
  92             },
  93         }, {
  94             # no host
  95             'url': 'https://webtools.msvdn.net/embed/MfuWmzL2lGkA',
  96             'only_matching': True
  97         }, {
  98             'url': 'https://859c1818ed614cc5b0047439470927b0.msvdn.net/amp_embed/tDoFkZD3T1Lw',
  99             'only_matching': True
 100         }, {
 101             'url': 'https://859c1818ed614cc5b0047439470927b0.msvdn.net/content/tDoFkZD3T1Lw#',
 102             'only_matching': True
 103         }
 104     ]
 105
 106     @staticmethod
 107     def _extract_urls(webpage):
 108         mobj = re.findall(
 109             r'<iframe[^>]+?src=["\']?(?P<url>%s)["\']?' % MainStreamingIE._VALID_URL, webpage)
 110         if mobj:
 111             return [group[0] for group in mobj]
 112
 113     def _playlist_entries(self, host, playlist_content):
 114         for entry in playlist_content:
 115             content_id = entry.get('contentID')
 116             yield {
 117                 '_type': 'url',
 118                 'ie_key': MainStreamingIE.ie_key(),
 119                 'id': content_id,
 120                 'duration': int_or_none(traverse_obj(entry, ('duration', 'totalSeconds'))),
 121                 'title': entry.get('title'),
 122                 'url': f'https://{host}/embed/{content_id}'
 123             }
 124
 125     @staticmethod
 126     def _get_webtools_host(host):
 127         if not host.startswith('webtools'):
 128             host = 'webtools' + ('-' if not host.startswith('.') else '') + host
 129         return host
 130
 131     def _get_webtools_base_url(self, host):
 132         return f'{self.http_scheme()}//{self._get_webtools_host(host)}'
 133
 134     def _call_api(self, host: str, path: str, item_id: str, query=None, note='Downloading API JSON', fatal=False):
 135         # JSON API, does not appear to be documented
 136         return self._call_webtools_api(host, '/api/v2/' + path, item_id, query, note, fatal)
 137
 138     def _call_webtools_api(self, host: str, path: str, item_id: str, query=None, note='Downloading webtools API JSON', fatal=False):
 139         # webtools docs: https://webtools.msvdn.net/
 140         return self._download_json(
 141             urljoin(self._get_webtools_base_url(host), path), item_id, query=query, note=note, fatal=fatal)
 142
 143     def _real_extract(self, url):
 144         host, video_id = self._match_valid_url(url).groups()
 145         content_info = try_get(
 146             self._call_api(
 147                 host, f'content/{video_id}', video_id, note='Downloading content info API JSON'), lambda x: x['playerContentInfo'])
 148         # Fallback
 149         if not content_info:
 150             webpage = self._download_webpage(url, video_id)
 151             player_config = self._parse_json(
 152                 self._search_regex(
 153                     r'config\s*=\s*({.+?})\s*;', webpage, 'mainstreaming player config',
 154                     default='{}', flags=re.DOTALL),
 155                 video_id, transform_source=js_to_json, fatal=False) or {}
 156             content_info = player_config['contentInfo']
 157
 158         host = content_info.get('host') or host
 159         video_id = content_info.get('contentID') or video_id
 160         title = content_info.get('title')
 161         description = traverse_obj(content_info, 'longDescription', 'shortDescription', expected_type=str)
 162         live_status = 'not_live'
 163         if content_info.get('drmEnabled'):
 164             self.report_drm(video_id)
 165
 166         alternative_content_id = content_info.get('alternativeContentID')
 167         if alternative_content_id:
 168             self.report_warning(f'Ignoring alternative content ID: {alternative_content_id}')
 169
 170         content_type = int_or_none(content_info.get('contentType'))
 171         format_base_url = None
 172         formats = []
 173         subtitles = {}
 174         # Live content
 175         if content_type == 20:
 176             dvr_enabled = traverse_obj(content_info, ('playerSettings', 'dvrEnabled'), expected_type=bool)
 177             format_base_url = f"https://{host}/live/{content_info['liveSourceID']}/{video_id}/%s{'?DVR' if dvr_enabled else ''}"
 178             live_status = 'is_live'
 179             heartbeat = self._call_api(host, f'heartbeat/{video_id}', video_id, note='Checking stream status') or {}
 180             if heartbeat.get('heartBeatUp') is False:
 181                 self.raise_no_formats(f'MainStreaming said: {heartbeat.get("responseMessage")}', expected=True)
 182                 live_status = 'was_live'
 183
 184         # Playlist
 185         elif content_type == 31:
 186             return self.playlist_result(
 187                 self._playlist_entries(host, content_info.get('playlistContents')), video_id, title, description)
 188         # Normal video content?
 189         elif content_type == 10:
 190             format_base_url = f'https://{host}/vod/{video_id}/%s'
 191             # Progressive format
 192             # Note: in https://webtools.msvdn.net/loader/playerV2.js there is mention of original.mp3 format,
 193             # however it seems to be the same as original.mp4?
 194             formats.append({'url': format_base_url % 'original.mp4', 'format_note': 'original', 'quality': 1})
 195         else:
 196             self.raise_no_formats(f'Unknown content type {content_type}')
 197
 198         if format_base_url:
 199             m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
 200                 format_base_url % 'playlist.m3u8', video_id=video_id, fatal=False)
 201             mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
 202                 format_base_url % 'manifest.mpd', video_id=video_id, fatal=False)
 203
 204             subtitles = self._merge_subtitles(m3u8_subs, mpd_subs)
 205             formats.extend(m3u8_formats + mpd_formats)
 206
 207         self._sort_formats(formats)
 208
 209         return {
 210             'id': video_id,
 211             'title': title,
 212             'description': description,
 213             'formats': formats,
 214             'live_status': live_status,
 215             'duration': parse_duration(content_info.get('duration')),
 216             'tags': content_info.get('tags'),
 217             'subtitles': subtitles,
 218             'thumbnail': urljoin(self._get_webtools_base_url(host), f'image/{video_id}/poster')
 219         }