yt_dlp/extractor/microsoftstream.py

   1 import base64
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     merge_dicts,
   6     parse_duration,
   7     parse_iso8601,
   8     parse_resolution,
   9     try_get,
  10     url_basename,
  11 )
  12
  13
  14 class MicrosoftStreamIE(InfoExtractor):
  15     IE_NAME = 'microsoftstream'
  16     IE_DESC = 'Microsoft Stream'
  17     _VALID_URL = r'https?://(?:web|www|msit)\.microsoftstream\.com/video/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
  18
  19     _TESTS = [{
  20         'url': 'https://web.microsoftstream.com/video/6e51d928-4f46-4f1c-b141-369925e37b62?list=user&userId=f5491e02-e8fe-4e34-b67c-ec2e79a6ecc0',
  21         'only_matching': True,
  22     }, {
  23         'url': 'https://msit.microsoftstream.com/video/b60f5987-aabd-4e1c-a42f-c559d138f2ca',
  24         'only_matching': True,
  25     }]
  26
  27     def _get_all_subtitles(self, api_url, video_id, headers):
  28         subtitles = {}
  29         automatic_captions = {}
  30         text_tracks = self._download_json(
  31             f'{api_url}/videos/{video_id}/texttracks', video_id,
  32             note='Downloading subtitles JSON', fatal=False, headers=headers,
  33             query={'api-version': '1.4-private'}).get('value') or []
  34         for track in text_tracks:
  35             if not track.get('language') or not track.get('url'):
  36                 continue
  37             sub_dict = automatic_captions if track.get('autoGenerated') else subtitles
  38             sub_dict.setdefault(track['language'], []).append({
  39                 'ext': 'vtt',
  40                 'url': track.get('url')
  41             })
  42         return {
  43             'subtitles': subtitles,
  44             'automatic_captions': automatic_captions
  45         }
  46
  47     def extract_all_subtitles(self, *args, **kwargs):
  48         if (self.get_param('writesubtitles', False)
  49                 or self.get_param('writeautomaticsub', False)
  50                 or self.get_param('listsubtitles')):
  51             return self._get_all_subtitles(*args, **kwargs)
  52         return {}
  53
  54     def _real_extract(self, url):
  55         video_id = self._match_id(url)
  56         webpage = self._download_webpage(url, video_id)
  57         if '<title>Microsoft Stream</title>' not in webpage:
  58             self.raise_login_required(method='cookies')
  59
  60         access_token = self._html_search_regex(r'"AccessToken":"(.+?)"', webpage, 'access token')
  61         api_url = self._html_search_regex(r'"ApiGatewayUri":"(.+?)"', webpage, 'api url')
  62
  63         headers = {'Authorization': f'Bearer {access_token}'}
  64
  65         video_data = self._download_json(
  66             f'{api_url}/videos/{video_id}', video_id,
  67             headers=headers, query={
  68                 '$expand': 'creator,tokens,status,liveEvent,extensions',
  69                 'api-version': '1.4-private'
  70             })
  71         video_id = video_data.get('id') or video_id
  72         language = video_data.get('language')
  73
  74         thumbnails = []
  75         for thumbnail_id in ('extraSmall', 'small', 'medium', 'large'):
  76             thumbnail_url = try_get(video_data, lambda x: x['posterImage'][thumbnail_id]['url'], str)
  77             if not thumbnail_url:
  78                 continue
  79             thumb = {
  80                 'id': thumbnail_id,
  81                 'url': thumbnail_url,
  82             }
  83             thumb_name = url_basename(thumbnail_url)
  84             thumb_name = str(base64.b64decode(thumb_name + '=' * (-len(thumb_name) % 4)))
  85             thumb.update(parse_resolution(thumb_name))
  86             thumbnails.append(thumb)
  87
  88         formats = []
  89         for playlist in video_data['playbackUrls']:
  90             if playlist['mimeType'] == 'application/vnd.apple.mpegurl':
  91                 formats.extend(self._extract_m3u8_formats(
  92                     playlist['playbackUrl'], video_id,
  93                     ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls',
  94                     fatal=False, headers=headers))
  95             elif playlist['mimeType'] == 'application/dash+xml':
  96                 formats.extend(self._extract_mpd_formats(
  97                     playlist['playbackUrl'], video_id, mpd_id='dash',
  98                     fatal=False, headers=headers))
  99             elif playlist['mimeType'] == 'application/vnd.ms-sstr+xml':
 100                 formats.extend(self._extract_ism_formats(
 101                     playlist['playbackUrl'], video_id, ism_id='mss',
 102                     fatal=False, headers=headers))
 103         formats = [merge_dicts(f, {'language': language}) for f in formats]
 104
 105         return {
 106             'id': video_id,
 107             'title': video_data['name'],
 108             'description': video_data.get('description'),
 109             'uploader': try_get(video_data, lambda x: x['creator']['name'], str),
 110             'uploader_id': try_get(video_data, (lambda x: x['creator']['mail'],
 111                                                 lambda x: x['creator']['id']), str),
 112             'thumbnails': thumbnails,
 113             **self.extract_all_subtitles(api_url, video_id, headers),
 114             'timestamp': parse_iso8601(video_data.get('created')),
 115             'duration': parse_duration(try_get(video_data, lambda x: x['media']['duration'])),
 116             'webpage_url': f'https://web.microsoftstream.com/video/{video_id}',
 117             'view_count': try_get(video_data, lambda x: x['metrics']['views'], int),
 118             'like_count': try_get(video_data, lambda x: x['metrics']['likes'], int),
 119             'comment_count': try_get(video_data, lambda x: x['metrics']['comments'], int),
 120             'formats': formats,
 121         }