yt_dlp/extractor/panopto.py

   1 import re
   2 import calendar
   3 import json
   4 import functools
   5 from datetime import datetime
   6 from random import random
   7
   8 from .common import InfoExtractor
   9 from ..compat import (
  10     compat_urllib_parse_urlparse,
  11     compat_urlparse
  12 )
  13
  14 from ..utils import (
  15     bug_reports_message,
  16     ExtractorError,
  17     get_first,
  18     int_or_none,
  19     OnDemandPagedList,
  20     parse_qs,
  21     traverse_obj,
  22 )
  23
  24
  25 class PanoptoBaseIE(InfoExtractor):
  26     BASE_URL_RE = r'(?P<base_url>https?://[\w.]+\.panopto.(?:com|eu)/Panopto)'
  27
  28     def _call_api(self, base_url, path, video_id, data=None, fatal=True, **kwargs):
  29         response = self._download_json(
  30             base_url + path, video_id, data=json.dumps(data).encode('utf8') if data else None,
  31             fatal=fatal, headers={'accept': 'application/json', 'content-type': 'application/json'}, **kwargs)
  32         if not response:
  33             return
  34         error_code = response.get('ErrorCode')
  35         if error_code == 2:
  36             self.raise_login_required(method='cookies')
  37         elif error_code is not None:
  38             msg = f'Panopto said: {response.get("ErrorMessage")}'
  39             if fatal:
  40                 raise ExtractorError(msg, video_id=video_id, expected=True)
  41             else:
  42                 self.report_warning(msg, video_id=video_id)
  43         return response
  44
  45     @staticmethod
  46     def _parse_fragment(url):
  47         return {k: json.loads(v[0]) for k, v in compat_urlparse.parse_qs(compat_urllib_parse_urlparse(url).fragment).items()}
  48
  49     @staticmethod
  50     def _extract_urls(webpage):
  51         return [m.group('url') for m in re.finditer(
  52             r'<iframe[^>]+src=["\'](?P<url>%s/Pages/(Viewer|Embed|Sessions/List)\.aspx[^"\']+)' % PanoptoIE.BASE_URL_RE,
  53             webpage)]
  54
  55
  56 class PanoptoIE(PanoptoBaseIE):
  57     _VALID_URL = PanoptoBaseIE.BASE_URL_RE + r'/Pages/(Viewer|Embed)\.aspx.*(?:\?|&)id=(?P<id>[a-f0-9-]+)'
  58     _TESTS = [
  59         {
  60             'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=26b3ae9e-4a48-4dcc-96ba-0befba08a0fb',
  61             'info_dict': {
  62                 'id': '26b3ae9e-4a48-4dcc-96ba-0befba08a0fb',
  63                 'title': 'Panopto for Business - Use Cases',
  64                 'timestamp': 1459184200,
  65                 'thumbnail': r're:https://demo\.hosted\.panopto\.com/Panopto/Services/FrameGrabber\.svc/FrameRedirect\?objectId=26b3ae9e-4a48-4dcc-96ba-0befba08a0fb&mode=Delivery&random=[\d.]+',
  66                 'upload_date': '20160328',
  67                 'ext': 'mp4',
  68                 'cast': [],
  69                 'duration': 88.17099999999999,
  70                 'average_rating': int,
  71                 'uploader_id': '2db6b718-47a0-4b0b-9e17-ab0b00f42b1e',
  72                 'channel_id': 'e4c6a2fc-1214-4ca0-8fb7-aef2e29ff63a',
  73                 'channel': 'Showcase Videos'
  74             },
  75         },
  76         {
  77             'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=ed01b077-c9e5-4c7b-b8ff-15fa306d7a59',
  78             'info_dict': {
  79                 'id': 'ed01b077-c9e5-4c7b-b8ff-15fa306d7a59',
  80                 'title': 'Overcoming Top 4 Challenges of Enterprise Video',
  81                 'uploader': 'Panopto Support',
  82                 'timestamp': 1449409251,
  83                 'thumbnail': r're:https://demo\.hosted\.panopto\.com/Panopto/Services/FrameGrabber\.svc/FrameRedirect\?objectId=ed01b077-c9e5-4c7b-b8ff-15fa306d7a59&mode=Delivery&random=[\d.]+',
  84                 'upload_date': '20151206',
  85                 'ext': 'mp4',
  86                 'chapters': 'count:21',
  87                 'cast': ['Panopto Support'],
  88                 'uploader_id': 'a96d1a31-b4de-489b-9eee-b4a5b414372c',
  89                 'average_rating': int,
  90                 'description': 'md5:4391837802b3fc856dadf630c4b375d1',
  91                 'duration': 1088.2659999999998,
  92                 'channel_id': '9f3c1921-43bb-4bda-8b3a-b8d2f05a8546',
  93                 'channel': 'Webcasts',
  94             },
  95         },
  96         {
  97             # Extra params in URL
  98             'url': 'https://howtovideos.hosted.panopto.com/Panopto/Pages/Viewer.aspx?randomparam=thisisnotreal&id=5fa74e93-3d87-4694-b60e-aaa4012214ed&advance=true',
  99             'info_dict': {
 100                 'id': '5fa74e93-3d87-4694-b60e-aaa4012214ed',
 101                 'ext': 'mp4',
 102                 'duration': 129.513,
 103                 'cast': ['Kathryn Kelly'],
 104                 'uploader_id': '316a0a58-7fa2-4cd9-be1c-64270d284a56',
 105                 'timestamp': 1569845768,
 106                 'tags': ['Viewer', 'Enterprise'],
 107                 'upload_date': '20190930',
 108                 'thumbnail': r're:https://howtovideos\.hosted\.panopto\.com/Panopto/Services/FrameGrabber.svc/FrameRedirect\?objectId=5fa74e93-3d87-4694-b60e-aaa4012214ed&mode=Delivery&random=[\d.]+',
 109                 'description': 'md5:2d844aaa1b1a14ad0e2601a0993b431f',
 110                 'title': 'Getting Started: View a Video',
 111                 'average_rating': int,
 112                 'uploader': 'Kathryn Kelly',
 113                 'channel_id': 'fb93bc3c-6750-4b80-a05b-a921013735d3',
 114                 'channel': 'Getting Started',
 115             }
 116         },
 117         {
 118             # Does not allow normal Viewer.aspx. AUDIO livestream has no url, so should be skipped and only give one stream.
 119             'url': 'https://unisa.au.panopto.com/Panopto/Pages/Embed.aspx?id=9d9a0fa3-e99a-4ebd-a281-aac2017f4da4',
 120             'info_dict': {
 121                 'id': '9d9a0fa3-e99a-4ebd-a281-aac2017f4da4',
 122                 'ext': 'mp4',
 123                 'cast': ['LTS CLI Script'],
 124                 'duration': 2178.45,
 125                 'description': 'md5:ee5cf653919f55b72bce2dbcf829c9fa',
 126                 'channel_id': 'b23e673f-c287-4cb1-8344-aae9005a69f8',
 127                 'average_rating': int,
 128                 'uploader_id': '38377323-6a23-41e2-9ff6-a8e8004bf6f7',
 129                 'uploader': 'LTS CLI Script',
 130                 'timestamp': 1572458134,
 131                 'title': 'WW2 Vets Interview 3 Ronald Stanley George',
 132                 'thumbnail': r're:https://unisa\.au\.panopto\.com/Panopto/Services/FrameGrabber.svc/FrameRedirect\?objectId=9d9a0fa3-e99a-4ebd-a281-aac2017f4da4&mode=Delivery&random=[\d.]+',
 133                 'channel': 'World War II Veteran Interviews',
 134                 'upload_date': '20191030',
 135             },
 136         },
 137         {
 138             'url': 'https://ucc.cloud.panopto.eu/Panopto/Pages/Viewer.aspx?id=0e8484a4-4ceb-4d98-a63f-ac0200b455cb',
 139             'only_matching': True
 140         },
 141         {
 142             'url': 'https://brown.hosted.panopto.com/Panopto/Pages/Embed.aspx?id=0b3ff73b-36a0-46c5-8455-aadf010a3638',
 143             'only_matching': True
 144         },
 145     ]
 146
 147     @classmethod
 148     def suitable(cls, url):
 149         return False if PanoptoPlaylistIE.suitable(url) else super().suitable(url)
 150
 151     def _mark_watched(self, base_url, video_id, delivery_info):
 152         duration = traverse_obj(delivery_info, ('Delivery', 'Duration'), expected_type=float)
 153         invocation_id = delivery_info.get('InvocationId')
 154         stream_id = traverse_obj(delivery_info, ('Delivery', 'Streams', ..., 'PublicID'), get_all=False, expected_type=str)
 155         if invocation_id and stream_id and duration:
 156             timestamp_str = f'/Date({calendar.timegm(datetime.utcnow().timetuple())}000)/'
 157             data = {
 158                 'streamRequests': [
 159                     {
 160                         'ClientTimeStamp': timestamp_str,
 161                         'ID': 0,
 162                         'InvocationID': invocation_id,
 163                         'PlaybackSpeed': 1,
 164                         'SecondsListened': duration - 1,
 165                         'SecondsRejected': 0,
 166                         'StartPosition': 0,
 167                         'StartReason': 2,
 168                         'StopReason': None,
 169                         'StreamID': stream_id,
 170                         'TimeStamp': timestamp_str,
 171                         'UpdatesRejected': 0
 172                     },
 173                 ]}
 174
 175             self._download_webpage(
 176                 base_url + '/Services/Analytics.svc/AddStreamRequests', video_id,
 177                 fatal=False, data=json.dumps(data).encode('utf8'), headers={'content-type': 'application/json'},
 178                 note='Marking watched', errnote='Unable to mark watched')
 179
 180     @staticmethod
 181     def _extract_chapters(delivery):
 182         chapters = []
 183         for timestamp in delivery.get('Timestamps', []):
 184             start, duration = int_or_none(timestamp.get('Time')), int_or_none(timestamp.get('Duration'))
 185             if start is None or duration is None:
 186                 continue
 187             chapters.append({
 188                 'start_time': start,
 189                 'end_time': start + duration,
 190                 'title': timestamp.get('Caption')
 191             })
 192         return chapters
 193
 194     def _extract_streams_formats_and_subtitles(self, video_id, streams, **fmt_kwargs):
 195         formats = []
 196         subtitles = {}
 197         for stream in streams or []:
 198             stream_formats = []
 199             http_stream_url = stream.get('StreamHttpUrl')
 200             stream_url = stream.get('StreamUrl')
 201
 202             if http_stream_url:
 203                 stream_formats.append({'url': http_stream_url})
 204
 205             if stream_url:
 206                 media_type = stream.get('ViewerMediaFileTypeName')
 207                 if media_type in ('hls', ):
 208                     m3u8_formats, stream_subtitles = self._extract_m3u8_formats_and_subtitles(stream_url, video_id)
 209                     stream_formats.extend(m3u8_formats)
 210                     subtitles = self._merge_subtitles(subtitles, stream_subtitles)
 211                 else:
 212                     stream_formats.append({
 213                         'url': stream_url
 214                     })
 215             for fmt in stream_formats:
 216                 fmt.update({
 217                     'format_note': stream.get('Tag'),
 218                     **fmt_kwargs
 219                 })
 220             formats.extend(stream_formats)
 221
 222         return formats, subtitles
 223
 224     def _real_extract(self, url):
 225         base_url, video_id = self._match_valid_url(url).group('base_url', 'id')
 226         delivery_info = self._call_api(
 227             base_url, '/Pages/Viewer/DeliveryInfo.aspx', video_id,
 228             query={
 229                 'deliveryId': video_id,
 230                 'invocationId': '',
 231                 'isLiveNotes': 'false',
 232                 'refreshAuthCookie': 'true',
 233                 'isActiveBroadcast': 'false',
 234                 'isEditing': 'false',
 235                 'isKollectiveAgentInstalled': 'false',
 236                 'isEmbed': 'false',
 237                 'responseType': 'json',
 238             }
 239         )
 240
 241         delivery = delivery_info['Delivery']
 242         session_start_time = int_or_none(delivery.get('SessionStartTime'))
 243
 244         # Podcast stream is usually the combined streams. We will prefer that by default.
 245         podcast_formats, podcast_subtitles = self._extract_streams_formats_and_subtitles(
 246             video_id, delivery.get('PodcastStreams'), format_note='PODCAST')
 247
 248         streams_formats, streams_subtitles = self._extract_streams_formats_and_subtitles(
 249             video_id, delivery.get('Streams'), preference=-10)
 250
 251         formats = podcast_formats + streams_formats
 252         subtitles = self._merge_subtitles(podcast_subtitles, streams_subtitles)
 253         self._sort_formats(formats)
 254
 255         self.mark_watched(base_url, video_id, delivery_info)
 256
 257         return {
 258             'id': video_id,
 259             'title': delivery.get('SessionName'),
 260             'cast': traverse_obj(delivery, ('Contributors', ..., 'DisplayName'), default=[], expected_type=lambda x: x or None),
 261             'timestamp': session_start_time - 11640000000 if session_start_time else None,
 262             'duration': delivery.get('Duration'),
 263             'thumbnail': base_url + f'/Services/FrameGrabber.svc/FrameRedirect?objectId={video_id}&mode=Delivery&random={random()}',
 264             'average_rating': delivery.get('AverageRating'),
 265             'chapters': self._extract_chapters(delivery) or None,
 266             'uploader': delivery.get('OwnerDisplayName') or None,
 267             'uploader_id': delivery.get('OwnerId'),
 268             'description': delivery.get('SessionAbstract'),
 269             'tags': traverse_obj(delivery, ('Tags', ..., 'Content')),
 270             'channel_id': delivery.get('SessionGroupPublicID'),
 271             'channel': traverse_obj(delivery, 'SessionGroupLongName', 'SessionGroupShortName', get_all=False),
 272             'formats': formats,
 273             'subtitles': subtitles
 274         }
 275
 276
 277 class PanoptoPlaylistIE(PanoptoBaseIE):
 278     _VALID_URL = PanoptoBaseIE.BASE_URL_RE + r'/Pages/(Viewer|Embed)\.aspx.*(?:\?|&)pid=(?P<id>[a-f0-9-]+)'
 279     _TESTS = [
 280         {
 281             'url': 'https://howtovideos.hosted.panopto.com/Panopto/Pages/Viewer.aspx?pid=f3b39fcf-882f-4849-93d6-a9f401236d36&id=5fa74e93-3d87-4694-b60e-aaa4012214ed&advance=true',
 282             'info_dict': {
 283                 'title': 'Featured Video Tutorials',
 284                 'id': 'f3b39fcf-882f-4849-93d6-a9f401236d36',
 285                 'description': '',
 286             },
 287             'playlist_mincount': 36
 288         },
 289         {
 290             'url': 'https://utsa.hosted.panopto.com/Panopto/Pages/Viewer.aspx?pid=e2900555-3ad4-4bdb-854d-ad2401686190',
 291             'info_dict': {
 292                 'title': 'Library Website Introduction Playlist',
 293                 'id': 'e2900555-3ad4-4bdb-854d-ad2401686190',
 294                 'description': 'md5:f958bca50a1cbda15fdc1e20d32b3ecb',
 295             },
 296             'playlist_mincount': 4
 297         },
 298
 299     ]
 300
 301     def _entries(self, base_url, playlist_id, session_list_id):
 302         session_list_info = self._call_api(
 303             base_url, f'/Api/SessionLists/{session_list_id}?collections[0].maxCount=500&collections[0].name=items', playlist_id)
 304
 305         items = session_list_info['Items']
 306         for item in items:
 307             if item.get('TypeName') != 'Session':
 308                 self.report_warning('Got an item in the playlist that is not a Session' + bug_reports_message(), only_once=True)
 309                 continue
 310             yield {
 311                 '_type': 'url',
 312                 'id': item.get('Id'),
 313                 'url': item.get('ViewerUri'),
 314                 'title': item.get('Name'),
 315                 'description': item.get('Description'),
 316                 'duration': item.get('Duration'),
 317                 'channel': traverse_obj(item, ('Parent', 'Name')),
 318                 'channel_id': traverse_obj(item, ('Parent', 'Id'))
 319             }
 320
 321     def _real_extract(self, url):
 322         base_url, playlist_id = self._match_valid_url(url).group('base_url', 'id')
 323
 324         video_id = get_first(parse_qs(url), 'id')
 325         if video_id:
 326             if self.get_param('noplaylist'):
 327                 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
 328                 return self.url_result(base_url + f'/Pages/Viewer.aspx?id={video_id}', ie_key=PanoptoIE.ie_key(), video_id=video_id)
 329             else:
 330                 self.to_screen(f'Downloading playlist {playlist_id}; add --no-playlist to just download video {video_id}')
 331
 332         playlist_info = self._call_api(base_url, f'/Api/Playlists/{playlist_id}', playlist_id)
 333         return self.playlist_result(
 334             self._entries(base_url, playlist_id, playlist_info['SessionListId']),
 335             playlist_id=playlist_id, playlist_title=playlist_info.get('Name'),
 336             playlist_description=playlist_info.get('Description'))
 337
 338
 339 class PanoptoListIE(PanoptoBaseIE):
 340     _VALID_URL = PanoptoBaseIE.BASE_URL_RE + r'/Pages/Sessions/List\.aspx'
 341     _PAGE_SIZE = 250
 342     _TESTS = [
 343         {
 344             'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Sessions/List.aspx#folderID=%22e4c6a2fc-1214-4ca0-8fb7-aef2e29ff63a%22',
 345             'info_dict': {
 346                 'id': 'e4c6a2fc-1214-4ca0-8fb7-aef2e29ff63a',
 347                 'title': 'Showcase Videos'
 348             },
 349             'playlist_mincount': 140
 350
 351         },
 352         {
 353             'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Sessions/List.aspx#view=2&maxResults=250',
 354             'info_dict': {
 355                 'id': 'panopto_list',
 356                 'title': 'panopto_list'
 357             },
 358             'playlist_mincount': 300
 359         },
 360         {
 361             # Folder that contains 8 folders and a playlist
 362             'url': 'https://howtovideos.hosted.panopto.com/Panopto/Pages/Sessions/List.aspx?noredirect=true#folderID=%224b9de7ae-0080-4158-8496-a9ba01692c2e%22',
 363             'info_dict': {
 364                 'id': '4b9de7ae-0080-4158-8496-a9ba01692c2e',
 365                 'title': 'Video Tutorials'
 366             },
 367             'playlist_mincount': 9
 368         }
 369
 370     ]
 371
 372     def _fetch_page(self, base_url, query_params, display_id, page):
 373
 374         params = {
 375             'sortColumn': 1,
 376             'getFolderData': True,
 377             'includePlaylists': True,
 378             **query_params,
 379             'page': page,
 380             'maxResults': self._PAGE_SIZE,
 381         }
 382
 383         response = self._call_api(
 384             base_url, '/Services/Data.svc/GetSessions', f'{display_id} page {page+1}',
 385             data={'queryParameters': params}, fatal=False)
 386
 387         for result in get_first(response, 'Results', default=[]):
 388             # This could be a video, playlist (or maybe something else)
 389             item_id = result.get('DeliveryID')
 390             yield {
 391                 '_type': 'url',
 392                 'id': item_id,
 393                 'title': result.get('SessionName'),
 394                 'url': traverse_obj(result, 'ViewerUrl', 'EmbedUrl', get_all=False) or (base_url + f'/Pages/Viewer.aspx?id={item_id}'),
 395                 'duration': result.get('Duration'),
 396                 'channel': result.get('FolderName'),
 397                 'channel_id': result.get('FolderID'),
 398             }
 399
 400         for folder in get_first(response, 'Subfolders', default=[]):
 401             folder_id = folder.get('ID')
 402             yield self.url_result(
 403                 base_url + f'/Pages/Sessions/List.aspx#folderID="{folder_id}"',
 404                 ie_key=PanoptoListIE.ie_key(), video_id=folder_id, title=folder.get('Name'))
 405
 406     def _extract_folder_metadata(self, base_url, folder_id):
 407         response = self._call_api(
 408             base_url, '/Services/Data.svc/GetFolderInfo', folder_id,
 409             data={'folderID': folder_id}, fatal=False)
 410         return {
 411             'title': get_first(response, 'Name', default=[])
 412         }
 413
 414     def _real_extract(self, url):
 415         mobj = self._match_valid_url(url)
 416         base_url = mobj.group('base_url')
 417
 418         query_params = self._parse_fragment(url)
 419         folder_id, display_id = query_params.get('folderID'), 'panopto_list'
 420
 421         if query_params.get('isSubscriptionsPage'):
 422             display_id = 'subscriptions'
 423             if not query_params.get('subscribableTypes'):
 424                 query_params['subscribableTypes'] = [0, 1, 2]
 425         elif query_params.get('isSharedWithMe'):
 426             display_id = 'sharedwithme'
 427         elif folder_id:
 428             display_id = folder_id
 429
 430         query = query_params.get('query')
 431         if query:
 432             display_id += f': query "{query}"'
 433
 434         info = {
 435             '_type': 'playlist',
 436             'id': display_id,
 437             'title': display_id,
 438         }
 439         if folder_id:
 440             info.update(self._extract_folder_metadata(base_url, folder_id))
 441
 442         info['entries'] = OnDemandPagedList(
 443             functools.partial(self._fetch_page, base_url, query_params, display_id), self._PAGE_SIZE)
 444
 445         return info