]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/mediasite.py
[misc] Add `hatch`, `ruff`, `pre-commit` and improve dev docs (#7409)
[yt-dlp.git] / yt_dlp / extractor / mediasite.py
1 import json
2 import re
3
4 from .common import InfoExtractor
5 from ..compat import (
6 compat_str,
7 compat_urlparse,
8 )
9 from ..utils import (
10 ExtractorError,
11 float_or_none,
12 mimetype2ext,
13 smuggle_url,
14 str_or_none,
15 try_call,
16 try_get,
17 unsmuggle_url,
18 url_or_none,
19 urljoin,
20 )
21
# Regex fragment for a Mediasite resource id, shared by the URL patterns below.
# Accepts either 32-34 hex digits, or the dashed 8-4-4-4-12 layout whose last
# group may run to 14 hex digits. Only lowercase hex is listed here; patterns
# that embed this fragment add (?i) themselves when they need it.
_ID_RE = r'(?:[0-9a-f]{32,34}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12,14})'
23
24
class MediasiteIE(InfoExtractor):
    """Extractor for a single Mediasite presentation.

    Handles ``/Mediasite/Play/<id>`` and
    ``/Mediasite/Showcase/<name>/Presentation/<id>`` URLs, as well as
    ``<iframe>`` embeds of the ``/Mediasite/Play/<id>`` form.
    """

    _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/[^/#?]+/Presentation)/(?P<id>%s)(?P<query>\?[^#]+|)' % _ID_RE
    _EMBED_REGEX = [r'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/%s(?:\?.*?)?)\1' % _ID_RE]
    _TESTS = [
        {
            'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d',
            'info_dict': {
                'id': '2db6c271681e4f199af3c60d1f82869b1d',
                'ext': 'mp4',
                'title': 'Lecture: Tuesday, September 20, 2016 - Sir Andrew Wiles',
                'description': 'Sir Andrew Wiles: “Equations in arithmetic”\\n\\nI will describe some of the interactions between modern number theory and the problem of solving equations in rational numbers or integers\\u0027.',
                'timestamp': 1474268400.0,
                'upload_date': '20160919',
            },
        },
        {
            'url': 'http://mediasite.uib.no/Mediasite/Play/90bb363295d945d6b548c867d01181361d?catalog=a452b7df-9ae1-46b7-a3ba-aceeb285f3eb',
            'info_dict': {
                'id': '90bb363295d945d6b548c867d01181361d',
                'ext': 'mp4',
                'upload_date': '20150429',
                'title': '5) IT-forum 2015-Dag 1 - Dungbeetle - How and why Rain created a tiny bug tracker for Unity',
                'timestamp': 1430311380.0,
            },
        },
        {
            'url': 'https://collegerama.tudelft.nl/Mediasite/Play/585a43626e544bdd97aeb71a0ec907a01d',
            'md5': '481fda1c11f67588c0d9d8fbdced4e39',
            'info_dict': {
                'id': '585a43626e544bdd97aeb71a0ec907a01d',
                'ext': 'mp4',
                'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.',
                'description': '',
                'thumbnail': r're:^https?://.*\.jpg(?:\?.*)?$',
                'duration': 7713.088,
                'timestamp': 1413309600,
                'upload_date': '20141014',
            },
        },
        {
            'url': 'https://collegerama.tudelft.nl/Mediasite/Play/86a9ea9f53e149079fbdb4202b521ed21d?catalog=fd32fd35-6c99-466c-89d4-cd3c431bc8a4',
            'md5': 'ef1fdded95bdf19b12c5999949419c92',
            'info_dict': {
                'id': '86a9ea9f53e149079fbdb4202b521ed21d',
                'ext': 'wmv',
                'title': '64ste Vakantiecursus: Afvalwater',
                'description': 'md5:7fd774865cc69d972f542b157c328305',
                'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$',
                'duration': 10853,
                'timestamp': 1326446400,
                'upload_date': '20120113',
            },
        },
        {
            'url': 'http://digitalops.sandia.gov/Mediasite/Play/24aace4429fc450fb5b38cdbf424a66e1d',
            'md5': '9422edc9b9a60151727e4b6d8bef393d',
            'info_dict': {
                'id': '24aace4429fc450fb5b38cdbf424a66e1d',
                'ext': 'mp4',
                'title': 'Xyce Software Training - Section 1',
                'description': r're:(?s)SAND Number: SAND 2013-7800.{200,}',
                'upload_date': '20120409',
                'timestamp': 1333983600,
                'duration': 7794,
            }
        },
        {
            'url': 'https://collegerama.tudelft.nl/Mediasite/Showcase/livebroadcast/Presentation/ada7020854f743c49fbb45c9ec7dbb351d',
            'only_matching': True,
        },
        {
            'url': 'https://mediasite.ntnu.no/Mediasite/Showcase/default/Presentation/7d8b913259334b688986e970fae6fcb31d',
            'only_matching': True,
        },
        {
            # dashed id
            'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271-681e-4f19-9af3-c60d1f82869b1d',
            'only_matching': True,
        }
    ]

    # look in Mediasite.Core.js (Mediasite.ContentStreamType[*])
    _STREAM_TYPES = {
        0: 'video1',  # the main video
        2: 'slide',
        3: 'presentation',
        4: 'video2',  # screencast?
        5: 'video3',
    }

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Yield embed URLs found in *webpage*, each smuggled with the
        embedding page's *url* under the 'UrlReferrer' key.

        _real_extract forwards that value to the player service.
        """
        for embed_url in super()._extract_embed_urls(url, webpage):
            yield smuggle_url(embed_url, {'UrlReferrer': url})

    def __extract_slides(self, *, stream_id, snum, Stream, duration, images):
        """Build an 'mhtml' format dict describing the slides of one stream.

        @param stream_id  Label of the stream type, used in the format id.
        @param snum       Index of the stream, used in the format id.
        @param Stream     Stream dict; 'SlideBaseUrl',
                          'SlideImageFileNameTemplate' and 'Slides' are read.
        @param duration   Fallback end time for the last slide; same unit as
                          the slides' 'Time' values.
        @param images     Mapping that may hold 'DefaultSlide' or
                          'DefaultStreamImage', used to cover the time before
                          the first slide.
        @returns          A format dict with one fragment per slide.
        """
        slide_base_url = Stream['SlideBaseUrl']

        fname_template = Stream['SlideImageFileNameTemplate']
        if fname_template != 'slide_{0:D4}.jpg':
            self.report_warning('Unusual slide file name template; report a bug if slide downloading fails')
        # Turn '{0:Dn}' placeholders into '{0:0n}' so that str.format()
        # zero-pads the slide number to n digits.
        fname_template = re.sub(r'\{0:D([0-9]+)\}', r'{0:0\1}', fname_template)

        fragments = []
        for i, slide in enumerate(Stream['Slides']):
            if i == 0:
                # The first slide does not start at 0: show a default image
                # (if the layout provides one) until it does.
                if slide['Time'] > 0:
                    default_slide = images.get('DefaultSlide')
                    if default_slide is None:
                        default_slide = images.get('DefaultStreamImage')
                    if default_slide is not None:
                        default_slide = default_slide['ImageFilename']
                    if default_slide is not None:
                        fragments.append({
                            'path': default_slide,
                            'duration': slide['Time'] / 1000,
                        })

            # A slide lasts until the next slide starts; the last one lasts
            # until `duration`, or gets zero length if that is unusable.
            next_time = try_call(
                lambda: Stream['Slides'][i + 1]['Time'],
                lambda: duration,
                lambda: slide['Time'],
                expected_type=(int, float))

            fragments.append({
                'path': fname_template.format(slide.get('Number', i + 1)),
                'duration': (next_time - slide['Time']) / 1000
            })

        return {
            'format_id': '%s-%u.slides' % (stream_id, snum),
            'ext': 'mhtml',
            'url': slide_base_url,
            'protocol': 'mhtml',
            'acodec': 'none',
            'vcodec': 'none',
            'format_note': 'Slides',
            'fragments': fragments,
            'fragment_base_url': slide_base_url,
        }

    def _real_extract(self, url):
        """Extract formats, thumbnails and metadata of one presentation.

        Downloads the player page, queries the player service's
        GetPlayerOptions endpoint and turns every stream of the returned
        presentation into formats.

        @raises ExtractorError  (expected) when the service returns no
                                presentation; the message carries the
                                service's 'PlayerPresentationStatusMessage'.
        """
        url, data = unsmuggle_url(url, {})
        mobj = self._match_valid_url(url)
        resource_id = mobj.group('id')
        query = mobj.group('query')

        webpage, urlh = self._download_webpage_handle(url, resource_id)  # XXX: add UrlReferrer?
        redirect_url = urlh.url

        # XXX: might have also extracted UrlReferrer and QueryString from the html
        service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex(
            r'<div[^>]+\bid=["\']ServicePath[^>]+>(.+?)</div>', webpage, resource_id,
            default='/Mediasite/PlayerService/PlayerService.svc/json'))

        player_options = self._download_json(
            '%s/GetPlayerOptions' % service_path, resource_id,
            headers={
                'Content-type': 'application/json; charset=utf-8',
                'X-Requested-With': 'XMLHttpRequest',
            },
            data=json.dumps({
                'getPlayerOptionsRequest': {
                    'ResourceId': resource_id,
                    'QueryString': query,
                    'UrlReferrer': data.get('UrlReferrer', ''),
                    'UseScreenReader': False,
                }
            }).encode('utf-8'))['d']

        presentation = player_options['Presentation']
        # This check has to come before any field of the presentation is
        # read: subscripting None would raise a TypeError and hide the
        # status message reported by the service.
        if presentation is None:
            raise ExtractorError(
                'Mediasite says: %s' % player_options['PlayerPresentationStatusMessage'],
                expected=True)

        title = presentation['Title']

        thumbnails = []
        formats = []
        for snum, Stream in enumerate(presentation['Streams']):
            stream_type = Stream.get('StreamType')
            if stream_type is None:
                continue

            video_urls = Stream.get('VideoUrls')
            if not isinstance(video_urls, list):
                video_urls = []

            # Unknown stream types get a generic 'type<N>' label
            stream_id = self._STREAM_TYPES.get(
                stream_type, 'type%u' % stream_type)

            stream_formats = []
            for unum, VideoUrl in enumerate(video_urls):
                video_url = url_or_none(VideoUrl.get('Location'))
                if not video_url:
                    continue
                # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS

                format_id = '%s-%u.%u' % (stream_id, snum, unum)
                media_type = VideoUrl.get('MediaType')
                if media_type == 'SS':
                    stream_formats.extend(self._extract_ism_formats(
                        video_url, resource_id,
                        ism_id=format_id,
                        fatal=False))
                elif media_type == 'Dash':
                    stream_formats.extend(self._extract_mpd_formats(
                        video_url, resource_id,
                        mpd_id=format_id,
                        fatal=False))
                else:
                    stream_formats.append({
                        'format_id': format_id,
                        'url': video_url,
                        'ext': mimetype2ext(VideoUrl.get('MimeType')),
                    })

            if Stream.get('HasSlideContent', False):
                images = player_options['PlayerLayoutOptions']['Images']
                stream_formats.append(self.__extract_slides(
                    stream_id=stream_id,
                    snum=snum,
                    Stream=Stream,
                    duration=presentation.get('Duration'),
                    images=images,
                ))

            # disprefer 'secondary' streams
            if stream_type != 0:
                for fmt in stream_formats:
                    fmt['quality'] = -10

            thumbnail_url = Stream.get('ThumbnailUrl')
            if thumbnail_url:
                thumbnails.append({
                    'id': '%s-%u' % (stream_id, snum),
                    'url': urljoin(redirect_url, thumbnail_url),
                    'preference': -1 if stream_type != 0 else 0,
                })
            formats.extend(stream_formats)

        # XXX: Presentation['Presenters']
        # XXX: Presentation['Transcript']

        return {
            'id': resource_id,
            'title': title,
            'description': presentation.get('Description'),
            # 'Duration' and 'UnixTime' are passed with a scale of 1000
            # (presumably milliseconds -> seconds; confirm against the API)
            'duration': float_or_none(presentation.get('Duration'), 1000),
            'timestamp': float_or_none(presentation.get('UnixTime'), 1000),
            'formats': formats,
            'thumbnails': thumbnails,
        }
278
279
class MediasiteCatalogIE(InfoExtractor):
    """Playlist extractor for ``/Mediasite/Catalog/Full/<catalog id>`` pages,
    optionally followed by a current folder id and a root dynamic folder id.
    """

    _VALID_URL = r'''(?xi)
                        (?P<url>https?://[^/]+/Mediasite)
                        /Catalog/Full/
                        (?P<catalog_id>{0})
                        (?:
                            /(?P<current_folder_id>{0})
                            /(?P<root_dynamic_folder_id>{0})
                        )?
                    '''.format(_ID_RE)
    _TESTS = [{
        'url': 'http://events7.mediasite.com/Mediasite/Catalog/Full/631f9e48530d454381549f955d08c75e21',
        'info_dict': {
            'id': '631f9e48530d454381549f955d08c75e21',
            'title': 'WCET Summit: Adaptive Learning in Higher Ed: Improving Outcomes Dynamically',
        },
        'playlist_count': 6,
        'expected_warnings': ['is not a supported codec'],
    }, {
        # with CurrentFolderId and RootDynamicFolderId
        'url': 'https://medaudio.medicine.iu.edu/Mediasite/Catalog/Full/9518c4a6c5cf4993b21cbd53e828a92521/97a9db45f7ab47428c77cd2ed74bb98f14/9518c4a6c5cf4993b21cbd53e828a92521',
        'info_dict': {
            'id': '9518c4a6c5cf4993b21cbd53e828a92521',
            'title': 'IUSM Family and Friends Sessions',
        },
        'playlist_count': 2,
    }, {
        'url': 'http://uipsyc.mediasite.com/mediasite/Catalog/Full/d5d79287c75243c58c50fef50174ec1b21',
        'only_matching': True,
    }, {
        # no AntiForgeryToken
        'url': 'https://live.libraries.psu.edu/Mediasite/Catalog/Full/8376d4b24dd1457ea3bfe4cf9163feda21',
        'only_matching': True,
    }, {
        'url': 'https://medaudio.medicine.iu.edu/Mediasite/Catalog/Full/9518c4a6c5cf4993b21cbd53e828a92521/97a9db45f7ab47428c77cd2ed74bb98f14/9518c4a6c5cf4993b21cbd53e828a92521',
        'only_matching': True,
    }, {
        # dashed id
        'url': 'http://events7.mediasite.com/Mediasite/Catalog/Full/631f9e48-530d-4543-8154-9f955d08c75e',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a playlist of all presentations listed for the catalog folder.

        The catalog page is fetched first to pick up the optional
        anti-forgery token; the presentation list itself comes from the
        GetPresentationsForFolder JSON endpoint.
        """
        match = self._match_valid_url(url)
        mediasite_url, catalog_id, folder_id, root_dynamic_folder_id = match.group(
            'url', 'catalog_id', 'current_folder_id', 'root_dynamic_folder_id')
        # Without an explicit folder in the URL, the catalog id doubles as
        # the current folder id
        current_folder_id = folder_id or catalog_id

        webpage = self._download_webpage(url, catalog_id)

        request_headers = {
            'Content-Type': 'application/json; charset=UTF-8',
            'Referer': url,
            'X-Requested-With': 'XMLHttpRequest',
        }

        # AntiForgeryToken is optional (e.g. [1])
        # 1. https://live.libraries.psu.edu/Mediasite/Catalog/Full/8376d4b24dd1457ea3bfe4cf9163feda21
        token = self._search_regex(
            r'AntiForgeryToken\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
            webpage, 'anti forgery token', default=None, group='value')
        if token:
            # The header carrying the token is named by the page as well
            token_header = self._search_regex(
                r'AntiForgeryHeaderName\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
                webpage, 'anti forgery header name',
                default='X-SOFO-AntiForgeryHeader', group='value')
            request_headers[token_header] = token

        payload = {
            'IsViewPage': True,
            'IsNewFolder': True,
            'AuthTicket': None,
            'CatalogId': catalog_id,
            'CurrentFolderId': current_folder_id,
            'RootDynamicFolderId': root_dynamic_folder_id,
            'ItemsPerPage': 1000,
            'PageIndex': 0,
            'PermissionMask': 'Execute',
            'CatalogSearchType': 'SearchInFolder',
            'SortBy': 'Date',
            'SortDirection': 'Descending',
            'StartDate': None,
            'EndDate': None,
            'StatusFilterList': None,
            'PreviewKey': None,
            'Tags': [],
        }

        endpoint = '%s/Catalog/Data/GetPresentationsForFolder' % mediasite_url
        catalog = self._download_json(
            endpoint, catalog_id,
            data=json.dumps(payload).encode(), headers=request_headers)

        entries = []
        for item in catalog['PresentationDetailsList']:
            # Skip anything that is not a dict or has no usable id
            presentation_id = str_or_none(item.get('Id')) if isinstance(item, dict) else None
            if presentation_id:
                entries.append(self.url_result(
                    '%s/Play/%s' % (mediasite_url, presentation_id),
                    ie=MediasiteIE.ie_key(), video_id=presentation_id))

        title = try_get(
            catalog, lambda x: x['CurrentFolder']['Name'], compat_str)

        return self.playlist_result(entries, catalog_id, title)
389
390
class MediasiteNamedCatalogIE(InfoExtractor):
    """Resolve ``/Mediasite/Catalog/catalogs/<name>`` pages to the matching
    ``/Mediasite/Catalog/Full/<id>`` URL handled by MediasiteCatalogIE.
    """

    _VALID_URL = r'(?xi)(?P<url>https?://[^/]+/Mediasite)/Catalog/catalogs/(?P<catalog_name>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://msite.misis.ru/Mediasite/Catalog/catalogs/2016-industrial-management-skriabin-o-o',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Look up the catalog id in the named catalog page and delegate."""
        mediasite_url, catalog_name = self._match_valid_url(url).group('url', 'catalog_name')

        page = self._download_webpage(url, catalog_name)

        # The page exposes the id as `CatalogId: "<id>"`
        catalog_id = self._search_regex(
            r'CatalogId\s*:\s*["\'](%s)' % _ID_RE, page, 'catalog id')

        full_catalog_url = '%s/Catalog/Full/%s' % (mediasite_url, catalog_id)
        return self.url_result(
            full_catalog_url, ie=MediasiteCatalogIE.ie_key(), video_id=catalog_id)