]>
Commit | Line | Data |
---|---|---|
1 | import re | |
2 | import urllib.error | |
3 | import urllib.parse | |
4 | from base64 import b64decode | |
5 | ||
6 | from .common import InfoExtractor | |
7 | from ..utils import ( | |
8 | ExtractorError, | |
9 | HEADRequest, | |
10 | determine_ext, | |
11 | float_or_none, | |
12 | int_or_none, | |
13 | parse_qs, | |
14 | traverse_obj, | |
15 | try_get, | |
16 | update_url_query, | |
17 | urlhandle_detect_ext, | |
18 | ) | |
19 | ||
20 | ||
class WistiaBaseIE(InfoExtractor):
    """Shared helpers for the Wistia media, playlist and channel extractors."""

    # Wistia hashed IDs are ten lowercase alphanumeric characters.
    _VALID_ID_REGEX = r'(?P<id>[a-z0-9]{10})'
    _VALID_URL_BASE = r'https?://(?:\w+\.)?wistia\.(?:net|com)/(?:embed/)?'
    _EMBED_BASE_URL = 'http://fast.wistia.net/embed/'

    def _download_embed_config(self, config_type, config_id, referer):
        """Download and return the JSON embed config for `config_type`/`config_id`.

        `referer` is sent as the Referer header when it looks like an HTTP(S)
        URL; otherwise the embed URL itself is used.

        Raises an expected ExtractorError when the response has an 'error' entry.
        """
        base_url = self._EMBED_BASE_URL + '%s/%s' % (config_type, config_id)
        embed_config = self._download_json(
            base_url + '.json', config_id, headers={
                'Referer': referer if referer.startswith('http') else base_url,  # Some videos require this.
            })

        error = traverse_obj(embed_config, 'error')
        if error:
            raise ExtractorError(
                f'Error while getting the playlist: {error}', expected=True)

        return embed_config

    def _get_real_ext(self, url):
        """Return the extension for `url`, probing with a HEAD request for '.bin' URLs.

        A 'mov' result is reported as 'mp4'.
        """
        ext = determine_ext(url, default_ext='bin')
        if ext == 'bin':
            # The URL does not reveal the real type; ask the server (non-fatal).
            urlh = self._request_webpage(
                HEADRequest(url), None, note='Checking media extension',
                errnote='HEAD request returned error', fatal=False)
            if urlh:
                ext = urlhandle_detect_ext(urlh, default='bin')
        return 'mp4' if ext == 'mov' else ext

    def _extract_media(self, embed_config):
        """Build an info dict (formats, thumbnails, subtitles, metadata) from `embed_config['media']`."""
        data = embed_config['media']
        video_id = data['hashedId']
        title = data['name']

        formats = []
        thumbnails = []
        for a in data['assets']:
            aurl = a.get('url')
            if not aurl:
                continue
            astatus = a.get('status')
            atype = a.get('type')
            # Skip assets whose status is set but is not 2, and preview/storyboard assets.
            if (astatus is not None and astatus != 2) or atype in ('preview', 'storyboard'):
                continue
            elif atype in ('still', 'still_image'):
                thumbnails.append({
                    'url': aurl.replace('.bin', f'.{self._get_real_ext(aurl)}'),
                    'width': int_or_none(a.get('width')),
                    'height': int_or_none(a.get('height')),
                    'filesize': int_or_none(a.get('size')),
                })
            else:
                aext = a.get('ext') or self._get_real_ext(aurl)
                display_name = a.get('display_name')
                format_id = atype
                if atype and atype.endswith('_video') and display_name:
                    # Drop the trailing '_video' and append the display name.
                    format_id = '%s-%s' % (atype[:-6], display_name)
                f = {
                    'format_id': format_id,
                    'url': aurl,
                    'tbr': int_or_none(a.get('bitrate')) or None,
                    'quality': 1 if atype == 'original' else None,
                }
                if display_name == 'Audio':
                    f.update({
                        'vcodec': 'none',
                    })
                else:
                    f.update({
                        'width': int_or_none(a.get('width')),
                        'height': int_or_none(a.get('height')),
                        'vcodec': a.get('codec'),
                    })
                if a.get('container') == 'm3u8' or aext == 'm3u8':
                    # Emit an extra '.ts' variant alongside the native HLS format.
                    ts_f = f.copy()
                    ts_f.update({
                        'ext': 'ts',
                        'format_id': f['format_id'].replace('hls-', 'ts-'),
                        'url': f['url'].replace('.bin', '.ts'),
                    })
                    formats.append(ts_f)
                    f.update({
                        'ext': 'mp4',
                        'protocol': 'm3u8_native',
                    })
                else:
                    f.update({
                        'container': a.get('container'),
                        'ext': aext,
                        'filesize': int_or_none(a.get('size')),
                    })
                formats.append(f)

        subtitles = {}
        # `or []` also covers a 'captions' key that is present but null.
        for caption in data.get('captions') or []:
            language = caption.get('language')
            if not language:
                continue
            subtitles[language] = [{
                'url': self._EMBED_BASE_URL + 'captions/' + video_id + '.vtt?language=' + language,
            }]

        return {
            'id': video_id,
            'title': title,
            'description': data.get('seoDescription'),
            'formats': formats,
            'thumbnails': thumbnails,
            'duration': float_or_none(data.get('duration')),
            'timestamp': int_or_none(data.get('createdAt')),
            'subtitles': subtitles,
        }

    @classmethod
    def _extract_from_webpage(cls, url, webpage):
        """Yield results from the parent implementation unless the page has Teachable embeds."""
        from .teachable import TeachableIE

        # Pages for which TeachableIE finds embed URLs are left to that extractor.
        if list(TeachableIE._extract_embed_urls(url, webpage)):
            return

        yield from super()._extract_from_webpage(url, webpage)

    @classmethod
    def _extract_wistia_async_embed(cls, webpage):
        """Yield regex matches (groups 'type' and 'id') for async embed containers in `webpage`."""
        # https://wistia.com/support/embed-and-share/video-on-your-website
        # https://wistia.com/support/embed-and-share/channel-embeds
        yield from re.finditer(
            r'''(?sx)
                <(?:div|section)[^>]+class=([\"'])(?:(?!\1).)*?(?P<type>wistia[a-z_0-9]+)\s*\bwistia_async_(?P<id>[a-z0-9]{10})\b(?:(?!\1).)*?\1
            ''', webpage)

    @classmethod
    def _extract_url_media_id(cls, url):
        """Return the media ID from a wmediaid/wvideo/wvideoid parameter of `url`, or None."""
        # The URL is unquoted first so percent-encoded forms such as 'amp%5Bwmediaid%5D=' match.
        mobj = re.search(r'(?:wmediaid|wvideo(?:id)?)]?=(?P<id>[a-z0-9]{10})', urllib.parse.unquote_plus(url))
        if mobj:
            return mobj.group('id')
157 | ||
158 | ||
class WistiaIE(WistiaBaseIE):
    """Extractor for single Wistia media (``wistia:<id>`` and iframe/medias embed URLs)."""

    _VALID_URL = r'(?:wistia:|%s(?:iframe|medias)/)%s' % (WistiaBaseIE._VALID_URL_BASE, WistiaBaseIE._VALID_ID_REGEX)
    _EMBED_REGEX = [
        r'''(?x)
            <(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\']
            (?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})
            ''']
    _TESTS = [{
        # with hls video
        'url': 'wistia:807fafadvk',
        'md5': 'daff0f3687a41d9a71b40e0e8c2610fe',
        'info_dict': {
            'id': '807fafadvk',
            'ext': 'mp4',
            'title': 'Drip Brennan Dunn Workshop',
            'description': 'a JV Webinars video',
            'upload_date': '20160518',
            'timestamp': 1463607249,
            'duration': 4987.11,
        },
        'skip': 'video unavailable',
    }, {
        'url': 'wistia:a6ndpko1wg',
        'md5': '10c1ce9c4dde638202513ed17a3767bd',
        'info_dict': {
            'id': 'a6ndpko1wg',
            'ext': 'mp4',
            'title': 'Episode 2: Boxed Water\'s retention is thirsty',
            'upload_date': '20210324',
            'description': 'md5:da5994c2c2d254833b412469d9666b7a',
            'duration': 966.0,
            'timestamp': 1616614369,
            'thumbnail': 'https://embed-ssl.wistia.com/deliveries/53dc60239348dc9b9fba3755173ea4c2.png',
        }
    }, {
        'url': 'wistia:5vd7p4bct5',
        'md5': 'b9676d24bf30945d97060638fbfe77f0',
        'info_dict': {
            'id': '5vd7p4bct5',
            'ext': 'mp4',
            'title': 'md5:eaa9f64c4efd7b5f098b9b6118597679',
            'description': 'md5:a9bea0315f0616aa5df2dc413ddcdd0f',
            'upload_date': '20220915',
            'timestamp': 1663258727,
            'duration': 623.019,
            'thumbnail': r're:https?://embed(?:-ssl)?.wistia.com/.+\.jpg$',
        },
    }, {
        'url': 'wistia:sh7fpupwlt',
        'only_matching': True,
    }, {
        'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt',
        'only_matching': True,
    }, {
        'url': 'http://fast.wistia.com/embed/iframe/sh7fpupwlt',
        'only_matching': True,
    }, {
        'url': 'http://fast.wistia.net/embed/medias/sh7fpupwlt.json',
        'only_matching': True,
    }]

    _WEBPAGE_TESTS = [{
        'url': 'https://www.weidert.com/blog/wistia-channels-video-marketing-tool',
        'info_dict': {
            'id': 'cqwukac3z1',
            'ext': 'mp4',
            'title': 'How Wistia Channels Can Help Capture Inbound Value From Your Video Content',
            'duration': 158.125,
            'timestamp': 1618974400,
            'description': 'md5:27abc99a758573560be72600ef95cece',
            'upload_date': '20210421',
            'thumbnail': 'https://embed-ssl.wistia.com/deliveries/6c551820ae950cdee2306d6cbe9ef742.jpg',
        }
    }, {
        'url': 'https://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
        'md5': 'b9676d24bf30945d97060638fbfe77f0',
        'info_dict': {
            'id': '5vd7p4bct5',
            'ext': 'mp4',
            'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england',
            'upload_date': '20220915',
            'timestamp': 1663258727,
            'duration': 623.019,
            'thumbnail': 'https://embed-ssl.wistia.com/deliveries/83e6ec693e2c05a0ce65809cbaead86a.jpg',
            'description': 'a Paywall Videos video',
        },
    }]

    def _real_extract(self, url):
        """Download the 'medias' embed config for the matched ID and convert it to an info dict."""
        video_id = self._match_id(url)
        embed_config = self._download_embed_config('medias', video_id, url)
        return self._extract_media(embed_config)

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Return a list of ``wistia:<id>`` URLs (plus parent-detected embed URLs) found for `webpage`.

        Sources, in order: the parent implementation, async embed containers
        that are not channels, inline ``data-wistia-id`` / ``Wistia.embed(`` /
        ``id="wistia_…"`` markers, and finally (only when the page yields no
        channel embed) a media ID taken from the page URL's query parameters.
        """
        urls = list(super()._extract_embed_urls(url, webpage))
        for match in cls._extract_wistia_async_embed(webpage):
            # Channel containers are handled by WistiaChannelIE.
            if match.group('type') != 'wistia_channel':
                urls.append('wistia:%s' % match.group('id'))
        for match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})',
                                 webpage):
            urls.append('wistia:%s' % match.group('id'))

        # WistiaChannelIE._extract_embed_urls is a generator; a generator object is
        # always truthy, so check whether it actually yields anything.
        has_channel_embed = next(iter(WistiaChannelIE._extract_embed_urls(url, webpage)), None) is not None
        if not has_channel_embed:  # Fallback
            media_id = cls._extract_url_media_id(url)
            if media_id:
                # Use the ID parsed from the URL, not a leftover regex match from the loops above.
                urls.append('wistia:%s' % media_id)
        return urls
266 | ||
267 | ||
class WistiaPlaylistIE(WistiaBaseIE):
    """Extractor for Wistia playlist embeds."""

    _VALID_URL = r'%splaylists/%s' % (WistiaBaseIE._VALID_URL_BASE, WistiaBaseIE._VALID_ID_REGEX)

    _TEST = {
        'url': 'https://fast.wistia.net/embed/playlists/aodt9etokc',
        'info_dict': {
            'id': 'aodt9etokc',
        },
        'playlist_count': 3,
    }

    def _real_extract(self, url):
        """Return a playlist result with one entry per media that carries an embed config."""
        playlist_id = self._match_id(url)
        playlist = self._download_embed_config('playlists', playlist_id, url)

        # Medias are read from the first element of the response; anything missing gives no entries.
        medias = try_get(playlist, lambda x: x[0]['medias']) or []
        embed_configs = (media.get('embed_config') for media in medias)
        entries = [self._extract_media(config) for config in embed_configs if config]

        return self.playlist_result(entries, playlist_id)
291 | ||
292 | ||
class WistiaChannelIE(WistiaBaseIE):
    """Extractor for Wistia channels (``wistiachannel:<id>`` and channel embed URLs)."""

    _VALID_URL = r'(?:wistiachannel:|%schannel/)%s' % (WistiaBaseIE._VALID_URL_BASE, WistiaBaseIE._VALID_ID_REGEX)

    _TESTS = [{
        # JSON Embed API returns 403, should fall back to webpage
        'url': 'https://fast.wistia.net/embed/channel/yvyvu7wjbg?wchannelid=yvyvu7wjbg',
        'info_dict': {
            'id': 'yvyvu7wjbg',
            'title': 'Copysmith Tutorials and Education!',
            'description': 'Learn all things Copysmith via short and informative videos!'
        },
        'playlist_mincount': 7,
        'expected_warnings': ['falling back to webpage'],
    }, {
        'url': 'https://fast.wistia.net/embed/channel/3802iirk0l',
        'info_dict': {
            'id': '3802iirk0l',
            'title': 'The Roof',
        },
        'playlist_mincount': 20,
    }, {
        # link to popup video, follow --no-playlist
        'url': 'https://fast.wistia.net/embed/channel/3802iirk0l?wchannelid=3802iirk0l&wmediaid=sp5dqjzw3n',
        'info_dict': {
            'id': 'sp5dqjzw3n',
            'ext': 'mp4',
            'title': 'The Roof S2: The Modern CRO',
            'thumbnail': 'https://embed-ssl.wistia.com/deliveries/dadfa9233eaa505d5e0c85c23ff70741.png',
            'duration': 86.487,
            'description': 'A sales leader on The Roof? Man, they really must be letting anyone up here this season.\n',
            'timestamp': 1619790290,
            'upload_date': '20210430',
        },
        'params': {'noplaylist': True, 'skip_download': True},
    }]
    _WEBPAGE_TESTS = [{
        'url': 'https://www.profitwell.com/recur/boxed-out',
        'info_dict': {
            'id': '6jyvmqz6zs',
            'title': 'Boxed Out',
            'description': 'md5:14a8a93a1dbe236718e6a59f8c8c7bae',
        },
        'playlist_mincount': 30,
    }, {
        # section instead of div
        'url': 'https://360learning.com/studio/onboarding-joei/',
        'info_dict': {
            'id': 'z874k93n2o',
            'title': 'Onboarding Joei.',
            'description': 'Coming to you weekly starting Feb 19th.',
        },
        'playlist_mincount': 20,
    }, {
        'url': 'https://amplitude.com/amplify-sessions?amp%5Bwmediaid%5D=pz0m0l0if3&%5Bwvideo%5D=pz0m0l0if3&wchannelid=emyjmwjf79&wmediaid=i8um783bdt',
        'info_dict': {
            'id': 'pz0m0l0if3',
            'title': 'A Framework for Improving Product Team Performance',
            'ext': 'mp4',
            'timestamp': 1653935275,
            'upload_date': '20220530',
            'description': 'Learn how to help your company improve and achieve your product related goals.',
            'duration': 1854.39,
            'thumbnail': 'https://embed-ssl.wistia.com/deliveries/12fd19e56413d9d6f04e2185c16a6f8854e25226.png',
        },
        'params': {'noplaylist': True, 'skip_download': True},
    }]

    def _real_extract(self, url):
        """Return the channel as a playlist, or a single-media result when `_yes_playlist` declines."""
        channel_id = self._match_id(url)
        media_id = self._extract_url_media_id(url)
        wants_channel = self._yes_playlist(channel_id, media_id, playlist_label='channel')
        if not wants_channel:
            return self.url_result(f'wistia:{media_id}', 'Wistia')

        try:
            data = self._download_embed_config('channel', channel_id, url)
        except (ExtractorError, urllib.error.HTTPError):
            # Some channels give a 403 from the JSON API
            self.report_warning('Failed to download channel data from API, falling back to webpage.')
            webpage = self._download_webpage(f'https://fast.wistia.net/embed/channel/{channel_id}', channel_id)

            # NOTE(review): `channel_id` is passed as the fourth positional argument of
            # `_search_regex`; confirm which parameter that binds to.
            encoded_payload = self._search_regex(
                r'wchanneljsonp-%s\'\]\s*=[^\"]*\"([A-Za-z0-9=/]*)' % channel_id, webpage, 'jsonp', channel_id)

            def decode_payload(payload):
                # The payload is base64 text that is URL-quoted once decoded.
                return urllib.parse.unquote_plus(b64decode(payload).decode('utf-8'))

            data = self._parse_json(encoded_payload, channel_id, transform_source=decode_payload)

        # XXX: can there be more than one series?
        series = traverse_obj(data, ('series', 0), default={})

        entries = []
        for video in traverse_obj(series, ('sections', ..., 'videos', ...)) or []:
            hashed_id = video.get('hashedId')
            if not hashed_id:
                continue
            entries.append(self.url_result(f'wistia:{hashed_id}', WistiaIE, title=video.get('name')))

        return self.playlist_result(
            entries, channel_id, playlist_title=series.get('title'), playlist_description=series.get('description'))

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Yield parent-detected embed URLs, then a ``wistiachannel:`` URL per channel container."""
        yield from super()._extract_embed_urls(url, webpage)
        for match in cls._extract_wistia_async_embed(webpage):
            if match.group('type') != 'wistia_channel':
                continue
            # original url may contain wmediaid query param
            yield update_url_query(f'wistiachannel:{match.group("id")}', parse_qs(url))