]>
Commit | Line | Data |
---|---|---|
f859ed3b HR |
1 | import json |
2 | import textwrap | |
3 | import urllib.parse | |
4 | import uuid | |
5 | ||
6 | from .common import InfoExtractor | |
7 | from ..utils import ( | |
8 | ExtractorError, | |
9 | determine_ext, | |
10 | filter_dict, | |
11 | get_first, | |
12 | int_or_none, | |
13 | parse_iso8601, | |
14 | update_url, | |
15 | url_or_none, | |
16 | variadic, | |
17 | ) | |
18 | from ..utils.traversal import traverse_obj | |
19 | ||
20 | ||
21 | class LoomIE(InfoExtractor): | |
22 | IE_NAME = 'loom' | |
23 | _VALID_URL = r'https?://(?:www\.)?loom\.com/(?:share|embed)/(?P<id>[\da-f]{32})' | |
24 | _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=["\'](?P<url>{_VALID_URL})'] | |
25 | _TESTS = [{ | |
26 | # m3u8 raw-url, mp4 transcoded-url, cdn url == raw-url, json subs only | |
27 | 'url': 'https://www.loom.com/share/43d05f362f734614a2e81b4694a3a523', | |
28 | 'md5': 'bfc2d7e9c2e0eb4813212230794b6f42', | |
29 | 'info_dict': { | |
30 | 'id': '43d05f362f734614a2e81b4694a3a523', | |
31 | 'ext': 'mp4', | |
32 | 'title': 'A Ruler for Windows - 28 March 2022', | |
33 | 'uploader': 'wILLIAM PIP', | |
34 | 'upload_date': '20220328', | |
35 | 'timestamp': 1648454238, | |
36 | 'duration': 27, | |
37 | }, | |
38 | }, { | |
39 | # webm raw-url, mp4 transcoded-url, cdn url == transcoded-url, no subs | |
40 | 'url': 'https://www.loom.com/share/c43a642f815f4378b6f80a889bb73d8d', | |
41 | 'md5': '70f529317be8cf880fcc2c649a531900', | |
42 | 'info_dict': { | |
43 | 'id': 'c43a642f815f4378b6f80a889bb73d8d', | |
44 | 'ext': 'webm', | |
45 | 'title': 'Lilah Nielsen Intro Video', | |
46 | 'uploader': 'Lilah Nielsen', | |
47 | 'upload_date': '20200826', | |
48 | 'timestamp': 1598480716, | |
49 | 'duration': 20, | |
50 | }, | |
51 | }, { | |
52 | # m3u8 raw-url, mp4 transcoded-url, cdn url == raw-url, vtt sub and json subs | |
53 | 'url': 'https://www.loom.com/share/9458bcbf79784162aa62ffb8dd66201b', | |
54 | 'md5': '51737ec002969dd28344db4d60b9cbbb', | |
55 | 'info_dict': { | |
56 | 'id': '9458bcbf79784162aa62ffb8dd66201b', | |
57 | 'ext': 'mp4', | |
58 | 'title': 'Sharing screen with gpt-4', | |
59 | 'description': 'Sharing screen with GPT 4 vision model and asking questions to guide through blender.', | |
60 | 'uploader': 'Suneel Matham', | |
61 | 'chapters': 'count:3', | |
62 | 'upload_date': '20231109', | |
63 | 'timestamp': 1699518978, | |
64 | 'duration': 93, | |
65 | }, | |
66 | }, { | |
67 | # mpd raw-url, mp4 transcoded-url, cdn url == raw-url, no subs | |
68 | 'url': 'https://www.loom.com/share/24351eb8b317420289b158e4b7e96ff2', | |
69 | 'info_dict': { | |
70 | 'id': '24351eb8b317420289b158e4b7e96ff2', | |
71 | 'ext': 'webm', | |
72 | 'title': 'OMFG clown', | |
73 | 'description': 'md5:285c5ee9d62aa087b7e3271b08796815', | |
74 | 'uploader': 'MrPumkin B', | |
75 | 'upload_date': '20210924', | |
76 | 'timestamp': 1632519618, | |
77 | 'duration': 210, | |
78 | }, | |
79 | 'params': {'skip_download': 'dash'}, | |
80 | }, { | |
81 | # password-protected | |
82 | 'url': 'https://www.loom.com/share/50e26e8aeb7940189dff5630f95ce1f4', | |
83 | 'md5': '5cc7655e7d55d281d203f8ffd14771f7', | |
84 | 'info_dict': { | |
85 | 'id': '50e26e8aeb7940189dff5630f95ce1f4', | |
86 | 'ext': 'mp4', | |
87 | 'title': 'iOS Mobile Upload', | |
88 | 'uploader': 'Simon Curran', | |
89 | 'upload_date': '20200520', | |
90 | 'timestamp': 1590000123, | |
91 | 'duration': 35, | |
92 | }, | |
93 | 'params': {'videopassword': 'seniorinfants2'}, | |
94 | }, { | |
95 | # embed, transcoded-url endpoint sends empty JSON response | |
96 | 'url': 'https://www.loom.com/embed/ddcf1c1ad21f451ea7468b1e33917e4e', | |
97 | 'md5': '8488817242a0db1cb2ad0ea522553cf6', | |
98 | 'info_dict': { | |
99 | 'id': 'ddcf1c1ad21f451ea7468b1e33917e4e', | |
100 | 'ext': 'mp4', | |
101 | 'title': 'CF Reset User\'s Password', | |
102 | 'uploader': 'Aimee Heintz', | |
103 | 'upload_date': '20220707', | |
104 | 'timestamp': 1657216459, | |
105 | 'duration': 181, | |
106 | }, | |
107 | 'expected_warnings': ['Failed to parse JSON'], | |
108 | }] | |
109 | _WEBPAGE_TESTS = [{ | |
110 | 'url': 'https://www.loom.com/community/e1229802a8694a09909e8ba0fbb6d073-pg', | |
111 | 'md5': 'ec838cd01b576cf0386f32e1ae424609', | |
112 | 'info_dict': { | |
113 | 'id': 'e1229802a8694a09909e8ba0fbb6d073', | |
114 | 'ext': 'mp4', | |
115 | 'title': 'Rexie Jane Cimafranca - Founder\'s Presentation', | |
116 | 'uploader': 'Rexie Cimafranca', | |
117 | 'upload_date': '20230213', | |
118 | 'duration': 247, | |
119 | 'timestamp': 1676274030, | |
120 | }, | |
121 | }] | |
122 | ||
123 | _GRAPHQL_VARIABLES = { | |
124 | 'GetVideoSource': { | |
125 | 'acceptableMimes': ['DASH', 'M3U8', 'MP4'], | |
126 | }, | |
127 | } | |
128 | _GRAPHQL_QUERIES = { | |
129 | 'GetVideoSSR': textwrap.dedent('''\ | |
130 | query GetVideoSSR($videoId: ID!, $password: String) { | |
131 | getVideo(id: $videoId, password: $password) { | |
132 | __typename | |
133 | ... on PrivateVideo { | |
134 | id | |
135 | status | |
136 | message | |
137 | __typename | |
138 | } | |
139 | ... on VideoPasswordMissingOrIncorrect { | |
140 | id | |
141 | message | |
142 | __typename | |
143 | } | |
144 | ... on RegularUserVideo { | |
145 | id | |
146 | __typename | |
147 | createdAt | |
148 | description | |
149 | download_enabled | |
150 | folder_id | |
151 | is_protected | |
152 | needs_password | |
153 | owner { | |
154 | display_name | |
155 | __typename | |
156 | } | |
157 | privacy | |
158 | s3_id | |
159 | name | |
160 | video_properties { | |
161 | avgBitRate | |
162 | client | |
163 | camera_enabled | |
164 | client_version | |
165 | duration | |
166 | durationMs | |
167 | format | |
168 | height | |
169 | microphone_enabled | |
170 | os | |
171 | os_version | |
172 | recordingClient | |
173 | recording_type | |
174 | recording_version | |
175 | screen_type | |
176 | tab_audio | |
177 | trim_duration | |
178 | width | |
179 | __typename | |
180 | } | |
181 | playable_duration | |
182 | source_duration | |
183 | visibility | |
184 | } | |
185 | } | |
186 | }\n'''), | |
187 | 'GetVideoSource': textwrap.dedent('''\ | |
188 | query GetVideoSource($videoId: ID!, $password: String, $acceptableMimes: [CloudfrontVideoAcceptableMime]) { | |
189 | getVideo(id: $videoId, password: $password) { | |
190 | ... on RegularUserVideo { | |
191 | id | |
192 | nullableRawCdnUrl(acceptableMimes: $acceptableMimes, password: $password) { | |
193 | url | |
194 | __typename | |
195 | } | |
196 | __typename | |
197 | } | |
198 | __typename | |
199 | } | |
200 | }\n'''), | |
201 | 'FetchVideoTranscript': textwrap.dedent('''\ | |
202 | query FetchVideoTranscript($videoId: ID!, $password: String) { | |
203 | fetchVideoTranscript(videoId: $videoId, password: $password) { | |
204 | ... on VideoTranscriptDetails { | |
205 | id | |
206 | video_id | |
207 | source_url | |
208 | captions_source_url | |
209 | __typename | |
210 | } | |
211 | ... on GenericError { | |
212 | message | |
213 | __typename | |
214 | } | |
215 | __typename | |
216 | } | |
217 | }\n'''), | |
218 | 'FetchChapters': textwrap.dedent('''\ | |
219 | query FetchChapters($videoId: ID!, $password: String) { | |
220 | fetchVideoChapters(videoId: $videoId, password: $password) { | |
221 | ... on VideoChapters { | |
222 | video_id | |
223 | content | |
224 | __typename | |
225 | } | |
226 | ... on EmptyChaptersPayload { | |
227 | content | |
228 | __typename | |
229 | } | |
230 | ... on InvalidRequestWarning { | |
231 | message | |
232 | __typename | |
233 | } | |
234 | ... on Error { | |
235 | message | |
236 | __typename | |
237 | } | |
238 | __typename | |
239 | } | |
240 | }\n'''), | |
241 | } | |
242 | _APOLLO_GRAPHQL_VERSION = '0a1856c' | |
243 | ||
def _call_graphql_api(self, operations, video_id, note=None, errnote=None):
    """Send one or more named GraphQL operations to Loom in a single request.

    ``operations`` is passed through ``variadic`` and each resulting name must
    be a key of ``_GRAPHQL_QUERIES``. Every operation is given the video ID and
    the ``videopassword`` param as variables, plus any extra variables
    registered under the same name in ``_GRAPHQL_VARIABLES``.

    Returns whatever ``_download_json`` returns for the batched payload.
    ``note`` / ``errnote`` override the default progress and error messages.
    """
    video_password = self.get_param('videopassword')
    client_version = self._APOLLO_GRAPHQL_VERSION

    request_headers = {
        'Accept': 'application/json',
        'Content-Type': 'application/json',
        'x-loom-request-source': f'loom_web_{client_version}',
        'apollographql-client-name': 'web',
        'apollographql-client-version': client_version,
    }

    # One JSON object per operation; the whole list is sent as a single body
    batch = []
    for op_name in variadic(operations):
        variables = {
            'videoId': video_id,
            'password': video_password,
        }
        # Operation-specific variables are applied last, as in a dict spread
        variables.update(self._GRAPHQL_VARIABLES.get(op_name, {}))
        batch.append({
            'operationName': op_name,
            'variables': variables,
            'query': self._GRAPHQL_QUERIES[op_name],
        })

    return self._download_json(
        'https://www.loom.com/graphql', video_id,
        note or 'Downloading GraphQL JSON',
        errnote or 'Failed to download GraphQL JSON',
        headers=request_headers,
        data=json.dumps(batch, separators=(',', ':')).encode())
263 | ||
def _call_url_api(self, endpoint, video_id):
    """Request a media URL from the per-video sessions ``endpoint``.

    The download is non-fatal (``fatal=False``). The return value is the
    response's ``url`` field filtered through ``url_or_none`` (via
    ``traverse_obj``).
    """
    api_url = f'https://www.loom.com/api/campaigns/sessions/{video_id}/{endpoint}'
    payload = {
        'anonID': str(uuid.uuid4()),  # fresh random ID for every request
        'deviceID': None,
        'force_original': False,  # HTTP error 401 if True
        'password': self.get_param('videopassword'),
    }
    body = json.dumps(payload, separators=(',', ':')).encode()

    response = self._download_json(
        api_url, video_id,
        f'Downloading {endpoint} JSON', f'Failed to download {endpoint} JSON',
        fatal=False, data=body,
        headers={'Accept': 'application/json', 'Content-Type': 'application/json'})

    return traverse_obj(response, ('url', {url_or_none}))
276 | ||
def _extract_formats(self, video_id, metadata, gql_data):
    """Build the format list from the raw-url, transcoded-url and CDN sources.

    ``metadata`` supplies ``video_properties`` (width, height,
    microphone_enabled), which are merged into direct HTTP formats only.
    ``gql_data`` is searched for ``nullableRawCdnUrl``; that URL is used only
    when, ignoring its query string, it differs from both endpoint URLs.
    Returns a list of format dicts.
    """
    http_props = traverse_obj(metadata, ('video_properties', {
        'width': ('width', {int_or_none}),
        'height': ('height', {int_or_none}),
        'acodec': ('microphone_enabled', {lambda x: 'none' if x is False else None}),
    }))

    def iter_formats(format_url, format_id, quality):
        """Yield format dicts for one source URL, dispatching on its extension."""
        if not format_url:
            return

        ext = determine_ext(format_url)
        # Query string of the source URL; re-applied to HLS format URLs and
        # handed on as 'extra_param_to_segment_url' for HLS and DASH
        query = urllib.parse.urlparse(format_url).query

        if ext == 'm3u8':
            # Extract pre-merged HLS formats to avoid buggy parsing of metadata in split playlists
            merged_url = format_url.replace('-split.m3u8', '.m3u8')
            hls_formats = self._extract_m3u8_formats(
                merged_url, video_id, 'mp4', m3u8_id=f'hls-{format_id}', fatal=False, quality=quality)
            for hls_fmt in hls_formats:
                yield {
                    **hls_fmt,
                    'url': update_url(hls_fmt['url'], query=query),
                    'extra_param_to_segment_url': query,
                }
            return

        if ext == 'mpd':
            dash_formats = self._extract_mpd_formats(
                format_url, video_id, mpd_id=f'dash-{format_id}', fatal=False)
            for dash_fmt in dash_formats:
                yield {
                    **dash_fmt,
                    'extra_param_to_segment_url': query,
                    'quality': quality,
                }
            return

        # Anything else is treated as a direct HTTP download
        yield {
            'url': format_url,
            'ext': ext,
            'format_id': f'http-{format_id}',
            'quality': quality,
            **http_props,
        }

    collected = []

    raw_url = self._call_url_api('raw-url', video_id)
    collected.extend(iter_formats(raw_url, 'raw', quality=1))  # original quality

    transcoded_url = self._call_url_api('transcoded-url', video_id)
    collected.extend(iter_formats(transcoded_url, 'transcoded', quality=-1))  # transcoded quality

    cdn_url = get_first(gql_data, ('data', 'getVideo', 'nullableRawCdnUrl', 'url', {url_or_none}))
    # cdn_url is usually a dupe, but the raw-url/transcoded-url endpoints could return errors
    known_urls = [
        update_url(known, query=None)
        for known in (raw_url, transcoded_url) if known
    ]
    if cdn_url and update_url(cdn_url, query=None) not in known_urls:
        collected.extend(iter_formats(cdn_url, 'cdn', quality=0))  # could be original or transcoded

    return collected
335 | ||
def _real_extract(self, url):
    """Extract a single Loom video.

    Fetches the ``GetVideoSSR`` metadata first and raises an expected
    ``ExtractorError`` when its ``__typename`` is
    ``VideoPasswordMissingOrIncorrect``. It then fetches chapters, transcript
    and source data in one batched GraphQL call and assembles the info dict.
    """
    video_id = self._match_id(url)

    ssr_response = self._call_graphql_api(
        'GetVideoSSR', video_id, 'Downloading GraphQL metadata JSON')
    metadata = get_first(ssr_response, ('data', 'getVideo', {dict})) or {}

    if metadata.get('__typename') == 'VideoPasswordMissingOrIncorrect':
        # Same typename for both cases; tell them apart by whether a password was supplied
        if self.get_param('videopassword'):
            message = 'Invalid video password'
        else:
            message = 'This video is password-protected, use the --video-password option'
        raise ExtractorError(message, expected=True)

    gql_data = self._call_graphql_api(
        ['FetchChapters', 'FetchVideoTranscript', 'GetVideoSource'], video_id)
    duration = traverse_obj(metadata, ('video_properties', 'duration', {int_or_none}))

    chapter_text = get_first(gql_data, ('data', 'fetchVideoChapters', 'content', {str}))
    chapters = self._extract_chapters_from_description(chapter_text, duration) or None

    formats = self._extract_formats(video_id, metadata, gql_data)

    # Both transcript URL fields are collected as subtitle entries under 'en'
    subtitle_tracks = traverse_obj(gql_data, (
        ..., 'data', 'fetchVideoTranscript',
        ('source_url', 'captions_source_url'), {
            'url': {url_or_none},
        }))

    info = {
        'id': video_id,
        'duration': duration,
        'chapters': chapters,
        'formats': formats,
        'subtitles': filter_dict({'en': subtitle_tracks or None}),
    }
    info.update(traverse_obj(metadata, {
        'title': ('name', {str}),
        'description': ('description', {str}),
        'uploader': ('owner', 'display_name', {str}),
        'timestamp': ('createdAt', {parse_iso8601}),
    }))
    return info
371 | ||
372 | ||
373 | class LoomFolderIE(InfoExtractor): | |
374 | IE_NAME = 'loom:folder' | |
375 | _VALID_URL = r'https?://(?:www\.)?loom\.com/share/folder/(?P<id>[\da-f]{32})' | |
376 | _TESTS = [{ | |
377 | # 2 subfolders, no videos in root | |
378 | 'url': 'https://www.loom.com/share/folder/997db4db046f43e5912f10dc5f817b5c', | |
379 | 'playlist_mincount': 16, | |
380 | 'info_dict': { | |
381 | 'id': '997db4db046f43e5912f10dc5f817b5c', | |
382 | 'title': 'Blending Lessons', | |
383 | }, | |
384 | }, { | |
385 | # only videos, no subfolders | |
386 | 'url': 'https://www.loom.com/share/folder/9a8a87f6b6f546d9a400c8e7575ff7f2', | |
387 | 'playlist_mincount': 12, | |
388 | 'info_dict': { | |
389 | 'id': '9a8a87f6b6f546d9a400c8e7575ff7f2', | |
390 | 'title': 'List A- a, i, o', | |
391 | }, | |
392 | }, { | |
393 | # videos in root and empty subfolder | |
394 | 'url': 'https://www.loom.com/share/folder/886e534218c24fd292e97e9563078cc4', | |
395 | 'playlist_mincount': 21, | |
396 | 'info_dict': { | |
397 | 'id': '886e534218c24fd292e97e9563078cc4', | |
398 | 'title': 'Medicare Agent Training videos', | |
399 | }, | |
400 | }, { | |
401 | # videos in root and videos in subfolders | |
402 | 'url': 'https://www.loom.com/share/folder/b72c4ecdf04745da9403926d80a40c38', | |
403 | 'playlist_mincount': 21, | |
404 | 'info_dict': { | |
405 | 'id': 'b72c4ecdf04745da9403926d80a40c38', | |
406 | 'title': 'Quick Altos Q & A Tutorials', | |
407 | }, | |
408 | }, { | |
409 | # recursive folder extraction | |
410 | 'url': 'https://www.loom.com/share/folder/8b458a94e0e4449b8df9ea7a68fafc4e', | |
411 | 'playlist_count': 23, | |
412 | 'info_dict': { | |
413 | 'id': '8b458a94e0e4449b8df9ea7a68fafc4e', | |
414 | 'title': 'Sezer Texting Guide', | |
415 | }, | |
416 | }, { | |
417 | # more than 50 videos in 1 folder | |
418 | 'url': 'https://www.loom.com/share/folder/e056a91d290d47ca9b00c9d1df56c463', | |
419 | 'playlist_mincount': 61, | |
420 | 'info_dict': { | |
421 | 'id': 'e056a91d290d47ca9b00c9d1df56c463', | |
422 | 'title': 'User Videos', | |
423 | }, | |
424 | }, { | |
425 | # many subfolders | |
426 | 'url': 'https://www.loom.com/share/folder/c2dde8cc67454f0e99031677279d8954', | |
427 | 'playlist_mincount': 75, | |
428 | 'info_dict': { | |
429 | 'id': 'c2dde8cc67454f0e99031677279d8954', | |
430 | 'title': 'Honors 1', | |
431 | }, | |
432 | }, { | |
433 | 'url': 'https://www.loom.com/share/folder/bae17109a68146c7803454f2893c8cf8/Edpuzzle', | |
434 | 'only_matching': True, | |
435 | }] | |
436 | ||
def _extract_folder_data(self, folder_id):
    """Download and return the JSON description of the folder ``folder_id``."""
    folder_api_url = f'https://www.loom.com/v1/folders/{folder_id}'
    return self._download_json(
        folder_api_url, folder_id, 'Downloading folder info JSON',
        query={'limit': '10000'})
441 | ||
def _extract_folder_entries(self, folder_id, initial_folder_data=None):
    """Lazily yield url results for every video in a folder and its subfolders.

    ``initial_folder_data`` lets the caller reuse already-downloaded folder
    JSON; when it is falsy the data is fetched with ``_extract_folder_data``.
    Subfolder data is always fetched by the recursive call.
    """
    folder_data = initial_folder_data
    if not folder_data:
        folder_data = self._extract_folder_data(folder_id)

    videos = traverse_obj(folder_data, ('videos', lambda _, item: item['id']))
    for video in videos:
        vid = video['id']
        yield self.url_result(
            f'https://www.loom.com/share/{vid}', LoomIE, vid, video.get('name'))

    # Recurse into subfolders, skipping any entry that has this folder's own ID
    subfolder_ids = traverse_obj(folder_data, (
        'folders', lambda _, item: item['id'] != folder_id, 'id', {str}))
    for subfolder_id in subfolder_ids:
        for entry in self._extract_folder_entries(subfolder_id):
            yield entry
454 | ||
def _real_extract(self, url):
    """Return a playlist result for the Loom folder matched by ``url``.

    The root folder JSON is downloaded once and reused for both the playlist
    title and the first level of entries.
    """
    folder_id = self._match_id(url)
    root_data = self._extract_folder_data(folder_id)

    entries = self._extract_folder_entries(folder_id, root_data)
    title = traverse_obj(root_data, ('folder', 'name', {str.strip}))
    return self.playlist_result(entries, folder_id, title)