]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/khanacademy.py
[ie/orf:on] Improve extraction (#9677)
[yt-dlp.git] / yt_dlp / extractor / khanacademy.py
1 import json
2
3 from .common import InfoExtractor
4 from ..utils import (
5 int_or_none,
6 parse_iso8601,
7 try_get,
8 )
9
10
class KhanAcademyBaseIE(InfoExtractor):
    """Common plumbing for the Khan Academy extractors.

    Concrete subclasses build ``_VALID_URL`` from ``_VALID_URL_TEMPL`` and
    implement ``_parse_component_props``, which ``_real_extract`` calls with
    the ``componentProps`` object of the downloaded page data.
    """

    # Two placeholders: the number of leading ``segment/`` path components,
    # and a literal prefix that must precede the final path component.
    _VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P<id>(?:[^/]+/){%s}%s[^?#/&]+)'

    def _parse_video(self, video):
        """Map a video object to a ``url_transparent`` result for the Youtube extractor.

        ``youtubeId`` is mandatory (a missing key raises ``KeyError``); every
        other field is optional and read with ``.get``.
        """
        youtube_id = video['youtubeId']
        slug = video.get('slug')
        title = video.get('title')
        thumbnail = video.get('imageUrl') or video.get('thumbnailUrl')
        duration = int_or_none(video.get('duration'))
        description = video.get('description')
        return {
            '_type': 'url_transparent',
            'url': youtube_id,
            'id': slug,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'description': description,
            'ie_key': 'Youtube',
        }

    def _real_extract(self, url):
        """Fetch the content data for *url* and hand its component props to the subclass."""
        display_id = self._match_id(url)

        # The matched URL path doubles as the ``path`` GraphQL variable.
        graphql_variables = json.dumps({
            'path': display_id,
            'queryParams': 'lang=en',
            'isModal': False,
            'followRedirects': True,
            'countryCode': 'US',
        })
        request_query = {
            'fastly_cacheable': 'persist_until_publish',
            'hash': '4134764944',
            'lang': 'en',
            'variables': graphql_variables,
        }
        response = self._download_json(
            'https://www.khanacademy.org/api/internal/graphql/FetchContentData',
            display_id, query=request_query)

        # ``contentJson`` is itself a JSON-encoded string and needs a second parse.
        content_json = response['data']['contentJson']
        return self._parse_component_props(
            self._parse_json(content_json, display_id)['componentProps'])
43
44
class KhanAcademyIE(KhanAcademyBaseIE):
    """Extractor for a single Khan Academy video page (four path segments, then ``v/<slug>``)."""

    IE_NAME = 'khanacademy'
    _VALID_URL = KhanAcademyBaseIE._VALID_URL_TEMPL % ('4', 'v/')
    _TEST = {
        'url': 'https://www.khanacademy.org/computing/computer-science/cryptography/crypt/v/one-time-pad',
        'md5': '9c84b7b06f9ebb80d22a5c8dedefb9a0',
        'info_dict': {
            'id': 'FlIG3TvQCBQ',
            'ext': 'mp4',
            'title': 'The one-time pad',
            'description': 'The perfect cipher',
            'duration': 176,
            'uploader': 'Brit Cruise',
            'uploader_id': 'khanacademy',
            'upload_date': '20120411',
            'timestamp': 1334170113,
            'license': 'cc-by-nc-sa',
        },
        'add_ie': ['Youtube'],
    }

    def _parse_component_props(self, component_props):
        """Build the video result from ``tutorialPageData.contentModel``.

        Adds uploader, timestamp and license on top of the fields produced
        by ``_parse_video``.
        """
        content_model = component_props['tutorialPageData']['contentModel']
        result = self._parse_video(content_model)

        authors = content_model.get('authorNames')
        if authors:
            uploader = ', '.join(authors)
        else:
            # Missing or empty author list: leave the uploader unset.
            uploader = None
        timestamp = parse_iso8601(content_model.get('dateAdded'))
        video_license = content_model.get('kaUserLicense')

        result['uploader'] = uploader
        result['timestamp'] = timestamp
        result['license'] = video_license
        return result
76
77
class KhanAcademyUnitIE(KhanAcademyBaseIE):
    """Extractor for a Khan Academy unit page (three path segments), returned as a playlist."""

    IE_NAME = 'khanacademy:unit'
    _VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('2', '')) + '/?(?:[?#&]|$)'
    _TEST = {
        'url': 'https://www.khanacademy.org/computing/computer-science/cryptography',
        'info_dict': {
            'id': 'cryptography',
            'title': 'Cryptography',
            'description': 'How have humans protected their secret messages through history? What has changed today?',
        },
        'playlist_mincount': 31,
    }

    def _parse_component_props(self, component_props):
        """Collect the videos of the first tab's first module into a playlist.

        Each tutorial becomes a chapter; only content items whose ``kind``
        is ``'Video'`` are turned into entries.
        """
        curation = component_props['curation']
        tutorials = try_get(
            curation, lambda c: c['tabs'][0]['modules'][0]['tutorials'], list) or []

        def iter_entries():
            for chapter_number, tutorial in enumerate(tutorials, start=1):
                chapter_fields = {
                    'chapter': tutorial.get('title'),
                    'chapter_number': chapter_number,
                    'chapter_id': tutorial.get('id'),
                }
                for item in tutorial.get('contentItems') or []:
                    if item.get('kind') != 'Video':
                        continue
                    # Chapter fields are merged last, so they win on any key clash.
                    yield {**self._parse_video(item), **chapter_fields}

        entries = list(iter_entries())
        playlist_id = curation.get('unit')
        playlist_title = curation.get('title')
        playlist_description = curation.get('description')
        return self.playlist_result(
            entries, playlist_id, playlist_title, playlist_description)