]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/khanacademy.py
[ie/orf:on] Improve extraction (#9677)
[yt-dlp.git] / yt_dlp / extractor / khanacademy.py
CommitLineData
30a074c2 1import json
3d3538e4
PH
2
3from .common import InfoExtractor
4from ..utils import (
30a074c2 5 int_or_none,
6 parse_iso8601,
7 try_get,
3d3538e4
PH
8)
9
10
class KhanAcademyBaseIE(InfoExtractor):
    """Common base for the Khan Academy extractors.

    Subclasses derive ``_VALID_URL`` from ``_VALID_URL_TEMPL`` and implement
    ``_parse_component_props`` to turn the fetched component props into the
    final result.
    """

    # First %s: number of leading path segments; second %s: literal prefix
    # that must precede the last path segment (e.g. 'v/').
    _VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P<id>(?:[^/]+/){%s}%s[^?#/&]+)'

    def _parse_video(self, video):
        """Map a video metadata dict to a ``url_transparent`` result.

        The result hands playback over to the ``Youtube`` extractor through
        ``video['youtubeId']`` (required; a missing key raises ``KeyError``).
        All other fields are optional.
        """
        youtube_id = video['youtubeId']
        return {
            '_type': 'url_transparent',
            'url': youtube_id,
            'id': video.get('slug'),
            'title': video.get('title'),
            # Fall back to 'thumbnailUrl' when 'imageUrl' is missing or empty.
            'thumbnail': video.get('imageUrl') or video.get('thumbnailUrl'),
            'duration': int_or_none(video.get('duration')),
            'description': video.get('description'),
            'ie_key': 'Youtube',
        }

    def _real_extract(self, url):
        """Fetch the content data for *url* and delegate to the subclass parser."""
        display_id = self._match_id(url)

        graphql_variables = json.dumps({
            'path': display_id,
            'queryParams': 'lang=en',
            'isModal': False,
            'followRedirects': True,
            'countryCode': 'US',
        })
        response = self._download_json(
            'https://www.khanacademy.org/api/internal/graphql/FetchContentData',
            display_id, query={
                'fastly_cacheable': 'persist_until_publish',
                'hash': '4134764944',
                'lang': 'en',
                'variables': graphql_variables,
            })

        # 'contentJson' is itself a JSON-encoded string and needs a second parse.
        content_json = response['data']['contentJson']
        page_data = self._parse_json(content_json, display_id)
        return self._parse_component_props(page_data['componentProps'])
30a074c2 43
44
class KhanAcademyIE(KhanAcademyBaseIE):
    """Extractor for a single Khan Academy video page (``.../v/<slug>``)."""

    IE_NAME = 'khanacademy'
    _VALID_URL = KhanAcademyBaseIE._VALID_URL_TEMPL % ('4', 'v/')
    _TEST = {
        'url': 'https://www.khanacademy.org/computing/computer-science/cryptography/crypt/v/one-time-pad',
        'md5': '9c84b7b06f9ebb80d22a5c8dedefb9a0',
        'info_dict': {
            'id': 'FlIG3TvQCBQ',
            'ext': 'mp4',
            'title': 'The one-time pad',
            'description': 'The perfect cipher',
            'duration': 176,
            'uploader': 'Brit Cruise',
            'uploader_id': 'khanacademy',
            'upload_date': '20120411',
            'timestamp': 1334170113,
            'license': 'cc-by-nc-sa',
        },
        'add_ie': ['Youtube'],
    }

    def _parse_component_props(self, component_props):
        """Build the video result from the tutorial page's content model.

        Extends the base video info with uploader (author names joined by
        ``', '``), timestamp (parsed from ``dateAdded``) and license.
        """
        content_model = component_props['tutorialPageData']['contentModel']
        result = self._parse_video(content_model)

        authors = content_model.get('authorNames')
        if authors:
            uploader = ', '.join(authors)
        else:
            uploader = None
        timestamp = parse_iso8601(content_model.get('dateAdded'))
        video_license = content_model.get('kaUserLicense')

        result['uploader'] = uploader
        result['timestamp'] = timestamp
        result['license'] = video_license
        return result
76
77
class KhanAcademyUnitIE(KhanAcademyBaseIE):
    """Extractor for a Khan Academy unit page, returned as a playlist."""

    IE_NAME = 'khanacademy:unit'
    _VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('2', '')) + '/?(?:[?#&]|$)'
    _TEST = {
        'url': 'https://www.khanacademy.org/computing/computer-science/cryptography',
        'info_dict': {
            'id': 'cryptography',
            'title': 'Cryptography',
            'description': 'How have humans protected their secret messages through history? What has changed today?',
        },
        'playlist_mincount': 31,
    }

    def _parse_component_props(self, component_props):
        """Collect every video in the unit's tutorials into a playlist.

        Each tutorial becomes a chapter; its 1-based position is the
        chapter number. Only content items whose ``kind`` is ``'Video'``
        are included.
        """
        curation = component_props['curation']
        tutorials = try_get(
            curation, lambda x: x['tabs'][0]['modules'][0]['tutorials'], list) or []

        entries = []
        for chapter_number, tutorial in enumerate(tutorials, start=1):
            chapter_fields = {
                'chapter': tutorial.get('title'),
                'chapter_number': chapter_number,
                'chapter_id': tutorial.get('id'),
            }
            items = tutorial.get('contentItems') or []
            for item in items:
                if item.get('kind') != 'Video':
                    continue
                # Chapter fields are layered on top of the per-video info.
                entries.append({**self._parse_video(item), **chapter_fields})

        playlist_id = curation.get('unit')
        playlist_title = curation.get('title')
        playlist_description = curation.get('description')
        return self.playlist_result(
            entries, playlist_id, playlist_title, playlist_description)