]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/linuxacademy.py
[ie/box] Fix formats extraction (#8649)
[yt-dlp.git] / yt_dlp / extractor / linuxacademy.py
1 import json
2 import random
3
4 from .common import InfoExtractor
5 from ..compat import compat_b64decode, compat_str
6 from ..networking.exceptions import HTTPError
7 from ..utils import (
8 clean_html,
9 ExtractorError,
10 js_to_json,
11 parse_duration,
12 try_get,
13 unified_timestamp,
14 urlencode_postdata,
15 urljoin,
16 )
17
18
class LinuxAcademyIE(InfoExtractor):
    """Extractor for linuxacademy.com lesson pages and course (module) pages.

    A ``courses/lesson/course/<id>/lesson/<id>`` URL produces a single entry
    whose formats come from an HLS manifest; a ``modules/view/id/<id>`` URL
    produces a playlist of ``url_transparent`` entries pointing at the
    individual lesson URLs.
    """

    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?linuxacademy\.com/cp/
                        (?:
                            courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)|
                            modules/view/id/(?P<course_id>\d+)
                        )
                    '''
    _TESTS = [{
        'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675',
        'info_dict': {
            'id': '7971-2',
            'ext': 'mp4',
            'title': 'What Is Data Science',
            'description': 'md5:c574a3c20607144fb36cb65bdde76c99',
            'timestamp': int,  # The timestamp and upload date changes
            'upload_date': r're:\d+',
            'duration': 304,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Requires Linux Academy account credentials',
    }, {
        'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2',
        'only_matching': True,
    }, {
        'url': 'https://linuxacademy.com/cp/modules/view/id/154',
        'info_dict': {
            'id': '154',
            'title': 'AWS Certified Cloud Practitioner',
            'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c',
            'duration': 28835,
        },
        'playlist_count': 41,
        'skip': 'Requires Linux Academy account credentials',
    }, {
        'url': 'https://linuxacademy.com/cp/modules/view/id/39',
        'info_dict': {
            'id': '39',
            'title': 'Red Hat Certified Systems Administrator - RHCSA (EX200) Exam Prep (legacy)',
            'description': 'md5:0f1d3369e90c3fb14a79813b863c902f',
            'duration': 89280,
        },
        'playlist_count': 73,
        'skip': 'Requires Linux Academy account credentials',
    }]

    _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
    _ORIGIN_URL = 'https://linuxacademy.com'
    _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
    _NETRC_MACHINE = 'linuxacademy'

    def _perform_login(self, username, password):
        """Sign in with *username* and *password* and validate the access token.

        Steps, in order: fetch the authorize page, decode the base64 JSON
        embedded in it and take its ``extraParams``, POST those plus the
        credentials to the username/password endpoint, POST the hidden form
        inputs of that response to the callback endpoint, obtain an access
        token (from the final URL, else from the page's
        ``authorizationResponse`` object) and request the token validation
        page with it.

        Raises ExtractorError (marked as expected) carrying the server's
        ``description`` or ``code`` when the login request answers HTTP 401;
        any other failure of that request is re-raised unchanged.
        """
        alphabet = '0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~'

        def random_string():
            # 32 characters drawn (with replacement) from the alphabet above
            return ''.join(random.choices(alphabet, k=32))

        def decode_base64(blob):
            return compat_b64decode(blob).decode('utf-8')

        authorize_query = {
            'client_id': self._CLIENT_ID,
            'response_type': 'token id_token',
            'response_mode': 'web_message',
            'redirect_uri': self._ORIGIN_URL,
            'scope': 'openid email user_impersonation profile',
            'audience': self._ORIGIN_URL,
            'state': random_string(),
            'nonce': random_string(),
        }
        webpage, authorize_handle = self._download_webpage_handle(
            self._AUTHORIZE_URL, None, 'Downloading authorize page',
            query=authorize_query)

        # The page passes a base64 blob to atob(); it decodes to JSON
        encoded_login_info = self._search_regex(
            r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
            'login info', group='value')
        login_data = self._parse_json(
            encoded_login_info, None,
            transform_source=decode_base64)['extraParams']

        credentials = {
            'client_id': self._CLIENT_ID,
            'redirect_uri': self._ORIGIN_URL,
            'tenant': 'lacausers',
            'connection': 'Username-Password-ACG-Proxy',
            'username': username,
            'password': password,
            'sso': 'true',
        }
        login_data.update(credentials)

        # Final URL of the authorize request; reused as Referer for both POSTs
        login_state_url = authorize_handle.url
        origin_headers = {
            'Origin': 'https://login.linuxacademy.com',
            'Referer': login_state_url,
        }

        try:
            login_page = self._download_webpage(
                'https://login.linuxacademy.com/usernamepassword/login', None,
                'Downloading login page', data=json.dumps(login_data).encode(),
                headers={'Content-Type': 'application/json', **origin_headers})
        except ExtractorError as e:
            unauthorized = isinstance(e.cause, HTTPError) and e.cause.status == 401
            if not unauthorized:
                raise
            error = self._parse_json(e.cause.response.read(), None)
            message = error.get('description') or error['code']
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, message), expected=True)

        callback_form = urlencode_postdata(self._hidden_inputs(login_page))
        callback_page, callback_handle = self._download_webpage_handle(
            'https://login.linuxacademy.com/login/callback', None,
            'Downloading callback page',
            data=callback_form,
            headers={
                'Content-Type': 'application/x-www-form-urlencoded',
                **origin_headers,
            })

        # Prefer a token carried in the final URL, else read it from the page
        access_token = self._search_regex(
            r'access_token=([^=&]+)', callback_handle.url,
            'access token', default=None)
        if not access_token:
            authorization_js = self._search_regex(
                r'authorizationResponse\s*=\s*({.+?})\s*;', callback_page,
                'authorization response')
            authorization = self._parse_json(
                authorization_js, None, transform_source=js_to_json)
            access_token = authorization['response']['access_token']

        self._download_webpage(
            'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s'
            % access_token, None, 'Downloading token validation page')

    def _real_extract(self, url):
        """Return a playlist dict for a course URL or an info dict for a lesson URL."""
        groups = self._match_valid_url(url).groupdict()
        course_id = groups['course_id']
        if course_id:
            item_id = course_id
        else:
            item_id = '%s-%s' % (groups['chapter_id'], groups['lesson_id'])

        webpage = self._download_webpage(url, item_id)

        if course_id:
            return self._extract_course_playlist(url, webpage, course_id)
        return self._extract_lesson(webpage, item_id)

    def _extract_course_playlist(self, url, webpage, course_id):
        """Build the playlist result from the ``window.module`` JSON in *webpage*."""
        module = self._parse_json(
            self._search_regex(
                r'window\.module\s*=\s*({(?:(?!};)[^"]|"([^"]|\\")*")+})\s*;', webpage, 'module'),
            course_id)

        def iter_lessons():
            # Section items carry no video; they only set the chapter
            # metadata attached to the lessons that follow them.
            chapter = chapter_id = chapter_number = None
            for item in module['items']:
                if not isinstance(item, dict):
                    continue

                type_fields = {
                    (try_get(item, lambda x, k=key: x['type'][k], compat_str) or '').lower()
                    for key in ('name', 'slug')
                }
                if 'section' in type_fields:
                    # Move to next module section
                    chapter = item.get('course_name')
                    chapter_id = item.get('course_module')
                    chapter_number = (chapter_number or 0) + 1
                    continue
                if 'lesson' not in type_fields:
                    # Skip non-lessons
                    continue

                lesson_url = urljoin(url, item.get('url'))
                if not lesson_url:
                    continue

                yield {
                    '_type': 'url_transparent',
                    'url': lesson_url,
                    'ie_key': LinuxAcademyIE.ie_key(),
                    'title': item.get('title') or item.get('lesson_name'),
                    'description': (
                        item.get('md_desc')
                        or clean_html(item.get('description'))
                        or clean_html(item.get('text'))),
                    'timestamp': (
                        unified_timestamp(item.get('date'))
                        or unified_timestamp(item.get('created_on'))),
                    'duration': parse_duration(item.get('duration')),
                    'chapter': chapter,
                    'chapter_id': chapter_id,
                    'chapter_number': chapter_number,
                }

        # Materialise the entries before the playlist metadata is evaluated
        entries = list(iter_lessons())
        return {
            '_type': 'playlist',
            'entries': entries,
            'id': course_id,
            'title': module.get('title'),
            'description': module.get('md_desc') or clean_html(module.get('desc')),
            'duration': parse_duration(module.get('duration')),
        }

    def _extract_lesson(self, webpage, item_id):
        """Build the info dict for a single lesson from *webpage*.

        Formats come from the first ``player.playlist`` entry's ``file`` URL.
        Metadata is taken from the ``window.lesson`` / ``player.lesson`` JSON
        when one is found; the title falls back to markup on the page.
        """
        playlist = self._parse_json(
            self._search_regex(
                r'player\.playlist\s*=\s*(\[.+?\])\s*;', webpage, 'playlist'),
            item_id)
        m3u8_url = playlist[0]['file']

        info = {
            'id': item_id,
            'formats': self._extract_m3u8_formats(
                m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id='hls'),
        }

        lesson_json = self._search_regex(
            (r'window\.lesson\s*=\s*({.+?})\s*;',
             r'player\.lesson\s*=\s*({.+?})\s*;'),
            webpage, 'lesson', default='{}')
        lesson = self._parse_json(lesson_json, item_id, fatal=False)
        if lesson:
            info['title'] = lesson.get('lesson_name')
            info['description'] = lesson.get('md_desc') or clean_html(lesson.get('desc'))
            info['timestamp'] = (
                unified_timestamp(lesson.get('date'))
                or unified_timestamp(lesson.get('created_on')))
            info['duration'] = parse_duration(lesson.get('duration'))

        if not info.get('title'):
            info['title'] = self._search_regex(
                (r'>Lecture\s*:\s*(?P<value>[^<]+)',
                 r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
                'title', group='value')
        return info