]>
Commit | Line | Data |
---|---|---|
1 | import re | |
2 | ||
3 | from .common import InfoExtractor | |
4 | from .wistia import WistiaIE | |
5 | from ..utils import ( | |
6 | clean_html, | |
7 | ExtractorError, | |
8 | int_or_none, | |
9 | get_element_by_class, | |
10 | strip_or_none, | |
11 | urlencode_postdata, | |
12 | urljoin, | |
13 | ) | |
14 | ||
15 | ||
class TeachableBaseIE(InfoExtractor):
    """Common base for the Teachable extractors: site table and login flow."""

    _NETRC_MACHINE = 'teachable'
    _URL_PREFIX = 'teachable:'

    # Known host name -> name passed as ``netrc_machine`` when credentials
    # are looked up for that host (see _login).
    _SITES = {
        # Only notable ones here
        'v1.upskillcourses.com': 'upskill',
        'gns3.teachable.com': 'gns3',
        'academyhacker.com': 'academyhacker',
        'stackskills.com': 'stackskills',
        'market.saleshacker.com': 'saleshacker',
        'learnability.org': 'learnability',
        'edurila.com': 'edurila',
        'courses.workitdaily.com': 'workitdaily',
    }

    # (URL prefix, alternation of the regex-escaped known hosts); subclasses
    # interpolate this pair into the two %s slots of their _VALID_URL.
    _VALID_URL_SUB_TUPLE = (_URL_PREFIX, '|'.join(map(re.escape, _SITES)))

    def _real_initialize(self):
        """Start out with the logged-in flag cleared."""
        self._logged_in = False

    def _login(self, site):
        """Sign in to *site* unless this instance is already logged in.

        Returns silently when no username is available. On success the
        ``_logged_in`` flag is set so later calls return immediately.

        Raises:
            ExtractorError: the site asks for acceptance of a new privacy
                policy, the response contains an ``alert`` element, or no
                logged-in marker is found after submitting the form.
        """
        if self._logged_in:
            return

        netrc_machine = self._SITES.get(site, site)
        username, password = self._get_login_info(netrc_machine=netrc_machine)
        if username is None:
            return

        sign_in_page, handle = self._download_webpage_handle(
            'https://%s/sign_in' % site, None,
            'Downloading %s login page' % site)

        # Any one of these in a page is taken as proof of an active session.
        logged_in_markers = (
            r'class=["\']user-signout',
            r'<a[^>]+\bhref=["\']/sign_out',
            r'Log\s+[Oo]ut\s*<',
        )

        def is_logged(html):
            for marker in logged_in_markers:
                if re.search(marker, html):
                    return True
            return False

        if is_logged(sign_in_page):
            self._logged_in = True
            return

        # Use the URL reported by the response handle as referer and as the
        # base for a relative form action.
        login_url = handle.url

        form_fields = self._hidden_inputs(sign_in_page)
        form_fields['user[email]'] = username
        form_fields['user[password]'] = password

        post_url = self._search_regex(
            r'<form[^>]+action=(["\'])(?P<url>(?:(?!\1).)+)\1', sign_in_page,
            'post url', default=login_url, group='url')
        if not post_url.startswith('http'):
            post_url = urljoin(login_url, post_url)

        request_headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'Referer': login_url,
        }
        response = self._download_webpage(
            post_url, None, 'Logging in to %s' % site,
            data=urlencode_postdata(form_fields),
            headers=request_headers)

        if '>I accept the new Privacy Policy<' in response:
            raise ExtractorError(
                'Unable to login: %s asks you to accept new Privacy Policy. '
                'Go to https://%s/ and accept.' % (site, site), expected=True)

        # Successful login
        if is_logged(response):
            self._logged_in = True
            return

        alert = get_element_by_class('alert', response)
        if alert is None:
            raise ExtractorError('Unable to log in')
        raise ExtractorError(
            'Unable to login: %s' % clean_html(alert), expected=True)
99 | ||
100 | ||
class TeachableIE(TeachableBaseIE):
    """Extractor for a single Teachable lecture page.

    Matches ``/courses/<course>/lectures/<id>`` either on one of the hosts
    listed in ``_SITES`` or on any host when the URL carries the
    ``teachable:`` prefix, and hands the embedded Wistia URLs to WistiaIE.
    """

    _VALID_URL = r'''(?x)
                    (?:
                        %shttps?://(?P<site_t>[^/]+)|
                        https?://(?:www\.)?(?P<site>%s)
                    )
                    /courses/[^/]+/lectures/(?P<id>\d+)
                    ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE

    _TESTS = [{
        'url': 'https://gns3.teachable.com/courses/gns3-certified-associate/lectures/6842364',
        'info_dict': {
            'id': 'untlgzk1v7',
            'ext': 'bin',
            'title': 'Overview',
            'description': 'md5:071463ff08b86c208811130ea1c2464c',
            'duration': 736.4,
            'timestamp': 1542315762,
            'upload_date': '20181115',
            'chapter': 'Welcome',
            'chapter_number': 1,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://v1.upskillcourses.com/courses/119763/lectures/1747100',
        'only_matching': True,
    }, {
        'url': 'https://gns3.teachable.com/courses/423415/lectures/6885939',
        'only_matching': True,
    }, {
        'url': 'teachable:https://v1.upskillcourses.com/courses/essential-web-developer-course/lectures/1747100',
        'only_matching': True,
    }]

    @staticmethod
    def _is_teachable(webpage):
        """Return a truthy value when *webpage* carries both Teachable markers.

        The page must contain the ``teachableTracker.linker:autoLink`` string
        and a ``<link>`` whose href points at a teachablecdn.com host. The
        result is only meant to be used for its truthiness.
        """
        return 'teachableTracker.linker:autoLink' in webpage and re.search(
            r'<link[^>]+href=["\']https?://(?:process\.fs|assets)\.teachablecdn\.com',
            webpage)

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Yield *url* with the ``teachable:`` prefix for Teachable pages.

        Nothing is yielded unless the page looks like Teachable and the URL
        path starts with ``/courses`` or ``/p``. After the yield,
        ``cls.StopExtraction`` is raised.
        """
        if cls._is_teachable(webpage):
            if re.match(r'https?://[^/]+/(?:courses|p)', url):
                yield f'{cls._URL_PREFIX}{url}'
                raise cls.StopExtraction()

    def _real_extract(self, url):
        """Build a playlist of Wistia entries for the lecture at *url*.

        Raises:
            ExtractorError: no Wistia URL was found on the page. When the
                page shows one of the "locked" markers,
                ``raise_login_required`` is called first.
        """
        mobj = self._match_valid_url(url)
        site = mobj.group('site') or mobj.group('site_t')
        video_id = mobj.group('id')

        self._login(site)

        # The prefix only exists to route the URL here; drop it before
        # fetching the page.
        if url.startswith(self._URL_PREFIX):
            url = url[len(self._URL_PREFIX):]

        webpage = self._download_webpage(url, video_id)

        # Materialise the result before testing it: a generator object is
        # always truthy, so a lazy return value would make the emptiness
        # check below unreachable.
        wistia_urls = list(WistiaIE._extract_embed_urls(url, webpage) or ())
        if not wistia_urls:
            if any(re.search(p, webpage) for p in (
                    r'class=["\']lecture-contents-locked',
                    r'>\s*Lecture contents locked',
                    r'id=["\']lecture-locked',
                    # https://academy.tailoredtutors.co.uk/courses/108779/lectures/1955313
                    r'class=["\'](?:inner-)?lesson-locked',
                    r'>LESSON LOCKED<')):
                self.raise_login_required('Lecture contents locked')
            raise ExtractorError('Unable to find video URL')

        title = self._og_search_title(webpage, default=None)

        chapter = None
        chapter_number = None
        # Sidebar <li> belonging to this lecture; it carries the 1-based
        # position of the section the lecture sits in.
        section_item = self._search_regex(
            r'(?s)(?P<li><li[^>]+\bdata-lecture-id=["\']%s[^>]+>.+?</li>)' % video_id,
            webpage, 'section item', default=None, group='li')
        if section_item:
            chapter_number = int_or_none(self._search_regex(
                r'data-ss-position=["\'](\d+)', section_item, 'section id',
                default=None))
            if chapter_number is not None:
                sections = []
                for s in re.findall(
                        r'(?s)<div[^>]+\bclass=["\']section-title[^>]+>(.+?)</div>', webpage):
                    section = strip_or_none(clean_html(s))
                    if not section:
                        # An empty title makes the position -> title mapping
                        # unreliable, so give up on the chapter name.
                        sections = []
                        break
                    sections.append(section)
                # Require a position of at least 1: a position of 0 would
                # otherwise index sections[-1] and report the last section.
                if 1 <= chapter_number <= len(sections):
                    chapter = sections[chapter_number - 1]

        entries = [{
            '_type': 'url_transparent',
            'url': wistia_url,
            'ie_key': WistiaIE.ie_key(),
            'title': title,
            'chapter': chapter,
            'chapter_number': chapter_number,
        } for wistia_url in wistia_urls]

        return self.playlist_result(entries, video_id, title)
208 | ||
209 | ||
class TeachableCourseIE(TeachableBaseIE):
    """Extractor for a Teachable course page, returned as a playlist of lectures."""

    _VALID_URL = r'''(?x)
                        (?:
                            %shttps?://(?P<site_t>[^/]+)|
                            https?://(?:www\.)?(?P<site>%s)
                        )
                        /(?:courses|p)/(?:enrolled/)?(?P<id>[^/?#&]+)
                    ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE
    _TESTS = [{
        'url': 'http://v1.upskillcourses.com/courses/essential-web-developer-course/',
        'info_dict': {
            'id': 'essential-web-developer-course',
            'title': 'The Essential Web Developer Course (Free)',
        },
        'playlist_count': 192,
    }, {
        'url': 'http://v1.upskillcourses.com/courses/119763/',
        'only_matching': True,
    }, {
        'url': 'http://v1.upskillcourses.com/courses/enrolled/119763',
        'only_matching': True,
    }, {
        'url': 'https://gns3.teachable.com/courses/enrolled/423415',
        'only_matching': True,
    }, {
        'url': 'teachable:https://learn.vrdev.school/p/gear-vr-developer-mini',
        'only_matching': True,
    }, {
        'url': 'teachable:https://filmsimplified.com/p/davinci-resolve-15-crash-course',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Decline URLs that TeachableIE accepts; otherwise defer to the parent check."""
        if TeachableIE.suitable(url):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Collect the lecture links of the course at *url* into a playlist.

        Only list items that show the ``fa-youtube-play`` icon or a
        ``m:ss``-like duration, and that contain a link, become entries.
        """
        url_match = self._match_valid_url(url)
        site = url_match.group('site') or url_match.group('site_t')
        course_id = url_match.group('id')

        self._login(site)

        # Remember whether the routing prefix was present so that it can be
        # put back on every lecture URL produced below.
        prefixed = url.startswith(self._URL_PREFIX)
        if prefixed:
            url = url[len(self._URL_PREFIX):]

        webpage = self._download_webpage(url, course_id)

        url_base = 'https://%s/' % site

        entries = []
        for item_match in re.finditer(
                r'(?s)(?P<li><li[^>]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?</li>)',
                webpage):
            li = item_match.group('li')
            # Skip items showing neither the play icon nor a duration.
            if not ('fa-youtube-play' in li or re.search(r'\d{1,2}:\d{2}', li)):
                continue
            lecture_url = self._search_regex(
                r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', li,
                'lecture url', default=None, group='url')
            if not lecture_url:
                continue
            lecture_id = self._search_regex(
                r'/lectures/(\d+)', lecture_url, 'lecture id', default=None)
            title = self._html_search_regex(
                r'<span[^>]+class=["\']lecture-name[^>]+>([^<]+)', li,
                'title', default=None)
            entry_url = urljoin(url_base, lecture_url)
            if prefixed:
                entry_url = self._URL_PREFIX + entry_url
            entry = self.url_result(
                entry_url,
                ie=TeachableIE.ie_key(), video_id=lecture_id,
                video_title=clean_html(title))
            entries.append(entry)

        course_title_patterns = (
            r'(?s)<img[^>]+class=["\']course-image[^>]+>\s*<h\d>(.+?)</h',
            r'(?s)<h\d[^>]+class=["\']course-title[^>]+>(.+?)</h',
        )
        course_title = self._html_search_regex(
            course_title_patterns, webpage, 'course title', fatal=False)

        return self.playlist_result(entries, course_id, course_title)