]>
Commit | Line | Data |
---|---|---|
1 | import re | |
2 | ||
3 | from .common import InfoExtractor | |
4 | from ..utils import ( | |
5 | clean_html, | |
6 | determine_ext, | |
7 | ExtractorError, | |
8 | float_or_none, | |
9 | get_element_by_class, | |
10 | get_element_by_id, | |
11 | parse_duration, | |
12 | remove_end, | |
13 | urlencode_postdata, | |
14 | urljoin, | |
15 | ) | |
16 | ||
17 | ||
class TeamTreeHouseIE(InfoExtractor):
    """Extractor for teamtreehouse.com /library/ pages.

    Handles three page layouts under the same URL pattern:
      * a single video page (detected via an HTML5 media element),
      * a workshop page (a <ul id="workshop-videos"> listing),
      * a course page (stages fetched from the #syllabus-stages data-url).
    Course/workshop pages yield a playlist of url_transparent entries that
    are re-resolved through this same extractor.
    """
    _VALID_URL = r'https?://(?:www\.)?teamtreehouse\.com/library/(?P<id>[^/]+)'
    _TESTS = [{
        # Course
        'url': 'https://teamtreehouse.com/library/introduction-to-user-authentication-in-php',
        'info_dict': {
            'id': 'introduction-to-user-authentication-in-php',
            'title': 'Introduction to User Authentication in PHP',
            'description': 'md5:405d7b4287a159b27ddf30ca72b5b053',
        },
        'playlist_mincount': 24,
    }, {
        # WorkShop
        'url': 'https://teamtreehouse.com/library/deploying-a-react-app',
        'info_dict': {
            'id': 'deploying-a-react-app',
            'title': 'Deploying a React App',
            'description': 'md5:10a82e3ddff18c14ac13581c9b8e5921',
        },
        'playlist_mincount': 4,
    }, {
        # Video
        'url': 'https://teamtreehouse.com/library/application-overview-2',
        'info_dict': {
            'id': 'application-overview-2',
            'ext': 'mp4',
            'title': 'Application Overview',
            'description': 'md5:4b0a234385c27140a4378de5f1e15127',
        },
        'expected_warnings': ['This is just a preview'],
    }]
    _NETRC_MACHINE = 'teamtreehouse'

    def _perform_login(self, username, password):
        """Log in via the signin form; raise ExtractorError on a reported error.

        Fetches the signin page to pick up hidden CSRF/session inputs, then
        POSTs the credentials. A `.error-message` element in the response
        indicates a rejected login.
        """
        signin_page = self._download_webpage(
            'https://teamtreehouse.com/signin',
            None, 'Downloading signin page')
        # Carry over the hidden form fields (authenticity token etc.).
        data = self._form_hidden_inputs('new_user_session', signin_page)
        data.update({
            'user_session[email]': username,
            'user_session[password]': password,
        })
        error_message = get_element_by_class('error-message', self._download_webpage(
            'https://teamtreehouse.com/person_session',
            None, 'Logging in', data=urlencode_postdata(data)))
        if error_message:
            raise ExtractorError(clean_html(error_message), expected=True)

    def _real_extract(self, url):
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        title = self._html_search_meta(['og:title', 'twitter:title'], webpage)
        description = self._html_search_meta(
            ['description', 'og:description', 'twitter:description'], webpage)
        entries = self._parse_html5_media_entries(url, webpage, display_id)
        if entries:
            # Single-video page: the HTML5 media entry carries the formats.
            info = entries[0]

            # Subtitle URLs often lack an extension; default to srt.
            for subtitles in info.get('subtitles', {}).values():
                for subtitle in subtitles:
                    subtitle['ext'] = determine_ext(subtitle['url'], 'srt')

            is_preview = 'data-preview="true"' in webpage
            if is_preview:
                self.report_warning(
                    'This is just a preview. You need to be signed in with a Basic account to download the entire video.', display_id)
                # Previews are capped at 30 seconds by the site.
                duration = 30
            else:
                # data-duration is in milliseconds; fall back to the
                # human-readable #video-duration element.
                duration = float_or_none(self._search_regex(
                    r'data-duration="(\d+)"', webpage, 'duration'), 1000)
                if not duration:
                    duration = parse_duration(get_element_by_id(
                        'video-duration', webpage))

            info.update({
                'id': display_id,
                'title': title,
                'description': description,
                'duration': duration,
            })
            return info
        else:
            # Course/workshop page: collect links to the individual videos.
            def extract_urls(html, extract_info=None):
                for path in re.findall(r'<a[^>]+href="([^"]+)"', html):
                    page_url = urljoin(url, path)
                    entry = {
                        '_type': 'url_transparent',
                        'id': self._match_id(page_url),
                        'url': page_url,
                        # BUG FIX: the entry hint field is 'ie_key', not
                        # 'id_key' — otherwise the extractor hint is ignored.
                        'ie_key': self.ie_key(),
                    }
                    if extract_info:
                        entry.update(extract_info)
                    entries.append(entry)

            workshop_videos = self._search_regex(
                r'(?s)<ul[^>]+id="workshop-videos"[^>]*>(.+?)</ul>',
                webpage, 'workshop videos', default=None)
            if workshop_videos:
                extract_urls(workshop_videos)
            else:
                stages_path = self._search_regex(
                    r'(?s)<div[^>]+id="syllabus-stages"[^>]+data-url="([^"]+)"',
                    webpage, 'stages path')
                if stages_path:
                    stages_page = self._download_webpage(
                        urljoin(url, stages_path), display_id, 'Downloading stages page')
                    # Each <h2> names a chapter; the following <ul> holds its steps.
                    for chapter_number, (chapter, steps_list) in enumerate(re.findall(r'(?s)<h2[^>]*>\s*(.+?)\s*</h2>.+?<ul[^>]*>(.+?)</ul>', stages_page), 1):
                        extract_urls(steps_list, {
                            'chapter': chapter,
                            'chapter_number': chapter_number,
                        })
                    title = remove_end(title, ' Course')

            return self.playlist_result(
                entries, display_id, title, description)