]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/safari.py
[misc] Add `hatch`, `ruff`, `pre-commit` and improve dev docs (#7409)
[yt-dlp.git] / yt_dlp / extractor / safari.py
1 import json
2 import re
3
4 from .common import InfoExtractor
5 from ..compat import (
6 compat_parse_qs,
7 compat_urlparse,
8 )
9 from ..utils import (
10 ExtractorError,
11 update_url_query,
12 )
13
14
class SafariBaseIE(InfoExtractor):
    """Base class carrying the login flow shared by the extractors in this file."""

    _LOGIN_URL = 'https://learning.oreilly.com/accounts/login/'
    _NETRC_MACHINE = 'safari'

    _API_BASE = 'https://learning.oreilly.com/api/v1'
    _API_FORMAT = 'json'

    # Set to True on the instance once _perform_login has succeeded
    LOGGED_IN = False

    def _perform_login(self, username, password):
        """Log in with the given credentials and record success in LOGGED_IN.

        Returns None on success (including when the session is already
        authenticated). Raises ExtractorError when the credentials are
        rejected or when the final redirect does not end up on the home page.
        """
        def landed_on_home(handle):
            # A session counts as authenticated when the request ends up on the home page
            return 'learning.oreilly.com/home/' in handle.url

        _, check_handle = self._download_webpage_handle(
            'https://learning.oreilly.com/accounts/login-check/', None,
            'Downloading login page')

        if landed_on_home(check_handle):
            self.LOGGED_IN = True
            return

        # Not logged in: the final URL of the check request carries the
        # post-login target in its `next` query parameter
        redirect_url = check_handle.url
        query = compat_parse_qs(compat_urlparse.urlparse(redirect_url).query)
        next_uri = compat_urlparse.urljoin(
            'https://api.oreilly.com', query['next'][0])

        payload = json.dumps({
            'email': username,
            'password': password,
            'redirect_uri': next_uri,
        }).encode()
        request_headers = {
            'Content-Type': 'application/json',
            'Referer': redirect_url,
        }
        # HTTP 400 is accepted so that the JSON body of a failed login can be inspected
        auth, auth_handle = self._download_json_handle(
            'https://www.oreilly.com/member/auth/login/', None, 'Logging in',
            data=payload, headers=request_headers, expected_status=400)

        credentials = auth.get('credentials')
        succeeded = auth.get('logged_in') or auth.get('redirect_uri')
        if credentials and not succeeded:
            raise ExtractorError(
                'Unable to login: %s' % credentials, expected=True)

        # oreilly serves two same instances of the following cookies
        # in Set-Cookie header and expects first one to be actually set
        for cookie_name in ('groot_sessionid', 'orm-jwt', 'orm-rt'):
            self._apply_first_set_cookie_header(auth_handle, cookie_name)

        _, final_handle = self._download_webpage_handle(
            auth.get('redirect_uri') or next_uri, None, 'Completing login')

        if not landed_on_home(final_handle):
            raise ExtractorError('Unable to log in')

        self.LOGGED_IN = True
72
73
class SafariIE(SafariBaseIE):
    """Extractor for single video pages; hands the actual download to Kaltura."""

    IE_NAME = 'safari'
    IE_DESC = 'safaribooksonline.com online video'
    _VALID_URL = r'''(?x)
                        https?://
                            (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/
                            (?:
                                library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?\#&]+)\.html|
                                videos/[^/]+/[^/]+/(?P<reference_id>[^-]+-[^/?\#&]+)
                            )
                    '''

    _TESTS = [{
        'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html',
        'md5': 'dcc5a425e79f2564148652616af1f2a3',
        'info_dict': {
            'id': '0_qbqx90ic',
            'ext': 'mp4',
            'title': 'Introduction to Hadoop Fundamentals LiveLessons',
            'timestamp': 1437758058,
            'upload_date': '20150724',
            'uploader_id': 'stork',
        },
    }, {
        # non-digits in course id
        'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html',
        'only_matching': True,
    }, {
        'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html',
        'only_matching': True,
    }, {
        'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00',
        'only_matching': True,
    }, {
        'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838/9780133392838-00_SeriesIntro',
        'only_matching': True,
    }, {
        'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/00_SeriesIntro.html',
        'only_matching': True,
    }]

    # Fallback Kaltura identifiers, used when the page does not provide its own
    _PARTNER_ID = '1926081'
    _UICONF_ID = '29375172'

    def _real_extract(self, url):
        """Resolve `url` to a Kaltura embed URL and delegate to the Kaltura extractor.

        For `videos/...` URLs the reference id is taken straight from the URL.
        For `library/view/...` URLs the page is downloaded and the reference
        id is taken from the final (possibly redirected) URL or, failing that,
        from the `data-reference-id` attribute in the page.
        """
        mobj = self._match_valid_url(url)

        reference_id = mobj.group('reference_id')
        if reference_id:
            video_id = reference_id
            partner_id = self._PARTNER_ID
            ui_id = self._UICONF_ID
        else:
            video_id = '%s-%s' % (mobj.group('course_id'), mobj.group('part'))

            webpage, urlh = self._download_webpage_handle(url, video_id)

            # The request may have been redirected to a URL that no longer
            # matches _VALID_URL, in which case re.match returns None; fall
            # back to the page attributes instead of failing on None.group()
            redirected = re.match(self._VALID_URL, urlh.url)
            reference_id = redirected.group('reference_id') if redirected else None
            if not reference_id:
                reference_id = self._search_regex(
                    r'data-reference-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
                    webpage, 'kaltura reference id', group='id')
                partner_id = self._search_regex(
                    r'data-partner-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
                    webpage, 'kaltura widget id', default=self._PARTNER_ID,
                    group='id')
                ui_id = self._search_regex(
                    r'data-ui-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
                    webpage, 'kaltura uiconf id', default=self._UICONF_ID,
                    group='id')
            else:
                partner_id = self._PARTNER_ID
                ui_id = self._UICONF_ID

        query = {
            'wid': '_%s' % partner_id,
            'uiconf_id': ui_id,
            'flashvars[referenceId]': reference_id,
        }

        if self.LOGGED_IN:
            # Best effort: a failed session request (fatal=False) simply
            # leaves the query without a `ks` entry
            kaltura_session = self._download_json(
                '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id),
                video_id, 'Downloading kaltura session JSON',
                'Unable to download kaltura session JSON', fatal=False,
                headers={'Accept': 'application/json'})
            session = kaltura_session.get('session') if kaltura_session else None
            if session:
                query['flashvars[ks]'] = session

        return self.url_result(update_url_query(
            'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query),
            'Kaltura')
166
167
class SafariApiIE(SafariBaseIE):
    """Extractor for API chapter URLs; resolves them to a page URL for SafariIE."""

    IE_NAME = 'safari:api'
    _VALID_URL = r'https?://(?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>[^/?#&]+)\.html'

    _TESTS = [{
        'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
        'only_matching': True,
    }, {
        'url': 'https://www.safaribooksonline.com/api/v1/book/9780134664057/chapter/RHCE_Introduction.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download the chapter JSON and delegate its `web_url` to SafariIE.

        A `library/view` web URL is rewritten into the `videos/...` form,
        with the last path segment rebuilt from the chapter's `natural_key`.
        """
        match = self._match_valid_url(url)
        display_id = f'{match.group("course_id")}/{match.group("part")}'
        part = self._download_json(url, display_id, 'Downloading part JSON')

        target_url = part['web_url']
        if 'library/view' in target_url:
            videos_url = target_url.replace('library/view', 'videos')
            natural_key = part['natural_key']
            parent_url = videos_url.rsplit('/', 1)[0]
            # [:-5] drops the last five characters of the second key
            # (presumably the ".html" suffix; confirm against the API payload)
            target_url = f'{parent_url}/{natural_key[0]}-{natural_key[1][:-5]}'

        return self.url_result(target_url, SafariIE.ie_key())
191
192
class SafariCourseIE(SafariBaseIE):
    """Extractor for whole courses; yields one playlist entry per chapter."""

    IE_NAME = 'safari:course'
    IE_DESC = 'safaribooksonline.com online courses'

    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/
                            (?:
                                library/view/[^/]+|
                                api/v1/book|
                                videos/[^/]+
                            )|
                            techbus\.safaribooksonline\.com
                        )
                        /(?P<id>[^/]+)
                    '''

    _TESTS = [{
        'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
        'info_dict': {
            'id': '9780133392838',
            'title': 'Hadoop Fundamentals LiveLessons',
        },
        'playlist_count': 22,
        'skip': 'Requires safaribooksonline account credentials',
    }, {
        'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json',
        'only_matching': True,
    }, {
        'url': 'http://techbus.safaribooksonline.com/9780134426365',
        'only_matching': True,
    }, {
        'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314',
        'only_matching': True,
    }, {
        'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838',
        'only_matching': True,
    }, {
        'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Claim `url` only when neither SafariIE nor SafariApiIE accepts it."""
        # The course pattern is a prefix of the single-video and API patterns,
        # so the more specific extractors get first refusal
        if SafariIE.suitable(url) or SafariApiIE.suitable(url):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Fetch the course JSON and build a playlist from its chapters.

        Raises ExtractorError when the course JSON has no `chapters` key.
        """
        course_id = self._match_id(url)

        api_url = f'{self._API_BASE}/book/{course_id}/?override_format={self._API_FORMAT}'
        course_json = self._download_json(
            api_url, course_id, 'Downloading course JSON')

        if 'chapters' not in course_json:
            raise ExtractorError(
                f'No chapters found for course {course_id}', expected=True)

        # Each chapter is passed to url_result as-is and handled by SafariApiIE
        api_key = SafariApiIE.ie_key()
        entries = [
            self.url_result(chapter_url, api_key)
            for chapter_url in course_json['chapters']]

        return self.playlist_result(entries, course_id, course_json['title'])