]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/safari.py
[misc] Add `hatch`, `ruff`, `pre-commit` and improve dev docs (#7409)
[yt-dlp.git] / yt_dlp / extractor / safari.py
CommitLineData
a9e03736 1import json
32d687f5 2import re
32d687f5 3
4from .common import InfoExtractor
a9e03736
S
5from ..compat import (
6 compat_parse_qs,
a9e03736
S
7 compat_urlparse,
8)
32d687f5 9from ..utils import (
10 ExtractorError,
bcb668de 11 update_url_query,
32d687f5 12)
13
14
class SafariBaseIE(InfoExtractor):
    """Base class holding the login flow shared by the Safari extractors.

    Subclasses read ``LOGGED_IN`` to decide whether authenticated API
    endpoints may be queried.
    """
    _LOGIN_URL = 'https://learning.oreilly.com/accounts/login/'
    _NETRC_MACHINE = 'safari'

    _API_BASE = 'https://learning.oreilly.com/api/v1'
    _API_FORMAT = 'json'

    # Class-level default; _perform_login sets it to True on the instance
    # once the site redirects to the logged-in home page.
    LOGGED_IN = False

    def _perform_login(self, username, password):
        """Log in with the given credentials and set ``self.LOGGED_IN``.

        Raises:
            ExtractorError: if the login-check redirect carries no ``next``
                parameter, if the auth endpoint reports a credentials
                problem, or if the final redirect does not land on the
                logged-in home page.
        """
        _, urlh = self._download_webpage_handle(
            'https://learning.oreilly.com/accounts/login-check/', None,
            'Downloading login page')

        def is_logged(handle):
            # Being redirected to the home page is treated as proof that
            # the session is authenticated.
            return 'learning.oreilly.com/home/' in handle.url

        if is_logged(urlh):
            self.LOGGED_IN = True
            return

        redirect_url = urlh.url
        parsed_url = compat_urlparse.urlparse(redirect_url)
        qs = compat_parse_qs(parsed_url.query)

        # The redirect is expected to carry a ``next`` parameter; fail with a
        # readable error instead of a bare KeyError when it does not.
        next_values = qs.get('next')
        if not next_values:
            raise ExtractorError(
                'Unable to log in: login redirect is missing the "next" parameter')
        next_uri = compat_urlparse.urljoin(
            'https://api.oreilly.com', next_values[0])

        # NOTE(review): expected_status=400 presumably lets the JSON body of a
        # rejected login be parsed so the error below can be reported - confirm.
        auth, urlh = self._download_json_handle(
            'https://www.oreilly.com/member/auth/login/', None, 'Logging in',
            data=json.dumps({
                'email': username,
                'password': password,
                'redirect_uri': next_uri,
            }).encode(), headers={
                'Content-Type': 'application/json',
                'Referer': redirect_url,
            }, expected_status=400)

        credentials = auth.get('credentials')
        if (not auth.get('logged_in') and not auth.get('redirect_uri')
                and credentials):
            raise ExtractorError(
                f'Unable to login: {credentials}', expected=True)

        # oreilly serves two same instances of the following cookies
        # in Set-Cookie header and expects first one to be actually set
        for cookie in ('groot_sessionid', 'orm-jwt', 'orm-rt'):
            self._apply_first_set_cookie_header(urlh, cookie)

        _, urlh = self._download_webpage_handle(
            auth.get('redirect_uri') or next_uri, None, 'Completing login')

        if is_logged(urlh):
            self.LOGGED_IN = True
            return

        raise ExtractorError('Unable to log in')
e9c8999e 72
32d687f5 73
class SafariIE(SafariBaseIE):
    """Extractor for a single video page; delegates playback to Kaltura.

    The URL either carries the Kaltura reference id directly (``videos/...``)
    or points to a ``library/view/...`` HTML page from which the id is read.
    """
    IE_NAME = 'safari'
    IE_DESC = 'safaribooksonline.com online video'
    _VALID_URL = r'''(?x)
                        https?://
                            (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/
                            (?:
                                library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?\#&]+)\.html|
                                videos/[^/]+/[^/]+/(?P<reference_id>[^-]+-[^/?\#&]+)
                            )
                    '''

    _TESTS = [{
        'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html',
        'md5': 'dcc5a425e79f2564148652616af1f2a3',
        'info_dict': {
            'id': '0_qbqx90ic',
            'ext': 'mp4',
            'title': 'Introduction to Hadoop Fundamentals LiveLessons',
            'timestamp': 1437758058,
            'upload_date': '20150724',
            'uploader_id': 'stork',
        },
    }, {
        # non-digits in course id
        'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html',
        'only_matching': True,
    }, {
        'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html',
        'only_matching': True,
    }, {
        'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00',
        'only_matching': True,
    }, {
        'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838/9780133392838-00_SeriesIntro',
        'only_matching': True,
    }, {
        'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/00_SeriesIntro.html',
        'only_matching': True,
    }]

    # Kaltura partner / uiconf ids used when the page does not supply its own.
    _PARTNER_ID = '1926081'
    _UICONF_ID = '29375172'

    def _real_extract(self, url):
        """Build a Kaltura embed URL for ``url`` and hand it to the Kaltura extractor."""
        mobj = self._match_valid_url(url)

        reference_id = mobj.group('reference_id')
        if reference_id:
            # ``videos/...`` URL: the reference id is part of the URL itself.
            video_id = reference_id
            partner_id = self._PARTNER_ID
            ui_id = self._UICONF_ID
        else:
            video_id = '%s-%s' % (mobj.group('course_id'), mobj.group('part'))

            webpage, urlh = self._download_webpage_handle(url, video_id)

            # The request may have been redirected. The final URL is not
            # guaranteed to match _VALID_URL, so guard against a None match
            # and fall back to scraping the id from the page.
            mobj = re.match(self._VALID_URL, urlh.url)
            reference_id = mobj.group('reference_id') if mobj else None
            if not reference_id:
                reference_id = self._search_regex(
                    r'data-reference-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
                    webpage, 'kaltura reference id', group='id')
            partner_id = self._search_regex(
                r'data-partner-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
                webpage, 'kaltura widget id', default=self._PARTNER_ID,
                group='id')
            ui_id = self._search_regex(
                r'data-ui-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
                webpage, 'kaltura uiconf id', default=self._UICONF_ID,
                group='id')

        query = {
            'wid': '_%s' % partner_id,
            'uiconf_id': ui_id,
            'flashvars[referenceId]': reference_id,
        }

        if self.LOGGED_IN:
            # Non-fatal: without a session the embed URL is still returned,
            # just without the ``flashvars[ks]`` parameter.
            kaltura_session = self._download_json(
                '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id),
                video_id, 'Downloading kaltura session JSON',
                'Unable to download kaltura session JSON', fatal=False,
                headers={'Accept': 'application/json'})
            if kaltura_session:
                session = kaltura_session.get('session')
                if session:
                    query['flashvars[ks]'] = session

        return self.url_result(update_url_query(
            'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query),
            'Kaltura')
32d687f5 166
167
3aec7176
S
class SafariApiIE(SafariBaseIE):
    """Resolve an ``api/v1/book/.../chapter/...`` URL to its video page.

    The chapter JSON is downloaded and its ``web_url`` is passed on to
    ``SafariIE``.
    """
    IE_NAME = 'safari:api'
    _VALID_URL = r'https?://(?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>[^/?#&]+)\.html'

    _TESTS = [{
        'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
        'only_matching': True,
    }, {
        'url': 'https://www.safaribooksonline.com/api/v1/book/9780134664057/chapter/RHCE_Introduction.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download the chapter JSON and delegate its web URL to SafariIE."""
        match = self._match_valid_url(url)
        display_id = '%s/%s' % (match.group('course_id'), match.group('part'))
        part_json = self._download_json(url, display_id, 'Downloading part JSON')

        target_url = part_json['web_url']
        if 'library/view' in target_url:
            # Rewrite a ``library/view`` page URL into the ``videos`` form:
            # <parent>/<natural_key[0]>-<natural_key[1] minus its last 5 chars>.
            # NOTE(review): the 5 dropped characters are presumably ".html" - confirm.
            target_url = target_url.replace('library/view', 'videos')
            keys = part_json['natural_key']
            parent = target_url.rsplit('/', 1)[0]
            target_url = f'{parent}/{keys[0]}-{keys[1][:-5]}'
        return self.url_result(target_url, SafariIE.ie_key())
3aec7176
S
191
192
class SafariCourseIE(SafariBaseIE):
    """Playlist extractor for a whole course.

    Each entry of the course JSON's ``chapters`` list becomes a playlist
    item handled by ``SafariApiIE``.
    """
    IE_NAME = 'safari:course'
    IE_DESC = 'safaribooksonline.com online courses'

    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/
                            (?:
                                library/view/[^/]+|
                                api/v1/book|
                                videos/[^/]+
                            )|
                            techbus\.safaribooksonline\.com
                        )
                        /(?P<id>[^/]+)
                    '''

    _TESTS = [{
        'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
        'info_dict': {
            'id': '9780133392838',
            'title': 'Hadoop Fundamentals LiveLessons',
        },
        'playlist_count': 22,
        'skip': 'Requires safaribooksonline account credentials',
    }, {
        'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json',
        'only_matching': True,
    }, {
        'url': 'http://techbus.safaribooksonline.com/9780134426365',
        'only_matching': True,
    }, {
        'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314',
        'only_matching': True,
    }, {
        'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838',
        'only_matching': True,
    }, {
        'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Decline any URL that SafariIE or SafariApiIE already claims."""
        if SafariIE.suitable(url) or SafariApiIE.suitable(url):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Fetch the course JSON and return its chapters as a playlist.

        Raises:
            ExtractorError: if the course JSON has no ``chapters`` key.
        """
        course_id = self._match_id(url)

        api_url = f'{self._API_BASE}/book/{course_id}/?override_format={self._API_FORMAT}'
        course_json = self._download_json(
            api_url, course_id, 'Downloading course JSON')

        if 'chapters' not in course_json:
            raise ExtractorError(
                'No chapters found for course %s' % course_id, expected=True)

        chapter_ie = SafariApiIE.ie_key()
        entries = [
            self.url_result(chapter_url, chapter_ie)
            for chapter_url in course_json['chapters']]

        return self.playlist_result(entries, course_id, course_json['title'])