]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/safari.py
[ie/matchtv] Fix extractor (#10190)
[yt-dlp.git] / yt_dlp / extractor / safari.py
CommitLineData
a9e03736 1import json
32d687f5 2import re
add96eb9 3import urllib.parse
32d687f5 4
5from .common import InfoExtractor
32d687f5 6from ..utils import (
7 ExtractorError,
bcb668de 8 update_url_query,
32d687f5 9)
10
11
class SafariBaseIE(InfoExtractor):
    """Common base for the Safari / O'Reilly extractors: API constants and login."""

    _LOGIN_URL = 'https://learning.oreilly.com/accounts/login/'
    _NETRC_MACHINE = 'safari'

    _API_BASE = 'https://learning.oreilly.com/api/v1'
    _API_FORMAT = 'json'

    # Set to True on the instance once _perform_login has succeeded
    LOGGED_IN = False

    def _perform_login(self, username, password):
        """Log in with the given credentials and record success in ``LOGGED_IN``.

        Raises ExtractorError when the auth endpoint reports a credentials
        problem or when the final redirect does not end up on the home page.
        """

        def landed_on_home(handle):
            # A response URL under /home/ is treated as "already authenticated"
            return 'learning.oreilly.com/home/' in handle.url

        _, check_handle = self._download_webpage_handle(
            'https://learning.oreilly.com/accounts/login-check/', None,
            'Downloading login page')

        if landed_on_home(check_handle):
            self.LOGGED_IN = True
            return

        # Not logged in yet: the URL we were sent to carries the post-login
        # target in its ``next`` query parameter
        redirect_url = check_handle.url
        query_params = urllib.parse.parse_qs(
            urllib.parse.urlparse(redirect_url).query)
        next_uri = urllib.parse.urljoin(
            'https://api.oreilly.com', query_params['next'][0])

        login_payload = json.dumps({
            'email': username,
            'password': password,
            'redirect_uri': next_uri,
        }).encode()
        login_headers = {
            'Content-Type': 'application/json',
            'Referer': redirect_url,
        }
        # 400 is passed as an expected status, presumably so that the JSON
        # error body can still be inspected below
        auth, login_handle = self._download_json_handle(
            'https://www.oreilly.com/member/auth/login/', None, 'Logging in',
            data=login_payload, headers=login_headers, expected_status=400)

        credentials = auth.get('credentials')
        login_rejected = not (auth.get('logged_in') or auth.get('redirect_uri'))
        if login_rejected and credentials:
            raise ExtractorError(
                f'Unable to login: {credentials}', expected=True)

        # oreilly sends each of these cookies twice in the Set-Cookie header
        # and expects the first occurrence to be the one that is actually set
        for cookie_name in ('groot_sessionid', 'orm-jwt', 'orm-rt'):
            self._apply_first_set_cookie_header(login_handle, cookie_name)

        _, final_handle = self._download_webpage_handle(
            auth.get('redirect_uri') or next_uri, None, 'Completing login')

        if not landed_on_home(final_handle):
            raise ExtractorError('Unable to log in')

        self.LOGGED_IN = True
e9c8999e 69
32d687f5 70
class SafariIE(SafariBaseIE):
    """Extractor for a single Safari / O'Reilly video page; delegates to Kaltura."""

    IE_NAME = 'safari'
    IE_DESC = 'safaribooksonline.com online video'
    _VALID_URL = r'''(?x)
                        https?://
                            (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/
                            (?:
                                library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?\#&]+)\.html|
                                videos/[^/]+/[^/]+/(?P<reference_id>[^-]+-[^/?\#&]+)
                            )
                    '''

    _TESTS = [{
        'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html',
        'md5': 'dcc5a425e79f2564148652616af1f2a3',
        'info_dict': {
            'id': '0_qbqx90ic',
            'ext': 'mp4',
            'title': 'Introduction to Hadoop Fundamentals LiveLessons',
            'timestamp': 1437758058,
            'upload_date': '20150724',
            'uploader_id': 'stork',
        },
    }, {
        # non-digits in course id
        'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html',
        'only_matching': True,
    }, {
        'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html',
        'only_matching': True,
    }, {
        'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00',
        'only_matching': True,
    }, {
        'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838/9780133392838-00_SeriesIntro',
        'only_matching': True,
    }, {
        'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/00_SeriesIntro.html',
        'only_matching': True,
    }]

    # Fallback Kaltura partner / uiconf ids, used when the page does not
    # provide its own (or when the URL already carries the reference id)
    _PARTNER_ID = '1926081'
    _UICONF_ID = '29375172'

    def _real_extract(self, url):
        """Resolve the Kaltura reference id for *url* and hand off to the Kaltura extractor.

        ``videos/...`` URLs carry the reference id directly. ``library/view``
        URLs require downloading the page and reading the id either from the
        URL the request ended up at or from the page markup.
        """
        mobj = self._match_valid_url(url)

        reference_id = mobj.group('reference_id')
        if reference_id:
            video_id = reference_id
            partner_id = self._PARTNER_ID
            ui_id = self._UICONF_ID
        else:
            video_id = '{}-{}'.format(mobj.group('course_id'), mobj.group('part'))

            webpage, urlh = self._download_webpage_handle(url, video_id)

            # The final URL may differ from the requested one and is not
            # guaranteed to match _VALID_URL; re.match then returns None, so
            # guard it instead of crashing with AttributeError and let the
            # webpage search below report a proper extraction error.
            final_mobj = re.match(self._VALID_URL, urlh.url)
            reference_id = final_mobj.group('reference_id') if final_mobj else None
            if not reference_id:
                reference_id = self._search_regex(
                    r'data-reference-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
                    webpage, 'kaltura reference id', group='id')
            partner_id = self._search_regex(
                r'data-partner-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
                webpage, 'kaltura widget id', default=self._PARTNER_ID,
                group='id')
            ui_id = self._search_regex(
                r'data-ui-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
                webpage, 'kaltura uiconf id', default=self._UICONF_ID,
                group='id')

        query = {
            'wid': f'_{partner_id}',
            'uiconf_id': ui_id,
            'flashvars[referenceId]': reference_id,
        }

        if self.LOGGED_IN:
            # Best effort: a failed session download is non-fatal and the
            # embed URL is then built without a session token
            kaltura_session = self._download_json(
                f'{self._API_BASE}/player/kaltura_session/?reference_id={reference_id}',
                video_id, 'Downloading kaltura session JSON',
                'Unable to download kaltura session JSON', fatal=False,
                headers={'Accept': 'application/json'})
            if kaltura_session:
                session = kaltura_session.get('session')
                if session:
                    query['flashvars[ks]'] = session

        return self.url_result(update_url_query(
            'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query),
            'Kaltura')
32d687f5 163
164
3aec7176
S
class SafariApiIE(SafariBaseIE):
    """Extractor for API chapter URLs; maps them to a page URL handled by SafariIE."""

    IE_NAME = 'safari:api'
    _VALID_URL = r'https?://(?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>[^/?#&]+)\.html'

    _TESTS = [{
        'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
        'only_matching': True,
    }, {
        'url': 'https://www.safaribooksonline.com/api/v1/book/9780134664057/chapter/RHCE_Introduction.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download the part JSON for *url* and return a url_result for SafariIE."""
        course_id, part_name = self._match_valid_url(url).group('course_id', 'part')
        part = self._download_json(
            url, f'{course_id}/{part_name}', 'Downloading part JSON')

        web_url = part['web_url']
        if 'library/view' in web_url:
            # Rewrite library/view URLs into the videos/<...>/<key0>-<key1> form
            web_url = web_url.replace('library/view', 'videos')
            natural_keys = part['natural_key']
            parent_url = web_url.rsplit('/', 1)[0]
            # The slice drops the last five characters of the second key
            # (presumably a '.html' suffix -- TODO confirm)
            web_url = f'{parent_url}/{natural_keys[0]}-{natural_keys[1][:-5]}'

        return self.url_result(web_url, SafariIE.ie_key())
3aec7176
S
188
189
class SafariCourseIE(SafariBaseIE):
    """Extractor for a whole course: yields one playlist entry per chapter."""

    IE_NAME = 'safari:course'
    IE_DESC = 'safaribooksonline.com online courses'

    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/
                            (?:
                                library/view/[^/]+|
                                api/v1/book|
                                videos/[^/]+
                            )|
                            techbus\.safaribooksonline\.com
                        )
                        /(?P<id>[^/]+)
                    '''

    _TESTS = [{
        'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
        'info_dict': {
            'id': '9780133392838',
            'title': 'Hadoop Fundamentals LiveLessons',
        },
        'playlist_count': 22,
        'skip': 'Requires safaribooksonline account credentials',
    }, {
        'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json',
        'only_matching': True,
    }, {
        'url': 'http://techbus.safaribooksonline.com/9780134426365',
        'only_matching': True,
    }, {
        'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314',
        'only_matching': True,
    }, {
        'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838',
        'only_matching': True,
    }, {
        'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Decline any URL that SafariIE or SafariApiIE already accepts."""
        if SafariIE.suitable(url) or SafariApiIE.suitable(url):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Fetch the course JSON and return a playlist of its chapters.

        Raises ExtractorError when the course JSON has no 'chapters' key.
        """
        course_id = self._match_id(url)

        api_url = f'{self._API_BASE}/book/{course_id}/?override_format={self._API_FORMAT}'
        course_json = self._download_json(
            api_url, course_id, 'Downloading course JSON')

        if 'chapters' not in course_json:
            raise ExtractorError(
                f'No chapters found for course {course_id}', expected=True)

        # Each chapter value is passed on as a URL for SafariApiIE to resolve
        api_ie_key = SafariApiIE.ie_key()
        entries = [
            self.url_result(chapter_url, api_ie_key)
            for chapter_url in course_json['chapters']]

        return self.playlist_result(entries, course_id, course_json['title'])