yt_dlp/extractor/safari.py

   1 import json
   2 import re
   3 import urllib.parse
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     ExtractorError,
   8     update_url_query,
   9 )
  10
  11
  12 class SafariBaseIE(InfoExtractor):
  13     _LOGIN_URL = 'https://learning.oreilly.com/accounts/login/'
  14     _NETRC_MACHINE = 'safari'
  15
  16     _API_BASE = 'https://learning.oreilly.com/api/v1'
  17     _API_FORMAT = 'json'
  18
  19     LOGGED_IN = False
  20
  21     def _perform_login(self, username, password):
  22         _, urlh = self._download_webpage_handle(
  23             'https://learning.oreilly.com/accounts/login-check/', None,
  24             'Downloading login page')
  25
  26         def is_logged(urlh):
  27             return 'learning.oreilly.com/home/' in urlh.url
  28
  29         if is_logged(urlh):
  30             self.LOGGED_IN = True
  31             return
  32
  33         redirect_url = urlh.url
  34         parsed_url = urllib.parse.urlparse(redirect_url)
  35         qs = urllib.parse.parse_qs(parsed_url.query)
  36         next_uri = urllib.parse.urljoin(
  37             'https://api.oreilly.com', qs['next'][0])
  38
  39         auth, urlh = self._download_json_handle(
  40             'https://www.oreilly.com/member/auth/login/', None, 'Logging in',
  41             data=json.dumps({
  42                 'email': username,
  43                 'password': password,
  44                 'redirect_uri': next_uri,
  45             }).encode(), headers={
  46                 'Content-Type': 'application/json',
  47                 'Referer': redirect_url,
  48             }, expected_status=400)
  49
  50         credentials = auth.get('credentials')
  51         if (not auth.get('logged_in') and not auth.get('redirect_uri')
  52                 and credentials):
  53             raise ExtractorError(
  54                 f'Unable to login: {credentials}', expected=True)
  55
  56         # oreilly serves two same instances of the following cookies
  57         # in Set-Cookie header and expects first one to be actually set
  58         for cookie in ('groot_sessionid', 'orm-jwt', 'orm-rt'):
  59             self._apply_first_set_cookie_header(urlh, cookie)
  60
  61         _, urlh = self._download_webpage_handle(
  62             auth.get('redirect_uri') or next_uri, None, 'Completing login')
  63
  64         if is_logged(urlh):
  65             self.LOGGED_IN = True
  66             return
  67
  68         raise ExtractorError('Unable to log in')
  69
  70
  71 class SafariIE(SafariBaseIE):
  72     IE_NAME = 'safari'
  73     IE_DESC = 'safaribooksonline.com online video'
  74     _VALID_URL = r'''(?x)
  75                         https?://
  76                             (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/
  77                             (?:
  78                                 library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?\#&]+)\.html|
  79                                 videos/[^/]+/[^/]+/(?P<reference_id>[^-]+-[^/?\#&]+)
  80                             )
  81                     '''
  82
  83     _TESTS = [{
  84         'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html',
  85         'md5': 'dcc5a425e79f2564148652616af1f2a3',
  86         'info_dict': {
  87             'id': '0_qbqx90ic',
  88             'ext': 'mp4',
  89             'title': 'Introduction to Hadoop Fundamentals LiveLessons',
  90             'timestamp': 1437758058,
  91             'upload_date': '20150724',
  92             'uploader_id': 'stork',
  93         },
  94     }, {
  95         # non-digits in course id
  96         'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html',
  97         'only_matching': True,
  98     }, {
  99         'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html',
 100         'only_matching': True,
 101     }, {
 102         'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00',
 103         'only_matching': True,
 104     }, {
 105         'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838/9780133392838-00_SeriesIntro',
 106         'only_matching': True,
 107     }, {
 108         'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/00_SeriesIntro.html',
 109         'only_matching': True,
 110     }]
 111
 112     _PARTNER_ID = '1926081'
 113     _UICONF_ID = '29375172'
 114
 115     def _real_extract(self, url):
 116         mobj = self._match_valid_url(url)
 117
 118         reference_id = mobj.group('reference_id')
 119         if reference_id:
 120             video_id = reference_id
 121             partner_id = self._PARTNER_ID
 122             ui_id = self._UICONF_ID
 123         else:
 124             video_id = '{}-{}'.format(mobj.group('course_id'), mobj.group('part'))
 125
 126             webpage, urlh = self._download_webpage_handle(url, video_id)
 127
 128             mobj = re.match(self._VALID_URL, urlh.url)
 129             reference_id = mobj.group('reference_id')
 130             if not reference_id:
 131                 reference_id = self._search_regex(
 132                     r'data-reference-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
 133                     webpage, 'kaltura reference id', group='id')
 134             partner_id = self._search_regex(
 135                 r'data-partner-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
 136                 webpage, 'kaltura widget id', default=self._PARTNER_ID,
 137                 group='id')
 138             ui_id = self._search_regex(
 139                 r'data-ui-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
 140                 webpage, 'kaltura uiconf id', default=self._UICONF_ID,
 141                 group='id')
 142
 143         query = {
 144             'wid': f'_{partner_id}',
 145             'uiconf_id': ui_id,
 146             'flashvars[referenceId]': reference_id,
 147         }
 148
 149         if self.LOGGED_IN:
 150             kaltura_session = self._download_json(
 151                 f'{self._API_BASE}/player/kaltura_session/?reference_id={reference_id}',
 152                 video_id, 'Downloading kaltura session JSON',
 153                 'Unable to download kaltura session JSON', fatal=False,
 154                 headers={'Accept': 'application/json'})
 155             if kaltura_session:
 156                 session = kaltura_session.get('session')
 157                 if session:
 158                     query['flashvars[ks]'] = session
 159
 160         return self.url_result(update_url_query(
 161             'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query),
 162             'Kaltura')
 163
 164
 165 class SafariApiIE(SafariBaseIE):
 166     IE_NAME = 'safari:api'
 167     _VALID_URL = r'https?://(?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>[^/?#&]+)\.html'
 168
 169     _TESTS = [{
 170         'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
 171         'only_matching': True,
 172     }, {
 173         'url': 'https://www.safaribooksonline.com/api/v1/book/9780134664057/chapter/RHCE_Introduction.html',
 174         'only_matching': True,
 175     }]
 176
 177     def _real_extract(self, url):
 178         mobj = self._match_valid_url(url)
 179         part = self._download_json(
 180             url, '{}/{}'.format(mobj.group('course_id'), mobj.group('part')),
 181             'Downloading part JSON')
 182         web_url = part['web_url']
 183         if 'library/view' in web_url:
 184             web_url = web_url.replace('library/view', 'videos')
 185             natural_keys = part['natural_key']
 186             web_url = f'{web_url.rsplit("/", 1)[0]}/{natural_keys[0]}-{natural_keys[1][:-5]}'
 187         return self.url_result(web_url, SafariIE.ie_key())
 188
 189
 190 class SafariCourseIE(SafariBaseIE):
 191     IE_NAME = 'safari:course'
 192     IE_DESC = 'safaribooksonline.com online courses'
 193
 194     _VALID_URL = r'''(?x)
 195                     https?://
 196                         (?:
 197                             (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/
 198                             (?:
 199                                 library/view/[^/]+|
 200                                 api/v1/book|
 201                                 videos/[^/]+
 202                             )|
 203                             techbus\.safaribooksonline\.com
 204                         )
 205                         /(?P<id>[^/]+)
 206                     '''
 207
 208     _TESTS = [{
 209         'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
 210         'info_dict': {
 211             'id': '9780133392838',
 212             'title': 'Hadoop Fundamentals LiveLessons',
 213         },
 214         'playlist_count': 22,
 215         'skip': 'Requires safaribooksonline account credentials',
 216     }, {
 217         'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json',
 218         'only_matching': True,
 219     }, {
 220         'url': 'http://techbus.safaribooksonline.com/9780134426365',
 221         'only_matching': True,
 222     }, {
 223         'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314',
 224         'only_matching': True,
 225     }, {
 226         'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838',
 227         'only_matching': True,
 228     }, {
 229         'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
 230         'only_matching': True,
 231     }]
 232
 233     @classmethod
 234     def suitable(cls, url):
 235         return (False if SafariIE.suitable(url) or SafariApiIE.suitable(url)
 236                 else super().suitable(url))
 237
 238     def _real_extract(self, url):
 239         course_id = self._match_id(url)
 240
 241         course_json = self._download_json(
 242             f'{self._API_BASE}/book/{course_id}/?override_format={self._API_FORMAT}',
 243             course_id, 'Downloading course JSON')
 244
 245         if 'chapters' not in course_json:
 246             raise ExtractorError(
 247                 f'No chapters found for course {course_id}', expected=True)
 248
 249         entries = [
 250             self.url_result(chapter, SafariApiIE.ie_key())
 251             for chapter in course_json['chapters']]
 252
 253         course_title = course_json['title']
 254
 255         return self.playlist_result(entries, course_id, course_title)