yt_dlp/extractor/safari.py

   1 import json
   2 import re
   3
   4 from .common import InfoExtractor
   5
   6 from ..compat import (
   7     compat_parse_qs,
   8     compat_urlparse,
   9 )
  10 from ..utils import (
  11     ExtractorError,
  12     update_url_query,
  13 )
  14
  15
  16 class SafariBaseIE(InfoExtractor):
  17     _LOGIN_URL = 'https://learning.oreilly.com/accounts/login/'
  18     _NETRC_MACHINE = 'safari'
  19
  20     _API_BASE = 'https://learning.oreilly.com/api/v1'
  21     _API_FORMAT = 'json'
  22
  23     LOGGED_IN = False
  24
  25     def _perform_login(self, username, password):
  26         _, urlh = self._download_webpage_handle(
  27             'https://learning.oreilly.com/accounts/login-check/', None,
  28             'Downloading login page')
  29
  30         def is_logged(urlh):
  31             return 'learning.oreilly.com/home/' in urlh.geturl()
  32
  33         if is_logged(urlh):
  34             self.LOGGED_IN = True
  35             return
  36
  37         redirect_url = urlh.geturl()
  38         parsed_url = compat_urlparse.urlparse(redirect_url)
  39         qs = compat_parse_qs(parsed_url.query)
  40         next_uri = compat_urlparse.urljoin(
  41             'https://api.oreilly.com', qs['next'][0])
  42
  43         auth, urlh = self._download_json_handle(
  44             'https://www.oreilly.com/member/auth/login/', None, 'Logging in',
  45             data=json.dumps({
  46                 'email': username,
  47                 'password': password,
  48                 'redirect_uri': next_uri,
  49             }).encode(), headers={
  50                 'Content-Type': 'application/json',
  51                 'Referer': redirect_url,
  52             }, expected_status=400)
  53
  54         credentials = auth.get('credentials')
  55         if (not auth.get('logged_in') and not auth.get('redirect_uri')
  56                 and credentials):
  57             raise ExtractorError(
  58                 'Unable to login: %s' % credentials, expected=True)
  59
  60         # oreilly serves two same instances of the following cookies
  61         # in Set-Cookie header and expects first one to be actually set
  62         for cookie in ('groot_sessionid', 'orm-jwt', 'orm-rt'):
  63             self._apply_first_set_cookie_header(urlh, cookie)
  64
  65         _, urlh = self._download_webpage_handle(
  66             auth.get('redirect_uri') or next_uri, None, 'Completing login',)
  67
  68         if is_logged(urlh):
  69             self.LOGGED_IN = True
  70             return
  71
  72         raise ExtractorError('Unable to log in')
  73
  74
  75 class SafariIE(SafariBaseIE):
  76     IE_NAME = 'safari'
  77     IE_DESC = 'safaribooksonline.com online video'
  78     _VALID_URL = r'''(?x)
  79                         https?://
  80                             (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/
  81                             (?:
  82                                 library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?\#&]+)\.html|
  83                                 videos/[^/]+/[^/]+/(?P<reference_id>[^-]+-[^/?\#&]+)
  84                             )
  85                     '''
  86
  87     _TESTS = [{
  88         'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html',
  89         'md5': 'dcc5a425e79f2564148652616af1f2a3',
  90         'info_dict': {
  91             'id': '0_qbqx90ic',
  92             'ext': 'mp4',
  93             'title': 'Introduction to Hadoop Fundamentals LiveLessons',
  94             'timestamp': 1437758058,
  95             'upload_date': '20150724',
  96             'uploader_id': 'stork',
  97         },
  98     }, {
  99         # non-digits in course id
 100         'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html',
 101         'only_matching': True,
 102     }, {
 103         'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html',
 104         'only_matching': True,
 105     }, {
 106         'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00',
 107         'only_matching': True,
 108     }, {
 109         'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838/9780133392838-00_SeriesIntro',
 110         'only_matching': True,
 111     }, {
 112         'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/00_SeriesIntro.html',
 113         'only_matching': True,
 114     }]
 115
 116     _PARTNER_ID = '1926081'
 117     _UICONF_ID = '29375172'
 118
 119     def _real_extract(self, url):
 120         mobj = self._match_valid_url(url)
 121
 122         reference_id = mobj.group('reference_id')
 123         if reference_id:
 124             video_id = reference_id
 125             partner_id = self._PARTNER_ID
 126             ui_id = self._UICONF_ID
 127         else:
 128             video_id = '%s-%s' % (mobj.group('course_id'), mobj.group('part'))
 129
 130             webpage, urlh = self._download_webpage_handle(url, video_id)
 131
 132             mobj = re.match(self._VALID_URL, urlh.geturl())
 133             reference_id = mobj.group('reference_id')
 134             if not reference_id:
 135                 reference_id = self._search_regex(
 136                     r'data-reference-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
 137                     webpage, 'kaltura reference id', group='id')
 138             partner_id = self._search_regex(
 139                 r'data-partner-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
 140                 webpage, 'kaltura widget id', default=self._PARTNER_ID,
 141                 group='id')
 142             ui_id = self._search_regex(
 143                 r'data-ui-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
 144                 webpage, 'kaltura uiconf id', default=self._UICONF_ID,
 145                 group='id')
 146
 147         query = {
 148             'wid': '_%s' % partner_id,
 149             'uiconf_id': ui_id,
 150             'flashvars[referenceId]': reference_id,
 151         }
 152
 153         if self.LOGGED_IN:
 154             kaltura_session = self._download_json(
 155                 '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id),
 156                 video_id, 'Downloading kaltura session JSON',
 157                 'Unable to download kaltura session JSON', fatal=False,
 158                 headers={'Accept': 'application/json'})
 159             if kaltura_session:
 160                 session = kaltura_session.get('session')
 161                 if session:
 162                     query['flashvars[ks]'] = session
 163
 164         return self.url_result(update_url_query(
 165             'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query),
 166             'Kaltura')
 167
 168
 169 class SafariApiIE(SafariBaseIE):
 170     IE_NAME = 'safari:api'
 171     _VALID_URL = r'https?://(?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>[^/?#&]+)\.html'
 172
 173     _TESTS = [{
 174         'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
 175         'only_matching': True,
 176     }, {
 177         'url': 'https://www.safaribooksonline.com/api/v1/book/9780134664057/chapter/RHCE_Introduction.html',
 178         'only_matching': True,
 179     }]
 180
 181     def _real_extract(self, url):
 182         mobj = self._match_valid_url(url)
 183         part = self._download_json(
 184             url, '%s/%s' % (mobj.group('course_id'), mobj.group('part')),
 185             'Downloading part JSON')
 186         web_url = part['web_url']
 187         if 'library/view' in web_url:
 188             web_url = web_url.replace('library/view', 'videos')
 189             natural_keys = part['natural_key']
 190             web_url = f'{web_url.rsplit("/", 1)[0]}/{natural_keys[0]}-{natural_keys[1][:-5]}'
 191         return self.url_result(web_url, SafariIE.ie_key())
 192
 193
 194 class SafariCourseIE(SafariBaseIE):
 195     IE_NAME = 'safari:course'
 196     IE_DESC = 'safaribooksonline.com online courses'
 197
 198     _VALID_URL = r'''(?x)
 199                     https?://
 200                         (?:
 201                             (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/
 202                             (?:
 203                                 library/view/[^/]+|
 204                                 api/v1/book|
 205                                 videos/[^/]+
 206                             )|
 207                             techbus\.safaribooksonline\.com
 208                         )
 209                         /(?P<id>[^/]+)
 210                     '''
 211
 212     _TESTS = [{
 213         'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
 214         'info_dict': {
 215             'id': '9780133392838',
 216             'title': 'Hadoop Fundamentals LiveLessons',
 217         },
 218         'playlist_count': 22,
 219         'skip': 'Requires safaribooksonline account credentials',
 220     }, {
 221         'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json',
 222         'only_matching': True,
 223     }, {
 224         'url': 'http://techbus.safaribooksonline.com/9780134426365',
 225         'only_matching': True,
 226     }, {
 227         'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314',
 228         'only_matching': True,
 229     }, {
 230         'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838',
 231         'only_matching': True,
 232     }, {
 233         'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
 234         'only_matching': True,
 235     }]
 236
 237     @classmethod
 238     def suitable(cls, url):
 239         return (False if SafariIE.suitable(url) or SafariApiIE.suitable(url)
 240                 else super(SafariCourseIE, cls).suitable(url))
 241
 242     def _real_extract(self, url):
 243         course_id = self._match_id(url)
 244
 245         course_json = self._download_json(
 246             '%s/book/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT),
 247             course_id, 'Downloading course JSON')
 248
 249         if 'chapters' not in course_json:
 250             raise ExtractorError(
 251                 'No chapters found for course %s' % course_id, expected=True)
 252
 253         entries = [
 254             self.url_result(chapter, SafariApiIE.ie_key())
 255             for chapter in course_json['chapters']]
 256
 257         course_title = course_json['title']
 258
 259         return self.playlist_result(entries, course_id, course_title)