yt_dlp/extractor/linuxacademy.py

   1 from __future__ import unicode_literals
   2
   3 import json
   4 import random
   5
   6 from .common import InfoExtractor
   7 from ..compat import (
   8     compat_b64decode,
   9     compat_HTTPError,
  10     compat_str,
  11 )
  12 from ..utils import (
  13     clean_html,
  14     ExtractorError,
  15     js_to_json,
  16     parse_duration,
  17     try_get,
  18     unified_timestamp,
  19     urlencode_postdata,
  20     urljoin,
  21 )
  22
  23
  24 class LinuxAcademyIE(InfoExtractor):
  25     _VALID_URL = r'''(?x)
  26                     https?://
  27                         (?:www\.)?linuxacademy\.com/cp/
  28                         (?:
  29                             courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)|
  30                             modules/view/id/(?P<course_id>\d+)
  31                         )
  32                     '''
  33     _TESTS = [{
  34         'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675',
  35         'info_dict': {
  36             'id': '7971-2',
  37             'ext': 'mp4',
  38             'title': 'What Is Data Science',
  39             'description': 'md5:c574a3c20607144fb36cb65bdde76c99',
  40             'timestamp': int,  # The timestamp and upload date changes
  41             'upload_date': r're:\d+',
  42             'duration': 304,
  43         },
  44         'params': {
  45             'skip_download': True,
  46         },
  47         'skip': 'Requires Linux Academy account credentials',
  48     }, {
  49         'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2',
  50         'only_matching': True,
  51     }, {
  52         'url': 'https://linuxacademy.com/cp/modules/view/id/154',
  53         'info_dict': {
  54             'id': '154',
  55             'title': 'AWS Certified Cloud Practitioner',
  56             'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c',
  57             'duration': 28835,
  58         },
  59         'playlist_count': 41,
  60         'skip': 'Requires Linux Academy account credentials',
  61     }, {
  62         'url': 'https://linuxacademy.com/cp/modules/view/id/39',
  63         'info_dict': {
  64             'id': '39',
  65             'title': 'Red Hat Certified Systems Administrator - RHCSA (EX200) Exam Prep  (legacy)',
  66             'description': 'md5:0f1d3369e90c3fb14a79813b863c902f',
  67             'duration': 89280,
  68         },
  69         'playlist_count': 73,
  70         'skip': 'Requires Linux Academy account credentials',
  71     }]
  72
  73     _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
  74     _ORIGIN_URL = 'https://linuxacademy.com'
  75     _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
  76     _NETRC_MACHINE = 'linuxacademy'
  77
  78     def _perform_login(self, username, password):
  79         def random_string():
  80             return ''.join([
  81                 random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~')
  82                 for _ in range(32)])
  83
  84         webpage, urlh = self._download_webpage_handle(
  85             self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
  86                 'client_id': self._CLIENT_ID,
  87                 'response_type': 'token id_token',
  88                 'response_mode': 'web_message',
  89                 'redirect_uri': self._ORIGIN_URL,
  90                 'scope': 'openid email user_impersonation profile',
  91                 'audience': self._ORIGIN_URL,
  92                 'state': random_string(),
  93                 'nonce': random_string(),
  94             })
  95
  96         login_data = self._parse_json(
  97             self._search_regex(
  98                 r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
  99                 'login info', group='value'), None,
 100             transform_source=lambda x: compat_b64decode(x).decode('utf-8')
 101         )['extraParams']
 102
 103         login_data.update({
 104             'client_id': self._CLIENT_ID,
 105             'redirect_uri': self._ORIGIN_URL,
 106             'tenant': 'lacausers',
 107             'connection': 'Username-Password-ACG-Proxy',
 108             'username': username,
 109             'password': password,
 110             'sso': 'true',
 111         })
 112
 113         login_state_url = urlh.geturl()
 114
 115         try:
 116             login_page = self._download_webpage(
 117                 'https://login.linuxacademy.com/usernamepassword/login', None,
 118                 'Downloading login page', data=json.dumps(login_data).encode(),
 119                 headers={
 120                     'Content-Type': 'application/json',
 121                     'Origin': 'https://login.linuxacademy.com',
 122                     'Referer': login_state_url,
 123                 })
 124         except ExtractorError as e:
 125             if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
 126                 error = self._parse_json(e.cause.read(), None)
 127                 message = error.get('description') or error['code']
 128                 raise ExtractorError(
 129                     '%s said: %s' % (self.IE_NAME, message), expected=True)
 130             raise
 131
 132         callback_page, urlh = self._download_webpage_handle(
 133             'https://login.linuxacademy.com/login/callback', None,
 134             'Downloading callback page',
 135             data=urlencode_postdata(self._hidden_inputs(login_page)),
 136             headers={
 137                 'Content-Type': 'application/x-www-form-urlencoded',
 138                 'Origin': 'https://login.linuxacademy.com',
 139                 'Referer': login_state_url,
 140             })
 141
 142         access_token = self._search_regex(
 143             r'access_token=([^=&]+)', urlh.geturl(),
 144             'access token', default=None)
 145         if not access_token:
 146             access_token = self._parse_json(
 147                 self._search_regex(
 148                     r'authorizationResponse\s*=\s*({.+?})\s*;', callback_page,
 149                     'authorization response'), None,
 150                 transform_source=js_to_json)['response']['access_token']
 151
 152         self._download_webpage(
 153             'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s'
 154             % access_token, None, 'Downloading token validation page')
 155
 156     def _real_extract(self, url):
 157         mobj = self._match_valid_url(url)
 158         chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id')
 159         item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id)
 160
 161         webpage = self._download_webpage(url, item_id)
 162
 163         # course path
 164         if course_id:
 165             module = self._parse_json(
 166                 self._search_regex(
 167                     r'window\.module\s*=\s*({(?:(?!};)[^"]|"([^"]|\\")*")+})\s*;', webpage, 'module'),
 168                 item_id)
 169             entries = []
 170             chapter_number = None
 171             chapter = None
 172             chapter_id = None
 173             for item in module['items']:
 174                 if not isinstance(item, dict):
 175                     continue
 176
 177                 def type_field(key):
 178                     return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower()
 179                 type_fields = (type_field('name'), type_field('slug'))
 180                 # Move to next module section
 181                 if 'section' in type_fields:
 182                     chapter = item.get('course_name')
 183                     chapter_id = item.get('course_module')
 184                     chapter_number = 1 if not chapter_number else chapter_number + 1
 185                     continue
 186                 # Skip non-lessons
 187                 if 'lesson' not in type_fields:
 188                     continue
 189                 lesson_url = urljoin(url, item.get('url'))
 190                 if not lesson_url:
 191                     continue
 192                 title = item.get('title') or item.get('lesson_name')
 193                 description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text'))
 194                 entries.append({
 195                     '_type': 'url_transparent',
 196                     'url': lesson_url,
 197                     'ie_key': LinuxAcademyIE.ie_key(),
 198                     'title': title,
 199                     'description': description,
 200                     'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')),
 201                     'duration': parse_duration(item.get('duration')),
 202                     'chapter': chapter,
 203                     'chapter_id': chapter_id,
 204                     'chapter_number': chapter_number,
 205                 })
 206             return {
 207                 '_type': 'playlist',
 208                 'entries': entries,
 209                 'id': course_id,
 210                 'title': module.get('title'),
 211                 'description': module.get('md_desc') or clean_html(module.get('desc')),
 212                 'duration': parse_duration(module.get('duration')),
 213             }
 214
 215         # single video path
 216         m3u8_url = self._parse_json(
 217             self._search_regex(
 218                 r'player\.playlist\s*=\s*(\[.+?\])\s*;', webpage, 'playlist'),
 219             item_id)[0]['file']
 220         formats = self._extract_m3u8_formats(
 221             m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native',
 222             m3u8_id='hls')
 223         self._sort_formats(formats)
 224         info = {
 225             'id': item_id,
 226             'formats': formats,
 227         }
 228         lesson = self._parse_json(
 229             self._search_regex(
 230                 (r'window\.lesson\s*=\s*({.+?})\s*;',
 231                  r'player\.lesson\s*=\s*({.+?})\s*;'),
 232                 webpage, 'lesson', default='{}'), item_id, fatal=False)
 233         if lesson:
 234             info.update({
 235                 'title': lesson.get('lesson_name'),
 236                 'description': lesson.get('md_desc') or clean_html(lesson.get('desc')),
 237                 'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')),
 238                 'duration': parse_duration(lesson.get('duration')),
 239             })
 240         if not info.get('title'):
 241             info['title'] = self._search_regex(
 242                 (r'>Lecture\s*:\s*(?P<value>[^<]+)',
 243                  r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
 244                 'title', group='value')
 245         return info