yt_dlp/extractor/lynda.py

   1 import re
   2 import urllib.parse
   3
   4 from .common import InfoExtractor
   5 from ..utils import (
   6     ExtractorError,
   7     int_or_none,
   8     urlencode_postdata,
   9 )
  10
  11
  12 class LyndaBaseIE(InfoExtractor):
  13     _SIGNIN_URL = 'https://www.lynda.com/signin/lynda'
  14     _PASSWORD_URL = 'https://www.lynda.com/signin/password'
  15     _USER_URL = 'https://www.lynda.com/signin/user'
  16     _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'
  17     _NETRC_MACHINE = 'lynda'
  18
  19     @staticmethod
  20     def _check_error(json_string, key_or_keys):
  21         keys = [key_or_keys] if isinstance(key_or_keys, str) else key_or_keys
  22         for key in keys:
  23             error = json_string.get(key)
  24             if error:
  25                 raise ExtractorError(f'Unable to login: {error}', expected=True)
  26
  27     def _perform_login_step(self, form_html, fallback_action_url, extra_form_data, note, referrer_url):
  28         action_url = self._search_regex(
  29             r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_html,
  30             'post url', default=fallback_action_url, group='url')
  31
  32         if not action_url.startswith('http'):
  33             action_url = urllib.parse.urljoin(self._SIGNIN_URL, action_url)
  34
  35         form_data = self._hidden_inputs(form_html)
  36         form_data.update(extra_form_data)
  37
  38         response = self._download_json(
  39             action_url, None, note,
  40             data=urlencode_postdata(form_data),
  41             headers={
  42                 'Referer': referrer_url,
  43                 'X-Requested-With': 'XMLHttpRequest',
  44             }, expected_status=(418, 500))
  45
  46         self._check_error(response, ('email', 'password', 'ErrorMessage'))
  47
  48         return response, action_url
  49
  50     def _perform_login(self, username, password):
  51         # Step 1: download signin page
  52         signin_page = self._download_webpage(
  53             self._SIGNIN_URL, None, 'Downloading signin page')
  54
  55         # Already logged in
  56         if any(re.search(p, signin_page) for p in (
  57                 r'isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')):
  58             return
  59
  60         # Step 2: submit email
  61         signin_form = self._search_regex(
  62             r'(?s)(<form[^>]+data-form-name=["\']signin["\'][^>]*>.+?</form>)',
  63             signin_page, 'signin form')
  64         signin_page, signin_url = self._login_step(
  65             signin_form, self._PASSWORD_URL, {'email': username},
  66             'Submitting email', self._SIGNIN_URL)
  67
  68         # Step 3: submit password
  69         password_form = signin_page['body']
  70         self._login_step(
  71             password_form, self._USER_URL, {'email': username, 'password': password},
  72             'Submitting password', signin_url)
  73
  74
  75 class LyndaIE(LyndaBaseIE):
  76     IE_NAME = 'lynda'
  77     IE_DESC = 'lynda.com videos'
  78     _VALID_URL = r'''(?x)
  79                     https?://
  80                         (?:www\.)?(?:lynda\.com|educourse\.ga)/
  81                         (?:
  82                             (?:[^/]+/){2,3}(?P<course_id>\d+)|
  83                             player/embed
  84                         )/
  85                         (?P<id>\d+)
  86                     '''
  87
  88     _TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]'
  89
  90     _TESTS = [{
  91         'url': 'https://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',
  92         # md5 is unstable
  93         'info_dict': {
  94             'id': '114408',
  95             'ext': 'mp4',
  96             'title': 'Using the exercise files',
  97             'duration': 68,
  98         },
  99     }, {
 100         'url': 'https://www.lynda.com/player/embed/133770?tr=foo=1;bar=g;fizz=rt&fs=0',
 101         'only_matching': True,
 102     }, {
 103         'url': 'https://educourse.ga/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',
 104         'only_matching': True,
 105     }, {
 106         'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Willkommen-Grundlagen-guten-Gestaltung/393570/393572-4.html',
 107         'only_matching': True,
 108     }, {
 109         # Status="NotFound", Message="Transcript not found"
 110         'url': 'https://www.lynda.com/ASP-NET-tutorials/What-you-should-know/5034180/2811512-4.html',
 111         'only_matching': True,
 112     }]
 113
 114     def _raise_unavailable(self, video_id):
 115         self.raise_login_required(
 116             f'Video {video_id} is only available for members')
 117
 118     def _real_extract(self, url):
 119         mobj = self._match_valid_url(url)
 120         video_id = mobj.group('id')
 121         course_id = mobj.group('course_id')
 122
 123         query = {
 124             'videoId': video_id,
 125             'type': 'video',
 126         }
 127
 128         video = self._download_json(
 129             'https://www.lynda.com/ajax/player', video_id,
 130             'Downloading video JSON', fatal=False, query=query)
 131
 132         # Fallback scenario
 133         if not video:
 134             query['courseId'] = course_id
 135
 136             play = self._download_json(
 137                 f'https://www.lynda.com/ajax/course/{course_id}/{video_id}/play', video_id, 'Downloading play JSON')
 138
 139             if not play:
 140                 self._raise_unavailable(video_id)
 141
 142             formats = []
 143             for formats_dict in play:
 144                 urls = formats_dict.get('urls')
 145                 if not isinstance(urls, dict):
 146                     continue
 147                 cdn = formats_dict.get('name')
 148                 for format_id, format_url in urls.items():
 149                     if not format_url:
 150                         continue
 151                     formats.append({
 152                         'url': format_url,
 153                         'format_id': f'{cdn}-{format_id}' if cdn else format_id,
 154                         'height': int_or_none(format_id),
 155                     })
 156
 157             conviva = self._download_json(
 158                 'https://www.lynda.com/ajax/player/conviva', video_id,
 159                 'Downloading conviva JSON', query=query)
 160
 161             return {
 162                 'id': video_id,
 163                 'title': conviva['VideoTitle'],
 164                 'description': conviva.get('VideoDescription'),
 165                 'release_year': int_or_none(conviva.get('ReleaseYear')),
 166                 'duration': int_or_none(conviva.get('Duration')),
 167                 'creator': conviva.get('Author'),
 168                 'formats': formats,
 169             }
 170
 171         if 'Status' in video:
 172             raise ExtractorError(
 173                 'lynda returned error: {}'.format(video['Message']), expected=True)
 174
 175         if video.get('HasAccess') is False:
 176             self._raise_unavailable(video_id)
 177
 178         video_id = str(video.get('ID') or video_id)
 179         duration = int_or_none(video.get('DurationInSeconds'))
 180         title = video['Title']
 181
 182         formats = []
 183
 184         fmts = video.get('Formats')
 185         if fmts:
 186             formats.extend([{
 187                 'url': f['Url'],
 188                 'ext': f.get('Extension'),
 189                 'width': int_or_none(f.get('Width')),
 190                 'height': int_or_none(f.get('Height')),
 191                 'filesize': int_or_none(f.get('FileSize')),
 192                 'format_id': str(f.get('Resolution')) if f.get('Resolution') else None,
 193             } for f in fmts if f.get('Url')])
 194
 195         prioritized_streams = video.get('PrioritizedStreams')
 196         if prioritized_streams:
 197             for prioritized_stream_id, prioritized_stream in prioritized_streams.items():
 198                 formats.extend([{
 199                     'url': video_url,
 200                     'height': int_or_none(format_id),
 201                     'format_id': f'{prioritized_stream_id}-{format_id}',
 202                 } for format_id, video_url in prioritized_stream.items()])
 203
 204         self._check_formats(formats, video_id)
 205
 206         subtitles = self.extract_subtitles(video_id)
 207
 208         return {
 209             'id': video_id,
 210             'title': title,
 211             'duration': duration,
 212             'subtitles': subtitles,
 213             'formats': formats,
 214         }
 215
 216     def _fix_subtitles(self, subs):
 217         srt = ''
 218         seq_counter = 0
 219         for seq_current, seq_next in zip(subs, subs[1:]):
 220             m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode'])
 221             if m_current is None:
 222                 continue
 223             m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode'])
 224             if m_next is None:
 225                 continue
 226             appear_time = m_current.group('timecode')
 227             disappear_time = m_next.group('timecode')
 228             text = seq_current['Caption'].strip()
 229             if text:
 230                 seq_counter += 1
 231                 srt += f'{seq_counter}\r\n{appear_time} --> {disappear_time}\r\n{text}\r\n\r\n'
 232         if srt:
 233             return srt
 234
 235     def _get_subtitles(self, video_id):
 236         url = f'https://www.lynda.com/ajax/player?videoId={video_id}&type=transcript'
 237         subs = self._download_webpage(
 238             url, video_id, 'Downloading subtitles JSON', fatal=False)
 239         if not subs or 'Status="NotFound"' in subs:
 240             return {}
 241         subs = self._parse_json(subs, video_id, fatal=False)
 242         if not subs:
 243             return {}
 244         fixed_subs = self._fix_subtitles(subs)
 245         if fixed_subs:
 246             return {'en': [{'ext': 'srt', 'data': fixed_subs}]}
 247         return {}
 248
 249
 250 class LyndaCourseIE(LyndaBaseIE):
 251     IE_NAME = 'lynda:course'
 252     IE_DESC = 'lynda.com online courses'
 253
 254     # Course link equals to welcome/introduction video link of same course
 255     # We will recognize it as course link
 256     _VALID_URL = r'https?://(?:www|m)\.(?:lynda\.com|educourse\.ga)/(?P<coursepath>(?:[^/]+/){2,3}(?P<courseid>\d+))-2\.html'
 257
 258     _TESTS = [{
 259         'url': 'https://www.lynda.com/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html',
 260         'only_matching': True,
 261     }, {
 262         'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html',
 263         'only_matching': True,
 264     }]
 265
 266     def _real_extract(self, url):
 267         mobj = self._match_valid_url(url)
 268         course_path = mobj.group('coursepath')
 269         course_id = mobj.group('courseid')
 270
 271         item_template = f'https://www.lynda.com/{course_path}/%s-4.html'
 272
 273         course = self._download_json(
 274             f'https://www.lynda.com/ajax/player?courseId={course_id}&type=course',
 275             course_id, 'Downloading course JSON', fatal=False)
 276
 277         if not course:
 278             webpage = self._download_webpage(url, course_id)
 279             entries = [
 280                 self.url_result(
 281                     item_template % video_id, ie=LyndaIE.ie_key(),
 282                     video_id=video_id)
 283                 for video_id in re.findall(
 284                     r'data-video-id=["\'](\d+)', webpage)]
 285             return self.playlist_result(
 286                 entries, course_id,
 287                 self._og_search_title(webpage, fatal=False),
 288                 self._og_search_description(webpage))
 289
 290         if course.get('Status') == 'NotFound':
 291             raise ExtractorError(
 292                 f'Course {course_id} does not exist', expected=True)
 293
 294         unaccessible_videos = 0
 295         entries = []
 296
 297         # Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided
 298         # by single video API anymore
 299
 300         for chapter in course['Chapters']:
 301             for video in chapter.get('Videos', []):
 302                 if video.get('HasAccess') is False:
 303                     unaccessible_videos += 1
 304                     continue
 305                 video_id = video.get('ID')
 306                 if video_id:
 307                     entries.append({
 308                         '_type': 'url_transparent',
 309                         'url': item_template % video_id,
 310                         'ie_key': LyndaIE.ie_key(),
 311                         'chapter': chapter.get('Title'),
 312                         'chapter_number': int_or_none(chapter.get('ChapterIndex')),
 313                         'chapter_id': str(chapter.get('ID')),
 314                     })
 315
 316         if unaccessible_videos > 0:
 317             self.report_warning(
 318                 f'{unaccessible_videos} videos are only available for members (or paid members) '
 319                 f'and will not be downloaded. {self._ACCOUNT_CREDENTIALS_HINT}')
 320
 321         course_title = course.get('Title')
 322         course_description = course.get('Description')
 323
 324         return self.playlist_result(entries, course_id, course_title, course_description)