yt_dlp/extractor/alura.py

   1 import re
   2
   3 from .common import InfoExtractor
   4
   5 from ..compat import (
   6     compat_urlparse,
   7 )
   8
   9 from ..utils import (
  10     urlencode_postdata,
  11     urljoin,
  12     int_or_none,
  13     clean_html,
  14     ExtractorError
  15 )
  16
  17
  18 class AluraIE(InfoExtractor):
  19     _VALID_URL = r'https?://(?:cursos\.)?alura\.com\.br/course/(?P<course_name>[^/]+)/task/(?P<id>\d+)'
  20     _LOGIN_URL = 'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm'
  21     _VIDEO_URL = 'https://cursos.alura.com.br/course/%s/task/%s/video'
  22     _NETRC_MACHINE = 'alura'
  23     _TESTS = [{
  24         'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs/task/60095',
  25         'info_dict': {
  26             'id': '60095',
  27             'ext': 'mp4',
  28             'title': 'Referências, ref-set e alter'
  29         },
  30         'skip': 'Requires alura account credentials'},
  31         {
  32             # URL without video
  33             'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs/task/60098',
  34             'only_matching': True},
  35         {
  36             'url': 'https://cursos.alura.com.br/course/fundamentos-market-digital/task/55219',
  37             'only_matching': True}
  38     ]
  39
  40     def _real_extract(self, url):
  41
  42         course, video_id = self._match_valid_url(url)
  43         video_url = self._VIDEO_URL % (course, video_id)
  44
  45         video_dict = self._download_json(video_url, video_id, 'Searching for videos')
  46
  47         if video_dict:
  48             webpage = self._download_webpage(url, video_id)
  49             video_title = clean_html(self._search_regex(
  50                 r'<span[^>]+class=(["\'])task-body-header-title-text\1[^>]*>(?P<title>[^<]+)',
  51                 webpage, 'title', group='title'))
  52
  53             formats = []
  54             for video_obj in video_dict:
  55                 video_url_m3u8 = video_obj.get('link')
  56                 video_format = self._extract_m3u8_formats(
  57                     video_url_m3u8, None, 'mp4', entry_protocol='m3u8_native',
  58                     m3u8_id='hls', fatal=False)
  59                 for f in video_format:
  60                     m = re.search(r'^[\w \W]*-(?P<res>\w*).mp4[\W \w]*', f['url'])
  61                     if m:
  62                         if not f.get('height'):
  63                             f['height'] = int('720' if m.group('res') == 'hd' else '480')
  64                 formats.extend(video_format)
  65
  66             self._sort_formats(formats)
  67
  68             return {
  69                 'id': video_id,
  70                 'title': video_title,
  71                 "formats": formats
  72             }
  73
  74     def _perform_login(self, username, password):
  75
  76         login_page = self._download_webpage(
  77             self._LOGIN_URL, None, 'Downloading login popup')
  78
  79         def is_logged(webpage):
  80             return any(re.search(p, webpage) for p in (
  81                 r'href=[\"|\']?/signout[\"|\']',
  82                 r'>Logout<'))
  83
  84         # already logged in
  85         if is_logged(login_page):
  86             return
  87
  88         login_form = self._hidden_inputs(login_page)
  89
  90         login_form.update({
  91             'username': username,
  92             'password': password,
  93         })
  94
  95         post_url = self._search_regex(
  96             r'<form[^>]+class=["|\']signin-form["|\'] action=["|\'](?P<url>.+?)["|\']', login_page,
  97             'post url', default=self._LOGIN_URL, group='url')
  98
  99         if not post_url.startswith('http'):
 100             post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
 101
 102         response = self._download_webpage(
 103             post_url, None, 'Logging in',
 104             data=urlencode_postdata(login_form),
 105             headers={'Content-Type': 'application/x-www-form-urlencoded'})
 106
 107         if not is_logged(response):
 108             error = self._html_search_regex(
 109                 r'(?s)<p[^>]+class="alert-message[^"]*">(.+?)</p>',
 110                 response, 'error message', default=None)
 111             if error:
 112                 raise ExtractorError('Unable to login: %s' % error, expected=True)
 113             raise ExtractorError('Unable to log in')
 114
 115
 116 class AluraCourseIE(AluraIE):
 117
 118     _VALID_URL = r'https?://(?:cursos\.)?alura\.com\.br/course/(?P<id>[^/]+)'
 119     _LOGIN_URL = 'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm'
 120     _NETRC_MACHINE = 'aluracourse'
 121     _TESTS = [{
 122         'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs',
 123         'only_matching': True,
 124     }]
 125
 126     @classmethod
 127     def suitable(cls, url):
 128         return False if AluraIE.suitable(url) else super(AluraCourseIE, cls).suitable(url)
 129
 130     def _real_extract(self, url):
 131
 132         course_path = self._match_id(url)
 133         webpage = self._download_webpage(url, course_path)
 134
 135         course_title = self._search_regex(
 136             r'<h1.*?>(.*?)<strong>(?P<course_title>.*?)</strong></h[0-9]>', webpage,
 137             'course title', default=course_path, group='course_title')
 138
 139         entries = []
 140         if webpage:
 141             for path in re.findall(r'<a\b(?=[^>]* class="[^"]*(?<=[" ])courseSectionList-section[" ])(?=[^>]* href="([^"]*))', webpage):
 142                 page_url = urljoin(url, path)
 143                 section_path = self._download_webpage(page_url, course_path)
 144                 for path_video in re.findall(r'<a\b(?=[^>]* class="[^"]*(?<=[" ])task-menu-nav-item-link-VIDEO[" ])(?=[^>]* href="([^"]*))', section_path):
 145                     chapter = clean_html(
 146                         self._search_regex(
 147                             r'<h3[^>]+class=(["\'])task-menu-section-title-text\1[^>]*>(?P<chapter>[^<]+)',
 148                             section_path,
 149                             'chapter',
 150                             group='chapter'))
 151
 152                     chapter_number = int_or_none(
 153                         self._search_regex(
 154                             r'<span[^>]+class=(["\'])task-menu-section-title-number[^>]*>(.*?)<strong>(?P<chapter_number>[^<]+)</strong>',
 155                             section_path,
 156                             'chapter number',
 157                             group='chapter_number'))
 158                     video_url = urljoin(url, path_video)
 159
 160                     entry = {
 161                         '_type': 'url_transparent',
 162                         'id': self._match_id(video_url),
 163                         'url': video_url,
 164                         'id_key': self.ie_key(),
 165                         'chapter': chapter,
 166                         'chapter_number': chapter_number
 167                     }
 168                     entries.append(entry)
 169         return self.playlist_result(entries, course_path, course_title)