# [yt-dlp.git] / yt_dlp / extractor / linuxacademy.py

import json
import random

from .common import InfoExtractor
from ..compat import compat_b64decode, compat_str
from ..networking.exceptions import HTTPError
from ..utils import (
    clean_html,
    ExtractorError,
    js_to_json,
    parse_duration,
    try_get,
    unified_timestamp,
    urlencode_postdata,
    urljoin,
)


class LinuxAcademyIE(InfoExtractor):
    """Extractor for Linux Academy lessons and course modules.

    Lesson URLs (``/cp/courses/lesson/course/<id>/lesson/<id>``) are resolved
    to a single video with HLS formats; module URLs
    (``/cp/modules/view/id/<id>``) are resolved to a playlist whose entries
    are ``url_transparent`` results handled by this same extractor.
    """

    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?linuxacademy\.com/cp/
                        (?:
                            courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)|
                            modules/view/id/(?P<course_id>\d+)
                        )
                    '''
    _TESTS = [{
        'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675',
        'info_dict': {
            'id': '7971-2',
            'ext': 'mp4',
            'title': 'What Is Data Science',
            'description': 'md5:c574a3c20607144fb36cb65bdde76c99',
            'timestamp': int,  # The timestamp and upload date changes
            'upload_date': r're:\d+',
            'duration': 304,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Requires Linux Academy account credentials',
    }, {
        'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2',
        'only_matching': True,
    }, {
        'url': 'https://linuxacademy.com/cp/modules/view/id/154',
        'info_dict': {
            'id': '154',
            'title': 'AWS Certified Cloud Practitioner',
            'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c',
            'duration': 28835,
        },
        'playlist_count': 41,
        'skip': 'Requires Linux Academy account credentials',
    }, {
        'url': 'https://linuxacademy.com/cp/modules/view/id/39',
        'info_dict': {
            'id': '39',
            'title': 'Red Hat Certified Systems Administrator - RHCSA (EX200) Exam Prep  (legacy)',
            'description': 'md5:0f1d3369e90c3fb14a79813b863c902f',
            'duration': 89280,
        },
        'playlist_count': 73,
        'skip': 'Requires Linux Academy account credentials',
    }]

    _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
    _ORIGIN_URL = 'https://linuxacademy.com'
    _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
    _NETRC_MACHINE = 'linuxacademy'

    def _perform_login(self, username, password):
        """Log in with *username* / *password* and validate the access token.

        The flow is: download the authorize page, POST the credentials as
        JSON to the username/password endpoint, replay the hidden form
        inputs of that response to the login callback, and finally request
        the site's token validation URL with the obtained access token.

        Raises:
            ExtractorError: marked as expected, carrying the server-supplied
                message, when the credentials endpoint answers HTTP 401 with
                a JSON body containing ``description`` or ``code``. Any other
                download failure is re-raised unchanged.
        """
        def random_string():
            # NOTE(review): the alphabet contains no 'W'; it is reproduced
            # as-is since the values only serve as random state/nonce.
            return ''.join(random.choices(
                '0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~', k=32))

        webpage, urlh = self._download_webpage_handle(
            self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
                'client_id': self._CLIENT_ID,
                'response_type': 'token id_token',
                'response_mode': 'web_message',
                'redirect_uri': self._ORIGIN_URL,
                'scope': 'openid email user_impersonation profile',
                'audience': self._ORIGIN_URL,
                'state': random_string(),
                'nonce': random_string(),
            })

        # The authorize page embeds base64-encoded JSON inside an atob() call;
        # its 'extraParams' object is the basis of the login payload.
        login_data = self._parse_json(
            self._search_regex(
                r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
                'login info', group='value'), None,
            transform_source=lambda x: compat_b64decode(x).decode('utf-8')
        )['extraParams']

        login_data.update({
            'client_id': self._CLIENT_ID,
            'redirect_uri': self._ORIGIN_URL,
            'tenant': 'lacausers',
            'connection': 'Username-Password-ACG-Proxy',
            'username': username,
            'password': password,
            'sso': 'true',
        })

        # Final URL of the authorize request, reused as Referer below.
        login_state_url = urlh.url

        try:
            login_page = self._download_webpage(
                'https://login.linuxacademy.com/usernamepassword/login', None,
                'Downloading login page', data=json.dumps(login_data).encode(),
                headers={
                    'Content-Type': 'application/json',
                    'Origin': 'https://login.linuxacademy.com',
                    'Referer': login_state_url,
                })
        except ExtractorError as e:
            if isinstance(e.cause, HTTPError) and e.cause.status == 401:
                # Parse leniently: a non-JSON body or a body without a usable
                # message must not hide the original HTTP error.
                error = self._parse_json(
                    e.cause.response.read(), None, fatal=False)
                if isinstance(error, dict):
                    message = error.get('description') or error.get('code')
                    if message:
                        raise ExtractorError(
                            '%s said: %s' % (self.IE_NAME, message), expected=True)
            raise

        callback_page, urlh = self._download_webpage_handle(
            'https://login.linuxacademy.com/login/callback', None,
            'Downloading callback page',
            data=urlencode_postdata(self._hidden_inputs(login_page)),
            headers={
                'Content-Type': 'application/x-www-form-urlencoded',
                'Origin': 'https://login.linuxacademy.com',
                'Referer': login_state_url,
            })

        # Prefer the token from the final callback URL; otherwise fall back
        # to the authorizationResponse object embedded in the callback page.
        access_token = self._search_regex(
            r'access_token=([^=&]+)', urlh.url,
            'access token', default=None)
        if not access_token:
            access_token = self._parse_json(
                self._search_regex(
                    r'authorizationResponse\s*=\s*({.+?})\s*;', callback_page,
                    'authorization response'), None,
                transform_source=js_to_json)['response']['access_token']

        self._download_webpage(
            'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s'
            % access_token, None, 'Downloading token validation page')

    def _real_extract(self, url):
        """Return a playlist for module URLs or a single video for lesson URLs."""
        mobj = self._match_valid_url(url)
        chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id')
        item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id)

        webpage = self._download_webpage(url, item_id)

        if course_id:
            return self._extract_course_playlist(url, course_id, webpage)
        return self._extract_single_lesson(item_id, webpage)

    @staticmethod
    def _item_type_fields(item):
        """Return the lowercased ``type.name`` and ``type.slug`` strings of *item*.

        A field that is missing or not a string is reported as ``''``.
        """
        return tuple(
            (try_get(item, lambda x, key=key: x['type'][key], compat_str) or '').lower()
            for key in ('name', 'slug'))

    def _extract_course_playlist(self, url, course_id, webpage):
        """Build the playlist result for a module page from ``window.module``.

        Items typed as 'section' start a new chapter; items typed as 'lesson'
        become ``url_transparent`` entries tagged with the current chapter.
        Everything else is skipped.
        """
        module = self._parse_json(
            self._search_regex(
                r'window\.module\s*=\s*({(?:(?!};)[^"]|"([^"]|\\")*")+})\s*;', webpage, 'module'),
            course_id)
        entries = []
        # Chapter state stays None for lessons listed before the first section.
        chapter_number = None
        chapter = None
        chapter_id = None
        for item in module['items']:
            if not isinstance(item, dict):
                continue
            type_fields = self._item_type_fields(item)
            # Move to next module section
            if 'section' in type_fields:
                chapter = item.get('course_name')
                chapter_id = item.get('course_module')
                chapter_number = (chapter_number or 0) + 1
                continue
            # Skip non-lessons
            if 'lesson' not in type_fields:
                continue
            lesson_url = urljoin(url, item.get('url'))
            if not lesson_url:
                continue
            title = item.get('title') or item.get('lesson_name')
            description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text'))
            entries.append({
                '_type': 'url_transparent',
                'url': lesson_url,
                'ie_key': LinuxAcademyIE.ie_key(),
                'title': title,
                'description': description,
                'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')),
                'duration': parse_duration(item.get('duration')),
                'chapter': chapter,
                'chapter_id': chapter_id,
                'chapter_number': chapter_number,
            })
        return {
            '_type': 'playlist',
            'entries': entries,
            'id': course_id,
            'title': module.get('title'),
            'description': module.get('md_desc') or clean_html(module.get('desc')),
            'duration': parse_duration(module.get('duration')),
        }

    def _extract_single_lesson(self, item_id, webpage):
        """Build the video result for a lesson page.

        The HLS manifest URL is the ``file`` of the first ``player.playlist``
        entry. Metadata comes from the optional ``window.lesson`` /
        ``player.lesson`` object; the title falls back to page markup.
        """
        m3u8_url = self._parse_json(
            self._search_regex(
                r'player\.playlist\s*=\s*(\[.+?\])\s*;', webpage, 'playlist'),
            item_id)[0]['file']
        formats = self._extract_m3u8_formats(
            m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native',
            m3u8_id='hls')
        info = {
            'id': item_id,
            'formats': formats,
        }
        # The lesson object is optional: a missing match yields '{}' and a
        # parse failure is non-fatal, so extraction continues without it.
        lesson = self._parse_json(
            self._search_regex(
                (r'window\.lesson\s*=\s*({.+?})\s*;',
                 r'player\.lesson\s*=\s*({.+?})\s*;'),
                webpage, 'lesson', default='{}'), item_id, fatal=False)
        if lesson:
            info.update({
                'title': lesson.get('lesson_name'),
                'description': lesson.get('md_desc') or clean_html(lesson.get('desc')),
                'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')),
                'duration': parse_duration(lesson.get('duration')),
            })
        if not info.get('title'):
            info['title'] = self._search_regex(
                (r'>Lecture\s*:\s*(?P<value>[^<]+)',
                 r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
                'title', group='value')
        return info
Commit	Line	Data
659e93fc S	1	import json
659e93fc S	2	import random
659e93fc S	3
659e93fc S	4	from .common import InfoExtractor
3d2623a8	5	from ..compat import compat_b64decode, compat_str
3d2623a8	6	from ..networking.exceptions import HTTPError
659e93fc	7	from ..utils import (
29f7c58a	8	clean_html,
659e93fc	9	ExtractorError,
29f7c58a	10	js_to_json,
	11	parse_duration,
	12	try_get,
	13	unified_timestamp,
659e93fc S	14	urlencode_postdata,
	15	urljoin,
	16	)
	17
	18
	19	class LinuxAcademyIE(InfoExtractor):
	20	_VALID_URL = r'''(?x)
	21	https?://
	22	(?:www\.)?linuxacademy\.com/cp/
	23	(?:
	24	courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)\|
	25	modules/view/id/(?P<course_id>\d+)
	26	)
	27	'''
	28	_TESTS = [{
29f7c58a	29	'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675',
659e93fc	30	'info_dict': {
29f7c58a	31	'id': '7971-2',
659e93fc	32	'ext': 'mp4',
29f7c58a	33	'title': 'What Is Data Science',
29f7c58a	34	'description': 'md5:c574a3c20607144fb36cb65bdde76c99',
037cc66e	35	'timestamp': int, # The timestamp and upload date changes
037cc66e	36	'upload_date': r're:\d+',
29f7c58a	37	'duration': 304,
659e93fc S	38	},
	39	'params': {
	40	'skip_download': True,
	41	},
	42	'skip': 'Requires Linux Academy account credentials',
	43	}, {
	44	'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2',
	45	'only_matching': True,
	46	}, {
	47	'url': 'https://linuxacademy.com/cp/modules/view/id/154',
	48	'info_dict': {
	49	'id': '154',
	50	'title': 'AWS Certified Cloud Practitioner',
29f7c58a	51	'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c',
29f7c58a	52	'duration': 28835,
659e93fc S	53	},
	54	'playlist_count': 41,
	55	'skip': 'Requires Linux Academy account credentials',
037cc66e	56	}, {
	57	'url': 'https://linuxacademy.com/cp/modules/view/id/39',
	58	'info_dict': {
	59	'id': '39',
	60	'title': 'Red Hat Certified Systems Administrator - RHCSA (EX200) Exam Prep (legacy)',
	61	'description': 'md5:0f1d3369e90c3fb14a79813b863c902f',
	62	'duration': 89280,
	63	},
	64	'playlist_count': 73,
	65	'skip': 'Requires Linux Academy account credentials',
659e93fc S	66	}]
	67
	68	_AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
	69	_ORIGIN_URL = 'https://linuxacademy.com'
	70	_CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
	71	_NETRC_MACHINE = 'linuxacademy'
	72
52efa4b3	73	def _perform_login(self, username, password):
659e93fc	74	def random_string():
efa944f4 AM	75	return ''.join(random.choices(
efa944f4 AM	76	'0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~', k=32))
659e93fc S	77
	78	webpage, urlh = self._download_webpage_handle(
	79	self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
	80	'client_id': self._CLIENT_ID,
	81	'response_type': 'token id_token',
29f7c58a	82	'response_mode': 'web_message',
659e93fc S	83	'redirect_uri': self._ORIGIN_URL,
	84	'scope': 'openid email user_impersonation profile',
	85	'audience': self._ORIGIN_URL,
	86	'state': random_string(),
	87	'nonce': random_string(),
	88	})
	89
	90	login_data = self._parse_json(
	91	self._search_regex(
	92	r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
	93	'login info', group='value'), None,
	94	transform_source=lambda x: compat_b64decode(x).decode('utf-8')
	95	)['extraParams']
	96
	97	login_data.update({
	98	'client_id': self._CLIENT_ID,
	99	'redirect_uri': self._ORIGIN_URL,
	100	'tenant': 'lacausers',
3700c7ef	101	'connection': 'Username-Password-ACG-Proxy',
659e93fc S	102	'username': username,
	103	'password': password,
	104	'sso': 'true',
	105	})
	106
3d2623a8	107	login_state_url = urlh.url
659e93fc S	108
	109	try:
	110	login_page = self._download_webpage(
	111	'https://login.linuxacademy.com/usernamepassword/login', None,
	112	'Downloading login page', data=json.dumps(login_data).encode(),
	113	headers={
	114	'Content-Type': 'application/json',
	115	'Origin': 'https://login.linuxacademy.com',
	116	'Referer': login_state_url,
	117	})
	118	except ExtractorError as e:
3d2623a8	119	if isinstance(e.cause, HTTPError) and e.cause.status == 401:
3d2623a8	120	error = self._parse_json(e.cause.response.read(), None)
659e93fc S	121	message = error.get('description') or error['code']
	122	raise ExtractorError(
	123	'%s said: %s' % (self.IE_NAME, message), expected=True)
	124	raise
	125
	126	callback_page, urlh = self._download_webpage_handle(
	127	'https://login.linuxacademy.com/login/callback', None,
	128	'Downloading callback page',
	129	data=urlencode_postdata(self._hidden_inputs(login_page)),
	130	headers={
	131	'Content-Type': 'application/x-www-form-urlencoded',
	132	'Origin': 'https://login.linuxacademy.com',
	133	'Referer': login_state_url,
	134	})
	135
	136	access_token = self._search_regex(
3d2623a8	137	r'access_token=([^=&]+)', urlh.url,
29f7c58a	138	'access token', default=None)
	139	if not access_token:
	140	access_token = self._parse_json(
	141	self._search_regex(
	142	r'authorizationResponse\s=\s({.+?})\s*;', callback_page,
	143	'authorization response'), None,
	144	transform_source=js_to_json)['response']['access_token']
659e93fc S	145
	146	self._download_webpage(
	147	'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s'
	148	% access_token, None, 'Downloading token validation page')
	149
	150	def _real_extract(self, url):
5ad28e7f	151	mobj = self._match_valid_url(url)
659e93fc S	152	chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id')
	153	item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id)
	154
	155	webpage = self._download_webpage(url, item_id)
	156
	157	# course path
	158	if course_id:
29f7c58a	159	module = self._parse_json(
29f7c58a	160	self._search_regex(
037cc66e	161	r'window\.module\s=\s({(?:(?!};)[^"]\|"([^"]\|\\")")+})\s;', webpage, 'module'),
29f7c58a	162	item_id)
	163	entries = []
	164	chapter_number = None
	165	chapter = None
	166	chapter_id = None
	167	for item in module['items']:
	168	if not isinstance(item, dict):
	169	continue
	170
	171	def type_field(key):
	172	return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower()
	173	type_fields = (type_field('name'), type_field('slug'))
	174	# Move to next module section
	175	if 'section' in type_fields:
	176	chapter = item.get('course_name')
	177	chapter_id = item.get('course_module')
	178	chapter_number = 1 if not chapter_number else chapter_number + 1
	179	continue
	180	# Skip non-lessons
	181	if 'lesson' not in type_fields:
	182	continue
	183	lesson_url = urljoin(url, item.get('url'))
	184	if not lesson_url:
	185	continue
	186	title = item.get('title') or item.get('lesson_name')
	187	description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text'))
	188	entries.append({
	189	'_type': 'url_transparent',
	190	'url': lesson_url,
	191	'ie_key': LinuxAcademyIE.ie_key(),
	192	'title': title,
	193	'description': description,
	194	'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')),
	195	'duration': parse_duration(item.get('duration')),
	196	'chapter': chapter,
	197	'chapter_id': chapter_id,
	198	'chapter_number': chapter_number,
	199	})
	200	return {
	201	'_type': 'playlist',
	202	'entries': entries,
	203	'id': course_id,
	204	'title': module.get('title'),
	205	'description': module.get('md_desc') or clean_html(module.get('desc')),
	206	'duration': parse_duration(module.get('duration')),
	207	}
659e93fc S	208
659e93fc S	209	# single video path
29f7c58a	210	m3u8_url = self._parse_json(
	211	self._search_regex(
	212	r'player\.playlist\s=\s(\[.+?\])\s*;', webpage, 'playlist'),
	213	item_id)[0]['file']
	214	formats = self._extract_m3u8_formats(
	215	m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native',
	216	m3u8_id='hls')
29f7c58a	217	info = {
659e93fc	218	'id': item_id,
29f7c58a	219	'formats': formats,
	220	}
	221	lesson = self._parse_json(
	222	self._search_regex(
	223	(r'window\.lesson\s=\s({.+?})\s*;',
	224	r'player\.lesson\s=\s({.+?})\s*;'),
	225	webpage, 'lesson', default='{}'), item_id, fatal=False)
	226	if lesson:
	227	info.update({
	228	'title': lesson.get('lesson_name'),
	229	'description': lesson.get('md_desc') or clean_html(lesson.get('desc')),
	230	'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')),
	231	'duration': parse_duration(lesson.get('duration')),
	232	})
	233	if not info.get('title'):
	234	info['title'] = self._search_regex(
	235	(r'>Lecture\s:\s(?P<value>[^<]+)',
	236	r'lessonName\s=\s(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
	237	'title', group='value')
659e93fc	238	return info