[yt-dlp.git] / yt_dlp / extractor / linuxacademy.py

import json
import random

from .common import InfoExtractor
from ..compat import (
    compat_b64decode,
    compat_HTTPError,
    compat_str,
)
from ..utils import (
    clean_html,
    ExtractorError,
    js_to_json,
    parse_duration,
    try_get,
    unified_timestamp,
    urlencode_postdata,
    urljoin,
)


class LinuxAcademyIE(InfoExtractor):
    """Extractor for linuxacademy.com lesson pages and course module pages.

    A ``modules/view/id/<course_id>`` URL is returned as a playlist of
    ``url_transparent`` entries pointing back at this extractor; a
    ``courses/lesson/course/<chapter_id>/lesson/<lesson_id>`` URL is
    returned as a single video with HLS formats.
    """

    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?linuxacademy\.com/cp/
                        (?:
                            courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)|
                            modules/view/id/(?P<course_id>\d+)
                        )
                    '''
    _TESTS = [{
        'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675',
        'info_dict': {
            'id': '7971-2',
            'ext': 'mp4',
            'title': 'What Is Data Science',
            'description': 'md5:c574a3c20607144fb36cb65bdde76c99',
            'timestamp': int,  # The timestamp and upload date changes
            'upload_date': r're:\d+',
            'duration': 304,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Requires Linux Academy account credentials',
    }, {
        'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2',
        'only_matching': True,
    }, {
        'url': 'https://linuxacademy.com/cp/modules/view/id/154',
        'info_dict': {
            'id': '154',
            'title': 'AWS Certified Cloud Practitioner',
            'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c',
            'duration': 28835,
        },
        'playlist_count': 41,
        'skip': 'Requires Linux Academy account credentials',
    }, {
        'url': 'https://linuxacademy.com/cp/modules/view/id/39',
        'info_dict': {
            'id': '39',
            'title': 'Red Hat Certified Systems Administrator - RHCSA (EX200) Exam Prep  (legacy)',
            'description': 'md5:0f1d3369e90c3fb14a79813b863c902f',
            'duration': 89280,
        },
        'playlist_count': 73,
        'skip': 'Requires Linux Academy account credentials',
    }]

    # Endpoints and identifiers sent with every request of the login flow.
    _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
    _ORIGIN_URL = 'https://linuxacademy.com'
    _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
    _NETRC_MACHINE = 'linuxacademy'

    def _perform_login(self, username, password):
        """Log in with the given credentials and validate the access token.

        The flow is: fetch the authorize page, decode the base64 login
        parameters embedded in it, POST the credentials, replay the hidden
        form fields to the callback endpoint, then hand the resulting
        access token to the site's token validation URL.

        Raises ExtractorError (expected) carrying the server's message when
        the credentials POST is answered with HTTP 401 and a readable error
        body; any other failure propagates unchanged.
        """
        def random_string():
            # NOTE(review): the alphabet below has no 'W'. Left untouched
            # because what the server expects is not visible from here;
            # confirm before "fixing" it.
            return ''.join([
                random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~')
                for _ in range(32)])

        webpage, urlh = self._download_webpage_handle(
            self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
                'client_id': self._CLIENT_ID,
                'response_type': 'token id_token',
                'response_mode': 'web_message',
                'redirect_uri': self._ORIGIN_URL,
                'scope': 'openid email user_impersonation profile',
                'audience': self._ORIGIN_URL,
                'state': random_string(),
                'nonce': random_string(),
            })

        # The authorize page carries its configuration as a base64 string
        # passed to atob(); only the 'extraParams' member is needed.
        login_data = self._parse_json(
            self._search_regex(
                r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
                'login info', group='value'), None,
            transform_source=lambda x: compat_b64decode(x).decode('utf-8')
        )['extraParams']

        login_data.update({
            'client_id': self._CLIENT_ID,
            'redirect_uri': self._ORIGIN_URL,
            'tenant': 'lacausers',
            'connection': 'Username-Password-ACG-Proxy',
            'username': username,
            'password': password,
            'sso': 'true',
        })

        # Final URL of the authorize request (after redirects); sent as the
        # Referer of the two follow-up requests.
        login_state_url = urlh.geturl()

        try:
            login_page = self._download_webpage(
                'https://login.linuxacademy.com/usernamepassword/login', None,
                'Downloading login page', data=json.dumps(login_data).encode(),
                headers={
                    'Content-Type': 'application/json',
                    'Origin': 'https://login.linuxacademy.com',
                    'Referer': login_state_url,
                })
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
                # Best effort: surface the server's own explanation. A body
                # that is not a JSON object, or that has neither field, must
                # not mask the real failure with a KeyError/parse error, so
                # in that case fall through and re-raise the HTTP error.
                error = self._parse_json(e.cause.read(), None, fatal=False)
                if isinstance(error, dict):
                    message = error.get('description') or error.get('code')
                    if message:
                        raise ExtractorError(
                            '%s said: %s' % (self.IE_NAME, message), expected=True)
            raise

        callback_page, urlh = self._download_webpage_handle(
            'https://login.linuxacademy.com/login/callback', None,
            'Downloading callback page',
            data=urlencode_postdata(self._hidden_inputs(login_page)),
            headers={
                'Content-Type': 'application/x-www-form-urlencoded',
                'Origin': 'https://login.linuxacademy.com',
                'Referer': login_state_url,
            })

        # The token is either in the final callback URL or, failing that,
        # inside an 'authorizationResponse' JS object on the callback page.
        access_token = self._search_regex(
            r'access_token=([^=&]+)', urlh.geturl(),
            'access token', default=None)
        if not access_token:
            access_token = self._parse_json(
                self._search_regex(
                    r'authorizationResponse\s*=\s*({.+?})\s*;', callback_page,
                    'authorization response'), None,
                transform_source=js_to_json)['response']['access_token']

        self._download_webpage(
            'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s'
            % access_token, None, 'Downloading token validation page')

    def _real_extract(self, url):
        """Dispatch to playlist or single-lesson extraction based on the URL.

        Returns a playlist info dict for course module URLs and a video
        info dict for lesson URLs.
        """
        mobj = self._match_valid_url(url)
        chapter_id, lesson_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id')
        # Courses are identified by their own id, lessons by "<chapter>-<lesson>".
        item_id = course_id if course_id else '%s-%s' % (chapter_id, lesson_id)

        webpage = self._download_webpage(url, item_id)

        if course_id:
            return self._extract_course_playlist(url, webpage, course_id)
        return self._extract_single_lesson(webpage, item_id)

    def _extract_course_playlist(self, url, webpage, course_id):
        """Build a playlist from the ``window.module`` object of a course page."""
        module = self._parse_json(
            self._search_regex(
                r'window\.module\s*=\s*({(?:(?!};)[^"]|"([^"]|\\")*")+})\s*;', webpage, 'module'),
            course_id)

        def type_field(item, key):
            # Lower-cased item['type'][key], or '' when absent or not a string.
            return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower()

        entries = []
        # Chapter state is carried forward from the most recent "section"
        # item to every lesson that follows it.
        chapter_number = None
        chapter = None
        chapter_id = None
        for item in module['items']:
            if not isinstance(item, dict):
                continue

            type_fields = (type_field(item, 'name'), type_field(item, 'slug'))
            # Move to next module section
            if 'section' in type_fields:
                chapter = item.get('course_name')
                chapter_id = item.get('course_module')
                chapter_number = 1 if not chapter_number else chapter_number + 1
                continue
            # Skip non-lessons
            if 'lesson' not in type_fields:
                continue
            lesson_url = urljoin(url, item.get('url'))
            if not lesson_url:
                continue
            title = item.get('title') or item.get('lesson_name')
            description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text'))
            entries.append({
                '_type': 'url_transparent',
                'url': lesson_url,
                'ie_key': LinuxAcademyIE.ie_key(),
                'title': title,
                'description': description,
                'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')),
                'duration': parse_duration(item.get('duration')),
                'chapter': chapter,
                'chapter_id': chapter_id,
                'chapter_number': chapter_number,
            })
        return {
            '_type': 'playlist',
            'entries': entries,
            'id': course_id,
            'title': module.get('title'),
            'description': module.get('md_desc') or clean_html(module.get('desc')),
            'duration': parse_duration(module.get('duration')),
        }

    def _extract_single_lesson(self, webpage, item_id):
        """Build the info dict of one lesson from its page's player data."""
        # Only the first playlist entry's 'file' is used as the HLS manifest.
        m3u8_url = self._parse_json(
            self._search_regex(
                r'player\.playlist\s*=\s*(\[.+?\])\s*;', webpage, 'playlist'),
            item_id)[0]['file']
        formats = self._extract_m3u8_formats(
            m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native',
            m3u8_id='hls')
        self._sort_formats(formats)
        info = {
            'id': item_id,
            'formats': formats,
        }
        # Lesson metadata is optional: a missing or unparsable object only
        # means the title has to come from the regex fallback below.
        lesson = self._parse_json(
            self._search_regex(
                (r'window\.lesson\s*=\s*({.+?})\s*;',
                 r'player\.lesson\s*=\s*({.+?})\s*;'),
                webpage, 'lesson', default='{}'), item_id, fatal=False)
        if lesson:
            info.update({
                'title': lesson.get('lesson_name'),
                'description': lesson.get('md_desc') or clean_html(lesson.get('desc')),
                'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')),
                'duration': parse_duration(lesson.get('duration')),
            })
        if not info.get('title'):
            info['title'] = self._search_regex(
                (r'>Lecture\s*:\s*(?P<value>[^<]+)',
                 r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
                'title', group='value')
        return info
Commit	Line	Data
659e93fc S	1	import json
659e93fc S	2	import random
659e93fc S	3
	4	from .common import InfoExtractor
	5	from ..compat import (
	6	compat_b64decode,
	7	compat_HTTPError,
29f7c58a	8	compat_str,
659e93fc S	9	)
659e93fc S	10	from ..utils import (
29f7c58a	11	clean_html,
659e93fc	12	ExtractorError,
29f7c58a	13	js_to_json,
	14	parse_duration,
	15	try_get,
	16	unified_timestamp,
659e93fc S	17	urlencode_postdata,
	18	urljoin,
	19	)
	20
	21
	22	class LinuxAcademyIE(InfoExtractor):
	23	_VALID_URL = r'''(?x)
	24	https?://
	25	(?:www\.)?linuxacademy\.com/cp/
	26	(?:
	27	courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)\|
	28	modules/view/id/(?P<course_id>\d+)
	29	)
	30	'''
	31	_TESTS = [{
29f7c58a	32	'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675',
659e93fc	33	'info_dict': {
29f7c58a	34	'id': '7971-2',
659e93fc	35	'ext': 'mp4',
29f7c58a	36	'title': 'What Is Data Science',
29f7c58a	37	'description': 'md5:c574a3c20607144fb36cb65bdde76c99',
037cc66e	38	'timestamp': int, # The timestamp and upload date changes
037cc66e	39	'upload_date': r're:\d+',
29f7c58a	40	'duration': 304,
659e93fc S	41	},
	42	'params': {
	43	'skip_download': True,
	44	},
	45	'skip': 'Requires Linux Academy account credentials',
	46	}, {
	47	'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2',
	48	'only_matching': True,
	49	}, {
	50	'url': 'https://linuxacademy.com/cp/modules/view/id/154',
	51	'info_dict': {
	52	'id': '154',
	53	'title': 'AWS Certified Cloud Practitioner',
29f7c58a	54	'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c',
29f7c58a	55	'duration': 28835,
659e93fc S	56	},
	57	'playlist_count': 41,
	58	'skip': 'Requires Linux Academy account credentials',
037cc66e	59	}, {
	60	'url': 'https://linuxacademy.com/cp/modules/view/id/39',
	61	'info_dict': {
	62	'id': '39',
	63	'title': 'Red Hat Certified Systems Administrator - RHCSA (EX200) Exam Prep (legacy)',
	64	'description': 'md5:0f1d3369e90c3fb14a79813b863c902f',
	65	'duration': 89280,
	66	},
	67	'playlist_count': 73,
	68	'skip': 'Requires Linux Academy account credentials',
659e93fc S	69	}]
	70
	71	_AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
	72	_ORIGIN_URL = 'https://linuxacademy.com'
	73	_CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
	74	_NETRC_MACHINE = 'linuxacademy'
	75
52efa4b3	76	def _perform_login(self, username, password):
659e93fc S	77	def random_string():
	78	return ''.join([
	79	random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~')
	80	for _ in range(32)])
	81
	82	webpage, urlh = self._download_webpage_handle(
	83	self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
	84	'client_id': self._CLIENT_ID,
	85	'response_type': 'token id_token',
29f7c58a	86	'response_mode': 'web_message',
659e93fc S	87	'redirect_uri': self._ORIGIN_URL,
	88	'scope': 'openid email user_impersonation profile',
	89	'audience': self._ORIGIN_URL,
	90	'state': random_string(),
	91	'nonce': random_string(),
	92	})
	93
	94	login_data = self._parse_json(
	95	self._search_regex(
	96	r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
	97	'login info', group='value'), None,
	98	transform_source=lambda x: compat_b64decode(x).decode('utf-8')
	99	)['extraParams']
	100
	101	login_data.update({
	102	'client_id': self._CLIENT_ID,
	103	'redirect_uri': self._ORIGIN_URL,
	104	'tenant': 'lacausers',
3700c7ef	105	'connection': 'Username-Password-ACG-Proxy',
659e93fc S	106	'username': username,
	107	'password': password,
	108	'sso': 'true',
	109	})
	110
7947a1f7	111	login_state_url = urlh.geturl()
659e93fc S	112
	113	try:
	114	login_page = self._download_webpage(
	115	'https://login.linuxacademy.com/usernamepassword/login', None,
	116	'Downloading login page', data=json.dumps(login_data).encode(),
	117	headers={
	118	'Content-Type': 'application/json',
	119	'Origin': 'https://login.linuxacademy.com',
	120	'Referer': login_state_url,
	121	})
	122	except ExtractorError as e:
	123	if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
	124	error = self._parse_json(e.cause.read(), None)
	125	message = error.get('description') or error['code']
	126	raise ExtractorError(
	127	'%s said: %s' % (self.IE_NAME, message), expected=True)
	128	raise
	129
	130	callback_page, urlh = self._download_webpage_handle(
	131	'https://login.linuxacademy.com/login/callback', None,
	132	'Downloading callback page',
	133	data=urlencode_postdata(self._hidden_inputs(login_page)),
	134	headers={
	135	'Content-Type': 'application/x-www-form-urlencoded',
	136	'Origin': 'https://login.linuxacademy.com',
	137	'Referer': login_state_url,
	138	})
	139
	140	access_token = self._search_regex(
7947a1f7	141	r'access_token=([^=&]+)', urlh.geturl(),
29f7c58a	142	'access token', default=None)
	143	if not access_token:
	144	access_token = self._parse_json(
	145	self._search_regex(
	146	r'authorizationResponse\s=\s({.+?})\s*;', callback_page,
	147	'authorization response'), None,
	148	transform_source=js_to_json)['response']['access_token']
659e93fc S	149
	150	self._download_webpage(
	151	'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s'
	152	% access_token, None, 'Downloading token validation page')
	153
	154	def _real_extract(self, url):
5ad28e7f	155	mobj = self._match_valid_url(url)
659e93fc S	156	chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id')
	157	item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id)
	158
	159	webpage = self._download_webpage(url, item_id)
	160
	161	# course path
	162	if course_id:
29f7c58a	163	module = self._parse_json(
29f7c58a	164	self._search_regex(
037cc66e	165	r'window\.module\s=\s({(?:(?!};)[^"]\|"([^"]\|\\")")+})\s;', webpage, 'module'),
29f7c58a	166	item_id)
	167	entries = []
	168	chapter_number = None
	169	chapter = None
	170	chapter_id = None
	171	for item in module['items']:
	172	if not isinstance(item, dict):
	173	continue
	174
	175	def type_field(key):
	176	return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower()
	177	type_fields = (type_field('name'), type_field('slug'))
	178	# Move to next module section
	179	if 'section' in type_fields:
	180	chapter = item.get('course_name')
	181	chapter_id = item.get('course_module')
	182	chapter_number = 1 if not chapter_number else chapter_number + 1
	183	continue
	184	# Skip non-lessons
	185	if 'lesson' not in type_fields:
	186	continue
	187	lesson_url = urljoin(url, item.get('url'))
	188	if not lesson_url:
	189	continue
	190	title = item.get('title') or item.get('lesson_name')
	191	description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text'))
	192	entries.append({
	193	'_type': 'url_transparent',
	194	'url': lesson_url,
	195	'ie_key': LinuxAcademyIE.ie_key(),
	196	'title': title,
	197	'description': description,
	198	'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')),
	199	'duration': parse_duration(item.get('duration')),
	200	'chapter': chapter,
	201	'chapter_id': chapter_id,
	202	'chapter_number': chapter_number,
	203	})
	204	return {
	205	'_type': 'playlist',
	206	'entries': entries,
	207	'id': course_id,
	208	'title': module.get('title'),
	209	'description': module.get('md_desc') or clean_html(module.get('desc')),
	210	'duration': parse_duration(module.get('duration')),
	211	}
659e93fc S	212
659e93fc S	213	# single video path
29f7c58a	214	m3u8_url = self._parse_json(
	215	self._search_regex(
	216	r'player\.playlist\s=\s(\[.+?\])\s*;', webpage, 'playlist'),
	217	item_id)[0]['file']
	218	formats = self._extract_m3u8_formats(
	219	m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native',
	220	m3u8_id='hls')
	221	self._sort_formats(formats)
	222	info = {
659e93fc	223	'id': item_id,
29f7c58a	224	'formats': formats,
	225	}
	226	lesson = self._parse_json(
	227	self._search_regex(
	228	(r'window\.lesson\s=\s({.+?})\s*;',
	229	r'player\.lesson\s=\s({.+?})\s*;'),
	230	webpage, 'lesson', default='{}'), item_id, fatal=False)
	231	if lesson:
	232	info.update({
	233	'title': lesson.get('lesson_name'),
	234	'description': lesson.get('md_desc') or clean_html(lesson.get('desc')),
	235	'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')),
	236	'duration': parse_duration(lesson.get('duration')),
	237	})
	238	if not info.get('title'):
	239	info['title'] = self._search_regex(
	240	(r'>Lecture\s:\s(?P<value>[^<]+)',
	241	r'lessonName\s=\s(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
	242	'title', group='value')
659e93fc	243	return info