[yt-dlp.git] / yt_dlp / extractor / linuxacademy.py

import json
import random

from .common import InfoExtractor
from ..compat import (
    compat_b64decode,
    compat_HTTPError,
    compat_str,
)
from ..utils import (
    clean_html,
    ExtractorError,
    js_to_json,
    parse_duration,
    try_get,
    unified_timestamp,
    urlencode_postdata,
    urljoin,
)


class LinuxAcademyIE(InfoExtractor):
    """Extractor for linuxacademy.com lesson pages and course (module) pages.

    A lesson URL yields a single HLS video; a ``modules/view/id/<n>`` URL
    yields a playlist whose entries point back at the individual lesson pages.
    """

    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?linuxacademy\.com/cp/
                        (?:
                            courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)|
                            modules/view/id/(?P<course_id>\d+)
                        )
                    '''
    _TESTS = [{
        'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675',
        'info_dict': {
            'id': '7971-2',
            'ext': 'mp4',
            'title': 'What Is Data Science',
            'description': 'md5:c574a3c20607144fb36cb65bdde76c99',
            'timestamp': int,  # The timestamp and upload date changes
            'upload_date': r're:\d+',
            'duration': 304,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Requires Linux Academy account credentials',
    }, {
        'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2',
        'only_matching': True,
    }, {
        'url': 'https://linuxacademy.com/cp/modules/view/id/154',
        'info_dict': {
            'id': '154',
            'title': 'AWS Certified Cloud Practitioner',
            'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c',
            'duration': 28835,
        },
        'playlist_count': 41,
        'skip': 'Requires Linux Academy account credentials',
    }, {
        'url': 'https://linuxacademy.com/cp/modules/view/id/39',
        'info_dict': {
            'id': '39',
            'title': 'Red Hat Certified Systems Administrator - RHCSA (EX200) Exam Prep  (legacy)',
            'description': 'md5:0f1d3369e90c3fb14a79813b863c902f',
            'duration': 89280,
        },
        'playlist_count': 73,
        'skip': 'Requires Linux Academy account credentials',
    }]

    # Endpoint and client parameters sent during the login handshake.
    _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
    _ORIGIN_URL = 'https://linuxacademy.com'
    _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
    # Not read inside this class; presumably consumed by the base class for
    # .netrc credential lookup -- TODO confirm.
    _NETRC_MACHINE = 'linuxacademy'

    def _perform_login(self, username, password):
        """Log in with *username* / *password* and validate the access token.

        Steps, in order: fetch the authorize page, decode the base64 blob it
        embeds to obtain the extra login parameters, POST the credentials as
        JSON, replay the hidden form inputs of the response to the callback
        endpoint, pull the access token out of the final URL (or, failing
        that, out of the callback page), and finally request the token
        validation URL.

        Raises:
            ExtractorError: with ``expected=True`` and the server-provided
                message when the credentials POST is answered with HTTP 401
                and the body carries a ``description`` or ``code``; any other
                download failure is re-raised unchanged.
        """
        def random_string():
            # 32 characters for the ``state`` / ``nonce`` query parameters.
            # NOTE(review): the alphabet has no 'W'; left untouched because
            # it is unclear whether that is intentional -- confirm.
            return ''.join(random.choices(
                '0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~', k=32))

        webpage, urlh = self._download_webpage_handle(
            self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
                'client_id': self._CLIENT_ID,
                'response_type': 'token id_token',
                'response_mode': 'web_message',
                'redirect_uri': self._ORIGIN_URL,
                'scope': 'openid email user_impersonation profile',
                'audience': self._ORIGIN_URL,
                'state': random_string(),
                'nonce': random_string(),
            })

        # The authorize page passes a base64-encoded JSON document to atob();
        # its 'extraParams' object seeds the credentials request.
        login_data = self._parse_json(
            self._search_regex(
                r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
                'login info', group='value'), None,
            transform_source=lambda x: compat_b64decode(x).decode('utf-8')
        )['extraParams']

        login_data.update({
            'client_id': self._CLIENT_ID,
            'redirect_uri': self._ORIGIN_URL,
            'tenant': 'lacausers',
            'connection': 'Username-Password-ACG-Proxy',
            'username': username,
            'password': password,
            'sso': 'true',
        })

        # Final URL of the authorize request; reused as Referer below.
        login_state_url = urlh.geturl()

        try:
            login_page = self._download_webpage(
                'https://login.linuxacademy.com/usernamepassword/login', None,
                'Downloading login page', data=json.dumps(login_data).encode(),
                headers={
                    'Content-Type': 'application/json',
                    'Origin': 'https://login.linuxacademy.com',
                    'Referer': login_state_url,
                })
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
                # Parse leniently: a body that is not a JSON object, or that
                # lacks both keys, must not hide the real login failure
                # behind a KeyError or a JSON-parsing error.
                error = self._parse_json(e.cause.read(), None, fatal=False)
                message = None
                if isinstance(error, dict):
                    message = error.get('description') or error.get('code')
                if message:
                    raise ExtractorError(
                        f'{self.IE_NAME} said: {message}', expected=True)
            raise

        callback_page, urlh = self._download_webpage_handle(
            'https://login.linuxacademy.com/login/callback', None,
            'Downloading callback page',
            data=urlencode_postdata(self._hidden_inputs(login_page)),
            headers={
                'Content-Type': 'application/x-www-form-urlencoded',
                'Origin': 'https://login.linuxacademy.com',
                'Referer': login_state_url,
            })

        # Prefer the token carried in the final URL; otherwise fall back to
        # the JS object assigned to ``authorizationResponse`` in the page.
        access_token = self._search_regex(
            r'access_token=([^=&]+)', urlh.geturl(),
            'access token', default=None)
        if not access_token:
            access_token = self._parse_json(
                self._search_regex(
                    r'authorizationResponse\s*=\s*({.+?})\s*;', callback_page,
                    'authorization response'), None,
                transform_source=js_to_json)['response']['access_token']

        self._download_webpage(
            f'https://linuxacademy.com/cp/login/tokenValidateLogin/token/{access_token}',
            None, 'Downloading token validation page')

    @staticmethod
    def _item_type_fields(item):
        """Return the lowercased ``type.name`` and ``type.slug`` of *item* as a tuple.

        A missing or non-string field is represented by an empty string.
        """
        fields = []
        for key in ('name', 'slug'):
            # Bind ``key`` as a default so the getter does not depend on the
            # loop variable's later value.
            value = try_get(item, lambda x, key=key: x['type'][key], compat_str)
            fields.append((value or '').lower())
        return tuple(fields)

    def _extract_course(self, url, webpage, course_id):
        """Build the playlist result for a course page from its ``window.module`` JSON."""
        module = self._parse_json(
            self._search_regex(
                r'window\.module\s*=\s*({(?:(?!};)[^"]|"([^"]|\\")*")+})\s*;', webpage, 'module'),
            course_id)
        entries = []
        # Chapter metadata comes from the most recent 'section' item seen and
        # is attached to every lesson that follows it.
        chapter_number = None
        chapter = None
        chapter_id = None
        for item in module['items']:
            if not isinstance(item, dict):
                continue

            type_fields = self._item_type_fields(item)
            # Move to next module section
            if 'section' in type_fields:
                chapter = item.get('course_name')
                chapter_id = item.get('course_module')
                chapter_number = 1 if not chapter_number else chapter_number + 1
                continue
            # Skip non-lessons
            if 'lesson' not in type_fields:
                continue
            lesson_url = urljoin(url, item.get('url'))
            if not lesson_url:
                continue
            title = item.get('title') or item.get('lesson_name')
            description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text'))
            entries.append({
                '_type': 'url_transparent',
                'url': lesson_url,
                'ie_key': LinuxAcademyIE.ie_key(),
                'title': title,
                'description': description,
                'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')),
                'duration': parse_duration(item.get('duration')),
                'chapter': chapter,
                'chapter_id': chapter_id,
                'chapter_number': chapter_number,
            })
        return {
            '_type': 'playlist',
            'entries': entries,
            'id': course_id,
            'title': module.get('title'),
            'description': module.get('md_desc') or clean_html(module.get('desc')),
            'duration': parse_duration(module.get('duration')),
        }

    def _extract_lesson(self, webpage, item_id):
        """Build the info dict for a single lesson page.

        The HLS manifest URL is the ``file`` of the first ``player.playlist``
        entry. Metadata comes from the ``window.lesson`` / ``player.lesson``
        JSON when present; the title falls back to markup in the page.
        """
        m3u8_url = self._parse_json(
            self._search_regex(
                r'player\.playlist\s*=\s*(\[.+?\])\s*;', webpage, 'playlist'),
            item_id)[0]['file']
        formats = self._extract_m3u8_formats(
            m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native',
            m3u8_id='hls')
        info = {
            'id': item_id,
            'formats': formats,
        }
        # Lesson metadata is optional: a missing object defaults to '{}' and
        # a parse failure is non-fatal.
        lesson = self._parse_json(
            self._search_regex(
                (r'window\.lesson\s*=\s*({.+?})\s*;',
                 r'player\.lesson\s*=\s*({.+?})\s*;'),
                webpage, 'lesson', default='{}'), item_id, fatal=False)
        if lesson:
            info.update({
                'title': lesson.get('lesson_name'),
                'description': lesson.get('md_desc') or clean_html(lesson.get('desc')),
                'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')),
                'duration': parse_duration(lesson.get('duration')),
            })
        if not info.get('title'):
            info['title'] = self._search_regex(
                (r'>Lecture\s*:\s*(?P<value>[^<]+)',
                 r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
                'title', group='value')
        return info

    def _real_extract(self, url):
        """Dispatch *url* to the course (playlist) or single-lesson extraction path.

        The item id is the course id for course URLs and
        ``<chapter_id>-<lesson_id>`` for lesson URLs.
        """
        mobj = self._match_valid_url(url)
        chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id')
        item_id = course_id if course_id else f'{chapter_id}-{lecture_id}'

        webpage = self._download_webpage(url, item_id)

        # course path
        if course_id:
            return self._extract_course(url, webpage, course_id)

        # single video path
        return self._extract_lesson(webpage, item_id)
Commit	Line	Data
659e93fc S	1	import json
659e93fc S	2	import random
659e93fc S	3
	4	from .common import InfoExtractor
	5	from ..compat import (
	6	compat_b64decode,
	7	compat_HTTPError,
29f7c58a	8	compat_str,
659e93fc S	9	)
659e93fc S	10	from ..utils import (
29f7c58a	11	clean_html,
659e93fc	12	ExtractorError,
29f7c58a	13	js_to_json,
	14	parse_duration,
	15	try_get,
	16	unified_timestamp,
659e93fc S	17	urlencode_postdata,
	18	urljoin,
	19	)
	20
	21
	22	class LinuxAcademyIE(InfoExtractor):
	23	_VALID_URL = r'''(?x)
	24	https?://
	25	(?:www\.)?linuxacademy\.com/cp/
	26	(?:
	27	courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)|
	28	modules/view/id/(?P<course_id>\d+)
	29	)
	30	'''
	31	_TESTS = [{
29f7c58a	32	'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675',
659e93fc	33	'info_dict': {
29f7c58a	34	'id': '7971-2',
659e93fc	35	'ext': 'mp4',
29f7c58a	36	'title': 'What Is Data Science',
29f7c58a	37	'description': 'md5:c574a3c20607144fb36cb65bdde76c99',
037cc66e	38	'timestamp': int, # The timestamp and upload date changes
037cc66e	39	'upload_date': r're:\d+',
29f7c58a	40	'duration': 304,
659e93fc S	41	},
	42	'params': {
	43	'skip_download': True,
	44	},
	45	'skip': 'Requires Linux Academy account credentials',
	46	}, {
	47	'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2',
	48	'only_matching': True,
	49	}, {
	50	'url': 'https://linuxacademy.com/cp/modules/view/id/154',
	51	'info_dict': {
	52	'id': '154',
	53	'title': 'AWS Certified Cloud Practitioner',
29f7c58a	54	'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c',
29f7c58a	55	'duration': 28835,
659e93fc S	56	},
	57	'playlist_count': 41,
	58	'skip': 'Requires Linux Academy account credentials',
037cc66e	59	}, {
	60	'url': 'https://linuxacademy.com/cp/modules/view/id/39',
	61	'info_dict': {
	62	'id': '39',
	63	'title': 'Red Hat Certified Systems Administrator - RHCSA (EX200) Exam Prep  (legacy)',
	64	'description': 'md5:0f1d3369e90c3fb14a79813b863c902f',
	65	'duration': 89280,
	66	},
	67	'playlist_count': 73,
	68	'skip': 'Requires Linux Academy account credentials',
659e93fc S	69	}]
	70
	71	_AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
	72	_ORIGIN_URL = 'https://linuxacademy.com'
	73	_CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
	74	_NETRC_MACHINE = 'linuxacademy'
	75
52efa4b3	76	def _perform_login(self, username, password):
659e93fc	77	def random_string():
efa944f4 AM	78	return ''.join(random.choices(
efa944f4 AM	79	'0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~', k=32))
659e93fc S	80
	81	webpage, urlh = self._download_webpage_handle(
	82	self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
	83	'client_id': self._CLIENT_ID,
	84	'response_type': 'token id_token',
29f7c58a	85	'response_mode': 'web_message',
659e93fc S	86	'redirect_uri': self._ORIGIN_URL,
	87	'scope': 'openid email user_impersonation profile',
	88	'audience': self._ORIGIN_URL,
	89	'state': random_string(),
	90	'nonce': random_string(),
	91	})
	92
	93	login_data = self._parse_json(
	94	self._search_regex(
	95	r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
	96	'login info', group='value'), None,
	97	transform_source=lambda x: compat_b64decode(x).decode('utf-8')
	98	)['extraParams']
	99
	100	login_data.update({
	101	'client_id': self._CLIENT_ID,
	102	'redirect_uri': self._ORIGIN_URL,
	103	'tenant': 'lacausers',
3700c7ef	104	'connection': 'Username-Password-ACG-Proxy',
659e93fc S	105	'username': username,
	106	'password': password,
	107	'sso': 'true',
	108	})
	109
7947a1f7	110	login_state_url = urlh.geturl()
659e93fc S	111
	112	try:
	113	login_page = self._download_webpage(
	114	'https://login.linuxacademy.com/usernamepassword/login', None,
	115	'Downloading login page', data=json.dumps(login_data).encode(),
	116	headers={
	117	'Content-Type': 'application/json',
	118	'Origin': 'https://login.linuxacademy.com',
	119	'Referer': login_state_url,
	120	})
	121	except ExtractorError as e:
	122	if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
	123	error = self._parse_json(e.cause.read(), None)
	124	message = error.get('description') or error['code']
	125	raise ExtractorError(
	126	'%s said: %s' % (self.IE_NAME, message), expected=True)
	127	raise
	128
	129	callback_page, urlh = self._download_webpage_handle(
	130	'https://login.linuxacademy.com/login/callback', None,
	131	'Downloading callback page',
	132	data=urlencode_postdata(self._hidden_inputs(login_page)),
	133	headers={
	134	'Content-Type': 'application/x-www-form-urlencoded',
	135	'Origin': 'https://login.linuxacademy.com',
	136	'Referer': login_state_url,
	137	})
	138
	139	access_token = self._search_regex(
7947a1f7	140	r'access_token=([^=&]+)', urlh.geturl(),
29f7c58a	141	'access token', default=None)
	142	if not access_token:
	143	access_token = self._parse_json(
	144	self._search_regex(
	145	r'authorizationResponse\s*=\s*({.+?})\s*;', callback_page,
	146	'authorization response'), None,
	147	transform_source=js_to_json)['response']['access_token']
659e93fc S	148
	149	self._download_webpage(
	150	'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s'
	151	% access_token, None, 'Downloading token validation page')
	152
	153	def _real_extract(self, url):
5ad28e7f	154	mobj = self._match_valid_url(url)
659e93fc S	155	chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id')
	156	item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id)
	157
	158	webpage = self._download_webpage(url, item_id)
	159
	160	# course path
	161	if course_id:
29f7c58a	162	module = self._parse_json(
29f7c58a	163	self._search_regex(
037cc66e	164	r'window\.module\s*=\s*({(?:(?!};)[^"]|"([^"]|\\")*")+})\s*;', webpage, 'module'),
29f7c58a	165	item_id)
	166	entries = []
	167	chapter_number = None
	168	chapter = None
	169	chapter_id = None
	170	for item in module['items']:
	171	if not isinstance(item, dict):
	172	continue
	173
	174	def type_field(key):
	175	return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower()
	176	type_fields = (type_field('name'), type_field('slug'))
	177	# Move to next module section
	178	if 'section' in type_fields:
	179	chapter = item.get('course_name')
	180	chapter_id = item.get('course_module')
	181	chapter_number = 1 if not chapter_number else chapter_number + 1
	182	continue
	183	# Skip non-lessons
	184	if 'lesson' not in type_fields:
	185	continue
	186	lesson_url = urljoin(url, item.get('url'))
	187	if not lesson_url:
	188	continue
	189	title = item.get('title') or item.get('lesson_name')
	190	description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text'))
	191	entries.append({
	192	'_type': 'url_transparent',
	193	'url': lesson_url,
	194	'ie_key': LinuxAcademyIE.ie_key(),
	195	'title': title,
	196	'description': description,
	197	'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')),
	198	'duration': parse_duration(item.get('duration')),
	199	'chapter': chapter,
	200	'chapter_id': chapter_id,
	201	'chapter_number': chapter_number,
	202	})
	203	return {
	204	'_type': 'playlist',
	205	'entries': entries,
	206	'id': course_id,
	207	'title': module.get('title'),
	208	'description': module.get('md_desc') or clean_html(module.get('desc')),
	209	'duration': parse_duration(module.get('duration')),
	210	}
659e93fc S	211
659e93fc S	212	# single video path
29f7c58a	213	m3u8_url = self._parse_json(
	214	self._search_regex(
	215	r'player\.playlist\s*=\s*(\[.+?\])\s*;', webpage, 'playlist'),
	216	item_id)[0]['file']
	217	formats = self._extract_m3u8_formats(
	218	m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native',
	219	m3u8_id='hls')
29f7c58a	220	info = {
659e93fc	221	'id': item_id,
29f7c58a	222	'formats': formats,
	223	}
	224	lesson = self._parse_json(
	225	self._search_regex(
	226	(r'window\.lesson\s*=\s*({.+?})\s*;',
	227	r'player\.lesson\s*=\s*({.+?})\s*;'),
	228	webpage, 'lesson', default='{}'), item_id, fatal=False)
	229	if lesson:
	230	info.update({
	231	'title': lesson.get('lesson_name'),
	232	'description': lesson.get('md_desc') or clean_html(lesson.get('desc')),
	233	'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')),
	234	'duration': parse_duration(lesson.get('duration')),
	235	})
	236	if not info.get('title'):
	237	info['title'] = self._search_regex(
	238	(r'>Lecture\s*:\s*(?P<value>[^<]+)',
	239	r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
	240	'title', group='value')
659e93fc	241	return info