[yt-dlp.git] / yt_dlp / extractor / linuxacademy.py

from __future__ import unicode_literals

import json
import random

from .common import InfoExtractor
from ..compat import (
    compat_b64decode,
    compat_HTTPError,
    compat_str,
)
from ..utils import (
    clean_html,
    ExtractorError,
    js_to_json,
    parse_duration,
    try_get,
    unified_timestamp,
    urlencode_postdata,
    urljoin,
)


class LinuxAcademyIE(InfoExtractor):
    """Extractor for linuxacademy.com lesson pages and course module pages.

    A lesson URL yields a single HLS video; a module URL yields a playlist
    whose entries are lesson URLs handed back to this same extractor.
    """

    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?linuxacademy\.com/cp/
                        (?:
                            courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)|
                            modules/view/id/(?P<course_id>\d+)
                        )
                    '''
    _TESTS = [{
        'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675',
        'info_dict': {
            'id': '7971-2',
            'ext': 'mp4',
            'title': 'What Is Data Science',
            'description': 'md5:c574a3c20607144fb36cb65bdde76c99',
            'timestamp': int,  # The timestamp and upload date changes
            'upload_date': r're:\d+',
            'duration': 304,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Requires Linux Academy account credentials',
    }, {
        'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2',
        'only_matching': True,
    }, {
        'url': 'https://linuxacademy.com/cp/modules/view/id/154',
        'info_dict': {
            'id': '154',
            'title': 'AWS Certified Cloud Practitioner',
            'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c',
            'duration': 28835,
        },
        'playlist_count': 41,
        'skip': 'Requires Linux Academy account credentials',
    }, {
        'url': 'https://linuxacademy.com/cp/modules/view/id/39',
        'info_dict': {
            'id': '39',
            'title': 'Red Hat Certified Systems Administrator - RHCSA (EX200) Exam Prep  (legacy)',
            'description': 'md5:0f1d3369e90c3fb14a79813b863c902f',
            'duration': 89280,
        },
        'playlist_count': 73,
        'skip': 'Requires Linux Academy account credentials',
    }]

    _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
    _ORIGIN_URL = 'https://linuxacademy.com'
    _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
    _NETRC_MACHINE = 'linuxacademy'

    def _perform_login(self, username, password):
        """Log in with the given credentials via login.linuxacademy.com.

        Steps, in order: download the authorize page, decode the base64
        payload embedded in it to obtain the extra login parameters, POST the
        credentials as JSON, replay the hidden form inputs of the response to
        the callback endpoint, then validate the resulting access token
        against linuxacademy.com.

        Raises ExtractorError (expected=True) carrying the server's error
        description or code when the credentials POST returns HTTP 401; any
        other ExtractorError from that request is re-raised unchanged.
        """
        def random_string():
            # 32 characters from the RFC 3986 "unreserved" set
            # (ASCII letters, digits and "-._~")
            return ''.join([
                random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-._~')
                for _ in range(32)])

        webpage, urlh = self._download_webpage_handle(
            self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
                'client_id': self._CLIENT_ID,
                'response_type': 'token id_token',
                'response_mode': 'web_message',
                'redirect_uri': self._ORIGIN_URL,
                'scope': 'openid email user_impersonation profile',
                'audience': self._ORIGIN_URL,
                'state': random_string(),
                'nonce': random_string(),
            })

        # The authorize page passes a base64-encoded JSON document to atob();
        # its 'extraParams' object is the base of the login request body
        login_data = self._parse_json(
            self._search_regex(
                r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
                'login info', group='value'), None,
            transform_source=lambda x: compat_b64decode(x).decode('utf-8')
        )['extraParams']

        login_data.update({
            'client_id': self._CLIENT_ID,
            'redirect_uri': self._ORIGIN_URL,
            'tenant': 'lacausers',
            'connection': 'Username-Password-ACG-Proxy',
            'username': username,
            'password': password,
            'sso': 'true',
        })

        # Final URL of the authorize request (after any redirects); sent as
        # the Referer of the two following requests
        login_state_url = urlh.geturl()

        try:
            login_page = self._download_webpage(
                'https://login.linuxacademy.com/usernamepassword/login', None,
                'Downloading login page', data=json.dumps(login_data).encode(),
                headers={
                    'Content-Type': 'application/json',
                    'Origin': 'https://login.linuxacademy.com',
                    'Referer': login_state_url,
                })
        except ExtractorError as e:
            # A 401 response body is JSON describing why the login failed
            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
                error = self._parse_json(e.cause.read(), None)
                message = error.get('description') or error['code']
                raise ExtractorError(
                    '%s said: %s' % (self.IE_NAME, message), expected=True)
            raise

        callback_page, urlh = self._download_webpage_handle(
            'https://login.linuxacademy.com/login/callback', None,
            'Downloading callback page',
            data=urlencode_postdata(self._hidden_inputs(login_page)),
            headers={
                'Content-Type': 'application/x-www-form-urlencoded',
                'Origin': 'https://login.linuxacademy.com',
                'Referer': login_state_url,
            })

        # Prefer a token carried in the final callback URL; otherwise fall
        # back to the authorizationResponse object embedded in the page
        access_token = self._search_regex(
            r'access_token=([^=&]+)', urlh.geturl(),
            'access token', default=None)
        if not access_token:
            access_token = self._parse_json(
                self._search_regex(
                    r'authorizationResponse\s*=\s*({.+?})\s*;', callback_page,
                    'authorization response'), None,
                transform_source=js_to_json)['response']['access_token']

        self._download_webpage(
            'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s'
            % access_token, None, 'Downloading token validation page')

    def _real_extract(self, url):
        """Download the page and dispatch to course or lesson extraction.

        Returns a playlist dict for module (course) URLs and a video info
        dict for lesson URLs. The item id is the course id for modules and
        '<chapter_id>-<lesson_id>' for lessons.
        """
        mobj = self._match_valid_url(url)
        chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id')
        item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id)

        webpage = self._download_webpage(url, item_id)

        if course_id:
            return self._extract_course_playlist(url, course_id, webpage)
        return self._extract_lesson_video(item_id, webpage)

    def _extract_course_playlist(self, url, course_id, webpage):
        """Build the playlist of lessons from a module page's window.module."""
        module = self._parse_json(
            self._search_regex(
                r'window\.module\s*=\s*({(?:(?!};)[^"]|"([^"]|\\")*")+})\s*;', webpage, 'module'),
            course_id)
        entries = []
        # Chapter state carried over from the most recent 'section' item
        chapter_number = None
        chapter = None
        chapter_id = None
        for item in module['items']:
            if not isinstance(item, dict):
                continue

            def type_field(key):
                # Lower-cased item['type'][key], or '' if missing / not a string
                return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower()
            type_fields = (type_field('name'), type_field('slug'))
            # Move to next module section
            if 'section' in type_fields:
                chapter = item.get('course_name')
                chapter_id = item.get('course_module')
                chapter_number = 1 if not chapter_number else chapter_number + 1
                continue
            # Skip non-lessons
            if 'lesson' not in type_fields:
                continue
            lesson_url = urljoin(url, item.get('url'))
            if not lesson_url:
                continue
            title = item.get('title') or item.get('lesson_name')
            description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text'))
            entries.append({
                '_type': 'url_transparent',
                'url': lesson_url,
                'ie_key': LinuxAcademyIE.ie_key(),
                'title': title,
                'description': description,
                'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')),
                'duration': parse_duration(item.get('duration')),
                'chapter': chapter,
                'chapter_id': chapter_id,
                'chapter_number': chapter_number,
            })
        return {
            '_type': 'playlist',
            'entries': entries,
            'id': course_id,
            'title': module.get('title'),
            'description': module.get('md_desc') or clean_html(module.get('desc')),
            'duration': parse_duration(module.get('duration')),
        }

    def _extract_lesson_video(self, item_id, webpage):
        """Build the info dict for a single lesson page.

        Formats come from the first entry of player.playlist. Metadata comes
        from the window.lesson / player.lesson object when one is found, and
        the title falls back to markup on the page otherwise.
        """
        m3u8_url = self._parse_json(
            self._search_regex(
                r'player\.playlist\s*=\s*(\[.+?\])\s*;', webpage, 'playlist'),
            item_id)[0]['file']
        formats = self._extract_m3u8_formats(
            m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native',
            m3u8_id='hls')
        self._sort_formats(formats)
        info = {
            'id': item_id,
            'formats': formats,
        }
        # The lesson object is optional: default to '{}' when neither pattern
        # matches and tolerate unparsable JSON (fatal=False)
        lesson = self._parse_json(
            self._search_regex(
                (r'window\.lesson\s*=\s*({.+?})\s*;',
                 r'player\.lesson\s*=\s*({.+?})\s*;'),
                webpage, 'lesson', default='{}'), item_id, fatal=False)
        if lesson:
            info.update({
                'title': lesson.get('lesson_name'),
                'description': lesson.get('md_desc') or clean_html(lesson.get('desc')),
                'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')),
                'duration': parse_duration(lesson.get('duration')),
            })
        if not info.get('title'):
            info['title'] = self._search_regex(
                (r'>Lecture\s*:\s*(?P<value>[^<]+)',
                 r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
                'title', group='value')
        return info
Commit	Line	Data
659e93fc S	1	from __future__ import unicode_literals
	2
	3	import json
	4	import random
659e93fc S	5
	6	from .common import InfoExtractor
	7	from ..compat import (
	8	compat_b64decode,
	9	compat_HTTPError,
29f7c58a	10	compat_str,
659e93fc S	11	)
659e93fc S	12	from ..utils import (
29f7c58a	13	clean_html,
659e93fc	14	ExtractorError,
29f7c58a	15	js_to_json,
	16	parse_duration,
	17	try_get,
	18	unified_timestamp,
659e93fc S	19	urlencode_postdata,
	20	urljoin,
	21	)
	22
	23
	24	class LinuxAcademyIE(InfoExtractor):
	25	_VALID_URL = r'''(?x)
	26	https?://
	27	(?:www\.)?linuxacademy\.com/cp/
	28	(?:
	29	courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)\|
	30	modules/view/id/(?P<course_id>\d+)
	31	)
	32	'''
	33	_TESTS = [{
29f7c58a	34	'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675',
659e93fc	35	'info_dict': {
29f7c58a	36	'id': '7971-2',
659e93fc	37	'ext': 'mp4',
29f7c58a	38	'title': 'What Is Data Science',
29f7c58a	39	'description': 'md5:c574a3c20607144fb36cb65bdde76c99',
037cc66e	40	'timestamp': int, # The timestamp and upload date changes
037cc66e	41	'upload_date': r're:\d+',
29f7c58a	42	'duration': 304,
659e93fc S	43	},
	44	'params': {
	45	'skip_download': True,
	46	},
	47	'skip': 'Requires Linux Academy account credentials',
	48	}, {
	49	'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2',
	50	'only_matching': True,
	51	}, {
	52	'url': 'https://linuxacademy.com/cp/modules/view/id/154',
	53	'info_dict': {
	54	'id': '154',
	55	'title': 'AWS Certified Cloud Practitioner',
29f7c58a	56	'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c',
29f7c58a	57	'duration': 28835,
659e93fc S	58	},
	59	'playlist_count': 41,
	60	'skip': 'Requires Linux Academy account credentials',
037cc66e	61	}, {
	62	'url': 'https://linuxacademy.com/cp/modules/view/id/39',
	63	'info_dict': {
	64	'id': '39',
	65	'title': 'Red Hat Certified Systems Administrator - RHCSA (EX200) Exam Prep (legacy)',
	66	'description': 'md5:0f1d3369e90c3fb14a79813b863c902f',
	67	'duration': 89280,
	68	},
	69	'playlist_count': 73,
	70	'skip': 'Requires Linux Academy account credentials',
659e93fc S	71	}]
	72
	73	_AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
	74	_ORIGIN_URL = 'https://linuxacademy.com'
	75	_CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
	76	_NETRC_MACHINE = 'linuxacademy'
	77
52efa4b3	78	def _perform_login(self, username, password):
659e93fc S	79	def random_string():
	80	return ''.join([
	81	random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~')
	82	for _ in range(32)])
	83
	84	webpage, urlh = self._download_webpage_handle(
	85	self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
	86	'client_id': self._CLIENT_ID,
	87	'response_type': 'token id_token',
29f7c58a	88	'response_mode': 'web_message',
659e93fc S	89	'redirect_uri': self._ORIGIN_URL,
	90	'scope': 'openid email user_impersonation profile',
	91	'audience': self._ORIGIN_URL,
	92	'state': random_string(),
	93	'nonce': random_string(),
	94	})
	95
	96	login_data = self._parse_json(
	97	self._search_regex(
	98	r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
	99	'login info', group='value'), None,
	100	transform_source=lambda x: compat_b64decode(x).decode('utf-8')
	101	)['extraParams']
	102
	103	login_data.update({
	104	'client_id': self._CLIENT_ID,
	105	'redirect_uri': self._ORIGIN_URL,
	106	'tenant': 'lacausers',
3700c7ef	107	'connection': 'Username-Password-ACG-Proxy',
659e93fc S	108	'username': username,
	109	'password': password,
	110	'sso': 'true',
	111	})
	112
7947a1f7	113	login_state_url = urlh.geturl()
659e93fc S	114
	115	try:
	116	login_page = self._download_webpage(
	117	'https://login.linuxacademy.com/usernamepassword/login', None,
	118	'Downloading login page', data=json.dumps(login_data).encode(),
	119	headers={
	120	'Content-Type': 'application/json',
	121	'Origin': 'https://login.linuxacademy.com',
	122	'Referer': login_state_url,
	123	})
	124	except ExtractorError as e:
	125	if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
	126	error = self._parse_json(e.cause.read(), None)
	127	message = error.get('description') or error['code']
	128	raise ExtractorError(
	129	'%s said: %s' % (self.IE_NAME, message), expected=True)
	130	raise
	131
	132	callback_page, urlh = self._download_webpage_handle(
	133	'https://login.linuxacademy.com/login/callback', None,
	134	'Downloading callback page',
	135	data=urlencode_postdata(self._hidden_inputs(login_page)),
	136	headers={
	137	'Content-Type': 'application/x-www-form-urlencoded',
	138	'Origin': 'https://login.linuxacademy.com',
	139	'Referer': login_state_url,
	140	})
	141
	142	access_token = self._search_regex(
7947a1f7	143	r'access_token=([^=&]+)', urlh.geturl(),
29f7c58a	144	'access token', default=None)
	145	if not access_token:
	146	access_token = self._parse_json(
	147	self._search_regex(
	148	r'authorizationResponse\s=\s({.+?})\s*;', callback_page,
	149	'authorization response'), None,
	150	transform_source=js_to_json)['response']['access_token']
659e93fc S	151
	152	self._download_webpage(
	153	'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s'
	154	% access_token, None, 'Downloading token validation page')
	155
	156	def _real_extract(self, url):
5ad28e7f	157	mobj = self._match_valid_url(url)
659e93fc S	158	chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id')
	159	item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id)
	160
	161	webpage = self._download_webpage(url, item_id)
	162
	163	# course path
	164	if course_id:
29f7c58a	165	module = self._parse_json(
29f7c58a	166	self._search_regex(
037cc66e	167	r'window\.module\s=\s({(?:(?!};)[^"]\|"([^"]\|\\")")+})\s;', webpage, 'module'),
29f7c58a	168	item_id)
	169	entries = []
	170	chapter_number = None
	171	chapter = None
	172	chapter_id = None
	173	for item in module['items']:
	174	if not isinstance(item, dict):
	175	continue
	176
	177	def type_field(key):
	178	return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower()
	179	type_fields = (type_field('name'), type_field('slug'))
	180	# Move to next module section
	181	if 'section' in type_fields:
	182	chapter = item.get('course_name')
	183	chapter_id = item.get('course_module')
	184	chapter_number = 1 if not chapter_number else chapter_number + 1
	185	continue
	186	# Skip non-lessons
	187	if 'lesson' not in type_fields:
	188	continue
	189	lesson_url = urljoin(url, item.get('url'))
	190	if not lesson_url:
	191	continue
	192	title = item.get('title') or item.get('lesson_name')
	193	description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text'))
	194	entries.append({
	195	'_type': 'url_transparent',
	196	'url': lesson_url,
	197	'ie_key': LinuxAcademyIE.ie_key(),
	198	'title': title,
	199	'description': description,
	200	'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')),
	201	'duration': parse_duration(item.get('duration')),
	202	'chapter': chapter,
	203	'chapter_id': chapter_id,
	204	'chapter_number': chapter_number,
	205	})
	206	return {
	207	'_type': 'playlist',
	208	'entries': entries,
	209	'id': course_id,
	210	'title': module.get('title'),
	211	'description': module.get('md_desc') or clean_html(module.get('desc')),
	212	'duration': parse_duration(module.get('duration')),
	213	}
659e93fc S	214
659e93fc S	215	# single video path
29f7c58a	216	m3u8_url = self._parse_json(
	217	self._search_regex(
	218	r'player\.playlist\s=\s(\[.+?\])\s*;', webpage, 'playlist'),
	219	item_id)[0]['file']
	220	formats = self._extract_m3u8_formats(
	221	m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native',
	222	m3u8_id='hls')
	223	self._sort_formats(formats)
	224	info = {
659e93fc	225	'id': item_id,
29f7c58a	226	'formats': formats,
	227	}
	228	lesson = self._parse_json(
	229	self._search_regex(
	230	(r'window\.lesson\s=\s({.+?})\s*;',
	231	r'player\.lesson\s=\s({.+?})\s*;'),
	232	webpage, 'lesson', default='{}'), item_id, fatal=False)
	233	if lesson:
	234	info.update({
	235	'title': lesson.get('lesson_name'),
	236	'description': lesson.get('md_desc') or clean_html(lesson.get('desc')),
	237	'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')),
	238	'duration': parse_duration(lesson.get('duration')),
	239	})
	240	if not info.get('title'):
	241	info['title'] = self._search_regex(
	242	(r'>Lecture\s:\s(?P<value>[^<]+)',
	243	r'lessonName\s=\s(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
	244	'title', group='value')
659e93fc	245	return info