[yt-dlp.git] / yt_dlp / extractor / linuxacademy.py

from __future__ import unicode_literals

import json
import random
import re

from .common import InfoExtractor
from ..compat import (
    compat_b64decode,
    compat_HTTPError,
    compat_str,
)
from ..utils import (
    clean_html,
    ExtractorError,
    js_to_json,
    parse_duration,
    try_get,
    unified_timestamp,
    urlencode_postdata,
    urljoin,
)


class LinuxAcademyIE(InfoExtractor):
    """Extractor for linuxacademy.com lessons and course modules.

    A lesson URL (``/cp/courses/lesson/course/<id>/lesson/<id>``) yields a
    single HLS video.  A module URL (``/cp/modules/view/id/<id>``) yields a
    playlist whose entries point back at the individual lesson URLs.
    """

    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?linuxacademy\.com/cp/
                        (?:
                            courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)|
                            modules/view/id/(?P<course_id>\d+)
                        )
                    '''
    _TESTS = [{
        'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675',
        'info_dict': {
            'id': '7971-2',
            'ext': 'mp4',
            'title': 'What Is Data Science',
            'description': 'md5:c574a3c20607144fb36cb65bdde76c99',
            'timestamp': int,  # The timestamp and upload date changes
            'upload_date': r're:\d+',
            'duration': 304,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Requires Linux Academy account credentials',
    }, {
        'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2',
        'only_matching': True,
    }, {
        'url': 'https://linuxacademy.com/cp/modules/view/id/154',
        'info_dict': {
            'id': '154',
            'title': 'AWS Certified Cloud Practitioner',
            'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c',
            'duration': 28835,
        },
        'playlist_count': 41,
        'skip': 'Requires Linux Academy account credentials',
    }, {
        'url': 'https://linuxacademy.com/cp/modules/view/id/39',
        'info_dict': {
            'id': '39',
            'title': 'Red Hat Certified Systems Administrator - RHCSA (EX200) Exam Prep  (legacy)',
            'description': 'md5:0f1d3369e90c3fb14a79813b863c902f',
            'duration': 89280,
        },
        'playlist_count': 73,
        'skip': 'Requires Linux Academy account credentials',
    }]

    # Endpoints and identifiers used by the login flow in _login().
    # NOTE(review): the endpoint layout looks like Auth0's hosted login - confirm.
    _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
    _ORIGIN_URL = 'https://linuxacademy.com'
    _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
    _NETRC_MACHINE = 'linuxacademy'

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def _login(self):
        """Log in with the configured credentials, if there are any.

        Does nothing when no username is available.  Otherwise it:

        1. downloads the authorize page and decodes the base64 JSON blob
           passed to ``atob(...)`` there, keeping its ``extraParams``;
        2. posts those parameters plus the credentials as JSON to the
           username/password login endpoint;
        3. posts the hidden form inputs of the response to the callback
           endpoint;
        4. takes the access token from the callback's final URL or, failing
           that, from the ``authorizationResponse`` object in the page;
        5. requests the site's token validation page with that token.

        Raises ExtractorError (``expected=True``) carrying the server's own
        message when the login endpoint answers 401 with a usable JSON error
        body; any other failure propagates unchanged.
        """
        username, password = self._get_login_info()
        if username is None:
            return

        def random_string():
            # 32 characters from the RFC 3986 "unreserved" set
            # (ALPHA / DIGIT / "-" / "." / "_" / "~"), used for the
            # ``state`` and ``nonce`` query parameters below.
            return ''.join([
                random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-._~')
                for _ in range(32)])

        webpage, urlh = self._download_webpage_handle(
            self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
                'client_id': self._CLIENT_ID,
                'response_type': 'token id_token',
                'response_mode': 'web_message',
                'redirect_uri': self._ORIGIN_URL,
                'scope': 'openid email user_impersonation profile',
                'audience': self._ORIGIN_URL,
                'state': random_string(),
                'nonce': random_string(),
            })

        # The page embeds its configuration as a base64 string handed to
        # atob(); decode it and keep only the ``extraParams`` member.
        login_data = self._parse_json(
            self._search_regex(
                r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
                'login info', group='value'), None,
            transform_source=lambda x: compat_b64decode(x).decode('utf-8')
        )['extraParams']

        login_data.update({
            'client_id': self._CLIENT_ID,
            'redirect_uri': self._ORIGIN_URL,
            'tenant': 'lacausers',
            'connection': 'Username-Password-ACG-Proxy',
            'username': username,
            'password': password,
            'sso': 'true',
        })

        # Final URL of the authorize request (after any redirects); sent as
        # the Referer of the two follow-up requests.
        login_state_url = urlh.geturl()

        try:
            login_page = self._download_webpage(
                'https://login.linuxacademy.com/usernamepassword/login', None,
                'Downloading login page', data=json.dumps(login_data).encode(),
                headers={
                    'Content-Type': 'application/json',
                    'Origin': 'https://login.linuxacademy.com',
                    'Referer': login_state_url,
                })
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
                # Best effort: surface the server's explanation, but never
                # let an unparsable or unexpected error body replace the
                # original HTTP failure with a parse error or KeyError.
                error = self._parse_json(e.cause.read(), None, fatal=False)
                if isinstance(error, dict):
                    message = error.get('description') or error.get('code')
                    if message:
                        raise ExtractorError(
                            '%s said: %s' % (self.IE_NAME, message), expected=True)
            raise

        callback_page, urlh = self._download_webpage_handle(
            'https://login.linuxacademy.com/login/callback', None,
            'Downloading callback page',
            data=urlencode_postdata(self._hidden_inputs(login_page)),
            headers={
                'Content-Type': 'application/x-www-form-urlencoded',
                'Origin': 'https://login.linuxacademy.com',
                'Referer': login_state_url,
            })

        # Prefer a token carried in the final callback URL; otherwise read
        # it from the JavaScript object embedded in the callback page.
        access_token = self._search_regex(
            r'access_token=([^=&]+)', urlh.geturl(),
            'access token', default=None)
        if not access_token:
            access_token = self._parse_json(
                self._search_regex(
                    r'authorizationResponse\s*=\s*({.+?})\s*;', callback_page,
                    'authorization response'), None,
                transform_source=js_to_json)['response']['access_token']

        self._download_webpage(
            'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s'
            % access_token, None, 'Downloading token validation page')

    def _real_extract(self, url):
        """Extract a module playlist or a single lesson from ``url``.

        For a module URL, returns a ``playlist`` dict with one
        ``url_transparent`` entry per lesson item found in the page's
        ``window.module`` object.  For a lesson URL, returns an info dict
        with HLS formats and whatever metadata the page's lesson object
        provides; the title falls back to a search of the page markup.
        """
        mobj = re.match(self._VALID_URL, url)
        chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id')
        # Module pages are identified by the course id alone, lessons by
        # "<course>-<lesson>".
        item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id)

        webpage = self._download_webpage(url, item_id)

        # course path
        if course_id:
            module = self._parse_json(
                self._search_regex(
                    r'window\.module\s*=\s*({(?:(?!};)[^"]|"([^"]|\\")*")+})\s*;', webpage, 'module'),
                item_id)
            entries = []
            # Chapter state, updated each time a "section" item is met and
            # attached to every lesson that follows it.
            chapter_number = None
            chapter = None
            chapter_id = None
            for item in module['items']:
                if not isinstance(item, dict):
                    continue

                def type_field(key):
                    # Lower-cased item['type'][key], or '' when it is
                    # missing or not a string.
                    return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower()
                type_fields = (type_field('name'), type_field('slug'))
                # Move to next module section
                if 'section' in type_fields:
                    chapter = item.get('course_name')
                    chapter_id = item.get('course_module')
                    chapter_number = 1 if not chapter_number else chapter_number + 1
                    continue
                # Skip non-lessons
                if 'lesson' not in type_fields:
                    continue
                lesson_url = urljoin(url, item.get('url'))
                if not lesson_url:
                    continue
                title = item.get('title') or item.get('lesson_name')
                description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text'))
                entries.append({
                    '_type': 'url_transparent',
                    'url': lesson_url,
                    'ie_key': LinuxAcademyIE.ie_key(),
                    'title': title,
                    'description': description,
                    'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')),
                    'duration': parse_duration(item.get('duration')),
                    'chapter': chapter,
                    'chapter_id': chapter_id,
                    'chapter_number': chapter_number,
                })
            return {
                '_type': 'playlist',
                'entries': entries,
                'id': course_id,
                'title': module.get('title'),
                'description': module.get('md_desc') or clean_html(module.get('desc')),
                'duration': parse_duration(module.get('duration')),
            }

        # single video path
        # The first entry of the player's playlist carries the HLS manifest.
        m3u8_url = self._parse_json(
            self._search_regex(
                r'player\.playlist\s*=\s*(\[.+?\])\s*;', webpage, 'playlist'),
            item_id)[0]['file']
        formats = self._extract_m3u8_formats(
            m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native',
            m3u8_id='hls')
        self._sort_formats(formats)
        info = {
            'id': item_id,
            'formats': formats,
        }
        # Lesson metadata is optional: a missing or unparsable object leaves
        # ``lesson`` falsy and only the title fallback below applies.
        lesson = self._parse_json(
            self._search_regex(
                (r'window\.lesson\s*=\s*({.+?})\s*;',
                 r'player\.lesson\s*=\s*({.+?})\s*;'),
                webpage, 'lesson', default='{}'), item_id, fatal=False)
        if lesson:
            info.update({
                'title': lesson.get('lesson_name'),
                'description': lesson.get('md_desc') or clean_html(lesson.get('desc')),
                'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')),
                'duration': parse_duration(lesson.get('duration')),
            })
        if not info.get('title'):
            info['title'] = self._search_regex(
                (r'>Lecture\s*:\s*(?P<value>[^<]+)',
                 r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
                'title', group='value')
        return info
Commit	Line	Data
659e93fc S	1	from __future__ import unicode_literals
	2
	3	import json
	4	import random
	5	import re
	6
	7	from .common import InfoExtractor
	8	from ..compat import (
	9	compat_b64decode,
	10	compat_HTTPError,
29f7c58a	11	compat_str,
659e93fc S	12	)
659e93fc S	13	from ..utils import (
29f7c58a	14	clean_html,
659e93fc	15	ExtractorError,
29f7c58a	16	js_to_json,
	17	parse_duration,
	18	try_get,
	19	unified_timestamp,
659e93fc S	20	urlencode_postdata,
	21	urljoin,
	22	)
	23
	24
	25	class LinuxAcademyIE(InfoExtractor):
	26	_VALID_URL = r'''(?x)
	27	https?://
	28	(?:www\.)?linuxacademy\.com/cp/
	29	(?:
	30	courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)\|
	31	modules/view/id/(?P<course_id>\d+)
	32	)
	33	'''
	34	_TESTS = [{
29f7c58a	35	'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675',
659e93fc	36	'info_dict': {
29f7c58a	37	'id': '7971-2',
659e93fc	38	'ext': 'mp4',
29f7c58a	39	'title': 'What Is Data Science',
29f7c58a	40	'description': 'md5:c574a3c20607144fb36cb65bdde76c99',
037cc66e	41	'timestamp': int, # The timestamp and upload date changes
037cc66e	42	'upload_date': r're:\d+',
29f7c58a	43	'duration': 304,
659e93fc S	44	},
	45	'params': {
	46	'skip_download': True,
	47	},
	48	'skip': 'Requires Linux Academy account credentials',
	49	}, {
	50	'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2',
	51	'only_matching': True,
	52	}, {
	53	'url': 'https://linuxacademy.com/cp/modules/view/id/154',
	54	'info_dict': {
	55	'id': '154',
	56	'title': 'AWS Certified Cloud Practitioner',
29f7c58a	57	'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c',
29f7c58a	58	'duration': 28835,
659e93fc S	59	},
	60	'playlist_count': 41,
	61	'skip': 'Requires Linux Academy account credentials',
037cc66e	62	}, {
	63	'url': 'https://linuxacademy.com/cp/modules/view/id/39',
	64	'info_dict': {
	65	'id': '39',
	66	'title': 'Red Hat Certified Systems Administrator - RHCSA (EX200) Exam Prep (legacy)',
	67	'description': 'md5:0f1d3369e90c3fb14a79813b863c902f',
	68	'duration': 89280,
	69	},
	70	'playlist_count': 73,
	71	'skip': 'Requires Linux Academy account credentials',
659e93fc S	72	}]
	73
	74	_AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
	75	_ORIGIN_URL = 'https://linuxacademy.com'
	76	_CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
	77	_NETRC_MACHINE = 'linuxacademy'
	78
	79	def _real_initialize(self):
	80	self._login()
	81
	82	def _login(self):
	83	username, password = self._get_login_info()
	84	if username is None:
	85	return
	86
	87	def random_string():
	88	return ''.join([
	89	random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~')
	90	for _ in range(32)])
	91
	92	webpage, urlh = self._download_webpage_handle(
	93	self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
	94	'client_id': self._CLIENT_ID,
	95	'response_type': 'token id_token',
29f7c58a	96	'response_mode': 'web_message',
659e93fc S	97	'redirect_uri': self._ORIGIN_URL,
	98	'scope': 'openid email user_impersonation profile',
	99	'audience': self._ORIGIN_URL,
	100	'state': random_string(),
	101	'nonce': random_string(),
	102	})
	103
	104	login_data = self._parse_json(
	105	self._search_regex(
	106	r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
	107	'login info', group='value'), None,
	108	transform_source=lambda x: compat_b64decode(x).decode('utf-8')
	109	)['extraParams']
	110
	111	login_data.update({
	112	'client_id': self._CLIENT_ID,
	113	'redirect_uri': self._ORIGIN_URL,
	114	'tenant': 'lacausers',
3700c7ef	115	'connection': 'Username-Password-ACG-Proxy',
659e93fc S	116	'username': username,
	117	'password': password,
	118	'sso': 'true',
	119	})
	120
7947a1f7	121	login_state_url = urlh.geturl()
659e93fc S	122
	123	try:
	124	login_page = self._download_webpage(
	125	'https://login.linuxacademy.com/usernamepassword/login', None,
	126	'Downloading login page', data=json.dumps(login_data).encode(),
	127	headers={
	128	'Content-Type': 'application/json',
	129	'Origin': 'https://login.linuxacademy.com',
	130	'Referer': login_state_url,
	131	})
	132	except ExtractorError as e:
	133	if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
	134	error = self._parse_json(e.cause.read(), None)
	135	message = error.get('description') or error['code']
	136	raise ExtractorError(
	137	'%s said: %s' % (self.IE_NAME, message), expected=True)
	138	raise
	139
	140	callback_page, urlh = self._download_webpage_handle(
	141	'https://login.linuxacademy.com/login/callback', None,
	142	'Downloading callback page',
	143	data=urlencode_postdata(self._hidden_inputs(login_page)),
	144	headers={
	145	'Content-Type': 'application/x-www-form-urlencoded',
	146	'Origin': 'https://login.linuxacademy.com',
	147	'Referer': login_state_url,
	148	})
	149
	150	access_token = self._search_regex(
7947a1f7	151	r'access_token=([^=&]+)', urlh.geturl(),
29f7c58a	152	'access token', default=None)
	153	if not access_token:
	154	access_token = self._parse_json(
	155	self._search_regex(
	156	r'authorizationResponse\s=\s({.+?})\s*;', callback_page,
	157	'authorization response'), None,
	158	transform_source=js_to_json)['response']['access_token']
659e93fc S	159
	160	self._download_webpage(
	161	'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s'
	162	% access_token, None, 'Downloading token validation page')
	163
	164	def _real_extract(self, url):
	165	mobj = re.match(self._VALID_URL, url)
	166	chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id')
	167	item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id)
	168
	169	webpage = self._download_webpage(url, item_id)
	170
	171	# course path
	172	if course_id:
29f7c58a	173	module = self._parse_json(
29f7c58a	174	self._search_regex(
037cc66e	175	r'window\.module\s=\s({(?:(?!};)[^"]\|"([^"]\|\\")")+})\s;', webpage, 'module'),
29f7c58a	176	item_id)
	177	entries = []
	178	chapter_number = None
	179	chapter = None
	180	chapter_id = None
	181	for item in module['items']:
	182	if not isinstance(item, dict):
	183	continue
	184
	185	def type_field(key):
	186	return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower()
	187	type_fields = (type_field('name'), type_field('slug'))
	188	# Move to next module section
	189	if 'section' in type_fields:
	190	chapter = item.get('course_name')
	191	chapter_id = item.get('course_module')
	192	chapter_number = 1 if not chapter_number else chapter_number + 1
	193	continue
	194	# Skip non-lessons
	195	if 'lesson' not in type_fields:
	196	continue
	197	lesson_url = urljoin(url, item.get('url'))
	198	if not lesson_url:
	199	continue
	200	title = item.get('title') or item.get('lesson_name')
	201	description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text'))
	202	entries.append({
	203	'_type': 'url_transparent',
	204	'url': lesson_url,
	205	'ie_key': LinuxAcademyIE.ie_key(),
	206	'title': title,
	207	'description': description,
	208	'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')),
	209	'duration': parse_duration(item.get('duration')),
	210	'chapter': chapter,
	211	'chapter_id': chapter_id,
	212	'chapter_number': chapter_number,
	213	})
	214	return {
	215	'_type': 'playlist',
	216	'entries': entries,
	217	'id': course_id,
	218	'title': module.get('title'),
	219	'description': module.get('md_desc') or clean_html(module.get('desc')),
	220	'duration': parse_duration(module.get('duration')),
	221	}
659e93fc S	222
659e93fc S	223	# single video path
29f7c58a	224	m3u8_url = self._parse_json(
	225	self._search_regex(
	226	r'player\.playlist\s=\s(\[.+?\])\s*;', webpage, 'playlist'),
	227	item_id)[0]['file']
	228	formats = self._extract_m3u8_formats(
	229	m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native',
	230	m3u8_id='hls')
	231	self._sort_formats(formats)
	232	info = {
659e93fc	233	'id': item_id,
29f7c58a	234	'formats': formats,
	235	}
	236	lesson = self._parse_json(
	237	self._search_regex(
	238	(r'window\.lesson\s=\s({.+?})\s*;',
	239	r'player\.lesson\s=\s({.+?})\s*;'),
	240	webpage, 'lesson', default='{}'), item_id, fatal=False)
	241	if lesson:
	242	info.update({
	243	'title': lesson.get('lesson_name'),
	244	'description': lesson.get('md_desc') or clean_html(lesson.get('desc')),
	245	'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')),
	246	'duration': parse_duration(lesson.get('duration')),
	247	})
	248	if not info.get('title'):
	249	info['title'] = self._search_regex(
	250	(r'>Lecture\s:\s(?P<value>[^<]+)',
	251	r'lessonName\s=\s(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
	252	'title', group='value')
659e93fc	253	return info