[yt-dlp.git] / youtube_dl / extractor / platzi.py

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..compat import (
    compat_b64decode,
    compat_str,
)
from ..utils import (
    clean_html,
    ExtractorError,
    int_or_none,
    str_or_none,
    try_get,
    url_or_none,
    urlencode_postdata,
    urljoin,
)


class PlatziBaseIE(InfoExtractor):
    # Common base for the Platzi extractors: stores the login endpoint and
    # runs the (optional) account login when the extractor is initialized.
    _LOGIN_URL = 'https://platzi.com/login/'
    _NETRC_MACHINE = 'platzi'

    def _real_initialize(self):
        self._login()

    def _login(self):
        """Submit the login form with the configured credentials.

        Returns silently when _get_login_info() yields no username, or when
        the login POST ends up on a URL that does not contain
        'platzi.com/login'.

        Raises ExtractorError when the response is still the login page:
        with the site's message (expected=True) if one of the known error
        fields is set, otherwise with a generic message.
        """
        username, password = self._get_login_info()
        if username is None:
            return

        # Start from the hidden inputs of the login page and add the
        # credential fields on top of them
        form_fields = self._hidden_inputs(self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login page'))
        form_fields['email'] = username
        form_fields['password'] = password

        response = self._request_webpage(
            self._LOGIN_URL, None, 'Logging in',
            data=urlencode_postdata(form_fields),
            headers={'Referer': self._LOGIN_URL})

        # Ending up anywhere other than the login page counts as success
        final_url = response.geturl()
        if 'platzi.com/login' not in final_url:
            return

        error_page = self._webpage_read_content(
            response, self._LOGIN_URL, None, 'Downloading login error page')

        # The error page carries a "login = {...}" JSON object describing
        # what went wrong
        login_json = self._search_regex(
            r'login\s*=\s*({.+?})(?:\s*;|\s*</script)', error_page, 'login')
        login_state = self._parse_json(login_json, None)

        # Report the first non-empty error field, probing them lazily in
        # this fixed order
        candidates = (
            str_or_none(login_state.get('%sError' % kind))
            for kind in ('error', 'password', 'nonFields'))
        message = next((text for text in candidates if text), None)
        if message:
            raise ExtractorError(
                'Unable to login: %s' % message, expected=True)
        raise ExtractorError('Unable to log in')


class PlatziIE(PlatziBaseIE):
    # Extractor for a single Platzi lecture page (es and en sites).
    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            platzi\.com/clases|           # es version
                            courses\.platzi\.com/classes  # en version
                        )/[^/]+/(?P<id>\d+)-[^/?\#&]+
                    '''

    _TESTS = [{
        'url': 'https://platzi.com/clases/1311-next-js/12074-creando-nuestra-primera-pagina/',
        'md5': '8f56448241005b561c10f11a595b37e3',
        'info_dict': {
            'id': '12074',
            'ext': 'mp4',
            'title': 'Creando nuestra primera página',
            'description': 'md5:4c866e45034fc76412fbf6e60ae008bc',
            'duration': 420,
        },
        'skip': 'Requires platzi account credentials',
    }, {
        'url': 'https://courses.platzi.com/classes/1367-communication-codestream/13430-background/',
        'info_dict': {
            'id': '13430',
            'ext': 'mp4',
            'title': 'Background',
            'description': 'md5:49c83c09404b15e6e71defaf87f6b305',
            'duration': 360,
        },
        'skip': 'Requires platzi account credentials',
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Extract one lecture from the page's embedded client_data JSON.

        Reads title, description and duration from
        client_data['initialState']['material'] and collects HLS and DASH
        formats from every entry of its 'videos' mapping.
        """
        lecture_id = self._match_id(url)
        webpage = self._download_webpage(url, lecture_id)

        # client_data may contain "};", so the stricter pattern (statement
        # terminated by a newline) is tried before the looser one
        client_data_json = self._search_regex(
            (r'client_data\s*=\s*({.+?})\s*;\s*\n',
             r'client_data\s*=\s*({.+?})\s*;'),
            webpage, 'client data')
        client_data = self._parse_json(client_data_json, lecture_id)

        material = client_data['initialState']['material']
        desc = material['description']
        title = desc['title']

        formats = []
        for server_id, server in material['videos'].items():
            if not isinstance(server, dict):
                continue

            # For each server, HLS is probed first and DASH second; both
            # manifest downloads are non-fatal
            hls_url = url_or_none(server.get('hls'))
            if hls_url:
                formats.extend(self._extract_m3u8_formats(
                    hls_url, lecture_id, 'mp4',
                    entry_protocol='m3u8_native', m3u8_id='hls',
                    note='Downloading %s m3u8 information' % server_id,
                    fatal=False))

            dash_url = url_or_none(server.get('dash'))
            if dash_url:
                formats.extend(self._extract_mpd_formats(
                    dash_url, lecture_id, mpd_id='dash',
                    note='Downloading %s MPD manifest' % server_id,
                    fatal=False))
        self._sort_formats(formats)

        # 'content' is base64-decoded, read as UTF-8 and then passed through
        # clean_html; it is optional
        description = None
        encoded_content = str_or_none(desc.get('content'))
        if encoded_content:
            decoded_content = compat_b64decode(encoded_content).decode('utf-8')
            description = clean_html(decoded_content)

        # NOTE(review): invscale=60 presumably converts a duration given in
        # minutes to seconds - confirm against int_or_none
        duration = int_or_none(material.get('duration'), invscale=60)

        return {
            'id': lecture_id,
            'title': title,
            'description': description,
            'duration': duration,
            'formats': formats,
        }


class PlatziCourseIE(PlatziBaseIE):
    # Extractor for a whole Platzi course (es and en sites); yields the
    # course's video lectures as a playlist handled by PlatziIE.
    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            platzi\.com/clases|           # es version
                            courses\.platzi\.com/classes  # en version
                        )/(?P<id>[^/?\#&]+)
                    '''
    _TESTS = [{
        'url': 'https://platzi.com/clases/next-js/',
        'info_dict': {
            'id': '1311',
            'title': 'Curso de Next.js',
        },
        'playlist_count': 22,
    }, {
        'url': 'https://courses.platzi.com/classes/communication-codestream/',
        'info_dict': {
            'id': '1367',
            'title': 'Codestream Course',
        },
        'playlist_count': 14,
    }]

    @classmethod
    def suitable(cls, url):
        # URLs that PlatziIE accepts (single lectures) are left to it
        return False if PlatziIE.suitable(url) else super(PlatziCourseIE, cls).suitable(url)

    def _real_extract(self, url):
        """Build a playlist of the video lectures of a course.

        Parses the "data = {...}" JSON embedded in the course page and walks
        data['initialProps']['concepts'] (the chapters). Every material of
        type 'video' with a usable URL becomes a url_transparent entry
        delegated to PlatziIE and annotated with its chapter metadata.

        The playlist id and title come from initialProps['course']; either
        is None when the page does not provide it.
        """
        course_name = self._match_id(url)

        webpage = self._download_webpage(url, course_name)

        props = self._parse_json(
            self._search_regex(r'data\s*=\s*({.+?})\s*;', webpage, 'data'),
            course_name)['initialProps']

        entries = []
        # chapter_num counts every item of 'concepts', including the ones
        # skipped below
        for chapter_num, chapter in enumerate(props['concepts'], 1):
            if not isinstance(chapter, dict):
                continue
            materials = chapter.get('materials')
            if not materials or not isinstance(materials, list):
                continue
            chapter_title = chapter.get('title')
            chapter_id = str_or_none(chapter.get('id'))
            for material in materials:
                if not isinstance(material, dict):
                    continue
                # Only video materials are extractable
                if material.get('material_type') != 'video':
                    continue
                # Material URLs are resolved against the course page URL
                video_url = urljoin(url, material.get('url'))
                if not video_url:
                    continue
                entries.append({
                    '_type': 'url_transparent',
                    'url': video_url,
                    'title': str_or_none(material.get('name')),
                    'id': str_or_none(material.get('id')),
                    'ie_key': PlatziIE.ie_key(),
                    'chapter': chapter_title,
                    'chapter_number': chapter_num,
                    'chapter_id': chapter_id,
                })

        # str_or_none keeps a missing course id as None; wrapping the lookup
        # in compat_str would turn it into the literal string 'None' and
        # report that as the playlist id
        course_id = str_or_none(try_get(props, lambda x: x['course']['id']))
        course_title = try_get(props, lambda x: x['course']['name'], compat_str)

        return self.playlist_result(entries, course_id, course_title)
Commit	Line	Data
c701472f S	1	# coding: utf-8
	2	from __future__ import unicode_literals
	3
	4	from .common import InfoExtractor
	5	from ..compat import (
	6	compat_b64decode,
	7	compat_str,
	8	)
	9	from ..utils import (
	10	clean_html,
	11	ExtractorError,
	12	int_or_none,
	13	str_or_none,
	14	try_get,
	15	url_or_none,
	16	urlencode_postdata,
	17	urljoin,
	18	)
	19
	20
66d04c74	21	class PlatziBaseIE(InfoExtractor):
c701472f S	22	_LOGIN_URL = 'https://platzi.com/login/'
	23	_NETRC_MACHINE = 'platzi'
	24
c701472f S	25	def _real_initialize(self):
	26	self._login()
	27
	28	def _login(self):
	29	username, password = self._get_login_info()
	30	if username is None:
	31	return
	32
	33	login_page = self._download_webpage(
	34	self._LOGIN_URL, None, 'Downloading login page')
	35
	36	login_form = self._hidden_inputs(login_page)
	37
	38	login_form.update({
	39	'email': username,
	40	'password': password,
	41	})
	42
	43	urlh = self._request_webpage(
	44	self._LOGIN_URL, None, 'Logging in',
	45	data=urlencode_postdata(login_form),
	46	headers={'Referer': self._LOGIN_URL})
	47
	48	# login succeeded
7947a1f7	49	if 'platzi.com/login' not in urlh.geturl():
c701472f S	50	return
	51
	52	login_error = self._webpage_read_content(
	53	urlh, self._LOGIN_URL, None, 'Downloading login error page')
	54
	55	login = self._parse_json(
	56	self._search_regex(
	57	r'login\s*=\s*({.+?})(?:\s*;|\s*</script)', login_error, 'login'),
	58	None)
	59
	60	for kind in ('error', 'password', 'nonFields'):
	61	error = str_or_none(login.get('%sError' % kind))
	62	if error:
	63	raise ExtractorError(
	64	'Unable to login: %s' % error, expected=True)
	65	raise ExtractorError('Unable to log in')
	66
66d04c74 S	67
	68	class PlatziIE(PlatziBaseIE):
	69	_VALID_URL = r'''(?x)
	70	https?://
	71	(?:
	72	platzi\.com/clases| # es version
	73	courses\.platzi\.com/classes # en version
	74	)/[^/]+/(?P<id>\d+)-[^/?\#&]+
	75	'''
	76
	77	_TESTS = [{
	78	'url': 'https://platzi.com/clases/1311-next-js/12074-creando-nuestra-primera-pagina/',
	79	'md5': '8f56448241005b561c10f11a595b37e3',
	80	'info_dict': {
	81	'id': '12074',
	82	'ext': 'mp4',
	83	'title': 'Creando nuestra primera página',
	84	'description': 'md5:4c866e45034fc76412fbf6e60ae008bc',
	85	'duration': 420,
	86	},
	87	'skip': 'Requires platzi account credentials',
	88	}, {
	89	'url': 'https://courses.platzi.com/classes/1367-communication-codestream/13430-background/',
	90	'info_dict': {
	91	'id': '13430',
	92	'ext': 'mp4',
	93	'title': 'Background',
	94	'description': 'md5:49c83c09404b15e6e71defaf87f6b305',
	95	'duration': 360,
	96	},
	97	'skip': 'Requires platzi account credentials',
	98	'params': {
	99	'skip_download': True,
	100	},
	101	}]
	102
c701472f S	103	def _real_extract(self, url):
	104	lecture_id = self._match_id(url)
	105
	106	webpage = self._download_webpage(url, lecture_id)
	107
	108	data = self._parse_json(
	109	self._search_regex(
31dbd054 S	110	# client_data may contain "};" so that we have to try more
	111	# strict regex first
	112	(r'client_data\s*=\s*({.+?})\s*;\s*\n',
	113	r'client_data\s*=\s*({.+?})\s*;'),
	114	webpage, 'client data'),
c701472f S	115	lecture_id)
	116
	117	material = data['initialState']['material']
	118	desc = material['description']
	119	title = desc['title']
	120
	121	formats = []
	122	for server_id, server in material['videos'].items():
	123	if not isinstance(server, dict):
	124	continue
	125	for format_id in ('hls', 'dash'):
	126	format_url = url_or_none(server.get(format_id))
	127	if not format_url:
	128	continue
	129	if format_id == 'hls':
	130	formats.extend(self._extract_m3u8_formats(
	131	format_url, lecture_id, 'mp4',
	132	entry_protocol='m3u8_native', m3u8_id=format_id,
	133	note='Downloading %s m3u8 information' % server_id,
	134	fatal=False))
	135	elif format_id == 'dash':
	136	formats.extend(self._extract_mpd_formats(
	137	format_url, lecture_id, mpd_id=format_id,
	138	note='Downloading %s MPD manifest' % server_id,
	139	fatal=False))
	140	self._sort_formats(formats)
	141
	142	content = str_or_none(desc.get('content'))
	143	description = (clean_html(compat_b64decode(content).decode('utf-8'))
	144	if content else None)
	145	duration = int_or_none(material.get('duration'), invscale=60)
	146
	147	return {
	148	'id': lecture_id,
	149	'title': title,
	150	'description': description,
	151	'duration': duration,
	152	'formats': formats,
	153	}
	154
	155
66d04c74	156	class PlatziCourseIE(PlatziBaseIE):
c701472f S	157	_VALID_URL = r'''(?x)
	158	https?://
	159	(?:
	160	platzi\.com/clases| # es version
	161	courses\.platzi\.com/classes # en version
	162	)/(?P<id>[^/?\#&]+)
	163	'''
	164	_TESTS = [{
	165	'url': 'https://platzi.com/clases/next-js/',
	166	'info_dict': {
	167	'id': '1311',
	168	'title': 'Curso de Next.js',
	169	},
	170	'playlist_count': 22,
	171	}, {
	172	'url': 'https://courses.platzi.com/classes/communication-codestream/',
	173	'info_dict': {
	174	'id': '1367',
	175	'title': 'Codestream Course',
	176	},
	177	'playlist_count': 14,
	178	}]
	179
	180	@classmethod
	181	def suitable(cls, url):
	182	return False if PlatziIE.suitable(url) else super(PlatziCourseIE, cls).suitable(url)
	183
	184	def _real_extract(self, url):
	185	course_name = self._match_id(url)
	186
	187	webpage = self._download_webpage(url, course_name)
	188
	189	props = self._parse_json(
	190	self._search_regex(r'data\s*=\s*({.+?})\s*;', webpage, 'data'),
	191	course_name)['initialProps']
	192
	193	entries = []
	194	for chapter_num, chapter in enumerate(props['concepts'], 1):
	195	if not isinstance(chapter, dict):
	196	continue
	197	materials = chapter.get('materials')
	198	if not materials or not isinstance(materials, list):
	199	continue
	200	chapter_title = chapter.get('title')
	201	chapter_id = str_or_none(chapter.get('id'))
	202	for material in materials:
	203	if not isinstance(material, dict):
	204	continue
	205	if material.get('material_type') != 'video':
	206	continue
	207	video_url = urljoin(url, material.get('url'))
	208	if not video_url:
	209	continue
	210	entries.append({
	211	'_type': 'url_transparent',
	212	'url': video_url,
	213	'title': str_or_none(material.get('name')),
	214	'id': str_or_none(material.get('id')),
	215	'ie_key': PlatziIE.ie_key(),
	216	'chapter': chapter_title,
	217	'chapter_number': chapter_num,
	218	'chapter_id': chapter_id,
	219	})
	220
221	course_id = compat_str(try_get(props, lambda x: x['course']['id']))
222	course_title = try_get(props, lambda x: x['course']['name'], compat_str)
223
224	return self.playlist_result(entries, course_id, course_title)