[yt-dlp.git] / yt_dlp / extractor / scte.py

import re

from .common import InfoExtractor
from ..utils import (
    decode_packed_codes,
    ExtractorError,
    urlencode_postdata,
)


class SCTEBaseIE(InfoExtractor):
    """Common base for the SCTE extractors; implements the site sign-in."""

    _LOGIN_URL = 'https://www.scte.org/SCTE/Sign_In.aspx'
    _NETRC_MACHINE = 'scte'

    def _perform_login(self, username, password):
        """Sign in at ``_LOGIN_URL`` with the given credentials.

        Returns without posting anything when the sign-in page already shows
        a logged-in session.

        Raises:
            ExtractorError: when the response to the sign-in POST neither
                contains ``|pageRedirect|`` nor looks logged in. If the page
                carries an ``AsiError`` element, its text is included in the
                message and the error is flagged as expected.
        """
        def has_session(page):
            # A "welcome" class or a "Sign Out" link marks a logged-in page.
            for marker in (r'class=["\']welcome\b', r'>Sign Out<'):
                if re.search(marker, page):
                    return True
            return False

        sign_in_page = self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login popup')

        # Nothing to do if the session is already authenticated.
        if has_session(sign_in_page):
            return

        # Start from the page's hidden inputs, then add the credential fields.
        post_fields = {
            **self._hidden_inputs(sign_in_page),
            'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInUserName': username,
            'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInPassword': password,
            'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$RememberMe': 'on',
        }

        reply = self._download_webpage(
            self._LOGIN_URL, None, 'Logging in',
            data=urlencode_postdata(post_fields))

        # Either a redirect marker or a logged-in page counts as success.
        if '|pageRedirect|' in reply or has_session(reply):
            return

        reason = self._html_search_regex(
            r'(?s)<[^>]+class=["\']AsiError["\'][^>]*>(.+?)</',
            reply, 'error message', default=None)
        if not reason:
            raise ExtractorError('Unable to log in')
        raise ExtractorError('Unable to login: %s' % reason, expected=True)


class SCTEIE(SCTEBaseIE):
    """Extractor for a single SCORM module page on learning.scte.org.

    Returns a playlist with one entry per ``.mp4`` asset found in the
    module's ``mobile/data.js`` file.
    """

    # The "?" after "php" is escaped so it matches the literal query-string
    # delimiter instead of making the preceding "p" optional.
    _VALID_URL = r'https?://learning\.scte\.org/mod/scorm/view\.php\?.*?\bid=(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://learning.scte.org/mod/scorm/view.php?id=31484',
        'info_dict': {
            'title': 'Introduction to DOCSIS Engineering Professional',
            'id': '31484',
        },
        'playlist_count': 5,
        'skip': 'Requires account credentials',
    }]

    def _real_extract(self, url):
        """Build the playlist of video assets for the module at ``url``.

        The page title is required (extraction fails without an ``<h1>``),
        as is the numeric ``context-N`` id used to locate the module's
        content directory.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        title = self._search_regex(r'<h1>(.+?)</h1>', webpage, 'title')

        # The third argument is the human-readable field name used in error
        # messages, so it must describe the field rather than carry the id.
        context_id = self._search_regex(
            r'context-(\d+)', webpage, 'context id')
        content_base = 'https://learning.scte.org/pluginfile.php/%s/mod_scorm/content/8/' % context_id
        # data.js is run through decode_packed_codes before being searched.
        context = decode_packed_codes(self._download_webpage(
            '%smobile/data.js' % content_base, video_id))

        # The XML document is embedded as the first string argument of
        # CreateData(...); escaped single quotes are restored before parsing.
        data = self._parse_xml(
            self._search_regex(
                r'CreateData\(\s*"(.+?)"', context, 'data').replace(r"\'", "'"),
            video_id)

        entries = []
        for asset in data.findall('.//asset'):
            asset_url = asset.get('url')
            # Only .mp4 assets are turned into entries.
            if not asset_url or not asset_url.endswith('.mp4'):
                continue
            asset_id = self._search_regex(
                r'video_([^_]+)_', asset_url, 'asset id', default=None)
            # Skip assets whose file name does not carry a video id.
            if not asset_id:
                continue
            entries.append({
                'id': asset_id,
                'title': title,
                'url': content_base + asset_url,
            })

        return self.playlist_result(entries, video_id, title)


class SCTECourseIE(SCTEBaseIE):
    """Extractor for course and subcourse pages on learning.scte.org.

    Returns a playlist of URL references to every linked SCORM module and
    subcourse found on the page.
    """

    # The "?" after "php" is escaped so it matches the literal query-string
    # delimiter instead of making the preceding "p" optional.
    _VALID_URL = r'https?://learning\.scte\.org/(?:mod/sub)?course/view\.php\?.*?\bid=(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://learning.scte.org/mod/subcourse/view.php?id=31491',
        'only_matching': True,
    }, {
        'url': 'https://learning.scte.org/course/view.php?id=3639',
        'only_matching': True,
    }, {
        'url': 'https://learning.scte.org/course/view.php?id=3073',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Collect the module and subcourse links on the page at ``url``.

        The title is optional here: a page without an ``<h1>`` yields a
        playlist whose title is ``None``.
        """
        course_id = self._match_id(url)

        webpage = self._download_webpage(url, course_id)

        title = self._search_regex(
            r'<h1>(.+?)</h1>', webpage, 'title', default=None)

        entries = []
        # Match anchors whose href points at a scorm or subcourse view page.
        # The negative lookahead on the opening quote keeps the match inside
        # the attribute value.
        for mobj in re.finditer(
                r'''(?x)
                    <a[^>]+
                        href=(["\'])
                        (?P<url>
                            https?://learning\.scte\.org/mod/
                            (?P<kind>scorm|subcourse)/view\.php\?(?:(?!\1).)*?
                            \bid=\d+
                        )
                    ''',
                webpage):
            item_url = mobj.group('url')
            # Do not re-queue a link that points back at this very page.
            if item_url == url:
                continue
            # scorm links go to the module extractor; subcourse links are
            # handled by this extractor again.
            ie = (SCTEIE.ie_key() if mobj.group('kind') == 'scorm'
                  else SCTECourseIE.ie_key())
            entries.append(self.url_result(item_url, ie=ie))

        return self.playlist_result(entries, course_id, title)
Commit	Line	Data
20218040 S	1	import re
	2
	3	from .common import InfoExtractor
	4	from ..utils import (
	5	decode_packed_codes,
	6	ExtractorError,
	7	urlencode_postdata,
	8	)
	9
	10
	11	class SCTEBaseIE(InfoExtractor):
	12	_LOGIN_URL = 'https://www.scte.org/SCTE/Sign_In.aspx'
	13	_NETRC_MACHINE = 'scte'
	14
52efa4b3	15	def _perform_login(self, username, password):
20218040 S	16	login_popup = self._download_webpage(
	17	self._LOGIN_URL, None, 'Downloading login popup')
	18
	19	def is_logged(webpage):
	20	return any(re.search(p, webpage) for p in (
	21	r'class=["\']welcome\b', r'>Sign Out<'))
	22
	23	# already logged in
	24	if is_logged(login_popup):
	25	return
	26
	27	login_form = self._hidden_inputs(login_popup)
	28
	29	login_form.update({
	30	'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInUserName': username,
	31	'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInPassword': password,
	32	'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$RememberMe': 'on',
	33	})
	34
	35	response = self._download_webpage(
	36	self._LOGIN_URL, None, 'Logging in',
	37	data=urlencode_postdata(login_form))
	38
	39	if '|pageRedirect|' not in response and not is_logged(response):
	40	error = self._html_search_regex(
	41	r'(?s)<[^>]+class=["\']AsiError["\'][^>]*>(.+?)</',
	42	response, 'error message', default=None)
	43	if error:
	44	raise ExtractorError('Unable to login: %s' % error, expected=True)
	45	raise ExtractorError('Unable to log in')
	46
	47
	48	class SCTEIE(SCTEBaseIE):
	49	_VALID_URL = r'https?://learning\.scte\.org/mod/scorm/view\.php?.*?\bid=(?P<id>\d+)'
	50	_TESTS = [{
	51	'url': 'https://learning.scte.org/mod/scorm/view.php?id=31484',
	52	'info_dict': {
	53	'title': 'Introduction to DOCSIS Engineering Professional',
	54	'id': '31484',
	55	},
	56	'playlist_count': 5,
	57	'skip': 'Requires account credentials',
	58	}]
	59
	60	def _real_extract(self, url):
	61	video_id = self._match_id(url)
	62
	63	webpage = self._download_webpage(url, video_id)
	64
	65	title = self._search_regex(r'<h1>(.+?)</h1>', webpage, 'title')
	66
	67	context_id = self._search_regex(r'context-(\d+)', webpage, video_id)
	68	content_base = 'https://learning.scte.org/pluginfile.php/%s/mod_scorm/content/8/' % context_id
	69	context = decode_packed_codes(self._download_webpage(
	70	'%smobile/data.js' % content_base, video_id))
	71
	72	data = self._parse_xml(
	73	self._search_regex(
	74	r'CreateData\(\s*"(.+?)"', context, 'data').replace(r"\'", "'"),
	75	video_id)
	76
	77	entries = []
	78	for asset in data.findall('.//asset'):
	79	asset_url = asset.get('url')
80	if not asset_url or not asset_url.endswith('.mp4'):
81	continue
82	asset_id = self._search_regex(
83	r'video_([^_]+)_', asset_url, 'asset id', default=None)
84	if not asset_id:
85	continue
86	entries.append({
87	'id': asset_id,
88	'title': title,
89	'url': content_base + asset_url,
90	})
91
92	return self.playlist_result(entries, video_id, title)
93
94
95	class SCTECourseIE(SCTEBaseIE):
96	_VALID_URL = r'https?://learning\.scte\.org/(?:mod/sub)?course/view\.php?.*?\bid=(?P<id>\d+)'
97	_TESTS = [{
98	'url': 'https://learning.scte.org/mod/subcourse/view.php?id=31491',
99	'only_matching': True,
100	}, {
101	'url': 'https://learning.scte.org/course/view.php?id=3639',
102	'only_matching': True,
103	}, {
104	'url': 'https://learning.scte.org/course/view.php?id=3073',
105	'only_matching': True,
106	}]
107
108	def _real_extract(self, url):
109	course_id = self._match_id(url)
110
111	webpage = self._download_webpage(url, course_id)
112
113	title = self._search_regex(
114	r'<h1>(.+?)</h1>', webpage, 'title', default=None)
115
116	entries = []
117	for mobj in re.finditer(
118	r'''(?x)
119	<a[^>]+
120	href=(["\'])
121	(?P<url>
122	https?://learning\.scte\.org/mod/
	123	(?P<kind>scorm|subcourse)/view\.php?(?:(?!\1).)*?
124	\bid=\d+
125	)
126	''',
127	webpage):
128	item_url = mobj.group('url')
129	if item_url == url:
130	continue
131	ie = (SCTEIE.ie_key() if mobj.group('kind') == 'scorm'
132	else SCTECourseIE.ie_key())
133	entries.append(self.url_result(item_url, ie=ie))
134
135	return self.playlist_result(entries, course_id, title)