[yt-dlp.git] / yt_dlp / extractor / scte.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    decode_packed_codes,
    ExtractorError,
    urlencode_postdata,
)


class SCTEBaseIE(InfoExtractor):
    """Common base for the SCTE extractors: signs in during initialization."""

    _LOGIN_URL = 'https://www.scte.org/SCTE/Sign_In.aspx'
    _NETRC_MACHINE = 'scte'

    def _real_initialize(self):
        """Run the sign-in routine before any extraction happens."""
        self._login()

    def _login(self):
        """Submit the sign-in form with the configured credentials.

        Returns without doing anything when no username is available or when
        the sign-in page already looks like a logged-in session.

        Raises:
            ExtractorError: if the response to the form submission contains
                neither the redirect marker nor a logged-in marker. The
                site's own error text is included when it can be found.
        """
        username, password = self._get_login_info()
        if username is None:
            return

        # Either of these appearing in a page is taken as "session is live".
        logged_in_markers = (r'class=["\']welcome\b', r'>Sign Out<')

        def is_logged(webpage):
            for marker in logged_in_markers:
                if re.search(marker, webpage):
                    return True
            return False

        login_popup = self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login popup')

        # already logged in
        if is_logged(login_popup):
            return

        # Start from the page's hidden inputs and add the credential fields
        # on top of them.
        login_form = self._hidden_inputs(login_popup)
        credentials = {
            'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInUserName': username,
            'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInPassword': password,
            'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$RememberMe': 'on',
        }
        login_form.update(credentials)

        response = self._download_webpage(
            self._LOGIN_URL, None, 'Logging in',
            data=urlencode_postdata(login_form))

        # A redirect marker or a logged-in marker both count as success.
        if '|pageRedirect|' in response or is_logged(response):
            return

        error = self._html_search_regex(
            r'(?s)<[^>]+class=["\']AsiError["\'][^>]*>(.+?)</',
            response, 'error message', default=None)
        if error:
            raise ExtractorError('Unable to login: %s' % error, expected=True)
        raise ExtractorError('Unable to log in')


class SCTEIE(SCTEBaseIE):
    """Extractor for a single SCORM module page on learning.scte.org.

    Produces a playlist with one entry per ``.mp4`` asset listed in the
    module's ``mobile/data.js``.
    """

    # The "?" that starts the query string is escaped so that it is matched
    # literally instead of making the final "p" of "php" optional.
    _VALID_URL = r'https?://learning\.scte\.org/mod/scorm/view\.php\?.*?\bid=(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://learning.scte.org/mod/scorm/view.php?id=31484',
        'info_dict': {
            'title': 'Introduction to DOCSIS Engineering Professional',
            'id': '31484',
        },
        'playlist_count': 5,
        'skip': 'Requires account credentials',
    }]

    def _real_extract(self, url):
        """Build a playlist from the video assets of one SCORM module.

        Args:
            url: a module URL matching ``_VALID_URL``.

        Returns:
            The result of ``playlist_result`` for the collected entries, using
            the module id as playlist id and the page's ``<h1>`` as title.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        title = self._search_regex(r'<h1>(.+?)</h1>', webpage, 'title')

        # The third argument is the human-readable field name used in error
        # reporting, so it must be a description rather than the video id.
        context_id = self._search_regex(
            r'context-(\d+)', webpage, 'context id')
        # NOTE(review): the "8" path segment is hard-coded; confirm it holds
        # for every module.
        content_base = 'https://learning.scte.org/pluginfile.php/%s/mod_scorm/content/8/' % context_id
        context = decode_packed_codes(self._download_webpage(
            '%smobile/data.js' % content_base, video_id))

        # The first string argument of CreateData(...) is parsed as XML after
        # turning the \' sequences back into plain single quotes.
        data = self._parse_xml(
            self._search_regex(
                r'CreateData\(\s*"(.+?)"', context, 'data').replace(r"\'", "'"),
            video_id)

        entries = []
        for asset in data.findall('.//asset'):
            asset_url = asset.get('url')
            # Only assets whose URL ends in .mp4 are kept.
            if not asset_url or not asset_url.endswith('.mp4'):
                continue
            asset_id = self._search_regex(
                r'video_([^_]+)_', asset_url, 'asset id', default=None)
            # Assets without a recognisable id in their URL are skipped.
            if not asset_id:
                continue
            entries.append({
                'id': asset_id,
                'title': title,
                'url': content_base + asset_url,
            })

        return self.playlist_result(entries, video_id, title)


class SCTECourseIE(SCTEBaseIE):
    """Extractor for course and subcourse pages on learning.scte.org.

    Produces a playlist of the SCORM module and subcourse links found on the
    page, delegating each link to the matching extractor.
    """

    # The "?" that starts the query string is escaped so that it is matched
    # literally instead of making the final "p" of "php" optional.
    _VALID_URL = r'https?://learning\.scte\.org/(?:mod/sub)?course/view\.php\?.*?\bid=(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://learning.scte.org/mod/subcourse/view.php?id=31491',
        'only_matching': True,
    }, {
        'url': 'https://learning.scte.org/course/view.php?id=3639',
        'only_matching': True,
    }, {
        'url': 'https://learning.scte.org/course/view.php?id=3073',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Collect the module and subcourse links of one course page.

        Args:
            url: a course or subcourse URL matching ``_VALID_URL``.

        Returns:
            The result of ``playlist_result`` for the collected URL entries,
            using the course id as playlist id and the page's ``<h1>`` (or
            ``None`` when absent) as title.
        """
        course_id = self._match_id(url)

        webpage = self._download_webpage(url, course_id)

        title = self._search_regex(
            r'<h1>(.+?)</h1>', webpage, 'title', default=None)

        entries = []
        # Group 1 captures the opening quote of the href; the negative
        # lookahead keeps the match from running past the closing quote.
        for mobj in re.finditer(
                r'''(?x)
                    <a[^>]+
                        href=(["\'])
                        (?P<url>
                            https?://learning\.scte\.org/mod/
                            (?P<kind>scorm|subcourse)/view\.php\?(?:(?!\1).)*?
                            \bid=\d+
                        )
                    ''',
                webpage):
            item_url = mobj.group('url')
            # Skip a link that points back at the page being extracted.
            if item_url == url:
                continue
            # SCORM links go to SCTEIE; subcourse links come back to this
            # extractor.
            ie = (SCTEIE.ie_key() if mobj.group('kind') == 'scorm'
                  else SCTECourseIE.ie_key())
            entries.append(self.url_result(item_url, ie=ie))

        return self.playlist_result(entries, course_id, title)
Commit	Line	Data
20218040 S	1	from __future__ import unicode_literals
	2
	3	import re
	4
	5	from .common import InfoExtractor
	6	from ..utils import (
	7	decode_packed_codes,
	8	ExtractorError,
	9	urlencode_postdata,
	10	)
	11
	12
	13	class SCTEBaseIE(InfoExtractor):
	14	_LOGIN_URL = 'https://www.scte.org/SCTE/Sign_In.aspx'
	15	_NETRC_MACHINE = 'scte'
	16
	17	def _real_initialize(self):
	18	self._login()
	19
	20	def _login(self):
	21	username, password = self._get_login_info()
	22	if username is None:
	23	return
	24
	25	login_popup = self._download_webpage(
	26	self._LOGIN_URL, None, 'Downloading login popup')
	27
	28	def is_logged(webpage):
	29	return any(re.search(p, webpage) for p in (
	30	r'class=["\']welcome\b', r'>Sign Out<'))
	31
	32	# already logged in
	33	if is_logged(login_popup):
	34	return
	35
	36	login_form = self._hidden_inputs(login_popup)
	37
	38	login_form.update({
	39	'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInUserName': username,
	40	'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInPassword': password,
	41	'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$RememberMe': 'on',
	42	})
	43
	44	response = self._download_webpage(
	45	self._LOGIN_URL, None, 'Logging in',
	46	data=urlencode_postdata(login_form))
	47
	48	if '|pageRedirect|' not in response and not is_logged(response):
	49	error = self._html_search_regex(
	50	r'(?s)<[^>]+class=["\']AsiError["\'][^>]*>(.+?)</',
	51	response, 'error message', default=None)
	52	if error:
	53	raise ExtractorError('Unable to login: %s' % error, expected=True)
	54	raise ExtractorError('Unable to log in')
	55
	56
	57	class SCTEIE(SCTEBaseIE):
	58	_VALID_URL = r'https?://learning\.scte\.org/mod/scorm/view\.php?.*?\bid=(?P<id>\d+)'
	59	_TESTS = [{
	60	'url': 'https://learning.scte.org/mod/scorm/view.php?id=31484',
	61	'info_dict': {
	62	'title': 'Introduction to DOCSIS Engineering Professional',
	63	'id': '31484',
	64	},
65	'playlist_count': 5,
66	'skip': 'Requires account credentials',
67	}]
68
69	def _real_extract(self, url):
70	video_id = self._match_id(url)
71
72	webpage = self._download_webpage(url, video_id)
73
74	title = self._search_regex(r'<h1>(.+?)</h1>', webpage, 'title')
75
76	context_id = self._search_regex(r'context-(\d+)', webpage, video_id)
77	content_base = 'https://learning.scte.org/pluginfile.php/%s/mod_scorm/content/8/' % context_id
78	context = decode_packed_codes(self._download_webpage(
79	'%smobile/data.js' % content_base, video_id))
80
81	data = self._parse_xml(
82	self._search_regex(
83	r'CreateData\(\s*"(.+?)"', context, 'data').replace(r"\'", "'"),
84	video_id)
85
86	entries = []
87	for asset in data.findall('.//asset'):
88	asset_url = asset.get('url')
89	if not asset_url or not asset_url.endswith('.mp4'):
90	continue
91	asset_id = self._search_regex(
92	r'video_([^_]+)_', asset_url, 'asset id', default=None)
93	if not asset_id:
94	continue
95	entries.append({
96	'id': asset_id,
97	'title': title,
98	'url': content_base + asset_url,
99	})
100
101	return self.playlist_result(entries, video_id, title)
102
103
104	class SCTECourseIE(SCTEBaseIE):
105	_VALID_URL = r'https?://learning\.scte\.org/(?:mod/sub)?course/view\.php?.*?\bid=(?P<id>\d+)'
106	_TESTS = [{
107	'url': 'https://learning.scte.org/mod/subcourse/view.php?id=31491',
108	'only_matching': True,
109	}, {
110	'url': 'https://learning.scte.org/course/view.php?id=3639',
111	'only_matching': True,
112	}, {
113	'url': 'https://learning.scte.org/course/view.php?id=3073',
114	'only_matching': True,
115	}]
116
117	def _real_extract(self, url):
118	course_id = self._match_id(url)
119
120	webpage = self._download_webpage(url, course_id)
121
122	title = self._search_regex(
123	r'<h1>(.+?)</h1>', webpage, 'title', default=None)
124
125	entries = []
126	for mobj in re.finditer(
127	r'''(?x)
128	<a[^>]+
129	href=(["\'])
130	(?P<url>
131	https?://learning\.scte\.org/mod/
	132	(?P<kind>scorm|subcourse)/view\.php?(?:(?!\1).)*?
133	\bid=\d+
134	)
135	''',
136	webpage):
137	item_url = mobj.group('url')
138	if item_url == url:
139	continue
140	ie = (SCTEIE.ie_key() if mobj.group('kind') == 'scorm'
141	else SCTECourseIE.ie_key())
142	entries.append(self.url_result(item_url, ie=ie))
143
144	return self.playlist_result(entries, course_id, title)