jfr.im git - yt-dlp.git/blame_incremental - yt

... / ...

Commit	Line	Data
	1	from __future__ import unicode_literals
	2
	3	import re
	4
	5	from .common import InfoExtractor
	6	from ..compat import (
	7	compat_str,
	8	compat_urlparse,
	9	)
	10	from ..utils import (
	11	ExtractorError,
	12	int_or_none,
	13	urlencode_postdata,
	14	)
	15
	16
	17	class LyndaBaseIE(InfoExtractor):
	18	_SIGNIN_URL = 'https://www.lynda.com/signin/lynda'
	19	_PASSWORD_URL = 'https://www.lynda.com/signin/password'
	20	_USER_URL = 'https://www.lynda.com/signin/user'
	21	_ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'
	22	_NETRC_MACHINE = 'lynda'
	23
	24	def _real_initialize(self):
	25	self._login()
	26
	27	@staticmethod
	28	def _check_error(json_string, key_or_keys):
	29	keys = [key_or_keys] if isinstance(key_or_keys, compat_str) else key_or_keys
	30	for key in keys:
	31	error = json_string.get(key)
	32	if error:
	33	raise ExtractorError('Unable to login: %s' % error, expected=True)
	34
	35	def _login_step(self, form_html, fallback_action_url, extra_form_data, note, referrer_url):
	36	action_url = self._search_regex(
	37	r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_html,
	38	'post url', default=fallback_action_url, group='url')
	39
	40	if not action_url.startswith('http'):
	41	action_url = compat_urlparse.urljoin(self._SIGNIN_URL, action_url)
	42
	43	form_data = self._hidden_inputs(form_html)
	44	form_data.update(extra_form_data)
	45
	46	response = self._download_json(
	47	action_url, None, note,
	48	data=urlencode_postdata(form_data),
	49	headers={
	50	'Referer': referrer_url,
	51	'X-Requested-With': 'XMLHttpRequest',
	52	}, expected_status=(418, 500, ))
	53
	54	self._check_error(response, ('email', 'password', 'ErrorMessage'))
	55
	56	return response, action_url
	57
	58	def _login(self):
	59	username, password = self._get_login_info()
	60	if username is None:
	61	return
	62
	63	# Step 1: download signin page
	64	signin_page = self._download_webpage(
	65	self._SIGNIN_URL, None, 'Downloading signin page')
	66
	67	# Already logged in
	68	if any(re.search(p, signin_page) for p in (
	69	r'isLoggedIn\s:\strue', r'logout\.aspx', r'>Log out<')):
	70	return
	71
	72	# Step 2: submit email
	73	signin_form = self._search_regex(
	74	r'(?s)(<form[^>]+data-form-name=["\']signin["\'][^>]*>.+?</form>)',
	75	signin_page, 'signin form')
	76	signin_page, signin_url = self._login_step(
	77	signin_form, self._PASSWORD_URL, {'email': username},
	78	'Submitting email', self._SIGNIN_URL)
	79
	80	# Step 3: submit password
	81	password_form = signin_page['body']
	82	self._login_step(
	83	password_form, self._USER_URL, {'email': username, 'password': password},
	84	'Submitting password', signin_url)
	85
	86
	87	class LyndaIE(LyndaBaseIE):
	88	IE_NAME = 'lynda'
	89	IE_DESC = 'lynda.com videos'
	90	_VALID_URL = r'''(?x)
	91	https?://
	92	(?:www\.)?(?:lynda\.com\|educourse\.ga)/
	93	(?:
	94	(?:[^/]+/){2,3}(?P<course_id>\d+)\|
	95	player/embed
	96	)/
	97	(?P<id>\d+)
	98	'''
	99
	100	_TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]'
	101
	102	_TESTS = [{
	103	'url': 'https://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',
	104	# md5 is unstable
	105	'info_dict': {
	106	'id': '114408',
	107	'ext': 'mp4',
	108	'title': 'Using the exercise files',
	109	'duration': 68
	110	}
	111	}, {
	112	'url': 'https://www.lynda.com/player/embed/133770?tr=foo=1;bar=g;fizz=rt&fs=0',
	113	'only_matching': True,
	114	}, {
	115	'url': 'https://educourse.ga/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',
	116	'only_matching': True,
	117	}, {
	118	'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Willkommen-Grundlagen-guten-Gestaltung/393570/393572-4.html',
	119	'only_matching': True,
	120	}, {
	121	# Status="NotFound", Message="Transcript not found"
	122	'url': 'https://www.lynda.com/ASP-NET-tutorials/What-you-should-know/5034180/2811512-4.html',
	123	'only_matching': True,
	124	}]
	125
	126	def _raise_unavailable(self, video_id):
	127	self.raise_login_required(
	128	'Video %s is only available for members' % video_id)
	129
	130	def _real_extract(self, url):
	131	mobj = re.match(self._VALID_URL, url)
	132	video_id = mobj.group('id')
	133	course_id = mobj.group('course_id')
	134
	135	query = {
	136	'videoId': video_id,
	137	'type': 'video',
	138	}
	139
	140	video = self._download_json(
	141	'https://www.lynda.com/ajax/player', video_id,
	142	'Downloading video JSON', fatal=False, query=query)
	143
	144	# Fallback scenario
	145	if not video:
	146	query['courseId'] = course_id
	147
	148	play = self._download_json(
	149	'https://www.lynda.com/ajax/course/%s/%s/play'
	150	% (course_id, video_id), video_id, 'Downloading play JSON')
	151
	152	if not play:
	153	self._raise_unavailable(video_id)
	154
	155	formats = []
	156	for formats_dict in play:
	157	urls = formats_dict.get('urls')
	158	if not isinstance(urls, dict):
	159	continue
	160	cdn = formats_dict.get('name')
	161	for format_id, format_url in urls.items():
	162	if not format_url:
	163	continue
	164	formats.append({
	165	'url': format_url,
	166	'format_id': '%s-%s' % (cdn, format_id) if cdn else format_id,
	167	'height': int_or_none(format_id),
	168	})
	169	self._sort_formats(formats)
	170
	171	conviva = self._download_json(
	172	'https://www.lynda.com/ajax/player/conviva', video_id,
	173	'Downloading conviva JSON', query=query)
	174
	175	return {
	176	'id': video_id,
	177	'title': conviva['VideoTitle'],
	178	'description': conviva.get('VideoDescription'),
	179	'release_year': int_or_none(conviva.get('ReleaseYear')),
	180	'duration': int_or_none(conviva.get('Duration')),
	181	'creator': conviva.get('Author'),
	182	'formats': formats,
	183	}
	184
	185	if 'Status' in video:
	186	raise ExtractorError(
	187	'lynda returned error: %s' % video['Message'], expected=True)
	188
	189	if video.get('HasAccess') is False:
	190	self._raise_unavailable(video_id)
	191
	192	video_id = compat_str(video.get('ID') or video_id)
	193	duration = int_or_none(video.get('DurationInSeconds'))
	194	title = video['Title']
	195
	196	formats = []
	197
	198	fmts = video.get('Formats')
	199	if fmts:
	200	formats.extend([{
	201	'url': f['Url'],
	202	'ext': f.get('Extension'),
	203	'width': int_or_none(f.get('Width')),
	204	'height': int_or_none(f.get('Height')),
	205	'filesize': int_or_none(f.get('FileSize')),
	206	'format_id': compat_str(f.get('Resolution')) if f.get('Resolution') else None,
	207	} for f in fmts if f.get('Url')])
	208
	209	prioritized_streams = video.get('PrioritizedStreams')
	210	if prioritized_streams:
	211	for prioritized_stream_id, prioritized_stream in prioritized_streams.items():
	212	formats.extend([{
	213	'url': video_url,
	214	'height': int_or_none(format_id),
	215	'format_id': '%s-%s' % (prioritized_stream_id, format_id),
	216	} for format_id, video_url in prioritized_stream.items()])
	217
	218	self._check_formats(formats, video_id)
	219	self._sort_formats(formats)
	220
	221	subtitles = self.extract_subtitles(video_id)
	222
	223	return {
	224	'id': video_id,
	225	'title': title,
	226	'duration': duration,
	227	'subtitles': subtitles,
	228	'formats': formats
	229	}
	230
	231	def _fix_subtitles(self, subs):
	232	srt = ''
	233	seq_counter = 0
	234	for pos in range(0, len(subs) - 1):
	235	seq_current = subs[pos]
	236	m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode'])
	237	if m_current is None:
	238	continue
	239	seq_next = subs[pos + 1]
	240	m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode'])
	241	if m_next is None:
	242	continue
	243	appear_time = m_current.group('timecode')
	244	disappear_time = m_next.group('timecode')
	245	text = seq_current['Caption'].strip()
	246	if text:
	247	seq_counter += 1
	248	srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (seq_counter, appear_time, disappear_time, text)
	249	if srt:
	250	return srt
	251
	252	def _get_subtitles(self, video_id):
	253	url = 'https://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id
	254	subs = self._download_webpage(
	255	url, video_id, 'Downloading subtitles JSON', fatal=False)
	256	if not subs or 'Status="NotFound"' in subs:
	257	return {}
	258	subs = self._parse_json(subs, video_id, fatal=False)
	259	if not subs:
	260	return {}
	261	fixed_subs = self._fix_subtitles(subs)
	262	if fixed_subs:
	263	return {'en': [{'ext': 'srt', 'data': fixed_subs}]}
	264	return {}
	265
	266
	267	class LyndaCourseIE(LyndaBaseIE):
	268	IE_NAME = 'lynda:course'
	269	IE_DESC = 'lynda.com online courses'
	270
	271	# Course link equals to welcome/introduction video link of same course
	272	# We will recognize it as course link
	273	_VALID_URL = r'https?://(?:www\|m)\.(?:lynda\.com\|educourse\.ga)/(?P<coursepath>(?:[^/]+/){2,3}(?P<courseid>\d+))-2\.html'
	274
	275	_TESTS = [{
	276	'url': 'https://www.lynda.com/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html',
	277	'only_matching': True,
	278	}, {
	279	'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html',
	280	'only_matching': True,
	281	}]
	282
	283	def _real_extract(self, url):
	284	mobj = re.match(self._VALID_URL, url)
	285	course_path = mobj.group('coursepath')
	286	course_id = mobj.group('courseid')
	287
	288	item_template = 'https://www.lynda.com/%s/%%s-4.html' % course_path
	289
	290	course = self._download_json(
	291	'https://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id,
	292	course_id, 'Downloading course JSON', fatal=False)
	293
	294	if not course:
	295	webpage = self._download_webpage(url, course_id)
	296	entries = [
	297	self.url_result(
	298	item_template % video_id, ie=LyndaIE.ie_key(),
	299	video_id=video_id)
	300	for video_id in re.findall(
	301	r'data-video-id=["\'](\d+)', webpage)]
	302	return self.playlist_result(
	303	entries, course_id,
	304	self._og_search_title(webpage, fatal=False),
	305	self._og_search_description(webpage))
	306
	307	if course.get('Status') == 'NotFound':
	308	raise ExtractorError(
	309	'Course %s does not exist' % course_id, expected=True)
	310
	311	unaccessible_videos = 0
	312	entries = []
	313
	314	# Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided
	315	# by single video API anymore
	316
	317	for chapter in course['Chapters']:
	318	for video in chapter.get('Videos', []):
	319	if video.get('HasAccess') is False:
	320	unaccessible_videos += 1
	321	continue
	322	video_id = video.get('ID')
	323	if video_id:
	324	entries.append({
	325	'_type': 'url_transparent',
	326	'url': item_template % video_id,
	327	'ie_key': LyndaIE.ie_key(),
	328	'chapter': chapter.get('Title'),
	329	'chapter_number': int_or_none(chapter.get('ChapterIndex')),
	330	'chapter_id': compat_str(chapter.get('ID')),
	331	})
	332
	333	if unaccessible_videos > 0:
	334	self.report_warning(
	335	'%s videos are only available for members (or paid members) and will not be downloaded. '
	336	% unaccessible_videos + self._ACCOUNT_CREDENTIALS_HINT)
	337
	338	course_title = course.get('Title')
	339	course_description = course.get('Description')
	340
	341	return self.playlist_result(entries, course_id, course_title, course_description)