jfr.im git - yt-dlp.git/blame_incremental - yt

... / ...

Commit	Line	Data
	1	import re
	2
	3	from .common import InfoExtractor
	4	from ..compat import (
	5	compat_str,
	6	compat_urlparse,
	7	)
	8	from ..utils import (
	9	ExtractorError,
	10	int_or_none,
	11	urlencode_postdata,
	12	)
	13
	14
	15	class LyndaBaseIE(InfoExtractor):
	16	_SIGNIN_URL = 'https://www.lynda.com/signin/lynda'
	17	_PASSWORD_URL = 'https://www.lynda.com/signin/password'
	18	_USER_URL = 'https://www.lynda.com/signin/user'
	19	_ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'
	20	_NETRC_MACHINE = 'lynda'
	21
	22	@staticmethod
	23	def _check_error(json_string, key_or_keys):
	24	keys = [key_or_keys] if isinstance(key_or_keys, compat_str) else key_or_keys
	25	for key in keys:
	26	error = json_string.get(key)
	27	if error:
	28	raise ExtractorError('Unable to login: %s' % error, expected=True)
	29
	30	def _perform_login_step(self, form_html, fallback_action_url, extra_form_data, note, referrer_url):
	31	action_url = self._search_regex(
	32	r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_html,
	33	'post url', default=fallback_action_url, group='url')
	34
	35	if not action_url.startswith('http'):
	36	action_url = compat_urlparse.urljoin(self._SIGNIN_URL, action_url)
	37
	38	form_data = self._hidden_inputs(form_html)
	39	form_data.update(extra_form_data)
	40
	41	response = self._download_json(
	42	action_url, None, note,
	43	data=urlencode_postdata(form_data),
	44	headers={
	45	'Referer': referrer_url,
	46	'X-Requested-With': 'XMLHttpRequest',
	47	}, expected_status=(418, 500, ))
	48
	49	self._check_error(response, ('email', 'password', 'ErrorMessage'))
	50
	51	return response, action_url
	52
	53	def _perform_login(self, username, password):
	54	# Step 1: download signin page
	55	signin_page = self._download_webpage(
	56	self._SIGNIN_URL, None, 'Downloading signin page')
	57
	58	# Already logged in
	59	if any(re.search(p, signin_page) for p in (
	60	r'isLoggedIn\s:\strue', r'logout\.aspx', r'>Log out<')):
	61	return
	62
	63	# Step 2: submit email
	64	signin_form = self._search_regex(
	65	r'(?s)(<form[^>]+data-form-name=["\']signin["\'][^>]*>.+?</form>)',
	66	signin_page, 'signin form')
	67	signin_page, signin_url = self._login_step(
	68	signin_form, self._PASSWORD_URL, {'email': username},
	69	'Submitting email', self._SIGNIN_URL)
	70
	71	# Step 3: submit password
	72	password_form = signin_page['body']
	73	self._login_step(
	74	password_form, self._USER_URL, {'email': username, 'password': password},
	75	'Submitting password', signin_url)
	76
	77
	78	class LyndaIE(LyndaBaseIE):
	79	IE_NAME = 'lynda'
	80	IE_DESC = 'lynda.com videos'
	81	_VALID_URL = r'''(?x)
	82	https?://
	83	(?:www\.)?(?:lynda\.com\|educourse\.ga)/
	84	(?:
	85	(?:[^/]+/){2,3}(?P<course_id>\d+)\|
	86	player/embed
	87	)/
	88	(?P<id>\d+)
	89	'''
	90
	91	_TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]'
	92
	93	_TESTS = [{
	94	'url': 'https://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',
	95	# md5 is unstable
	96	'info_dict': {
	97	'id': '114408',
	98	'ext': 'mp4',
	99	'title': 'Using the exercise files',
	100	'duration': 68
	101	}
	102	}, {
	103	'url': 'https://www.lynda.com/player/embed/133770?tr=foo=1;bar=g;fizz=rt&fs=0',
	104	'only_matching': True,
	105	}, {
	106	'url': 'https://educourse.ga/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',
	107	'only_matching': True,
	108	}, {
	109	'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Willkommen-Grundlagen-guten-Gestaltung/393570/393572-4.html',
	110	'only_matching': True,
	111	}, {
	112	# Status="NotFound", Message="Transcript not found"
	113	'url': 'https://www.lynda.com/ASP-NET-tutorials/What-you-should-know/5034180/2811512-4.html',
	114	'only_matching': True,
	115	}]
	116
	117	def _raise_unavailable(self, video_id):
	118	self.raise_login_required(
	119	'Video %s is only available for members' % video_id)
	120
	121	def _real_extract(self, url):
	122	mobj = self._match_valid_url(url)
	123	video_id = mobj.group('id')
	124	course_id = mobj.group('course_id')
	125
	126	query = {
	127	'videoId': video_id,
	128	'type': 'video',
	129	}
	130
	131	video = self._download_json(
	132	'https://www.lynda.com/ajax/player', video_id,
	133	'Downloading video JSON', fatal=False, query=query)
	134
	135	# Fallback scenario
	136	if not video:
	137	query['courseId'] = course_id
	138
	139	play = self._download_json(
	140	'https://www.lynda.com/ajax/course/%s/%s/play'
	141	% (course_id, video_id), video_id, 'Downloading play JSON')
	142
	143	if not play:
	144	self._raise_unavailable(video_id)
	145
	146	formats = []
	147	for formats_dict in play:
	148	urls = formats_dict.get('urls')
	149	if not isinstance(urls, dict):
	150	continue
	151	cdn = formats_dict.get('name')
	152	for format_id, format_url in urls.items():
	153	if not format_url:
	154	continue
	155	formats.append({
	156	'url': format_url,
	157	'format_id': '%s-%s' % (cdn, format_id) if cdn else format_id,
	158	'height': int_or_none(format_id),
	159	})
	160
	161	conviva = self._download_json(
	162	'https://www.lynda.com/ajax/player/conviva', video_id,
	163	'Downloading conviva JSON', query=query)
	164
	165	return {
	166	'id': video_id,
	167	'title': conviva['VideoTitle'],
	168	'description': conviva.get('VideoDescription'),
	169	'release_year': int_or_none(conviva.get('ReleaseYear')),
	170	'duration': int_or_none(conviva.get('Duration')),
	171	'creator': conviva.get('Author'),
	172	'formats': formats,
	173	}
	174
	175	if 'Status' in video:
	176	raise ExtractorError(
	177	'lynda returned error: %s' % video['Message'], expected=True)
	178
	179	if video.get('HasAccess') is False:
	180	self._raise_unavailable(video_id)
	181
	182	video_id = compat_str(video.get('ID') or video_id)
	183	duration = int_or_none(video.get('DurationInSeconds'))
	184	title = video['Title']
	185
	186	formats = []
	187
	188	fmts = video.get('Formats')
	189	if fmts:
	190	formats.extend([{
	191	'url': f['Url'],
	192	'ext': f.get('Extension'),
	193	'width': int_or_none(f.get('Width')),
	194	'height': int_or_none(f.get('Height')),
	195	'filesize': int_or_none(f.get('FileSize')),
	196	'format_id': compat_str(f.get('Resolution')) if f.get('Resolution') else None,
	197	} for f in fmts if f.get('Url')])
	198
	199	prioritized_streams = video.get('PrioritizedStreams')
	200	if prioritized_streams:
	201	for prioritized_stream_id, prioritized_stream in prioritized_streams.items():
	202	formats.extend([{
	203	'url': video_url,
	204	'height': int_or_none(format_id),
	205	'format_id': '%s-%s' % (prioritized_stream_id, format_id),
	206	} for format_id, video_url in prioritized_stream.items()])
	207
	208	self._check_formats(formats, video_id)
	209
	210	subtitles = self.extract_subtitles(video_id)
	211
	212	return {
	213	'id': video_id,
	214	'title': title,
	215	'duration': duration,
	216	'subtitles': subtitles,
	217	'formats': formats
	218	}
	219
	220	def _fix_subtitles(self, subs):
	221	srt = ''
	222	seq_counter = 0
	223	for pos in range(0, len(subs) - 1):
	224	seq_current = subs[pos]
	225	m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode'])
	226	if m_current is None:
	227	continue
	228	seq_next = subs[pos + 1]
	229	m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode'])
	230	if m_next is None:
	231	continue
	232	appear_time = m_current.group('timecode')
	233	disappear_time = m_next.group('timecode')
	234	text = seq_current['Caption'].strip()
	235	if text:
	236	seq_counter += 1
	237	srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (seq_counter, appear_time, disappear_time, text)
	238	if srt:
	239	return srt
	240
	241	def _get_subtitles(self, video_id):
	242	url = 'https://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id
	243	subs = self._download_webpage(
	244	url, video_id, 'Downloading subtitles JSON', fatal=False)
	245	if not subs or 'Status="NotFound"' in subs:
	246	return {}
	247	subs = self._parse_json(subs, video_id, fatal=False)
	248	if not subs:
	249	return {}
	250	fixed_subs = self._fix_subtitles(subs)
	251	if fixed_subs:
	252	return {'en': [{'ext': 'srt', 'data': fixed_subs}]}
	253	return {}
	254
	255
	256	class LyndaCourseIE(LyndaBaseIE):
	257	IE_NAME = 'lynda:course'
	258	IE_DESC = 'lynda.com online courses'
	259
	260	# Course link equals to welcome/introduction video link of same course
	261	# We will recognize it as course link
	262	_VALID_URL = r'https?://(?:www\|m)\.(?:lynda\.com\|educourse\.ga)/(?P<coursepath>(?:[^/]+/){2,3}(?P<courseid>\d+))-2\.html'
	263
	264	_TESTS = [{
	265	'url': 'https://www.lynda.com/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html',
	266	'only_matching': True,
	267	}, {
	268	'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html',
	269	'only_matching': True,
	270	}]
	271
	272	def _real_extract(self, url):
	273	mobj = self._match_valid_url(url)
	274	course_path = mobj.group('coursepath')
	275	course_id = mobj.group('courseid')
	276
	277	item_template = 'https://www.lynda.com/%s/%%s-4.html' % course_path
	278
	279	course = self._download_json(
	280	'https://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id,
	281	course_id, 'Downloading course JSON', fatal=False)
	282
	283	if not course:
	284	webpage = self._download_webpage(url, course_id)
	285	entries = [
	286	self.url_result(
	287	item_template % video_id, ie=LyndaIE.ie_key(),
	288	video_id=video_id)
	289	for video_id in re.findall(
	290	r'data-video-id=["\'](\d+)', webpage)]
	291	return self.playlist_result(
	292	entries, course_id,
	293	self._og_search_title(webpage, fatal=False),
	294	self._og_search_description(webpage))
	295
	296	if course.get('Status') == 'NotFound':
	297	raise ExtractorError(
	298	'Course %s does not exist' % course_id, expected=True)
	299
	300	unaccessible_videos = 0
	301	entries = []
	302
	303	# Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided
	304	# by single video API anymore
	305
	306	for chapter in course['Chapters']:
	307	for video in chapter.get('Videos', []):
	308	if video.get('HasAccess') is False:
	309	unaccessible_videos += 1
	310	continue
	311	video_id = video.get('ID')
	312	if video_id:
	313	entries.append({
	314	'_type': 'url_transparent',
	315	'url': item_template % video_id,
	316	'ie_key': LyndaIE.ie_key(),
	317	'chapter': chapter.get('Title'),
	318	'chapter_number': int_or_none(chapter.get('ChapterIndex')),
	319	'chapter_id': compat_str(chapter.get('ID')),
	320	})
	321
	322	if unaccessible_videos > 0:
	323	self.report_warning(
	324	'%s videos are only available for members (or paid members) and will not be downloaded. '
	325	% unaccessible_videos + self._ACCOUNT_CREDENTIALS_HINT)
	326
	327	course_title = course.get('Title')
	328	course_description = course.get('Description')
	329
	330	return self.playlist_result(entries, course_id, course_title, course_description)