jfr.im git - yt-dlp.git/blame_incremental

... / ...

Commit	Line	Data
	1	import re
	2	import json
	3
	4	from .common import InfoExtractor
	5	from .youtube import YoutubeIE
	6	from ..utils import (
	7	clean_html,
	8	ExtractorError,
	9	get_element_by_id,
	10	)
	11
	12
	13	class TechTVMITIE(InfoExtractor):
	14	IE_NAME = 'techtv.mit.edu'
	15	_VALID_URL = r'https?://techtv\.mit\.edu/(?:videos\|embeds)/(?P<id>\d+)'
	16
	17	_TEST = {
	18	'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
	19	'md5': '00a3a27ee20d44bcaa0933ccec4a2cf7',
	20	'info_dict': {
	21	'id': '25418',
	22	'ext': 'mp4',
	23	'title': 'MIT DNA and Protein Sets',
	24	'description': 'md5:46f5c69ce434f0a97e7c628cc142802d',
	25	},
	26	}
	27
	28	def _real_extract(self, url):
	29	video_id = self._match_id(url)
	30	raw_page = self._download_webpage(
	31	'http://techtv.mit.edu/videos/%s' % video_id, video_id)
	32	clean_page = re.compile(r'<!--.*?-->', re.S).sub('', raw_page)
	33
	34	base_url = self._proto_relative_url(self._search_regex(
	35	r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url'), 'http:')
	36	formats_json = self._search_regex(
	37	r'bitrates: (\[.+?\])', raw_page, 'video formats')
	38	formats_mit = json.loads(formats_json)
	39	formats = [
	40	{
	41	'format_id': f['label'],
	42	'url': base_url + f['url'].partition(':')[2],
	43	'ext': f['url'].partition(':')[0],
	44	'format': f['label'],
	45	'width': f['width'],
	46	'vbr': f['bitrate'],
	47	}
	48	for f in formats_mit
	49	]
	50
	51	title = get_element_by_id('edit-title', clean_page)
	52	description = clean_html(get_element_by_id('edit-description', clean_page))
	53	thumbnail = self._search_regex(
	54	r'playlist:.*?url: \'(.+?)\'',
	55	raw_page, 'thumbnail', flags=re.DOTALL)
	56
	57	return {
	58	'id': video_id,
	59	'title': title,
	60	'formats': formats,
	61	'description': description,
	62	'thumbnail': thumbnail,
	63	}
	64
	65
	66	class OCWMITIE(InfoExtractor):
	67	IE_NAME = 'ocw.mit.edu'
	68	_VALID_URL = r'^https?://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)'
	69	_BASE_URL = 'http://ocw.mit.edu/'
	70
	71	_TESTS = [
	72	{
	73	'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',
	74	'info_dict': {
	75	'id': 'EObHWIEKGjA',
	76	'ext': 'webm',
	77	'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',
	78	'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
	79	'upload_date': '20121109',
	80	'uploader_id': 'MIT',
	81	'uploader': 'MIT OpenCourseWare',
	82	}
	83	},
	84	{
	85	'url': 'http://ocw.mit.edu/courses/mathematics/18-01sc-single-variable-calculus-fall-2010/1.-differentiation/part-a-definition-and-basic-rules/session-1-introduction-to-derivatives/',
	86	'info_dict': {
	87	'id': '7K1sB05pE0A',
	88	'ext': 'mp4',
	89	'title': 'Session 1: Introduction to Derivatives',
	90	'upload_date': '20090818',
	91	'uploader_id': 'MIT',
	92	'uploader': 'MIT OpenCourseWare',
	93	'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.',
	94	}
	95	}
	96	]
	97
	98	def _real_extract(self, url):
	99	mobj = self._match_valid_url(url)
	100	topic = mobj.group('topic')
	101
	102	webpage = self._download_webpage(url, topic)
	103	title = self._html_search_meta('WT.cg_s', webpage)
	104	description = self._html_search_meta('Description', webpage)
	105
	106	# search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, start, stop, captions_file)
	107	embed_chapter_media = re.search(r'ocw_embed_chapter_media\((.+?)\)', webpage)
	108	if embed_chapter_media:
	109	metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1))
	110	metadata = re.split(r', ?', metadata)
	111	yt = metadata[1]
	112	else:
	113	# search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, captions_file)
	114	embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage)
	115	if embed_media:
	116	metadata = re.sub(r'[\'"]', '', embed_media.group(1))
	117	metadata = re.split(r', ?', metadata)
	118	yt = metadata[1]
	119	else:
	120	raise ExtractorError('Unable to find embedded YouTube video.')
	121	video_id = YoutubeIE.extract_id(yt)
	122
	123	return {
	124	'_type': 'url_transparent',
	125	'id': video_id,
	126	'title': title,
	127	'description': description,
	128	'url': yt,
	129	'ie_key': 'Youtube',
	130	}