jfr.im git - yt-dlp.git/blame_incremental - youtube

... / ...

Commit	Line	Data
	1	from __future__ import unicode_literals
	2
	3	import re
	4
	5	from .common import InfoExtractor
	6	from ..compat import (
	7	compat_HTTPError,
	8	compat_str,
	9	compat_urlparse,
	10	)
	11	from ..utils import (
	12	ExtractorError,
	13	js_to_json,
	14	parse_duration,
	15	parse_iso8601,
	16	)
	17
	18
	19	class ViideaIE(InfoExtractor):
	20	_VALID_URL = r'''(?x)https?://(?:www\.)?(?:
	21	videolectures\.net\|
	22	flexilearn\.viidea\.net\|
	23	presentations\.ocwconsortium\.org\|
	24	video\.travel-zoom\.si\|
	25	video\.pomp-forum\.si\|
	26	tv\.nil\.si\|
	27	video\.hekovnik.com\|
	28	video\.szko\.si\|
	29	kpk\.viidea\.com\|
	30	inside\.viidea\.net\|
	31	video\.kiberpipa\.org\|
	32	bvvideo\.si\|
	33	kongres\.viidea\.net\|
	34	edemokracija\.viidea\.com
	35	)(?:/lecture)?/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?/(?:[#?].)?$'''
	36
	37	_TESTS = [{
	38	'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/',
	39	'info_dict': {
	40	'id': '20171',
	41	'display_id': 'promogram_igor_mekjavic_eng',
	42	'ext': 'mp4',
	43	'title': 'Automatics, robotics and biocybernetics',
	44	'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
	45	'thumbnail': r're:http://.*\.jpg',
	46	'timestamp': 1372349289,
	47	'upload_date': '20130627',
	48	'duration': 565,
	49	},
	50	'params': {
	51	# m3u8 download
	52	'skip_download': True,
	53	},
	54	}, {
	55	# video with invalid direct format links (HTTP 403)
	56	'url': 'http://videolectures.net/russir2010_filippova_nlp/',
	57	'info_dict': {
	58	'id': '14891',
	59	'display_id': 'russir2010_filippova_nlp',
	60	'ext': 'flv',
	61	'title': 'NLP at Google',
	62	'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3',
	63	'thumbnail': r're:http://.*\.jpg',
	64	'timestamp': 1284375600,
	65	'upload_date': '20100913',
	66	'duration': 5352,
	67	},
	68	'params': {
	69	# rtmp download
	70	'skip_download': True,
	71	},
	72	}, {
	73	# event playlist
	74	'url': 'http://videolectures.net/deeplearning2015_montreal/',
	75	'info_dict': {
	76	'id': '23181',
	77	'title': 'Deep Learning Summer School, Montreal 2015',
	78	'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7',
	79	'thumbnail': r're:http://.*\.jpg',
	80	'timestamp': 1438560000,
	81	},
	82	'playlist_count': 30,
	83	}, {
	84	# multi part lecture
	85	'url': 'http://videolectures.net/mlss09uk_bishop_ibi/',
	86	'info_dict': {
	87	'id': '9737',
	88	'display_id': 'mlss09uk_bishop_ibi',
	89	'title': 'Introduction To Bayesian Inference',
	90	'thumbnail': r're:http://.*\.jpg',
	91	'timestamp': 1251622800,
	92	},
	93	'playlist': [{
	94	'info_dict': {
	95	'id': '9737_part1',
	96	'display_id': 'mlss09uk_bishop_ibi_part1',
	97	'ext': 'wmv',
	98	'title': 'Introduction To Bayesian Inference (Part 1)',
	99	'thumbnail': r're:http://.*\.jpg',
	100	'duration': 4622,
	101	'timestamp': 1251622800,
	102	'upload_date': '20090830',
	103	},
	104	}, {
	105	'info_dict': {
	106	'id': '9737_part2',
	107	'display_id': 'mlss09uk_bishop_ibi_part2',
	108	'ext': 'wmv',
	109	'title': 'Introduction To Bayesian Inference (Part 2)',
	110	'thumbnail': r're:http://.*\.jpg',
	111	'duration': 5641,
	112	'timestamp': 1251622800,
	113	'upload_date': '20090830',
	114	},
	115	}],
	116	'playlist_count': 2,
	117	}]
	118
	119	def _real_extract(self, url):
	120	lecture_slug, explicit_part_id = re.match(self._VALID_URL, url).groups()
	121
	122	webpage = self._download_webpage(url, lecture_slug)
	123
	124	cfg = self._parse_json(self._search_regex(
	125	[r'cfg\s:\s({.+?})\s,\s[\da-zA-Z_]+\s:\s\(?\s*function',
	126	r'cfg\s:\s({[^}]+})'],
	127	webpage, 'cfg'), lecture_slug, js_to_json)
	128
	129	lecture_id = compat_str(cfg['obj_id'])
	130
	131	base_url = self._proto_relative_url(cfg['livepipe'], 'http:')
	132
	133	try:
	134	lecture_data = self._download_json(
	135	'%s/site/api/lecture/%s?format=json' % (base_url, lecture_id),
	136	lecture_id)['lecture'][0]
	137	except ExtractorError as e:
	138	if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
	139	msg = self._parse_json(
	140	e.cause.read().decode('utf-8'), lecture_id)
	141	raise ExtractorError(msg['detail'], expected=True)
	142	raise
	143
	144	lecture_info = {
	145	'id': lecture_id,
	146	'display_id': lecture_slug,
	147	'title': lecture_data['title'],
	148	'timestamp': parse_iso8601(lecture_data.get('time')),
	149	'description': lecture_data.get('description_wiki'),
	150	'thumbnail': lecture_data.get('thumb'),
	151	}
	152
	153	playlist_entries = []
	154	lecture_type = lecture_data.get('type')
	155	parts = [compat_str(video) for video in cfg.get('videos', [])]
	156	if parts:
	157	multipart = len(parts) > 1
	158
	159	def extract_part(part_id):
	160	smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part_id)
	161	smil = self._download_smil(smil_url, lecture_id)
	162	info = self._parse_smil(smil, smil_url, lecture_id)
	163	self._sort_formats(info['formats'])
	164	info['id'] = lecture_id if not multipart else '%s_part%s' % (lecture_id, part_id)
	165	info['display_id'] = lecture_slug if not multipart else '%s_part%s' % (lecture_slug, part_id)
	166	if multipart:
	167	info['title'] += ' (Part %s)' % part_id
	168	switch = smil.find('.//switch')
	169	if switch is not None:
	170	info['duration'] = parse_duration(switch.attrib.get('dur'))
	171	item_info = lecture_info.copy()
	172	item_info.update(info)
	173	return item_info
	174
	175	if explicit_part_id or not multipart:
	176	result = extract_part(explicit_part_id or parts[0])
	177	else:
	178	result = {
	179	'_type': 'multi_video',
	180	'entries': [extract_part(part) for part in parts],
	181	}
	182	result.update(lecture_info)
	183
	184	# Immediately return explicitly requested part or non event item
	185	if explicit_part_id or lecture_type != 'evt':
	186	return result
	187
	188	playlist_entries.append(result)
	189
	190	# It's probably a playlist
	191	if not parts or lecture_type == 'evt':
	192	playlist_webpage = self._download_webpage(
	193	'%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id)
	194	entries = [
	195	self.url_result(compat_urlparse.urljoin(url, video_url), 'Viidea')
	196	for _, video_url in re.findall(
	197	r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)]
	198	playlist_entries.extend(entries)
	199
	200	playlist = self.playlist_result(playlist_entries, lecture_id)
	201	playlist.update(lecture_info)
	202	return playlist