jfr.im git - yt-dlp.git/blame_incremental - yt

... / ...

Commit	Line	Data
	1	import base64
	2	import json
	3	import re
	4
	5	from .common import InfoExtractor
	6	from ..compat import (
	7	compat_urlparse,
	8	compat_parse_qs,
	9	)
	10	from ..utils import (
	11	clean_html,
	12	ExtractorError,
	13	format_field,
	14	int_or_none,
	15	unsmuggle_url,
	16	smuggle_url,
	17	traverse_obj,
	18	)
	19
	20
	21	class KalturaIE(InfoExtractor):
	22	_VALID_URL = r'''(?x)
	23	(?:
	24	kaltura:(?P<partner_id>\d+):(?P<id>[0-9a-z_]+)\|
	25	https?://
	26	(:?(?:www\|cdnapi(?:sec)?)\.)?kaltura\.com(?::\d+)?/
	27	(?:
	28	(?:
	29	# flash player
	30	index\.php/(?:kwidget\|extwidget/preview)\|
	31	# html5 player
	32	html5/html5lib/[^/]+/mwEmbedFrame\.php
	33	)
	34	)(?:/(?P<path>[^?]+))?(?:\?(?P<query>.*))?
	35	)
	36	'''
	37	_SERVICE_URL = 'http://cdnapi.kaltura.com'
	38	_SERVICE_BASE = '/api_v3/service/multirequest'
	39	# See https://github.com/kaltura/server/blob/master/plugins/content/caption/base/lib/model/enums/CaptionType.php
	40	_CAPTION_TYPES = {
	41	1: 'srt',
	42	2: 'ttml',
	43	3: 'vtt',
	44	}
	45	_TESTS = [
	46	{
	47	'url': 'kaltura:269692:1_1jc2y3e4',
	48	'md5': '3adcbdb3dcc02d647539e53f284ba171',
	49	'info_dict': {
	50	'id': '1_1jc2y3e4',
	51	'ext': 'mp4',
	52	'title': 'Straight from the Heart',
	53	'upload_date': '20131219',
	54	'uploader_id': 'mlundberg@wolfgangsvault.com',
	55	'description': 'The Allman Brothers Band, 12/16/1981',
	56	'thumbnail': 're:^https?://./thumbnail/.',
	57	'timestamp': int,
	58	},
	59	},
	60	{
	61	'url': 'http://www.kaltura.com/index.php/kwidget/cache_st/1300318621/wid/_269692/uiconf_id/3873291/entry_id/1_1jc2y3e4',
	62	'only_matching': True,
	63	},
	64	{
	65	'url': 'https://cdnapisec.kaltura.com/index.php/kwidget/wid/_557781/uiconf_id/22845202/entry_id/1_plr1syf3',
	66	'only_matching': True,
	67	},
	68	{
	69	'url': 'https://cdnapisec.kaltura.com/html5/html5lib/v2.30.2/mwEmbedFrame.php/p/1337/uiconf_id/20540612/entry_id/1_sf5ovm7u?wid=_243342',
	70	'only_matching': True,
	71	},
	72	{
	73	# video with subtitles
	74	'url': 'kaltura:111032:1_cw786r8q',
	75	'only_matching': True,
	76	},
	77	{
	78	# video with ttml subtitles (no fileExt)
	79	'url': 'kaltura:1926081:0_l5ye1133',
	80	'info_dict': {
	81	'id': '0_l5ye1133',
	82	'ext': 'mp4',
	83	'title': 'What Can You Do With Python?',
	84	'upload_date': '20160221',
	85	'uploader_id': 'stork',
	86	'thumbnail': 're:^https?://./thumbnail/.',
	87	'timestamp': int,
	88	'subtitles': {
	89	'en': [{
	90	'ext': 'ttml',
	91	}],
	92	},
	93	},
	94	'skip': 'Gone. Maybe https://www.safaribooksonline.com/library/tutorials/introduction-to-python-anon/3469/',
	95	'params': {
	96	'skip_download': True,
	97	},
	98	},
	99	{
	100	'url': 'https://www.kaltura.com/index.php/extwidget/preview/partner_id/1770401/uiconf_id/37307382/entry_id/0_58u8kme7/embed/iframe?&flashvars[streamerType]=auto',
	101	'only_matching': True,
	102	},
	103	{
	104	'url': 'https://www.kaltura.com:443/index.php/extwidget/preview/partner_id/1770401/uiconf_id/37307382/entry_id/0_58u8kme7/embed/iframe?&flashvars[streamerType]=auto',
	105	'only_matching': True,
	106	},
	107	{
	108	# unavailable source format
	109	'url': 'kaltura:513551:1_66x4rg7o',
	110	'only_matching': True,
	111	}
	112	]
	113
	114	@classmethod
	115	def _extract_embed_urls(cls, url, webpage):
	116	# Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site
	117	finditer = (
	118	list(re.finditer(
	119	r"""(?xs)
	120	kWidget\.(?:thumb)?[Ee]mbed\(
	121	\{.*?
	122	(?P<q1>['"])wid(?P=q1)\s:\s
	123	(?P<q2>['"])_?(?P<partner_id>(?:(?!(?P=q2)).)+)(?P=q2),.*?
	124	(?P<q3>['"])entry_?[Ii]d(?P=q3)\s:\s
	125	(?P<q4>['"])(?P<id>(?:(?!(?P=q4)).)+)(?P=q4)(?:,\|\s*\})
	126	""", webpage))
	127	or list(re.finditer(
	128	r'''(?xs)
	129	(?P<q1>["'])
	130	(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)\b(?:p\|partner_id)/(?P<partner_id>\d+)(?:(?!(?P=q1)).)
	131	(?P=q1).*?
	132	(?:
	133	(?:
	134	entry_?[Ii]d\|
	135	(?P<q2>["'])entry_?[Ii]d(?P=q2)
	136	)\s:\s\|
	137	\[\s(?P<q2_1>["'])entry_?[Ii]d(?P=q2_1)\s\]\s=\s
	138	)
	139	(?P<q3>["'])(?P<id>(?:(?!(?P=q3)).)+)(?P=q3)
	140	''', webpage))
	141	or list(re.finditer(
	142	r'''(?xs)
	143	<(?:iframe[^>]+src\|meta[^>]+\bcontent)=(?P<q1>["'])\s*
	144	(?:https?:)?//(?:(?:www\|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p\|partner_id)/(?P<partner_id>\d+)
	145	(?:(?!(?P=q1)).)*
	146	[?&;]entry_id=(?P<id>(?:(?!(?P=q1))[^&])+)
	147	(?:(?!(?P=q1)).)*
	148	(?P=q1)
	149	''', webpage))
	150	)
	151	urls = []
	152	for mobj in finditer:
	153	embed_info = mobj.groupdict()
	154	for k, v in embed_info.items():
	155	if v:
	156	embed_info[k] = v.strip()
	157	embed_url = 'kaltura:%(partner_id)s:%(id)s' % embed_info
	158	escaped_pid = re.escape(embed_info['partner_id'])
	159	service_mobj = re.search(
	160	r'<script[^>]+src=(["\'])(?P<id>(?:https?:)?//(?:(?!\1).)+)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid),
	161	webpage)
	162	if service_mobj:
	163	embed_url = smuggle_url(embed_url, {'service_url': service_mobj.group('id')})
	164	urls.append(embed_url)
	165	return urls
	166
	167	def _kaltura_api_call(self, video_id, actions, service_url=None, args, *kwargs):
	168	params = actions[0]
	169	params.update({i: a for i, a in enumerate(actions[1:], start=1)})
	170
	171	data = self._download_json(
	172	(service_url or self._SERVICE_URL) + self._SERVICE_BASE,
	173	video_id, data=json.dumps(params).encode('utf-8'),
	174	headers={
	175	'Content-Type': 'application/json',
	176	'Accept-Encoding': 'gzip, deflate, br',
	177	}, args, *kwargs)
	178
	179	for idx, status in enumerate(data):
	180	if not isinstance(status, dict):
	181	continue
	182	if status.get('objectType') == 'KalturaAPIException':
	183	raise ExtractorError(
	184	'%s said: %s (%d)' % (self.IE_NAME, status['message'], idx))
	185
	186	data[1] = traverse_obj(data, (1, 'objects', 0))
	187
	188	return data
	189
	190	def _get_video_info(self, video_id, partner_id, service_url=None):
	191	actions = [
	192	{
	193	'apiVersion': '3.3.0',
	194	'clientTag': 'html5:v3.1.0',
	195	'format': 1, # JSON, 2 = XML, 3 = PHP
	196	'ks': '',
	197	'partnerId': partner_id,
	198	},
	199	{
	200	'expiry': 86400,
	201	'service': 'session',
	202	'action': 'startWidgetSession',
	203	'widgetId': '_%s' % partner_id,
	204	},
	205	{
	206	'action': 'list',
	207	'filter': {'redirectFromEntryId': video_id},
	208	'service': 'baseentry',
	209	'ks': '{1:result:ks}',
	210	'responseProfile': {
	211	'type': 1,
	212	'fields': 'createdAt,dataUrl,duration,name,plays,thumbnailUrl,userId',
	213	},
	214	},
	215	{
	216	'action': 'getbyentryid',
	217	'entryId': video_id,
	218	'service': 'flavorAsset',
	219	'ks': '{1:result:ks}',
	220	},
	221	{
	222	'action': 'list',
	223	'filter:entryIdEqual': video_id,
	224	'service': 'caption_captionasset',
	225	'ks': '{1:result:ks}',
	226	},
	227	]
	228	return self._kaltura_api_call(
	229	video_id, actions, service_url, note='Downloading video info JSON')
	230
	231	def _real_extract(self, url):
	232	url, smuggled_data = unsmuggle_url(url, {})
	233
	234	mobj = self._match_valid_url(url)
	235	partner_id, entry_id = mobj.group('partner_id', 'id')
	236	ks = None
	237	captions = None
	238	if partner_id and entry_id:
	239	_, info, flavor_assets, captions = self._get_video_info(entry_id, partner_id, smuggled_data.get('service_url'))
	240	else:
	241	path, query = mobj.group('path', 'query')
	242	if not path and not query:
	243	raise ExtractorError('Invalid URL', expected=True)
	244	params = {}
	245	if query:
	246	params = compat_parse_qs(query)
	247	if path:
	248	splitted_path = path.split('/')
	249	params.update(dict((zip(splitted_path[::2], [[v] for v in splitted_path[1::2]]))))
	250	if 'wid' in params:
	251	partner_id = params['wid'][0][1:]
	252	elif 'p' in params:
	253	partner_id = params['p'][0]
	254	elif 'partner_id' in params:
	255	partner_id = params['partner_id'][0]
	256	else:
	257	raise ExtractorError('Invalid URL', expected=True)
	258	if 'entry_id' in params:
	259	entry_id = params['entry_id'][0]
	260	_, info, flavor_assets, captions = self._get_video_info(entry_id, partner_id)
	261	elif 'uiconf_id' in params and 'flashvars[referenceId]' in params:
	262	reference_id = params['flashvars[referenceId]'][0]
	263	webpage = self._download_webpage(url, reference_id)
	264	entry_data = self._parse_json(self._search_regex(
	265	r'window\.kalturaIframePackageData\s=\s({.*});',
	266	webpage, 'kalturaIframePackageData'),
	267	reference_id)['entryResult']
	268	info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets']
	269	entry_id = info['id']
	270	# Unfortunately, data returned in kalturaIframePackageData lacks
	271	# captions so we will try requesting the complete data using
	272	# regular approach since we now know the entry_id
	273	try:
	274	_, info, flavor_assets, captions = self._get_video_info(
	275	entry_id, partner_id)
	276	except ExtractorError:
	277	# Regular scenario failed but we already have everything
	278	# extracted apart from captions and can process at least
	279	# with this
	280	pass
	281	else:
	282	raise ExtractorError('Invalid URL', expected=True)
	283	ks = params.get('flashvars[ks]', [None])[0]
	284
	285	source_url = smuggled_data.get('source_url')
	286	if source_url:
	287	referrer = base64.b64encode(
	288	'://'.join(compat_urlparse.urlparse(source_url)[:2])
	289	.encode('utf-8')).decode('utf-8')
	290	else:
	291	referrer = None
	292
	293	def sign_url(unsigned_url):
	294	if ks:
	295	unsigned_url += '/ks/%s' % ks
	296	if referrer:
	297	unsigned_url += '?referrer=%s' % referrer
	298	return unsigned_url
	299
	300	data_url = info['dataUrl']
	301	if '/flvclipper/' in data_url:
	302	data_url = re.sub(r'/flvclipper/.*', '/serveFlavor', data_url)
	303
	304	formats = []
	305	subtitles = {}
	306	for f in flavor_assets:
	307	# Continue if asset is not ready
	308	if f.get('status') != 2:
	309	continue
	310	# Original format that's not available (e.g. kaltura:1926081:0_c03e1b5g)
	311	# skip for now.
	312	if f.get('fileExt') == 'chun':
	313	continue
	314	# DRM-protected video, cannot be decrypted
	315	if not self.get_param('allow_unplayable_formats') and f.get('fileExt') == 'wvm':
	316	continue
	317	if not f.get('fileExt'):
	318	# QT indicates QuickTime; some videos have broken fileExt
	319	if f.get('containerFormat') == 'qt':
	320	f['fileExt'] = 'mov'
	321	else:
	322	f['fileExt'] = 'mp4'
	323	video_url = sign_url(
	324	'%s/flavorId/%s' % (data_url, f['id']))
	325	format_id = '%(fileExt)s-%(bitrate)s' % f
	326	# Source format may not be available (e.g. kaltura:513551:1_66x4rg7o)
	327	if f.get('isOriginal') is True and not self._is_valid_url(
	328	video_url, entry_id, format_id):
	329	continue
	330	# audio-only has no videoCodecId (e.g. kaltura:1926081:0_c03e1b5g
	331	# -f mp4-56)
	332	vcodec = 'none' if 'videoCodecId' not in f and f.get(
	333	'frameRate') == 0 else f.get('videoCodecId')
	334	formats.append({
	335	'format_id': format_id,
	336	'ext': f.get('fileExt'),
	337	'tbr': int_or_none(f['bitrate']),
	338	'fps': int_or_none(f.get('frameRate')),
	339	'filesize_approx': int_or_none(f.get('size'), invscale=1024),
	340	'container': f.get('containerFormat'),
	341	'vcodec': vcodec,
	342	'height': int_or_none(f.get('height')),
	343	'width': int_or_none(f.get('width')),
	344	'url': video_url,
	345	})
	346	if '/playManifest/' in data_url:
	347	m3u8_url = sign_url(data_url.replace(
	348	'format/url', 'format/applehttp'))
	349	fmts, subs = self._extract_m3u8_formats_and_subtitles(
	350	m3u8_url, entry_id, 'mp4', 'm3u8_native',
	351	m3u8_id='hls', fatal=False)
	352	formats.extend(fmts)
	353	self._merge_subtitles(subs, target=subtitles)
	354
	355	self._sort_formats(formats)
	356
	357	if captions:
	358	for caption in captions.get('objects', []):
	359	# Continue if caption is not ready
	360	if caption.get('status') != 2:
	361	continue
	362	if not caption.get('id'):
	363	continue
	364	caption_format = int_or_none(caption.get('format'))
	365	subtitles.setdefault(caption.get('languageCode') or caption.get('language'), []).append({
	366	'url': '%s/api_v3/service/caption_captionasset/action/serve/captionAssetId/%s' % (self._SERVICE_URL, caption['id']),
	367	'ext': caption.get('fileExt') or self._CAPTION_TYPES.get(caption_format) or 'ttml',
	368	})
	369
	370	return {
	371	'id': entry_id,
	372	'title': info['name'],
	373	'formats': formats,
	374	'subtitles': subtitles,
	375	'description': clean_html(info.get('description')),
	376	'thumbnail': info.get('thumbnailUrl'),
	377	'duration': info.get('duration'),
	378	'timestamp': info.get('createdAt'),
	379	'uploader_id': format_field(info, 'userId', ignore=('None', None)),
	380	'view_count': int_or_none(info.get('plays')),
	381	}