jfr.im git - yt-dlp.git/blame_incremental - yt

... / ...

Commit	Line	Data
	1	import itertools
	2	import re
	3
	4	from .common import InfoExtractor
	5	from ..compat import compat_str, compat_HTTPError
	6	from ..utils import (
	7	determine_ext,
	8	int_or_none,
	9	parse_iso8601,
	10	try_get,
	11	unescapeHTML,
	12	ExtractorError,
	13	)
	14
	15
	16	class RumbleEmbedIE(InfoExtractor):
	17	_VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'
	18	_EMBED_REGEX = [fr'(?:<(?:script\|iframe)[^>]+\bsrc=\|["\']embedUrl["\']\s:\s)["\'](?P<url>{_VALID_URL})']
	19	_TESTS = [{
	20	'url': 'https://rumble.com/embed/v5pv5f',
	21	'md5': '36a18a049856720189f30977ccbb2c34',
	22	'info_dict': {
	23	'id': 'v5pv5f',
	24	'ext': 'mp4',
	25	'title': 'WMAR 2 News Latest Headlines \| October 20, 6pm',
	26	'timestamp': 1571611968,
	27	'upload_date': '20191020',
	28	'channel_url': 'https://rumble.com/c/WMAR',
	29	'channel': 'WMAR',
	30	'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.OvCc-small-WMAR-2-News-Latest-Headline.jpg',
	31	'duration': 234,
	32	'uploader': 'WMAR',
	33	}
	34	}, {
	35	'url': 'https://rumble.com/embed/vslb7v',
	36	'md5': '7418035de1a30a178b8af34dc2b6a52b',
	37	'info_dict': {
	38	'id': 'vslb7v',
	39	'ext': 'mp4',
	40	'title': 'Defense Sec. says US Commitment to NATO Defense \'Ironclad\'',
	41	'timestamp': 1645142135,
	42	'upload_date': '20220217',
	43	'channel_url': 'https://rumble.com/c/CyberTechNews',
	44	'channel': 'CTNews',
	45	'thumbnail': 'https://sp.rmbl.ws/s8/6/7/i/9/h/7i9hd.OvCc.jpg',
	46	'duration': 901,
	47	'uploader': 'CTNews',
	48	}
	49	}, {
	50	'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
	51	'only_matching': True,
	52	}]
	53
	54	@classmethod
	55	def _extract_embed_urls(cls, url, webpage):
	56	embeds = tuple(super()._extract_embed_urls(url, webpage))
	57	if embeds:
	58	return embeds
	59	return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer(
	60	r'<script>\sRumble\(\s"play"\s,\s{\s[\'"]video[\'"]\s:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)]
	61
	62	def _real_extract(self, url):
	63	video_id = self._match_id(url)
	64	video = self._download_json(
	65	'https://rumble.com/embedJS/', video_id,
	66	query={'request': 'video', 'v': video_id})
	67	title = unescapeHTML(video['title'])
	68
	69	formats = []
	70	for height, ua in (video.get('ua') or {}).items():
	71	for i in range(2):
	72	f_url = try_get(ua, lambda x: x[i], compat_str)
	73	if f_url:
	74	ext = determine_ext(f_url)
	75	f = {
	76	'ext': ext,
	77	'format_id': '%s-%sp' % (ext, height),
	78	'height': int_or_none(height),
	79	'url': f_url,
	80	}
	81	bitrate = try_get(ua, lambda x: x[i + 2]['bitrate'])
	82	if bitrate:
	83	f['tbr'] = int_or_none(bitrate)
	84	formats.append(f)
	85	self._sort_formats(formats)
	86
	87	subtitles = {
	88	lang: [{
	89	'url': sub_info['path'],
	90	'name': sub_info.get('language') or '',
	91	}] for lang, sub_info in (video.get('cc') or {}).items() if sub_info.get('path')
	92	}
	93
	94	author = video.get('author') or {}
	95
	96	return {
	97	'id': video_id,
	98	'title': title,
	99	'formats': formats,
	100	'subtitles': subtitles,
	101	'thumbnail': video.get('i'),
	102	'timestamp': parse_iso8601(video.get('pubDate')),
	103	'channel': author.get('name'),
	104	'channel_url': author.get('url'),
	105	'duration': int_or_none(video.get('duration')),
	106	'uploader': author.get('name'),
	107	}
	108
	109
	110	class RumbleChannelIE(InfoExtractor):
	111	_VALID_URL = r'(?P<url>https?://(?:www\.)?rumble\.com/(?:c\|user)/(?P<id>[^&?#$/]+))'
	112
	113	_TESTS = [{
	114	'url': 'https://rumble.com/c/Styxhexenhammer666',
	115	'playlist_mincount': 1160,
	116	'info_dict': {
	117	'id': 'Styxhexenhammer666',
	118	},
	119	}, {
	120	'url': 'https://rumble.com/user/goldenpoodleharleyeuna',
	121	'playlist_count': 4,
	122	'info_dict': {
	123	'id': 'goldenpoodleharleyeuna',
	124	},
	125	}]
	126
	127	def entries(self, url, playlist_id):
	128	for page in itertools.count(1):
	129	try:
	130	webpage = self._download_webpage(f'{url}?page={page}', playlist_id, note='Downloading page %d' % page)
	131	except ExtractorError as e:
	132	if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
	133	break
	134	raise
	135	for video_url in re.findall(r'class=video-item--a\s?href=([^>]+\.html)', webpage):
	136	yield self.url_result('https://rumble.com' + video_url)
	137
	138	def _real_extract(self, url):
	139	url, playlist_id = self._match_valid_url(url).groups()
	140	return self.playlist_result(self.entries(url, playlist_id), playlist_id=playlist_id)