[yt-dlp.git] / yt_dlp / extractor / rumble.py

import itertools
import re

from .common import InfoExtractor
from ..compat import compat_HTTPError
from ..utils import (
    int_or_none,
    parse_iso8601,
    traverse_obj,
    unescapeHTML,
    ExtractorError,
)


class RumbleEmbedIE(InfoExtractor):
    """Extractor for Rumble embed URLs (``rumble.com/embed/<id>``).

    Video metadata and formats are read from the ``embedJS/u3`` JSON endpoint,
    queried with the ID matched from the URL.
    """

    _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'
    _EMBED_REGEX = [fr'(?:<(?:script|iframe)[^>]+\bsrc=|["\']embedUrl["\']\s*:\s*)["\'](?P<url>{_VALID_URL})']
    _TESTS = [{
        'url': 'https://rumble.com/embed/v5pv5f',
        'md5': '36a18a049856720189f30977ccbb2c34',
        'info_dict': {
            'id': 'v5pv5f',
            'ext': 'mp4',
            'title': 'WMAR 2 News Latest Headlines | October 20, 6pm',
            'timestamp': 1571611968,
            'upload_date': '20191020',
            'channel_url': 'https://rumble.com/c/WMAR',
            'channel': 'WMAR',
            'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.OvCc-small-WMAR-2-News-Latest-Headline.jpg',
            'duration': 234,
            'uploader': 'WMAR',
            'live_status': 'not_live',
        }
    }, {
        'url': 'https://rumble.com/embed/vslb7v',
        'md5': '7418035de1a30a178b8af34dc2b6a52b',
        'info_dict': {
            'id': 'vslb7v',
            'ext': 'mp4',
            'title': 'Defense Sec. says US Commitment to NATO Defense \'Ironclad\'',
            'timestamp': 1645142135,
            'upload_date': '20220217',
            'channel_url': 'https://rumble.com/c/CyberTechNews',
            'channel': 'CTNews',
            'thumbnail': 'https://sp.rmbl.ws/s8/6/7/i/9/h/7i9hd.OvCc.jpg',
            'duration': 901,
            'uploader': 'CTNews',
            'live_status': 'not_live',
        }
    }, {
        'url': 'https://rumble.com/embed/vunh1h',
        'info_dict': {
            'id': 'vunh1h',
            'ext': 'mp4',
            'title': '‘Gideon, op zoek naar de waarheid’ including ENG SUBS',
            'timestamp': 1647197663,
            'upload_date': '20220313',
            'channel_url': 'https://rumble.com/user/BLCKBX',
            'channel': 'BLCKBX',
            'thumbnail': r're:https://.+\.jpg',
            'duration': 5069,
            'uploader': 'BLCKBX',
            'live_status': 'not_live',
            'subtitles': {
                'en': [
                    {
                        'url': r're:https://.+\.vtt',
                        'name': 'English',
                        'ext': 'vtt'
                    }
                ]
            },
        },
        'params': {'skip_download': True}
    }, {
        'url': 'https://rumble.com/embed/v1essrt',
        'info_dict': {
            'id': 'v1essrt',
            'ext': 'mp4',
            'title': 'startswith:lofi hip hop radio - beats to relax/study',
            'timestamp': 1661519399,
            'upload_date': '20220826',
            'channel_url': 'https://rumble.com/c/LofiGirl',
            'channel': 'Lofi Girl',
            'thumbnail': r're:https://.+\.jpg',
            'duration': None,
            'uploader': 'Lofi Girl',
            'live_status': 'is_live',
        },
        'params': {'skip_download': True}
    }, {
        'url': 'https://rumble.com/embed/v1amumr',
        'info_dict': {
            'id': 'v1amumr',
            'ext': 'webm',
            'fps': 60,
            'title': 'Turning Point USA 2022 Student Action Summit DAY 1  - Rumble Exclusive Live',
            'timestamp': 1658518457,
            'upload_date': '20220722',
            'channel_url': 'https://rumble.com/c/RumbleEvents',
            'channel': 'Rumble Events',
            'thumbnail': r're:https://.+\.jpg',
            'duration': 16427,
            'uploader': 'Rumble Events',
            'live_status': 'was_live',
        },
        'params': {'skip_download': True}
    }, {
        'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
        'only_matching': True,
    }]

    _WEBPAGE_TESTS = [
        {
            'note': 'Rumble embed',
            'url': 'https://rumble.com/vdmum1-moose-the-dog-helps-girls-dig-a-snow-fort.html',
            'md5': '53af34098a7f92c4e51cf0bd1c33f009',
            'info_dict': {
                'id': 'vb0ofn',
                'ext': 'mp4',
                'timestamp': 1612662578,
                'uploader': 'LovingMontana',
                'channel': 'LovingMontana',
                'upload_date': '20210207',
                'title': 'Winter-loving dog helps girls dig a snow fort ',
                'channel_url': 'https://rumble.com/c/c-546523',
                'thumbnail': 'https://sp.rmbl.ws/s8/1/5/f/x/x/5fxxb.OvCc.1-small-Moose-The-Dog-Helps-Girls-D.jpg',
                'duration': 103,
                'live_status': 'not_live',
            }
        },
        {
            'note': 'Rumble JS embed',
            'url': 'https://therightscoop.com/what-does-9-plus-1-plus-1-equal-listen-to-this-audio-of-attempted-kavanaugh-assassins-call-and-youll-get-it',
            'md5': '4701209ac99095592e73dbba21889690',
            'info_dict': {
                'id': 'v15eqxl',
                'ext': 'mp4',
                'channel': 'Mr Producer Media',
                'duration': 92,
                'title': '911 Audio From The Man Who Wanted To Kill Supreme Court Justice Kavanaugh',
                'channel_url': 'https://rumble.com/c/RichSementa',
                'thumbnail': 'https://sp.rmbl.ws/s8/1/P/j/f/A/PjfAe.OvCc-small-911-Audio-From-The-Man-Who-.jpg',
                'timestamp': 1654892716,
                'uploader': 'Mr Producer Media',
                'upload_date': '20220610',
                'live_status': 'not_live',
            }
        },
    ]

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Return the Rumble embed URLs found in ``webpage``.

        Matches found by the base class implementation (driven by
        ``_EMBED_REGEX``) take precedence. Only when there are none is the page
        scanned for inline ``Rumble("play", {"video": "<id>"...`` script calls,
        each of which is turned into an embed URL.
        """
        embeds = tuple(super()._extract_embed_urls(url, webpage))
        if embeds:
            return embeds
        return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer(
            r'<script>\s*Rumble\(\s*"play"\s*,\s*{\s*[\'"]video[\'"]\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)]

    def _real_extract(self, url):
        """Download the embed JSON for ``url`` and build the info dict.

        Returns a dict with the video ID, title, formats, subtitles,
        thumbnails, timestamp, channel/uploader fields, duration and live
        status. Malformed (null or non-dict) entries in the ``ua`` format map
        and the ``cc`` subtitle map are skipped instead of aborting extraction.
        """
        video_id = self._match_id(url)
        video = self._download_json(
            'https://rumble.com/embedJS/u3/', video_id,
            query={'request': 'video', 'ver': 2, 'v': video_id})

        # Surface any message the endpoint attached to the response
        sys_msg = traverse_obj(video, ('sys', 'msg'))
        if sys_msg:
            self.report_warning(sys_msg, video_id=video_id)

        # Map the numeric `live` flag, refined by `livestream_has_dvr`, onto a live_status
        live = video.get('live')
        has_dvr = video.get('livestream_has_dvr')
        if live == 0:
            live_status = 'not_live' if has_dvr is None else 'was_live'
        elif live == 1:
            live_status = 'is_upcoming' if has_dvr else 'was_live'
        elif live == 2:
            live_status = 'is_live'
        else:
            live_status = None

        formats = []
        for ext, ext_info in (video.get('ua') or {}).items():
            if not isinstance(ext_info, dict):
                continue
            for height, video_info in ext_info.items():
                # Validate the entry before touching it; a null entry must not abort extraction
                if not isinstance(video_info, dict) or not video_info.get('url'):
                    continue
                meta = video_info.get('meta') or {}
                if ext == 'hls':
                    # An HLS entry still flagged live while `live` is 1 overrides the status above
                    if meta.get('live') is True and live == 1:
                        live_status = 'post_live'
                    formats.extend(self._extract_m3u8_formats(
                        video_info['url'], video_id,
                        ext='mp4', m3u8_id='hls', fatal=False, live=live_status == 'is_live'))
                    continue
                formats.append({
                    'ext': ext,
                    'url': video_info['url'],
                    'format_id': f'{ext}-{height}p',
                    'height': int_or_none(height),
                    'fps': video.get('fps'),
                    # Values present in `meta` override the ones derived above (e.g. height)
                    **traverse_obj(meta, {
                        'tbr': 'bitrate',
                        'filesize': 'size',
                        'width': 'w',
                        'height': 'h',
                    }, default={})
                })

        subtitles = {
            lang: [{
                'url': sub_info['path'],
                'name': sub_info.get('language') or '',
            }]
            for lang, sub_info in (video.get('cc') or {}).items()
            if isinstance(sub_info, dict) and sub_info.get('path')
        }

        author = video.get('author') or {}
        thumbnails = traverse_obj(video, ('t', ..., {'url': 'i', 'width': 'w', 'height': 'h'}))
        if not thumbnails and video.get('i'):
            # Fall back to the single top-level image when no thumbnail list is given
            thumbnails = [{'url': video['i']}]

        # No duration is reported for streams that are live or just ended
        if live_status in {'is_live', 'post_live'}:
            duration = None
        else:
            duration = int_or_none(video.get('duration'))

        return {
            'id': video_id,
            'title': unescapeHTML(video.get('title')),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': thumbnails,
            'timestamp': parse_iso8601(video.get('pubDate')),
            'channel': author.get('name'),
            'channel_url': author.get('url'),
            'duration': duration,
            'uploader': author.get('name'),
            'live_status': live_status,
        }


class RumbleChannelIE(InfoExtractor):
    """Playlist extractor for Rumble channel (``/c/<id>``) and user (``/user/<id>``) pages."""

    _VALID_URL = r'(?P<url>https?://(?:www\.)?rumble\.com/(?:c|user)/(?P<id>[^&?#$/]+))'

    _TESTS = [{
        'url': 'https://rumble.com/c/Styxhexenhammer666',
        'playlist_mincount': 1160,
        'info_dict': {
            'id': 'Styxhexenhammer666',
        },
    }, {
        'url': 'https://rumble.com/user/goldenpoodleharleyeuna',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'goldenpoodleharleyeuna',
        },
    }]

    def entries(self, url, playlist_id):
        """Yield a URL result for each video link on successive listing pages.

        Pages are requested as ``<url>?page=N`` for N = 1, 2, ... Iteration
        stops when a page answers with HTTP 404 or contains no video links;
        any other download error is re-raised.
        """
        for page in itertools.count(1):
            try:
                webpage = self._download_webpage(
                    f'{url}?page={page}', playlist_id, note=f'Downloading page {page}')
            except ExtractorError as e:
                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
                    break
                raise
            video_urls = re.findall(r'class=video-item--a\s?href=([^>]+\.html)', webpage)
            if not video_urls:
                # Without this, a server that answers out-of-range pages with an
                # empty listing instead of a 404 would keep the loop running forever
                break
            for video_url in video_urls:
                yield self.url_result('https://rumble.com' + video_url)

    def _real_extract(self, url):
        """Return a playlist of all videos listed for the matched channel or user."""
        url, playlist_id = self._match_valid_url(url).groups()
        return self.playlist_result(self.entries(url, playlist_id), playlist_id=playlist_id)
Commit	Line	Data
f1d42a83	1	import itertools
62852977	2	import re
62852977	3
70c5802b	4	from .common import InfoExtractor
0d8affc1	5	from ..compat import compat_HTTPError
70c5802b	6	from ..utils import (
70c5802b	7	int_or_none,
70c5802b	8	parse_iso8601,
0d8affc1	9	traverse_obj,
4e34889f	10	unescapeHTML,
f1d42a83	11	ExtractorError,
70c5802b	12	)
	13
	14
	15	class RumbleEmbedIE(InfoExtractor):
	16	_VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'
bfd973ec	17	_EMBED_REGEX = [fr'(?:<(?:script\|iframe)[^>]+\bsrc=\|["\']embedUrl["\']\s:\s)["\'](?P<url>{_VALID_URL})']
70c5802b	18	_TESTS = [{
	19	'url': 'https://rumble.com/embed/v5pv5f',
	20	'md5': '36a18a049856720189f30977ccbb2c34',
	21	'info_dict': {
	22	'id': 'v5pv5f',
	23	'ext': 'mp4',
	24	'title': 'WMAR 2 News Latest Headlines \| October 20, 6pm',
	25	'timestamp': 1571611968,
	26	'upload_date': '20191020',
64fa820c	27	'channel_url': 'https://rumble.com/c/WMAR',
	28	'channel': 'WMAR',
	29	'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.OvCc-small-WMAR-2-News-Latest-Headline.jpg',
	30	'duration': 234,
	31	'uploader': 'WMAR',
0d8affc1	32	'live_status': 'not_live',
70c5802b	33	}
4e34889f	34	}, {
	35	'url': 'https://rumble.com/embed/vslb7v',
	36	'md5': '7418035de1a30a178b8af34dc2b6a52b',
	37	'info_dict': {
	38	'id': 'vslb7v',
	39	'ext': 'mp4',
	40	'title': 'Defense Sec. says US Commitment to NATO Defense \'Ironclad\'',
	41	'timestamp': 1645142135,
	42	'upload_date': '20220217',
	43	'channel_url': 'https://rumble.com/c/CyberTechNews',
	44	'channel': 'CTNews',
	45	'thumbnail': 'https://sp.rmbl.ws/s8/6/7/i/9/h/7i9hd.OvCc.jpg',
	46	'duration': 901,
64fa820c	47	'uploader': 'CTNews',
0d8affc1	48	'live_status': 'not_live',
4e34889f	49	}
0d8affc1 M	50	}, {
	51	'url': 'https://rumble.com/embed/vunh1h',
	52	'info_dict': {
	53	'id': 'vunh1h',
	54	'ext': 'mp4',
	55	'title': '‘Gideon, op zoek naar de waarheid’ including ENG SUBS',
	56	'timestamp': 1647197663,
	57	'upload_date': '20220313',
	58	'channel_url': 'https://rumble.com/user/BLCKBX',
	59	'channel': 'BLCKBX',
	60	'thumbnail': r're:https://.+\.jpg',
	61	'duration': 5069,
	62	'uploader': 'BLCKBX',
	63	'live_status': 'not_live',
	64	'subtitles': {
	65	'en': [
	66	{
	67	'url': r're:https://.+\.vtt',
	68	'name': 'English',
	69	'ext': 'vtt'
	70	}
	71	]
	72	},
	73	},
	74	'params': {'skip_download': True}
	75	}, {
	76	'url': 'https://rumble.com/embed/v1essrt',
	77	'info_dict': {
	78	'id': 'v1essrt',
	79	'ext': 'mp4',
	80	'title': 'startswith:lofi hip hop radio - beats to relax/study',
	81	'timestamp': 1661519399,
	82	'upload_date': '20220826',
	83	'channel_url': 'https://rumble.com/c/LofiGirl',
	84	'channel': 'Lofi Girl',
	85	'thumbnail': r're:https://.+\.jpg',
	86	'duration': None,
	87	'uploader': 'Lofi Girl',
	88	'live_status': 'is_live',
	89	},
	90	'params': {'skip_download': True}
	91	}, {
	92	'url': 'https://rumble.com/embed/v1amumr',
	93	'info_dict': {
	94	'id': 'v1amumr',
	95	'ext': 'webm',
	96	'fps': 60,
	97	'title': 'Turning Point USA 2022 Student Action Summit DAY 1 - Rumble Exclusive Live',
	98	'timestamp': 1658518457,
	99	'upload_date': '20220722',
	100	'channel_url': 'https://rumble.com/c/RumbleEvents',
	101	'channel': 'Rumble Events',
	102	'thumbnail': r're:https://.+\.jpg',
	103	'duration': 16427,
	104	'uploader': 'Rumble Events',
	105	'live_status': 'was_live',
	106	},
	107	'params': {'skip_download': True}
70c5802b	108	}, {
	109	'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
	110	'only_matching': True,
	111	}]
	112
0d8affc1 M	113	_WEBPAGE_TESTS = [
	114	{
	115	'note': 'Rumble embed',
	116	'url': 'https://rumble.com/vdmum1-moose-the-dog-helps-girls-dig-a-snow-fort.html',
	117	'md5': '53af34098a7f92c4e51cf0bd1c33f009',
	118	'info_dict': {
	119	'id': 'vb0ofn',
	120	'ext': 'mp4',
	121	'timestamp': 1612662578,
	122	'uploader': 'LovingMontana',
	123	'channel': 'LovingMontana',
	124	'upload_date': '20210207',
	125	'title': 'Winter-loving dog helps girls dig a snow fort ',
	126	'channel_url': 'https://rumble.com/c/c-546523',
	127	'thumbnail': 'https://sp.rmbl.ws/s8/1/5/f/x/x/5fxxb.OvCc.1-small-Moose-The-Dog-Helps-Girls-D.jpg',
	128	'duration': 103,
	129	'live_status': 'not_live',
	130	}
	131	},
	132	{
	133	'note': 'Rumble JS embed',
	134	'url': 'https://therightscoop.com/what-does-9-plus-1-plus-1-equal-listen-to-this-audio-of-attempted-kavanaugh-assassins-call-and-youll-get-it',
	135	'md5': '4701209ac99095592e73dbba21889690',
	136	'info_dict': {
	137	'id': 'v15eqxl',
	138	'ext': 'mp4',
	139	'channel': 'Mr Producer Media',
	140	'duration': 92,
	141	'title': '911 Audio From The Man Who Wanted To Kill Supreme Court Justice Kavanaugh',
	142	'channel_url': 'https://rumble.com/c/RichSementa',
	143	'thumbnail': 'https://sp.rmbl.ws/s8/1/P/j/f/A/PjfAe.OvCc-small-911-Audio-From-The-Man-Who-.jpg',
	144	'timestamp': 1654892716,
	145	'uploader': 'Mr Producer Media',
	146	'upload_date': '20220610',
	147	'live_status': 'not_live',
	148	}
	149	},
	150	]
	151
79e591b5	152	@classmethod
bfd973ec	153	def _extract_embed_urls(cls, url, webpage):
bfd973ec	154	embeds = tuple(super()._extract_embed_urls(url, webpage))
79e591b5	155	if embeds:
bfd973ec	156	return embeds
79e591b5	157	return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer(
79e591b5	158	r'<script>\sRumble\(\s"play"\s,\s{\s[\'"]video[\'"]\s:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)]
62852977	159
70c5802b	160	def _real_extract(self, url):
	161	video_id = self._match_id(url)
	162	video = self._download_json(
0d8affc1 M	163	'https://rumble.com/embedJS/u3/', video_id,
	164	query={'request': 'video', 'ver': 2, 'v': video_id})
	165
	166	sys_msg = traverse_obj(video, ('sys', 'msg'))
	167	if sys_msg:
	168	self.report_warning(sys_msg, video_id=video_id)
	169
	170	if video.get('live') == 0:
	171	live_status = 'not_live' if video.get('livestream_has_dvr') is None else 'was_live'
	172	elif video.get('live') == 1:
	173	live_status = 'is_upcoming' if video.get('livestream_has_dvr') else 'was_live'
	174	elif video.get('live') == 2:
	175	live_status = 'is_live'
	176	else:
	177	live_status = None
70c5802b	178
70c5802b	179	formats = []
0d8affc1 M	180	for ext, ext_info in (video.get('ua') or {}).items():
	181	for height, video_info in (ext_info or {}).items():
	182	meta = video_info.get('meta') or {}
	183	if not video_info.get('url'):
	184	continue
	185	if ext == 'hls':
	186	if meta.get('live') is True and video.get('live') == 1:
	187	live_status = 'post_live'
	188	formats.extend(self._extract_m3u8_formats(
	189	video_info['url'], video_id,
	190	ext='mp4', m3u8_id='hls', fatal=False, live=live_status == 'is_live'))
	191	continue
	192	formats.append({
	193	'ext': ext,
	194	'url': video_info['url'],
	195	'format_id': '%s-%sp' % (ext, height),
	196	'height': int_or_none(height),
	197	'fps': video.get('fps'),
	198	**traverse_obj(meta, {
	199	'tbr': 'bitrate',
	200	'filesize': 'size',
	201	'width': 'w',
	202	'height': 'h',
	203	}, default={})
	204	})
70c5802b	205
92922fe7 F	206	subtitles = {
	207	lang: [{
	208	'url': sub_info['path'],
	209	'name': sub_info.get('language') or '',
	210	}] for lang, sub_info in (video.get('cc') or {}).items() if sub_info.get('path')
	211	}
	212
70c5802b	213	author = video.get('author') or {}
0d8affc1 M	214	thumbnails = traverse_obj(video, ('t', ..., {'url': 'i', 'width': 'w', 'height': 'h'}))
	215	if not thumbnails and video.get('i'):
	216	thumbnails = [{'url': video['i']}]
	217
	218	if live_status in {'is_live', 'post_live'}:
	219	duration = None
	220	else:
	221	duration = int_or_none(video.get('duration'))
70c5802b	222
	223	return {
	224	'id': video_id,
0d8affc1	225	'title': unescapeHTML(video.get('title')),
70c5802b	226	'formats': formats,
92922fe7	227	'subtitles': subtitles,
0d8affc1	228	'thumbnails': thumbnails,
70c5802b	229	'timestamp': parse_iso8601(video.get('pubDate')),
	230	'channel': author.get('name'),
	231	'channel_url': author.get('url'),
0d8affc1	232	'duration': duration,
64fa820c	233	'uploader': author.get('name'),
0d8affc1	234	'live_status': live_status,
70c5802b	235	}
f1d42a83 AG	236
	237
	238	class RumbleChannelIE(InfoExtractor):
	239	_VALID_URL = r'(?P<url>https?://(?:www\.)?rumble\.com/(?:c\|user)/(?P<id>[^&?#$/]+))'
	240
	241	_TESTS = [{
	242	'url': 'https://rumble.com/c/Styxhexenhammer666',
	243	'playlist_mincount': 1160,
	244	'info_dict': {
	245	'id': 'Styxhexenhammer666',
	246	},
	247	}, {
	248	'url': 'https://rumble.com/user/goldenpoodleharleyeuna',
0d8affc1	249	'playlist_mincount': 4,
f1d42a83 AG	250	'info_dict': {
	251	'id': 'goldenpoodleharleyeuna',
	252	},
	253	}]
	254
	255	def entries(self, url, playlist_id):
	256	for page in itertools.count(1):
	257	try:
	258	webpage = self._download_webpage(f'{url}?page={page}', playlist_id, note='Downloading page %d' % page)
	259	except ExtractorError as e:
	260	if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
	261	break
	262	raise
	263	for video_url in re.findall(r'class=video-item--a\s?href=([^>]+\.html)', webpage):
	264	yield self.url_result('https://rumble.com' + video_url)
	265
	266	def _real_extract(self, url):
	267	url, playlist_id = self._match_valid_url(url).groups()
	268	return self.playlist_result(self.entries(url, playlist_id), playlist_id=playlist_id)