[yt-dlp.git] / yt_dlp / extractor / rumble.py

import itertools
import re

from .common import InfoExtractor
from ..compat import compat_str, compat_HTTPError
from ..utils import (
    determine_ext,
    int_or_none,
    parse_iso8601,
    try_get,
    unescapeHTML,
    ExtractorError,
)


class RumbleEmbedIE(InfoExtractor):
    """Extractor for single videos addressed by a rumble.com embed URL.

    Video metadata and media URLs are read from the ``embedJS`` JSON endpoint.
    """

    # The path may carry an optional "<prefix>." segment before the video id.
    _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'
    # Matches an embed URL used as a <script>/<iframe> src or as an "embedUrl" value.
    _EMBED_REGEX = [fr'(?:<(?:script|iframe)[^>]+\bsrc=|["\']embedUrl["\']\s*:\s*)["\'](?P<url>{_VALID_URL})']
    _TESTS = [{
        'url': 'https://rumble.com/embed/v5pv5f',
        'md5': '36a18a049856720189f30977ccbb2c34',
        'info_dict': {
            'id': 'v5pv5f',
            'ext': 'mp4',
            'title': 'WMAR 2 News Latest Headlines | October 20, 6pm',
            'timestamp': 1571611968,
            'upload_date': '20191020',
            'channel_url': 'https://rumble.com/c/WMAR',
            'channel': 'WMAR',
            'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.OvCc-small-WMAR-2-News-Latest-Headline.jpg',
            'duration': 234,
            'uploader': 'WMAR',
        },
    }, {
        'url': 'https://rumble.com/embed/vslb7v',
        'md5': '7418035de1a30a178b8af34dc2b6a52b',
        'info_dict': {
            'id': 'vslb7v',
            'ext': 'mp4',
            'title': 'Defense Sec. says US Commitment to NATO Defense \'Ironclad\'',
            'timestamp': 1645142135,
            'upload_date': '20220217',
            'channel_url': 'https://rumble.com/c/CyberTechNews',
            'channel': 'CTNews',
            'thumbnail': 'https://sp.rmbl.ws/s8/6/7/i/9/h/7i9hd.OvCc.jpg',
            'duration': 901,
            'uploader': 'CTNews',
        },
    }, {
        'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
        'only_matching': True,
    }]

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Return the Rumble embed URLs referenced by ``webpage``.

        Results of the base-class implementation are returned (as a tuple)
        when there are any. Otherwise the page is scanned for inline
        ``Rumble("play", {"video": "<id>"...`` script calls and an embed URL
        is built for each id found (returned as a list).
        """
        generic_embeds = tuple(super()._extract_embed_urls(url, webpage))
        if not generic_embeds:
            player_calls = re.finditer(
                r'<script>\s*Rumble\(\s*"play"\s*,\s*{\s*[\'"]video[\'"]\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]',
                webpage)
            return [f'https://rumble.com/embed/{call.group("id")}' for call in player_calls]
        return generic_embeds

    def _real_extract(self, url):
        """Fetch the embedJS metadata for ``url`` and build the info dict.

        Raises KeyError if the JSON response has no ``title`` key.
        """
        video_id = self._match_id(url)
        video = self._download_json(
            'https://rumble.com/embedJS/', video_id,
            query={'request': 'video', 'v': video_id})
        title = unescapeHTML(video['title'])

        formats = []
        # 'ua' maps a height label to an indexable value: positions 0 and 1 are
        # probed for media URLs, and position idx + 2 for a mapping that may
        # hold the matching 'bitrate'.
        for height, sources in (video.get('ua') or {}).items():
            for idx in (0, 1):
                media_url = try_get(sources, lambda x: x[idx], compat_str)
                if not media_url:
                    continue
                ext = determine_ext(media_url)
                fmt = {
                    'ext': ext,
                    'format_id': f'{ext}-{height}p',
                    'height': int_or_none(height),
                    'url': media_url,
                }
                bitrate = try_get(sources, lambda x: x[idx + 2]['bitrate'])
                if bitrate:
                    fmt['tbr'] = int_or_none(bitrate)
                formats.append(fmt)
        self._sort_formats(formats)

        # One subtitle track per 'cc' entry; entries without a 'path' are skipped.
        subtitles = {}
        for lang, sub_info in (video.get('cc') or {}).items():
            if not sub_info.get('path'):
                continue
            subtitles[lang] = [{
                'url': sub_info['path'],
                'name': sub_info.get('language') or '',
            }]

        author = video.get('author') or {}
        # The author name is reported as both the channel and the uploader.
        author_name = author.get('name')

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'thumbnail': video.get('i'),
            'timestamp': parse_iso8601(video.get('pubDate')),
            'channel': author_name,
            'channel_url': author.get('url'),
            'duration': int_or_none(video.get('duration')),
            'uploader': author_name,
        }


class RumbleChannelIE(InfoExtractor):
    """Extractor that turns a rumble.com ``/c/<name>`` or ``/user/<name>`` page into a playlist."""

    # Group 'url' captures the page URL up to and including the name; 'id' is the name itself.
    _VALID_URL = r'(?P<url>https?://(?:www\.)?rumble\.com/(?:c|user)/(?P<id>[^&?#$/]+))'

    _TESTS = [{
        'url': 'https://rumble.com/c/Styxhexenhammer666',
        'playlist_mincount': 1160,
        'info_dict': {
            'id': 'Styxhexenhammer666',
        },
    }, {
        'url': 'https://rumble.com/user/goldenpoodleharleyeuna',
        'playlist_count': 4,
        'info_dict': {
            'id': 'goldenpoodleharleyeuna',
        },
    }]

    def entries(self, url, playlist_id):
        """Yield a URL result for every video link found on successive listing pages.

        Pages are requested as ``<url>?page=N`` starting at N=1. An HTTP 404
        ends the iteration; any other download error is re-raised.
        """
        # NOTE(review): the loop ends only on a 404. If the site ever answers
        # out-of-range pages with a 200 and no items, this never terminates -
        # confirm the server behaviour.
        for page_num in itertools.count(start=1):
            try:
                webpage = self._download_webpage(
                    f'{url}?page={page_num}', playlist_id,
                    note=f'Downloading page {page_num}')
            except ExtractorError as err:
                past_last_page = isinstance(err.cause, compat_HTTPError) and err.cause.code == 404
                if past_last_page:
                    return
                raise
            # Captured hrefs are site-relative paths ending in ".html".
            for link in re.finditer(r'class=video-item--a\s?href=([^>]+\.html)', webpage):
                yield self.url_result('https://rumble.com' + link.group(1))

    def _real_extract(self, url):
        """Build a playlist whose id is the channel/user name taken from ``url``."""
        channel_url, playlist_id = self._match_valid_url(url).group('url', 'id')
        return self.playlist_result(
            self.entries(channel_url, playlist_id), playlist_id=playlist_id)
Commit	Line	Data
f1d42a83	1	import itertools
62852977	2	import re
62852977	3
70c5802b	4	from .common import InfoExtractor
f1d42a83	5	from ..compat import compat_str, compat_HTTPError
70c5802b	6	from ..utils import (
	7	determine_ext,
	8	int_or_none,
	9	parse_iso8601,
	10	try_get,
4e34889f	11	unescapeHTML,
f1d42a83	12	ExtractorError,
70c5802b	13	)
	14
	15
	16	class RumbleEmbedIE(InfoExtractor):
	17	_VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'
bfd973ec	18	_EMBED_REGEX = [fr'(?:<(?:script\|iframe)[^>]+\bsrc=\|["\']embedUrl["\']\s:\s)["\'](?P<url>{_VALID_URL})']
70c5802b	19	_TESTS = [{
	20	'url': 'https://rumble.com/embed/v5pv5f',
	21	'md5': '36a18a049856720189f30977ccbb2c34',
	22	'info_dict': {
	23	'id': 'v5pv5f',
	24	'ext': 'mp4',
	25	'title': 'WMAR 2 News Latest Headlines \| October 20, 6pm',
	26	'timestamp': 1571611968,
	27	'upload_date': '20191020',
64fa820c	28	'channel_url': 'https://rumble.com/c/WMAR',
	29	'channel': 'WMAR',
	30	'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.OvCc-small-WMAR-2-News-Latest-Headline.jpg',
	31	'duration': 234,
	32	'uploader': 'WMAR',
70c5802b	33	}
4e34889f	34	}, {
	35	'url': 'https://rumble.com/embed/vslb7v',
	36	'md5': '7418035de1a30a178b8af34dc2b6a52b',
	37	'info_dict': {
	38	'id': 'vslb7v',
	39	'ext': 'mp4',
	40	'title': 'Defense Sec. says US Commitment to NATO Defense \'Ironclad\'',
	41	'timestamp': 1645142135,
	42	'upload_date': '20220217',
	43	'channel_url': 'https://rumble.com/c/CyberTechNews',
	44	'channel': 'CTNews',
	45	'thumbnail': 'https://sp.rmbl.ws/s8/6/7/i/9/h/7i9hd.OvCc.jpg',
	46	'duration': 901,
64fa820c	47	'uploader': 'CTNews',
4e34889f	48	}
70c5802b	49	}, {
	50	'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
	51	'only_matching': True,
	52	}]
	53
79e591b5	54	@classmethod
bfd973ec	55	def _extract_embed_urls(cls, url, webpage):
bfd973ec	56	embeds = tuple(super()._extract_embed_urls(url, webpage))
79e591b5	57	if embeds:
bfd973ec	58	return embeds
79e591b5	59	return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer(
79e591b5	60	r'<script>\sRumble\(\s"play"\s,\s{\s[\'"]video[\'"]\s:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)]
62852977	61
70c5802b	62	def _real_extract(self, url):
	63	video_id = self._match_id(url)
	64	video = self._download_json(
	65	'https://rumble.com/embedJS/', video_id,
	66	query={'request': 'video', 'v': video_id})
4e34889f	67	title = unescapeHTML(video['title'])
70c5802b	68
	69	formats = []
	70	for height, ua in (video.get('ua') or {}).items():
	71	for i in range(2):
	72	f_url = try_get(ua, lambda x: x[i], compat_str)
	73	if f_url:
	74	ext = determine_ext(f_url)
	75	f = {
	76	'ext': ext,
	77	'format_id': '%s-%sp' % (ext, height),
	78	'height': int_or_none(height),
	79	'url': f_url,
	80	}
	81	bitrate = try_get(ua, lambda x: x[i + 2]['bitrate'])
	82	if bitrate:
	83	f['tbr'] = int_or_none(bitrate)
	84	formats.append(f)
	85	self._sort_formats(formats)
	86
92922fe7 F	87	subtitles = {
	88	lang: [{
	89	'url': sub_info['path'],
	90	'name': sub_info.get('language') or '',
	91	}] for lang, sub_info in (video.get('cc') or {}).items() if sub_info.get('path')
	92	}
	93
70c5802b	94	author = video.get('author') or {}
	95
	96	return {
	97	'id': video_id,
	98	'title': title,
	99	'formats': formats,
92922fe7	100	'subtitles': subtitles,
70c5802b	101	'thumbnail': video.get('i'),
	102	'timestamp': parse_iso8601(video.get('pubDate')),
	103	'channel': author.get('name'),
	104	'channel_url': author.get('url'),
	105	'duration': int_or_none(video.get('duration')),
64fa820c	106	'uploader': author.get('name'),
70c5802b	107	}
f1d42a83 AG	108
	109
	110	class RumbleChannelIE(InfoExtractor):
	111	_VALID_URL = r'(?P<url>https?://(?:www\.)?rumble\.com/(?:c\|user)/(?P<id>[^&?#$/]+))'
	112
	113	_TESTS = [{
	114	'url': 'https://rumble.com/c/Styxhexenhammer666',
	115	'playlist_mincount': 1160,
	116	'info_dict': {
	117	'id': 'Styxhexenhammer666',
	118	},
	119	}, {
	120	'url': 'https://rumble.com/user/goldenpoodleharleyeuna',
	121	'playlist_count': 4,
	122	'info_dict': {
	123	'id': 'goldenpoodleharleyeuna',
	124	},
	125	}]
	126
	127	def entries(self, url, playlist_id):
	128	for page in itertools.count(1):
	129	try:
	130	webpage = self._download_webpage(f'{url}?page={page}', playlist_id, note='Downloading page %d' % page)
	131	except ExtractorError as e:
	132	if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
	133	break
	134	raise
	135	for video_url in re.findall(r'class=video-item--a\s?href=([^>]+\.html)', webpage):
	136	yield self.url_result('https://rumble.com' + video_url)
	137
	138	def _real_extract(self, url):
	139	url, playlist_id = self._match_valid_url(url).groups()
	140	return self.playlist_result(self.entries(url, playlist_id), playlist_id=playlist_id)