[yt-dlp.git] / yt_dlp / extractor / rumble.py

import itertools
import re

from .common import InfoExtractor
from ..compat import compat_str, compat_HTTPError
from ..utils import (
    determine_ext,
    int_or_none,
    parse_iso8601,
    try_get,
    unescapeHTML,
    ExtractorError,
)


class RumbleEmbedIE(InfoExtractor):
    """Extractor for Rumble embed-player URLs (``rumble.com/embed/<id>``).

    The video id is the last dot-separated component of the embed path, so
    ``/embed/ufe9n.v5pv5f`` and ``/embed/v5pv5f`` both yield ``v5pv5f``.
    """

    _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'
    _TESTS = [{
        'url': 'https://rumble.com/embed/v5pv5f',
        'md5': '36a18a049856720189f30977ccbb2c34',
        'info_dict': {
            'id': 'v5pv5f',
            'ext': 'mp4',
            'title': 'WMAR 2 News Latest Headlines | October 20, 6pm',
            'timestamp': 1571611968,
            'upload_date': '20191020',
        }
    }, {
        'url': 'https://rumble.com/embed/vslb7v',
        'md5': '7418035de1a30a178b8af34dc2b6a52b',
        'info_dict': {
            'id': 'vslb7v',
            'ext': 'mp4',
            'title': 'Defense Sec. says US Commitment to NATO Defense \'Ironclad\'',
            'timestamp': 1645142135,
            'upload_date': '20220217',
            'channel_url': 'https://rumble.com/c/CyberTechNews',
            'channel': 'CTNews',
            'thumbnail': 'https://sp.rmbl.ws/s8/6/7/i/9/h/7i9hd.OvCc.jpg',
            'duration': 901,
        }
    }, {
        # Two-component embed path; only URL matching is checked.
        'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return all Rumble embed URLs referenced in *webpage*.

        A URL is picked up when it is the quoted ``src`` of a ``<script>`` or
        ``<iframe>`` tag, or the quoted value of an ``embedUrl`` key.
        """
        embed_re = (
            r'(?:<(?:script|iframe)[^>]+\bsrc=|["\']embedUrl["\']\s*:\s*)["\'](?P<url>%s)'
            % RumbleEmbedIE._VALID_URL)
        return [match.group('url') for match in re.finditer(embed_re, webpage)]

    def _real_extract(self, url):
        """Fetch the embedJS metadata for *url* and build the info dict."""
        video_id = self._match_id(url)
        video = self._download_json(
            'https://rumble.com/embedJS/', video_id,
            query={'request': 'video', 'v': video_id})
        # The title is mandatory: a missing key raises here, before any
        # format processing happens.
        title = unescapeHTML(video['title'])

        formats = []
        # 'ua' is iterated as a mapping whose keys are used as the height.
        # Each value is probed at indices 0 and 1 for a URL string, and at
        # the index two positions later for a dict carrying 'bitrate'.
        # NOTE(review): the exact layout of these entries is not visible
        # here — confirm against a live API response.
        for height, ua in (video.get('ua') or {}).items():
            for idx in (0, 1):
                f_url = try_get(ua, lambda x: x[idx], compat_str)
                if not f_url:
                    continue
                ext = determine_ext(f_url)
                fmt = {
                    'ext': ext,
                    'format_id': '%s-%sp' % (ext, height),
                    'height': int_or_none(height),
                    'url': f_url,
                }
                bitrate = try_get(ua, lambda x: x[idx + 2]['bitrate'])
                if bitrate:
                    fmt['tbr'] = int_or_none(bitrate)
                formats.append(fmt)
        self._sort_formats(formats)

        # Normalise a missing/null author to an empty dict so the channel
        # lookups below simply yield None.
        author = video.get('author') or {}

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'thumbnail': video.get('i'),
            'timestamp': parse_iso8601(video.get('pubDate')),
            'channel': author.get('name'),
            'channel_url': author.get('url'),
            'duration': int_or_none(video.get('duration')),
        }


class RumbleChannelIE(InfoExtractor):
    """Playlist extractor for Rumble channel (``/c/<name>``) and user
    (``/user/<name>``) pages."""

    # The outer 'url' group captures the canonical channel URL without any
    # query string or fragment, so page numbers can be appended to it.
    _VALID_URL = r'(?P<url>https?://(?:www\.)?rumble\.com/(?:c|user)/(?P<id>[^&?#$/]+))'

    _TESTS = [{
        'url': 'https://rumble.com/c/Styxhexenhammer666',
        'playlist_mincount': 1160,
        'info_dict': {
            'id': 'Styxhexenhammer666',
        },
    }, {
        'url': 'https://rumble.com/user/goldenpoodleharleyeuna',
        'playlist_count': 4,
        'info_dict': {
            'id': 'goldenpoodleharleyeuna',
        },
    }]

    def entries(self, url, playlist_id):
        """Yield a url_result for every video link found on each listing page.

        Pages are requested as ``<url>?page=N`` starting from 1. An HTTP 404
        ends the iteration; any other download error propagates.
        """
        link_re = r'class=video-item--a\s?href=([^>]+\.html)'
        for page_num in itertools.count(1):
            try:
                webpage = self._download_webpage(
                    f'{url}?page={page_num}', playlist_id,
                    note='Downloading page %d' % page_num)
            except ExtractorError as e:
                past_last_page = isinstance(e.cause, compat_HTTPError) and e.cause.code == 404
                if not past_last_page:
                    raise
                return
            # The captured href is prefixed with the site origin below.
            video_paths = re.findall(link_re, webpage)
            for video_path in video_paths:
                yield self.url_result('https://rumble.com' + video_path)

    def _real_extract(self, url):
        """Return a playlist result whose entries are produced by entries()."""
        channel_url, playlist_id = self._match_valid_url(url).group('url', 'id')
        return self.playlist_result(
            self.entries(channel_url, playlist_id), playlist_id=playlist_id)
Commit	Line	Data
f1d42a83	1	import itertools
62852977	2	import re
62852977	3
70c5802b	4	from .common import InfoExtractor
f1d42a83	5	from ..compat import compat_str, compat_HTTPError
70c5802b	6	from ..utils import (
	7	determine_ext,
	8	int_or_none,
	9	parse_iso8601,
	10	try_get,
4e34889f	11	unescapeHTML,
f1d42a83	12	ExtractorError,
70c5802b	13	)
	14
	15
	16	class RumbleEmbedIE(InfoExtractor):
	17	_VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'
	18	_TESTS = [{
	19	'url': 'https://rumble.com/embed/v5pv5f',
	20	'md5': '36a18a049856720189f30977ccbb2c34',
	21	'info_dict': {
	22	'id': 'v5pv5f',
	23	'ext': 'mp4',
	24	'title': 'WMAR 2 News Latest Headlines \| October 20, 6pm',
	25	'timestamp': 1571611968,
	26	'upload_date': '20191020',
	27	}
4e34889f	28	}, {
	29	'url': 'https://rumble.com/embed/vslb7v',
	30	'md5': '7418035de1a30a178b8af34dc2b6a52b',
	31	'info_dict': {
	32	'id': 'vslb7v',
	33	'ext': 'mp4',
	34	'title': 'Defense Sec. says US Commitment to NATO Defense \'Ironclad\'',
	35	'timestamp': 1645142135,
	36	'upload_date': '20220217',
	37	'channel_url': 'https://rumble.com/c/CyberTechNews',
	38	'channel': 'CTNews',
	39	'thumbnail': 'https://sp.rmbl.ws/s8/6/7/i/9/h/7i9hd.OvCc.jpg',
	40	'duration': 901,
	41	}
70c5802b	42	}, {
	43	'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
	44	'only_matching': True,
	45	}]
	46
62852977	47	@staticmethod
	48	def _extract_urls(webpage):
	49	return [
	50	mobj.group('url')
	51	for mobj in re.finditer(
	52	r'(?:<(?:script\|iframe)[^>]+\bsrc=\|["\']embedUrl["\']\s*:\s*)["\'](?P<url>%s)' % RumbleEmbedIE._VALID_URL,
	53	webpage)]
	54
70c5802b	55	def _real_extract(self, url):
	56	video_id = self._match_id(url)
	57	video = self._download_json(
	58	'https://rumble.com/embedJS/', video_id,
	59	query={'request': 'video', 'v': video_id})
4e34889f	60	title = unescapeHTML(video['title'])
70c5802b	61
	62	formats = []
	63	for height, ua in (video.get('ua') or {}).items():
	64	for i in range(2):
	65	f_url = try_get(ua, lambda x: x[i], compat_str)
	66	if f_url:
	67	ext = determine_ext(f_url)
	68	f = {
	69	'ext': ext,
	70	'format_id': '%s-%sp' % (ext, height),
	71	'height': int_or_none(height),
	72	'url': f_url,
	73	}
	74	bitrate = try_get(ua, lambda x: x[i + 2]['bitrate'])
	75	if bitrate:
	76	f['tbr'] = int_or_none(bitrate)
	77	formats.append(f)
	78	self._sort_formats(formats)
	79
	80	author = video.get('author') or {}
	81
	82	return {
	83	'id': video_id,
	84	'title': title,
	85	'formats': formats,
	86	'thumbnail': video.get('i'),
	87	'timestamp': parse_iso8601(video.get('pubDate')),
	88	'channel': author.get('name'),
	89	'channel_url': author.get('url'),
	90	'duration': int_or_none(video.get('duration')),
	91	}
f1d42a83 AG	92
	93
	94	class RumbleChannelIE(InfoExtractor):
	95	_VALID_URL = r'(?P<url>https?://(?:www\.)?rumble\.com/(?:c\|user)/(?P<id>[^&?#$/]+))'
	96
	97	_TESTS = [{
	98	'url': 'https://rumble.com/c/Styxhexenhammer666',
	99	'playlist_mincount': 1160,
	100	'info_dict': {
	101	'id': 'Styxhexenhammer666',
	102	},
	103	}, {
	104	'url': 'https://rumble.com/user/goldenpoodleharleyeuna',
	105	'playlist_count': 4,
	106	'info_dict': {
	107	'id': 'goldenpoodleharleyeuna',
	108	},
	109	}]
	110
	111	def entries(self, url, playlist_id):
	112	for page in itertools.count(1):
	113	try:
	114	webpage = self._download_webpage(f'{url}?page={page}', playlist_id, note='Downloading page %d' % page)
	115	except ExtractorError as e:
	116	if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
	117	break
	118	raise
	119	for video_url in re.findall(r'class=video-item--a\s?href=([^>]+\.html)', webpage):
	120	yield self.url_result('https://rumble.com' + video_url)
	121
	122	def _real_extract(self, url):
	123	url, playlist_id = self._match_valid_url(url).groups()
	124	return self.playlist_result(self.entries(url, playlist_id), playlist_id=playlist_id)