jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/rumble.py
[extractors] Use new framework for existing embeds (#4307)
[yt-dlp.git] / yt_dlp / extractor / rumble.py
CommitLineData
f1d42a83 1import itertools
62852977 2import re
3
70c5802b 4from .common import InfoExtractor
f1d42a83 5from ..compat import compat_str, compat_HTTPError
70c5802b 6from ..utils import (
7 determine_ext,
8 int_or_none,
9 parse_iso8601,
10 try_get,
4e34889f 11 unescapeHTML,
f1d42a83 12 ExtractorError,
70c5802b 13)
14
15
class RumbleEmbedIE(InfoExtractor):
    """Extractor for Rumble embedded-player URLs (``rumble.com/embed/<id>``)."""

    _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'
    # An embed URL appearing as a <script>/<iframe> ``src`` or as an "embedUrl" value.
    _EMBED_REGEX = [fr'(?:<(?:script|iframe)[^>]+\bsrc=|["\']embedUrl["\']\s*:\s*)["\'](?P<url>{_VALID_URL})']
    _TESTS = [{
        'url': 'https://rumble.com/embed/v5pv5f',
        'md5': '36a18a049856720189f30977ccbb2c34',
        'info_dict': {
            'id': 'v5pv5f',
            'ext': 'mp4',
            'title': 'WMAR 2 News Latest Headlines | October 20, 6pm',
            'timestamp': 1571611968,
            'upload_date': '20191020',
            'channel_url': 'https://rumble.com/c/WMAR',
            'channel': 'WMAR',
            'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.OvCc-small-WMAR-2-News-Latest-Headline.jpg',
            'duration': 234,
            'uploader': 'WMAR',
        }
    }, {
        'url': 'https://rumble.com/embed/vslb7v',
        'md5': '7418035de1a30a178b8af34dc2b6a52b',
        'info_dict': {
            'id': 'vslb7v',
            'ext': 'mp4',
            'title': 'Defense Sec. says US Commitment to NATO Defense \'Ironclad\'',
            'timestamp': 1645142135,
            'upload_date': '20220217',
            'channel_url': 'https://rumble.com/c/CyberTechNews',
            'channel': 'CTNews',
            'thumbnail': 'https://sp.rmbl.ws/s8/6/7/i/9/h/7i9hd.OvCc.jpg',
            'duration': 901,
            'uploader': 'CTNews',
        }
    }, {
        'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
        'only_matching': True,
    }]

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Return the Rumble embed URLs found in ``webpage``.

        Matches produced by the parent implementation are returned as a tuple
        when there are any. Otherwise the page is scanned for inline
        ``Rumble("play", {"video": "<id>"})`` script calls and a list of embed
        URLs built from the captured ids is returned.
        """
        parent_matches = tuple(super()._extract_embed_urls(url, webpage))
        if not parent_matches:
            player_call_re = r'<script>\s*Rumble\(\s*"play"\s*,\s*{\s*[\'"]video[\'"]\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]'
            return [
                f'https://rumble.com/embed/{match.group("id")}'
                for match in re.finditer(player_call_re, webpage)]
        return parent_matches

    def _real_extract(self, url):
        """Build the info dict for one embed URL from the ``embedJS`` JSON response."""
        video_id = self._match_id(url)
        video = self._download_json(
            'https://rumble.com/embedJS/', video_id,
            query={'request': 'video', 'v': video_id})
        # 'title' is the only mandatory key; it is read before anything else.
        title = unescapeHTML(video['title'])

        formats = []
        # 'ua' is keyed by height. Within each value, index 0/1 is tried as a URL
        # string and index 2/3 is looked up for a 'bitrate' entry.
        # NOTE(review): the exact layout of these entries is not visible here -
        # confirm against a real embedJS response.
        for height, sources in (video.get('ua') or {}).items():
            for slot in (0, 1):
                media_url = try_get(sources, lambda x: x[slot], compat_str)
                if not media_url:
                    continue
                ext = determine_ext(media_url)
                fmt = {
                    'ext': ext,
                    'format_id': f'{ext}-{height}p',
                    'height': int_or_none(height),
                    'url': media_url,
                }
                bitrate = try_get(sources, lambda x: x[slot + 2]['bitrate'])
                if bitrate:
                    fmt['tbr'] = int_or_none(bitrate)
                formats.append(fmt)
        self._sort_formats(formats)

        # One subtitle track per 'cc' entry that carries a non-empty 'path'.
        subtitles = {}
        for lang, sub_info in (video.get('cc') or {}).items():
            sub_path = sub_info.get('path')
            if not sub_path:
                continue
            subtitles[lang] = [{
                'url': sub_path,
                'name': sub_info.get('language') or '',
            }]

        author = video.get('author') or {}
        # The author name is reported as both the channel and the uploader.
        author_name = author.get('name')

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'thumbnail': video.get('i'),
            'timestamp': parse_iso8601(video.get('pubDate')),
            'channel': author_name,
            'channel_url': author.get('url'),
            'duration': int_or_none(video.get('duration')),
            'uploader': author_name,
        }
f1d42a83
AG
108
109
class RumbleChannelIE(InfoExtractor):
    """Playlist extractor for Rumble channel (``/c/<name>``) and user (``/user/<name>``) pages."""

    _VALID_URL = r'(?P<url>https?://(?:www\.)?rumble\.com/(?:c|user)/(?P<id>[^&?#$/]+))'

    _TESTS = [{
        'url': 'https://rumble.com/c/Styxhexenhammer666',
        'playlist_mincount': 1160,
        'info_dict': {
            'id': 'Styxhexenhammer666',
        },
    }, {
        'url': 'https://rumble.com/user/goldenpoodleharleyeuna',
        'playlist_count': 4,
        'info_dict': {
            'id': 'goldenpoodleharleyeuna',
        },
    }]

    def entries(self, url, playlist_id):
        """Yield a URL result for every video link on successive listing pages.

        Pages are requested as ``<url>?page=N`` starting from 1. Iteration
        ends when a request fails with HTTP 404; any other extractor error
        is re-raised.
        """
        for page_num in itertools.count(1):
            try:
                webpage = self._download_webpage(
                    f'{url}?page={page_num}', playlist_id,
                    note=f'Downloading page {page_num}')
            except ExtractorError as err:
                if not isinstance(err.cause, compat_HTTPError) or err.cause.code != 404:
                    raise
                # A 404 is treated as the end of the listing.
                return
            # The captured href is site-relative, so the host is prepended.
            for link in re.finditer(r'class=video-item--a\s?href=([^>]+\.html)', webpage):
                yield self.url_result(f'https://rumble.com{link.group(1)}')

    def _real_extract(self, url):
        """Return a playlist of the channel's videos, using the URL's name part as the playlist id."""
        mobj = self._match_valid_url(url)
        # The 'url' group is the matched address without query string or fragment.
        base_url = mobj.group('url')
        playlist_id = mobj.group('id')
        return self.playlist_result(self.entries(base_url, playlist_id), playlist_id=playlist_id)