]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/rumble.py
[youtube] Avoid false positives when detecting damaged formats
[yt-dlp.git] / yt_dlp / extractor / rumble.py
CommitLineData
70c5802b 1# coding: utf-8
2from __future__ import unicode_literals
3
f1d42a83 4import itertools
62852977 5import re
6
70c5802b 7from .common import InfoExtractor
f1d42a83 8from ..compat import compat_str, compat_HTTPError
70c5802b 9from ..utils import (
10 determine_ext,
11 int_or_none,
12 parse_iso8601,
13 try_get,
f1d42a83 14 ExtractorError,
70c5802b 15)
16
17
class RumbleEmbedIE(InfoExtractor):
    """Extractor for ``rumble.com/embed/<id>`` URLs.

    The id is the last ``[0-9a-z]+`` component of the path; an optional
    ``<prefix>.`` in front of it is accepted by the URL pattern and ignored.
    """

    _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'
    _TESTS = [
        {
            'url': 'https://rumble.com/embed/v5pv5f',
            'md5': '36a18a049856720189f30977ccbb2c34',
            'info_dict': {
                'id': 'v5pv5f',
                'ext': 'mp4',
                'title': 'WMAR 2 News Latest Headlines | October 20, 6pm',
                'timestamp': 1571611968,
                'upload_date': '20191020',
            }
        },
        {
            'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
            'only_matching': True,
        },
    ]

    @staticmethod
    def _extract_urls(webpage):
        """Return every embed URL referenced in *webpage*, in document order.

        A URL is picked up when it is the quoted ``src`` of a ``<script>`` or
        ``<iframe>`` tag, or the quoted value of a quoted ``embedUrl`` key.
        """
        embed_re = r'(?:<(?:script|iframe)[^>]+\bsrc=|["\']embedUrl["\']\s*:\s*)["\'](?P<url>%s)' % RumbleEmbedIE._VALID_URL
        found = []
        for match in re.finditer(embed_re, webpage):
            found.append(match.group('url'))
        return found

    def _real_extract(self, url):
        """Fetch the embedJS metadata for *url* and build the info dict.

        Raises KeyError (from the ``video['title']`` lookup) when the
        downloaded JSON carries no ``title``.
        """
        video_id = self._match_id(url)
        video = self._download_json(
            'https://rumble.com/embedJS/', video_id,
            query={'request': 'video', 'v': video_id})
        title = video['title']

        # NOTE(review): 'ua' appears to map a height to an indexable entry in
        # which positions 0 and 1 may hold media URLs and positions 2 and 3
        # the matching metadata dicts (with 'bitrate') -- confirm against a
        # live API response.
        formats = []
        sources_by_height = video.get('ua') or {}
        for height, sources in sources_by_height.items():
            for idx in (0, 1):
                media_url = try_get(sources, lambda x: x[idx], compat_str)
                if not media_url:
                    continue
                ext = determine_ext(media_url)
                fmt = {
                    'ext': ext,
                    'format_id': '%s-%sp' % (ext, height),
                    'height': int_or_none(height),
                    'url': media_url,
                }
                # The metadata for the URL at position idx is read from idx + 2.
                bitrate = try_get(sources, lambda x: x[idx + 2]['bitrate'])
                if bitrate:
                    fmt['tbr'] = int_or_none(bitrate)
                formats.append(fmt)
        self._sort_formats(formats)

        # A missing or empty 'author' degrades to None channel fields.
        author = video.get('author') or {}

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'thumbnail': video.get('i'),
            'timestamp': parse_iso8601(video.get('pubDate')),
            'channel': author.get('name'),
            'channel_url': author.get('url'),
            'duration': int_or_none(video.get('duration')),
        }
f1d42a83
AG
80
81
class RumbleChannelIE(InfoExtractor):
    """Playlist extractor for ``rumble.com/c/<name>`` and ``rumble.com/user/<name>``."""

    _VALID_URL = r'(?P<url>https?://(?:www\.)?rumble\.com/(?:c|user)/(?P<id>[^&?#$/]+))'

    _TESTS = [
        {
            'url': 'https://rumble.com/c/Styxhexenhammer666',
            'playlist_mincount': 1160,
            'info_dict': {
                'id': 'Styxhexenhammer666',
            },
        },
        {
            'url': 'https://rumble.com/user/goldenpoodleharleyeuna',
            'playlist_count': 4,
            'info_dict': {
                'id': 'goldenpoodleharleyeuna',
            },
        },
    ]

    def entries(self, url, playlist_id):
        """Yield a url_result for every video link on each listing page.

        Pages are requested as ``<url>?page=N`` starting from N = 1. The
        generator finishes when a page request fails with HTTP 404; any other
        ExtractorError is re-raised unchanged.
        """
        page = 1
        while True:
            try:
                webpage = self._download_webpage(f'{url}?page={page}', playlist_id, note='Downloading page %d' % page)
            except ExtractorError as e:
                # Only a 404 ends pagination; every other failure propagates.
                if not isinstance(e.cause, compat_HTTPError) or e.cause.code != 404:
                    raise
                return
            for link in re.finditer(r'class=video-item--a\s?href=([^>]+\.html)', webpage):
                # The captured href is joined directly onto the site origin.
                yield self.url_result('https://rumble.com' + link.group(1))
            page += 1

    def _real_extract(self, url):
        """Return a playlist result whose entries are produced by entries()."""
        # The pattern's two capture groups are, in order, the matched channel
        # URL (without query or fragment) and the channel/user name.
        channel_url, playlist_id = self._match_valid_url(url).groups()
        channel_entries = self.entries(channel_url, playlist_id)
        return self.playlist_result(channel_entries, playlist_id=playlist_id)