[yt-dlp.git] / yt_dlp / extractor / rumble.py

# coding: utf-8
from __future__ import unicode_literals

import itertools
import re

from .common import InfoExtractor
from ..compat import compat_str, compat_HTTPError
from ..utils import (
    determine_ext,
    int_or_none,
    parse_iso8601,
    try_get,
    ExtractorError,
)


class RumbleEmbedIE(InfoExtractor):
    """Extractor for Rumble embed-player URLs (``rumble.com/embed/<id>``).

    The URL may carry an optional ``<prefix>.`` segment before the video id;
    only the trailing segment is captured as the id.
    """

    _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'
    _TESTS = [{
        'url': 'https://rumble.com/embed/v5pv5f',
        'md5': '36a18a049856720189f30977ccbb2c34',
        'info_dict': {
            'id': 'v5pv5f',
            'ext': 'mp4',
            'title': 'WMAR 2 News Latest Headlines | October 20, 6pm',
            'timestamp': 1571611968,
            'upload_date': '20191020',
        }
    }, {
        'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the list of Rumble embed URLs found in *webpage*.

        A URL is picked up when it is the quoted ``src`` attribute of a
        ``<script>``/``<iframe>`` tag or the quoted value of an ``embedUrl``
        key, and it matches ``RumbleEmbedIE._VALID_URL``.
        """
        embed_pattern = (
            r'(?:<(?:script|iframe)[^>]+\bsrc=|["\']embedUrl["\']\s*:\s*)["\'](?P<url>%s)'
            % RumbleEmbedIE._VALID_URL)
        found = []
        for match in re.finditer(embed_pattern, webpage):
            found.append(match.group('url'))
        return found

    def _real_extract(self, url):
        """Build the info dict for one embedded video.

        The metadata comes from the ``embedJS`` JSON endpoint, queried with
        the video id taken from *url*.  The ``title`` key is read
        unconditionally, so a response without it raises ``KeyError``.
        """
        video_id = self._match_id(url)
        video = self._download_json(
            'https://rumble.com/embedJS/', video_id,
            query={'request': 'video', 'v': video_id})
        title = video['title']

        formats = []
        # 'ua' maps a height label to a positionally indexed entry: slots 0
        # and 1 are probed for URL strings, and slot + 2 is probed for a
        # mapping that may carry the matching 'bitrate'.
        for height, ua in (video.get('ua') or {}).items():
            for slot in (0, 1):
                media_url = try_get(ua, lambda x: x[slot], compat_str)
                if not media_url:
                    continue
                ext = determine_ext(media_url)
                fmt = {
                    'ext': ext,
                    'format_id': '%s-%sp' % (ext, height),
                    'height': int_or_none(height),
                    'url': media_url,
                }
                bitrate = try_get(ua, lambda x: x[slot + 2]['bitrate'])
                if bitrate:
                    fmt['tbr'] = int_or_none(bitrate)
                formats.append(fmt)
        self._sort_formats(formats)

        author = video.get('author') or {}

        # Optional fields; each one is simply None when the key is absent.
        thumbnail = video.get('i')
        timestamp = parse_iso8601(video.get('pubDate'))
        channel = author.get('name')
        channel_url = author.get('url')
        duration = int_or_none(video.get('duration'))

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'channel': channel,
            'channel_url': channel_url,
            'duration': duration,
        }


class RumbleChannelIE(InfoExtractor):
    """Playlist extractor for Rumble channel (``/c/<name>``) and user
    (``/user/<name>``) pages."""

    _VALID_URL = r'(?P<url>https?://(?:www\.)?rumble\.com/(?:c|user)/(?P<id>[^&?#$/]+))'

    _TESTS = [{
        'url': 'https://rumble.com/c/Styxhexenhammer666',
        'playlist_mincount': 1160,
        'info_dict': {
            'id': 'Styxhexenhammer666',
        },
    }, {
        'url': 'https://rumble.com/user/goldenpoodleharleyeuna',
        'playlist_count': 4,
        'info_dict': {
            'id': 'goldenpoodleharleyeuna',
        },
    }]

    def entries(self, url, playlist_id):
        """Yield a URL result for every video link on each listing page.

        Pages are requested as ``<url>?page=N`` starting from N = 1.  The
        generator finishes when a page request fails with HTTP 404; any other
        ``ExtractorError`` is re-raised to the caller.
        """
        page_num = 0
        while True:
            page_num += 1
            try:
                webpage = self._download_webpage(f'{url}?page={page_num}', playlist_id, note='Downloading page %d' % page_num)
            except ExtractorError as err:
                cause = err.cause
                if not (isinstance(cause, compat_HTTPError) and cause.code == 404):
                    raise
                # A 404 is the only stop condition for pagination.
                return
            # The captured hrefs are prefixed with the site origin below.
            for link in re.finditer(r'class=video-item--a\s?href=([^>]+\.html)', webpage):
                yield self.url_result('https://rumble.com' + link.group(1))

    def _real_extract(self, url):
        """Return a playlist result whose entries come from ``entries()``.

        The ``url`` group of ``_VALID_URL`` (the address up to and including
        the channel/user name) is used as the base for pagination, and the
        ``id`` group becomes the playlist id.
        """
        mobj = self._match_valid_url(url)
        base_url = mobj.group('url')
        playlist_id = mobj.group('id')
        return self.playlist_result(self.entries(base_url, playlist_id), playlist_id=playlist_id)
Commit	Line	Data
70c5802b	1	# coding: utf-8
	2	from __future__ import unicode_literals
	3
f1d42a83	4	import itertools
62852977	5	import re
62852977	6
70c5802b	7	from .common import InfoExtractor
f1d42a83	8	from ..compat import compat_str, compat_HTTPError
70c5802b	9	from ..utils import (
	10	determine_ext,
	11	int_or_none,
	12	parse_iso8601,
	13	try_get,
f1d42a83	14	ExtractorError,
70c5802b	15	)
	16
	17
	18	class RumbleEmbedIE(InfoExtractor):
	19	_VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'
	20	_TESTS = [{
	21	'url': 'https://rumble.com/embed/v5pv5f',
	22	'md5': '36a18a049856720189f30977ccbb2c34',
	23	'info_dict': {
	24	'id': 'v5pv5f',
	25	'ext': 'mp4',
	26	'title': 'WMAR 2 News Latest Headlines \| October 20, 6pm',
	27	'timestamp': 1571611968,
	28	'upload_date': '20191020',
	29	}
	30	}, {
	31	'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
	32	'only_matching': True,
	33	}]
	34
62852977	35	@staticmethod
	36	def _extract_urls(webpage):
	37	return [
	38	mobj.group('url')
	39	for mobj in re.finditer(
	40	r'(?:<(?:script\|iframe)[^>]+\bsrc=\|["\']embedUrl["\']\s:\s)["\'](?P<url>%s)' % RumbleEmbedIE._VALID_URL,
	41	webpage)]
	42
70c5802b	43	def _real_extract(self, url):
	44	video_id = self._match_id(url)
	45	video = self._download_json(
	46	'https://rumble.com/embedJS/', video_id,
	47	query={'request': 'video', 'v': video_id})
	48	title = video['title']
	49
	50	formats = []
	51	for height, ua in (video.get('ua') or {}).items():
	52	for i in range(2):
	53	f_url = try_get(ua, lambda x: x[i], compat_str)
	54	if f_url:
	55	ext = determine_ext(f_url)
	56	f = {
	57	'ext': ext,
	58	'format_id': '%s-%sp' % (ext, height),
	59	'height': int_or_none(height),
	60	'url': f_url,
	61	}
	62	bitrate = try_get(ua, lambda x: x[i + 2]['bitrate'])
	63	if bitrate:
	64	f['tbr'] = int_or_none(bitrate)
	65	formats.append(f)
	66	self._sort_formats(formats)
	67
	68	author = video.get('author') or {}
	69
	70	return {
	71	'id': video_id,
	72	'title': title,
	73	'formats': formats,
	74	'thumbnail': video.get('i'),
	75	'timestamp': parse_iso8601(video.get('pubDate')),
	76	'channel': author.get('name'),
	77	'channel_url': author.get('url'),
	78	'duration': int_or_none(video.get('duration')),
	79	}
f1d42a83 AG	80
	81
	82	class RumbleChannelIE(InfoExtractor):
	83	_VALID_URL = r'(?P<url>https?://(?:www\.)?rumble\.com/(?:c\|user)/(?P<id>[^&?#$/]+))'
	84
	85	_TESTS = [{
	86	'url': 'https://rumble.com/c/Styxhexenhammer666',
	87	'playlist_mincount': 1160,
	88	'info_dict': {
	89	'id': 'Styxhexenhammer666',
	90	},
	91	}, {
	92	'url': 'https://rumble.com/user/goldenpoodleharleyeuna',
	93	'playlist_count': 4,
	94	'info_dict': {
	95	'id': 'goldenpoodleharleyeuna',
	96	},
	97	}]
	98
	99	def entries(self, url, playlist_id):
	100	for page in itertools.count(1):
	101	try:
	102	webpage = self._download_webpage(f'{url}?page={page}', playlist_id, note='Downloading page %d' % page)
	103	except ExtractorError as e:
	104	if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
	105	break
	106	raise
	107	for video_url in re.findall(r'class=video-item--a\s?href=([^>]+\.html)', webpage):
	108	yield self.url_result('https://rumble.com' + video_url)
	109
	110	def _real_extract(self, url):
	111	url, playlist_id = self._match_valid_url(url).groups()
	112	return self.playlist_result(self.entries(url, playlist_id), playlist_id=playlist_id)