[yt-dlp.git] / yt_dlp / extractor / bitchute.py

import itertools
import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    HEADRequest,
    clean_html,
    get_element_by_class,
    int_or_none,
    orderedSet,
    traverse_obj,
    unified_strdate,
    urlencode_postdata,
)


class BitChuteIE(InfoExtractor):
    """Extractor for single BitChute videos addressed by video, embed or torrent URLs."""

    _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)'
    _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
    _TESTS = [{
        'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',
        'md5': '7e427d7ed7af5a75b5855705ec750e2b',
        'info_dict': {
            'id': 'UGlrF9o9b-Q',
            'ext': 'mp4',
            'title': 'This is the first video on #BitChute !',
            'description': 'md5:a0337e7b1fe39e32336974af8173a034',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'BitChute',
            'upload_date': '20170103',
        },
    }, {
        # video not downloadable in browser, but we can recover it
        'url': 'https://www.bitchute.com/video/2s6B3nZjAk7R/',
        'md5': '05c12397d5354bf24494885b08d24ed1',
        'info_dict': {
            'id': '2s6B3nZjAk7R',
            'ext': 'mp4',
            'filesize': 71537926,
            'title': 'STYXHEXENHAMMER666 - Election Fraud, Clinton 2020, EU Armies, and Gun Control',
            'description': 'md5:228ee93bd840a24938f536aeac9cf749',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'BitChute',
            'upload_date': '20181113',
        },
        'params': {'check_formats': None},
    }, {
        'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/',
        'only_matching': True,
    }, {
        'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent',
        'only_matching': True,
    }]

    # Sent with the webpage download and with every HEAD probe in _check_format
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36',
        'Referer': 'https://www.bitchute.com/',
    }

    def _check_format(self, video_url, video_id):
        """Find the first variant of video_url that answers a HEAD request.

        The ``seedNNN`` host of the URL is swapped for a fixed list of
        alternative seed hosts, with the host already present tried first.

        @param video_url    media URL to probe
        @param video_id     id used in the progress and skip messages
        @returns            dict with the working 'url' and its 'filesize'
                            (taken from Content-Length), or None when every
                            candidate raised ExtractorError
        """
        seed_host_re = r'(^https?://)(seed\d+)(?=\.bitchute\.com)'
        # r'\g<2>' puts the matched host back, i.e. it reproduces the URL as given
        seed_hosts = (r'\g<2>', 'seed150', 'seed151', 'seed152', 'seed153')
        # NOTE(review): orderedSet presumably drops repeated candidates while keeping order
        candidates = orderedSet(
            re.sub(seed_host_re, fr'\g<1>{host}', video_url) for host in seed_hosts)

        for candidate in candidates:
            try:
                response = self._request_webpage(
                    HEADRequest(candidate), video_id=video_id,
                    note=f'Checking {candidate}', headers=self._HEADERS)
            except ExtractorError as err:
                self.to_screen(f'{video_id}: URL is invalid, skipping: {err.cause}')
            else:
                return {
                    'url': candidate,
                    'filesize': int_or_none(response.headers.get('Content-Length')),
                }
        return None

    def _real_extract(self, url):
        """Download the video page and build the info dict for one video.

        Unless the 'check_formats' param is exactly False, each format URL is
        replaced by the first reachable one found by _check_format, and formats
        for which none is found are dropped.
        """
        video_id = self._match_id(url)
        # Always fetch the canonical /video/ page, whatever URL form was matched
        webpage = self._download_webpage(
            f'https://www.bitchute.com/video/{video_id}', video_id, headers=self._HEADERS)

        publish_date = clean_html(get_element_by_class('video-publish-date', webpage))
        entries = self._parse_html5_media_entries(url, webpage, video_id)

        formats = []
        for fmt in traverse_obj(entries, (0, 'formats', ...)):
            if self.get_param('check_formats') is False:
                # Checking explicitly disabled: keep the format untouched
                formats.append(fmt)
                continue
            # The original URL is removed; it only comes back if a probe succeeded
            fmt.update(self._check_format(fmt.pop('url'), video_id) or {})
            if 'url' in fmt:
                formats.append(fmt)

        if not formats:
            self.raise_no_formats(
                'Video is unavailable. Please make sure this video is playable in the browser '
                'before reporting this issue.', expected=True, video_id=video_id)
        self._sort_formats(formats)

        title = self._html_extract_title(webpage) or self._og_search_title(webpage)
        description = self._og_search_description(webpage, default=None)
        thumbnail = self._og_search_thumbnail(webpage)
        uploader = clean_html(get_element_by_class('owner', webpage))
        # The date is the part following "at HH:MM UTC on" up to the next period
        raw_upload_date = self._search_regex(
            r'at \d+:\d+ UTC on (.+?)\.', publish_date, 'upload date', fatal=False)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'upload_date': unified_strdate(raw_upload_date),
            'formats': formats,
        }


class BitChuteChannelIE(InfoExtractor):
    """Playlist extractor listing the videos of a BitChute channel."""

    _VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)'
    _TEST = {
        'url': 'https://www.bitchute.com/channel/victoriaxrave/',
        'playlist_mincount': 185,
        'info_dict': {
            'id': 'victoriaxrave',
        },
    }

    # Sent both as the 'csrfmiddlewaretoken' form field and as the 'csrftoken' cookie
    _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7'

    def _entries(self, channel_id):
        """Yield a url_result for every video of the channel, page by page.

        Each page is requested by POSTing the number of videos seen so far as
        'offset' to the channel's extend/ endpoint. Iteration stops when the
        response has 'success' set to False, carries no 'html', or its html
        contains no video links.
        """
        channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id
        extend_url = '%sextend/' % channel_url
        request_headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer': channel_url,
            'X-Requested-With': 'XMLHttpRequest',
            'Cookie': 'csrftoken=%s' % self._TOKEN,
        }
        # Captures the id from the /video/<id> link inside each thumbnail container
        video_link_re = r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)'

        videos_seen = 0
        for page_num in itertools.count(1):
            form = urlencode_postdata({
                'csrfmiddlewaretoken': self._TOKEN,
                'name': '',
                'offset': videos_seen,
            })
            data = self._download_json(
                extend_url, channel_id,
                'Downloading channel page %d' % page_num,
                data=form, headers=request_headers)

            html = data.get('html')
            if data.get('success') is False or not html:
                return
            video_ids = re.findall(video_link_re, html)
            if not video_ids:
                return

            videos_seen += len(video_ids)
            yield from (
                self.url_result(
                    'https://www.bitchute.com/video/%s' % video_id,
                    ie=BitChuteIE.ie_key(), video_id=video_id)
                for video_id in video_ids)

    def _real_extract(self, url):
        """Return a playlist whose id is the channel id and whose entries come from _entries."""
        channel_id = self._match_id(url)
        channel_videos = self._entries(channel_id)
        return self.playlist_result(channel_videos, playlist_id=channel_id)
Commit	Line	Data
b65e3b06 S	1	import itertools
	2	import re
	3
	4	from .common import InfoExtractor
6b688b89	5	from ..utils import (
37fb591c	6	ExtractorError,
f72218c1	7	HEADRequest,
	8	clean_html,
	9	get_element_by_class,
	10	int_or_none,
6b688b89	11	orderedSet,
f72218c1	12	traverse_obj,
6ddd4bf6	13	unified_strdate,
6b688b89 S	14	urlencode_postdata,
6b688b89 S	15	)
b65e3b06 S	16
	17
	18	class BitChuteIE(InfoExtractor):
	19	_VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video\|embed\|torrent/[^/]+)/(?P<id>[^/?#&]+)'
bfd973ec	20	_EMBED_REGEX = [rf'<(?:script\|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
b65e3b06	21	_TESTS = [{
aca5774e	22	'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',
aca5774e	23	'md5': '7e427d7ed7af5a75b5855705ec750e2b',
b65e3b06	24	'info_dict': {
f72218c1	25	'id': 'UGlrF9o9b-Q',
b65e3b06	26	'ext': 'mp4',
aca5774e	27	'title': 'This is the first video on #BitChute !',
aca5774e	28	'description': 'md5:a0337e7b1fe39e32336974af8173a034',
b65e3b06	29	'thumbnail': r're:^https?://.*\.jpg$',
aca5774e	30	'uploader': 'BitChute',
aca5774e	31	'upload_date': '20170103',
b65e3b06	32	},
f72218c1	33	}, {
	34	# video not downloadable in browser, but we can recover it
	35	'url': 'https://www.bitchute.com/video/2s6B3nZjAk7R/',
	36	'md5': '05c12397d5354bf24494885b08d24ed1',
	37	'info_dict': {
	38	'id': '2s6B3nZjAk7R',
	39	'ext': 'mp4',
	40	'filesize': 71537926,
	41	'title': 'STYXHEXENHAMMER666 - Election Fraud, Clinton 2020, EU Armies, and Gun Control',
	42	'description': 'md5:228ee93bd840a24938f536aeac9cf749',
	43	'thumbnail': r're:^https?://.*\.jpg$',
	44	'uploader': 'BitChute',
	45	'upload_date': '20181113',
	46	},
	47	'params': {'check_formats': None},
b65e3b06 S	48	}, {
	49	'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/',
	50	'only_matching': True,
	51	}, {
	52	'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent',
	53	'only_matching': True,
	54	}]
	55
f72218c1	56	_HEADERS = {
	57	'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36',
	58	'Referer': 'https://www.bitchute.com/',
	59	}
	60
	61	def _check_format(self, video_url, video_id):
	62	urls = orderedSet(
	63	re.sub(r'(^https?://)(seed\d+)(?=\.bitchute\.com)', fr'\g<1>{host}', video_url)
	64	for host in (r'\g<2>', 'seed150', 'seed151', 'seed152', 'seed153'))
	65	for url in urls:
	66	try:
	67	response = self._request_webpage(
	68	HEADRequest(url), video_id=video_id, note=f'Checking {url}', headers=self._HEADERS)
	69	except ExtractorError as e:
	70	self.to_screen(f'{video_id}: URL is invalid, skipping: {e.cause}')
	71	continue
	72	return {
	73	'url': url,
	74	'filesize': int_or_none(response.headers.get('Content-Length'))
	75	}
	76
b65e3b06 S	77	def _real_extract(self, url):
b65e3b06 S	78	video_id = self._match_id(url)
b65e3b06	79	webpage = self._download_webpage(
f72218c1	80	f'https://www.bitchute.com/video/{video_id}', video_id, headers=self._HEADERS)
b65e3b06	81
f72218c1	82	publish_date = clean_html(get_element_by_class('video-publish-date', webpage))
f72218c1	83	entries = self._parse_html5_media_entries(url, webpage, video_id)
b65e3b06	84
f72218c1	85	formats = []
	86	for format_ in traverse_obj(entries, (0, 'formats', ...)):
	87	if self.get_param('check_formats') is not False:
	88	format_.update(self._check_format(format_.pop('url'), video_id) or {})
	89	if 'url' not in format_:
	90	continue
	91	formats.append(format_)
4c78c3d7 S	92
4c78c3d7 S	93	if not formats:
f72218c1	94	self.raise_no_formats(
	95	'Video is unavailable. Please make sure this video is playable in the browser '
	96	'before reporting this issue.', expected=True, video_id=video_id)
b65e3b06 S	97	self._sort_formats(formats)
b65e3b06 S	98
b65e3b06 S	99	return {
b65e3b06 S	100	'id': video_id,
f72218c1	101	'title': self._html_extract_title(webpage) or self._og_search_title(webpage),
	102	'description': self._og_search_description(webpage, default=None),
	103	'thumbnail': self._og_search_thumbnail(webpage),
	104	'uploader': clean_html(get_element_by_class('owner', webpage)),
	105	'upload_date': unified_strdate(self._search_regex(
	106	r'at \d+:\d+ UTC on (.+?)\.', publish_date, 'upload date', fatal=False)),
b65e3b06 S	107	'formats': formats,
	108	}
	109
	110
	111	class BitChuteChannelIE(InfoExtractor):
	112	_VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)'
	113	_TEST = {
	114	'url': 'https://www.bitchute.com/channel/victoriaxrave/',
	115	'playlist_mincount': 185,
	116	'info_dict': {
	117	'id': 'victoriaxrave',
	118	},
	119	}
	120
	121	_TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7'
	122
	123	def _entries(self, channel_id):
	124	channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id
de4c41b4 S	125	offset = 0
de4c41b4 S	126	for page_num in itertools.count(1):
b65e3b06 S	127	data = self._download_json(
b65e3b06 S	128	'%sextend/' % channel_url, channel_id,
de4c41b4	129	'Downloading channel page %d' % page_num,
b65e3b06 S	130	data=urlencode_postdata({
	131	'csrfmiddlewaretoken': self._TOKEN,
	132	'name': '',
de4c41b4	133	'offset': offset,
b65e3b06 S	134	}), headers={
	135	'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
	136	'Referer': channel_url,
	137	'X-Requested-With': 'XMLHttpRequest',
	138	'Cookie': 'csrftoken=%s' % self._TOKEN,
	139	})
	140	if data.get('success') is False:
	141	break
	142	html = data.get('html')
	143	if not html:
	144	break
	145	video_ids = re.findall(
	146	r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)',
	147	html)
	148	if not video_ids:
	149	break
de4c41b4	150	offset += len(video_ids)
b65e3b06 S	151	for video_id in video_ids:
	152	yield self.url_result(
	153	'https://www.bitchute.com/video/%s' % video_id,
	154	ie=BitChuteIE.ie_key(), video_id=video_id)
	155
	156	def _real_extract(self, url):
	157	channel_id = self._match_id(url)
	158	return self.playlist_result(
	159	self._entries(channel_id), playlist_id=channel_id)