[yt-dlp.git] / yt_dlp / extractor / bitchute.py

# coding: utf-8
from __future__ import unicode_literals

import itertools
import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    GeoRestrictedError,
    orderedSet,
    unified_strdate,
    urlencode_postdata,
)


class BitChuteIE(InfoExtractor):
    """Extractor for a single BitChute video (video, embed and torrent URLs)."""

    _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',
        'md5': '7e427d7ed7af5a75b5855705ec750e2b',
        'info_dict': {
            # _real_extract() reports the id matched from the URL, so the
            # expected id must be the one that appears in 'url' above.
            'id': 'UGlrF9o9b-Q',
            'ext': 'mp4',
            'title': 'This is the first video on #BitChute !',
            'description': 'md5:a0337e7b1fe39e32336974af8173a034',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'BitChute',
            'upload_date': '20170103',
        },
    }, {
        'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/',
        'only_matching': True,
    }, {
        # NOTE(review): the id group matches up to the next '/', '?', '#' or
        # '&', so for this URL it includes the '.webtorrent' suffix - confirm
        # that the /video/<id> page requested below resolves for such ids.
        'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return every BitChute URL used as the src of a <script> or <iframe> tag.

        The URLs are returned in document order; duplicates are not removed.
        """
        return [
            mobj.group('url')
            for mobj in re.finditer(
                r'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>%s)' % BitChuteIE._VALID_URL,
                webpage)]

    def _real_extract(self, url):
        """Download the video page for *url* and build its info dict.

        Returns a dict with the keys 'id', 'title', 'description',
        'thumbnail', 'uploader', 'upload_date' and 'formats'.

        Raises GeoRestrictedError when no media is found and the page title
        heading reads 'Video Unavailable'; raises ExtractorError with the
        heading text (or 'Cannot find video') when no media is found for any
        other reason.
        """
        video_id = self._match_id(url)

        # Whatever URL flavour was matched (video/embed/torrent), the
        # canonical /video/ page is the one that gets scraped.
        webpage = self._download_webpage(
            'https://www.bitchute.com/video/%s' % video_id, video_id, headers={
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36',
            })

        # Title fallbacks, in order: the #video-title element or <title>,
        # then the description meta tag, then the OpenGraph description.
        title = self._html_search_regex(
            (r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'),
            webpage, 'title', default=None) or self._html_search_meta(
            'description', webpage, 'title',
            default=None) or self._og_search_description(webpage)

        # Candidate media URLs: addWebSeed('...') arguments first, followed by
        # the values of any 'as=' parameters found in the page.
        format_urls = [
            mobj.group('url')
            for mobj in re.finditer(
                r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage)]
        format_urls.extend(re.findall(r'as=(https?://[^&"\']+)', webpage))

        # orderedSet presumably drops repeated URLs while keeping first-seen
        # order - verify against ..utils.
        formats = [
            {'url': format_url}
            for format_url in orderedSet(format_urls)]

        if not formats:
            # Nothing found by scraping: fall back to HTML5 media tags.
            entries = self._parse_html5_media_entries(
                url, webpage, video_id)
            if not entries:
                # Report the page's own heading as the error when present.
                error = self._html_search_regex(r'<h1 class="page-title">([^<]+)</h1>', webpage, 'error', default='Cannot find video')
                if error == 'Video Unavailable':
                    raise GeoRestrictedError(error)
                raise ExtractorError(error)
            formats = entries[0]['formats']

        self._check_formats(formats, video_id)
        self._sort_formats(formats)

        # The remaining fields are optional: a failed lookup is non-fatal.
        description = self._html_search_regex(
            r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>',
            webpage, 'description', fatal=False)
        thumbnail = self._og_search_thumbnail(
            webpage, default=None) or self._html_search_meta(
            'twitter:image:src', webpage, 'thumbnail')
        uploader = self._html_search_regex(
            (r'(?s)<div class=["\']channel-banner.*?<p\b[^>]+\bclass=["\']name[^>]+>(.+?)</p>',
             r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>'),
            webpage, 'uploader', fatal=False)

        # Only the date that follows "... at HH:MM UTC on " is captured.
        upload_date = unified_strdate(self._search_regex(
            r'class=["\']video-publish-date[^>]+>[^<]+ at \d+:\d+ UTC on (.+?)\.',
            webpage, 'upload date', fatal=False))

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'upload_date': upload_date,
            'formats': formats,
        }


class BitChuteChannelIE(InfoExtractor):
    """Playlist extractor that lists the videos of a BitChute channel."""

    _VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)'
    _TEST = {
        'url': 'https://www.bitchute.com/channel/victoriaxrave/',
        'playlist_mincount': 185,
        'info_dict': {
            'id': 'victoriaxrave',
        },
    }

    # Sent both as the 'csrfmiddlewaretoken' form field and as the
    # 'csrftoken' cookie of every paging request.
    _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7'

    def _entries(self, channel_id):
        """Yield a url_result for each video of the channel, page by page.

        Each page is requested from the channel's 'extend/' endpoint with the
        number of videos seen so far as the offset. Paging stops when the
        response reports success == False, carries no HTML, or the HTML
        contains no video links.
        """
        channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id
        seen = 0
        page_num = 0
        while True:
            page_num += 1
            form = urlencode_postdata({
                'csrfmiddlewaretoken': self._TOKEN,
                'name': '',
                'offset': seen,
            })
            request_headers = {
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Referer': channel_url,
                'X-Requested-With': 'XMLHttpRequest',
                'Cookie': 'csrftoken=%s' % self._TOKEN,
            }
            response = self._download_json(
                '%sextend/' % channel_url, channel_id,
                'Downloading channel page %d' % page_num,
                data=form, headers=request_headers)

            # Only an explicit False ends paging; a missing key does not.
            if response.get('success') is False:
                return
            markup = response.get('html')
            if not markup:
                return

            page_ids = re.findall(
                r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)',
                markup)
            if not page_ids:
                return

            # Advance the offset before yielding, so the next request skips
            # everything found on this page.
            seen += len(page_ids)
            for found_id in page_ids:
                yield self.url_result(
                    'https://www.bitchute.com/video/%s' % found_id,
                    ie=BitChuteIE.ie_key(), video_id=found_id)

    def _real_extract(self, url):
        """Return a playlist result, keyed by channel id, for the channel at *url*."""
        channel_id = self._match_id(url)
        channel_videos = self._entries(channel_id)
        return self.playlist_result(channel_videos, playlist_id=channel_id)
Commit	Line	Data
b65e3b06 S	1	# coding: utf-8
	2	from __future__ import unicode_literals
	3
	4	import itertools
	5	import re
	6
	7	from .common import InfoExtractor
6b688b89	8	from ..utils import (
37fb591c AH	9	ExtractorError,
37fb591c AH	10	GeoRestrictedError,
6b688b89	11	orderedSet,
6ddd4bf6	12	unified_strdate,
6b688b89 S	13	urlencode_postdata,
6b688b89 S	14	)
b65e3b06 S	15
	16
	17	class BitChuteIE(InfoExtractor):
	18	_VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video\|embed\|torrent/[^/]+)/(?P<id>[^/?#&]+)'
	19	_TESTS = [{
aca5774e	20	'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',
aca5774e	21	'md5': '7e427d7ed7af5a75b5855705ec750e2b',
b65e3b06 S	22	'info_dict': {
	23	'id': 'szoMrox2JEI',
	24	'ext': 'mp4',
aca5774e	25	'title': 'This is the first video on #BitChute !',
aca5774e	26	'description': 'md5:a0337e7b1fe39e32336974af8173a034',
b65e3b06	27	'thumbnail': r're:^https?://.*\.jpg$',
aca5774e	28	'uploader': 'BitChute',
aca5774e	29	'upload_date': '20170103',
b65e3b06 S	30	},
	31	}, {
	32	'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/',
	33	'only_matching': True,
	34	}, {
	35	'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent',
	36	'only_matching': True,
	37	}]
	38
097f1663	39	@staticmethod
	40	def _extract_urls(webpage):
	41	return [
	42	mobj.group('url')
	43	for mobj in re.finditer(
	44	r'<(?:script\|iframe)[^>]+\bsrc=(["\'])(?P<url>%s)' % BitChuteIE._VALID_URL,
	45	webpage)]
	46
b65e3b06 S	47	def _real_extract(self, url):
	48	video_id = self._match_id(url)
	49
	50	webpage = self._download_webpage(
02df4135 AU	51	'https://www.bitchute.com/video/%s' % video_id, video_id, headers={
	52	'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36',
	53	})
b65e3b06	54
8578ea4d	55	title = self._html_search_regex(
b65e3b06 S	56	(r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'),
	57	webpage, 'title', default=None) or self._html_search_meta(
	58	'description', webpage, 'title',
	59	default=None) or self._og_search_description(webpage)
	60
6b688b89 S	61	format_urls = []
	62	for mobj in re.finditer(
	63	r'addWebSeed\s\(\s(["\'])(?P<url>(?:(?!\1).)+)\1', webpage):
	64	format_urls.append(mobj.group('url'))
	65	format_urls.extend(re.findall(r'as=(https?://[^&"\']+)', webpage))
	66
b65e3b06	67	formats = [
6b688b89 S	68	{'url': format_url}
6b688b89 S	69	for format_url in orderedSet(format_urls)]
4c78c3d7 S	70
4c78c3d7 S	71	if not formats:
37fb591c AH	72	entries = self._parse_html5_media_entries(
	73	url, webpage, video_id)
	74	if not entries:
	75	error = self._html_search_regex(r'<h1 class="page-title">([^<]+)</h1>', webpage, 'error', default='Cannot find video')
	76	if error == 'Video Unavailable':
	77	raise GeoRestrictedError(error)
	78	raise ExtractorError(error)
	79	formats = entries[0]['formats']
4c78c3d7	80
d65f6e73	81	self._check_formats(formats, video_id)
b65e3b06 S	82	self._sort_formats(formats)
	83
	84	description = self._html_search_regex(
	85	r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>',
	86	webpage, 'description', fatal=False)
	87	thumbnail = self._og_search_thumbnail(
	88	webpage, default=None) or self._html_search_meta(
	89	'twitter:image:src', webpage, 'thumbnail')
	90	uploader = self._html_search_regex(
bbf1defe GS	91	(r'(?s)<div class=["\']channel-banner.*?<p\b[^>]+\bclass=["\']name[^>]+>(.+?)</p>',
	92	r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>'),
	93	webpage, 'uploader', fatal=False)
b65e3b06	94
6ddd4bf6 I	95	upload_date = unified_strdate(self._search_regex(
	96	r'class=["\']video-publish-date[^>]+>[^<]+ at \d+:\d+ UTC on (.+?)\.',
	97	webpage, 'upload date', fatal=False))
	98
b65e3b06 S	99	return {
	100	'id': video_id,
	101	'title': title,
	102	'description': description,
	103	'thumbnail': thumbnail,
	104	'uploader': uploader,
6ddd4bf6	105	'upload_date': upload_date,
b65e3b06 S	106	'formats': formats,
	107	}
	108
	109
	110	class BitChuteChannelIE(InfoExtractor):
	111	_VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)'
	112	_TEST = {
	113	'url': 'https://www.bitchute.com/channel/victoriaxrave/',
	114	'playlist_mincount': 185,
	115	'info_dict': {
	116	'id': 'victoriaxrave',
	117	},
	118	}
	119
	120	_TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7'
	121
	122	def _entries(self, channel_id):
	123	channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id
de4c41b4 S	124	offset = 0
de4c41b4 S	125	for page_num in itertools.count(1):
b65e3b06 S	126	data = self._download_json(
b65e3b06 S	127	'%sextend/' % channel_url, channel_id,
de4c41b4	128	'Downloading channel page %d' % page_num,
b65e3b06 S	129	data=urlencode_postdata({
	130	'csrfmiddlewaretoken': self._TOKEN,
	131	'name': '',
de4c41b4	132	'offset': offset,
b65e3b06 S	133	}), headers={
	134	'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
	135	'Referer': channel_url,
	136	'X-Requested-With': 'XMLHttpRequest',
	137	'Cookie': 'csrftoken=%s' % self._TOKEN,
	138	})
	139	if data.get('success') is False:
	140	break
	141	html = data.get('html')
	142	if not html:
	143	break
	144	video_ids = re.findall(
	145	r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)',
	146	html)
	147	if not video_ids:
	148	break
de4c41b4	149	offset += len(video_ids)
b65e3b06 S	150	for video_id in video_ids:
	151	yield self.url_result(
	152	'https://www.bitchute.com/video/%s' % video_id,
	153	ie=BitChuteIE.ie_key(), video_id=video_id)
	154
	155	def _real_extract(self, url):
	156	channel_id = self._match_id(url)
	157	return self.playlist_result(
	158	self._entries(channel_id), playlist_id=channel_id)