yt_dlp/extractor/bitchute.py

   1 import functools
   2 import re
   3
   4 from .common import InfoExtractor
   5 from ..networking import HEADRequest
   6 from ..utils import (
   7     ExtractorError,
   8     OnDemandPagedList,
   9     clean_html,
  10     extract_attributes,
  11     get_element_by_class,
  12     get_element_by_id,
  13     get_element_html_by_class,
  14     get_elements_html_by_class,
  15     int_or_none,
  16     orderedSet,
  17     parse_count,
  18     parse_duration,
  19     traverse_obj,
  20     unified_strdate,
  21     urlencode_postdata,
  22     urljoin,
  23 )
  24
  25
  26 class BitChuteIE(InfoExtractor):
  27     _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)'
  28     _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
  29     _TESTS = [{
  30         'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',
  31         'md5': '7e427d7ed7af5a75b5855705ec750e2b',
  32         'info_dict': {
  33             'id': 'UGlrF9o9b-Q',
  34             'ext': 'mp4',
  35             'title': 'This is the first video on #BitChute !',
  36             'description': 'md5:a0337e7b1fe39e32336974af8173a034',
  37             'thumbnail': r're:^https?://.*\.jpg$',
  38             'uploader': 'BitChute',
  39             'upload_date': '20170103',
  40             'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/',
  41             'channel': 'BitChute',
  42             'channel_url': 'https://www.bitchute.com/channel/bitchute/'
  43         },
  44     }, {
  45         # test case: video with different channel and uploader
  46         'url': 'https://www.bitchute.com/video/Yti_j9A-UZ4/',
  47         'md5': 'f10e6a8e787766235946d0868703f1d0',
  48         'info_dict': {
  49             'id': 'Yti_j9A-UZ4',
  50             'ext': 'mp4',
  51             'title': 'Israel at War | Full Measure',
  52             'description': 'md5:38cf7bc6f42da1a877835539111c69ef',
  53             'thumbnail': r're:^https?://.*\.jpg$',
  54             'uploader': 'sharylattkisson',
  55             'upload_date': '20231106',
  56             'uploader_url': 'https://www.bitchute.com/profile/9K0kUWA9zmd9/',
  57             'channel': 'Full Measure with Sharyl Attkisson',
  58             'channel_url': 'https://www.bitchute.com/channel/sharylattkisson/'
  59         },
  60     }, {
  61         # video not downloadable in browser, but we can recover it
  62         'url': 'https://www.bitchute.com/video/2s6B3nZjAk7R/',
  63         'md5': '05c12397d5354bf24494885b08d24ed1',
  64         'info_dict': {
  65             'id': '2s6B3nZjAk7R',
  66             'ext': 'mp4',
  67             'filesize': 71537926,
  68             'title': 'STYXHEXENHAMMER666 - Election Fraud, Clinton 2020, EU Armies, and Gun Control',
  69             'description': 'md5:228ee93bd840a24938f536aeac9cf749',
  70             'thumbnail': r're:^https?://.*\.jpg$',
  71             'uploader': 'BitChute',
  72             'upload_date': '20181113',
  73             'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/',
  74             'channel': 'BitChute',
  75             'channel_url': 'https://www.bitchute.com/channel/bitchute/'
  76         },
  77         'params': {'check_formats': None},
  78     }, {
  79         # restricted video
  80         'url': 'https://www.bitchute.com/video/WEnQU7XGcTdl/',
  81         'info_dict': {
  82             'id': 'WEnQU7XGcTdl',
  83             'ext': 'mp4',
  84             'title': 'Impartial Truth - Ein Letzter Appell an die Vernunft',
  85         },
  86         'params': {'skip_download': True},
  87         'skip': 'Georestricted in DE',
  88     }, {
  89         'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/',
  90         'only_matching': True,
  91     }, {
  92         'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent',
  93         'only_matching': True,
  94     }]
  95     _GEO_BYPASS = False
  96
  97     _HEADERS = {
  98         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36',
  99         'Referer': 'https://www.bitchute.com/',
 100     }
 101
 102     def _check_format(self, video_url, video_id):
 103         urls = orderedSet(
 104             re.sub(r'(^https?://)(seed\d+)(?=\.bitchute\.com)', fr'\g<1>{host}', video_url)
 105             for host in (r'\g<2>', 'seed122', 'seed125', 'seed126', 'seed128',
 106                          'seed132', 'seed150', 'seed151', 'seed152', 'seed153',
 107                          'seed167', 'seed171', 'seed177', 'seed305', 'seed307',
 108                          'seedp29xb', 'zb10-7gsop1v78'))
 109         for url in urls:
 110             try:
 111                 response = self._request_webpage(
 112                     HEADRequest(url), video_id=video_id, note=f'Checking {url}', headers=self._HEADERS)
 113             except ExtractorError as e:
 114                 self.to_screen(f'{video_id}: URL is invalid, skipping: {e.cause}')
 115                 continue
 116             return {
 117                 'url': url,
 118                 'filesize': int_or_none(response.headers.get('Content-Length'))
 119             }
 120
 121     def _raise_if_restricted(self, webpage):
 122         page_title = clean_html(get_element_by_class('page-title', webpage)) or ''
 123         if re.fullmatch(r'(?:Channel|Video) Restricted', page_title):
 124             reason = clean_html(get_element_by_id('page-detail', webpage)) or page_title
 125             self.raise_geo_restricted(reason)
 126
 127     @staticmethod
 128     def _make_url(html):
 129         path = extract_attributes(get_element_html_by_class('spa', html) or '').get('href')
 130         return urljoin('https://www.bitchute.com', path)
 131
 132     def _real_extract(self, url):
 133         video_id = self._match_id(url)
 134         webpage = self._download_webpage(
 135             f'https://www.bitchute.com/video/{video_id}', video_id, headers=self._HEADERS)
 136
 137         self._raise_if_restricted(webpage)
 138         publish_date = clean_html(get_element_by_class('video-publish-date', webpage))
 139         entries = self._parse_html5_media_entries(url, webpage, video_id)
 140
 141         formats = []
 142         for format_ in traverse_obj(entries, (0, 'formats', ...)):
 143             if self.get_param('check_formats') is not False:
 144                 format_.update(self._check_format(format_.pop('url'), video_id) or {})
 145                 if 'url' not in format_:
 146                     continue
 147             formats.append(format_)
 148
 149         if not formats:
 150             self.raise_no_formats(
 151                 'Video is unavailable. Please make sure this video is playable in the browser '
 152                 'before reporting this issue.', expected=True, video_id=video_id)
 153
 154         details = get_element_by_class('details', webpage) or ''
 155         uploader_html = get_element_html_by_class('creator', details) or ''
 156         channel_html = get_element_html_by_class('name', details) or ''
 157
 158         return {
 159             'id': video_id,
 160             'title': self._html_extract_title(webpage) or self._og_search_title(webpage),
 161             'description': self._og_search_description(webpage, default=None),
 162             'thumbnail': self._og_search_thumbnail(webpage),
 163             'uploader': clean_html(uploader_html),
 164             'uploader_url': self._make_url(uploader_html),
 165             'channel': clean_html(channel_html),
 166             'channel_url': self._make_url(channel_html),
 167             'upload_date': unified_strdate(self._search_regex(
 168                 r'at \d+:\d+ UTC on (.+?)\.', publish_date, 'upload date', fatal=False)),
 169             'formats': formats,
 170         }
 171
 172
 173 class BitChuteChannelIE(InfoExtractor):
 174     _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?P<type>channel|playlist)/(?P<id>[^/?#&]+)'
 175     _TESTS = [{
 176         'url': 'https://www.bitchute.com/channel/bitchute/',
 177         'info_dict': {
 178             'id': 'bitchute',
 179             'title': 'BitChute',
 180             'description': 'md5:5329fb3866125afa9446835594a9b138',
 181         },
 182         'playlist': [
 183             {
 184                 'md5': '7e427d7ed7af5a75b5855705ec750e2b',
 185                 'info_dict': {
 186                     'id': 'UGlrF9o9b-Q',
 187                     'ext': 'mp4',
 188                     'title': 'This is the first video on #BitChute !',
 189                     'description': 'md5:a0337e7b1fe39e32336974af8173a034',
 190                     'thumbnail': r're:^https?://.*\.jpg$',
 191                     'uploader': 'BitChute',
 192                     'upload_date': '20170103',
 193                     'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/',
 194                     'channel': 'BitChute',
 195                     'channel_url': 'https://www.bitchute.com/channel/bitchute/',
 196                     'duration': 16,
 197                     'view_count': int,
 198                 },
 199             }
 200         ],
 201         'params': {
 202             'skip_download': True,
 203             'playlist_items': '-1',
 204         },
 205     }, {
 206         'url': 'https://www.bitchute.com/playlist/wV9Imujxasw9/',
 207         'playlist_mincount': 20,
 208         'info_dict': {
 209             'id': 'wV9Imujxasw9',
 210             'title': 'Bruce MacDonald and "The Light of Darkness"',
 211             'description': 'md5:747724ef404eebdfc04277714f81863e',
 212         }
 213     }]
 214
 215     _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7'
 216     PAGE_SIZE = 25
 217     HTML_CLASS_NAMES = {
 218         'channel': {
 219             'container': 'channel-videos-container',
 220             'title': 'channel-videos-title',
 221             'description': 'channel-videos-text',
 222         },
 223         'playlist': {
 224             'container': 'playlist-video',
 225             'title': 'title',
 226             'description': 'description',
 227         }
 228
 229     }
 230
 231     @staticmethod
 232     def _make_url(playlist_id, playlist_type):
 233         return f'https://www.bitchute.com/{playlist_type}/{playlist_id}/'
 234
 235     def _fetch_page(self, playlist_id, playlist_type, page_num):
 236         playlist_url = self._make_url(playlist_id, playlist_type)
 237         data = self._download_json(
 238             f'{playlist_url}extend/', playlist_id, f'Downloading page {page_num}',
 239             data=urlencode_postdata({
 240                 'csrfmiddlewaretoken': self._TOKEN,
 241                 'name': '',
 242                 'offset': page_num * self.PAGE_SIZE,
 243             }), headers={
 244                 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
 245                 'Referer': playlist_url,
 246                 'X-Requested-With': 'XMLHttpRequest',
 247                 'Cookie': f'csrftoken={self._TOKEN}',
 248             })
 249         if not data.get('success'):
 250             return
 251         classes = self.HTML_CLASS_NAMES[playlist_type]
 252         for video_html in get_elements_html_by_class(classes['container'], data.get('html')):
 253             video_id = self._search_regex(
 254                 r'<a\s[^>]*\bhref=["\']/video/([^"\'/]+)', video_html, 'video id', default=None)
 255             if not video_id:
 256                 continue
 257             yield self.url_result(
 258                 f'https://www.bitchute.com/video/{video_id}', BitChuteIE, video_id, url_transparent=True,
 259                 title=clean_html(get_element_by_class(classes['title'], video_html)),
 260                 description=clean_html(get_element_by_class(classes['description'], video_html)),
 261                 duration=parse_duration(get_element_by_class('video-duration', video_html)),
 262                 view_count=parse_count(clean_html(get_element_by_class('video-views', video_html))))
 263
 264     def _real_extract(self, url):
 265         playlist_type, playlist_id = self._match_valid_url(url).group('type', 'id')
 266         webpage = self._download_webpage(self._make_url(playlist_id, playlist_type), playlist_id)
 267
 268         page_func = functools.partial(self._fetch_page, playlist_id, playlist_type)
 269         return self.playlist_result(
 270             OnDemandPagedList(page_func, self.PAGE_SIZE), playlist_id,
 271             title=self._html_extract_title(webpage, default=None),
 272             description=self._html_search_meta(
 273                 ('description', 'og:description', 'twitter:description'), webpage, default=None),
 274             playlist_count=int_or_none(self._html_search_regex(
 275                 r'<span>(\d+)\s+videos?</span>', webpage, 'playlist count', default=None)))