# yt_dlp/extractor/tokentube.py
import functools
import re

from .common import InfoExtractor
from ..utils import (
    clean_html,
    get_element_by_class,
    parse_count,
    remove_end,
    unified_strdate,
    js_to_json,
    OnDemandPagedList,
)


class TokentubeIE(InfoExtractor):
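    # Matches /v/<id>/<slug> and /l/<id>/<slug> paths as well as the view?v= / view?l= query forms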
    _VALID_URL = r'https?://(?:www\.)?tokentube\.net/(?:view\?[vl]=|[vl]/)(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://tokentube.net/l/3236632011/Praise-A-Thon-Pastori-Chrisin-ja-Pastori-Bennyn-kanssa-27-8-2021',
        'info_dict': {
            'id': '3236632011',
            'ext': 'mp4',
            'title': 'Praise-A-Thon Pastori Chrisin ja Pastori Bennyn kanssa 27.8.2021',
            'description': '',
            'uploader': 'Pastori Chris - Rapsodia.fi',
            'upload_date': '20210827',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://tokentube.net/v/3950239124/Linux-Ubuntu-Studio-perus-k%C3%A4ytt%C3%B6',
        'md5': '0e1f00421f501f5eada9890d38fcfb56',
        'info_dict': {
            'id': '3950239124',
            'ext': 'mp4',
            'title': 'Linux Ubuntu Studio perus käyttö',
            'description': 'md5:46077d0daaba1974f2dc381257f9d64c',
            'uploader': 'jyrilehtonen',
            'upload_date': '20210825',
        },
    }, {
        'url': 'https://tokentube.net/view?v=3582463289',
        'info_dict': {
            'id': '3582463289',
            'ext': 'mp4',
            'title': 'Police for Freedom - toiminta aloitetaan Suomessa ❤️??',
            'description': 'md5:37ebf1cb44264e0bf23ed98b337ee63e',
            'uploader': 'Voitontie',
            'upload_date': '20210428',
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<h1\s*class=["\']title-text["\']>(.+?)</h1>', webpage, 'title')

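        # The page embeds the player settings as a JS object (keyed on 'html5');
        # grab the blob and parse it leniently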
        data_json = self._html_search_regex(r'({["\']html5["\'].+?}}}+)', webpage, 'data json')
        data_json = self._parse_json(js_to_json(data_json), video_id, fatal=False)

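        # Prefer 'sources' from the parsed settings; otherwise fall back to the arguments
        # of the page's updateSrc(...) call (the non-fatal parse above may return None)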
        sources = (data_json or {}).get('sources') or self._parse_json(
            self._html_search_regex(r'updateSrc\(([^\)]+)\)', webpage, 'sources'),
            video_id, transform_source=js_to_json)

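        # Map each source entry to a yt-dlp format dict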
        formats = [{
            'url': fmt.get('src'),
            'format_id': fmt.get('label'),
            'height': fmt.get('res'),
        } for fmt in sources]

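        # View/like/dislike counts, upload date and uploader are scraped from the rendered HTML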
        view_count = parse_count(self._html_search_regex(
            r'<p\s*class=["\']views_counter["\']>\s*([\d\.,]+)\s*<span>views?</span></p>',
            webpage, 'view_count', fatal=False))

        like_count = parse_count(self._html_search_regex(
            r'<div\s*class="sh_button\s*likes_count">\s*(\d+)\s*</div>',
            webpage, 'like count', fatal=False))

        dislike_count = parse_count(self._html_search_regex(
            r'<div\s*class="sh_button\s*dislikes_count">\s*(\d+)\s*</div>',
            webpage, 'dislike count', fatal=False))

        upload_date = unified_strdate(self._html_search_regex(
            r'<span\s*class="p-date">Published\s*on\s+([^<]+)',
            webpage, 'upload date', fatal=False))

        uploader = self._html_search_regex(
            r'<a\s*class="place-left"[^>]+>(.+?)</a>',
            webpage, 'uploader', fatal=False)

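        # Description lives in the 'p-d-txt' element; fall back to meta tags
        # and strip the trailing 'Category' label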
        description = (clean_html(get_element_by_class('p-d-txt', webpage))
                       or self._html_search_meta(('og:description', 'description', 'twitter:description'), webpage))

        description = remove_end(description, 'Category')

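        # Let yt-dlp order the formats so the best quality is preferred by default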
        self._sort_formats(formats)

        return {
            'id': video_id,
            'formats': formats,
            'title': title,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'upload_date': upload_date,
            'description': description,
            'uploader': uploader,
        }


class TokentubeChannelIE(InfoExtractor):
    _PAGE_SIZE = 20
    IE_NAME = 'Tokentube:channel'
    _VALID_URL = r'https?://(?:www\.)?tokentube\.net/channel/(?P<id>\d+)/[^/]+(?:/videos)?'
    _TESTS = [{
        'url': 'https://tokentube.net/channel/3697658904/TokenTube',
        'info_dict': {
            'id': '3697658904',
        },
        'playlist_mincount': 7,
    }, {
        'url': 'https://tokentube.net/channel/3353234420/Linux/videos',
        'info_dict': {
            'id': '3353234420',
        },
        'playlist_mincount': 20,
    }, {
        'url': 'https://tokentube.net/channel/3475834195/Voitontie',
        'info_dict': {
            'id': '3475834195',
        },
        'playlist_mincount': 150,
    }]

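    # Fetches one AJAX page of the channel's video list; OnDemandPagedList passes a
    # 0-based page index, while the site expects 1-based pages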
    def _fetch_page(self, channel_id, page):
        page += 1
        videos_info = self._download_webpage(
            f'https://tokentube.net/videos?p=0&m=1&sort=recent&u={channel_id}&page={page}',
            channel_id, headers={'X-Requested-With': 'XMLHttpRequest'},
            note=f'Downloading page {page}', fatal=False)
        if videos_info and '</i> Sorry, no results were found.' not in videos_info:
            for path, media_id in re.findall(
                    r'<a[^>]+\bhref=["\']([^"\']+/[lv]/(\d+)/\S+)["\'][^>]+>',
                    videos_info):
                yield self.url_result(path, ie=TokentubeIE.ie_key(), video_id=media_id)

    def _real_extract(self, url):
        channel_id = self._match_id(url)

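        # Lazily page through the channel, _PAGE_SIZE entries at a time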
        entries = OnDemandPagedList(functools.partial(
            self._fetch_page, channel_id), self._PAGE_SIZE)

        return self.playlist_result(entries, channel_id)
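

# A minimal usage sketch (not part of this file): assuming the extractors above are
# registered in yt-dlp's extractor list, a Tokentube URL can be resolved through the
# normal YoutubeDL entry point, e.g.:
#
#   from yt_dlp import YoutubeDL
#
#   with YoutubeDL() as ydl:
#       info = ydl.extract_info('https://tokentube.net/view?v=3582463289', download=False)
#       print(info['title'], info.get('uploader'))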