yt_dlp/extractor/tokentube.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import functools
   5 import re
   6
   7 from .common import InfoExtractor
   8 from ..utils import (
   9     clean_html,
  10     get_element_by_class,
  11     parse_count,
  12     remove_end,
  13     unified_strdate,
  14     js_to_json,
  15     OnDemandPagedList,
  16 )
  17
  18
  19 class TokentubeIE(InfoExtractor):
  20     _VALID_URL = r'https?://(?:www\.)?tokentube\.net/(?:view\?[vl]=|[vl]/)(?P<id>\d+)'
  21     _TESTS = [{
  22         'url': 'https://tokentube.net/l/3236632011/Praise-A-Thon-Pastori-Chrisin-ja-Pastori-Bennyn-kanssa-27-8-2021',
  23         'info_dict': {
  24             'id': '3236632011',
  25             'ext': 'mp4',
  26             'title': 'Praise-A-Thon Pastori Chrisin ja Pastori Bennyn kanssa 27.8.2021',
  27             'description': '',
  28             'uploader': 'Pastori Chris - Rapsodia.fi',
  29             'upload_date': '20210827',
  30         },
  31         'params': {
  32             'skip_download': True,
  33         },
  34     }, {
  35         'url': 'https://tokentube.net/v/3950239124/Linux-Ubuntu-Studio-perus-k%C3%A4ytt%C3%B6',
  36         'md5': '0e1f00421f501f5eada9890d38fcfb56',
  37         'info_dict': {
  38             'id': '3950239124',
  39             'ext': 'mp4',
  40             'title': 'Linux Ubuntu Studio perus käyttö',
  41             'description': 'md5:46077d0daaba1974f2dc381257f9d64c',
  42             'uploader': 'jyrilehtonen',
  43             'upload_date': '20210825',
  44         },
  45     }, {
  46         'url': 'https://tokentube.net/view?v=3582463289',
  47         'info_dict': {
  48             'id': '3582463289',
  49             'ext': 'mp4',
  50             'title': 'Police for Freedom - toiminta aloitetaan Suomessa ❤️??',
  51             'description': 'md5:37ebf1cb44264e0bf23ed98b337ee63e',
  52             'uploader': 'Voitontie',
  53             'upload_date': '20210428',
  54         }
  55     }]
  56
  57     def _real_extract(self, url):
  58         video_id = self._match_id(url)
  59         webpage = self._download_webpage(url, video_id)
  60
  61         title = self._html_search_regex(r'<h1\s*class=["\']title-text["\']>(.+?)</h1>', webpage, 'title')
  62
  63         data_json = self._html_search_regex(r'({["\']html5["\'].+?}}}+)', webpage, 'data json')
  64         data_json = self._parse_json(js_to_json(data_json), video_id, fatal=False)
  65
  66         sources = data_json.get('sources') or self._parse_json(
  67             self._html_search_regex(r'updateSrc\(([^\)]+)\)', webpage, 'sources'),
  68             video_id, transform_source=js_to_json)
  69
  70         formats = [{
  71             'url': format.get('src'),
  72             'format_id': format.get('label'),
  73             'height': format.get('res'),
  74         } for format in sources]
  75
  76         view_count = parse_count(self._html_search_regex(
  77             r'<p\s*class=["\']views_counter["\']>\s*([\d\.,]+)\s*<span>views?</span></p>',
  78             webpage, 'view_count', fatal=False))
  79
  80         like_count = parse_count(self._html_search_regex(
  81             r'<div\s*class="sh_button\s*likes_count">\s*(\d+)\s*</div>',
  82             webpage, 'like count', fatal=False))
  83
  84         dislike_count = parse_count(self._html_search_regex(
  85             r'<div\s*class="sh_button\s*dislikes_count">\s*(\d+)\s*</div>',
  86             webpage, 'dislike count', fatal=False))
  87
  88         upload_date = unified_strdate(self._html_search_regex(
  89             r'<span\s*class="p-date">Published\s*on\s+([^<]+)',
  90             webpage, 'upload date', fatal=False))
  91
  92         uploader = self._html_search_regex(
  93             r'<a\s*class="place-left"[^>]+>(.+?)</a>',
  94             webpage, 'uploader', fatal=False)
  95
  96         description = (clean_html(get_element_by_class('p-d-txt', webpage))
  97                        or self._html_search_meta(('og:description', 'description', 'twitter:description'), webpage))
  98
  99         description = remove_end(description, 'Category')
 100
 101         self._sort_formats(formats)
 102
 103         return {
 104             'id': video_id,
 105             'formats': formats,
 106             'title': title,
 107             'view_count': view_count,
 108             'like_count': like_count,
 109             'dislike_count': dislike_count,
 110             'upload_date': upload_date,
 111             'description': description,
 112             'uploader': uploader,
 113         }
 114
 115
 116 class TokentubeChannelIE(InfoExtractor):
 117     _PAGE_SIZE = 20
 118     IE_NAME = 'Tokentube:channel'
 119     _VALID_URL = r'https?://(?:www\.)?tokentube\.net/channel/(?P<id>\d+)/[^/]+(?:/videos)?'
 120     _TESTS = [{
 121         'url': 'https://tokentube.net/channel/3697658904/TokenTube',
 122         'info_dict': {
 123             'id': '3697658904',
 124         },
 125         'playlist_mincount': 7,
 126     }, {
 127         'url': 'https://tokentube.net/channel/3353234420/Linux/videos',
 128         'info_dict': {
 129             'id': '3353234420',
 130         },
 131         'playlist_mincount': 20,
 132     }, {
 133         'url': 'https://tokentube.net/channel/3475834195/Voitontie',
 134         'info_dict': {
 135             'id': '3475834195',
 136         },
 137         'playlist_mincount': 150,
 138     }]
 139
 140     def _fetch_page(self, channel_id, page):
 141         page += 1
 142         videos_info = self._download_webpage(
 143             f'https://tokentube.net/videos?p=0&m=1&sort=recent&u={channel_id}&page={page}',
 144             channel_id, headers={'X-Requested-With': 'XMLHttpRequest'},
 145             note=f'Downloading page {page}', fatal=False)
 146         if '</i> Sorry, no results were found.' not in videos_info:
 147             for path, media_id in re.findall(
 148                     r'<a[^>]+\bhref=["\']([^"\']+/[lv]/(\d+)/\S+)["\'][^>]+>',
 149                     videos_info):
 150                 yield self.url_result(path, ie=TokentubeIE.ie_key(), video_id=media_id)
 151
 152     def _real_extract(self, url):
 153         channel_id = self._match_id(url)
 154
 155         entries = OnDemandPagedList(functools.partial(
 156             self._fetch_page, channel_id), self._PAGE_SIZE)
 157
 158         return self.playlist_result(entries, channel_id)